Fix for bug 9413 and the related Malayalam issue reported on wikitech-l.

* Added $wgFixArchaicUnicode, which, if enabled, converts some deprecated Unicode sequences in Arabic and Malayalam text to their Unicode 5.1 equivalents.
* Added generateNormalizerData.php to generate the relevant data files. Added the generated data files also. 
* Made most things call the new wrapper method $wgContLang->normalize() instead of UtfNormal::cleanUp(), so that Unicode normalization can be customised on a per-language basis.
* Added some generic support for conversion tables to Language so that subclasses can easily implement these kinds of transformations.
This commit is contained in:
Tim Starling 2010-01-04 08:28:50 +00:00
parent dff724821e
commit ad19c032b0
16 changed files with 250 additions and 11 deletions

View file

@ -283,6 +283,9 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
* (bug 19791) Add URL of file source as comment to thumbs (for ImageMagick)
* (bug 21946) Sorted wikitables do not properly handle minus signs
* (bug 18885) Red links for media files do not support shared repositories
* Added $wgFixArchaicUnicode, which, if enabled, converts some deprecated
Unicode sequences in Arabic and Malayalam text to their Unicode 5.1
equivalents.
=== Bug fixes in 1.16 ===

View file

@ -867,6 +867,19 @@ $wgInputEncoding = 'UTF-8';
$wgOutputEncoding = 'UTF-8';
$wgEditEncoding = '';
/**
* Set this to true to clean up archaic Unicode sequences in Arabic and
* Malayalam text. Currently only works if $wgLanguageCode is set to Arabic
* or Malayalam.
*
* Enabling this is generally a good idea for new wikis, since it fixes a few
* technical problems to do with editing these languages. However, if it's
* enabled on an existing wiki, pages which contain the problematic characters
* in their page titles may become inaccessible. Running maintenance/cleanupTitles.php
* after enabling it may fix this.
*/
$wgFixArchaicUnicode = false;
/**
* Locale for LC_CTYPE, to work around http://bugs.php.net/bug.php?id=45132
* For Unix-like operating systems, set this to a locale that has a UTF-8

View file

@ -203,7 +203,8 @@ class WebRequest {
$data[$key] = $this->normalizeUnicode( $val );
}
} else {
$data = UtfNormal::cleanUp( $data );
global $wgContLang;
$data = $wgContLang->normalize( $data );
}
return $data;
}
@ -600,6 +601,7 @@ class WebRequest {
* @return string or NULL if no such file.
*/
public function getFileName( $key ) {
global $wgContLang;
if( !isset( $_FILES[$key] ) ) {
return null;
}
@ -608,7 +610,7 @@ class WebRequest {
# Safari sends filenames in HTML-encoded Unicode form D...
# Horrid and evil! Let's try to make some kind of sense of it.
$name = Sanitizer::decodeCharReferences( $name );
$name = UtfNormal::cleanUp( $name );
$name = $wgContLang->normalize( $name );
wfDebug( "WebRequest::getFileName() '" . $_FILES[$key]['name'] . "' normalized to '$name'\n" );
return $name;
}

View file

@ -56,7 +56,7 @@ class Xml {
/**
* Format an XML element as with self::element(), but run text through the
* UtfNormal::cleanUp() validator first to ensure that no invalid UTF-8
* $wgContLang->normalize() validator first to ensure that no invalid UTF-8
* is passed.
*
* @param $element String:
@ -65,12 +65,13 @@ class Xml {
* @return string
*/
public static function elementClean( $element, $attribs = array(), $contents = '') {
global $wgContLang;
if( $attribs ) {
$attribs = array_map( array( 'UtfNormal', 'cleanUp' ), $attribs );
}
if( $contents ) {
wfProfileIn( __METHOD__ . '-norm' );
$contents = UtfNormal::cleanUp( $contents );
$contents = $wgContLang->normalize( $contents );
wfProfileOut( __METHOD__ . '-norm' );
}
return self::element( $element, $attribs, $contents );

View file

@ -304,7 +304,8 @@ class ApiResult extends ApiBase {
{
if(!is_string($s))
return;
$s = UtfNormal::cleanUp($s);
global $wgContLang;
$s = $wgContLang->normalize($s);
}
public function execute() {

View file

@ -62,6 +62,11 @@ class Language {
var $minSearchLength;
var $mExtendedSpecialPageAliases;
/**
* ReplacementArray object caches
*/
var $transformData = array();
static public $dataCache;
static public $mLangObjCache = array();
@ -1865,6 +1870,36 @@ class Language {
}
}
/**
 * Normalize a UTF-8 string by passing it through UtfNormal::cleanUp()
 * (conversion to normal form C).
 *
 * Language subclasses such as Arabic and Malayalam override this method to
 * additionally rewrite certain backwards-compatible sequences into their
 * modern Unicode equivalents. That extra step lives in the subclasses
 * purely for performance reasons.
 *
 * @param $s String: the text to normalize
 * @return String: the normalized text
 */
function normalize( $s ) {
	$normalized = UtfNormal::cleanUp( $s );
	return $normalized;
}
/**
 * Apply the source => destination character mapping stored in a
 * serialized data file (located in the serialized subdirectory of $IP)
 * to the given string.
 *
 * The ReplacementArray built from each file is kept in
 * $this->transformData, so a file is only loaded once per Language
 * object. Replacement is faster when the FastStringSearch extension is
 * available.
 *
 * @param $file String: name of the serialized pair file
 * @param $string String: the text to transform
 * @return String: the transformed text
 * @throws MWException if the data file cannot be loaded
 */
function transformUsingPairFile( $file, $string ) {
	if ( isset( $this->transformData[$file] ) ) {
		// Already loaded for this object: reuse the cached replacer
		$replacer = $this->transformData[$file];
	} else {
		$pairs = wfGetPrecompiledData( $file );
		if ( $pairs === false ) {
			throw new MWException( __METHOD__.": The transformation file $file is missing" );
		}
		$replacer = new ReplacementArray( $pairs );
		$this->transformData[$file] = $replacer;
	}
	return $replacer->replace( $string );
}
/**
* For right-to-left language support
*

View file

@ -6,6 +6,7 @@
* @author Niklas Laxström
*/
class LanguageAr extends Language {
var $normalizeArray;
function convertPlural( $count, $forms ) {
if ( !count($forms) ) { return ''; }
@ -26,4 +27,20 @@ class LanguageAr extends Language {
}
return $forms[$index];
}
/**
 * Normalize Arabic text.
 *
 * Temporary hack for bug 9413: after the standard normalization done by
 * the parent class, Arabic presentation forms are replaced with their
 * standard equivalents, but only when $wgFixArchaicUnicode is enabled.
 *
 * FIXME: This is language-specific for now only to avoid the negative
 * performance impact of enabling it for all languages.
 *
 * @param $s String: the text to normalize
 * @return String: the normalized text
 */
function normalize( $s ) {
	global $wgFixArchaicUnicode;

	$normalized = parent::normalize( $s );
	if ( !$wgFixArchaicUnicode ) {
		// Feature switched off: plain normalization only
		return $normalized;
	}
	return $this->transformUsingPairFile( 'normalize-ar.ser', $normalized );
}
}

View file

@ -0,0 +1,22 @@
<?php
class LanguageMl extends Language {
	/**
	 * Normalize Malayalam text.
	 *
	 * Temporary hack for the issue described at
	 * http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46396
	 * After the standard normalization done by the parent class, Unicode 5.0
	 * style Malayalam input is converted to Unicode 5.1, but only when
	 * $wgFixArchaicUnicode is enabled. Similar to bug 9413. Also fixes
	 * miscellaneous problems due to mishandling of ZWJ, e.g. bug 11162.
	 *
	 * FIXME: This is language-specific for now only to avoid the negative
	 * performance impact of enabling it for all languages.
	 *
	 * @param $s String: the text to normalize
	 * @return String: the normalized text
	 */
	function normalize( $s ) {
		global $wgFixArchaicUnicode;

		$normalized = parent::normalize( $s );
		if ( !$wgFixArchaicUnicode ) {
			// Feature switched off: plain normalization only
			return $normalized;
		}
		return $this->transformUsingPairFile( 'normalize-ml.ser', $normalized );
	}
}

View file

@ -65,7 +65,7 @@ class ImageCleanup extends TableCleanup {
$cleaned = $wgContLang->checkTitleEncoding( $cleaned );
// Many of remainder look like non-normalized unicode
$cleaned = UtfNormal::cleanUp( $cleaned );
$cleaned = $wgContLang->normalize( $cleaned );
$title = Title::makeTitleSafe( NS_FILE, $cleaned );

View file

@ -37,8 +37,9 @@ class TitleCleanup extends TableCleanup {
}
protected function processRow( $row ) {
global $wgContLang;
$display = Title::makeName( $row->page_namespace, $row->page_title );
$verified = UtfNormal::cleanUp( $display );
$verified = $wgContLang->normalize( $display );
$title = Title::newFromText( $verified );
if( !is_null( $title )

View file

@ -52,9 +52,10 @@ class WatchlistCleanup extends TableCleanup {
}
protected function processRow( $row ) {
global $wgContLang;
$current = Title::makeTitle( $row->wl_namespace, $row->wl_title );
$display = $current->getPrefixedText();
$verified = UtfNormal::cleanUp( $display );
$verified = $wgContLang->normalize( $display );
$title = Title::newFromText( $verified );
if( $row->wl_user == 0 || is_null( $title ) || !$title->equals( $current ) ) {

View file

@ -236,6 +236,7 @@ class TextPassDumper extends BackupDumper {
* May throw a database error if, say, the server dies during query.
*/
private function getTextDb( $id ) {
global $wgContLang;
$id = intval( $id );
$row = $this->db->selectRow( 'text',
array( 'old_text', 'old_flags' ),
@ -246,7 +247,7 @@ class TextPassDumper extends BackupDumper {
return false;
}
$stripped = str_replace( "\r", "", $text );
$normalized = UtfNormal::cleanUp( $stripped );
$normalized = $wgContLang->normalize( $stripped );
return $normalized;
}
@ -321,6 +322,8 @@ class TextPassDumper extends BackupDumper {
}
private function getTextSpawnedOnce( $id ) {
global $wgContLang;
$ok = fwrite( $this->spawnWrite, "$id\n" );
//$this->progress( ">> $id" );
if( !$ok ) return false;
@ -351,7 +354,7 @@ class TextPassDumper extends BackupDumper {
// Do normalization in the dump thread...
$stripped = str_replace( "\r", "", $text );
$normalized = UtfNormal::cleanUp( $stripped );
$normalized = $wgContLang->normalize( $stripped );
return $normalized;
}

View file

@ -0,0 +1,137 @@
<?php
require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
require_once( dirname( __FILE__ ) . '/../../includes/normal/UtfNormalUtil.php' );
/**
 * Generates normalizer data files for Arabic and Malayalam.
 * For NFC see includes/normal.
 *
 * The Arabic table is derived from a local copy of UnicodeData.txt; the
 * Malayalam table is a fixed list of chillu sequences. Both are written as
 * serialized PHP arrays to $IP/serialized/normalize-<lang>.ser.
 */
class GenerateNormalizerData extends Maintenance {
	/** Path of the UnicodeData.txt file to read */
	public $dataFile;

	public function __construct() {
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Locate the Unicode data file (option --unicode-data-file, or
	 * UnicodeData.txt in the current directory) and generate both tables.
	 * Exits with status 1 if the data file does not exist.
	 */
	public function execute() {
		if ( $this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			$notFound = 'Unable to find the specified data file.';
		} else {
			$this->dataFile = 'UnicodeData.txt';
			$notFound = "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>";
		}
		if ( !file_exists( $this->dataFile ) ) {
			$this->error( $notFound );
			exit( 1 );
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Build the Arabic table: every character in the Arabic presentation
	 * forms A and B blocks that has a decomposition is mapped to that
	 * decomposition. The result is written to normalize-ar.ser.
	 */
	function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		// UnicodeData.txt has 15 fields; Numeric_Type/Numeric_Value spans
		// three of them (fields 6, 7 and 8).
		$fieldNames = array(
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value_6',
			'Numeric_Type_Value_7',
			'Numeric_Type_Value_8',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		);
		$fieldCount = count( $fieldNames );

		$pairs = array();
		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				// Report malformed lines instead of reading undefined offsets
				$this->error( "Unexpected number of fields on line $lineNum" );
				$this->error( $line );
				continue;
			}
			$data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );

			$code = hexdec( $data['Code'] );
			$isPresentationForm =
				( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
			if ( !$isPresentationForm ) {
				continue;
			}

			if ( $data['Decomposition_Type_Mapping'] === '' ) {
				// No decomposition
				continue;
			}
			// Expect "<type> HEX HEX ..."; only the mapping part is used
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
				$data['Decomposition_Type_Mapping'], $m ) )
			{
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			$source = hexSequenceToUtf8( $data['Code'] );
			$dest = hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}
		// Release the handle once the whole file has been consumed
		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Build the Malayalam table, mapping the Unicode 5.0 style
	 * consonant + virama + ZWJ sequences to the Unicode 5.1 chillu
	 * characters. The result is written to normalize-ml.ser.
	 */
	function generateMalayalam() {
		$hexPairs = array(
			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$source = hexSequenceToUtf8( $hexSource );
			$dest = hexSequenceToUtf8( $hexDest );
			$pairs[$source] = $dest;
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serialize a pair table to $IP/serialized/normalize-<lang>.ser and
	 * report how many pairs were written. Exits with status 1 if the file
	 * cannot be written, so a failed write is never reported as success.
	 *
	 * @param $langCode String: language code used in the file name
	 * @param $pairs Array: map of source string => replacement string
	 */
	private function writePairs( $langCode, array $pairs ) {
		global $IP;
		$path = "$IP/serialized/normalize-$langCode.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			$this->error( "Unable to write $path" );
			exit( 1 );
		}
		echo "$langCode: " . count( $pairs ) . " pairs written.\n";
	}
}
// Name the Maintenance subclass defined in this file.
// NOTE(review): presumably read by the script loaded via DO_MAINTENANCE to
// instantiate and run the class — confirm against Maintenance.php.
$maintClass = 'GenerateNormalizerData';
require_once( DO_MAINTENANCE );

View file

@ -102,7 +102,8 @@ class PPFuzzTester {
// This resolves a few differences between the old preprocessor and the
// XML-based one, which doesn't like illegals and converts line endings.
// It's done by the MW UI, so it's a reasonably legitimate thing to do.
$s = UtfNormal::cleanUp( $s );
global $wgContLang;
$s = $wgContLang->normalize( $s );
return $s;
}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
a:6:{s:9:"ണ്‍";s:3:"ൺ";s:9:"ന്‍";s:3:"ൻ";s:9:"ര്‍";s:3:"ർ";s:9:"ല്‍";s:3:"ൽ";s:9:"ള്‍";s:3:"ൾ";s:9:"ക്‍";s:3:"ൿ";}