Fix for bug 9413 and the related Malayalam issue reported on wikitech-l.
* Added $wgFixArchaicUnicode, which, if enabled, converts some deprecated Unicode sequences in Arabic and Malayalam text to their Unicode 5.1 equivalents. * Added generateNormalizerData.php to generate the relevant data files. Added the generated data files also. * Made most things call the new wrapper method $wgContLang->normalize() instead of UtfNormal::cleanUp(), so that Unicode normalization can be customised on a per-language basis. * Added some generic support for conversion tables to Language so that subclasses can easily implement these kinds of transformations.
This commit is contained in:
parent
dff724821e
commit
ad19c032b0
16 changed files with 250 additions and 11 deletions
|
|
@ -283,6 +283,9 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
|
|||
* (bug 19791) Add URL of file source as comment to thumbs (for ImageMagick)
|
||||
* (bug 21946) Sorted wikitables do not properly handle minus signs
|
||||
* (bug 18885) Red links for media files do not support shared repositories
|
||||
* Added $wgFixArchaicUnicode, which, if enabled, converts some deprecated
|
||||
Unicode sequences in Arabic and Malayalam text to their Unicode 5.1
|
||||
equivalents.
|
||||
|
||||
=== Bug fixes in 1.16 ===
|
||||
|
||||
|
|
|
|||
|
|
@ -867,6 +867,19 @@ $wgInputEncoding = 'UTF-8';
|
|||
$wgOutputEncoding = 'UTF-8';
|
||||
$wgEditEncoding = '';
|
||||
|
||||
/**
|
||||
* Set this to true to clean up archaic Unicode sequences in Arabic and
|
||||
* Malayalam text. Currently only works if $wgLanguageCode is set to Arabic
|
||||
* or Malayalam.
|
||||
*
|
||||
* Enabling this is generally a good idea for new wikis, since it fixes a few
|
||||
* technical problems to do with editing these languages. However, if it's
|
||||
* enabled on an existing wiki, pages which contain the problematic characters
|
||||
* in their page titles may become inaccessible. Running maintenance/cleanupTitles.php
|
||||
* after enabling it may fix this.
|
||||
*/
|
||||
$wgFixArchaicUnicode = false;
|
||||
|
||||
/**
|
||||
* Locale for LC_CTYPE, to work around http://bugs.php.net/bug.php?id=45132
|
||||
* For Unix-like operating systems, set this to a locale that has a UTF-8
|
||||
|
|
|
|||
|
|
@ -203,7 +203,8 @@ class WebRequest {
|
|||
$data[$key] = $this->normalizeUnicode( $val );
|
||||
}
|
||||
} else {
|
||||
$data = UtfNormal::cleanUp( $data );
|
||||
global $wgContLang;
|
||||
$data = $wgContLang->normalize( $data );
|
||||
}
|
||||
return $data;
|
||||
}
|
||||
|
|
@ -600,6 +601,7 @@ class WebRequest {
|
|||
* @return string or NULL if no such file.
|
||||
*/
|
||||
public function getFileName( $key ) {
|
||||
global $wgContLang;
|
||||
if( !isset( $_FILES[$key] ) ) {
|
||||
return null;
|
||||
}
|
||||
|
|
@ -608,7 +610,7 @@ class WebRequest {
|
|||
# Safari sends filenames in HTML-encoded Unicode form D...
|
||||
# Horrid and evil! Let's try to make some kind of sense of it.
|
||||
$name = Sanitizer::decodeCharReferences( $name );
|
||||
$name = UtfNormal::cleanUp( $name );
|
||||
$name = $wgContLang->normalize( $name );
|
||||
wfDebug( "WebRequest::getFileName() '" . $_FILES[$key]['name'] . "' normalized to '$name'\n" );
|
||||
return $name;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ class Xml {
|
|||
|
||||
/**
|
||||
* Format an XML element as with self::element(), but run text through the
|
||||
* UtfNormal::cleanUp() validator first to ensure that no invalid UTF-8
|
||||
* $wgContLang->normalize() validator first to ensure that no invalid UTF-8
|
||||
* is passed.
|
||||
*
|
||||
* @param $element String:
|
||||
|
|
@ -65,12 +65,13 @@ class Xml {
|
|||
* @return string
|
||||
*/
|
||||
public static function elementClean( $element, $attribs = array(), $contents = '') {
|
||||
global $wgContLang;
|
||||
if( $attribs ) {
|
||||
$attribs = array_map( array( 'UtfNormal', 'cleanUp' ), $attribs );
|
||||
}
|
||||
if( $contents ) {
|
||||
wfProfileIn( __METHOD__ . '-norm' );
|
||||
$contents = UtfNormal::cleanUp( $contents );
|
||||
$contents = $wgContLang->normalize( $contents );
|
||||
wfProfileOut( __METHOD__ . '-norm' );
|
||||
}
|
||||
return self::element( $element, $attribs, $contents );
|
||||
|
|
|
|||
|
|
@ -304,7 +304,8 @@ class ApiResult extends ApiBase {
|
|||
{
|
||||
if(!is_string($s))
|
||||
return;
|
||||
$s = UtfNormal::cleanUp($s);
|
||||
global $wgContLang;
|
||||
$s = $wgContLang->normalize($s);
|
||||
}
|
||||
|
||||
public function execute() {
|
||||
|
|
|
|||
|
|
@ -62,6 +62,11 @@ class Language {
|
|||
var $minSearchLength;
|
||||
var $mExtendedSpecialPageAliases;
|
||||
|
||||
/**
|
||||
* ReplacementArray object caches
|
||||
*/
|
||||
var $transformData = array();
|
||||
|
||||
static public $dataCache;
|
||||
static public $mLangObjCache = array();
|
||||
|
||||
|
|
@ -1865,6 +1870,36 @@ class Language {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a UTF-8 string to normal form C. In Malayalam and Arabic, this
|
||||
* also cleans up certain backwards-compatible sequences, converting them
|
||||
* to the modern Unicode equivalent.
|
||||
*
|
||||
* This is language-specific for performance reasons only.
|
||||
*/
|
||||
function normalize( $s ) {
	// Base behaviour: delegate straight to UtfNormal::cleanUp(). The
	// LanguageAr and LanguageMl subclasses override this method and apply
	// an extra pair-file transformation on top of this result.
	$cleaned = UtfNormal::cleanUp( $s );
	return $cleaned;
}
|
||||
|
||||
/**
|
||||
* Transform a string using serialized data stored in the given file (which
|
||||
* must be in the serialized subdirectory of $IP). The file contains pairs
|
||||
* mapping source characters to destination characters.
|
||||
*
|
||||
* The data is cached in process memory. This will go faster if you have the
|
||||
* FastStringSearch extension.
|
||||
*/
|
||||
function transformUsingPairFile( $file, $string ) {
	// Build the ReplacementArray once per file and keep it in
	// $this->transformData for the lifetime of this object.
	$alreadyLoaded = isset( $this->transformData[$file] );
	if ( !$alreadyLoaded ) {
		$pairs = wfGetPrecompiledData( $file );
		if ( $pairs === false ) {
			// wfGetPrecompiledData() signalled failure with false
			throw new MWException( __METHOD__.": The transformation file $file is missing" );
		}
		$this->transformData[$file] = new ReplacementArray( $pairs );
	}

	$replacer = $this->transformData[$file];
	return $replacer->replace( $string );
}
|
||||
|
||||
/**
|
||||
* For right-to-left language support
|
||||
*
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
* @author Niklas Laxström
|
||||
*/
|
||||
class LanguageAr extends Language {
|
||||
var $normalizeArray;
|
||||
|
||||
function convertPlural( $count, $forms ) {
|
||||
if ( !count($forms) ) { return ''; }
|
||||
|
|
@ -26,4 +27,20 @@ class LanguageAr extends Language {
|
|||
}
|
||||
return $forms[$index];
|
||||
}
|
||||
|
||||
/**
|
||||
* Temporary hack for bug 9413: replace Arabic presentation forms with their
|
||||
* standard equivalents.
|
||||
*
|
||||
* FIXME: This is language-specific for now only to avoid the negative
|
||||
* performance impact of enabling it for all languages.
|
||||
*/
|
||||
function normalize( $s ) {
	global $wgFixArchaicUnicode;

	// Always run the generic normalization from the parent class first.
	$normalized = parent::normalize( $s );

	// The Arabic-specific replacement pass is opt-in via $wgFixArchaicUnicode.
	if ( !$wgFixArchaicUnicode ) {
		return $normalized;
	}

	return $this->transformUsingPairFile( 'normalize-ar.ser', $normalized );
}
|
||||
}
|
||||
|
|
|
|||
22
languages/classes/LanguageMl.php
Normal file
22
languages/classes/LanguageMl.php
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
<?php
|
||||
|
||||
class LanguageMl extends Language {
	/**
	 * Temporary hack for the issue described at
	 * http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46396
	 *
	 * Runs the parent normalization and then, when $wgFixArchaicUnicode is
	 * enabled, rewrites Unicode 5.0 style Malayalam sequences to their
	 * Unicode 5.1 form using the pairs stored in normalize-ml.ser. Similar
	 * to bug 9413. Also fixes miscellaneous problems due to mishandling of
	 * ZWJ, e.g. bug 11162.
	 *
	 * FIXME: This is language-specific for now only to avoid the negative
	 * performance impact of enabling it for all languages.
	 *
	 * @param $s String: text to normalize
	 * @return String: normalized text
	 */
	function normalize( $s ) {
		global $wgFixArchaicUnicode;

		$normalized = parent::normalize( $s );

		// The Malayalam-specific replacement pass is opt-in.
		if ( !$wgFixArchaicUnicode ) {
			return $normalized;
		}

		return $this->transformUsingPairFile( 'normalize-ml.ser', $normalized );
	}
}
|
||||
|
|
@ -65,7 +65,7 @@ class ImageCleanup extends TableCleanup {
|
|||
$cleaned = $wgContLang->checkTitleEncoding( $cleaned );
|
||||
|
||||
// Many of remainder look like non-normalized unicode
|
||||
$cleaned = UtfNormal::cleanUp( $cleaned );
|
||||
$cleaned = $wgContLang->normalize( $cleaned );
|
||||
|
||||
$title = Title::makeTitleSafe( NS_FILE, $cleaned );
|
||||
|
||||
|
|
|
|||
|
|
@ -37,8 +37,9 @@ class TitleCleanup extends TableCleanup {
|
|||
}
|
||||
|
||||
protected function processRow( $row ) {
|
||||
global $wgContLang;
|
||||
$display = Title::makeName( $row->page_namespace, $row->page_title );
|
||||
$verified = UtfNormal::cleanUp( $display );
|
||||
$verified = $wgContLang->normalize( $display );
|
||||
$title = Title::newFromText( $verified );
|
||||
|
||||
if( !is_null( $title )
|
||||
|
|
|
|||
|
|
@ -52,9 +52,10 @@ class WatchlistCleanup extends TableCleanup {
|
|||
}
|
||||
|
||||
protected function processRow( $row ) {
|
||||
global $wgContLang;
|
||||
$current = Title::makeTitle( $row->wl_namespace, $row->wl_title );
|
||||
$display = $current->getPrefixedText();
|
||||
$verified = UtfNormal::cleanUp( $display );
|
||||
$verified = $wgContLang->normalize( $display );
|
||||
$title = Title::newFromText( $verified );
|
||||
|
||||
if( $row->wl_user == 0 || is_null( $title ) || !$title->equals( $current ) ) {
|
||||
|
|
|
|||
|
|
@ -236,6 +236,7 @@ class TextPassDumper extends BackupDumper {
|
|||
* May throw a database error if, say, the server dies during query.
|
||||
*/
|
||||
private function getTextDb( $id ) {
|
||||
global $wgContLang;
|
||||
$id = intval( $id );
|
||||
$row = $this->db->selectRow( 'text',
|
||||
array( 'old_text', 'old_flags' ),
|
||||
|
|
@ -246,7 +247,7 @@ class TextPassDumper extends BackupDumper {
|
|||
return false;
|
||||
}
|
||||
$stripped = str_replace( "\r", "", $text );
|
||||
$normalized = UtfNormal::cleanUp( $stripped );
|
||||
$normalized = $wgContLang->normalize( $stripped );
|
||||
return $normalized;
|
||||
}
|
||||
|
||||
|
|
@ -321,6 +322,8 @@ class TextPassDumper extends BackupDumper {
|
|||
}
|
||||
|
||||
private function getTextSpawnedOnce( $id ) {
|
||||
global $wgContLang;
|
||||
|
||||
$ok = fwrite( $this->spawnWrite, "$id\n" );
|
||||
//$this->progress( ">> $id" );
|
||||
if( !$ok ) return false;
|
||||
|
|
@ -351,7 +354,7 @@ class TextPassDumper extends BackupDumper {
|
|||
|
||||
// Do normalization in the dump thread...
|
||||
$stripped = str_replace( "\r", "", $text );
|
||||
$normalized = UtfNormal::cleanUp( $stripped );
|
||||
$normalized = $wgContLang->normalize( $stripped );
|
||||
return $normalized;
|
||||
}
|
||||
|
||||
|
|
|
|||
137
maintenance/language/generateNormalizerData.php
Normal file
137
maintenance/language/generateNormalizerData.php
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
<?php
|
||||
|
||||
require_once( dirname( __FILE__ ) . '/../Maintenance.php' );
|
||||
|
||||
require_once( dirname( __FILE__ ) . '/../../includes/normal/UtfNormalUtil.php' );
|
||||
|
||||
/**
|
||||
* Generates normalizer data files for Arabic and Malayalam.
|
||||
* For NFC see includes/normal.
|
||||
*/
|
||||
class GenerateNormalizerData extends Maintenance {
	/** Path of the UnicodeData.txt file to read; set by execute() */
	var $dataFile;

	/**
	 * Register the optional --unicode-data-file command line option.
	 */
	public function __construct() {
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Locate the Unicode data file, then generate both data files.
	 *
	 * Without --unicode-data-file, UnicodeData.txt is looked up relative to
	 * the current working directory. The script exits with status 1 if the
	 * file cannot be found.
	 */
	public function execute() {
		if ( !$this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = 'UnicodeData.txt';
			if ( !file_exists( $this->dataFile ) ) {
				$this->error( "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>" );
				exit( 1 );
			}
		} else {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			if ( !file_exists( $this->dataFile ) ) {
				$this->error( 'Unable to find the specified data file.' );
				exit( 1 );
			}
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Build serialized/normalize-ar.ser from the Unicode data file.
	 *
	 * Every code point in the Arabic Presentation Forms A and B ranges that
	 * has a tagged decomposition is mapped to the UTF-8 encoding of that
	 * decomposition. Lines that cannot be parsed are reported and skipped.
	 */
	function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		// Only two of the semicolon-separated fields are needed here, so they
		// are addressed by index instead of naming every column.
		$codeField = 0;          // Code point, in hexadecimal
		$decompositionField = 5; // Decomposition_Type / Decomposition_Mapping

		$pairs = array();

		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$fields = explode( ';', $line );

			// hexdec() yields a real number, so the range checks below are
			// numeric comparisons rather than string-vs-int juggling.
			$code = hexdec( $fields[$codeField] );
			$isPresentationForm =
				( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
			if ( !$isPresentationForm ) {
				continue;
			}

			// Guard against truncated lines instead of reading a missing offset.
			if ( count( $fields ) <= $decompositionField ) {
				$this->error( "Too few fields on line $lineNum" );
				$this->error( $line );
				continue;
			}

			$decomposition = $fields[$decompositionField];
			if ( $decomposition === '' ) {
				// No decomposition
				continue;
			}
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/', $decomposition, $m ) ) {
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			// hexSequenceToUtf8() is not defined in this file; presumably it
			// comes from the UtfNormalUtil.php include at the top of the script.
			$source = hexSequenceToUtf8( $fields[$codeField] );
			$dest = hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}

		// Release the handle once the whole file has been consumed.
		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Build serialized/normalize-ml.ser from a fixed table that maps the
	 * Unicode 5.0 chillu sequences to the Unicode 5.1 chillu characters.
	 */
	function generateMalayalam() {
		$hexPairs = array(
			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$source = hexSequenceToUtf8( $hexSource );
			$dest = hexSequenceToUtf8( $hexDest );
			$pairs[$source] = $dest;
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serialize a replacement table to $IP/serialized/normalize-<code>.ser
	 * and print how many pairs were written. Exits with status 1 when the
	 * file cannot be written, so a failed run is never reported as success.
	 *
	 * @param $langCode String: language code used in the file name and report
	 * @param $pairs Array: map of source string => replacement string
	 */
	private function writePairs( $langCode, $pairs ) {
		global $IP;
		$path = "$IP/serialized/normalize-$langCode.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			$this->error( "Unable to write $path" );
			exit( 1 );
		}
		echo "$langCode: " . count( $pairs ) . " pairs written.\n";
	}
}
|
||||
|
||||
$maintClass = 'GenerateNormalizerData';
|
||||
require_once( DO_MAINTENANCE );
|
||||
|
||||
|
|
@ -102,7 +102,8 @@ class PPFuzzTester {
|
|||
// This resolves a few differences between the old preprocessor and the
|
||||
// XML-based one, which doesn't like illegals and converts line endings.
|
||||
// It's done by the MW UI, so it's a reasonably legitimate thing to do.
|
||||
$s = UtfNormal::cleanUp( $s );
|
||||
global $wgContLang;
|
||||
$s = $wgContLang->normalize( $s );
|
||||
return $s;
|
||||
}
|
||||
|
||||
|
|
|
|||
1
serialized/normalize-ar.ser
Normal file
1
serialized/normalize-ar.ser
Normal file
File diff suppressed because one or more lines are too long
1
serialized/normalize-ml.ser
Normal file
1
serialized/normalize-ml.ser
Normal file
|
|
@ -0,0 +1 @@
|
|||
a:6:{s:9:"ണ്";s:3:"ൺ";s:9:"ന്";s:3:"ൻ";s:9:"ര്";s:3:"ർ";s:9:"ല്";s:3:"ൽ";s:9:"ള്";s:3:"ൾ";s:9:"ക്";s:3:"ൿ";}
|
||||
Loading…
Reference in a new issue