Merge "Generate Utf8Case.ser directly from UnicodeData.txt"

This commit is contained in:
jenkins-bot 2014-01-12 03:10:50 +00:00 committed by Gerrit Code Review
commit 12238a3704
10 changed files with 251 additions and 2368 deletions

7
includes/normal/.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
/CompositionExclusions.txt
/DerivedNormalizationProps.txt
/NormalizationCorrections.txt
/NormalizationTest.txt
/UTF-8-test.txt
/UnicodeData.txt
/testdata

View file

@ -16,14 +16,11 @@ PHP=php
FETCH=wget
#FETCH=fetch
all : UtfNormalData.inc Utf8Case.php
all : UtfNormalData.inc
UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
$(PHP) UtfNormalGenerate.php
Utf8Case.php : Utf8CaseGenerate.php UtfNormalUtil.php UnicodeData.txt
$(PHP) Utf8CaseGenerate.php
test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
$(PHP) UtfNormalTest.php

File diff suppressed because it is too large Load diff

View file

@ -1,112 +0,0 @@
<?php
/**
* This script generates Utf8Case.php from the Unicode Character Database
* and supplementary files.
*
* Copyright © 2004,2008 Brion Vibber <brion@pobox.com>
* http://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
// This generator is a command-line tool; refuse to run under any other SAPI.
if ( PHP_SAPI !== 'cli' ) {
	die( "Run me from the command line please.\n" );
}

require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';

// Input: UnicodeData.txt in the current working directory.
$in = fopen( "UnicodeData.txt", "rt" );
if ( !$in ) {
	print "Can't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit( -1 );
}

// Both maps are keyed by the UTF-8 encoding of the source code point.
$wikiUpperChars = array();
$wikiLowerChars = array();

print "Reading character definitions...\n";
while ( false !== ( $line = fgets( $in ) ) ) {
	$columns = explode( ';', $line );

	// Columns 12 and 13 are read below. A blank or truncated line has fewer
	// columns; skip it instead of raising undefined-offset notices and
	// recording a bogus entry.
	if ( count( $columns ) < 14 ) {
		continue;
	}

	$simpleUpper = $columns[12];
	$simpleLower = $columns[13];

	// Column 0 is the code point in hexadecimal.
	$source = codepointToUtf8( hexdec( $columns[0] ) );
	if ( $simpleUpper ) {
		$wikiUpperChars[$source] = codepointToUtf8( hexdec( $simpleUpper ) );
	}
	if ( $simpleLower ) {
		$wikiLowerChars[$source] = codepointToUtf8( hexdec( $simpleLower ) );
	}
}
fclose( $in );

// Output: a PHP source file defining the two translation arrays.
$out = fopen( "Utf8Case.php", "wt" );
if ( !$out ) {
	print "Can't create file Utf8Case.php\n";
	exit( -1 );
}

$outUpperChars = escapeArray( $wikiUpperChars );
$outLowerChars = escapeArray( $wikiLowerChars );
// "<" . "?php" is split so this file never contains a literal PHP open tag
// inside the string. The literal below is emitted verbatim, so its lines
// must stay unindented.
$outdata = "<" . "?php
/**
* Simple 1:1 upper/lowercase switching arrays for utf-8 text.
* Won't get context-sensitive things yet.
*
* Hack for bugs in ucfirst() and company
*
* These are pulled from memcached if possible, as this is faster than filling
* up a big array manually.
*
* @file
* @ingroup Language
*/
/**
* Translation array to get upper case character
*/
\$wikiUpperChars = $outUpperChars;
/**
* Translation array to get lower case character
*/
\$wikiLowerChars = $outLowerChars;\n";
fputs( $out, $outdata );
fclose( $out );
print "Wrote out Utf8Case.php\n";
/**
 * Render an associative array as PHP source text for an array() literal.
 *
 * Each key/value pair is formatted by escapeLine(); the resulting lines are
 * joined with ",\n" and wrapped in "array(\n" ... "\n)".
 *
 * @param array $arr Map to render
 * @return string PHP source for the array literal
 */
function escapeArray( $arr ) {
	$lines = array();
	foreach ( $arr as $key => $val ) {
		$lines[] = escapeLine( $key, $val );
	}

	return "array(\n" . implode( ",\n", $lines ) . "\n)";
}
/**
 * Format one key/value pair as a tab-indented "'key' => 'value'" line.
 *
 * Both sides are passed through escapeSingleString() (defined outside this
 * file) before being placed between single quotes.
 *
 * @param string $key Array key
 * @param string $val Array value
 * @return string One line of PHP array source, without a trailing comma
 */
function escapeLine( $key, $val ) {
	$quotedKey = "'" . escapeSingleString( $key ) . "'";
	$quotedVal = "'" . escapeSingleString( $val ) . "'";

	return "\t" . $quotedKey . ' => ' . $quotedVal;
}

View file

@ -1,6 +1,6 @@
<?php
/**
* Generates normalizer data files for Arabic and Malayalam.
* Generates the normalizer data file for Arabic.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -21,47 +21,43 @@
* @ingroup MaintenanceLanguage
*/
require_once __DIR__ . '/../../includes/normal/UtfNormalUtil.php';
require_once __DIR__ . '/../Maintenance.php';
/**
* Generates normalizer data files for Arabic and Malayalam.
* Generates the normalizer data file for Arabic.
* For NFC see includes/normal.
*
* @ingroup MaintenanceLanguage
*/
class GenerateNormalizerData extends Maintenance {
public $dataFile;
class GenerateNormalizerDataAr extends Maintenance {
public function __construct() {
parent::__construct();
$this->mDescription = 'Generate the normalizer data file for Arabic';
$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
}
public function getDbType() {
return Maintenance::DB_NONE;
}
public function execute() {
if ( !$this->hasOption( 'unicode-data-file' ) ) {
$this->dataFile = 'UnicodeData.txt';
if ( !file_exists( $this->dataFile ) ) {
$dataFile = 'UnicodeData.txt';
if ( !file_exists( $dataFile ) ) {
$this->error( "Unable to find UnicodeData.txt. Please specify " .
"its location with --unicode-data-file=<FILE>" );
exit( 1 );
}
} else {
$this->dataFile = $this->getOption( 'unicode-data-file' );
if ( !file_exists( $this->dataFile ) ) {
$dataFile = $this->getOption( 'unicode-data-file' );
if ( !file_exists( $dataFile ) ) {
$this->error( 'Unable to find the specified data file.' );
exit( 1 );
}
}
$this->generateArabic();
$this->generateMalayalam();
}
function generateArabic() {
$file = fopen( $this->dataFile, 'r' );
$file = fopen( $dataFile, 'r' );
if ( !$file ) {
$this->error( 'Unable to open the data file.' );
exit( 1 );
@ -75,7 +71,9 @@ class GenerateNormalizerData extends Maintenance {
'Canonical_Combining_Class',
'Bidi_Class',
'Decomposition_Type_Mapping',
'Numeric_Type_Value',
'Numeric_Type_Value_6',
'Numeric_Type_Value_7',
'Numeric_Type_Value_8',
'Bidi_Mirrored',
'Unicode_1_Name',
'ISO_Comment',
@ -129,32 +127,7 @@ class GenerateNormalizerData extends Maintenance {
file_put_contents( "$IP/serialized/normalize-ar.ser", serialize( $pairs ) );
echo "ar: " . count( $pairs ) . " pairs written.\n";
}
function generateMalayalam() {
$hexPairs = array(
# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
'0D23 0D4D 200D' => '0D7A',
'0D28 0D4D 200D' => '0D7B',
'0D30 0D4D 200D' => '0D7C',
'0D32 0D4D 200D' => '0D7D',
'0D33 0D4D 200D' => '0D7E',
# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
'0D15 0D4D 200D' => '0D7F',
);
$pairs = array();
foreach ( $hexPairs as $hexSource => $hexDest ) {
$source = hexSequenceToUtf8( $hexSource );
$dest = hexSequenceToUtf8( $hexDest );
$pairs[$source] = $dest;
}
global $IP;
file_put_contents( "$IP/serialized/normalize-ml.ser", serialize( $pairs ) );
echo "ml: " . count( $pairs ) . " pairs written.\n";
}
}
$maintClass = 'GenerateNormalizerData';
$maintClass = 'GenerateNormalizerDataAr';
require_once RUN_MAINTENANCE_IF_MAIN;

View file

@ -0,0 +1,69 @@
<?php
/**
* Generates the normalizer data file for Malayalam.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup MaintenanceLanguage
*/
require_once __DIR__ . '/../Maintenance.php';
/**
* Generates the normalizer data file for Malayalam.
* For NFC see includes/normal.
*
* @ingroup MaintenanceLanguage
*/
class GenerateNormalizerDataMl extends Maintenance {
	/**
	 * Set the description shown for this script.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Generate the normalizer data file for Malayalam';
	}

	/**
	 * @return int Maintenance::DB_NONE
	 */
	public function getDbType() {
		return Maintenance::DB_NONE;
	}

	/**
	 * Convert the hard-coded hex sequence table to UTF-8 and write it,
	 * serialized, to $IP/serialized/normalize-ml.ser.
	 */
	public function execute() {
		global $IP;

		// Keys are space-separated hex code point sequences; values are the
		// single code point each sequence is replaced with.
		$replacements = array(
			// From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',
			// From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$utf8Map = array();
		foreach ( $replacements as $fromHex => $toHex ) {
			$from = hexSequenceToUtf8( $fromHex );
			$utf8Map[$from] = hexSequenceToUtf8( $toHex );
		}

		$target = $IP . '/serialized/normalize-ml.ser';
		file_put_contents( $target, serialize( $utf8Map ) );

		echo 'ml: ' . count( $utf8Map ) . " pairs written.\n";
	}
}
// Name the class defined in this file, then include the file named by the
// RUN_MAINTENANCE_IF_MAIN constant. NOTE(review): presumably that file reads
// $maintClass and runs the script when invoked directly — confirm in
// Maintenance.php.
$maintClass = 'GenerateNormalizerDataMl';
require_once RUN_MAINTENANCE_IF_MAIN;

View file

@ -0,0 +1,129 @@
<?php
/**
* Generates Utf8Case.ser from the Unicode Character Database and
* supplementary files.
*
* Copyright © 2004, 2008 Brion Vibber <brion@pobox.com>
* http://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup MaintenanceLanguage
*/
require_once __DIR__ . '/../Maintenance.php';
/**
* Generates Utf8Case.ser from the Unicode Character Database and
* supplementary files.
*
* @ingroup MaintenanceLanguage
*/
class GenerateUtf8Case extends Maintenance {
	/**
	 * Set the script description and register the optional
	 * --unicode-data-file option.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Generate Utf8Case.ser from the Unicode Character Database ' .
			'and supplementary files';
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * @return int Maintenance::DB_NONE
	 */
	public function getDbType() {
		return Maintenance::DB_NONE;
	}

	/**
	 * Read UnicodeData.txt and write the simple upper/lowercase mappings,
	 * keyed by the UTF-8 form of the source character, to
	 * $IP/serialized/Utf8Case.ser as a serialized array with the keys
	 * 'wikiUpperChars' and 'wikiLowerChars'.
	 *
	 * Exits with status 1 if the data file cannot be found or opened, or if
	 * the output file cannot be written.
	 */
	public function execute() {
		if ( !$this->hasOption( 'unicode-data-file' ) ) {
			// No option given: fall back to the current working directory.
			$dataFile = 'UnicodeData.txt';
			if ( !file_exists( $dataFile ) ) {
				$this->error( "Unable to find UnicodeData.txt. Please specify " .
					"its location with --unicode-data-file=<FILE>" );
				exit( 1 );
			}
		} else {
			$dataFile = $this->getOption( 'unicode-data-file' );
			if ( !file_exists( $dataFile ) ) {
				$this->error( 'Unable to find the specified data file.' );
				exit( 1 );
			}
		}

		$file = fopen( $dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		$fieldNames = array(
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value_6',
			'Numeric_Type_Value_7',
			'Numeric_Type_Value_8',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		);
		$fieldCount = count( $fieldNames );

		// NOTE(review): hexSequenceToUtf8() is not defined in this file and only
		// Maintenance.php is required here — confirm it is loaded by the time
		// execute() runs.
		$upper = array();
		$lower = array();
		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				// A truncated row would otherwise raise undefined-offset
				// notices and record a bogus mapping; report and skip it.
				$this->error( "Skipping malformed line $lineNum of $dataFile." );
				continue;
			}
			$data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );

			$source = hexSequenceToUtf8( $data['Code'] );
			if ( $data['Simple_Uppercase_Mapping'] ) {
				$upper[$source] = hexSequenceToUtf8( $data['Simple_Uppercase_Mapping'] );
			}
			if ( $data['Simple_Lowercase_Mapping'] ) {
				$lower[$source] = hexSequenceToUtf8( $data['Simple_Lowercase_Mapping'] );
			}
		}
		// Release the input handle; the original left it open until exit.
		fclose( $file );

		global $IP;
		$outFile = "$IP/serialized/Utf8Case.ser";
		$written = file_put_contents( $outFile, serialize( array(
			'wikiUpperChars' => $upper,
			'wikiLowerChars' => $lower,
		) ) );
		if ( $written === false ) {
			// Do not report success to make when nothing was written.
			$this->error( "Unable to write $outFile." );
			exit( 1 );
		}
	}
}
// Name the class defined in this file, then include the file named by the
// RUN_MAINTENANCE_IF_MAIN constant. NOTE(review): presumably that file reads
// $maintClass and runs the script when invoked directly — confirm in
// Maintenance.php.
$maintClass = 'GenerateUtf8Case';
require_once RUN_MAINTENANCE_IF_MAIN;

4
serialized/.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
/UnicodeData.txt
/allkeys.txt
/ucd.all.grouped.xml
/ucd.all.grouped.zip

View file

@ -1,7 +1,7 @@
SPECIAL_TARGETS=Utf8Case.ser
SPECIAL_TARGETS=Utf8Case.ser normalize-ar.ser normalize-ml.ser first-letters-root.ser
ALL_TARGETS=$(SPECIAL_TARGETS)
DIST_TARGETS=$(SPECIAL_TARGETS)
UNICODE_VERSION=6.0.0
.PHONY: all dist clean
@ -13,6 +13,26 @@ dist: $(DIST_TARGETS)
clean:
rm -f $(ALL_TARGETS)
Utf8Case.ser : ../includes/normal/Utf8Case.php
php serialize.php -o $@ $<
Utf8Case.ser: UnicodeData.txt
php ../maintenance/language/generateUtf8Case.php
normalize-ar.ser: UnicodeData.txt
php ../maintenance/language/generateNormalizerDataAr.php
normalize-ml.ser:
php ../maintenance/language/generateNormalizerDataMl.php
first-letters-root.ser: allkeys.txt ucd.all.grouped.xml
php ../maintenance/language/generateCollationData.php
UnicodeData.txt:
wget http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
allkeys.txt:
wget http://www.unicode.org/Public/UCA/$(UNICODE_VERSION)/allkeys.txt
ucd.all.grouped.xml: ucd.all.grouped.zip
unzip ucd.all.grouped.zip ucd.all.grouped.xml
ucd.all.grouped.zip:
wget http://www.unicode.org/Public/$(UNICODE_VERSION)/ucdxml/ucd.all.grouped.zip

View file

@ -1,95 +0,0 @@
<?php
/**
* Serialize variables found in input file and store the result in the
* specified file.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
*/
// Command-line driver: runs only when the MEDIAWIKI constant is not already
// defined. Otherwise this file only declares the functions below.
if ( !defined( 'MEDIAWIKI' ) ) {
// Set before the require below; presumably read by commandLine.inc
// (no database needed, and -o takes an argument) — confirm there.
$wgNoDBParam = true;
$optionsWithArgs = array( 'o' );
require_once __DIR__ .'/../maintenance/commandLine.inc';
// Kept in a global so unixLineEndings() can report errors through it.
$stderr = fopen( 'php://stderr', 'w' );
// $args and $options are not assigned in this file; presumably populated by
// commandLine.inc — confirm there.
if ( !isset( $args[0] ) ) {
fwrite( $stderr, "No input file specified\n" );
exit( 1 );
}
// On Windows each argument is expanded with glob() here; elsewhere the
// arguments are used as given.
if ( wfIsWindows() ) {
$files = array();
foreach ( $args as $arg ) {
$files = array_merge( $files, glob( $arg ) );
}
if ( !$files ) {
// NOTE(review): only a message is written; execution continues and an
// empty array is serialized. Confirm whether an exit was intended.
fwrite( $stderr, "No files found\n" );
}
} else {
$files = $args;
}
// Write to the file given with -o, or to standard output when it is absent.
if ( isset( $options['o'] ) ) {
$out = fopen( $options['o'], 'wb' );
if ( !$out ) {
fwrite( $stderr, "Unable to open file \"{$options['o']}\" for output\n" );
exit( 1 );
}
} else {
$out = fopen( 'php://stdout', 'wb' );
}
// Merge the variables from every input file; with array_merge() a later file
// overrides an earlier one that defines the same variable name.
$vars = array();
foreach ( $files as $inputFile ) {
$vars = array_merge( $vars, getVars( $inputFile ) );
}
fwrite( $out, serialize( $vars ) );
fclose( $out );
exit( 0 );
}
//----------------------------------------------------------------------------
/**
 * Execute a PHP file inside this function's scope and return the variables
 * that are defined afterwards.
 *
 * The parameter has an unusual name so it is unlikely to clash with a
 * variable set by the included file; it is removed from the result.
 *
 * @param string $_gv_filename Path of the PHP file to include
 * @return array Map of variable name => value; on Windows the values are
 *  passed through unixLineEndings()
 */
function getVars( $_gv_filename ) {
// The include runs in this local scope, so variables it assigns become
// locals of getVars().
require $_gv_filename;
// Must be called before any other local is created, or that local would be
// captured as well.
$vars = get_defined_vars();
unset( $vars['_gv_filename'] );
# Clean up line endings
if ( wfIsWindows() ) {
$vars = unixLineEndings( $vars );
}
return $vars;
}
/**
 * Replace every "\r\n" with "\n" in a string, or recursively in all strings
 * contained in a (nested) array. Array keys are preserved; values that are
 * neither strings nor arrays are returned unchanged.
 *
 * If the nesting goes more than 50 levels deep, an error is written to the
 * global $stderr stream and the script exits with status 2.
 *
 * @param mixed $var Value to convert
 * @return mixed Converted value
 */
function unixLineEndings( $var ) {
	// Current nesting depth, shared across the recursive calls.
	static $recursionLevel = 0;

	if ( $recursionLevel > 50 ) {
		global $stderr;
		fwrite( $stderr, "Error: Recursion limit exceeded. Possible circular reference in array variable.\n" );
		exit( 2 );
	}

	if ( is_string( $var ) ) {
		return str_replace( "\r\n", "\n", $var );
	}

	if ( !is_array( $var ) ) {
		return $var;
	}

	$recursionLevel++;
	$converted = array();
	foreach ( $var as $key => $element ) {
		$converted[$key] = unixLineEndings( $element );
	}
	$recursionLevel--;

	return $converted;
}