Merge "Generate Utf8Case.ser directly from UnicodeData.txt"

2014-01-12 03:10:50 +00:00 · 2014-01-12 03:10:50 +00:00 · 12238a3704
commit 12238a3704
parent 920a45e654 74557dedd0
10 changed files with 251 additions and 2368 deletions
--- a/includes/normal/.gitignore
+++ b/includes/normal/.gitignore
@ -0,0 +1,7 @@
+/CompositionExclusions.txt
+/DerivedNormalizationProps.txt
+/NormalizationCorrections.txt
+/NormalizationTest.txt
+/UTF-8-test.txt
+/UnicodeData.txt
+/testdata
--- a/includes/normal/Makefile
+++ b/includes/normal/Makefile
@ -16,14 +16,11 @@ PHP=php
 FETCH=wget
 #FETCH=fetch

-all : UtfNormalData.inc Utf8Case.php
+all : UtfNormalData.inc

 UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
 	$(PHP) UtfNormalGenerate.php

-Utf8Case.php : Utf8CaseGenerate.php UtfNormalUtil.php UnicodeData.txt
-	$(PHP) Utf8CaseGenerate.php
-
 test : testutf8 UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
 	$(PHP) UtfNormalTest.php

--- a/includes/normal/Utf8Case.php
+++ b/includes/normal/Utf8Case.php
--- a/includes/normal/Utf8CaseGenerate.php
+++ b/includes/normal/Utf8CaseGenerate.php
@ -1,112 +0,0 @@
-<?php
-/**
- * This script generates Utf8Case.php from the Unicode Character Database
- * and supplementary files.
- *
- * Copyright © 2004,2008 Brion Vibber <brion@pobox.com>
- * http://www.mediawiki.org/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- * @ingroup UtfNormal
- */
-
-if( PHP_SAPI != 'cli' ) {
-	die( "Run me from the command line please.\n" );
-}
-
-require_once 'UtfNormalDefines.php';
-require_once 'UtfNormalUtil.php';
-
-$in = fopen("UnicodeData.txt", "rt" );
-if( !$in ) {
-	print "Can't open UnicodeData.txt for reading.\n";
-	print "If necessary, fetch this file from the internet:\n";
-	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
-	exit(-1);
-}
-$wikiUpperChars = array();
-$wikiLowerChars = array();
-
-print "Reading character definitions...\n";
-while( false !== ($line = fgets( $in ) ) ) {
-	$columns = explode(';', $line);
-	$codepoint = $columns[0];
-	$name = $columns[1];
-	$simpleUpper = $columns[12];
-	$simpleLower = $columns[13];
-
-	$source = codepointToUtf8( hexdec( $codepoint ) );
-	if( $simpleUpper ) {
-		$wikiUpperChars[$source] = codepointToUtf8( hexdec( $simpleUpper ) );
-	}
-	if( $simpleLower ) {
-		$wikiLowerChars[$source] = codepointToUtf8( hexdec( $simpleLower ) );
-	}
-}
-fclose( $in );
-
-$out = fopen( "Utf8Case.php", "wt" );
-if( $out ) {
-	$outUpperChars = escapeArray( $wikiUpperChars );
-	$outLowerChars = escapeArray( $wikiLowerChars );
-	$outdata = "<" . "?php
-/**
- * Simple 1:1 upper/lowercase switching arrays for utf-8 text.
- * Won't get context-sensitive things yet.
- *
- * Hack for bugs in ucfirst() and company
- *
- * These are pulled from memcached if possible, as this is faster than filling
- * up a big array manually.
- *
- * @file
- * @ingroup Language
- */
-
-/**
- * Translation array to get upper case character
- */
-\$wikiUpperChars = $outUpperChars;
-
-/**
- * Translation array to get lower case character
- */
-\$wikiLowerChars = $outLowerChars;\n";
-	fputs( $out, $outdata );
-	fclose( $out );
-	print "Wrote out Utf8Case.php\n";
-} else {
-	print "Can't create file Utf8Case.php\n";
-	exit(-1);
-}
-
-
-function escapeArray( $arr ) {
-	return "array(\n" .
-		implode( ",\n",
-			array_map( "escapeLine",
-				array_keys( $arr ),
-				array_values( $arr ) ) ) .
-		"\n)";
-}
-
-function escapeLine( $key, $val ) {
-	$encKey = escapeSingleString( $key );
-	$encVal = escapeSingleString( $val );
-	return "\t'$encKey' => '$encVal'";
-}
--- a/maintenance/language/generateNormalizerDataAr.php
+++ b/maintenance/language/generateNormalizerDataAr.php
@ -1,6 +1,6 @@
 <?php
 /**
- * Generates normalizer data files for Arabic and Malayalam.
+ * Generates the normalizer data file for Arabic.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
@ -21,47 +21,43 @@
 * @ingroup MaintenanceLanguage
 */

-require_once __DIR__ . '/../../includes/normal/UtfNormalUtil.php';
-
 require_once __DIR__ . '/../Maintenance.php';

 /**
- * Generates normalizer data files for Arabic and Malayalam.
+ * Generates the normalizer data file for Arabic.
 * For NFC see includes/normal.
 *
 * @ingroup MaintenanceLanguage
 */
-class GenerateNormalizerData extends Maintenance {
-	public $dataFile;
-
+class GenerateNormalizerDataAr extends Maintenance {
 	public function __construct() {
 		parent::__construct();
+		$this->mDescription = 'Generate the normalizer data file for Arabic';
 		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
 			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
 	}

+	public function getDbType() {
+		return Maintenance::DB_NONE;
+	}
+
 	public function execute() {
 		if ( !$this->hasOption( 'unicode-data-file' ) ) {
-			$this->dataFile = 'UnicodeData.txt';
-			if ( !file_exists( $this->dataFile ) ) {
+			$dataFile = 'UnicodeData.txt';
+			if ( !file_exists( $dataFile ) ) {
 				$this->error( "Unable to find UnicodeData.txt. Please specify " .
 					"its location with --unicode-data-file=<FILE>" );
 				exit( 1 );
 			}
 		} else {
-			$this->dataFile = $this->getOption( 'unicode-data-file' );
-			if ( !file_exists( $this->dataFile ) ) {
+			$dataFile = $this->getOption( 'unicode-data-file' );
+			if ( !file_exists( $dataFile ) ) {
 				$this->error( 'Unable to find the specified data file.' );
 				exit( 1 );
 			}
 		}

-		$this->generateArabic();
-		$this->generateMalayalam();
-	}
-
-	function generateArabic() {
-		$file = fopen( $this->dataFile, 'r' );
+		$file = fopen( $dataFile, 'r' );
 		if ( !$file ) {
 			$this->error( 'Unable to open the data file.' );
 			exit( 1 );
@ -75,7 +71,9 @@ class GenerateNormalizerData extends Maintenance {
 			'Canonical_Combining_Class',
 			'Bidi_Class',
 			'Decomposition_Type_Mapping',
-			'Numeric_Type_Value',
+			'Numeric_Type_Value_6',
+			'Numeric_Type_Value_7',
+			'Numeric_Type_Value_8',
 			'Bidi_Mirrored',
 			'Unicode_1_Name',
 			'ISO_Comment',
@ -129,32 +127,7 @@ class GenerateNormalizerData extends Maintenance {
 		file_put_contents( "$IP/serialized/normalize-ar.ser", serialize( $pairs ) );
 		echo "ar: " . count( $pairs ) . " pairs written.\n";
 	}
-
-	function generateMalayalam() {
-		$hexPairs = array(
-			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
-			'0D23 0D4D 200D' => '0D7A',
-			'0D28 0D4D 200D' => '0D7B',
-			'0D30 0D4D 200D' => '0D7C',
-			'0D32 0D4D 200D' => '0D7D',
-			'0D33 0D4D 200D' => '0D7E',
-
-			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
-			'0D15 0D4D 200D' => '0D7F',
-		);
-
-		$pairs = array();
-		foreach ( $hexPairs as $hexSource => $hexDest ) {
-			$source = hexSequenceToUtf8( $hexSource );
-			$dest = hexSequenceToUtf8( $hexDest );
-			$pairs[$source] = $dest;
-		}
-
-		global $IP;
-		file_put_contents( "$IP/serialized/normalize-ml.ser", serialize( $pairs ) );
-		echo "ml: " . count( $pairs ) . " pairs written.\n";
-	}
 }

-$maintClass = 'GenerateNormalizerData';
+$maintClass = 'GenerateNormalizerDataAr';
 require_once RUN_MAINTENANCE_IF_MAIN;
--- a/maintenance/language/generateNormalizerDataMl.php
+++ b/maintenance/language/generateNormalizerDataMl.php
@ -0,0 +1,69 @@
+<?php
+/**
+ * Generates the normalizer data file for Malayalam.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup MaintenanceLanguage
+ */
+
+require_once __DIR__ . '/../Maintenance.php';
+
+/**
+ * Generates the normalizer data file for Malayalam.
+ * For NFC see includes/normal.
+ *
+ * @ingroup MaintenanceLanguage
+ */
+class GenerateNormalizerDataMl extends Maintenance {
+	/**
+	 * Set the description of this maintenance script.
+	 */
+	public function __construct() {
+		parent::__construct();
+		$this->mDescription = 'Generate the normalizer data file for Malayalam';
+	}
+
+	/**
+	 * This script only builds a static table and writes a file.
+	 *
+	 * @return int Maintenance::DB_NONE
+	 */
+	public function getDbType() {
+		return Maintenance::DB_NONE;
+	}
+
+	/**
+	 * Build the map from legacy three-codepoint sequences to single chillu
+	 * characters and write it, serialized, to $IP/serialized/normalize-ml.ser.
+	 *
+	 * Exits with status 1 if the output file cannot be written.
+	 */
+	public function execute() {
+		$hexPairs = array(
+			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
+			'0D23 0D4D 200D' => '0D7A',
+			'0D28 0D4D 200D' => '0D7B',
+			'0D30 0D4D 200D' => '0D7C',
+			'0D32 0D4D 200D' => '0D7D',
+			'0D33 0D4D 200D' => '0D7E',
+
+			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
+			'0D15 0D4D 200D' => '0D7F',
+		);
+
+		# NOTE(review): hexSequenceToUtf8() is neither defined nor required in
+		# this file; presumably the Maintenance bootstrap loads it - confirm.
+		$pairs = array();
+		foreach ( $hexPairs as $hexSource => $hexDest ) {
+			$pairs[hexSequenceToUtf8( $hexSource )] = hexSequenceToUtf8( $hexDest );
+		}
+
+		global $IP;
+		$target = "$IP/serialized/normalize-ml.ser";
+		# Do not claim success when the file could not be written.
+		if ( file_put_contents( $target, serialize( $pairs ) ) === false ) {
+			$this->error( "Unable to write $target." );
+			exit( 1 );
+		}
+		echo "ml: " . count( $pairs ) . " pairs written.\n";
+	}
+}
+
+$maintClass = 'GenerateNormalizerDataMl';
+require_once RUN_MAINTENANCE_IF_MAIN;
--- a/maintenance/language/generateUtf8Case.php
+++ b/maintenance/language/generateUtf8Case.php
@ -0,0 +1,129 @@
+<?php
+/**
+ * Generates Utf8Case.ser from the Unicode Character Database and
+ * supplementary files.
+ *
+ * Copyright © 2004, 2008 Brion Vibber <brion@pobox.com>
+ * http://www.mediawiki.org/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup MaintenanceLanguage
+ */
+
+require_once __DIR__ . '/../Maintenance.php';
+
+/**
+ * Generates Utf8Case.ser from the Unicode Character Database and
+ * supplementary files.
+ *
+ * @ingroup MaintenanceLanguage
+ */
+class GenerateUtf8Case extends Maintenance {
+
+	/**
+	 * Set the script description and register the optional
+	 * --unicode-data-file option.
+	 */
+	public function __construct() {
+		parent::__construct();
+		$this->mDescription = 'Generate Utf8Case.ser from the Unicode Character Database ' .
+			'and supplementary files';
+		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
+			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
+	}
+
+	/**
+	 * This script only reads one file and writes another.
+	 *
+	 * @return int Maintenance::DB_NONE
+	 */
+	public function getDbType() {
+		return Maintenance::DB_NONE;
+	}
+
+	/**
+	 * Read UnicodeData.txt, collect the simple upper- and lowercase mapping
+	 * of every code point that has one, and write both tables, serialized,
+	 * to $IP/serialized/Utf8Case.ser under the keys 'wikiUpperChars' and
+	 * 'wikiLowerChars'.
+	 *
+	 * The data file is taken from --unicode-data-file when given, otherwise
+	 * from UnicodeData.txt in the current directory. Exits with status 1 if
+	 * the data file is missing, unreadable or malformed, or if the output
+	 * file cannot be written.
+	 */
+	public function execute() {
+		if ( !$this->hasOption( 'unicode-data-file' ) ) {
+			$dataFile = 'UnicodeData.txt';
+			if ( !file_exists( $dataFile ) ) {
+				$this->error( "Unable to find UnicodeData.txt. Please specify " .
+					"its location with --unicode-data-file=<FILE>" );
+				exit( 1 );
+			}
+		} else {
+			$dataFile = $this->getOption( 'unicode-data-file' );
+			if ( !file_exists( $dataFile ) ) {
+				$this->error( 'Unable to find the specified data file.' );
+				exit( 1 );
+			}
+		}
+
+		$file = fopen( $dataFile, 'r' );
+		if ( !$file ) {
+			$this->error( 'Unable to open the data file.' );
+			exit( 1 );
+		}
+
+		// For the file format, see http://www.unicode.org/reports/tr44/
+		$fieldNames = array(
+			'Code',
+			'Name',
+			'General_Category',
+			'Canonical_Combining_Class',
+			'Bidi_Class',
+			'Decomposition_Type_Mapping',
+			'Numeric_Type_Value_6',
+			'Numeric_Type_Value_7',
+			'Numeric_Type_Value_8',
+			'Bidi_Mirrored',
+			'Unicode_1_Name',
+			'ISO_Comment',
+			'Simple_Uppercase_Mapping',
+			'Simple_Lowercase_Mapping',
+			'Simple_Titlecase_Mapping'
+		);
+		$fieldCount = count( $fieldNames );
+
+		$upper = array();
+		$lower = array();
+
+		$lineNum = 0;
+		while ( false !== ( $line = fgets( $file ) ) ) {
+			++$lineNum;
+
+			# Strip comments
+			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
+			if ( $line === '' ) {
+				continue;
+			}
+
+			# Split fields; a short record would otherwise be read through
+			# undefined offsets and produce garbage mappings.
+			$numberedData = explode( ';', $line );
+			if ( count( $numberedData ) < $fieldCount ) {
+				fclose( $file );
+				$this->error( "Malformed record on line $lineNum of the data file: " .
+					"expected at least $fieldCount fields." );
+				exit( 1 );
+			}
+			$data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );
+
+			# An empty field means the code point has no such mapping.
+			$source = hexSequenceToUtf8( $data['Code'] );
+			if ( $data['Simple_Uppercase_Mapping'] !== '' ) {
+				$upper[$source] = hexSequenceToUtf8( $data['Simple_Uppercase_Mapping'] );
+			}
+			if ( $data['Simple_Lowercase_Mapping'] !== '' ) {
+				$lower[$source] = hexSequenceToUtf8( $data['Simple_Lowercase_Mapping'] );
+			}
+		}
+		fclose( $file );
+
+		global $IP;
+		$target = "$IP/serialized/Utf8Case.ser";
+		$written = file_put_contents( $target, serialize( array(
+			'wikiUpperChars' => $upper,
+			'wikiLowerChars' => $lower,
+		) ) );
+		if ( $written === false ) {
+			$this->error( "Unable to write $target." );
+			exit( 1 );
+		}
+	}
+}
+
+$maintClass = 'GenerateUtf8Case';
+require_once RUN_MAINTENANCE_IF_MAIN;
--- a/serialized/.gitignore
+++ b/serialized/.gitignore
@ -0,0 +1,4 @@
+/UnicodeData.txt
+/allkeys.txt
+/ucd.all.grouped.xml
+/ucd.all.grouped.zip
--- a/serialized/Makefile
+++ b/serialized/Makefile
@ -1,7 +1,7 @@
-
-SPECIAL_TARGETS=Utf8Case.ser
+SPECIAL_TARGETS=Utf8Case.ser normalize-ar.ser normalize-ml.ser first-letters-root.ser
 ALL_TARGETS=$(SPECIAL_TARGETS)
 DIST_TARGETS=$(SPECIAL_TARGETS)
+UNICODE_VERSION=6.0.0

 .PHONY: all dist clean

@ -13,6 +13,26 @@ dist: $(DIST_TARGETS)
 clean:
 	rm -f $(ALL_TARGETS)

-Utf8Case.ser : ../includes/normal/Utf8Case.php
-	php serialize.php -o $@ $<
+Utf8Case.ser: UnicodeData.txt ../maintenance/language/generateUtf8Case.php
+	php ../maintenance/language/generateUtf8Case.php

+normalize-ar.ser: UnicodeData.txt ../maintenance/language/generateNormalizerDataAr.php
+	php ../maintenance/language/generateNormalizerDataAr.php
+
+normalize-ml.ser: ../maintenance/language/generateNormalizerDataMl.php
+	php ../maintenance/language/generateNormalizerDataMl.php
+
+first-letters-root.ser: allkeys.txt ucd.all.grouped.xml ../maintenance/language/generateCollationData.php
+	php ../maintenance/language/generateCollationData.php
+
+UnicodeData.txt:
+	wget http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
+
+allkeys.txt:
+	wget http://www.unicode.org/Public/UCA/$(UNICODE_VERSION)/allkeys.txt
+
+# unzip restores the archived mtime, which can be older than the zip and
+# would make this rule re-run on every build; -o avoids the overwrite prompt.
+ucd.all.grouped.xml: ucd.all.grouped.zip
+	unzip -o ucd.all.grouped.zip ucd.all.grouped.xml && touch $@
+
+ucd.all.grouped.zip:
+	wget http://www.unicode.org/Public/$(UNICODE_VERSION)/ucdxml/ucd.all.grouped.zip
--- a/serialized/serialize.php
+++ b/serialized/serialize.php
@ -1,95 +0,0 @@
-<?php
-/**
- * Serialize variables found in input file and store the result in the
- * specified file.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- *
- * @file
- */
-
-if ( !defined( 'MEDIAWIKI' ) ) {
-	$wgNoDBParam = true;
-	$optionsWithArgs = array( 'o' );
-	require_once __DIR__ .'/../maintenance/commandLine.inc';
-
-	$stderr = fopen( 'php://stderr', 'w' );
-	if ( !isset( $args[0] ) ) {
-		fwrite( $stderr, "No input file specified\n" );
-		exit( 1 );
-	}
-	if ( wfIsWindows() ) {
-		$files = array();
-		foreach ( $args as $arg ) {
-			$files = array_merge( $files, glob( $arg ) );
-		}
-		if ( !$files ) {
-			fwrite( $stderr, "No files found\n" );
-		}
-	} else {
-		$files = $args;
-	}
-
-	if ( isset( $options['o'] ) ) {
-		$out = fopen( $options['o'], 'wb' );
-		if ( !$out ) {
-			fwrite( $stderr, "Unable to open file \"{$options['o']}\" for output\n" );
-			exit( 1 );
-		}
-	} else {
-		$out = fopen( 'php://stdout', 'wb' );
-	}
-
-	$vars = array();
-	foreach ( $files as $inputFile ) {
-		$vars = array_merge( $vars, getVars( $inputFile ) );
-	}
-	fwrite( $out, serialize( $vars ) );
-	fclose( $out );
-	exit( 0 );
-}
-
-//----------------------------------------------------------------------------
-
-function getVars( $_gv_filename ) {
-	require $_gv_filename;
-	$vars = get_defined_vars();
-	unset( $vars['_gv_filename'] );
-
-	# Clean up line endings
-	if ( wfIsWindows() ) {
-		$vars = unixLineEndings( $vars );
-	}
-	return $vars;
-}
-
-function unixLineEndings( $var ) {
-	static $recursionLevel = 0;
-	if ( $recursionLevel > 50 ) {
-		global $stderr;
-		fwrite( $stderr, "Error: Recursion limit exceeded. Possible circular reference in array variable.\n" );
-		exit( 2 );
-	}
-
-	if ( is_array( $var ) ) {
-		++$recursionLevel;
-		$var = array_map( 'unixLineEndings', $var );
-		--$recursionLevel;
-	} elseif ( is_string( $var ) ) {
-		$var = str_replace( "\r\n", "\n", $var );
-	}
-	return $var;
-}