* (bug 13615) Update case mappings and normalization to Unicode 5.1.0
Note that case mappings will only be used if mbstring extension is not present. Normalization data files updated to Unicode 5.1.0; passes the automated tests. Seem to have long since lost the script I originally used to generate the Utf8Case.php mapping file, which appears not to have been updated since 2002 or so. :) Made a new one and moved it into the UtfNormal sub-library. Note a couple limitations: * Case mapping (still) uses only the 1:1 simple mappings. Any full or locale-specific mappings are ignored. * These case mappings are not used anyway when the PHP mbstring extension is available; mbstring's case conversion functions are used instead, with whatever version of Unicode support and whatever complex mapping support they may or may not have. * The generated Utf8Case.php file is not used directly -- you must also regenerate the serialized version in the 'serialized' directory after updating it to a new Unicode version.
This commit is contained in:
parent
4b4ae60c52
commit
c012a63d95
9 changed files with 2202 additions and 1510 deletions
|
|
@ -258,6 +258,8 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
|
|||
* (bug 13949) Special:PrefixIndex/AllPages paging links contain invalid XML
|
||||
* (bug 13770) Use Preprocessor_Hash by default to avoid missing DOM module errors
|
||||
* (bug 13982) Disable ccmeonemails preference when user-to-user mails disabled
|
||||
* (bug 13615) Update case mappings and normalization to Unicode 5.1.0
|
||||
Note that case mappings will only be used if mbstring extension is not present.
|
||||
|
||||
|
||||
=== API changes in 1.13 ===
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -5,8 +5,8 @@
|
|||
## when the data was generated from a previous version.
|
||||
#BASE=http://www.unicode.org/Public/UNIDATA
|
||||
|
||||
# Explicitly using Unicode 5.0
|
||||
BASE=http://www.unicode.org/Public/5.0.0/ucd
|
||||
# Explicitly using Unicode 5.1
|
||||
BASE=http://www.unicode.org/Public/5.1.0/ucd
|
||||
|
||||
# Can override to php-cli or php5 or whatevah
|
||||
PHP=php
|
||||
|
|
@ -16,11 +16,14 @@ PHP=php
|
|||
FETCH=wget
|
||||
#FETCH=fetch
|
||||
|
||||
all : UtfNormalData.inc
|
||||
all : UtfNormalData.inc Utf8Case.php
|
||||
|
||||
UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
|
||||
$(PHP) UtfNormalGenerate.php
|
||||
|
||||
Utf8Case.php : Utf8CaseGenerate.php UtfNormalUtil.php UnicodeData.txt
|
||||
$(PHP) Utf8CaseGenerate.php
|
||||
|
||||
test : testutf8 testclean UtfNormalTest.php UtfNormalData.inc NormalizationTest.txt
|
||||
$(PHP) UtfNormalTest.php
|
||||
|
||||
|
|
|
|||
2078
includes/normal/Utf8Case.php
Normal file
2078
includes/normal/Utf8Case.php
Normal file
File diff suppressed because it is too large
Load diff
112
includes/normal/Utf8CaseGenerate.php
Normal file
112
includes/normal/Utf8CaseGenerate.php
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
<?php
|
||||
# Copyright (C) 2004,2008 Brion Vibber <brion@pobox.com>
|
||||
# http://www.mediawiki.org/
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
# http://www.gnu.org/copyleft/gpl.html
|
||||
|
||||
/**
|
||||
* This script generates Utf8Case.php from the Unicode Character Database
|
||||
* and supplementary files.
|
||||
*
|
||||
* @addtogroup UtfNormal
|
||||
* @access private
|
||||
*/
|
||||
|
||||
/** */
|
||||
|
||||
# Command-line tool only: bail out under any other SAPI.
if( 'cli' != php_sapi_name() ) {
	exit( "Run me from the command line please.\n" );
}

require_once 'UtfNormalUtil.php';

# The Unicode Character Database file is expected in the current directory.
$in = fopen( "UnicodeData.txt", "rt" );
if( !$in ) {
	echo "Can't open UnicodeData.txt for reading.\n",
		"If necessary, fetch this file from the internet:\n",
		"http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit(-1);
}
|
||||
# Simple 1:1 case mappings, keyed by the UTF-8 encoding of the source
# character; the value is the UTF-8 encoding of the mapped character.
$wikiUpperChars = array();
$wikiLowerChars = array();

print "Reading character definitions...\n";
while( false !== ( $line = fgets( $in ) ) ) {
	# Records are semicolon-delimited. explode() is used rather than the
	# POSIX-regex split(), which was deprecated in PHP 5.3 and removed in
	# PHP 7; the delimiter is a literal string so no regex is needed.
	$columns = explode( ';', $line );
	if( count( $columns ) < 14 ) {
		# Blank or truncated line: the mapping fields read below are absent.
		continue;
	}
	$codepoint = $columns[0];
	$simpleUpper = $columns[12];
	$simpleLower = $columns[13];

	# Code points and mappings are hexadecimal strings in the data file.
	$source = codepointToUtf8( hexdec( $codepoint ) );
	if( $simpleUpper ) {
		$wikiUpperChars[$source] = codepointToUtf8( hexdec( $simpleUpper ) );
	}
	if( $simpleLower ) {
		$wikiLowerChars[$source] = codepointToUtf8( hexdec( $simpleLower ) );
	}
}
fclose( $in );
|
||||
|
||||
# Emit the collected mappings as a PHP source file defining
# $wikiUpperChars and $wikiLowerChars.
$out = fopen( "Utf8Case.php", "wt" );
if( !$out ) {
	print "Can't create file Utf8Case.php\n";
	exit(-1);
}

$outUpperChars = escapeArray( $wikiUpperChars );
$outLowerChars = escapeArray( $wikiLowerChars );

# The opening tag is split in two so this literal never contains a real
# PHP open tag.
$outdata = "<" . "?php
/**
 * Simple 1:1 upper/lowercase switching arrays for utf-8 text
 * Won't get context-sensitive things yet
 *
 * Hack for bugs in ucfirst() and company
 *
 * These are pulled from memcached if possible, as this is faster than filling
 * up a big array manually.
 * @addtogroup Language
 */

/*
 * Translation array to get upper case character
 */

\$wikiUpperChars = $outUpperChars;

/*
 * Translation array to get lower case character
 */
\$wikiLowerChars = $outLowerChars;\n";

# Report success only if the whole file was written; a failed or short
# write would otherwise leave a truncated Utf8Case.php behind unnoticed.
$written = fputs( $out, $outdata );
fclose( $out );
if( $written === false || $written < strlen( $outdata ) ) {
	print "Can't write to file Utf8Case.php\n";
	exit(-1);
}
print "Wrote out Utf8Case.php\n";
|
||||
|
||||
|
||||
/**
 * Render an array as the source text of a PHP array() literal, with one
 * escapeLine()-formatted "key => value" entry per line.
 *
 * @param array $arr Map to render
 * @return string PHP source for the array literal (no trailing semicolon)
 */
function escapeArray( $arr ) {
	$entries = array();
	foreach( $arr as $key => $val ) {
		$entries[] = escapeLine( $key, $val );
	}
	return "array(\n" . implode( ",\n", $entries ) . "\n)";
}
|
||||
|
||||
/**
 * Format one array entry as a tab-indented, single-quoted "'key' => 'value'"
 * pair, passing both sides through escapeSingleString().
 *
 * @param string $key Array key
 * @param string $val Array value
 * @return string One line of PHP array-literal source, without separator
 */
function escapeLine( $key, $val ) {
	return "\t'" . escapeSingleString( $key ) . "' => '" . escapeSingleString( $val ) . "'";
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -21,7 +21,7 @@ dist: $(DIST_TARGETS)
|
|||
clean:
|
||||
rm -f $(ALL_TARGETS)
|
||||
|
||||
Utf8Case.ser : ../includes/Utf8Case.php
|
||||
Utf8Case.ser : ../includes/normal/Utf8Case.php
|
||||
php serialize.php -o $@ $<
|
||||
|
||||
Messages%.ser : ../languages/messages/Messages%.php ../languages/messages/MessagesEn.php
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue