Merge "Use wikimedia/utfnormal library, add backwards-compatibility layer"
This commit is contained in:
commit
b594c133f3
42 changed files with 170 additions and 44342 deletions
|
|
@ -25,6 +25,7 @@
|
|||
"psr/log": "1.0.0",
|
||||
"wikimedia/cdb": "1.0.1",
|
||||
"wikimedia/composer-merge-plugin": "1.0.0",
|
||||
"wikimedia/utfnormal": "1.0.2",
|
||||
"zordius/lightncandy": "0.18"
|
||||
},
|
||||
"require-dev": {
|
||||
|
|
|
|||
|
|
@ -341,7 +341,7 @@ class IcuCollation extends Collation {
|
|||
|
||||
// Check for CJK
|
||||
$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
|
||||
if ( ord( $firstChar ) > 0x7f && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) {
|
||||
if ( ord( $firstChar ) > 0x7f && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $firstChar ) ) ) {
|
||||
return $firstChar;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3984,7 +3984,7 @@ HTML
|
|||
// breaks one of the entities whilst editing.
|
||||
if ( ( substr( $invalue, $i, 1 ) == ";" ) && ( strlen( $hexstring ) <= 6 ) ) {
|
||||
$codepoint = hexdec( $hexstring );
|
||||
$result .= codepointToUtf8( $codepoint );
|
||||
$result .= UtfNormal\Utils::codepointToUtf8( $codepoint );
|
||||
} else {
|
||||
$result .= "&#x" . $hexstring . substr( $invalue, $i, 1 );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -164,7 +164,7 @@ class FeedUtils {
|
|||
$diffText = "<p>Can't load revision $newid</p>";
|
||||
} else {
|
||||
// Diff output fine, clean up any illegal UTF-8
|
||||
$diffText = UtfNormal::cleanUp( $diffText );
|
||||
$diffText = UtfNormal\Validator::cleanUp( $diffText );
|
||||
$diffText = self::applyDiffStyle( $diffText );
|
||||
}
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -865,7 +865,7 @@ class Sanitizer {
|
|||
$value = preg_replace_callback(
|
||||
'/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
|
||||
function ( $matches ) {
|
||||
$cp = utf8ToCodepoint( $matches[0] );
|
||||
$cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
|
||||
if ( $cp === false ) {
|
||||
return '';
|
||||
}
|
||||
|
|
@ -971,7 +971,7 @@ class Sanitizer {
|
|||
// Line continuation
|
||||
return '';
|
||||
} elseif ( $matches[2] !== '' ) {
|
||||
$char = codepointToUtf8( hexdec( $matches[2] ) );
|
||||
$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
|
||||
} elseif ( $matches[3] !== '' ) {
|
||||
$char = $matches[3];
|
||||
} else {
|
||||
|
|
@ -1452,9 +1452,9 @@ class Sanitizer {
|
|||
*/
|
||||
static function decodeChar( $codepoint ) {
|
||||
if ( Sanitizer::validateCodepoint( $codepoint ) ) {
|
||||
return codepointToUtf8( $codepoint );
|
||||
return UtfNormal\Utils::codepointToUtf8( $codepoint );
|
||||
} else {
|
||||
return UTF8_REPLACEMENT;
|
||||
return UtfNormal\Constants::UTF8_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1471,7 +1471,7 @@ class Sanitizer {
|
|||
$name = self::$htmlEntityAliases[$name];
|
||||
}
|
||||
if ( isset( self::$htmlEntities[$name] ) ) {
|
||||
return codepointToUtf8( self::$htmlEntities[$name] );
|
||||
return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$name] );
|
||||
} else {
|
||||
return "&$name;";
|
||||
}
|
||||
|
|
|
|||
|
|
@ -289,7 +289,7 @@ class WebRequest {
|
|||
}
|
||||
} else {
|
||||
global $wgContLang;
|
||||
$data = isset( $wgContLang ) ? $wgContLang->normalize( $data ) : UtfNormal::cleanUp( $data );
|
||||
$data = isset( $wgContLang ) ? $wgContLang->normalize( $data ) : UtfNormal\Validator::cleanUp( $data );
|
||||
}
|
||||
return $data;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -327,7 +327,7 @@ class MWDebug {
|
|||
}
|
||||
$str = $prefix . $str;
|
||||
}
|
||||
self::$debug[] = rtrim( UtfNormal::cleanUp( $str ) );
|
||||
self::$debug[] = rtrim( UtfNormal\Validator::cleanUp( $str ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1140,9 +1140,6 @@ abstract class Installer {
|
|||
* Check the libicu version
|
||||
*/
|
||||
protected function envCheckLibicu() {
|
||||
$utf8 = function_exists( 'utf8_normalize' );
|
||||
$intl = function_exists( 'normalizer_normalize' );
|
||||
|
||||
/**
|
||||
* This needs to be updated to something that the latest libicu
|
||||
* will properly normalize. This normalization was found at
|
||||
|
|
@ -1156,18 +1153,7 @@ abstract class Installer {
|
|||
$useNormalizer = 'php';
|
||||
$needsUpdate = false;
|
||||
|
||||
/**
|
||||
* We're going to prefer the pecl extension here unless
|
||||
* utf8_normalize is more up to date.
|
||||
*/
|
||||
if ( $utf8 ) {
|
||||
$useNormalizer = 'utf8';
|
||||
$utf8 = utf8_normalize( $not_normal_c, UtfNormal::UNORM_NFC );
|
||||
if ( $utf8 !== $normal_c ) {
|
||||
$needsUpdate = true;
|
||||
}
|
||||
}
|
||||
if ( $intl ) {
|
||||
if ( function_exists( 'normalizer_normalize' ) ) {
|
||||
$useNormalizer = 'intl';
|
||||
$intl = normalizer_normalize( $not_normal_c, Normalizer::FORM_C );
|
||||
if ( $intl !== $normal_c ) {
|
||||
|
|
@ -1175,8 +1161,7 @@ abstract class Installer {
|
|||
}
|
||||
}
|
||||
|
||||
// Uses messages 'config-unicode-using-php', 'config-unicode-using-utf8',
|
||||
// 'config-unicode-using-intl'
|
||||
// Uses messages 'config-unicode-using-php' and 'config-unicode-using-intl'
|
||||
if ( $useNormalizer === 'php' ) {
|
||||
$this->showMessage( 'config-unicode-pure-php-warning' );
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -45,7 +45,6 @@
|
|||
"config-env-bad": "The environment has been checked.\nYou cannot install MediaWiki.",
|
||||
"config-env-php": "PHP $1 is installed.",
|
||||
"config-env-hhvm": "HHVM $1 is installed.",
|
||||
"config-unicode-using-utf8": "Using Brion Vibber's utf8_normalize.so for Unicode normalization.",
|
||||
"config-unicode-using-intl": "Using the [http://pecl.php.net/intl intl PECL extension] for Unicode normalization.",
|
||||
"config-unicode-pure-php-warning": "<strong>Warning:</strong> The [http://pecl.php.net/intl intl PECL extension] is not available to handle Unicode normalization, falling back to slow pure-PHP implementation.\nIf you run a high-traffic site, you should read a little on [//www.mediawiki.org/wiki/Special:MyLanguage/Unicode_normalization_considerations Unicode normalization].",
|
||||
"config-unicode-update-warning": "<strong>Warning:</strong> The installed version of the Unicode normalization wrapper uses an older version of [http://site.icu-project.org/ the ICU project's] library.\nYou should [//www.mediawiki.org/wiki/Special:MyLanguage/Unicode_normalization_considerations upgrade] if you are at all concerned about using Unicode.",
|
||||
|
|
|
|||
|
|
@ -63,7 +63,6 @@
|
|||
"config-env-bad": "See also:\n* {{msg-mw|Config-env-good}}",
|
||||
"config-env-php": "Parameters:\n* $1 - the version of PHP that has been installed\nSee also:\n* {{msg-mw|config-env-php-toolow}}",
|
||||
"config-env-hhvm": "Parameters:\n* $1 - the version of HHVM that has been installed",
|
||||
"config-unicode-using-utf8": "Status message in the MediaWiki installer environment checks.",
|
||||
"config-unicode-using-intl": "Status message in the MediaWiki installer environment checks.",
|
||||
"config-unicode-pure-php-warning": "PECL is the name of a group producing standard pieces of software for PHP, and intl is the name of their library handling some aspects of internationalization.",
|
||||
"config-unicode-update-warning": "ICU is a body producing standard software tools for support of Unicode and other internationalization aspects. This message warns the system administrator installing MediaWiki that the server's software is not up-to-date and MediaWiki will have problems handling some characters.",
|
||||
|
|
|
|||
|
|
@ -1,69 +0,0 @@
|
|||
.PHONY : all test testutf8 testclean icutest bench icubench clean distclean
|
||||
|
||||
## Latest greatest version of Unicode
|
||||
## May cause confusion if running test suite from these files
|
||||
## when the data was generated from a previous version.
|
||||
#BASE=http://www.unicode.org/Public/UNIDATA
|
||||
|
||||
# Explicitly using Unicode 6.0
|
||||
BASE=http://www.unicode.org/Public/6.0.0/ucd
|
||||
|
||||
# Can override to php-cli or php5 or whatever
|
||||
PHP=php
|
||||
#PHP=php-cli
|
||||
|
||||
# Some nice tool to grab URLs with
|
||||
FETCH=wget
|
||||
#FETCH=fetch
|
||||
|
||||
all : UtfNormalData.inc
|
||||
|
||||
UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
|
||||
$(PHP) UtfNormalGenerate.php
|
||||
|
||||
bench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/young.txt testdata/bulgakov.txt
|
||||
$(PHP) UtfNormalBench.php
|
||||
|
||||
icutest : UtfNormalData.inc NormalizationTest.txt
|
||||
$(PHP) Utf8Test.php --icu
|
||||
$(PHP) UtfNormalTest.php --icu
|
||||
|
||||
icubench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/young.txt testdata/bulgakov.txt
|
||||
$(PHP) UtfNormalBench.php --icu
|
||||
|
||||
clean :
|
||||
rm -f UtfNormalData.inc UtfNormalDataK.inc
|
||||
|
||||
distclean : clean
|
||||
rm -f CompositionExclusions.txt NormalizationTest.txt NormalizationCorrections.txt UnicodeData.txt DerivedNormalizationProps.txt UTF-8-test.txt
|
||||
|
||||
# The Unicode data files...
|
||||
CompositionExclusions.txt :
|
||||
$(FETCH) $(BASE)/CompositionExclusions.txt
|
||||
|
||||
NormalizationTest.txt :
|
||||
$(FETCH) $(BASE)/NormalizationTest.txt
|
||||
|
||||
NormalizationCorrections.txt :
|
||||
$(FETCH) $(BASE)/NormalizationCorrections.txt
|
||||
|
||||
DerivedNormalizationProps.txt :
|
||||
$(FETCH) $(BASE)/DerivedNormalizationProps.txt
|
||||
|
||||
UnicodeData.txt :
|
||||
$(FETCH) $(BASE)/UnicodeData.txt
|
||||
|
||||
testdata/berlin.txt :
|
||||
mkdir -p testdata && wget -U MediaWiki/test -O testdata/berlin.txt "http://de.wikipedia.org/w/index.php?title=Berlin&oldid=2775712&action=raw"
|
||||
|
||||
testdata/washington.txt :
|
||||
mkdir -p testdata && wget -U MediaWiki/test -O testdata/washington.txt "http://en.wikipedia.org/w/index.php?title=Washington%2C_D.C.&oldid=6370218&action=raw"
|
||||
|
||||
testdata/tokyo.txt :
|
||||
mkdir -p testdata && wget -U MediaWiki/test -O testdata/tokyo.txt "http://ja.wikipedia.org/w/index.php?title=%E6%9D%B1%E4%BA%AC%E9%83%BD&oldid=940880&action=raw"
|
||||
|
||||
testdata/young.txt :
|
||||
mkdir -p testdata && wget -U MediaWiki/test -O testdata/young.txt "http://ko.wikipedia.org/w/index.php?title=%EC%9D%B4%EC%88%98%EC%98%81&oldid=627688&action=raw"
|
||||
|
||||
testdata/bulgakov.txt :
|
||||
mkdir -p testdata && wget -U MediaWiki/test -O testdata/bulgakov.txt "http://ru.wikipedia.org/w/index.php?title=%D0%91%D1%83%D0%BB%D0%B3%D0%B0%D0%BA%D0%BE%D0%B2%2C_%D0%A1%D0%B5%D1%80%D0%B3%D0%B5%D0%B9_%D0%9D%D0%B8%D0%BA%D0%BE%D0%BB%D0%B0%D0%B5%D0%B2%D0%B8%D1%87&oldid=17704&action=raw"
|
||||
|
|
@ -1,59 +0,0 @@
|
|||
This directory contains some Unicode normalization routines. These routines
|
||||
are meant to be reusable in other projects, so I'm not tying them to the
|
||||
MediaWiki utility functions.
|
||||
|
||||
The main function to care about is UtfNormal::toNFC(); this will convert
|
||||
a given UTF-8 string to Normalization Form C if it's not already such.
|
||||
The function assumes that the input string is already valid UTF-8; if there
|
||||
are corrupt characters this may produce erroneous results.
|
||||
|
||||
To also check for illegal characters, use UtfNormal::cleanUp(). This will
|
||||
strip illegal UTF-8 sequences and characters that are illegal in XML, and
|
||||
if necessary convert to normalization form C.
|
||||
|
||||
Performance is kind of stinky in absolute terms, though it should be speedy
|
||||
on pure ASCII text. ;) On text that can be determined quickly to already be
|
||||
in NFC it's not too awful but it can quickly get uncomfortably slow,
|
||||
particularly for Korean text (the hangul decomposition/composition code is
|
||||
extra slow).
|
||||
|
||||
|
||||
== Regenerating data tables ==
|
||||
|
||||
UtfNormalData.inc and UtfNormalDataK.inc are generated from the Unicode
|
||||
Character Database by the script UtfNormalGenerate.php. On a *nix system
|
||||
'make' should fetch the necessary files and regenerate them if the scripts
|
||||
have been changed or you remove them.
|
||||
|
||||
|
||||
== Testing ==
|
||||
|
||||
'make test' will run the conformance test (UtfNormalTest.php), fetching the
|
||||
data from the net if necessary. If it reports failure, something is
|
||||
going wrong!
|
||||
|
||||
You may have to set up PHPUnit first.
|
||||
|
||||
$ pear channel-discover pear.phpunit.de
|
||||
$ pear install phpunit/PHPUnit
|
||||
|
||||
== Benchmarks ==
|
||||
|
||||
Run 'make bench' to download some sample texts from Wikipedia and run some
|
||||
cheap benchmarks of some of the functions. Take all numbers with large
|
||||
grains of salt.
|
||||
|
||||
|
||||
== PHP module extension ==
|
||||
|
||||
There's an experimental PHP extension module which wraps the ICU library's
|
||||
normalization functions. This is *MUCH* faster than doing this work in pure
|
||||
PHP code. This is at https://git.wikimedia.org/summary/mediawiki%2Fextensions%2Fnormal.git.
|
||||
It is used by the WMF, which currently runs PHP 5.3.10 on Linux. It hasn't been
|
||||
thoroughly tested on other configurations, but may work.
|
||||
|
||||
If the php_normal.so module is loaded in php.ini, the normalization functions
|
||||
will automatically use it. If you can't (or don't want to) load it in php.ini,
|
||||
you may be able to load it using the dl() function before the inclusion of
|
||||
UtfNormal.php, and it will be picked up.
|
||||
|
||||
|
|
@ -1,102 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* Test feeds random 16-byte strings to both the pure PHP and ICU-based
|
||||
* UtfNormal::cleanUp() code paths, and checks to see if there's a
|
||||
* difference. Will run forever until it finds one or you kill it.
|
||||
*
|
||||
* Copyright (C) 2004 Brion Vibber <brion@pobox.com>
|
||||
* https://www.mediawiki.org/
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup UtfNormal
|
||||
*/
|
||||
|
||||
if ( PHP_SAPI != 'cli' ) {
|
||||
die( "Run me from the command line please.\n" );
|
||||
}
|
||||
|
||||
/** */
|
||||
require_once 'UtfNormal.php';
|
||||
require_once '../diff/DifferenceEngine.php';
|
||||
|
||||
dl( 'php_utfnormal.so' );
|
||||
|
||||
# mt_srand( 99999 );
|
||||
|
||||
function randomString( $length, $nullOk, $ascii = false ) {
|
||||
$out = '';
|
||||
for ( $i = 0; $i < $length; $i++ )
|
||||
$out .= chr( mt_rand( $nullOk ? 0 : 1, $ascii ? 127 : 255 ) );
|
||||
|
||||
return $out;
|
||||
}
|
||||
|
||||
/* Duplicate of the cleanUp() path for ICU usage */
|
||||
function donorm( $str ) {
|
||||
# We exclude a few chars that ICU would not.
|
||||
$str = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $str );
|
||||
$str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $str );
|
||||
$str = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $str );
|
||||
|
||||
# UnicodeString constructor fails if the string ends with a head byte.
|
||||
# Add a junk char at the end, we'll strip it off
|
||||
return rtrim( utf8_normalize( $str . "\x01", UtfNormal::UNORM_NFC ), "\x01" );
|
||||
}
|
||||
|
||||
function showDiffs( $a, $b ) {
|
||||
$ota = explode( "\n", str_replace( "\r\n", "\n", $a ) );
|
||||
$nta = explode( "\n", str_replace( "\r\n", "\n", $b ) );
|
||||
|
||||
$diffs = new Diff( $ota, $nta );
|
||||
$formatter = new TableDiffFormatter();
|
||||
$funky = $formatter->format( $diffs );
|
||||
$matches = array();
|
||||
preg_match_all( '/<(?:ins|del) class="diffchange">(.*?)<\/(?:ins|del)>/', $funky, $matches );
|
||||
foreach ( $matches[1] as $bit ) {
|
||||
$hex = bin2hex( $bit );
|
||||
echo "\t$hex\n";
|
||||
}
|
||||
}
|
||||
|
||||
$size = 16;
|
||||
$n = 0;
|
||||
while ( true ) {
|
||||
$n++;
|
||||
echo "$n\n";
|
||||
|
||||
$str = randomString( $size, true );
|
||||
$clean = UtfNormal::cleanUp( $str );
|
||||
$norm = donorm( $str );
|
||||
|
||||
echo strlen( $clean ) . ", " . strlen( $norm );
|
||||
if ( $clean == $norm ) {
|
||||
echo " (match)\n";
|
||||
} else {
|
||||
echo " (FAIL)\n";
|
||||
echo "\traw: " . bin2hex( $str ) . "\n" .
|
||||
"\tphp: " . bin2hex( $clean ) . "\n" .
|
||||
"\ticu: " . bin2hex( $norm ) . "\n";
|
||||
echo "\n\tdiffs:\n";
|
||||
showDiffs( $clean, $norm );
|
||||
die();
|
||||
}
|
||||
|
||||
$str = '';
|
||||
$clean = '';
|
||||
$norm = '';
|
||||
}
|
||||
|
|
@ -28,8 +28,7 @@
|
|||
* @defgroup UtfNormal UtfNormal
|
||||
*/
|
||||
|
||||
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
|
||||
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
|
||||
use UtfNormal\Validator;
|
||||
|
||||
/**
|
||||
* Unicode normalization routines for working with UTF-8 strings.
|
||||
|
|
@ -43,28 +42,10 @@ define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
|
|||
*
|
||||
* See description of forms at http://www.unicode.org/reports/tr15/
|
||||
*
|
||||
* @deprecated since 1.25, use UtfNormal\Validator directly
|
||||
* @ingroup UtfNormal
|
||||
*/
|
||||
class UtfNormal {
|
||||
/**
|
||||
* For using the ICU wrapper
|
||||
*/
|
||||
const UNORM_NONE = 1;
|
||||
const UNORM_NFD = 2;
|
||||
const UNORM_NFKD = 3;
|
||||
const UNORM_NFC = 4;
|
||||
const UNORM_NFKC = 5;
|
||||
const UNORM_FCD = 6;
|
||||
const UNORM_DEFAULT = self::UNORM_NFC;
|
||||
|
||||
public static $utfCombiningClass = null;
|
||||
public static $utfCanonicalComp = null;
|
||||
public static $utfCanonicalDecomp = null;
|
||||
|
||||
# Load compatibility decompositions on demand if they are needed.
|
||||
public static $utfCompatibilityDecomp = null;
|
||||
public static $utfCheckNFC;
|
||||
|
||||
/**
|
||||
* The ultimate convenience function! Clean up invalid UTF-8 sequences,
|
||||
* and convert to normal form C, canonical composition.
|
||||
|
|
@ -76,36 +57,7 @@ class UtfNormal {
|
|||
* @return string a clean, shiny, normalized UTF-8 string
|
||||
*/
|
||||
static function cleanUp( $string ) {
|
||||
if ( NORMALIZE_ICU ) {
|
||||
$string = self::replaceForNativeNormalize( $string );
|
||||
|
||||
# UnicodeString constructor fails if the string ends with a
|
||||
# head byte. Add a junk char at the end, we'll strip it off.
|
||||
return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
|
||||
} elseif ( NORMALIZE_INTL ) {
|
||||
$string = self::replaceForNativeNormalize( $string );
|
||||
$norm = normalizer_normalize( $string, Normalizer::FORM_C );
|
||||
if ( $norm === null || $norm === false ) {
|
||||
# normalizer_normalize will either return false or null
|
||||
# (depending on which doc you read) if given an invalid UTF-8 string.
|
||||
# quickIsNFCVerify cleans up invalid sequences.
|
||||
|
||||
if ( UtfNormal::quickIsNFCVerify( $string ) ) {
|
||||
# if that's true, the string is actually already normal.
|
||||
return $string;
|
||||
} else {
|
||||
# Now we are valid but non-normal
|
||||
return normalizer_normalize( $string, Normalizer::FORM_C );
|
||||
}
|
||||
} else {
|
||||
return $norm;
|
||||
}
|
||||
} elseif ( UtfNormal::quickIsNFCVerify( $string ) ) {
|
||||
# Side effect -- $string has had UTF-8 errors cleaned up.
|
||||
return $string;
|
||||
} else {
|
||||
return UtfNormal::NFC( $string );
|
||||
}
|
||||
return Validator::cleanUp( $string );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -117,14 +69,7 @@ class UtfNormal {
|
|||
* @return string a UTF-8 string in normal form C
|
||||
*/
|
||||
static function toNFC( $string ) {
|
||||
if ( NORMALIZE_INTL )
|
||||
return normalizer_normalize( $string, Normalizer::FORM_C );
|
||||
elseif ( NORMALIZE_ICU )
|
||||
return utf8_normalize( $string, self::UNORM_NFC );
|
||||
elseif ( UtfNormal::quickIsNFC( $string ) )
|
||||
return $string;
|
||||
else
|
||||
return UtfNormal::NFC( $string );
|
||||
return Validator::toNFC( $string );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -135,14 +80,7 @@ class UtfNormal {
|
|||
* @return string a UTF-8 string in normal form D
|
||||
*/
|
||||
static function toNFD( $string ) {
|
||||
if ( NORMALIZE_INTL )
|
||||
return normalizer_normalize( $string, Normalizer::FORM_D );
|
||||
elseif ( NORMALIZE_ICU )
|
||||
return utf8_normalize( $string, self::UNORM_NFD );
|
||||
elseif ( preg_match( '/[\x80-\xff]/', $string ) )
|
||||
return UtfNormal::NFD( $string );
|
||||
else
|
||||
return $string;
|
||||
return Validator::toNFD( $string );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -154,14 +92,7 @@ class UtfNormal {
|
|||
* @return string a UTF-8 string in normal form KC
|
||||
*/
|
||||
static function toNFKC( $string ) {
|
||||
if ( NORMALIZE_INTL )
|
||||
return normalizer_normalize( $string, Normalizer::FORM_KC );
|
||||
elseif ( NORMALIZE_ICU )
|
||||
return utf8_normalize( $string, self::UNORM_NFKC );
|
||||
elseif ( preg_match( '/[\x80-\xff]/', $string ) )
|
||||
return UtfNormal::NFKC( $string );
|
||||
else
|
||||
return $string;
|
||||
return Validator::toNFKC( $string );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -173,24 +104,7 @@ class UtfNormal {
|
|||
* @return string a UTF-8 string in normal form KD
|
||||
*/
|
||||
static function toNFKD( $string ) {
|
||||
if ( NORMALIZE_INTL )
|
||||
return normalizer_normalize( $string, Normalizer::FORM_KD );
|
||||
elseif ( NORMALIZE_ICU )
|
||||
return utf8_normalize( $string, self::UNORM_NFKD );
|
||||
elseif ( preg_match( '/[\x80-\xff]/', $string ) )
|
||||
return UtfNormal::NFKD( $string );
|
||||
else
|
||||
return $string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the basic composition data if necessary
|
||||
* @private
|
||||
*/
|
||||
static function loadData() {
|
||||
if ( !isset( self::$utfCombiningClass ) ) {
|
||||
require_once __DIR__ . '/UtfNormalData.inc';
|
||||
}
|
||||
return Validator::toNFKD( $string );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -200,38 +114,7 @@ class UtfNormal {
|
|||
* @return bool
|
||||
*/
|
||||
static function quickIsNFC( $string ) {
|
||||
# ASCII is always valid NFC!
|
||||
# If it's pure ASCII, let it through.
|
||||
if ( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
|
||||
|
||||
UtfNormal::loadData();
|
||||
$len = strlen( $string );
|
||||
for ( $i = 0; $i < $len; $i++ ) {
|
||||
$c = $string[$i];
|
||||
$n = ord( $c );
|
||||
if ( $n < 0x80 ) {
|
||||
continue;
|
||||
} elseif ( $n >= 0xf0 ) {
|
||||
$c = substr( $string, $i, 4 );
|
||||
$i += 3;
|
||||
} elseif ( $n >= 0xe0 ) {
|
||||
$c = substr( $string, $i, 3 );
|
||||
$i += 2;
|
||||
} elseif ( $n >= 0xc0 ) {
|
||||
$c = substr( $string, $i, 2 );
|
||||
$i++;
|
||||
}
|
||||
if ( isset( self::$utfCheckNFC[$c] ) ) {
|
||||
# If it's NO or MAYBE, bail and do the slow check.
|
||||
return false;
|
||||
}
|
||||
if ( isset( self::$utfCombiningClass[$c] ) ) {
|
||||
# Combining character? We might have to do sorting, at least.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return Validator::quickIsNFC( $string );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -241,550 +124,6 @@ class UtfNormal {
|
|||
* @return bool
|
||||
*/
|
||||
static function quickIsNFCVerify( &$string ) {
|
||||
# Screen out some characters that, e.g., won't be allowed in XML
|
||||
$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
|
||||
|
||||
# ASCII is always valid NFC!
|
||||
# If we're only ever given plain ASCII, we can avoid the overhead
|
||||
# of initializing the decomposition tables by skipping out early.
|
||||
if ( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
|
||||
|
||||
static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
|
||||
if ( !isset( $checkit ) ) {
|
||||
# Load/build some scary lookup tables...
|
||||
UtfNormal::loadData();
|
||||
|
||||
$utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
|
||||
|
||||
# Head bytes for sequences which we should do further validity checks
|
||||
$checkit = array_flip( array_map( 'chr',
|
||||
array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
|
||||
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
|
||||
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
|
||||
|
||||
# Each UTF-8 head byte is followed by a certain
|
||||
# number of tail bytes.
|
||||
$tailBytes = array();
|
||||
for ( $n = 0; $n < 256; $n++ ) {
|
||||
if ( $n < 0xc0 ) {
|
||||
$remaining = 0;
|
||||
} elseif ( $n < 0xe0 ) {
|
||||
$remaining = 1;
|
||||
} elseif ( $n < 0xf0 ) {
|
||||
$remaining = 2;
|
||||
} elseif ( $n < 0xf8 ) {
|
||||
$remaining = 3;
|
||||
} elseif ( $n < 0xfc ) {
|
||||
$remaining = 4;
|
||||
} elseif ( $n < 0xfe ) {
|
||||
$remaining = 5;
|
||||
} else {
|
||||
$remaining = 0;
|
||||
}
|
||||
$tailBytes[chr( $n )] = $remaining;
|
||||
}
|
||||
}
|
||||
|
||||
# Chop the text into pure-ASCII and non-ASCII areas;
|
||||
# large ASCII parts can be handled much more quickly.
|
||||
# Don't chop up Unicode areas for punctuation, though,
|
||||
# that wastes energy.
|
||||
$matches = array();
|
||||
preg_match_all(
|
||||
'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
|
||||
$string, $matches );
|
||||
|
||||
$looksNormal = true;
|
||||
$base = 0;
|
||||
$replace = array();
|
||||
foreach ( $matches[1] as $str ) {
|
||||
$chunk = strlen( $str );
|
||||
|
||||
if ( $str[0] < "\x80" ) {
|
||||
# ASCII chunk: guaranteed to be valid UTF-8
|
||||
# and in normal form C, so skip over it.
|
||||
$base += $chunk;
|
||||
continue;
|
||||
}
|
||||
|
||||
# We'll have to examine the chunk byte by byte to ensure
|
||||
# that it consists of valid UTF-8 sequences, and to see
|
||||
# if any of them might not be normalized.
|
||||
#
|
||||
# Since PHP is not the fastest language on earth, some of
|
||||
# this code is a little ugly with inner loop optimizations.
|
||||
|
||||
$head = '';
|
||||
$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
|
||||
|
||||
for ( $i = -1; --$len; ) {
|
||||
$remaining = $tailBytes[$c = $str[++$i]];
|
||||
if ( $remaining ) {
|
||||
# UTF-8 head byte!
|
||||
$sequence = $head = $c;
|
||||
do {
|
||||
# Look for the defined number of tail bytes...
|
||||
if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
|
||||
# Legal tail bytes are nice.
|
||||
$sequence .= $c;
|
||||
} else {
|
||||
if ( 0 == $len ) {
|
||||
# Premature end of string!
|
||||
# Drop a replacement character into output to
|
||||
# represent the invalid UTF-8 sequence.
|
||||
$replace[] = array( UTF8_REPLACEMENT,
|
||||
$base + $i + 1 - strlen( $sequence ),
|
||||
strlen( $sequence ) );
|
||||
break 2;
|
||||
} else {
|
||||
# Illegal tail byte; abandon the sequence.
|
||||
$replace[] = array( UTF8_REPLACEMENT,
|
||||
$base + $i - strlen( $sequence ),
|
||||
strlen( $sequence ) );
|
||||
# Back up and reprocess this byte; it may itself
|
||||
# be a legal ASCII or UTF-8 sequence head.
|
||||
--$i;
|
||||
++$len;
|
||||
continue 2;
|
||||
}
|
||||
}
|
||||
} while ( --$remaining );
|
||||
|
||||
if ( isset( $checkit[$head] ) ) {
|
||||
# Do some more detailed validity checks, for
|
||||
# invalid characters and illegal sequences.
|
||||
if ( $head == "\xed" ) {
|
||||
# 0xed is relatively frequent in Korean, which
|
||||
# abuts the surrogate area, so we're doing
|
||||
# this check separately to speed things up.
|
||||
|
||||
if ( $sequence >= UTF8_SURROGATE_FIRST ) {
|
||||
# Surrogates are legal only in UTF-16 code.
|
||||
# They are totally forbidden here in UTF-8
|
||||
# utopia.
|
||||
$replace[] = array( UTF8_REPLACEMENT,
|
||||
$base + $i + 1 - strlen( $sequence ),
|
||||
strlen( $sequence ) );
|
||||
$head = '';
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
# Slower, but rarer checks...
|
||||
$n = ord( $head );
|
||||
if (
|
||||
# "Overlong sequences" are those that are syntactically
|
||||
# correct but use more UTF-8 bytes than are necessary to
|
||||
# encode a character. Naïve string comparisons can be
|
||||
# tricked into failing to see a match for an ASCII
|
||||
# character, for instance, which can be a security hole
|
||||
# if blacklist checks are being used.
|
||||
( $n < 0xc2 && $sequence <= UTF8_OVERLONG_A )
|
||||
|| ( $n == 0xe0 && $sequence <= UTF8_OVERLONG_B )
|
||||
|| ( $n == 0xf0 && $sequence <= UTF8_OVERLONG_C )
|
||||
|
||||
# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
|
||||
|| ( $n == 0xef &&
|
||||
( $sequence == UTF8_FFFE )
|
||||
|| ( $sequence == UTF8_FFFF ) )
|
||||
|
||||
# Unicode has been limited to 21 bits; longer
|
||||
# sequences are not allowed.
|
||||
|| ( $n >= 0xf0 && $sequence > UTF8_MAX )
|
||||
) {
|
||||
|
||||
$replace[] = array( UTF8_REPLACEMENT,
|
||||
$base + $i + 1 - strlen( $sequence ),
|
||||
strlen( $sequence ) );
|
||||
$head = '';
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( isset( $utfCheckOrCombining[$sequence] ) ) {
|
||||
# If it's NO or MAYBE, we'll have to rip
|
||||
# the string apart and put it back together.
|
||||
# That's going to be mighty slow.
|
||||
$looksNormal = false;
|
||||
}
|
||||
|
||||
# The sequence is legal!
|
||||
$head = '';
|
||||
} elseif ( $c < "\x80" ) {
|
||||
# ASCII byte.
|
||||
$head = '';
|
||||
} elseif ( $c < "\xc0" ) {
|
||||
# Illegal tail bytes
|
||||
if ( $head == '' ) {
|
||||
# Out of the blue!
|
||||
$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
|
||||
} else {
|
||||
# Don't add if we're continuing a broken sequence;
|
||||
# we already put a replacement character when we looked
|
||||
# at the broken sequence.
|
||||
$replace[] = array( '', $base + $i, 1 );
|
||||
}
|
||||
} else {
|
||||
# Miscellaneous freaks.
|
||||
$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
|
||||
$head = '';
|
||||
}
|
||||
}
|
||||
$base += $chunk;
|
||||
}
|
||||
if ( count( $replace ) ) {
|
||||
# There were illegal UTF-8 sequences we need to fix up.
|
||||
$out = '';
|
||||
$last = 0;
|
||||
foreach ( $replace as $rep ) {
|
||||
list( $replacement, $start, $length ) = $rep;
|
||||
if ( $last < $start ) {
|
||||
$out .= substr( $string, $last, $start - $last );
|
||||
}
|
||||
$out .= $replacement;
|
||||
$last = $start + $length;
|
||||
}
|
||||
if ( $last < strlen( $string ) ) {
|
||||
$out .= substr( $string, $last );
|
||||
}
|
||||
$string = $out;
|
||||
}
|
||||
|
||||
return $looksNormal;
|
||||
}
|
||||
|
||||
# These take a string and run the normalization on them, without
|
||||
# checking for validity or any optimization etc. Input must be
|
||||
# VALID UTF-8!
|
||||
/**
|
||||
* @param $string string
|
||||
* @return string
|
||||
* @private
|
||||
*/
|
||||
static function NFC( $string ) {
|
||||
return UtfNormal::fastCompose( UtfNormal::NFD( $string ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* @param $string string
|
||||
* @return string
|
||||
* @private
|
||||
*/
|
||||
static function NFD( $string ) {
|
||||
UtfNormal::loadData();
|
||||
|
||||
return UtfNormal::fastCombiningSort(
|
||||
UtfNormal::fastDecompose( $string, self::$utfCanonicalDecomp ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* @param $string string
|
||||
* @return string
|
||||
* @private
|
||||
*/
|
||||
static function NFKC( $string ) {
|
||||
return UtfNormal::fastCompose( UtfNormal::NFKD( $string ) );
|
||||
}
|
||||
|
||||
/**
 * Convert a string to normal form KD without validating it first.
 *
 * Applies the compatibility decomposition map and then sorts combining
 * characters into canonical order. Input must be valid UTF-8.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFKD( $string ) {
	# The compatibility table lives in its own data file and is only
	# pulled in the first time it is needed.
	if ( !isset( self::$utfCompatibilityDecomp ) ) {
		require_once 'UtfNormalDataK.inc';
	}

	$decomposed = self::fastDecompose( $string, self::$utfCompatibilityDecomp );

	return self::fastCombiningSort( $decomposed );
}
|
||||
|
||||
/**
 * Perform decomposition of a UTF-8 string into either D or KD form
 * (depending on which decomposition map is passed to us).
 * Input is assumed to be *valid* UTF-8. Invalid code will break.
 *
 * Each character is looked up in $map; characters without an entry are
 * copied through unchanged, except for precomposed hangul syllables,
 * which are split into their jamo arithmetically.
 *
 * @private
 * @param string $string valid UTF-8 string
 * @param array $map hash of expanded decomposition map
 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 */
static function fastDecompose( $string, $map ) {
	UtfNormal::loadData();
	$total = strlen( $string );
	$result = '';
	for ( $pos = 0; $pos < $total; $pos++ ) {
		$char = $string[$pos];
		$lead = ord( $char );
		if ( $lead < 0x80 ) {
			# ASCII never decomposes; copy it straight through.
			$result .= $char;
			continue;
		}

		# Work out how many bytes the sequence occupies from its lead byte.
		# Anything below 0xc0 is left as a single byte.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
			$pos += $width - 1;
		}

		if ( isset( $map[$char] ) ) {
			$result .= $map[$char];
		} elseif ( $char >= UTF8_HANGUL_FIRST && $char <= UTF8_HANGUL_LAST ) {
			# Decompose a hangul syllable into jamo, hardcoded for a
			# three-byte UTF-8 sequence. A lookup table would be slightly
			# faster, but adds a lot of memory & disk needs.
			$codepoint = ( ( ord( $char[0] ) & 0x0f ) << 12 )
				| ( ( ord( $char[1] ) & 0x3f ) << 6 )
				| ( ord( $char[2] ) & 0x3f );
			$syllable = $codepoint - UNICODE_HANGUL_FIRST;
			$lead = intval( $syllable / UNICODE_HANGUL_NCOUNT );
			$vowel = intval( ( $syllable % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
			$trail = $syllable % UNICODE_HANGUL_TCOUNT;

			$result .= "\xe1\x84" . chr( 0x80 + $lead ) . "\xe1\x85" . chr( 0xa1 + $vowel );
			# Trailing jamo straddle two second-byte values (0x86 and 0x87);
			# index 0 means there is no trailing consonant at all.
			if ( $trail >= 25 ) {
				$result .= "\xe1\x87" . chr( 0x80 + $trail - 25 );
			} elseif ( $trail ) {
				$result .= "\xe1\x86" . chr( 0xa7 + $trail );
			}
		} else {
			$result .= $char;
		}
	}

	return $result;
}
|
||||
|
||||
/**
 * Sorts combining characters into canonical order. This is the
 * final step in creating decomposed normal forms D and KD.
 *
 * Combining characters following a starter are bucketed by combining
 * class (keeping their encounter order inside each bucket) and flushed
 * in ascending class order when the next starter, or the end of the
 * string, is reached.
 *
 * @private
 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 * @return string a UTF-8 string with combining characters sorted in canonical order
 */
static function fastCombiningSort( $string ) {
	UtfNormal::loadData();
	$total = strlen( $string );
	$sorted = '';
	$pending = array();
	$lastClass = -1;
	for ( $pos = 0; $pos < $total; $pos++ ) {
		$char = $string[$pos];
		$lead = ord( $char );
		if ( $lead >= 0x80 ) {
			# Pull in the whole multi-byte sequence.
			if ( $lead >= 0xf0 ) {
				$width = 4;
			} elseif ( $lead >= 0xe0 ) {
				$width = 3;
			} elseif ( $lead >= 0xc0 ) {
				$width = 2;
			} else {
				$width = 1;
			}
			if ( $width > 1 ) {
				$char = substr( $string, $pos, $width );
				$pos += $width - 1;
			}

			if ( isset( self::$utfCombiningClass[$char] ) ) {
				# A combining character: park it in the bucket for its class.
				$lastClass = self::$utfCombiningClass[$char];
				$pending[$lastClass] = isset( $pending[$lastClass] )
					? $pending[$lastClass] . $char
					: $char;
				continue;
			}
		}

		# A starter: flush whatever combiners were collected before it.
		if ( $lastClass ) {
			ksort( $pending );
			$sorted .= implode( '', $pending );
			$pending = array();
		}
		$sorted .= $char;
		$lastClass = 0;
	}

	# Flush combiners trailing the final starter.
	if ( $lastClass ) {
		ksort( $pending );
		$sorted .= implode( '', $pending );
	}

	return $sorted;
}
|
||||
|
||||
/**
 * Produces canonically composed sequences, i.e. normal form C or KC.
 *
 * The string is walked one UTF-8 sequence at a time. $startChar holds the
 * current starter and $combining the combining characters that could not
 * be merged into it; both are emitted when the next starter is found.
 * Hangul jamo are composed arithmetically rather than via the lookup table.
 *
 * @private
 * @param string $string a valid UTF-8 string in sorted normal form D or KD.
 *   Input is not validated.
 * @return string a UTF-8 string with canonical precomposed characters used
 *   where possible.
 */
static function fastCompose( $string ) {
	UtfNormal::loadData();
	$len = strlen( $string );
	$out = '';
	$lastClass = -1;
	$lastHangul = 0;
	$startChar = '';
	$combining = '';
	# Lead-byte range covering the vowel and trailing jamo, used as a cheap
	# pre-filter before the more expensive string comparisons below.
	$x1 = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
	$x2 = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );
	for ( $i = 0; $i < $len; $i++ ) {
		$c = $string[$i];
		$n = ord( $c );
		if ( $n < 0x80 ) {
			# No combining characters here...
			$out .= $startChar;
			$out .= $combining;
			$startChar = $c;
			$combining = '';
			$lastClass = 0;
			continue;
		} elseif ( $n >= 0xf0 ) {
			$c = substr( $string, $i, 4 );
			$i += 3;
		} elseif ( $n >= 0xe0 ) {
			$c = substr( $string, $i, 3 );
			$i += 2;
		} elseif ( $n >= 0xc0 ) {
			$c = substr( $string, $i, 2 );
			$i++;
		}
		$pair = $startChar . $c;
		if ( $n > 0x80 ) {
			if ( isset( self::$utfCombiningClass[$c] ) ) {
				# A combining char; see what we can do with it
				$class = self::$utfCombiningClass[$c];
				# Compare against '' explicitly: empty() would also treat
				# the starter "0" as missing.
				if ( $startChar !== '' &&
					$lastClass < $class &&
					$class > 0 &&
					isset( self::$utfCanonicalComp[$pair] )
				) {
					$startChar = self::$utfCanonicalComp[$pair];
					$class = 0;
				} else {
					$combining .= $c;
				}
				$lastClass = $class;
				$lastHangul = 0;
				continue;
			}
		}
		# New start char
		if ( $lastClass == 0 ) {
			if ( isset( self::$utfCanonicalComp[$pair] ) ) {
				$startChar = self::$utfCanonicalComp[$pair];
				$lastHangul = 0;
				continue;
			}
			if ( $n >= $x1 && $n <= $x2 ) {
				# WARNING: Hangul code is painfully slow.
				# The arithmetic is inlined because calling out to helper
				# functions is slower still, and lookup tables are only
				# marginally faster while requiring a lot of space.
				if ( $c >= UTF8_HANGUL_VBASE &&
					$c <= UTF8_HANGUL_VEND &&
					$startChar >= UTF8_HANGUL_LBASE &&
					$startChar <= UTF8_HANGUL_LEND
				) {
					# Leading consonant + vowel => LV syllable.
					# The indexes are read straight from the third UTF-8
					# byte instead of decoding the full code points.
					$lIndex = ord( $startChar[2] ) - 0x80;
					$vIndex = ord( $c[2] ) - 0xa1;

					$hangulPoint = UNICODE_HANGUL_FIRST +
						UNICODE_HANGUL_TCOUNT *
						( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

					# Hardcode the limited-range UTF-8 conversion:
					$startChar = chr( ( ( $hangulPoint >> 12 ) & 0x0f ) | 0xe0 ) .
						chr( ( ( $hangulPoint >> 6 ) & 0x3f ) | 0x80 ) .
						chr( ( $hangulPoint & 0x3f ) | 0x80 );
					$lastHangul = 0;
					continue;
				} elseif ( $c > UTF8_HANGUL_TBASE &&
					$c <= UTF8_HANGUL_TEND &&
					$startChar >= UTF8_HANGUL_FIRST &&
					$startChar <= UTF8_HANGUL_LAST &&
					!$lastHangul
				) {
					# LV syllable + trailing consonant => LVT syllable.
					# The lower bound is exclusive: TBASE itself is trailing
					# index 0 ("no trailing consonant"), so accepting it
					# would leave the syllable unchanged and silently drop
					# the character from the output.
					$tIndex = ord( $c[2] ) - 0xa7;
					if ( $tIndex < 0 ) {
						# Trailing jamo from U+11C0 up have a different
						# second byte, so the third byte restarts at 0x80.
						$tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
					}

					# Increment the code point by $tIndex, without
					# the function overhead of decoding and recoding UTF-8
					$tail = ord( $startChar[2] ) + $tIndex;
					if ( $tail > 0xbf ) {
						# Carry into the middle byte, and from there into
						# the lead byte if that overflows as well.
						$tail -= 0x40;
						$mid = ord( $startChar[1] ) + 1;
						if ( $mid > 0xbf ) {
							$startChar[0] = chr( ord( $startChar[0] ) + 1 );
							$mid -= 0x40;
						}
						$startChar[1] = chr( $mid );
					}
					$startChar[2] = chr( $tail );

					# If there's another jamo char after this, *don't* try to merge it.
					$lastHangul = 1;
					continue;
				}
			}
		}
		# Nothing composed: emit the previous cluster and start a new one.
		$out .= $startChar;
		$out .= $combining;
		$startChar = $c;
		$combining = '';
		$lastClass = 0;
		$lastHangul = 0;
	}
	$out .= $startChar . $combining;

	return $out;
}
|
||||
|
||||
/**
 * This is just used for the benchmark, comparing how long it takes to
 * iterate through a string without really doing anything of substance.
 *
 * @param $string string
 * @return string a byte-for-byte copy of the input
 */
static function placebo( $string ) {
	$copy = '';
	$total = strlen( $string );
	$offset = 0;
	while ( $offset < $total ) {
		$copy .= $string[$offset];
		$offset++;
	}

	return $copy;
}
|
||||
|
||||
/**
 * Function to replace some characters that we don't want
 * but most of the native normalize functions keep.
 *
 * ASCII control characters other than tab, line feed and carriage return,
 * as well as U+FFFE and U+FFFF, are each swapped for the replacement
 * character.
 *
 * @param string $string The string
 * @return String String with the character codes replaced.
 */
private static function replaceForNativeNormalize( $string ) {
	# The pattern is byte-oriented; that is safe because these byte values
	# only ever occur as standalone ASCII characters in UTF-8.
	$string = preg_replace(
		'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
		UTF8_REPLACEMENT,
		$string );
	$string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
	$string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );

	return $string;
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,105 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* Approximate benchmark for some basic operations.
|
||||
*
|
||||
* Copyright © 2004 Brion Vibber <brion@pobox.com>
|
||||
* https://www.mediawiki.org/
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup UtfNormal
|
||||
*/
|
||||
|
||||
if ( PHP_SAPI != 'cli' ) {
|
||||
die( "Run me from the command line please.\n" );
|
||||
}
|
||||
|
||||
if ( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
|
||||
dl( 'php_utfnormal.so' );
|
||||
}
|
||||
|
||||
require_once 'UtfNormalDefines.php';
|
||||
require_once 'UtfNormalUtil.php';
|
||||
require_once 'UtfNormal.php';
|
||||
|
||||
define( 'BENCH_CYCLES', 5 );
|
||||
|
||||
$testfiles = array(
|
||||
'testdata/washington.txt' => 'English text',
|
||||
'testdata/berlin.txt' => 'German text',
|
||||
'testdata/bulgakov.txt' => 'Russian text',
|
||||
'testdata/tokyo.txt' => 'Japanese text',
|
||||
'testdata/young.txt' => 'Korean text'
|
||||
);
|
||||
$normalizer = new UtfNormal;
|
||||
UtfNormal::loadData();
|
||||
foreach ( $testfiles as $file => $desc ) {
|
||||
benchmarkTest( $normalizer, $file, $desc );
|
||||
}
|
||||
|
||||
# -------
|
||||
|
||||
/**
 * Run the configured set of benchmark forms against one test file.
 *
 * @param object &$u Normalizer instance whose methods are benchmarked
 * @param string $filename Path of the text file to read
 * @param string $desc Human-readable description printed alongside the name
 */
function benchmarkTest( &$u, $filename, $desc ) {
	print "Testing $filename ($desc)...\n";
	$data = file_get_contents( $filename );
	if ( $data === false ) {
		# Benchmarking a failed read would only produce meaningless numbers.
		print "Can't read $filename; skipping.\n";

		return;
	}

	# Commented-out entries are alternative forms that can be switched on
	# by hand. An array entry is a pipeline: each step is fed the output
	# of the previous one.
	$forms = array(
#		'placebo',
		'cleanUp',
		'toNFC',
#		'toNFKC',
#		'toNFD', 'toNFKD',
		'NFC',
#		'NFKC',
#		'NFD', 'NFKD',
		array( 'fastDecompose', 'fastCombiningSort', 'fastCompose' ),
#		'quickIsNFC', 'quickIsNFCVerify',
	);

	foreach ( $forms as $form ) {
		if ( is_array( $form ) ) {
			$str = $data;
			foreach ( $form as $step ) {
				$str = benchmarkForm( $u, $str, $step );
			}
		} else {
			benchmarkForm( $u, $data, $form );
		}
	}
}
|
||||
|
||||
/**
 * Time one normalizer method over the given data and print a result line.
 *
 * The method is invoked BENCH_CYCLES times and the fastest run is
 * reported, together with the throughput and whether the output differs
 * from the input.
 *
 * @param object &$u Normalizer instance to call the method on
 * @param string &$data Input text handed to the method
 * @param string $form Name of the method to call
 * @return mixed Whatever the last invocation of the method returned
 */
function benchmarkForm( &$u, &$data, $form ) {
	$deltas = array();
	$out = null;
	for ( $i = 0; $i < BENCH_CYCLES; $i++ ) {
		$start = microtime( true );
		$out = $u->$form( $data, UtfNormal::$utfCanonicalDecomp );
		$deltas[] = ( microtime( true ) - $start );
	}

	# Take shortest time
	$delta = $deltas ? min( $deltas ) : 0.0;

	# A run can finish within the timer's resolution; avoid dividing by zero.
	$rate = $delta > 0 ? intval( strlen( $data ) / $delta ) : 0;
	$same = ( 0 == strcmp( $data, (string)$out ) );

	printf( " %20s %6.1fms %12s bytes/s (%s)\n",
		$form,
		$delta * 1000.0,
		number_format( $rate ),
		( $same ? 'no change' : 'changed' ) );

	return $out;
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1,10 +1,8 @@
|
|||
<?php
|
||||
/**
|
||||
* Some constant definitions for the unicode normalization module.
|
||||
*
|
||||
* Note: these constants must all be resolvable at compile time by HipHop,
|
||||
* since this file will not be executed during request startup for a compiled
|
||||
* MediaWiki.
|
||||
 * Backwards-compatibility constants which are now provided by the
|
||||
* UtfNormal library. They are hardcoded here since they are needed
|
||||
* before the composer autoloader is initialized.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
|
@ -25,53 +23,164 @@
|
|||
* @ingroup UtfNormal
|
||||
*/
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_FIRST', 0xac00 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_LAST', 0xd7a3 );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_LBASE', 0x1100 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_VBASE', 0x1161 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_TBASE', 0x11a7 );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_LCOUNT', 19 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_VCOUNT', 21 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_TCOUNT', 28 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_SURROGATE_LAST', 0xdfff );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_MAX', 0x10ffff );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UNICODE_REPLACEMENT', 0xfffd );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HANGUL_FIRST', "\xea\xb0\x80" /*codepointToUtf8( UNICODE_HANGUL_FIRST )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HANGUL_LAST', "\xed\x9e\xa3" /*codepointToUtf8( UNICODE_HANGUL_LAST )*/ );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HANGUL_LBASE', "\xe1\x84\x80" /*codepointToUtf8( UNICODE_HANGUL_LBASE )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HANGUL_VBASE', "\xe1\x85\xa1" /*codepointToUtf8( UNICODE_HANGUL_VBASE )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HANGUL_TBASE', "\xe1\x86\xa7" /*codepointToUtf8( UNICODE_HANGUL_TBASE )*/ );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HANGUL_LEND', "\xe1\x84\x92" /*codepointToUtf8( UNICODE_HANGUL_LEND )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HANGUL_VEND', "\xe1\x85\xb5" /*codepointToUtf8( UNICODE_HANGUL_VEND )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HANGUL_TEND', "\xe1\x87\x82" /*codepointToUtf8( UNICODE_HANGUL_TEND )*/ );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_SURROGATE_FIRST', "\xed\xa0\x80" /*codepointToUtf8( UNICODE_SURROGATE_FIRST )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_SURROGATE_LAST', "\xed\xbf\xbf" /*codepointToUtf8( UNICODE_SURROGATE_LAST )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_MAX', "\xf4\x8f\xbf\xbf" /*codepointToUtf8( UNICODE_MAX )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_REPLACEMENT', "\xef\xbf\xbd" /*codepointToUtf8( UNICODE_REPLACEMENT )*/ );
|
||||
#define( 'UTF8_REPLACEMENT', '!' );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
|
||||
|
||||
# These two ranges are illegal
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_FDD0', "\xef\xb7\x90" /*codepointToUtf8( 0xfdd0 )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_FDEF', "\xef\xb7\xaf" /*codepointToUtf8( 0xfdef )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_FFFE', "\xef\xbf\xbe" /*codepointToUtf8( 0xfffe )*/ );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_FFFF', "\xef\xbf\xbf" /*codepointToUtf8( 0xffff )*/ );
|
||||
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_HEAD', false );
|
||||
/**
|
||||
* @deprecated since 1.25, use UtfNormal\Constants instead
|
||||
*/
|
||||
define( 'UTF8_TAIL', true );
|
||||
|
|
|
|||
|
|
@ -1,250 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* This script generates UniNormalData.inc from the Unicode Character Database
|
||||
* and supplementary files.
|
||||
*
|
||||
* Copyright (C) 2004 Brion Vibber <brion@pobox.com>
|
||||
* https://www.mediawiki.org/
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup UtfNormal
|
||||
*/
|
||||
|
||||
if ( PHP_SAPI != 'cli' ) {
|
||||
die( "Run me from the command line please.\n" );
|
||||
}
|
||||
|
||||
require_once 'UtfNormalDefines.php';
|
||||
require_once 'UtfNormalUtil.php';
|
||||
|
||||
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
|
||||
if ( !$in ) {
|
||||
print "Can't open DerivedNormalizationProps.txt for reading.\n";
|
||||
print "If necessary, fetch this file from the internet:\n";
|
||||
print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
|
||||
exit( -1 );
|
||||
}
|
||||
print "Initializing normalization quick check tables...\n";
|
||||
$checkNFC = array();
|
||||
while ( false !== ( $line = fgets( $in ) ) ) {
|
||||
$matches = array();
|
||||
if ( preg_match(
|
||||
'/^([0-9A-F]+)(?:..([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/',
|
||||
$line,
|
||||
$matches )
|
||||
) {
|
||||
list( $junk, $first, $last, $prop, $value ) = $matches;
|
||||
#print "$first $last $prop $value\n";
|
||||
if ( !$last ) {
|
||||
$last = $first;
|
||||
}
|
||||
|
||||
$lastInDecimal = hexdec( $last );
|
||||
for ( $i = hexdec( $first ); $i <= $lastInDecimal; $i++ ) {
|
||||
$char = codepointToUtf8( $i );
|
||||
$checkNFC[$char] = $value;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose( $in );
|
||||
|
||||
$in = fopen( "CompositionExclusions.txt", "rt" );
|
||||
if ( !$in ) {
|
||||
print "Can't open CompositionExclusions.txt for reading.\n";
|
||||
print "If necessary, fetch this file from the internet:\n";
|
||||
print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
|
||||
exit( -1 );
|
||||
}
|
||||
$exclude = array();
|
||||
while ( false !== ( $line = fgets( $in ) ) ) {
|
||||
if ( preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
|
||||
$codepoint = $matches[1];
|
||||
$source = codepointToUtf8( hexdec( $codepoint ) );
|
||||
$exclude[$source] = true;
|
||||
}
|
||||
}
|
||||
fclose( $in );
|
||||
|
||||
$in = fopen( "UnicodeData.txt", "rt" );
|
||||
if ( !$in ) {
|
||||
print "Can't open UnicodeData.txt for reading.\n";
|
||||
print "If necessary, fetch this file from the internet:\n";
|
||||
print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
$compatibilityDecomp = array();
|
||||
$canonicalDecomp = array();
|
||||
$canonicalComp = array();
|
||||
$combiningClass = array();
|
||||
$total = 0;
|
||||
$compat = 0;
|
||||
$canon = 0;
|
||||
|
||||
print "Reading character definitions...\n";
|
||||
while ( false !== ( $line = fgets( $in ) ) ) {
|
||||
$columns = explode( ';', $line );
|
||||
$codepoint = $columns[0];
|
||||
$name = $columns[1];
|
||||
$canonicalCombiningClass = $columns[3];
|
||||
$decompositionMapping = $columns[5];
|
||||
|
||||
$source = codepointToUtf8( hexdec( $codepoint ) );
|
||||
|
||||
if ( $canonicalCombiningClass != 0 ) {
|
||||
$combiningClass[$source] = intval( $canonicalCombiningClass );
|
||||
}
|
||||
|
||||
if ( $decompositionMapping === '' ) continue;
|
||||
if ( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
|
||||
# Compatibility decomposition
|
||||
$canonical = false;
|
||||
$decompositionMapping = $matches[2];
|
||||
$compat++;
|
||||
} else {
|
||||
$canonical = true;
|
||||
$canon++;
|
||||
}
|
||||
$total++;
|
||||
$dest = hexSequenceToUtf8( $decompositionMapping );
|
||||
|
||||
$compatibilityDecomp[$source] = $dest;
|
||||
if ( $canonical ) {
|
||||
$canonicalDecomp[$source] = $dest;
|
||||
if ( empty( $exclude[$source] ) ) {
|
||||
$canonicalComp[$dest] = $source;
|
||||
}
|
||||
}
|
||||
#print "$codepoint | $canonicalCombiningClasses | $decompositionMapping\n";
|
||||
}
|
||||
fclose( $in );
|
||||
|
||||
print "Recursively expanding canonical mappings...\n";
|
||||
$changed = 42;
|
||||
$pass = 1;
|
||||
while ( $changed > 0 ) {
|
||||
print "pass $pass\n";
|
||||
$changed = 0;
|
||||
foreach ( $canonicalDecomp as $source => $dest ) {
|
||||
$newDest = preg_replace_callback(
|
||||
'/([\xc0-\xff][\x80-\xbf]+)/',
|
||||
'callbackCanonical',
|
||||
$dest );
|
||||
if ( $newDest === $dest ) continue;
|
||||
$changed++;
|
||||
$canonicalDecomp[$source] = $newDest;
|
||||
}
|
||||
$pass++;
|
||||
}
|
||||
|
||||
print "Recursively expanding compatibility mappings...\n";
|
||||
$changed = 42;
|
||||
$pass = 1;
|
||||
while ( $changed > 0 ) {
|
||||
print "pass $pass\n";
|
||||
$changed = 0;
|
||||
foreach ( $compatibilityDecomp as $source => $dest ) {
|
||||
$newDest = preg_replace_callback(
|
||||
'/([\xc0-\xff][\x80-\xbf]+)/',
|
||||
'callbackCompat',
|
||||
$dest );
|
||||
if ( $newDest === $dest ) continue;
|
||||
$changed++;
|
||||
$compatibilityDecomp[$source] = $newDest;
|
||||
}
|
||||
$pass++;
|
||||
}
|
||||
|
||||
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";
|
||||
|
||||
$out = fopen( "UtfNormalData.inc", "wt" );
|
||||
if ( $out ) {
|
||||
$serCombining = escapeSingleString( serialize( $combiningClass ) );
|
||||
$serComp = escapeSingleString( serialize( $canonicalComp ) );
|
||||
$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
|
||||
$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
|
||||
$outdata = "<" . "?php
|
||||
/**
|
||||
* This file was automatically generated -- do not edit!
|
||||
* Run UtfNormalGenerate.php to create this file again (make clean && make)
|
||||
*
|
||||
* @file
|
||||
*/
|
||||
// @codingStandardsIgnoreFile
|
||||
|
||||
UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
|
||||
UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
|
||||
UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
|
||||
UtfNormal::\$utfCheckNFC = unserialize( '$serCheckNFC' );
|
||||
\n";
|
||||
fputs( $out, $outdata );
|
||||
fclose( $out );
|
||||
print "Wrote out UtfNormalData.inc\n";
|
||||
} else {
|
||||
print "Can't create file UtfNormalData.inc\n";
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
$out = fopen( "UtfNormalDataK.inc", "wt" );
|
||||
if ( $out ) {
|
||||
$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
|
||||
$outdata = "<" . "?php
|
||||
/**
|
||||
* This file was automatically generated -- do not edit!
|
||||
* Run UtfNormalGenerate.php to create this file again (make clean && make)
|
||||
*
|
||||
* @file
|
||||
*/
|
||||
// @codingStandardsIgnoreFile
|
||||
|
||||
UtfNormal::\$utfCompatibilityDecomp = unserialize( '$serCompat' );
|
||||
\n";
|
||||
fputs( $out, $outdata );
|
||||
fclose( $out );
|
||||
print "Wrote out UtfNormalDataK.inc\n";
|
||||
exit( 0 );
|
||||
} else {
|
||||
print "Can't create file UtfNormalDataK.inc\n";
|
||||
exit( -1 );
|
||||
}
|
||||
|
||||
# ---------------
|
||||
|
||||
/**
 * preg_replace_callback() handler that expands one matched character
 * through the global canonical decomposition table.
 *
 * @param array $matches Regex match; index 1 holds the matched character
 * @return string The table entry for the character, or the character
 *   itself when the table has no entry for it
 */
function callbackCanonical( $matches ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $canonicalDecomp;
	// @codingStandardsIgnoreEnd

	$char = $matches[1];

	return isset( $canonicalDecomp[$char] ) ? $canonicalDecomp[$char] : $char;
}
|
||||
|
||||
/**
 * preg_replace_callback() handler that expands one matched character
 * through the global compatibility decomposition table.
 *
 * @param array $matches Regex match; index 1 holds the matched character
 * @return string The table entry for the character, or the character
 *   itself when the table has no entry for it
 */
function callbackCompat( $matches ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $compatibilityDecomp;
	// @codingStandardsIgnoreEnd

	$char = $matches[1];

	return isset( $compatibilityDecomp[$char] ) ? $compatibilityDecomp[$char] : $char;
}
|
||||
|
|
@ -1,107 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* Approximate benchmark for some basic operations.
|
||||
* Runs large chunks of text through cleanup with a lowish memory limit,
|
||||
* to test regression on mem usage (bug 28146)
|
||||
*
|
||||
* Copyright © 2004-2011 Brion Vibber <brion@wikimedia.org>
|
||||
* https://www.mediawiki.org/
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup UtfNormal
|
||||
*/
|
||||
|
||||
if ( PHP_SAPI != 'cli' ) {
|
||||
die( "Run me from the command line please.\n" );
|
||||
}
|
||||
|
||||
if ( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
|
||||
dl( 'php_utfnormal.so' );
|
||||
}
|
||||
|
||||
require_once 'UtfNormalDefines.php';
|
||||
require_once 'UtfNormalUtil.php';
|
||||
require_once 'UtfNormal.php';
|
||||
|
||||
define( 'BENCH_CYCLES', 1 );
|
||||
define( 'BIGSIZE', 1024 * 1024 * 10 ); // 10m
|
||||
ini_set( 'memory_limit', BIGSIZE + 120 * 1024 * 1024 );
|
||||
|
||||
$testfiles = array(
|
||||
'testdata/washington.txt' => 'English text',
|
||||
'testdata/berlin.txt' => 'German text',
|
||||
'testdata/bulgakov.txt' => 'Russian text',
|
||||
'testdata/tokyo.txt' => 'Japanese text',
|
||||
'testdata/young.txt' => 'Korean text'
|
||||
);
|
||||
$normalizer = new UtfNormal;
|
||||
UtfNormal::loadData();
|
||||
foreach ( $testfiles as $file => $desc ) {
|
||||
benchmarkTest( $normalizer, $file, $desc );
|
||||
}
|
||||
|
||||
# -------
|
||||
|
||||
/**
 * Blow one test file up to at least BIGSIZE bytes and run the
 * memory-sensitive forms over it.
 *
 * @param object &$u Normalizer instance whose methods are benchmarked
 * @param string $filename Path of the text file to read
 * @param string $desc Human-readable description printed alongside the name
 */
function benchmarkTest( &$u, $filename, $desc ) {
	print "Testing $filename ($desc)...\n";
	$data = file_get_contents( $filename );
	if ( $data === false || $data === '' ) {
		# Doubling an empty string never reaches BIGSIZE, so the loop below
		# would spin forever on a missing or empty file.
		print "Can't read any data from $filename; skipping.\n";

		return;
	}

	# Keep doubling the text until it is big enough.
	$all = $data;
	while ( strlen( $all ) < BIGSIZE ) {
		$all .= $all;
	}
	$data = $all;
	echo "Data is " . strlen( $data ) . " bytes.\n";
	$forms = array(
		'quickIsNFCVerify',
		'cleanUp',
	);

	foreach ( $forms as $form ) {
		if ( is_array( $form ) ) {
			# An array entry is a pipeline: each step is fed the output of
			# the previous one.
			$str = $data;
			foreach ( $form as $step ) {
				$str = benchmarkForm( $u, $str, $step );
			}
		} else {
			benchmarkForm( $u, $data, $form );
		}
	}
}
|
||||
|
||||
/**
 * Time one normalizer method over the given data and print a result line.
 *
 * The method is invoked BENCH_CYCLES times and the fastest run is
 * reported, together with the throughput and whether the output differs
 * from the input.
 *
 * @param object &$u Normalizer instance to call the method on
 * @param string &$data Input text handed to the method
 * @param string $form Name of the method to call
 * @return mixed Whatever the last invocation of the method returned
 */
function benchmarkForm( &$u, &$data, $form ) {
	$deltas = array();
	$out = null;
	for ( $i = 0; $i < BENCH_CYCLES; $i++ ) {
		$start = microtime( true );
		$out = $u->$form( $data, UtfNormal::$utfCanonicalDecomp );
		$deltas[] = ( microtime( true ) - $start );
	}

	# Take shortest time
	$delta = $deltas ? min( $deltas ) : 0.0;

	# A run can finish within the timer's resolution; avoid dividing by zero.
	$rate = $delta > 0 ? intval( strlen( $data ) / $delta ) : 0;
	$same = ( 0 == strcmp( $data, (string)$out ) );

	printf( " %20s %6.1fms %12s bytes/s (%s)\n",
		$form,
		$delta * 1000.0,
		number_format( $rate ),
		( $same ? 'no change' : 'changed' ) );

	return $out;
}
|
||||
|
|
@ -1,275 +0,0 @@
|
|||
#!/usr/bin/env php
|
||||
<?php
|
||||
/**
|
||||
* Other tests for the unicode normalization module.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup UtfNormal
|
||||
*/
|
||||
|
||||
if ( PHP_SAPI != 'cli' ) {
|
||||
die( "Run me from the command line please.\n" );
|
||||
}
|
||||
|
||||
// From http://unicode.org/Public/UNIDATA/NormalizationTest.txt
|
||||
$file = "NormalizationTest.txt";
|
||||
|
||||
// Anything after this character is a comment
|
||||
define ( 'COMMENT', '#' );
|
||||
|
||||
// Semicolons are used to separate the columns
|
||||
define ( 'SEPARATOR', ';' );
|
||||
|
||||
$f = fopen( $file, "r" );
|
||||
|
||||
/**
|
||||
* The following section will be used for testing different normalization methods.
|
||||
* - Pure PHP
|
||||
* ~ no assertion errors
|
||||
* ~ 6.25 minutes
|
||||
* - php_utfnormal.so or intl extension: both are wrappers around
|
||||
* libicu so we list the version of libicu when making the
|
||||
* comparison
|
||||
* - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
|
||||
* ~ 2200 assertion errors
|
||||
* ~ 5 seconds
|
||||
* ~ output: http://paste2.org/p/921566
|
||||
* - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
|
||||
* ~ 1384 assertion errors
|
||||
* ~ 15 seconds
|
||||
* ~ output: http://paste2.org/p/921435
|
||||
* - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
|
||||
* ~ no assertion errors
|
||||
* ~ 13 seconds
|
||||
* - Tests comparing pure PHP output with libicu output were added
|
||||
* later and slow down the runtime.
|
||||
*/
|
||||
|
||||
require_once './UtfNormal.php';
|
||||
function normalize_form_c( $c ) {
|
||||
return UtfNormal::toNFC( $c );
|
||||
}
|
||||
|
||||
function normalize_form_d( $c ) {
|
||||
return UtfNormal::toNFD( $c );
|
||||
}
|
||||
|
||||
function normalize_form_kc( $c ) {
|
||||
return UtfNormal::toNFKC( $c );
|
||||
}
|
||||
|
||||
function normalize_form_kd( $c ) {
|
||||
return UtfNormal::toNFKD( $c );
|
||||
}
|
||||
|
||||
/**
|
||||
* This set of functions is only useful if youve added a param to the
|
||||
* following functions to force pure PHP usage. I decided not to
|
||||
* commit that code since might produce a slowdown in the UTF
|
||||
* normalization code just for the sake of these tests. -- hexmode
|
||||
* @return string
|
||||
*/
|
||||
function normalize_form_c_php( $c ) {
|
||||
return UtfNormal::toNFC( $c, "php" );
|
||||
}
|
||||
|
||||
function normalize_form_d_php( $c ) {
|
||||
return UtfNormal::toNFD( $c, "php" );
|
||||
}
|
||||
|
||||
function normalize_form_kc_php( $c ) {
|
||||
return UtfNormal::toNFKC( $c, "php" );
|
||||
}
|
||||
|
||||
function normalize_form_kd_php( $c ) {
|
||||
return UtfNormal::toNFKD( $c, "php" );
|
||||
}
|
||||
|
||||
assert_options( ASSERT_ACTIVE, 1 );
|
||||
assert_options( ASSERT_WARNING, 0 );
|
||||
assert_options( ASSERT_QUIET_EVAL, 1 );
|
||||
assert_options( ASSERT_CALLBACK, 'my_assert' );
|
||||
|
||||
function my_assert( $file, $line, $code ) {
|
||||
// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
|
||||
global $col, $lineNo;
|
||||
// @codingStandardsIgnoreEnd
|
||||
|
||||
echo "Assertion that '$code' failed on line $lineNo ($col[5])\n";
|
||||
}
|
||||
|
||||
$count = 0;
|
||||
$lineNo = 0;
|
||||
if ( $f !== false ) {
|
||||
while ( ( $col = getRow( $f ) ) !== false ) {
|
||||
$lineNo++;
|
||||
|
||||
if ( count( $col ) == 6 ) {
|
||||
$count++;
|
||||
if ( $count % 100 === 0 ) echo "Count: $count\n";
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
# verify that the pure PHP version is correct
|
||||
$NFCc1 = normalize_form_c( $col[0] );
|
||||
$NFCc1p = normalize_form_c_php( $col[0] );
|
||||
assert( '$NFCc1 === $NFCc1p' );
|
||||
$NFCc2 = normalize_form_c( $col[1] );
|
||||
$NFCc2p = normalize_form_c_php( $col[1] );
|
||||
assert( '$NFCc2 === $NFCc2p' );
|
||||
$NFCc3 = normalize_form_c( $col[2] );
|
||||
$NFCc3p = normalize_form_c_php( $col[2] );
|
||||
assert( '$NFCc3 === $NFCc3p' );
|
||||
$NFCc4 = normalize_form_c( $col[3] );
|
||||
$NFCc4p = normalize_form_c_php( $col[3] );
|
||||
assert( '$NFCc4 === $NFCc4p' );
|
||||
$NFCc5 = normalize_form_c( $col[4] );
|
||||
$NFCc5p = normalize_form_c_php( $col[4] );
|
||||
assert( '$NFCc5 === $NFCc5p' );
|
||||
|
||||
$NFDc1 = normalize_form_d( $col[0] );
|
||||
$NFDc1p = normalize_form_d_php( $col[0] );
|
||||
assert( '$NFDc1 === $NFDc1p' );
|
||||
$NFDc2 = normalize_form_d( $col[1] );
|
||||
$NFDc2p = normalize_form_d_php( $col[1] );
|
||||
assert( '$NFDc2 === $NFDc2p' );
|
||||
$NFDc3 = normalize_form_d( $col[2] );
|
||||
$NFDc3p = normalize_form_d_php( $col[2] );
|
||||
assert( '$NFDc3 === $NFDc3p' );
|
||||
$NFDc4 = normalize_form_d( $col[3] );
|
||||
$NFDc4p = normalize_form_d_php( $col[3] );
|
||||
assert( '$NFDc4 === $NFDc4p' );
|
||||
$NFDc5 = normalize_form_d( $col[4] );
|
||||
$NFDc5p = normalize_form_d_php( $col[4] );
|
||||
assert( '$NFDc5 === $NFDc5p' );
|
||||
|
||||
$NFKDc1 = normalize_form_kd( $col[0] );
|
||||
$NFKDc1p = normalize_form_kd_php( $col[0] );
|
||||
assert( '$NFKDc1 === $NFKDc1p' );
|
||||
$NFKDc2 = normalize_form_kd( $col[1] );
|
||||
$NFKDc2p = normalize_form_kd_php( $col[1] );
|
||||
assert( '$NFKDc2 === $NFKDc2p' );
|
||||
$NFKDc3 = normalize_form_kd( $col[2] );
|
||||
$NFKDc3p = normalize_form_kd_php( $col[2] );
|
||||
assert( '$NFKDc3 === $NFKDc3p' );
|
||||
$NFKDc4 = normalize_form_kd( $col[3] );
|
||||
$NFKDc4p = normalize_form_kd_php( $col[3] );
|
||||
assert( '$NFKDc4 === $NFKDc4p' );
|
||||
$NFKDc5 = normalize_form_kd( $col[4] );
|
||||
$NFKDc5p = normalize_form_kd_php( $col[4] );
|
||||
assert( '$NFKDc5 === $NFKDc5p' );
|
||||
|
||||
$NFKCc1 = normalize_form_kc( $col[0] );
|
||||
$NFKCc1p = normalize_form_kc_php( $col[0] );
|
||||
assert( '$NFKCc1 === $NFKCc1p' );
|
||||
$NFKCc2 = normalize_form_kc( $col[1] );
|
||||
$NFKCc2p = normalize_form_kc_php( $col[1] );
|
||||
assert( '$NFKCc2 === $NFKCc2p' );
|
||||
$NFKCc3 = normalize_form_kc( $col[2] );
|
||||
$NFKCc3p = normalize_form_kc_php( $col[2] );
|
||||
assert( '$NFKCc3 === $NFKCc3p' );
|
||||
$NFKCc4 = normalize_form_kc( $col[3] );
|
||||
$NFKCc4p = normalize_form_kc_php( $col[3] );
|
||||
assert( '$NFKCc4 === $NFKCc4p' );
|
||||
$NFKCc5 = normalize_form_kc( $col[4] );
|
||||
$NFKCc5p = normalize_form_kc_php( $col[4] );
|
||||
assert( '$NFKCc5 === $NFKCc5p' );
|
||||
|
||||
# c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
||||
assert( '$col[1] === $NFCc1' );
|
||||
assert( '$col[1] === $NFCc2' );
|
||||
assert( '$col[1] === $NFCc3' );
|
||||
|
||||
# c4 == NFC(c4) == NFC(c5)
|
||||
assert( '$col[3] === $NFCc4' );
|
||||
assert( '$col[3] === $NFCc5' );
|
||||
|
||||
# c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
||||
assert( '$col[2] === $NFDc1' );
|
||||
assert( '$col[2] === $NFDc2' );
|
||||
assert( '$col[2] === $NFDc3' );
|
||||
|
||||
# c5 == NFD(c4) == NFD(c5)
|
||||
assert( '$col[4] === $NFDc4' );
|
||||
assert( '$col[4] === $NFDc5' );
|
||||
|
||||
# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
||||
assert( '$col[3] === $NFKCc1' );
|
||||
assert( '$col[3] === $NFKCc2' );
|
||||
assert( '$col[3] === $NFKCc3' );
|
||||
assert( '$col[3] === $NFKCc4' );
|
||||
assert( '$col[3] === $NFKCc5' );
|
||||
|
||||
# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
||||
assert( '$col[4] === $NFKDc1' );
|
||||
assert( '$col[4] === $NFKDc2' );
|
||||
assert( '$col[4] === $NFKDc3' );
|
||||
assert( '$col[4] === $NFKDc4' );
|
||||
assert( '$col[4] === $NFKDc5' );
|
||||
}
|
||||
}
|
||||
echo "done.\n";
|
||||
|
||||
// Compare against http://en.wikipedia.org/wiki/UTF-8#Description
|
||||
function unichr( $c ) {
|
||||
if ( $c <= 0x7F ) {
|
||||
return chr( $c );
|
||||
} elseif ( $c <= 0x7FF ) {
|
||||
return chr( 0xC0 | $c >> 6 ) . chr( 0x80 | $c & 0x3F );
|
||||
} elseif ( $c <= 0xFFFF ) {
|
||||
return chr( 0xE0 | $c >> 12 ) . chr( 0x80 | $c >> 6 & 0x3F )
|
||||
. chr( 0x80 | $c & 0x3F );
|
||||
} elseif ( $c <= 0x10FFFF ) {
|
||||
return chr( 0xF0 | $c >> 18 ) . chr( 0x80 | $c >> 12 & 0x3F )
|
||||
. chr( 0x80 | $c >> 6 & 0x3F )
|
||||
. chr( 0x80 | $c & 0x3F );
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function unistr( $c ) {
|
||||
return implode( "", array_map( "unichr", array_map( "hexdec", explode( " ", $c ) ) ) );
|
||||
}
|
||||
|
||||
function getRow( $f ) {
|
||||
$row = fgets( $f );
|
||||
if ( $row === false ) return false;
|
||||
$row = rtrim( $row );
|
||||
$pos = strpos( $row, COMMENT );
|
||||
$pos2 = strpos( $row, ")" );
|
||||
if ( $pos === 0 ) return array( $row );
|
||||
$c = "";
|
||||
|
||||
if ( $pos ) {
|
||||
if ( $pos2 ) $c = substr( $row, $pos2 + 2 );
|
||||
else $c = substr( $row, $pos );
|
||||
$row = substr( $row, 0, $pos );
|
||||
}
|
||||
|
||||
$ret = array();
|
||||
foreach ( explode( SEPARATOR, $row ) as $ent ) {
|
||||
if ( trim( $ent ) !== "" ) {
|
||||
$ret[] = unistr( $ent );
|
||||
}
|
||||
}
|
||||
$ret[] = $c;
|
||||
|
||||
return $ret;
|
||||
}
|
||||
|
|
@ -25,6 +25,8 @@
|
|||
* @ingroup UtfNormal
|
||||
*/
|
||||
|
||||
|
||||
use Utfnormal\Utils;
|
||||
/**
|
||||
* Return UTF-8 sequence for a given Unicode code point.
|
||||
*
|
||||
|
|
@ -32,31 +34,10 @@
|
|||
* @return String
|
||||
* @throws InvalidArgumentException if fed out of range data.
|
||||
* @public
|
||||
* @deprecated since 1.25, use UtfNormal\Utils directly
|
||||
*/
|
||||
function codepointToUtf8( $codepoint ) {
	// Backwards-compatibility shim: the implementation now lives in the
	// wikimedia/utfnormal library, so simply delegate to it.
	//
	// The result has to be returned explicitly; without the `return` this
	// deprecated global function would hand null back to its callers
	// instead of the UTF-8 sequence.
	//
	// The class is referenced by its qualified name (as utf8ToHexSequence()
	// in this file does) so the call does not depend on the spelling of the
	// file-level `use` alias.
	return UtfNormal\Utils::codepointToUtf8( $codepoint );
}
|
||||
|
||||
/**
|
||||
|
|
@ -68,21 +49,17 @@ function codepointToUtf8( $codepoint ) {
|
|||
* @return String
|
||||
* @throws InvalidArgumentException if fed out of range data.
|
||||
* @private
|
||||
* @deprecated since 1.25, use UtfNormal\Utils directly
|
||||
*/
|
||||
function hexSequenceToUtf8( $sequence ) {
	// Backwards-compatibility shim: delegate to the wikimedia/utfnormal
	// library and pass its result straight through.
	//
	// The class is named with the same `UtfNormal\Utils` spelling used by the
	// other call sites, rather than through the file-level `use` alias, so
	// that class autoloading sees the canonical case of the namespace.
	return UtfNormal\Utils::hexSequenceToUtf8( $sequence );
}
|
||||
|
||||
/**
|
||||
* Take a UTF-8 string and return a space-separated series of hex
|
||||
* numbers representing Unicode code points. For debugging.
|
||||
*
|
||||
* @fixme this is private but extensions + maint scripts are using it
|
||||
* @param string $str UTF-8 string.
|
||||
* @return string
|
||||
* @private
|
||||
|
|
@ -90,7 +67,7 @@ function hexSequenceToUtf8( $sequence ) {
|
|||
function utf8ToHexSequence( $str ) {
|
||||
$buf = '';
|
||||
foreach ( preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY ) as $cp ) {
|
||||
$buf .= sprintf( '%04x ', utf8ToCodepoint( $cp ) );
|
||||
$buf .= sprintf( '%04x ', UtfNormal\Utils::utf8ToCodepoint( $cp ) );
|
||||
}
|
||||
|
||||
return rtrim( $buf );
|
||||
|
|
@ -103,39 +80,10 @@ function utf8ToHexSequence( $str ) {
|
|||
* @param $char String
|
||||
* @return Integer
|
||||
* @public
|
||||
* @deprecated since 1.25, use UtfNormal\Utils directly
|
||||
*/
|
||||
function utf8ToCodepoint( $char ) {
	// Backwards-compatibility shim: delegate to the wikimedia/utfnormal
	// library and pass its result straight through.
	//
	// The class is named with the same `UtfNormal\Utils` spelling used by the
	// other call sites, rather than through the file-level `use` alias, so
	// that class autoloading sees the canonical case of the namespace.
	return UtfNormal\Utils::utf8ToCodepoint( $char );
}
|
||||
|
||||
/**
|
||||
|
|
@ -144,11 +92,8 @@ function utf8ToCodepoint( $char ) {
|
|||
* @param string $string string to be escaped.
|
||||
* @return String: escaped string.
|
||||
* @public
|
||||
* @deprecated since 1.25, use UtfNormal\Utils directly
|
||||
*/
|
||||
function escapeSingleString( $string ) {
	// Backwards-compatibility shim: delegate to the wikimedia/utfnormal
	// library and pass its result straight through.
	//
	// The class is named with the same `UtfNormal\Utils` spelling used by the
	// other call sites, rather than through the file-level `use` alias, so
	// that class autoloading sees the canonical case of the namespace.
	return UtfNormal\Utils::escapeSingleString( $string );
}
|
||||
|
|
|
|||
|
|
@ -319,7 +319,7 @@ EOR;
|
|||
|
||||
function pageTextCallback( $matches ) {
|
||||
# Get rid of invalid UTF-8, strip control characters
|
||||
$val = htmlspecialchars( UtfNormal::cleanUp( stripcslashes( $matches[1] ) ) );
|
||||
$val = htmlspecialchars( UtfNormal\Validator::cleanUp( stripcslashes( $matches[1] ) ) );
|
||||
$val = str_replace( array( "\n", '<27>' ), array( ' ', '' ), $val );
|
||||
return '<PAGE value="' . $val . '" />';
|
||||
}
|
||||
|
|
|
|||
|
|
@ -477,7 +477,7 @@ class Exif {
|
|||
} else {
|
||||
// if valid utf-8, assume that, otherwise assume windows-1252
|
||||
$valCopy = $val;
|
||||
UtfNormal::quickIsNFCVerify( $valCopy ); //validates $valCopy.
|
||||
UtfNormal\Validator::quickIsNFCVerify( $valCopy ); //validates $valCopy.
|
||||
if ( $valCopy !== $val ) {
|
||||
wfSuppressWarnings();
|
||||
$val = iconv( 'Windows-1252', 'UTF-8//IGNORE', $val );
|
||||
|
|
|
|||
|
|
@ -158,7 +158,7 @@ class GIFMetadataExtractor {
|
|||
// assume its that, otherwise assume its windows-1252 (iso-8859-1)
|
||||
$dataCopy = $data;
|
||||
// quickIsNFCVerify has the side effect of replacing any invalid characters
|
||||
UtfNormal::quickIsNFCVerify( $dataCopy );
|
||||
UtfNormal\Validator::quickIsNFCVerify( $dataCopy );
|
||||
|
||||
if ( $dataCopy !== $data ) {
|
||||
wfSuppressWarnings();
|
||||
|
|
|
|||
|
|
@ -456,7 +456,7 @@ class IPTC {
|
|||
//treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
|
||||
// most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
|
||||
$oldData = $data;
|
||||
UtfNormal::quickIsNFCVerify( $data ); //make $data valid utf-8
|
||||
UtfNormal\Validator::quickIsNFCVerify( $data ); //make $data valid utf-8
|
||||
if ( $data === $oldData ) {
|
||||
return $data; //if validation didn't change $data
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ class JpegMetadataExtractor {
|
|||
// First see if valid utf-8,
|
||||
// if not try to convert it to windows-1252.
|
||||
$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
|
||||
UtfNormal::quickIsNFCVerify( $com );
|
||||
UtfNormal\Validator::quickIsNFCVerify( $com );
|
||||
// turns $com to valid utf-8.
|
||||
// thus if no change, its utf-8, otherwise its something else.
|
||||
if ( $com !== $oldCom ) {
|
||||
|
|
@ -108,7 +108,7 @@ class JpegMetadataExtractor {
|
|||
}
|
||||
// Try it again, if its still not a valid string, then probably
|
||||
// binary junk or some really weird encoding, so don't extract.
|
||||
UtfNormal::quickIsNFCVerify( $com );
|
||||
UtfNormal\Validator::quickIsNFCVerify( $com );
|
||||
if ( $com === $oldCom ) {
|
||||
$segments["COM"][] = $oldCom;
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ class Preprocessor_DOM implements Preprocessor {
|
|||
wfRestoreWarnings();
|
||||
if ( !$result ) {
|
||||
// Try running the XML through UtfNormal to get rid of invalid characters
|
||||
$xml = UtfNormal::cleanUp( $xml );
|
||||
$xml = UtfNormal\Validator::cleanUp( $xml );
|
||||
// 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2
|
||||
// don't barf when the XML is >256 levels deep
|
||||
$result = $dom->loadXML( $xml, 1 << 19 );
|
||||
|
|
@ -191,7 +191,7 @@ class Preprocessor_DOM implements Preprocessor {
|
|||
wfRestoreWarnings();
|
||||
if ( !$result ) {
|
||||
// Try running the XML through UtfNormal to get rid of invalid characters
|
||||
$xml = UtfNormal::cleanUp( $xml );
|
||||
$xml = UtfNormal\Validator::cleanUp( $xml );
|
||||
// 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2
|
||||
// don't barf when the XML is >256 levels deep.
|
||||
$result = $dom->loadXML( $xml, 1 << 19 );
|
||||
|
|
|
|||
|
|
@ -116,7 +116,7 @@ class MediaWikiSite extends Site {
|
|||
// Make sure the string is normalized into NFC (due to the bug 40017)
|
||||
// but do nothing to the whitespaces, that should work appropriately.
|
||||
// @see https://bugzilla.wikimedia.org/show_bug.cgi?id=40017
|
||||
$pageName = UtfNormal::cleanUp( $pageName );
|
||||
$pageName = UtfNormal\Validator::cleanUp( $pageName );
|
||||
|
||||
// Build the args for the specific call
|
||||
$args = array(
|
||||
|
|
|
|||
|
|
@ -230,7 +230,7 @@ class MediaWikiTitleCodec implements TitleFormatter, TitleParser {
|
|||
);
|
||||
$dbkey = trim( $dbkey, '_' );
|
||||
|
||||
if ( strpos( $dbkey, UTF8_REPLACEMENT ) !== false ) {
|
||||
if ( strpos( $dbkey, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
|
||||
# Contained illegal UTF-8 sequences or forbidden Unicode chars.
|
||||
throw new MalformedTitleException( 'Bad UTF-8 sequences found in title: ' . $text );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2945,7 +2945,7 @@ class Language {
|
|||
}
|
||||
|
||||
// Break down Hangul syllables to grab the first jamo
|
||||
$code = utf8ToCodepoint( $matches[1] );
|
||||
$code = UtfNormal\Utils::utf8ToCodepoint( $matches[1] );
|
||||
if ( $code < 0xac00 || 0xd7a4 <= $code ) {
|
||||
return $matches[1];
|
||||
} elseif ( $code < 0xb098 ) {
|
||||
|
|
@ -3037,7 +3037,7 @@ class Language {
|
|||
*/
|
||||
function normalize( $s ) {
|
||||
global $wgAllUnicodeFixes;
|
||||
$s = UtfNormal::cleanUp( $s );
|
||||
$s = UtfNormal\Validator::cleanUp( $s );
|
||||
if ( $wgAllUnicodeFixes ) {
|
||||
$s = $this->transformUsingPairFile( 'normalize-ar.ser', $s );
|
||||
$s = $this->transformUsingPairFile( 'normalize-ml.ser', $s );
|
||||
|
|
|
|||
|
|
@ -323,7 +323,7 @@ class GenerateCollationData extends Maintenance {
|
|||
$tertiaryCollator->sort( $x );
|
||||
$cp = $x[0];
|
||||
|
||||
$char = codepointToUtf8( $cp );
|
||||
$char = UtfNormal\Utils::codepointToUtf8( $cp );
|
||||
$headerChars[] = $char;
|
||||
if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
|
||||
$numOutOfOrder++;
|
||||
|
|
@ -337,7 +337,7 @@ class GenerateCollationData extends Maintenance {
|
|||
|
||||
if ( $this->debugOutFile ) {
|
||||
fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
|
||||
implode( ' ', array_map( 'codepointToUtf8', $group ) ) ) );
|
||||
implode( ' ', array_map( 'UtfNormal\Utils::codepointToUtf8', $group ) ) ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -117,8 +117,8 @@ class GenerateNormalizerDataAr extends Maintenance {
|
|||
continue;
|
||||
}
|
||||
|
||||
$source = hexSequenceToUtf8( $data['Code'] );
|
||||
$dest = hexSequenceToUtf8( $m[2] );
|
||||
$source = UtfNormal\Utils::hexSequenceToUtf8( $data['Code'] );
|
||||
$dest = UtfNormal\Utils::hexSequenceToUtf8( $m[2] );
|
||||
$pairs[$source] = $dest;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -54,8 +54,8 @@ class GenerateNormalizerDataMl extends Maintenance {
|
|||
|
||||
$pairs = array();
|
||||
foreach ( $hexPairs as $hexSource => $hexDest ) {
|
||||
$source = hexSequenceToUtf8( $hexSource );
|
||||
$dest = hexSequenceToUtf8( $hexDest );
|
||||
$source = UtfNormal\Utils::hexSequenceToUtf8( $hexSource );
|
||||
$dest = UtfNormal\Utils::hexSequenceToUtf8( $hexDest );
|
||||
$pairs[$source] = $dest;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -108,12 +108,12 @@ class GenerateUtf8Case extends Maintenance {
|
|||
$data[$name] = $numberedData[$number];
|
||||
}
|
||||
|
||||
$source = hexSequenceToUtf8( $data['Code'] );
|
||||
$source = UtfNormal\Utils::hexSequenceToUtf8( $data['Code'] );
|
||||
if ( $data['Simple_Uppercase_Mapping'] ) {
|
||||
$upper[$source] = hexSequenceToUtf8( $data['Simple_Uppercase_Mapping'] );
|
||||
$upper[$source] = UtfNormal\Utils::hexSequenceToUtf8( $data['Simple_Uppercase_Mapping'] );
|
||||
}
|
||||
if ( $data['Simple_Lowercase_Mapping'] ) {
|
||||
$lower[$source] = hexSequenceToUtf8( $data['Simple_Lowercase_Mapping'] );
|
||||
$lower[$source] = UtfNormal\Utils::hexSequenceToUtf8( $data['Simple_Lowercase_Mapping'] );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
|
|
@ -79,7 +79,7 @@ class SanitizerTest extends MediaWikiTestCase {
|
|||
*/
|
||||
public function testInvalidNumberedEntities() {
|
||||
$this->assertEquals(
|
||||
UTF8_REPLACEMENT,
|
||||
UtfNormal\Constants::UTF8_REPLACEMENT,
|
||||
Sanitizer::decodeCharReferences( "�" ),
|
||||
'Invalid numbered entity'
|
||||
);
|
||||
|
|
|
|||
|
|
@ -1,394 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* Tests for UtfNormal::cleanUp() function.
|
||||
*
|
||||
* Copyright © 2004 Brion Vibber <brion@pobox.com>
|
||||
* https://www.mediawiki.org/
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
*/
|
||||
|
||||
/**
|
||||
* Additional tests for UtfNormal::cleanUp() function, inclusion
|
||||
* regression checks for known problems.
|
||||
* Requires PHPUnit.
|
||||
*
|
||||
* @ingroup UtfNormal
|
||||
* @group Large
|
||||
*
|
||||
* @todo covers tags, will be UtfNormal::cleanUp once the below is resolved
|
||||
* @todo split me into test methods and providers per the below comment
|
||||
* @todo Document individual tests
|
||||
*
|
||||
* We ignore code coverage for this test suite until they are rewritten
|
||||
* to use data providers (bug 46561).
|
||||
* @codeCoverageIgnore
|
||||
*/
|
||||
class CleanUpTest extends PHPUnit_Framework_TestCase {
|
||||
public function testAscii() {
|
||||
$text = 'This is plain ASCII text.';
|
||||
$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
|
||||
}
|
||||
|
||||
public function testNull() {
|
||||
$text = "a \x00 null";
|
||||
$expect = "a \xef\xbf\xbd null";
|
||||
$this->assertEquals(
|
||||
bin2hex( $expect ),
|
||||
bin2hex( UtfNormal::cleanUp( $text ) ) );
|
||||
}
|
||||
|
||||
public function testLatin() {
|
||||
$text = "L'\xc3\xa9cole";
|
||||
$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
|
||||
}
|
||||
|
||||
public function testLatinNormal() {
|
||||
$text = "L'e\xcc\x81cole";
|
||||
$expect = "L'\xc3\xa9cole";
|
||||
$this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* This test is *very* expensive!
|
||||
*/
|
||||
function XtestAllChars() {
|
||||
$rep = UTF8_REPLACEMENT;
|
||||
for ( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
|
||||
$char = codepointToUtf8( $i );
|
||||
$clean = UtfNormal::cleanUp( $char );
|
||||
$x = sprintf( "%04X", $i );
|
||||
|
||||
if ( $i % 0x1000 == 0 ) {
|
||||
echo "U+$x\n";
|
||||
}
|
||||
|
||||
if ( $i == 0x0009 ||
|
||||
$i == 0x000a ||
|
||||
$i == 0x000d ||
|
||||
( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
|
||||
( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
|
||||
( $i > 0xffff && $i <= UNICODE_MAX )
|
||||
) {
|
||||
if ( isset( UtfNormal::$utfCanonicalComp[$char] )
|
||||
|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
|
||||
) {
|
||||
$comp = UtfNormal::NFC( $char );
|
||||
$this->assertEquals(
|
||||
bin2hex( $comp ),
|
||||
bin2hex( $clean ),
|
||||
"U+$x should be decomposed" );
|
||||
} else {
|
||||
$this->assertEquals(
|
||||
bin2hex( $char ),
|
||||
bin2hex( $clean ),
|
||||
"U+$x should be intact" );
|
||||
}
|
||||
} else {
|
||||
$this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static function provideAllBytes() {
|
||||
return array(
|
||||
array( '', '' ),
|
||||
array( 'x', '' ),
|
||||
array( '', 'x' ),
|
||||
array( 'x', 'x' ),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideAllBytes
|
||||
*/
|
||||
function testBytes( $head, $tail ) {
|
||||
for ( $i = 0x0; $i < 256; $i++ ) {
|
||||
$char = $head . chr( $i ) . $tail;
|
||||
$clean = UtfNormal::cleanUp( $char );
|
||||
$x = sprintf( "%02X", $i );
|
||||
|
||||
if ( $i == 0x0009 ||
|
||||
$i == 0x000a ||
|
||||
$i == 0x000d ||
|
||||
( $i > 0x001f && $i < 0x80 )
|
||||
) {
|
||||
$this->assertEquals(
|
||||
bin2hex( $char ),
|
||||
bin2hex( $clean ),
|
||||
"ASCII byte $x should be intact" );
|
||||
if ( $char != $clean ) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
$norm = $head . UTF8_REPLACEMENT . $tail;
|
||||
$this->assertEquals(
|
||||
bin2hex( $norm ),
|
||||
bin2hex( $clean ),
|
||||
"Forbidden byte $x should be rejected" );
|
||||
if ( $norm != $clean ) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideAllBytes
|
||||
*/
|
||||
function testDoubleBytes( $head, $tail ) {
|
||||
for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
|
||||
for ( $second = 0x80; $second < 0x100; $second += 2 ) {
|
||||
$char = $head . chr( $first ) . chr( $second ) . $tail;
|
||||
$clean = UtfNormal::cleanUp( $char );
|
||||
$x = sprintf( "%02X,%02X", $first, $second );
|
||||
if ( $first > 0xc1 &&
|
||||
$first < 0xe0 &&
|
||||
$second < 0xc0
|
||||
) {
|
||||
$norm = UtfNormal::NFC( $char );
|
||||
$this->assertEquals(
|
||||
bin2hex( $norm ),
|
||||
bin2hex( $clean ),
|
||||
"Pair $x should be intact" );
|
||||
if ( $norm != $clean ) {
|
||||
return;
|
||||
}
|
||||
} elseif ( $first > 0xfd || $second > 0xbf ) {
|
||||
# fe and ff are not legal head bytes -- expect two replacement chars
|
||||
$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
|
||||
$this->assertEquals(
|
||||
bin2hex( $norm ),
|
||||
bin2hex( $clean ),
|
||||
"Forbidden pair $x should be rejected" );
|
||||
if ( $norm != $clean ) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
$norm = $head . UTF8_REPLACEMENT . $tail;
|
||||
$this->assertEquals(
|
||||
bin2hex( $norm ),
|
||||
bin2hex( $clean ),
|
||||
"Forbidden pair $x should be rejected" );
|
||||
if ( $norm != $clean ) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideAllBytes
|
||||
*/
|
||||
function testTripleBytes( $head, $tail ) {
|
||||
for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
|
||||
for ( $second = 0x80; $second < 0x100; $second += 2 ) {
|
||||
#for( $third = 0x80; $third < 0x100; $third++ ) {
|
||||
for ( $third = 0x80; $third < 0x81; $third++ ) {
|
||||
$char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
|
||||
$clean = UtfNormal::cleanUp( $char );
|
||||
$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
|
||||
|
||||
if ( $first >= 0xe0 &&
|
||||
$first < 0xf0 &&
|
||||
$second < 0xc0 &&
|
||||
$third < 0xc0
|
||||
) {
|
||||
if ( $first == 0xe0 && $second < 0xa0 ) {
|
||||
$this->assertEquals(
|
||||
bin2hex( $head . UTF8_REPLACEMENT . $tail ),
|
||||
bin2hex( $clean ),
|
||||
"Overlong triplet $x should be rejected" );
|
||||
} elseif ( $first == 0xed &&
|
||||
( chr( $first ) . chr( $second ) . chr( $third ) ) >= UTF8_SURROGATE_FIRST
|
||||
) {
|
||||
$this->assertEquals(
|
||||
bin2hex( $head . UTF8_REPLACEMENT . $tail ),
|
||||
bin2hex( $clean ),
|
||||
"Surrogate triplet $x should be rejected" );
|
||||
} else {
|
||||
$this->assertEquals(
|
||||
bin2hex( UtfNormal::NFC( $char ) ),
|
||||
bin2hex( $clean ),
|
||||
"Triplet $x should be intact" );
|
||||
}
|
||||
} elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
|
||||
$this->assertEquals(
|
||||
bin2hex( UtfNormal::NFC( $head . chr( $first ) .
|
||||
chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
|
||||
bin2hex( $clean ),
|
||||
"Valid 2-byte $x + broken tail" );
|
||||
} elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
|
||||
$this->assertEquals(
|
||||
bin2hex( $head . UTF8_REPLACEMENT .
|
||||
UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
|
||||
bin2hex( $clean ),
|
||||
"Broken head + valid 2-byte $x" );
|
||||
} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
|
||||
( ( $second > 0xbf && $third > 0xbf ) ||
|
||||
( $second < 0xc0 && $third < 0xc0 ) ||
|
||||
( $second > 0xfd ) ||
|
||||
( $third > 0xfd ) )
|
||||
) {
|
||||
# fe and ff are not legal head bytes -- expect three replacement chars
|
||||
$this->assertEquals(
|
||||
bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
|
||||
bin2hex( $clean ),
|
||||
"Forbidden triplet $x should be rejected" );
|
||||
} elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
|
||||
$this->assertEquals(
|
||||
bin2hex( $head . UTF8_REPLACEMENT . $tail ),
|
||||
bin2hex( $clean ),
|
||||
"Forbidden triplet $x should be rejected" );
|
||||
} else {
|
||||
$this->assertEquals(
|
||||
bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
|
||||
bin2hex( $clean ),
|
||||
"Forbidden triplet $x should be rejected" );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public function testChunkRegression() {
|
||||
# Check for regression against a chunking bug
|
||||
$text = "\x46\x55\xb8" .
|
||||
"\xdc\x96" .
|
||||
"\xee" .
|
||||
"\xe7" .
|
||||
"\x44" .
|
||||
"\xaa" .
|
||||
"\x2f\x25";
|
||||
$expect = "\x46\x55\xef\xbf\xbd" .
|
||||
"\xdc\x96" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\x44" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\x2f\x25";
|
||||
|
||||
$this->assertEquals(
|
||||
bin2hex( $expect ),
|
||||
bin2hex( UtfNormal::cleanUp( $text ) ) );
|
||||
}
|
||||
|
||||
public function testInterposeRegression() {
|
||||
$text = "\x4e\x30" .
|
||||
"\xb1" . # bad tail
|
||||
"\x3a" .
|
||||
"\x92" . # bad tail
|
||||
"\x62\x3a" .
|
||||
"\x84" . # bad tail
|
||||
"\x43" .
|
||||
"\xc6" . # bad head
|
||||
"\x3f" .
|
||||
"\x92" . # bad tail
|
||||
"\xad" . # bad tail
|
||||
"\x7d" .
|
||||
"\xd9\x95";
|
||||
|
||||
$expect = "\x4e\x30" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\x3a" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\x62\x3a" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\x43" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\x3f" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\xef\xbf\xbd" .
|
||||
"\x7d" .
|
||||
"\xd9\x95";
|
||||
|
||||
$this->assertEquals(
|
||||
bin2hex( $expect ),
|
||||
bin2hex( UtfNormal::cleanUp( $text ) ) );
|
||||
}
|
||||
|
||||
/**
 * Regression check mixing forbidden ASCII control bytes, an overlong
 * two-byte sequence, and stray head/tail bytes.
 *
 * Only "\x67" and "\x3c" are expected to survive UtfNormal::cleanUp();
 * everything else is expected to come back as "\xef\xbf\xbd".
 */
public function testOverlongRegression() {
	$input = implode( '', array(
		"\x67",
		"\x1a", # forbidden ascii
		"\xea", # bad head
		"\xc1\xa6", # overlong sequence
		"\xad", # bad tail
		"\x1c", # forbidden ascii
		"\xb0", # bad tail
		"\x3c",
		"\x9e", # bad tail
	) );

	$expected = implode( '', array(
		"\x67",
		"\xef\xbf\xbd",
		"\xef\xbf\xbd",
		"\xef\xbf\xbd",
		"\xef\xbf\xbd",
		"\xef\xbf\xbd",
		"\xef\xbf\xbd",
		"\x3c",
		"\xef\xbf\xbd",
	) );

	$expectedHex = bin2hex( $expected );
	$actualHex = bin2hex( UtfNormal::cleanUp( $input ) );

	$this->assertEquals( $expectedHex, $actualHex );
}
|
||||
|
||||
/**
 * Regression check for an encoded surrogate followed by three stray bytes.
 *
 * The expected output of UtfNormal::cleanUp() is four copies of
 * "\xef\xbf\xbd"; the comparison is done on hex dumps.
 */
public function testSurrogateRegression() {
	$input = '';
	$input .= "\xed\xb4\x96"; # surrogate 0xDD16
	$input .= "\x83"; # bad tail
	$input .= "\xb4"; # bad tail
	$input .= "\xac"; # labelled "bad head" originally; 0xac has the 10xxxxxx tail-byte form

	$expected = '';
	$expected .= "\xef\xbf\xbd";
	$expected .= "\xef\xbf\xbd";
	$expected .= "\xef\xbf\xbd";
	$expected .= "\xef\xbf\xbd";

	$expectedHex = bin2hex( $expected );
	$actualHex = bin2hex( UtfNormal::cleanUp( $input ) );

	$this->assertEquals( $expectedHex, $actualHex );
}
|
||||
|
||||
/**
 * Regression check for U+FFFE followed by a stray tail byte and a
 * dangling head byte.
 *
 * Only the trailing "\x59" is expected to pass through
 * UtfNormal::cleanUp() unchanged.
 */
public function testBomRegression() {
	$input = '';
	$input .= "\xef\xbf\xbe"; # U+FFFE, illegal char
	$input .= "\xb2"; # bad tail
	$input .= "\xef"; # bad head
	$input .= "\x59";

	$expected = '';
	$expected .= "\xef\xbf\xbd";
	$expected .= "\xef\xbf\xbd";
	$expected .= "\xef\xbf\xbd";
	$expected .= "\x59";

	$expectedHex = bin2hex( $expected );
	$actualHex = bin2hex( UtfNormal::cleanUp( $input ) );

	$this->assertEquals( $expectedHex, $actualHex );
}
|
||||
|
||||
/**
 * Regression check: the encoded form of U+FFFF is expected to come back
 * from UtfNormal::cleanUp() as "\xef\xbf\xbd".
 */
public function testForbiddenRegression() {
	$input = "\xef\xbf\xbf"; # U+FFFF, illegal char
	$expectedHex = bin2hex( "\xef\xbf\xbd" );

	$actualHex = bin2hex( UtfNormal::cleanUp( $input ) );

	$this->assertEquals( $expectedHex, $actualHex );
}
|
||||
|
||||
/**
 * Regression check: a Hangul character followed by another final jamo
 * must come back from UtfNormal::cleanUp() byte-for-byte unchanged.
 */
public function testHangulRegression() {
	$hangulChar = "\xed\x9c\xaf"; # Hangul char
	$finalJamo = "\xe1\x87\x81"; # followed by another final jamo
	$input = $hangulChar . $finalJamo;

	# The input doubles as the expected output: it should *not* change.
	$expectedHex = bin2hex( $input );
	$actualHex = bin2hex( UtfNormal::cleanUp( $input ) );

	$this->assertEquals( $expectedHex, $actualHex );
}
|
||||
}
|
||||
|
|
@ -1,123 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* Runs the UTF-8 decoder test at:
|
||||
* http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
||||
*
|
||||
* Copyright © 2004 Brion Vibber <brion@pobox.com>
|
||||
* https://www.mediawiki.org/
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup UtfNormal
|
||||
*/
|
||||
|
||||
class Utf8Test extends PHPUnit_Framework_TestCase {
	/**
	 * Data provider: reads UTF-8-test.txt and returns one data set per test line.
	 *
	 * The expected line width is the offset of the '|' on the
	 * "Here come the tests:" line. After that, every line starting with an
	 * "a.b.c" test number yields a case, except the numbers listed in $ignore.
	 * For numbers listed in $longTests the line after the heading is discarded
	 * and every following line becomes a case, up to (not including) the next
	 * line that starts with whitespace followed by '|'.
	 *
	 * @return array[] List of array( $test, $line, $columns, $exceptions )
	 * @throws RuntimeException If the data file cannot be opened or the line
	 *  width cannot be extracted from it
	 */
	public static function provideLines() {
		global $IP;

		$path = "$IP/tests/phpunit/data/normal/UTF-8-test.txt";
		$in = fopen( $path, "rt" );
		if ( $in === false ) {
			throw new RuntimeException( "Could not open $path" );
		}

		$columns = 0;
		while ( false !== ( $line = fgets( $in ) ) ) {
			if ( preg_match( '/^(Here come the tests:\s*)\|$/', $line ) ) {
				$columns = strpos( $line, '|' );
				break;
			}
		}

		if ( !$columns ) {
			fclose( $in );
			// Raise instead of exit(): terminating the process from a data
			// provider would abort the whole PHPUnit run without a report.
			throw new RuntimeException(
				"Something seems to be wrong; couldn't extract line length.\n" .
				"Check that UTF-8-test.txt was downloaded correctly from\n" .
				"http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt\n"
			);
		}

		$ignore = array(
			# These two lines actually seem to be corrupt
			'2.1.1', '2.2.1' );

		$exceptions = array(
			# Tests that should mark invalid characters due to using long
			# sequences beyond what is now considered legal.
			'2.1.5', '2.1.6', '2.2.4', '2.2.5', '2.2.6', '2.3.5',

			# Literal 0xffff, which is illegal
			'2.2.3' );

		$longTests = array(
			# These tests span multiple lines
			'3.1.9', '3.2.1', '3.2.2', '3.2.3', '3.2.4', '3.2.5',
			'3.4' );

		$testCases = array();

		while ( false !== ( $line = fgets( $in ) ) ) {
			$matches = array();
			if ( preg_match( '/^(\d+)\s+(.*?)\s*\|/', $line ) ) {
				// Heading with a single-number prefix: not a test line.
				continue;
			}
			if ( !preg_match( '/^(\d+\.\d+\.\d+)\s*/', $line, $matches ) ) {
				continue;
			}

			$test = $matches[1];
			if ( in_array( $test, $ignore, true ) ) {
				continue;
			}

			if ( !in_array( $test, $longTests, true ) ) {
				$testCases[] = array( $test, $line, $columns, $exceptions );
				continue;
			}

			// Discard the line immediately after the heading.
			fgets( $in );

			// Collect body lines until the terminator line. Stopping on EOF as
			// well prevents an endless loop on a truncated data file.
			while (
				false !== ( $line = fgets( $in ) ) &&
				!preg_match( '/^\s+\|/', $line )
			) {
				$testCases[] = array( $test, $line, $columns, $exceptions );
			}
		}

		fclose( $in );

		return $testCases;
	}

	/**
	 * Verify one line of the data file.
	 *
	 * The line is passed through UtfNormal::quickisNFCVerify(). Lines whose
	 * test number starts with 3 or higher are expected to be altered by that
	 * call and lower-numbered ones to come back unchanged; membership in
	 * $exceptions flips that expectation. Independently, the text before the
	 * first '|' of the verified line must be $columns characters long.
	 *
	 * @dataProvider provideLines
	 * @covers UtfNormal::quickisNFCVerify
	 *
	 * @param string $test Test number in "a.b.c" form
	 * @param string $line Raw line from the data file
	 * @param int $columns Expected number of characters before the '|'
	 * @param string[] $exceptions Test numbers whose expectation is inverted
	 */
	public function testLine( $test, $line, $columns, $exceptions ) {
		$stripped = $line;
		UtfNormal::quickisNFCVerify( $stripped );
		$changed = ( $line !== $stripped );

		$beforeBar = substr( $stripped, 0, strpos( $stripped, '|' ) );
		$len = mb_strlen( $beforeBar );
		if ( $len == 0 ) {
			// mb_strlen() gave nothing usable; fall back to the byte length.
			$len = strlen( $beforeBar );
		}

		// intval() keeps only the leading section number ("3.1.9" => 3), which
		// makes the numeric comparison explicit.
		$expectChanged = ( intval( $test ) >= 3 ) !== in_array( $test, $exceptions, true );

		$this->assertSame(
			$expectChanged,
			$changed,
			"Test $test: unexpected verification outcome (line altered or left intact)"
		);
		$this->assertEquals(
			$columns,
			$len,
			"Test $test: wrong number of characters before the '|' marker"
		);
	}

}
|
||||
|
|
@ -1,160 +0,0 @@
|
|||
<?php
|
||||
/**
|
||||
* Implements the conformance test at:
|
||||
* http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
|
||||
*
|
||||
* Copyright © 2004 Brion Vibber <brion@pobox.com>
|
||||
* https://www.mediawiki.org/
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @group UtfNormal
|
||||
* @group large
|
||||
*/
|
||||
|
||||
class UtfNormalTest extends PHPUnit_Framework_TestCase {

	/**
	 * Set of first-column strings seen in NormalizationTest.txt, stored as
	 * array keys. Filled by provideNormalizationTest() and read by
	 * provideUnicodeData() to leave those characters out.
	 *
	 * @var array
	 */
	protected static $testedChars = array();

	/**
	 * Data provider: parses NormalizationTest.txt.
	 *
	 * Each data line is split on ';' and every field is converted with the
	 * global hexSequenceToUtf8() helper (defined elsewhere; presumably turns
	 * a hex code point sequence into UTF-8 -- confirm against its
	 * definition). An empty string is prepended so the fields are addressed
	 * as $columns[1] .. $columns[5].
	 *
	 * @return array A single data set holding the list of
	 *  array( $columns, $comment ) cases
	 * @throws RuntimeException If the data file cannot be opened
	 */
	public static function provideNormalizationTest() {
		global $IP;

		$path = "$IP/tests/phpunit/data/normal/NormalizationTest.txt";
		$in = fopen( $path, "rt" );
		if ( $in === false ) {
			throw new RuntimeException( "Could not open $path" );
		}

		$testCases = array();
		while ( false !== ( $line = fgets( $in ) ) ) {
			// Split on the first '#' only, so a comment that itself contains
			// '#' is kept whole; tolerate lines with no comment at all.
			$parts = explode( '#', $line, 2 );
			$data = $parts[0];
			$comment = isset( $parts[1] ) ? $parts[1] : '';

			if ( trim( $data ) === '' ) {
				// Comment-only or blank line.
				continue;
			}
			if ( preg_match( '/@Part([\d])/', $data ) ) {
				// Part marker, not test data.
				continue;
			}

			$columns = array_map( "hexSequenceToUtf8", explode( ";", $data ) );
			array_unshift( $columns, '' );

			self::$testedChars[$columns[1]] = true;
			$testCases[] = array( $columns, $comment );
		}
		fclose( $in );

		return array( array( $testCases ) );
	}

	/**
	 * Assert that two strings are byte-wise equal according to strcmp().
	 *
	 * @param string $a Expected string
	 * @param string $b Actual string
	 * @param string $desc Failure message
	 */
	function assertStringEquals( $a, $b, $desc ) {
		$this->assertEquals( 0, strcmp( $a, $b ), $desc );
	}

	/**
	 * Check UtfNormal::toNFC(): columns 1-3 must yield column 2 and
	 * columns 4-5 must yield column 4.
	 *
	 * @param string[] $c Columns of one test case, indexed from 1
	 * @param string $desc Failure message
	 */
	function assertNFC( $c, $desc ) {
		$this->assertStringEquals( $c[2], UtfNormal::toNFC( $c[1] ), $desc );
		$this->assertStringEquals( $c[2], UtfNormal::toNFC( $c[2] ), $desc );
		$this->assertStringEquals( $c[2], UtfNormal::toNFC( $c[3] ), $desc );
		$this->assertStringEquals( $c[4], UtfNormal::toNFC( $c[4] ), $desc );
		$this->assertStringEquals( $c[4], UtfNormal::toNFC( $c[5] ), $desc );
	}

	/**
	 * Check UtfNormal::toNFD(): columns 1-3 must yield column 3 and
	 * columns 4-5 must yield column 5.
	 *
	 * @param string[] $c Columns of one test case, indexed from 1
	 * @param string $desc Failure message
	 */
	function assertNFD( $c, $desc ) {
		$this->assertStringEquals( $c[3], UtfNormal::toNFD( $c[1] ), $desc );
		$this->assertStringEquals( $c[3], UtfNormal::toNFD( $c[2] ), $desc );
		$this->assertStringEquals( $c[3], UtfNormal::toNFD( $c[3] ), $desc );
		$this->assertStringEquals( $c[5], UtfNormal::toNFD( $c[4] ), $desc );
		$this->assertStringEquals( $c[5], UtfNormal::toNFD( $c[5] ), $desc );
	}

	/**
	 * Check UtfNormal::toNFKC(): every column must yield column 4.
	 *
	 * @param string[] $c Columns of one test case, indexed from 1
	 * @param string $desc Failure message
	 */
	function assertNFKC( $c, $desc ) {
		$this->assertStringEquals( $c[4], UtfNormal::toNFKC( $c[1] ), $desc );
		$this->assertStringEquals( $c[4], UtfNormal::toNFKC( $c[2] ), $desc );
		$this->assertStringEquals( $c[4], UtfNormal::toNFKC( $c[3] ), $desc );
		$this->assertStringEquals( $c[4], UtfNormal::toNFKC( $c[4] ), $desc );
		$this->assertStringEquals( $c[4], UtfNormal::toNFKC( $c[5] ), $desc );
	}

	/**
	 * Check UtfNormal::toNFKD(): every column must yield column 5.
	 *
	 * @param string[] $c Columns of one test case, indexed from 1
	 * @param string $desc Failure message
	 */
	function assertNFKD( $c, $desc ) {
		$this->assertStringEquals( $c[5], UtfNormal::toNFKD( $c[1] ), $desc );
		$this->assertStringEquals( $c[5], UtfNormal::toNFKD( $c[2] ), $desc );
		$this->assertStringEquals( $c[5], UtfNormal::toNFKD( $c[3] ), $desc );
		$this->assertStringEquals( $c[5], UtfNormal::toNFKD( $c[4] ), $desc );
		$this->assertStringEquals( $c[5], UtfNormal::toNFKD( $c[5] ), $desc );
	}

	/**
	 * Check UtfNormal::cleanUp() against the same expectations as
	 * assertNFC(): columns 1-3 must yield column 2, columns 4-5 column 4.
	 *
	 * @param string[] $c Columns of one test case, indexed from 1
	 * @param string $desc Failure message
	 */
	function assertCleanUp( $c, $desc ) {
		$this->assertStringEquals( $c[2], UtfNormal::cleanUp( $c[1] ), $desc );
		$this->assertStringEquals( $c[2], UtfNormal::cleanUp( $c[2] ), $desc );
		$this->assertStringEquals( $c[2], UtfNormal::cleanUp( $c[3] ), $desc );
		$this->assertStringEquals( $c[4], UtfNormal::cleanUp( $c[4] ), $desc );
		$this->assertStringEquals( $c[4], UtfNormal::cleanUp( $c[5] ), $desc );
	}

	/**
	 * The data provider for this intentionally returns all the
	 * test cases as one since PHPUnit is too slow otherwise
	 *
	 * @dataProvider provideNormalizationTest
	 *
	 * @param array[] $testCases List of array( $columns, $comment )
	 */
	function testNormals( $testCases ) {
		foreach ( $testCases as $case ) {
			$c = $case[0];
			$desc = $case[1];
			$this->assertNFC( $c, $desc );
			$this->assertNFD( $c, $desc );
			$this->assertNFKC( $c, $desc );
			$this->assertNFKD( $c, $desc );
			$this->assertCleanUp( $c, $desc );
		}
	}

	/**
	 * Data provider: lists the characters from UnicodeData.txt that are not
	 * recorded in self::$testedChars.
	 *
	 * NOTE(review): the filter only has an effect if
	 * provideNormalizationTest() has already run and filled
	 * self::$testedChars -- confirm the provider execution order.
	 *
	 * @return array A single data set holding the list of
	 *  array( $char, $desc ) cases
	 * @throws RuntimeException If the data file cannot be opened
	 */
	public static function provideUnicodeData() {
		global $IP;

		$path = "$IP/tests/phpunit/data/normal/UnicodeData.txt";
		$in = fopen( $path, "rt" );
		if ( $in === false ) {
			throw new RuntimeException( "Could not open $path" );
		}

		$testCases = array();
		while ( false !== ( $line = fgets( $in ) ) ) {
			$cols = explode( ';', $line );
			if ( count( $cols ) < 2 ) {
				// Blank or malformed line: there is no name field to report.
				continue;
			}

			$char = codepointToUtf8( hexdec( $cols[0] ) );
			$desc = $cols[0] . ": " . $cols[1];

			$isControl = $char < "\x20";
			$isSurrogate = $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST;
			if ( $isControl || $isSurrogate ) {
				# Can't check NULL with the ICU plugin, as null bytes fail in C land.
				# Skip other control characters, as we strip them for XML safety.
				# Surrogates are illegal on their own or in UTF-8, ignore.
				continue;
			}

			if ( empty( self::$testedChars[$char] ) ) {
				$testCases[] = array( $char, $desc );
			}
		}
		fclose( $in );

		return array( array( $testCases ) );
	}

	/**
	 * The data provider for this intentionally returns all the
	 * test cases as one since PHPUnit is too slow otherwise
	 *
	 * Every supplied character must come back unchanged from all four
	 * normalization forms and from cleanUp().
	 *
	 * @depends testNormals
	 * @dataProvider provideUnicodeData
	 *
	 * @param array[] $testCases List of array( $char, $desc )
	 */
	public function testInvariant( $testCases ) {
		foreach ( $testCases as $case ) {
			$char = $case[0];
			$desc = $case[1];
			$this->assertStringEquals( $char, UtfNormal::toNFC( $char ), $desc );
			$this->assertStringEquals( $char, UtfNormal::toNFD( $char ), $desc );
			$this->assertStringEquals( $char, UtfNormal::toNFKC( $char ), $desc );
			$this->assertStringEquals( $char, UtfNormal::toNFKD( $char ), $desc );
			$this->assertStringEquals( $char, UtfNormal::cleanUp( $char ), $desc );
		}
	}
}
|
||||
Loading…
Reference in a new issue