Merge "Use wikimedia/utfnormal library, add backwards-compatibility layer"

This commit is contained in:
jenkins-bot 2015-03-31 00:51:16 +00:00 committed by Gerrit Code Review
commit b594c133f3
42 changed files with 170 additions and 44342 deletions

View file

@ -25,6 +25,7 @@
"psr/log": "1.0.0",
"wikimedia/cdb": "1.0.1",
"wikimedia/composer-merge-plugin": "1.0.0",
"wikimedia/utfnormal": "1.0.2",
"zordius/lightncandy": "0.18"
},
"require-dev": {

View file

@ -341,7 +341,7 @@ class IcuCollation extends Collation {
// Check for CJK
$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
if ( ord( $firstChar ) > 0x7f && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) {
if ( ord( $firstChar ) > 0x7f && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $firstChar ) ) ) {
return $firstChar;
}

View file

@ -3984,7 +3984,7 @@ HTML
// breaks one of the entities whilst editing.
if ( ( substr( $invalue, $i, 1 ) == ";" ) && ( strlen( $hexstring ) <= 6 ) ) {
$codepoint = hexdec( $hexstring );
$result .= codepointToUtf8( $codepoint );
$result .= UtfNormal\Utils::codepointToUtf8( $codepoint );
} else {
$result .= "&#x" . $hexstring . substr( $invalue, $i, 1 );
}

View file

@ -164,7 +164,7 @@ class FeedUtils {
$diffText = "<p>Can't load revision $newid</p>";
} else {
// Diff output fine, clean up any illegal UTF-8
$diffText = UtfNormal::cleanUp( $diffText );
$diffText = UtfNormal\Validator::cleanUp( $diffText );
$diffText = self::applyDiffStyle( $diffText );
}
} else {

View file

@ -865,7 +865,7 @@ class Sanitizer {
$value = preg_replace_callback(
'/[--]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
function ( $matches ) {
$cp = utf8ToCodepoint( $matches[0] );
$cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
if ( $cp === false ) {
return '';
}
@ -971,7 +971,7 @@ class Sanitizer {
// Line continuation
return '';
} elseif ( $matches[2] !== '' ) {
$char = codepointToUtf8( hexdec( $matches[2] ) );
$char = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
} elseif ( $matches[3] !== '' ) {
$char = $matches[3];
} else {
@ -1452,9 +1452,9 @@ class Sanitizer {
*/
static function decodeChar( $codepoint ) {
if ( Sanitizer::validateCodepoint( $codepoint ) ) {
return codepointToUtf8( $codepoint );
return UtfNormal\Utils::codepointToUtf8( $codepoint );
} else {
return UTF8_REPLACEMENT;
return UtfNormal\Constants::UTF8_REPLACEMENT;
}
}
@ -1471,7 +1471,7 @@ class Sanitizer {
$name = self::$htmlEntityAliases[$name];
}
if ( isset( self::$htmlEntities[$name] ) ) {
return codepointToUtf8( self::$htmlEntities[$name] );
return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$name] );
} else {
return "&$name;";
}

View file

@ -289,7 +289,7 @@ class WebRequest {
}
} else {
global $wgContLang;
$data = isset( $wgContLang ) ? $wgContLang->normalize( $data ) : UtfNormal::cleanUp( $data );
$data = isset( $wgContLang ) ? $wgContLang->normalize( $data ) : UtfNormal\Validator::cleanUp( $data );
}
return $data;
}

View file

@ -327,7 +327,7 @@ class MWDebug {
}
$str = $prefix . $str;
}
self::$debug[] = rtrim( UtfNormal::cleanUp( $str ) );
self::$debug[] = rtrim( UtfNormal\Validator::cleanUp( $str ) );
}
}

View file

@ -1140,9 +1140,6 @@ abstract class Installer {
* Check the libicu version
*/
protected function envCheckLibicu() {
$utf8 = function_exists( 'utf8_normalize' );
$intl = function_exists( 'normalizer_normalize' );
/**
* This needs to be updated to something that the latest libicu
* will properly normalize. This normalization was found at
@ -1156,18 +1153,7 @@ abstract class Installer {
$useNormalizer = 'php';
$needsUpdate = false;
/**
* We're going to prefer the pecl extension here unless
* utf8_normalize is more up to date.
*/
if ( $utf8 ) {
$useNormalizer = 'utf8';
$utf8 = utf8_normalize( $not_normal_c, UtfNormal::UNORM_NFC );
if ( $utf8 !== $normal_c ) {
$needsUpdate = true;
}
}
if ( $intl ) {
if ( function_exists( 'normalizer_normalize' ) ) {
$useNormalizer = 'intl';
$intl = normalizer_normalize( $not_normal_c, Normalizer::FORM_C );
if ( $intl !== $normal_c ) {
@ -1175,8 +1161,7 @@ abstract class Installer {
}
}
// Uses messages 'config-unicode-using-php', 'config-unicode-using-utf8',
// 'config-unicode-using-intl'
// Uses messages 'config-unicode-using-php' and 'config-unicode-using-intl'
if ( $useNormalizer === 'php' ) {
$this->showMessage( 'config-unicode-pure-php-warning' );
} else {

View file

@ -45,7 +45,6 @@
"config-env-bad": "The environment has been checked.\nYou cannot install MediaWiki.",
"config-env-php": "PHP $1 is installed.",
"config-env-hhvm": "HHVM $1 is installed.",
"config-unicode-using-utf8": "Using Brion Vibber's utf8_normalize.so for Unicode normalization.",
"config-unicode-using-intl": "Using the [http://pecl.php.net/intl intl PECL extension] for Unicode normalization.",
"config-unicode-pure-php-warning": "<strong>Warning:</strong> The [http://pecl.php.net/intl intl PECL extension] is not available to handle Unicode normalization, falling back to slow pure-PHP implementation.\nIf you run a high-traffic site, you should read a little on [//www.mediawiki.org/wiki/Special:MyLanguage/Unicode_normalization_considerations Unicode normalization].",
"config-unicode-update-warning": "<strong>Warning:</strong> The installed version of the Unicode normalization wrapper uses an older version of [http://site.icu-project.org/ the ICU project's] library.\nYou should [//www.mediawiki.org/wiki/Special:MyLanguage/Unicode_normalization_considerations upgrade] if you are at all concerned about using Unicode.",

View file

@ -63,7 +63,6 @@
"config-env-bad": "See also:\n* {{msg-mw|Config-env-good}}",
"config-env-php": "Parameters:\n* $1 - the version of PHP that has been installed\nSee also:\n* {{msg-mw|config-env-php-toolow}}",
"config-env-hhvm": "Parameters:\n* $1 - the version of HHVM that has been installed",
"config-unicode-using-utf8": "Status message in the MediaWiki installer environment checks.",
"config-unicode-using-intl": "Status message in the MediaWiki installer environment checks.",
"config-unicode-pure-php-warning": "PECL is the name of a group producing standard pieces of software for PHP, and intl is the name of their library handling some aspects of internationalization.",
"config-unicode-update-warning": "ICU is a body producing standard software tools for support of Unicode and other internationalization aspects. This message warns the system administrator installing MediaWiki that the server's software is not up-to-date and MediaWiki will have problems handling some characters.",

View file

@ -1,69 +0,0 @@
.PHONY : all test testutf8 testclean icutest bench icubench clean distclean
## Latest greatest version of Unicode
## May cause confusion if running test suite from these files
## when the data was generated from a previous version.
#BASE=http://www.unicode.org/Public/UNIDATA
# Explicitly using Unicode 6.0
BASE=http://www.unicode.org/Public/6.0.0/ucd
# Can override to php-cli or php5 or whatever
PHP=php
#PHP=php-cli
# Some nice tool to grab URLs with
FETCH=wget
#FETCH=fetch
all : UtfNormalData.inc
UtfNormalData.inc : UtfNormalGenerate.php UtfNormalUtil.php UnicodeData.txt CompositionExclusions.txt NormalizationCorrections.txt DerivedNormalizationProps.txt
$(PHP) UtfNormalGenerate.php
bench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/young.txt testdata/bulgakov.txt
$(PHP) UtfNormalBench.php
icutest : UtfNormalData.inc NormalizationTest.txt
$(PHP) Utf8Test.php --icu
$(PHP) UtfNormalTest.php --icu
icubench : UtfNormalData.inc testdata/washington.txt testdata/berlin.txt testdata/tokyo.txt testdata/young.txt testdata/bulgakov.txt
$(PHP) UtfNormalBench.php --icu
clean :
rm -f UtfNormalData.inc UtfNormalDataK.inc
distclean : clean
rm -f CompositionExclusions.txt NormalizationTest.txt NormalizationCorrections.txt UnicodeData.txt DerivedNormalizationProps.txt UTF-8-test.txt
# The Unicode data files...
CompositionExclusions.txt :
$(FETCH) $(BASE)/CompositionExclusions.txt
NormalizationTest.txt :
$(FETCH) $(BASE)/NormalizationTest.txt
NormalizationCorrections.txt :
$(FETCH) $(BASE)/NormalizationCorrections.txt
DerivedNormalizationProps.txt :
$(FETCH) $(BASE)/DerivedNormalizationProps.txt
UnicodeData.txt :
$(FETCH) $(BASE)/UnicodeData.txt
testdata/berlin.txt :
mkdir -p testdata && wget -U MediaWiki/test -O testdata/berlin.txt "http://de.wikipedia.org/w/index.php?title=Berlin&oldid=2775712&action=raw"
testdata/washington.txt :
mkdir -p testdata && wget -U MediaWiki/test -O testdata/washington.txt "http://en.wikipedia.org/w/index.php?title=Washington%2C_D.C.&oldid=6370218&action=raw"
testdata/tokyo.txt :
mkdir -p testdata && wget -U MediaWiki/test -O testdata/tokyo.txt "http://ja.wikipedia.org/w/index.php?title=%E6%9D%B1%E4%BA%AC%E9%83%BD&oldid=940880&action=raw"
testdata/young.txt :
mkdir -p testdata && wget -U MediaWiki/test -O testdata/young.txt "http://ko.wikipedia.org/w/index.php?title=%EC%9D%B4%EC%88%98%EC%98%81&oldid=627688&action=raw"
testdata/bulgakov.txt :
mkdir -p testdata && wget -U MediaWiki/test -O testdata/bulgakov.txt "http://ru.wikipedia.org/w/index.php?title=%D0%91%D1%83%D0%BB%D0%B3%D0%B0%D0%BA%D0%BE%D0%B2%2C_%D0%A1%D0%B5%D1%80%D0%B3%D0%B5%D0%B9_%D0%9D%D0%B8%D0%BA%D0%BE%D0%BB%D0%B0%D0%B5%D0%B2%D0%B8%D1%87&oldid=17704&action=raw"

View file

@ -1,59 +0,0 @@
This directory contains some Unicode normalization routines. These routines
are meant to be reusable in other projects, so I'm not tying them to the
MediaWiki utility functions.
The main function to care about is UtfNormal::toNFC(); this will convert
a given UTF-8 string to Normalization Form C if it's not already such.
The function assumes that the input string is already valid UTF-8; if there
are corrupt characters this may produce erroneous results.
To also check for illegal characters, use UtfNormal::cleanUp(). This will
strip illegal UTF-8 sequences and characters that are illegal in XML, and
if necessary convert to normalization form C.
Performance is kind of stinky in absolute terms, though it should be speedy
on pure ASCII text. ;) On text that can be determined quickly to already be
in NFC it's not too awful but it can quickly get uncomfortably slow,
particularly for Korean text (the hangul decomposition/composition code is
extra slow).
== Regenerating data tables ==
UtfNormalData.inc and UtfNormalDataK.inc are generated from the Unicode
Character Database by the script UtfNormalGenerate.php. On a *nix system
'make' should fetch the necessary files and regenerate it if the scripts
have been changed or you remove it.
== Testing ==
'make test' will run the conformance test (UtfNormalTest.php), fetching the
data from the net if necessary. If it reports failure, something is
going wrong!
You may have to set up PHPUnit first.
$ pear channel-discover pear.phpunit.de
$ pear install phpunit/PHPUnit
== Benchmarks ==
Run 'make bench' to download some sample texts from Wikipedia and run some
cheap benchmarks of some of the functions. Take all numbers with large
grains of salt.
== PHP module extension ==
There's an experimental PHP extension module which wraps the ICU library's
normalization functions. This is *MUCH* faster than doing this work in pure
PHP code. This is at https://git.wikimedia.org/summary/mediawiki%2Fextensions%2Fnormal.git.
It is used by the WMF, which currently runs PHP 5.3.10 on Linux. It hasn't been
thoroughly tested on other configurations, but may work.
If the php_normal.so module is loaded in php.ini, the normalization functions
will automatically use it. If you can't (or don't want to) load it in php.ini,
you may be able to load it using the dl() function before the inclusion of
UtfNormal.php, and it will be picked up.

View file

@ -1,102 +0,0 @@
<?php
/**
* Test feeds random 16-byte strings to both the pure PHP and ICU-based
* UtfNormal::cleanUp() code paths, and checks to see if there's a
* difference. Will run forever until it finds one or you kill it.
*
* Copyright (C) 2004 Brion Vibber <brion@pobox.com>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
# Stand-alone command-line script: refuse to run under any other SAPI.
if ( PHP_SAPI != 'cli' ) {
die( "Run me from the command line please.\n" );
}
/**
 * Dependencies. Both paths are relative, so the script is presumably meant
 * to be started from the directory that contains UtfNormal.php -- confirm.
 * NOTE(review): Diff and TableDiffFormatter, used by showDiffs() below, are
 * assumed to come from DifferenceEngine.php -- verify.
 */
require_once 'UtfNormal.php';
require_once '../diff/DifferenceEngine.php';
# Load the native extension at runtime.
# NOTE(review): presumably this is what provides utf8_normalize(), which
# donorm() below calls -- confirm.
dl( 'php_utfnormal.so' );
# Uncomment to seed mt_rand() with a fixed value for a reproducible run.
# mt_srand( 99999 );
/**
 * Build a string of pseudo-random bytes, one mt_rand() draw per byte.
 *
 * @param int $length number of bytes to generate
 * @param bool $nullOk when false, the NUL byte (0x00) is never produced
 * @param bool $ascii when true, bytes are limited to 0x7f and below
 * @return string the generated bytes ('' when $length is not positive)
 */
function randomString( $length, $nullOk, $ascii = false ) {
	// The byte range is fixed for the whole call, so work it out once.
	$lowest = $nullOk ? 0 : 1;
	$highest = $ascii ? 127 : 255;

	$bytes = '';
	for ( $made = 0; $made < $length; $made++ ) {
		$bytes .= chr( mt_rand( $lowest, $highest ) );
	}
	return $bytes;
}
/**
 * Stand-alone copy of the cleanUp() code path that goes through the ICU
 * wrapper, used as the reference result for the comparison loop.
 *
 * @param string $str raw bytes to normalize
 * @return string result of utf8_normalize() in NFC, after pre-filtering
 */
function donorm( $str ) {
	# A few characters are rejected here that ICU itself would let through:
	# most C0 control characters, plus U+FFFE and U+FFFF.
	$filtered = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $str );
	$filtered = str_replace( array( UTF8_FFFE, UTF8_FFFF ), UTF8_REPLACEMENT, $filtered );

	# The UnicodeString constructor fails on input that ends with a head
	# byte, so pad with a junk character and strip it off again afterwards.
	$normalized = utf8_normalize( $filtered . "\x01", UtfNormal::UNORM_NFC );
	return rtrim( $normalized, "\x01" );
}
/**
 * Print, as hex, every changed fragment between two strings.
 *
 * Both inputs are split into lines (CRLF is folded to LF first), diffed with
 * Diff, rendered with TableDiffFormatter, and each "diffchange" ins/del span
 * in the rendered output is echoed on its own tab-indented line.
 *
 * @param string $a first string
 * @param string $b second string
 */
function showDiffs( $a, $b ) {
	$leftLines = explode( "\n", str_replace( "\r\n", "\n", $a ) );
	$rightLines = explode( "\n", str_replace( "\r\n", "\n", $b ) );

	$lineDiff = new Diff( $leftLines, $rightLines );
	$renderer = new TableDiffFormatter();
	$rendered = $renderer->format( $lineDiff );

	$found = array();
	preg_match_all( '/<(?:ins|del) class="diffchange">(.*?)<\/(?:ins|del)>/', $rendered, $found );
	foreach ( $found[1] as $fragment ) {
		echo "\t" . bin2hex( $fragment ) . "\n";
	}
}
# Fuzzing driver: feed random byte strings to both implementations and stop
# at the first input on which they disagree. Runs until killed otherwise.
$size = 16; # length in bytes of every generated sample
$n = 0; # iteration counter, printed before each attempt
while ( true ) {
	$n++;
	echo "$n\n";

	$str = randomString( $size, true );
	$clean = UtfNormal::cleanUp( $str );
	$norm = donorm( $str );
	echo strlen( $clean ) . ", " . strlen( $norm );

	# Compare byte-for-byte. A loose == treats two numeric-looking strings
	# (e.g. "1e3" and "1000") as equal, which could hide a real mismatch.
	if ( $clean === $norm ) {
		echo " (match)\n";
	} else {
		echo " (FAIL)\n";
		echo "\traw: " . bin2hex( $str ) . "\n" .
			"\tphp: " . bin2hex( $clean ) . "\n" .
			"\ticu: " . bin2hex( $norm ) . "\n";
		echo "\n\tdiffs:\n";
		showDiffs( $clean, $norm );
		die();
	}

	# Reset the per-iteration buffers before the next round.
	$str = '';
	$clean = '';
	$norm = '';
}

View file

@ -28,8 +28,7 @@
* @defgroup UtfNormal UtfNormal
*/
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
use UtfNormal\Validator;
/**
* Unicode normalization routines for working with UTF-8 strings.
@ -43,28 +42,10 @@ define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
*
* See description of forms at http://www.unicode.org/reports/tr15/
*
* @deprecated since 1.25, use UtfNormal\Validator directly
* @ingroup UtfNormal
*/
class UtfNormal {
/**
* For using the ICU wrapper
*/
const UNORM_NONE = 1;
const UNORM_NFD = 2;
const UNORM_NFKD = 3;
const UNORM_NFC = 4;
const UNORM_NFKC = 5;
const UNORM_FCD = 6;
const UNORM_DEFAULT = self::UNORM_NFC;
public static $utfCombiningClass = null;
public static $utfCanonicalComp = null;
public static $utfCanonicalDecomp = null;
# Load compatibility decompositions on demand if they are needed.
public static $utfCompatibilityDecomp = null;
public static $utfCheckNFC;
/**
* The ultimate convenience function! Clean up invalid UTF-8 sequences,
* and convert to normal form C, canonical composition.
@ -76,36 +57,7 @@ class UtfNormal {
* @return string a clean, shiny, normalized UTF-8 string
*/
static function cleanUp( $string ) {
if ( NORMALIZE_ICU ) {
$string = self::replaceForNativeNormalize( $string );
# UnicodeString constructor fails if the string ends with a
# head byte. Add a junk char at the end, we'll strip it off.
return rtrim( utf8_normalize( $string . "\x01", self::UNORM_NFC ), "\x01" );
} elseif ( NORMALIZE_INTL ) {
$string = self::replaceForNativeNormalize( $string );
$norm = normalizer_normalize( $string, Normalizer::FORM_C );
if ( $norm === null || $norm === false ) {
# normalizer_normalize will either return false or null
# (depending on which doc you read) if invalid utf8 string.
# quickIsNFCVerify cleans up invalid sequences.
if ( UtfNormal::quickIsNFCVerify( $string ) ) {
# if that's true, the string is actually already normal.
return $string;
} else {
# Now we are valid but non-normal
return normalizer_normalize( $string, Normalizer::FORM_C );
}
} else {
return $norm;
}
} elseif ( UtfNormal::quickIsNFCVerify( $string ) ) {
# Side effect -- $string has had UTF-8 errors cleaned up.
return $string;
} else {
return UtfNormal::NFC( $string );
}
return Validator::cleanUp( $string );
}
/**
@ -117,14 +69,7 @@ class UtfNormal {
* @return string a UTF-8 string in normal form C
*/
static function toNFC( $string ) {
if ( NORMALIZE_INTL )
return normalizer_normalize( $string, Normalizer::FORM_C );
elseif ( NORMALIZE_ICU )
return utf8_normalize( $string, self::UNORM_NFC );
elseif ( UtfNormal::quickIsNFC( $string ) )
return $string;
else
return UtfNormal::NFC( $string );
return Validator::toNFC( $string );
}
/**
@ -135,14 +80,7 @@ class UtfNormal {
* @return string a UTF-8 string in normal form D
*/
static function toNFD( $string ) {
if ( NORMALIZE_INTL )
return normalizer_normalize( $string, Normalizer::FORM_D );
elseif ( NORMALIZE_ICU )
return utf8_normalize( $string, self::UNORM_NFD );
elseif ( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFD( $string );
else
return $string;
return Validator::toNFD( $string );
}
/**
@ -154,14 +92,7 @@ class UtfNormal {
* @return string a UTF-8 string in normal form KC
*/
static function toNFKC( $string ) {
if ( NORMALIZE_INTL )
return normalizer_normalize( $string, Normalizer::FORM_KC );
elseif ( NORMALIZE_ICU )
return utf8_normalize( $string, self::UNORM_NFKC );
elseif ( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFKC( $string );
else
return $string;
return Validator::toNFKC( $string );
}
/**
@ -173,24 +104,7 @@ class UtfNormal {
* @return string a UTF-8 string in normal form KD
*/
static function toNFKD( $string ) {
if ( NORMALIZE_INTL )
return normalizer_normalize( $string, Normalizer::FORM_KD );
elseif ( NORMALIZE_ICU )
return utf8_normalize( $string, self::UNORM_NFKD );
elseif ( preg_match( '/[\x80-\xff]/', $string ) )
return UtfNormal::NFKD( $string );
else
return $string;
}
/**
* Load the basic composition data if necessary
* @private
*/
static function loadData() {
if ( !isset( self::$utfCombiningClass ) ) {
require_once __DIR__ . '/UtfNormalData.inc';
}
return Validator::toNFKD( $string );
}
/**
@ -200,38 +114,7 @@ class UtfNormal {
* @return bool
*/
static function quickIsNFC( $string ) {
# ASCII is always valid NFC!
# If it's pure ASCII, let it through.
if ( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
UtfNormal::loadData();
$len = strlen( $string );
for ( $i = 0; $i < $len; $i++ ) {
$c = $string[$i];
$n = ord( $c );
if ( $n < 0x80 ) {
continue;
} elseif ( $n >= 0xf0 ) {
$c = substr( $string, $i, 4 );
$i += 3;
} elseif ( $n >= 0xe0 ) {
$c = substr( $string, $i, 3 );
$i += 2;
} elseif ( $n >= 0xc0 ) {
$c = substr( $string, $i, 2 );
$i++;
}
if ( isset( self::$utfCheckNFC[$c] ) ) {
# If it's NO or MAYBE, bail and do the slow check.
return false;
}
if ( isset( self::$utfCombiningClass[$c] ) ) {
# Combining character? We might have to do sorting, at least.
return false;
}
}
return true;
return Validator::quickIsNFC( $string );
}
/**
@ -241,550 +124,6 @@ class UtfNormal {
* @return bool
*/
static function quickIsNFCVerify( &$string ) {
# Validates $string as UTF-8 in place and reports whether it already looks
# like normalization form C.
#
# $string is taken by reference and rewritten: the control characters
# matched just below, and every invalid or forbidden UTF-8 sequence detected
# by the scan, are replaced with UTF8_REPLACEMENT (or dropped, for stray
# tail bytes that follow an already-reported broken sequence).
#
# Returns true when nothing seen requires a normalization pass (including
# the pure-ASCII early exit), false as soon as a sequence found in the
# NFC-check or combining-class tables has been encountered.
# Screen out some characters that eg won't be allowed in XML
$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );
# ASCII is always valid NFC!
# If we're only ever given plain ASCII, we can avoid the overhead
# of initializing the decomposition tables by skipping out early.
if ( !preg_match( '/[\x80-\xff]/', $string ) ) return true;
# The lookup tables are function-level statics, so they are built on the
# first non-ASCII call only and reused by later calls.
static $checkit = null, $tailBytes = null, $utfCheckOrCombining = null;
if ( !isset( $checkit ) ) {
# Load/build some scary lookup tables...
UtfNormal::loadData();
$utfCheckOrCombining = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );
# Head bytes for sequences which we should do further validity checks
$checkit = array_flip( array_map( 'chr',
array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );
# Each UTF-8 head byte is followed by a certain
# number of tail bytes.
$tailBytes = array();
for ( $n = 0; $n < 256; $n++ ) {
if ( $n < 0xc0 ) {
$remaining = 0;
} elseif ( $n < 0xe0 ) {
$remaining = 1;
} elseif ( $n < 0xf0 ) {
$remaining = 2;
} elseif ( $n < 0xf8 ) {
$remaining = 3;
} elseif ( $n < 0xfc ) {
$remaining = 4;
} elseif ( $n < 0xfe ) {
$remaining = 5;
} else {
$remaining = 0;
}
$tailBytes[chr( $n )] = $remaining;
}
}
# Chop the text into pure-ASCII and non-ASCII areas;
# large ASCII parts can be handled much more quickly.
# Don't chop up Unicode areas for punctuation, though,
# that wastes energy.
$matches = array();
preg_match_all(
'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
$string, $matches );
$looksNormal = true;
# $base is the byte offset of the current chunk within $string; $replace
# collects array( replacement, start offset, length ) fix-ups that are
# applied in one pass after the scan.
$base = 0;
$replace = array();
foreach ( $matches[1] as $str ) {
$chunk = strlen( $str );
if ( $str[0] < "\x80" ) {
# ASCII chunk: guaranteed to be valid UTF-8
# and in normal form C, so skip over it.
$base += $chunk;
continue;
}
# We'll have to examine the chunk byte by byte to ensure
# that it consists of valid UTF-8 sequences, and to see
# if any of them might not be normalized.
#
# Since PHP is not the fastest language on earth, some of
# this code is a little ugly with inner loop optimizations.
$head = '';
$len = $chunk + 1; # Counting down is faster. I'm *so* sorry.
for ( $i = -1; --$len; ) {
$remaining = $tailBytes[$c = $str[++$i]];
if ( $remaining ) {
# UTF-8 head byte!
$sequence = $head = $c;
do {
# Look for the defined number of tail bytes...
if ( --$len && ( $c = $str[++$i] ) >= "\x80" && $c < "\xc0" ) {
# Legal tail bytes are nice.
$sequence .= $c;
} else {
if ( 0 == $len ) {
# Premature end of string!
# Drop a replacement character into output to
# represent the invalid UTF-8 sequence.
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
break 2;
} else {
# Illegal tail byte; abandon the sequence.
$replace[] = array( UTF8_REPLACEMENT,
$base + $i - strlen( $sequence ),
strlen( $sequence ) );
# Back up and reprocess this byte; it may itself
# be a legal ASCII or UTF-8 sequence head.
--$i;
++$len;
continue 2;
}
}
} while ( --$remaining );
if ( isset( $checkit[$head] ) ) {
# Do some more detailed validity checks, for
# invalid characters and illegal sequences.
if ( $head == "\xed" ) {
# 0xed is relatively frequent in Korean, which
# abuts the surrogate area, so we're doing
# this check separately to speed things up.
if ( $sequence >= UTF8_SURROGATE_FIRST ) {
# Surrogates are legal only in UTF-16 code.
# They are totally forbidden here in UTF-8
# utopia.
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
$head = '';
continue;
}
} else {
# Slower, but rarer checks...
$n = ord( $head );
if (
# "Overlong sequences" are those that are syntactically
# correct but use more UTF-8 bytes than are necessary to
# encode a character. Naïve string comparisons can be
# tricked into failing to see a match for an ASCII
# character, for instance, which can be a security hole
# if blacklist checks are being used.
( $n < 0xc2 && $sequence <= UTF8_OVERLONG_A )
|| ( $n == 0xe0 && $sequence <= UTF8_OVERLONG_B )
|| ( $n == 0xf0 && $sequence <= UTF8_OVERLONG_C )
# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
# NOTE(review): && binds tighter than ||, so the next three
# lines group as ( $n == 0xef && FFFE ) || FFFF. Presumably
# harmless if UTF8_FFFF (defined elsewhere) itself begins
# with 0xef -- confirm.
|| ( $n == 0xef &&
( $sequence == UTF8_FFFE )
|| ( $sequence == UTF8_FFFF ) )
# Unicode has been limited to 21 bits; longer
# sequences are not allowed.
|| ( $n >= 0xf0 && $sequence > UTF8_MAX )
) {
$replace[] = array( UTF8_REPLACEMENT,
$base + $i + 1 - strlen( $sequence ),
strlen( $sequence ) );
$head = '';
continue;
}
}
}
if ( isset( $utfCheckOrCombining[$sequence] ) ) {
# If it's NO or MAYBE, we'll have to rip
# the string apart and put it back together.
# That's going to be mighty slow.
$looksNormal = false;
}
# The sequence is legal!
$head = '';
} elseif ( $c < "\x80" ) {
# ASCII byte.
$head = '';
} elseif ( $c < "\xc0" ) {
# Illegal tail bytes
if ( $head == '' ) {
# Out of the blue!
$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
} else {
# Don't add if we're continuing a broken sequence;
# we already put a replacement character when we looked
# at the broken sequence.
$replace[] = array( '', $base + $i, 1 );
}
} else {
# Miscellaneous freaks.
$replace[] = array( UTF8_REPLACEMENT, $base + $i, 1 );
$head = '';
}
}
$base += $chunk;
}
if ( count( $replace ) ) {
# There were illegal UTF-8 sequences we need to fix up.
# The fix-ups were recorded in increasing offset order, so the output is
# rebuilt left to right: untouched text up to each fix-up, then its
# replacement.
$out = '';
$last = 0;
foreach ( $replace as $rep ) {
list( $replacement, $start, $length ) = $rep;
if ( $last < $start ) {
$out .= substr( $string, $last, $start - $last );
}
$out .= $replacement;
$last = $start + $length;
}
if ( $last < strlen( $string ) ) {
$out .= substr( $string, $last );
}
$string = $out;
}
return $looksNormal;
}
# These take a string and run the normalization on them, without
# checking for validity or any optimization etc. Input must be
# VALID UTF-8!
/**
 * Canonical composition: decompose to form D, then recompose.
 * No validation or shortcut checks are done; input must be valid UTF-8.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFC( $string ) {
	$decomposed = self::NFD( $string );
	return self::fastCompose( $decomposed );
}
/**
 * Canonical decomposition: expand with the canonical decomposition map,
 * then put combining characters into canonical order.
 * No validation or shortcut checks are done; input must be valid UTF-8.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFD( $string ) {
	// The canonical map is only populated once the data tables are loaded.
	self::loadData();
	$expanded = self::fastDecompose( $string, self::$utfCanonicalDecomp );
	return self::fastCombiningSort( $expanded );
}
/**
 * Compatibility composition: decompose to form KD, then recompose.
 * No validation or shortcut checks are done; input must be valid UTF-8.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFKC( $string ) {
	$decomposed = self::NFKD( $string );
	return self::fastCompose( $decomposed );
}
/**
 * Compatibility decomposition: expand with the compatibility decomposition
 * map, then put combining characters into canonical order.
 * No validation or shortcut checks are done; input must be valid UTF-8.
 *
 * The compatibility map is loaded on first use only.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFKD( $string ) {
	if ( !isset( self::$utfCompatibilityDecomp ) ) {
		// Anchor the path to this file's directory, as loadData() does for
		// UtfNormalData.inc, so the lookup does not depend on include_path
		// or the current working directory.
		require_once __DIR__ . '/UtfNormalDataK.inc';
	}
	return self::fastCombiningSort(
		self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
}
/**
 * Decompose a UTF-8 string into D or KD form; which one depends on the
 * decomposition map supplied by the caller.
 * The input is assumed to be *valid* UTF-8. Invalid input will break.
 * @private
 * @param string $string valid UTF-8 string
 * @param array $map hash of expanded decomposition map
 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 */
static function fastDecompose( $string, $map ) {
	self::loadData();
	$result = '';
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$lead = ord( $string[$pos] );
		if ( $lead < 0x80 ) {
			# ASCII never decomposes; copy it straight through.
			$result .= $string[$pos];
			$pos++;
			continue;
		}

		# The head byte tells how many bytes make up this character.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		$char = substr( $string, $pos, $width );
		$pos += $width;

		if ( isset( $map[$char] ) ) {
			$result .= $map[$char];
			continue;
		}
		if ( $char < UTF8_HANGUL_FIRST || $char > UTF8_HANGUL_LAST ) {
			# Neither mapped nor a hangul syllable: keep as is.
			$result .= $char;
			continue;
		}

		# Decompose a hangul syllable into jamo. This is hardcoded for a
		# three-byte UTF-8 sequence; a lookup table would be slightly
		# faster but costs a lot of memory and disk.
		$codepoint = ( ord( $char[0] ) & 0x0f ) << 12
			| ( ord( $char[1] ) & 0x3f ) << 6
			| ( ord( $char[2] ) & 0x3f );
		$index = $codepoint - UNICODE_HANGUL_FIRST;
		$l = intval( $index / UNICODE_HANGUL_NCOUNT );
		$v = intval( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
		$t = $index % UNICODE_HANGUL_TCOUNT;

		$result .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
		if ( $t >= 25 ) {
			$result .= "\xe1\x87" . chr( 0x80 + $t - 25 );
		} elseif ( $t ) {
			$result .= "\xe1\x86" . chr( 0xa7 + $t );
		}
	}
	return $result;
}
/**
 * Put each run of combining characters into canonical order, the last step
 * in producing decomposed normal forms D and KD.
 * @private
 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 * @return string a UTF-8 string with combining characters sorted in canonical order
 */
static function fastCombiningSort( $string ) {
	self::loadData();
	$sorted = '';
	# Combining characters seen since the last non-combining one, bucketed
	# by combining class.
	$pending = array();
	$prevClass = -1;
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$lead = ord( $string[$pos] );
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		$char = substr( $string, $pos, $width );
		$pos += $width;

		# Only non-ASCII characters are looked up in the class table.
		if ( $lead >= 0x80 && isset( self::$utfCombiningClass[$char] ) ) {
			$prevClass = self::$utfCombiningClass[$char];
			$soFar = isset( $pending[$prevClass] ) ? $pending[$prevClass] : '';
			$pending[$prevClass] = $soFar . $char;
			continue;
		}

		# A non-combining character ends the run: flush it in class order.
		if ( $prevClass ) {
			ksort( $pending );
			$sorted .= implode( '', $pending );
			$pending = array();
		}
		$sorted .= $char;
		$prevClass = 0;
	}
	if ( $prevClass ) {
		ksort( $pending );
		$sorted .= implode( '', $pending );
	}
	return $sorted;
}
/**
 * Produces canonically composed sequences, i.e. normal form C or KC.
 *
 * Walks the string one character at a time, keeping the current start
 * character and the combining characters that could not be merged into it;
 * both are emitted whenever a new start character begins. Pairs are merged
 * through the canonical composition table, and hangul jamo are merged with
 * hardcoded three-byte UTF-8 arithmetic.
 *
 * @private
 * @param string $string a valid UTF-8 string in sorted normal form D or KD.
 * Input is not validated.
 * @return string a UTF-8 string with canonical precomposed characters used
 * where possible.
 */
static function fastCompose( $string ) {
UtfNormal::loadData();
$len = strlen( $string );
$out = '';
$lastClass = -1;
$lastHangul = 0;
$startChar = '';
$combining = '';
# First bytes of the lowest and highest jamo constants used below; they
# serve as a cheap head-byte pre-filter before the full string comparisons.
$x1 = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
$x2 = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );
for ( $i = 0; $i < $len; $i++ ) {
$c = $string[$i];
$n = ord( $c );
if ( $n < 0x80 ) {
# No combining characters here...
$out .= $startChar;
$out .= $combining;
$startChar = $c;
$combining = '';
$lastClass = 0;
continue;
} elseif ( $n >= 0xf0 ) {
$c = substr( $string, $i, 4 );
$i += 3;
} elseif ( $n >= 0xe0 ) {
$c = substr( $string, $i, 3 );
$i += 2;
} elseif ( $n >= 0xc0 ) {
$c = substr( $string, $i, 2 );
$i++;
}
$pair = $startChar . $c;
if ( $n > 0x80 ) {
if ( isset( self::$utfCombiningClass[$c] ) ) {
# A combining char; see what we can do with it
$class = self::$utfCombiningClass[$c];
# NOTE(review): empty() is also true for the string '0', so a '0'
# start character never takes the composition branch; presumably no
# canonical composition begins with '0' -- confirm.
if ( !empty( $startChar ) &&
$lastClass < $class &&
$class > 0 &&
isset( self::$utfCanonicalComp[$pair] )
) {
$startChar = self::$utfCanonicalComp[$pair];
$class = 0;
} else {
$combining .= $c;
}
$lastClass = $class;
$lastHangul = 0;
continue;
}
}
# New start char
if ( $lastClass == 0 ) {
if ( isset( self::$utfCanonicalComp[$pair] ) ) {
$startChar = self::$utfCanonicalComp[$pair];
$lastHangul = 0;
continue;
}
if ( $n >= $x1 && $n <= $x2 ) {
# WARNING: Hangul code is painfully slow.
# I apologize for this ugly, ugly code; however
# performance is even more teh suck if we call
# out to nice clean functions. Lookup tables are
# marginally faster, but require a lot of space.
#
if ( $c >= UTF8_HANGUL_VBASE &&
$c <= UTF8_HANGUL_VEND &&
$startChar >= UTF8_HANGUL_LBASE &&
$startChar <= UTF8_HANGUL_LEND
) {
#
#$lIndex = utf8ToCodepoint( $startChar ) - UNICODE_HANGUL_LBASE;
#$vIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_VBASE;
$lIndex = ord( $startChar[2] ) - 0x80;
$vIndex = ord( $c[2] ) - 0xa1;
$hangulPoint = UNICODE_HANGUL_FIRST +
UNICODE_HANGUL_TCOUNT *
( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );
# Hardcode the limited-range UTF-8 conversion:
$startChar = chr( $hangulPoint >> 12 & 0x0f | 0xe0 ) .
chr( $hangulPoint >> 6 & 0x3f | 0x80 ) .
chr( $hangulPoint & 0x3f | 0x80 );
$lastHangul = 0;
continue;
} elseif ( $c >= UTF8_HANGUL_TBASE &&
$c <= UTF8_HANGUL_TEND &&
$startChar >= UTF8_HANGUL_FIRST &&
$startChar <= UTF8_HANGUL_LAST &&
!$lastHangul
) {
# $tIndex = utf8ToCodepoint( $c ) - UNICODE_HANGUL_TBASE;
$tIndex = ord( $c[2] ) - 0xa7;
# A negative value means the last byte is below 0xa7; the index is then
# recomputed from 0x80 with a fixed offset of 0x11c0 - 0x11a7.
if ( $tIndex < 0 ) $tIndex = ord( $c[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
# Increment the code point by $tIndex, without
# the function overhead of decoding and recoding UTF-8
#
$tail = ord( $startChar[2] ) + $tIndex;
# Carry by hand: a tail byte above 0xbf wraps by 0x40 and bumps the
# middle byte, which in turn may wrap and bump the head byte.
if ( $tail > 0xbf ) {
$tail -= 0x40;
$mid = ord( $startChar[1] ) + 1;
if ( $mid > 0xbf ) {
$startChar[0] = chr( ord( $startChar[0] ) + 1 );
$mid -= 0x40;
}
$startChar[1] = chr( $mid );
}
$startChar[2] = chr( $tail );
# If there's another jamo char after this, *don't* try to merge it.
$lastHangul = 1;
continue;
}
}
}
# Nothing could be merged: emit what has been collected and start over
# with $c as the new start character.
$out .= $startChar;
$out .= $combining;
$startChar = $c;
$combining = '';
$lastClass = 0;
$lastHangul = 0;
}
$out .= $startChar . $combining;
return $out;
}
/**
 * Benchmark baseline only: measures how long it takes to iterate through a
 * string byte by byte without doing anything of substance.
 * @param $string string
 * @return string a byte-for-byte copy of the input
 */
static function placebo( $string ) {
	$copy = '';
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$copy .= $string[$pos];
		$pos++;
	}
	return $copy;
}
/**
 * Function to replace some characters that we don't want
 * but most of the native normalize functions keep.
 *
 * Every byte in \x00-\x08, \x0b, \x0c and \x0e-\x1f, and every occurrence
 * of UTF8_FFFE or UTF8_FFFF, is replaced with UTF8_REPLACEMENT.
 *
 * @param string $string The string
 * @return String String with the character codes replaced.
 */
private static function replaceForNativeNormalize( $string ) {
	$string = preg_replace(
		'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
		UTF8_REPLACEMENT,
		$string );
	# The replacement never contains either search sequence, so doing
	# both substitutions in one call is equivalent to doing them in turn.
	$string = str_replace( array( UTF8_FFFE, UTF8_FFFF ), UTF8_REPLACEMENT, $string );
	return $string;
}
}

View file

@ -1,105 +0,0 @@
<?php
/**
* Approximate benchmark for some basic operations.
*
* Copyright © 2004 Brion Vibber <brion@pobox.com>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
# This is a command-line tool only; refuse to run under any other SAPI.
if ( PHP_SAPI != 'cli' ) {
die( "Run me from the command line please.\n" );
}
# Passing --icu on the command line loads the php_utfnormal.so extension
# before the normalizer classes are included.
if ( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
dl( 'php_utfnormal.so' );
}
require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
# Number of timed runs per operation; benchmarkForm() reports the fastest.
define( 'BENCH_CYCLES', 5 );
# Sample file path => human readable description printed with the results.
$testfiles = array(
'testdata/washington.txt' => 'English text',
'testdata/berlin.txt' => 'German text',
'testdata/bulgakov.txt' => 'Russian text',
'testdata/tokyo.txt' => 'Japanese text',
'testdata/young.txt' => 'Korean text'
);
$normalizer = new UtfNormal;
# Load the data tables up front, before any timed run starts.
UtfNormal::loadData();
foreach ( $testfiles as $file => $desc ) {
benchmarkTest( $normalizer, $file, $desc );
}
# -------
/**
 * Run every configured normalization operation over one sample file and
 * print a timing line for each (see benchmarkForm()).
 *
 * An entry of $forms is either a method name, run on the raw file
 * contents, or an array of method names run as a pipeline where each
 * step receives the previous step's output.
 *
 * @param UtfNormal $u Normalizer instance whose methods are invoked
 * @param string $filename Path of the sample text to read
 * @param string $desc Human readable description of the sample
 */
function benchmarkTest( &$u, $filename, $desc ) {
	print "Testing $filename ($desc)...\n";
	$data = file_get_contents( $filename );
	if ( $data === false ) {
		# Nothing to measure; don't feed `false` to the normalizer.
		print "Can't read $filename; skipping.\n";
		return;
	}
	$forms = array(
		# 'placebo',
		'cleanUp',
		'toNFC',
		# 'toNFKC',
		# 'toNFD', 'toNFKD',
		'NFC',
		# 'NFKC',
		# 'NFD', 'NFKD',
		array( 'fastDecompose', 'fastCombiningSort', 'fastCompose' ),
		# 'quickIsNFC', 'quickIsNFCVerify',
	);
	foreach ( $forms as $form ) {
		if ( is_array( $form ) ) {
			$str = $data;
			foreach ( $form as $step ) {
				$str = benchmarkForm( $u, $str, $step );
			}
		} else {
			benchmarkForm( $u, $data, $form );
		}
	}
}
/**
 * Time one normalizer method over $data and print a one-line report.
 *
 * The method is run BENCH_CYCLES times (at least once) and the fastest
 * run is reported, together with the throughput and whether the output
 * differs from the input.
 *
 * @param object $u Object on which the method named $form is called
 * @param string $data Input text (taken by reference, not modified here)
 * @param string $form Name of the method to call on $u
 * @return mixed Whatever the last call to $u->$form() returned
 */
function benchmarkForm( &$u, &$data, $form ) {
	$cycles = max( 1, BENCH_CYCLES );
	$deltas = array();
	for ( $i = 0; $i < $cycles; $i++ ) {
		$start = microtime( true );
		$out = $u->$form( $data, UtfNormal::$utfCanonicalDecomp );
		$deltas[] = ( microtime( true ) - $start );
	}
	$delta = min( $deltas ); # Take shortest time
	# A run can finish below the timer resolution; report a rate of 0
	# rather than dividing by zero.
	$rate = $delta > 0 ? intval( strlen( $data ) / $delta ) : 0;
	$same = ( 0 == strcmp( $data, $out ) );
	printf( " %20s %6.1fms %12s bytes/s (%s)\n",
		$form,
		$delta * 1000.0,
		number_format( $rate ),
		( $same ? 'no change' : 'changed' ) );
	return $out;
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1,10 +1,8 @@
<?php
/**
* Some constant definitions for the unicode normalization module.
*
* Note: these constants must all be resolvable at compile time by HipHop,
* since this file will not be executed during request startup for a compiled
* MediaWiki.
* Backwards-compatibility constants which are now provided by the
* UtfNormal library. They are hardcoded here since they are needed
* before the composer autoloader is initialized.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -25,53 +23,164 @@
* @ingroup UtfNormal
*/
# --- Numeric Unicode code points -------------------------------------
# Bounds, base values and counts for Hangul syllables and jamo, given as
# integer code points.
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_FIRST', 0xac00 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_LAST', 0xd7a3 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_LBASE', 0x1100 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_VBASE', 0x1161 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_TBASE', 0x11a7 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_LCOUNT', 19 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_VCOUNT', 21 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_TCOUNT', 28 );
# Derived values: NCOUNT is VCOUNT * TCOUNT, and each *END constant is
# the last code point of its range (base + count - 1).
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_NCOUNT', UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_LEND', UNICODE_HANGUL_LBASE + UNICODE_HANGUL_LCOUNT - 1 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_VEND', UNICODE_HANGUL_VBASE + UNICODE_HANGUL_VCOUNT - 1 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_HANGUL_TEND', UNICODE_HANGUL_TBASE + UNICODE_HANGUL_TCOUNT - 1 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_SURROGATE_FIRST', 0xd800 );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_SURROGATE_LAST', 0xdfff );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_MAX', 0x10ffff );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UNICODE_REPLACEMENT', 0xfffd );
# --- UTF-8 byte strings ------------------------------------------------
# Hardcoded UTF-8 encodings of the code points above. The trailing
# comment on each line shows the codepointToUtf8() call that would
# produce the same bytes.
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HANGUL_FIRST', "\xea\xb0\x80" /*codepointToUtf8( UNICODE_HANGUL_FIRST )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HANGUL_LAST', "\xed\x9e\xa3" /*codepointToUtf8( UNICODE_HANGUL_LAST )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HANGUL_LBASE', "\xe1\x84\x80" /*codepointToUtf8( UNICODE_HANGUL_LBASE )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HANGUL_VBASE', "\xe1\x85\xa1" /*codepointToUtf8( UNICODE_HANGUL_VBASE )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HANGUL_TBASE', "\xe1\x86\xa7" /*codepointToUtf8( UNICODE_HANGUL_TBASE )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HANGUL_LEND', "\xe1\x84\x92" /*codepointToUtf8( UNICODE_HANGUL_LEND )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HANGUL_VEND', "\xe1\x85\xb5" /*codepointToUtf8( UNICODE_HANGUL_VEND )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HANGUL_TEND', "\xe1\x87\x82" /*codepointToUtf8( UNICODE_HANGUL_TEND )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_SURROGATE_FIRST', "\xed\xa0\x80" /*codepointToUtf8( UNICODE_SURROGATE_FIRST )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_SURROGATE_LAST', "\xed\xbf\xbf" /*codepointToUtf8( UNICODE_SURROGATE_LAST )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_MAX', "\xf4\x8f\xbf\xbf" /*codepointToUtf8( UNICODE_MAX )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_REPLACEMENT', "\xef\xbf\xbd" /*codepointToUtf8( UNICODE_REPLACEMENT )*/ );
# Disabled alternative replacement value ('!'), presumably kept for
# debugging; it is not active.
#define( 'UTF8_REPLACEMENT', '!' );
# Overlong encodings: the 2-, 3- and 4-byte forms below encode U+007F,
# U+07FF and U+FFFF respectively, each using one byte more than the
# shortest valid form.
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_OVERLONG_A', "\xc1\xbf" );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_OVERLONG_B', "\xe0\x9f\xbf" );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_OVERLONG_C', "\xf0\x8f\xbf\xbf" );
# These two ranges are illegal: U+FDD0..U+FDEF and U+FFFE..U+FFFF.
# The constants hold the UTF-8 form of each range's end points.
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_FDD0', "\xef\xb7\x90" /*codepointToUtf8( 0xfdd0 )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_FDEF', "\xef\xb7\xaf" /*codepointToUtf8( 0xfdef )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_FFFE', "\xef\xbf\xbe" /*codepointToUtf8( 0xfffe )*/ );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_FFFF', "\xef\xbf\xbf" /*codepointToUtf8( 0xffff )*/ );
# Boolean markers. NOTE(review): presumably they distinguish a sequence's
# lead byte from its continuation bytes in the validator; the usage is
# not visible here, so confirm.
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_HEAD', false );
/**
* @deprecated since 1.25, use UtfNormal\Constants instead
*/
define( 'UTF8_TAIL', true );

View file

@ -1,250 +0,0 @@
<?php
/**
* This script generates UniNormalData.inc from the Unicode Character Database
* and supplementary files.
*
* Copyright (C) 2004 Brion Vibber <brion@pobox.com>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
# This is a command-line tool only; refuse to run under any other SAPI.
if ( PHP_SAPI != 'cli' ) {
	die( "Run me from the command line please.\n" );
}

require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';

# Step 1: build the NFC quick-check table from DerivedNormalizationProps.txt.
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if ( !$in ) {
	print "Can't open DerivedNormalizationProps.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
	exit( -1 );
}
print "Initializing normalization quick check tables...\n";
# UTF-8 character => NFC_QC value ('M' or 'N') as matched below.
$checkNFC = array();
while ( false !== ( $line = fgets( $in ) ) ) {
	$matches = array();
	# Matches "XXXX ; NFC_QC ; V" and "XXXX..YYYY ; NFC_QC ; V". The range
	# separator is two literal dots, so they must be escaped; unescaped they
	# would match any two characters.
	if ( preg_match(
		'/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/',
		$line,
		$matches )
	) {
		list( $junk, $first, $last, $prop, $value ) = $matches;
		#print "$first $last $prop $value\n";
		if ( !$last ) {
			# Single code point: treat it as a one-element range.
			$last = $first;
		}

		$lastInDecimal = hexdec( $last );
		for ( $i = hexdec( $first ); $i <= $lastInDecimal; $i++ ) {
			$char = codepointToUtf8( $i );
			$checkNFC[$char] = $value;
		}
	}
}
fclose( $in );
# Step 2: read the composition exclusion list. Characters listed here
# are later left out of the $canonicalComp table.
$in = fopen( "CompositionExclusions.txt", "rt" );
if ( !$in ) {
print "Can't open CompositionExclusions.txt for reading.\n";
print "If necessary, fetch this file from the internet:\n";
print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
exit( -1 );
}
# UTF-8 character => true for every excluded code point.
$exclude = array();
while ( false !== ( $line = fgets( $in ) ) ) {
# Only lines that start with a hex code point are data; anything else
# does not match and is ignored.
if ( preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
$codepoint = $matches[1];
$source = codepointToUtf8( hexdec( $codepoint ) );
$exclude[$source] = true;
}
}
fclose( $in );
# Step 3: read combining classes and decomposition mappings from
# UnicodeData.txt.
$in = fopen( "UnicodeData.txt", "rt" );
if ( !$in ) {
	print "Can't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit( -1 );
}

# All four tables are keyed by UTF-8 encoded characters.
$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while ( false !== ( $line = fgets( $in ) ) ) {
	$columns = explode( ';', $line );
	if ( count( $columns ) < 6 ) {
		# Blank or truncated line: the fields read below do not exist.
		continue;
	}
	$codepoint = $columns[0];
	$canonicalCombiningClass = $columns[3];
	$decompositionMapping = $columns[5];

	$source = codepointToUtf8( hexdec( $codepoint ) );

	# Only non-zero combining classes are stored.
	if ( $canonicalCombiningClass != 0 ) {
		$combiningClass[$source] = intval( $canonicalCombiningClass );
	}

	if ( $decompositionMapping === '' ) continue;
	if ( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
		# Compatibility decomposition
		$canonical = false;
		$decompositionMapping = $matches[2];
		$compat++;
	} else {
		$canonical = true;
		$canon++;
	}
	$total++;
	$dest = hexSequenceToUtf8( $decompositionMapping );

	# Every mapping goes into the compatibility table; canonical mappings
	# also feed the canonical tables.
	$compatibilityDecomp[$source] = $dest;
	if ( $canonical ) {
		$canonicalDecomp[$source] = $dest;
		if ( empty( $exclude[$source] ) ) {
			$canonicalComp[$dest] = $source;
		}
	}
}
fclose( $in );
# Step 4: expand the decomposition tables until they are fully recursive,
# i.e. until a pass over the table changes nothing.
print "Recursively expanding canonical mappings...\n";
# Any non-zero value works here; it only makes the loop run a first time.
$changed = 42;
$pass = 1;
while ( $changed > 0 ) {
print "pass $pass\n";
$changed = 0;
foreach ( $canonicalDecomp as $source => $dest ) {
# The pattern matches a lead byte followed by continuation bytes; each
# match is replaced by its own decomposition if it has one.
$newDest = preg_replace_callback(
'/([\xc0-\xff][\x80-\xbf]+)/',
'callbackCanonical',
$dest );
if ( $newDest === $dest ) continue;
$changed++;
$canonicalDecomp[$source] = $newDest;
}
$pass++;
}
# Same fixed-point expansion for the compatibility table.
print "Recursively expanding compatibility mappings...\n";
$changed = 42;
$pass = 1;
while ( $changed > 0 ) {
print "pass $pass\n";
$changed = 0;
foreach ( $compatibilityDecomp as $source => $dest ) {
$newDest = preg_replace_callback(
'/([\xc0-\xff][\x80-\xbf]+)/',
'callbackCompat',
$dest );
if ( $newDest === $dest ) continue;
$changed++;
$compatibilityDecomp[$source] = $newDest;
}
$pass++;
}
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";
# Step 5: write the tables out as PHP source. Each table is serialized,
# escaped for use inside a single-quoted string, and embedded in an
# unserialize() call in the generated file.
$out = fopen( "UtfNormalData.inc", "wt" );
if ( $out ) {
$serCombining = escapeSingleString( serialize( $combiningClass ) );
$serComp = escapeSingleString( serialize( $canonicalComp ) );
$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
# The opening tag is split as "<" . "?php" so it is not written as a
# literal PHP open tag inside this file.
$outdata = "<" . "?php
/**
* This file was automatically generated -- do not edit!
* Run UtfNormalGenerate.php to create this file again (make clean && make)
*
* @file
*/
// @codingStandardsIgnoreFile
UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
UtfNormal::\$utfCheckNFC = unserialize( '$serCheckNFC' );
\n";
fputs( $out, $outdata );
fclose( $out );
print "Wrote out UtfNormalData.inc\n";
} else {
print "Can't create file UtfNormalData.inc\n";
exit( -1 );
}
# The compatibility table goes into its own file, UtfNormalDataK.inc.
$out = fopen( "UtfNormalDataK.inc", "wt" );
if ( $out ) {
$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
$outdata = "<" . "?php
/**
* This file was automatically generated -- do not edit!
* Run UtfNormalGenerate.php to create this file again (make clean && make)
*
* @file
*/
// @codingStandardsIgnoreFile
UtfNormal::\$utfCompatibilityDecomp = unserialize( '$serCompat' );
\n";
fputs( $out, $outdata );
fclose( $out );
print "Wrote out UtfNormalDataK.inc\n";
# Both files were written: end the script with a success status.
exit( 0 );
} else {
print "Can't create file UtfNormalDataK.inc\n";
exit( -1 );
}
# ---------------
/**
 * preg_replace_callback() handler: substitute a matched character with
 * its entry in the global $canonicalDecomp table, if it has one.
 *
 * @param array $matches Regex match; index 1 holds the matched character
 * @return string The table entry, or the character itself when unmapped
 */
function callbackCanonical( $matches ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $canonicalDecomp;
	// @codingStandardsIgnoreEnd

	$char = $matches[1];
	return isset( $canonicalDecomp[$char] ) ? $canonicalDecomp[$char] : $char;
}
/**
 * preg_replace_callback() handler: substitute a matched character with
 * its entry in the global $compatibilityDecomp table, if it has one.
 *
 * @param array $matches Regex match; index 1 holds the matched character
 * @return string The table entry, or the character itself when unmapped
 */
function callbackCompat( $matches ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $compatibilityDecomp;
	// @codingStandardsIgnoreEnd

	$char = $matches[1];
	return isset( $compatibilityDecomp[$char] ) ? $compatibilityDecomp[$char] : $char;
}

View file

@ -1,107 +0,0 @@
<?php
/**
* Approximate benchmark for some basic operations.
* Runs large chunks of text through cleanup with a lowish memory limit,
* to test regression on mem usage (bug 28146)
*
* Copyright © 2004-2011 Brion Vibber <brion@wikimedia.org>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
# This is a command-line tool only; refuse to run under any other SAPI.
if ( PHP_SAPI != 'cli' ) {
die( "Run me from the command line please.\n" );
}
# Passing --icu on the command line loads the php_utfnormal.so extension
# before the normalizer classes are included.
if ( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
dl( 'php_utfnormal.so' );
}
require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
# A single timed run per operation.
define( 'BENCH_CYCLES', 1 );
# Minimum size the sample text is grown to before it is normalized.
define( 'BIGSIZE', 1024 * 1024 * 10 ); // 10m
# Deliberately tight memory ceiling: BIGSIZE plus 120 MiB.
ini_set( 'memory_limit', BIGSIZE + 120 * 1024 * 1024 );
# Sample file path => human readable description printed with the results.
$testfiles = array(
'testdata/washington.txt' => 'English text',
'testdata/berlin.txt' => 'German text',
'testdata/bulgakov.txt' => 'Russian text',
'testdata/tokyo.txt' => 'Japanese text',
'testdata/young.txt' => 'Korean text'
);
$normalizer = new UtfNormal;
# Load the data tables up front, before any timed run starts.
UtfNormal::loadData();
foreach ( $testfiles as $file => $desc ) {
benchmarkTest( $normalizer, $file, $desc );
}
# -------
/**
 * Grow one sample file to at least BIGSIZE bytes by repeated doubling,
 * then run each configured operation over it (see benchmarkForm()).
 *
 * @param UtfNormal $u Normalizer instance whose methods are invoked
 * @param string $filename Path of the sample text to read
 * @param string $desc Human readable description of the sample
 */
function benchmarkTest( &$u, $filename, $desc ) {
	print "Testing $filename ($desc)...\n";
	$data = file_get_contents( $filename );
	if ( $data === false || $data === '' ) {
		# Doubling an empty string never reaches BIGSIZE, so the loop
		# below would never terminate.
		print "Can't read any data from $filename; skipping.\n";
		return;
	}
	$all = $data;
	while ( strlen( $all ) < BIGSIZE ) {
		$all .= $all;
	}
	$data = $all;
	echo "Data is " . strlen( $data ) . " bytes.\n";
	$forms = array(
		'quickIsNFCVerify',
		'cleanUp',
	);
	foreach ( $forms as $form ) {
		if ( is_array( $form ) ) {
			# Pipeline entry: each step receives the previous step's output.
			$str = $data;
			foreach ( $form as $step ) {
				$str = benchmarkForm( $u, $str, $step );
			}
		} else {
			benchmarkForm( $u, $data, $form );
		}
	}
}
/**
 * Time one normalizer method over $data and print a one-line report.
 *
 * The method is run BENCH_CYCLES times (at least once) and the fastest
 * run is reported, together with the throughput and whether the output
 * differs from the input.
 *
 * @param object $u Object on which the method named $form is called
 * @param string $data Input text (taken by reference, not modified here)
 * @param string $form Name of the method to call on $u
 * @return mixed Whatever the last call to $u->$form() returned
 */
function benchmarkForm( &$u, &$data, $form ) {
	$cycles = max( 1, BENCH_CYCLES );
	$deltas = array();
	for ( $i = 0; $i < $cycles; $i++ ) {
		$start = microtime( true );
		$out = $u->$form( $data, UtfNormal::$utfCanonicalDecomp );
		$deltas[] = ( microtime( true ) - $start );
	}
	$delta = min( $deltas ); # Take shortest time
	# A run can finish below the timer resolution; report a rate of 0
	# rather than dividing by zero.
	$rate = $delta > 0 ? intval( strlen( $data ) / $delta ) : 0;
	$same = ( 0 == strcmp( $data, $out ) );
	printf( " %20s %6.1fms %12s bytes/s (%s)\n",
		$form,
		$delta * 1000.0,
		number_format( $rate ),
		( $same ? 'no change' : 'changed' ) );
	return $out;
}

View file

@ -1,275 +0,0 @@
#!/usr/bin/env php
<?php
/**
* Other tests for the unicode normalization module.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
# This is a command-line tool only; refuse to run under any other SAPI.
if ( PHP_SAPI != 'cli' ) {
die( "Run me from the command line please.\n" );
}
// From http://unicode.org/Public/UNIDATA/NormalizationTest.txt
$file = "NormalizationTest.txt";
// Anything after this character is a comment
define ( 'COMMENT', '#' );
// Semicolons are used to separate the columns
define ( 'SEPARATOR', ';' );
// $f is false when the file cannot be opened.
$f = fopen( $file, "r" );
/**
* The following section will be used for testing different normalization methods.
* - Pure PHP
* ~ no assertion errors
* ~ 6.25 minutes
* - php_utfnormal.so or intl extension: both are wrappers around
* libicu so we list the version of libicu when making the
* comparison
* - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
* ~ 2200 assertion errors
* ~ 5 seconds
* ~ output: http://paste2.org/p/921566
* - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
* ~ 1384 assertion errors
* ~ 15 seconds
* ~ output: http://paste2.org/p/921435
* - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
* ~ no assertion errors
* ~ 13 seconds
* - Tests comparing pure PHP output with libicu output were added
* later and slow down the runtime.
*/
require_once './UtfNormal.php';
/**
 * Thin wrappers giving each normalization form a plain function name:
 * every one passes $c to the matching UtfNormal::to*() method and
 * returns the result unchanged.
 */
function normalize_form_c( $c ) {
	$nfc = UtfNormal::toNFC( $c );
	return $nfc;
}
function normalize_form_d( $c ) {
	$nfd = UtfNormal::toNFD( $c );
	return $nfd;
}
function normalize_form_kc( $c ) {
	$nfkc = UtfNormal::toNFKC( $c );
	return $nfkc;
}
function normalize_form_kd( $c ) {
	$nfkd = UtfNormal::toNFKD( $c );
	return $nfkd;
}
/**
 * Variants of the wrappers above that pass "php" as a second argument to
 * the UtfNormal::to*() methods.
 *
 * This set of functions is only useful if you've added a param to those
 * methods to force pure PHP usage. I decided not to commit that code
 * since it might produce a slowdown in the UTF normalization code just
 * for the sake of these tests. -- hexmode
 * @return string
 */
function normalize_form_c_php( $c ) {
	$nfc = UtfNormal::toNFC( $c, "php" );
	return $nfc;
}
function normalize_form_d_php( $c ) {
	$nfd = UtfNormal::toNFD( $c, "php" );
	return $nfd;
}
function normalize_form_kc_php( $c ) {
	$nfkc = UtfNormal::toNFKC( $c, "php" );
	return $nfkc;
}
function normalize_form_kd_php( $c ) {
	$nfkd = UtfNormal::toNFKD( $c, "php" );
	return $nfkd;
}
# Turn assertions on, silence the built-in warning and eval errors, and
# route every failed assertion to my_assert() so it can report the input
# line that triggered it.
assert_options( ASSERT_ACTIVE, 1 );
assert_options( ASSERT_WARNING, 0 );
assert_options( ASSERT_QUIET_EVAL, 1 );
assert_options( ASSERT_CALLBACK, 'my_assert' );
/**
 * Assertion failure callback: prints the failed expression together with
 * the current input line number and the trailing comment column of the
 * current row, both read from globals.
 *
 * @param string $file Script file of the failed assertion (unused)
 * @param int $line Script line of the failed assertion (unused)
 * @param string $code The assertion expression that failed
 */
function my_assert( $file, $line, $code ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $col, $lineNo;
	// @codingStandardsIgnoreEnd

	echo "Assertion that '" . $code . "' failed on line " . $lineNo . " (" . $col[5] . ")\n";
}
$count = 0;
$lineNo = 0;

# Normalization forms, in the order they are cross-checked, mapped to the
# suffix of the matching normalize_form_<suffix>() and
# normalize_form_<suffix>_php() wrappers.
$formSuffixes = array( 'NFC' => 'c', 'NFD' => 'd', 'NFKD' => 'kd', 'NFKC' => 'kc' );

# Conformance rules: for each form, the 0-based index in $col holding
# the expected text => the 1-based test columns whose normalization must
# equal it.
#   c2 == NFC(c1) == NFC(c2) == NFC(c3);  c4 == NFC(c4) == NFC(c5)
#   c3 == NFD(c1) == NFD(c2) == NFD(c3);  c5 == NFD(c4) == NFD(c5)
#   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
#   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
$expectations = array(
	'NFC' => array( 1 => array( 1, 2, 3 ), 3 => array( 4, 5 ) ),
	'NFD' => array( 2 => array( 1, 2, 3 ), 4 => array( 4, 5 ) ),
	'NFKC' => array( 3 => array( 1, 2, 3, 4, 5 ) ),
	'NFKD' => array( 4 => array( 1, 2, 3, 4, 5 ) ),
);

if ( $f !== false ) {
	while ( ( $col = getRow( $f ) ) !== false ) {
		$lineNo++;

		# Only rows with five data columns plus the comment are test cases.
		if ( count( $col ) != 6 ) {
			continue;
		}
		$count++;
		if ( $count % 100 === 0 ) {
			echo "Count: $count\n";
		}

		# Failures are reported by calling my_assert() directly. String
		# assertions such as assert( '$a === $b' ) are no longer evaluated
		# by PHP 8, so they would silently pass.

		# verify that the pure PHP version is correct
		$normalized = array();
		foreach ( $formSuffixes as $form => $suffix ) {
			for ( $i = 1; $i <= 5; $i++ ) {
				$native = call_user_func( "normalize_form_$suffix", $col[$i - 1] );
				$purePhp = call_user_func( "normalize_form_{$suffix}_php", $col[$i - 1] );
				$normalized[$form][$i] = $native;
				if ( $native !== $purePhp ) {
					# Message has the same shape as before, e.g. '$NFCc1 === $NFCc1p'.
					my_assert( __FILE__, __LINE__,
						sprintf( '$%1$sc%2$d === $%1$sc%2$dp', $form, $i ) );
				}
			}
		}

		# verify the results against the expected columns
		foreach ( $expectations as $form => $byColumn ) {
			foreach ( $byColumn as $expectedCol => $sources ) {
				foreach ( $sources as $i ) {
					if ( $col[$expectedCol] !== $normalized[$form][$i] ) {
						# e.g. '$col[1] === $NFCc1'
						my_assert( __FILE__, __LINE__,
							sprintf( '$col[%1$d] === $%2$sc%3$d', $expectedCol, $form, $i ) );
					}
				}
			}
		}
	}
}
echo "done.\n";
/**
 * Encode a single code point as UTF-8.
 * Compare against http://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param int $c Code point
 * @return string|bool The 1-4 byte sequence, or false above 0x10FFFF
 */
function unichr( $c ) {
	if ( $c > 0x10FFFF ) {
		return false;
	}
	if ( $c <= 0x7F ) {
		return chr( $c );
	}

	// The low six bits always form the final continuation byte.
	$last = chr( 0x80 | ( $c & 0x3F ) );
	if ( $c <= 0x7FF ) {
		return chr( 0xC0 | ( $c >> 6 ) ) . $last;
	}

	// Bits 6-11 form the continuation byte before the last one.
	$penultimate = chr( 0x80 | ( ( $c >> 6 ) & 0x3F ) );
	if ( $c <= 0xFFFF ) {
		return chr( 0xE0 | ( $c >> 12 ) ) . $penultimate . $last;
	}

	return chr( 0xF0 | ( $c >> 18 ) )
		. chr( 0x80 | ( ( $c >> 12 ) & 0x3F ) )
		. $penultimate
		. $last;
}
/**
 * Convert a space separated list of hex code points into a string by
 * passing each value through unichr() and joining the results.
 *
 * @param string $c Hex code points separated by single spaces
 * @return string
 */
function unistr( $c ) {
	$chars = array();
	foreach ( explode( " ", $c ) as $hex ) {
		$chars[] = unichr( hexdec( $hex ) );
	}
	return implode( "", $chars );
}
/**
 * Read and parse the next line of the test data file.
 *
 * @param resource $f Open file handle
 * @return array|bool false at end of file; a one-element array holding
 *  the raw line when the line starts with COMMENT; otherwise the
 *  non-blank SEPARATOR-delimited fields, each converted by unistr(),
 *  followed by the trailing comment text ("" when there is none)
 */
function getRow( $f ) {
	$line = fgets( $f );
	if ( $line === false ) {
		return false;
	}
	$line = rtrim( $line );

	$commentAt = strpos( $line, COMMENT );
	if ( $commentAt === 0 ) {
		// Whole line is a comment: hand it back untouched.
		return array( $line );
	}

	$description = "";
	if ( $commentAt ) {
		// Prefer the text after the first ")" (skipping the character
		// that follows it); otherwise keep the comment from COMMENT on.
		$parenAt = strpos( $line, ")" );
		$description = $parenAt
			? substr( $line, $parenAt + 2 )
			: substr( $line, $commentAt );
		$line = substr( $line, 0, $commentAt );
	}

	$fields = array();
	foreach ( explode( SEPARATOR, $line ) as $cell ) {
		if ( trim( $cell ) === "" ) {
			continue;
		}
		$fields[] = unistr( $cell );
	}
	$fields[] = $description;

	return $fields;
}

View file

@ -25,6 +25,8 @@
* @ingroup UtfNormal
*/
use Utfnormal\Utils;
/**
* Return UTF-8 sequence for a given Unicode code point.
*
@ -32,31 +34,10 @@
* @return String
* @throws InvalidArgumentException if fed out of range data.
* @public
* @deprecated since 1.25, use UtfNormal\Utils directly
*/
function codepointToUtf8( $codepoint ) {
	# Negative values are out of range too; without this check chr()
	# would wrap them into a stray byte instead of failing.
	if ( $codepoint < 0 ) {
		throw new InvalidArgumentException( "Asked for code outside of range ($codepoint)" );
	}
	# One byte: 0xxxxxxx
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}
	# Two bytes: 110xxxxx 10xxxxxx
	if ( $codepoint < 0x800 ) {
		return chr( $codepoint >> 6 & 0x3f | 0xc0 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	# Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
	if ( $codepoint < 0x10000 ) {
		return chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
			chr( $codepoint >> 6 & 0x3f | 0x80 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	# Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	if ( $codepoint < 0x110000 ) {
		return chr( $codepoint >> 18 & 0x07 | 0xf0 ) .
			chr( $codepoint >> 12 & 0x3f | 0x80 ) .
			chr( $codepoint >> 6 & 0x3f | 0x80 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}

	throw new InvalidArgumentException( "Asked for code outside of range ($codepoint)" );
}
/**
@ -68,21 +49,17 @@ function codepointToUtf8( $codepoint ) {
* @return String
* @throws InvalidArgumentException if fed out of range data.
* @private
* @deprecated since 1.25, use UtfNormal\Utils directly
*/
function hexSequenceToUtf8( $sequence ) {
	# Each space separated token is one hex code point; encode every
	# token and concatenate the results in order.
	$utf = '';
	foreach ( explode( ' ', $sequence ) as $hex ) {
		$n = hexdec( $hex );
		$utf .= codepointToUtf8( $n );
	}

	return $utf;
}
/**
* Take a UTF-8 string and return a space-separated series of hex
* numbers representing Unicode code points. For debugging.
*
* @fixme this is private but extensions + maint scripts are using it
* @param string $str UTF-8 string.
* @return string
* @private
@ -90,7 +67,7 @@ function hexSequenceToUtf8( $sequence ) {
function utf8ToHexSequence( $str ) {
$buf = '';
foreach ( preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY ) as $cp ) {
$buf .= sprintf( '%04x ', utf8ToCodepoint( $cp ) );
$buf .= sprintf( '%04x ', UtfNormal\Utils::utf8ToCodepoint( $cp ) );
}
return rtrim( $buf );
@ -103,39 +80,10 @@ function utf8ToHexSequence( $str ) {
* @param $char String
* @return Integer
* @public
* @deprecated since 1.25, use UtfNormal\Utils directly
*/
function utf8ToCodepoint( $char ) {
	# An empty string holds no character. Return false up front instead
	# of reading offset 0 of an empty string below.
	if ( $char === '' ) {
		return false;
	}

	# Find the length: the number of leading 1 bits in the first byte
	$z = ord( $char[0] );
	if ( $z & 0x80 ) {
		$length = 0;
		while ( $z & 0x80 ) {
			$length++;
			$z <<= 1;
		}
	} else {
		$length = 1;
	}

	# The input must be exactly one complete character
	if ( $length != strlen( $char ) ) {
		return false;
	}

	if ( $length == 1 ) {
		return ord( $char );
	}

	# Mask off the length-determining bits and shift back to the original location
	$z &= 0xff;
	$z >>= $length;

	# Add in the free bits from subsequent bytes
	for ( $i = 1; $i < $length; $i++ ) {
		$z <<= 6;
		$z |= ord( $char[$i] ) & 0x3f;
	}

	return $z;
}
/**
@ -144,11 +92,8 @@ function utf8ToCodepoint( $char ) {
* @param string $string string to be escaped.
* @return String: escaped string.
* @public
* @deprecated since 1.25, use UtfNormal\Utils directly
*/
function escapeSingleString( $string ) {
	# Make the text safe inside a single-quoted PHP string literal:
	# backslashes are doubled and single quotes get a leading backslash.
	# strtr() does both in one pass, so an inserted backslash is never
	# escaped again.
	return strtr( $string,
		array(
			'\\' => '\\\\',
			'\'' => '\\\''
		) );
}

View file

@ -319,7 +319,7 @@ EOR;
function pageTextCallback( $matches ) {
	# Get rid of invalid UTF-8, strip control characters.
	# The text is cleaned and HTML-escaped once; a second call whose
	# result was overwritten immediately has been removed.
	$val = htmlspecialchars( UtfNormal\Validator::cleanUp( stripcslashes( $matches[1] ) ) );
	# Newlines become a character reference so they survive inside the
	# attribute value; the second search string is dropped entirely.
	# NOTE(review): '<27>' looks like a mangled rendering of a single
	# character in this view -- confirm against the real file.
	$val = str_replace( array( "\n", '<27>' ), array( '&#10;', '' ), $val );
	return '<PAGE value="' . $val . '" />';
}

View file

@ -477,7 +477,7 @@ class Exif {
} else {
// if valid utf-8, assume that, otherwise assume windows-1252
$valCopy = $val;
UtfNormal::quickIsNFCVerify( $valCopy ); //validates $valCopy.
UtfNormal\Validator::quickIsNFCVerify( $valCopy ); //validates $valCopy.
if ( $valCopy !== $val ) {
wfSuppressWarnings();
$val = iconv( 'Windows-1252', 'UTF-8//IGNORE', $val );

View file

@ -158,7 +158,7 @@ class GIFMetadataExtractor {
// assume its that, otherwise assume its windows-1252 (iso-8859-1)
$dataCopy = $data;
// quickIsNFCVerify has the side effect of replacing any invalid characters
UtfNormal::quickIsNFCVerify( $dataCopy );
UtfNormal\Validator::quickIsNFCVerify( $dataCopy );
if ( $dataCopy !== $data ) {
wfSuppressWarnings();

View file

@ -456,7 +456,7 @@ class IPTC {
//treat as utf-8 if is valid utf-8. otherwise pretend its windows-1252
// most of the time if there is no 1:90 tag, it is either ascii, latin1, or utf-8
$oldData = $data;
UtfNormal::quickIsNFCVerify( $data ); //make $data valid utf-8
UtfNormal\Validator::quickIsNFCVerify( $data ); //make $data valid utf-8
if ( $data === $oldData ) {
return $data; //if validation didn't change $data
} else {

View file

@ -98,7 +98,7 @@ class JpegMetadataExtractor {
// First see if valid utf-8,
// if not try to convert it to windows-1252.
$com = $oldCom = trim( self::jpegExtractMarker( $fh ) );
UtfNormal::quickIsNFCVerify( $com );
UtfNormal\Validator::quickIsNFCVerify( $com );
// turns $com to valid utf-8.
// thus if no change, its utf-8, otherwise its something else.
if ( $com !== $oldCom ) {
@ -108,7 +108,7 @@ class JpegMetadataExtractor {
}
// Try it again, if its still not a valid string, then probably
// binary junk or some really weird encoding, so don't extract.
UtfNormal::quickIsNFCVerify( $com );
UtfNormal\Validator::quickIsNFCVerify( $com );
if ( $com === $oldCom ) {
$segments["COM"][] = $oldCom;
} else {

View file

@ -92,7 +92,7 @@ class Preprocessor_DOM implements Preprocessor {
wfRestoreWarnings();
if ( !$result ) {
// Try running the XML through UtfNormal to get rid of invalid characters
$xml = UtfNormal::cleanUp( $xml );
$xml = UtfNormal\Validator::cleanUp( $xml );
// 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2
// don't barf when the XML is >256 levels deep
$result = $dom->loadXML( $xml, 1 << 19 );
@ -191,7 +191,7 @@ class Preprocessor_DOM implements Preprocessor {
wfRestoreWarnings();
if ( !$result ) {
// Try running the XML through UtfNormal to get rid of invalid characters
$xml = UtfNormal::cleanUp( $xml );
$xml = UtfNormal\Validator::cleanUp( $xml );
// 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2
// don't barf when the XML is >256 levels deep.
$result = $dom->loadXML( $xml, 1 << 19 );

View file

@ -116,7 +116,7 @@ class MediaWikiSite extends Site {
// Make sure the string is normalized into NFC (due to the bug 40017)
// but do nothing to the whitespaces, that should work appropriately.
// @see https://bugzilla.wikimedia.org/show_bug.cgi?id=40017
$pageName = UtfNormal::cleanUp( $pageName );
$pageName = UtfNormal\Validator::cleanUp( $pageName );
// Build the args for the specific call
$args = array(

View file

@ -230,7 +230,7 @@ class MediaWikiTitleCodec implements TitleFormatter, TitleParser {
);
$dbkey = trim( $dbkey, '_' );
if ( strpos( $dbkey, UTF8_REPLACEMENT ) !== false ) {
if ( strpos( $dbkey, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
# Contained illegal UTF-8 sequences or forbidden Unicode chars.
throw new MalformedTitleException( 'Bad UTF-8 sequences found in title: ' . $text );
}

View file

@ -2945,7 +2945,7 @@ class Language {
}
// Break down Hangul syllables to grab the first jamo
$code = utf8ToCodepoint( $matches[1] );
$code = UtfNormal\Utils::utf8ToCodepoint( $matches[1] );
if ( $code < 0xac00 || 0xd7a4 <= $code ) {
return $matches[1];
} elseif ( $code < 0xb098 ) {
@ -3037,7 +3037,7 @@ class Language {
*/
function normalize( $s ) {
global $wgAllUnicodeFixes;
$s = UtfNormal::cleanUp( $s );
$s = UtfNormal\Validator::cleanUp( $s );
if ( $wgAllUnicodeFixes ) {
$s = $this->transformUsingPairFile( 'normalize-ar.ser', $s );
$s = $this->transformUsingPairFile( 'normalize-ml.ser', $s );

View file

@ -323,7 +323,7 @@ class GenerateCollationData extends Maintenance {
$tertiaryCollator->sort( $x );
$cp = $x[0];
$char = codepointToUtf8( $cp );
$char = UtfNormal\Utils::codepointToUtf8( $cp );
$headerChars[] = $char;
if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
$numOutOfOrder++;
@ -337,7 +337,7 @@ class GenerateCollationData extends Maintenance {
if ( $this->debugOutFile ) {
fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
implode( ' ', array_map( 'codepointToUtf8', $group ) ) ) );
implode( ' ', array_map( 'UtfNormal\Utils::codepointToUtf8', $group ) ) ) );
}
}

View file

@ -117,8 +117,8 @@ class GenerateNormalizerDataAr extends Maintenance {
continue;
}
$source = hexSequenceToUtf8( $data['Code'] );
$dest = hexSequenceToUtf8( $m[2] );
$source = UtfNormal\Utils::hexSequenceToUtf8( $data['Code'] );
$dest = UtfNormal\Utils::hexSequenceToUtf8( $m[2] );
$pairs[$source] = $dest;
}
}

View file

@ -54,8 +54,8 @@ class GenerateNormalizerDataMl extends Maintenance {
$pairs = array();
foreach ( $hexPairs as $hexSource => $hexDest ) {
$source = hexSequenceToUtf8( $hexSource );
$dest = hexSequenceToUtf8( $hexDest );
$source = UtfNormal\Utils::hexSequenceToUtf8( $hexSource );
$dest = UtfNormal\Utils::hexSequenceToUtf8( $hexDest );
$pairs[$source] = $dest;
}

View file

@ -108,12 +108,12 @@ class GenerateUtf8Case extends Maintenance {
$data[$name] = $numberedData[$number];
}
$source = hexSequenceToUtf8( $data['Code'] );
$source = UtfNormal\Utils::hexSequenceToUtf8( $data['Code'] );
if ( $data['Simple_Uppercase_Mapping'] ) {
$upper[$source] = hexSequenceToUtf8( $data['Simple_Uppercase_Mapping'] );
$upper[$source] = UtfNormal\Utils::hexSequenceToUtf8( $data['Simple_Uppercase_Mapping'] );
}
if ( $data['Simple_Lowercase_Mapping'] ) {
$lower[$source] = hexSequenceToUtf8( $data['Simple_Lowercase_Mapping'] );
$lower[$source] = UtfNormal\Utils::hexSequenceToUtf8( $data['Simple_Lowercase_Mapping'] );
}
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -79,7 +79,7 @@ class SanitizerTest extends MediaWikiTestCase {
*/
public function testInvalidNumberedEntities() {
$this->assertEquals(
UTF8_REPLACEMENT,
UtfNormal\Constants::UTF8_REPLACEMENT,
Sanitizer::decodeCharReferences( "&#88888888888888;" ),
'Invalid numbered entity'
);

View file

@ -1,394 +0,0 @@
<?php
/**
* Tests for UtfNormal::cleanUp() function.
*
* Copyright © 2004 Brion Vibber <brion@pobox.com>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
*/
/**
* Additional tests for UtfNormal::cleanUp() function, inclusion
* regression checks for known problems.
* Requires PHPUnit.
*
* @ingroup UtfNormal
* @group Large
*
* @todo covers tags, will be UtfNormal::cleanUp once the below is resolved
* @todo split me into test methods and providers per the below comment
* @todo Document individual tests
*
* We ignore code coverage for this test suite until they are rewritten
* to use data providers (bug 46561).
* @codeCoverageIgnore
*/
class CleanUpTest extends PHPUnit_Framework_TestCase {
	/**
	 * Assert that UtfNormal::cleanUp() turns $input into exactly $expected.
	 *
	 * Both sides are compared as hex dumps so that a failure message shows
	 * readable byte values instead of raw, possibly invalid, UTF-8.
	 *
	 * @param string $expected Bytes expected after clean-up
	 * @param string $input Bytes passed to UtfNormal::cleanUp()
	 * @param string $message Optional failure description
	 */
	private function assertCleanedTo( $expected, $input, $message = '' ) {
		$this->assertEquals(
			bin2hex( $expected ),
			bin2hex( UtfNormal::cleanUp( $input ) ),
			$message
		);
	}

	/**
	 * Plain printable ASCII must come back unchanged.
	 */
	public function testAscii() {
		$text = 'This is plain ASCII text.';
		$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * A NUL byte must be replaced by U+FFFD.
	 */
	public function testNull() {
		$this->assertCleanedTo( "a \xef\xbf\xbd null", "a \x00 null" );
	}

	/**
	 * A precomposed Latin character must come back unchanged.
	 */
	public function testLatin() {
		$text = "L'\xc3\xa9cole";
		$this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * "e" followed by a combining acute accent must be composed into U+00E9.
	 */
	public function testLatinNormal() {
		$text = "L'e\xcc\x81cole";
		$expect = "L'\xc3\xa9cole";
		$this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
	}

	/**
	 * Run every code point through cleanUp().
	 *
	 * This test is *very* expensive! The "X" prefix keeps PHPUnit from
	 * picking it up; rename it to run it by hand.
	 */
	public function XtestAllChars() {
		$rep = UTF8_REPLACEMENT;
		// Inclusive upper bound: the "allowed" condition below explicitly
		// admits $i == UNICODE_MAX, so that code point has to be visited too.
		for ( $i = 0x0; $i <= UNICODE_MAX; $i++ ) {
			$char = codepointToUtf8( $i );
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%04X", $i );
			if ( $i % 0x1000 == 0 ) {
				// Progress marker
				echo "U+$x\n";
			}
			if ( $i == 0x0009 ||
				$i == 0x000a ||
				$i == 0x000d ||
				( $i > 0x001f && $i < UNICODE_SURROGATE_FIRST ) ||
				( $i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
				( $i > 0xffff && $i <= UNICODE_MAX )
			) {
				if ( isset( UtfNormal::$utfCanonicalComp[$char] )
					|| isset( UtfNormal::$utfCanonicalDecomp[$char] )
				) {
					// The character takes part in canonical (de)composition,
					// so the expected result is whatever NFC makes of it.
					$comp = UtfNormal::NFC( $char );
					$this->assertEquals(
						bin2hex( $comp ),
						bin2hex( $clean ),
						"U+$x should be normalized to NFC" );
				} else {
					$this->assertEquals(
						bin2hex( $char ),
						bin2hex( $clean ),
						"U+$x should be intact" );
				}
			} else {
				$this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
			}
		}
	}

	/**
	 * Head/tail strings wrapped around the byte sequences under test: every
	 * combination of "nothing" and an ASCII "x" on either side.
	 *
	 * @return array[] Rows of ( head, tail )
	 */
	public static function provideAllBytes() {
		return array(
			array( '', '' ),
			array( 'x', '' ),
			array( '', 'x' ),
			array( 'x', 'x' ),
		);
	}

	/**
	 * Every single byte value: tab, LF, CR and printable ASCII survive,
	 * anything else becomes one replacement character.
	 *
	 * A failed assertion throws, which aborts the loop at the first
	 * offending byte.
	 *
	 * @dataProvider provideAllBytes
	 * @param string $head Bytes placed before the byte under test
	 * @param string $tail Bytes placed after the byte under test
	 */
	public function testBytes( $head, $tail ) {
		for ( $i = 0x0; $i < 256; $i++ ) {
			$char = $head . chr( $i ) . $tail;
			$clean = UtfNormal::cleanUp( $char );
			$x = sprintf( "%02X", $i );
			if ( $i == 0x0009 ||
				$i == 0x000a ||
				$i == 0x000d ||
				( $i > 0x001f && $i < 0x80 )
			) {
				$this->assertEquals(
					bin2hex( $char ),
					bin2hex( $clean ),
					"ASCII byte $x should be intact" );
			} else {
				$norm = $head . UTF8_REPLACEMENT . $tail;
				$this->assertEquals(
					bin2hex( $norm ),
					bin2hex( $clean ),
					"Forbidden byte $x should be rejected" );
			}
		}
	}

	/**
	 * Two-byte combinations of a head byte (0xc0 and up) and a second byte
	 * (0x80 and up), sampling every other value of each.
	 *
	 * @dataProvider provideAllBytes
	 * @param string $head Bytes placed before the pair under test
	 * @param string $tail Bytes placed after the pair under test
	 */
	public function testDoubleBytes( $head, $tail ) {
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				$char = $head . chr( $first ) . chr( $second ) . $tail;
				$clean = UtfNormal::cleanUp( $char );
				$x = sprintf( "%02X,%02X", $first, $second );
				if ( $first > 0xc1 &&
					$first < 0xe0 &&
					$second < 0xc0
				) {
					// Well-formed two-byte sequence
					$norm = UtfNormal::NFC( $char );
					$message = "Pair $x should be intact";
				} elseif ( $first > 0xfd || $second > 0xbf ) {
					# fe and ff are not legal head bytes -- expect two replacement chars
					$norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
					$message = "Forbidden pair $x should be rejected";
				} else {
					$norm = $head . UTF8_REPLACEMENT . $tail;
					$message = "Forbidden pair $x should be rejected";
				}
				$this->assertEquals( bin2hex( $norm ), bin2hex( $clean ), $message );
			}
		}
	}

	/**
	 * Three-byte combinations. Only a single value of the third byte (0x80)
	 * is exercised; the full range is left commented out below.
	 *
	 * @dataProvider provideAllBytes
	 * @param string $head Bytes placed before the triplet under test
	 * @param string $tail Bytes placed after the triplet under test
	 */
	public function testTripleBytes( $head, $tail ) {
		$rep = UTF8_REPLACEMENT;
		for ( $first = 0xc0; $first < 0x100; $first += 2 ) {
			for ( $second = 0x80; $second < 0x100; $second += 2 ) {
				#for( $third = 0x80; $third < 0x100; $third++ ) {
				for ( $third = 0x80; $third < 0x81; $third++ ) {
					$char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
					$clean = UtfNormal::cleanUp( $char );
					$x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
					if ( $first >= 0xe0 &&
						$first < 0xf0 &&
						$second < 0xc0 &&
						$third < 0xc0
					) {
						// Structurally a three-byte sequence
						if ( $first == 0xe0 && $second < 0xa0 ) {
							$expected = $head . $rep . $tail;
							$message = "Overlong triplet $x should be rejected";
						} elseif ( $first == 0xed &&
							( chr( $first ) . chr( $second ) . chr( $third ) ) >= UTF8_SURROGATE_FIRST
						) {
							$expected = $head . $rep . $tail;
							$message = "Surrogate triplet $x should be rejected";
						} else {
							$expected = UtfNormal::NFC( $char );
							$message = "Triplet $x should be intact";
						}
					} elseif ( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
						$expected = UtfNormal::NFC( $head . chr( $first ) .
							chr( $second ) ) . $rep . $tail;
						$message = "Valid 2-byte $x + broken tail";
					} elseif ( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
						$expected = $head . $rep .
							UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail );
						$message = "Broken head + valid 2-byte $x";
					} elseif ( ( $first > 0xfd || $second > 0xfd ) &&
						( ( $second > 0xbf && $third > 0xbf ) ||
							( $second < 0xc0 && $third < 0xc0 ) ||
							( $second > 0xfd ) ||
							( $third > 0xfd ) )
					) {
						# fe and ff are not legal head bytes -- expect three replacement chars
						$expected = $head . $rep . $rep . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					} elseif ( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
						$expected = $head . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					} else {
						$expected = $head . $rep . $rep . $tail;
						$message = "Forbidden triplet $x should be rejected";
					}
					$this->assertEquals( bin2hex( $expected ), bin2hex( $clean ), $message );
				}
			}
		}
	}

	public function testChunkRegression() {
		# Check for regression against a chunking bug
		$text = "\x46\x55\xb8" .
			"\xdc\x96" .
			"\xee" .
			"\xe7" .
			"\x44" .
			"\xaa" .
			"\x2f\x25";
		$expect = "\x46\x55\xef\xbf\xbd" .
			"\xdc\x96" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x44" .
			"\xef\xbf\xbd" .
			"\x2f\x25";
		$this->assertCleanedTo( $expect, $text );
	}

	public function testInterposeRegression() {
		$text = "\x4e\x30" .
			"\xb1" . # bad tail
			"\x3a" .
			"\x92" . # bad tail
			"\x62\x3a" .
			"\x84" . # bad tail
			"\x43" .
			"\xc6" . # bad head
			"\x3f" .
			"\x92" . # bad tail
			"\xad" . # bad tail
			"\x7d" .
			"\xd9\x95";
		$expect = "\x4e\x30" .
			"\xef\xbf\xbd" .
			"\x3a" .
			"\xef\xbf\xbd" .
			"\x62\x3a" .
			"\xef\xbf\xbd" .
			"\x43" .
			"\xef\xbf\xbd" .
			"\x3f" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x7d" .
			"\xd9\x95";
		$this->assertCleanedTo( $expect, $text );
	}

	public function testOverlongRegression() {
		$text = "\x67" .
			"\x1a" . # forbidden ascii
			"\xea" . # bad head
			"\xc1\xa6" . # overlong sequence
			"\xad" . # bad tail
			"\x1c" . # forbidden ascii
			"\xb0" . # bad tail
			"\x3c" .
			"\x9e"; # bad tail
		$expect = "\x67" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x3c" .
			"\xef\xbf\xbd";
		$this->assertCleanedTo( $expect, $text );
	}

	public function testSurrogateRegression() {
		$text = "\xed\xb4\x96" . # surrogate 0xDD16
			"\x83" . # bad tail
			"\xb4" . # bad tail
			"\xac"; # bad head
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd";
		$this->assertCleanedTo( $expect, $text );
	}

	public function testBomRegression() {
		$text = "\xef\xbf\xbe" . # U+FFFE, illegal char
			"\xb2" . # bad tail
			"\xef" . # bad head
			"\x59";
		$expect = "\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\xef\xbf\xbd" .
			"\x59";
		$this->assertCleanedTo( $expect, $text );
	}

	public function testForbiddenRegression() {
		$text = "\xef\xbf\xbf"; # U+FFFF, illegal char
		$expect = "\xef\xbf\xbd";
		$this->assertCleanedTo( $expect, $text );
	}

	public function testHangulRegression() {
		$text = "\xed\x9c\xaf" . # Hangul char
			"\xe1\x87\x81"; # followed by another final jamo
		$expect = $text; # Should *not* change.
		$this->assertCleanedTo( $expect, $text );
	}
}

View file

@ -1,123 +0,0 @@
<?php
/**
* Runs the UTF-8 decoder test at:
* http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
*
* Copyright © 2004 Brion Vibber <brion@pobox.com>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup UtfNormal
*/
/**
 * Feeds the lines of Markus Kuhn's UTF-8 decoder stress test through
 * UtfNormal::quickIsNFCVerify() and checks which lines get altered.
 */
class Utf8Test extends PHPUnit_Framework_TestCase {
	/**
	 * Read UTF-8-test.txt and turn its numbered test lines into data sets.
	 *
	 * @return array[] Rows of ( test id, raw line, expected '|' column, ids of exceptional tests )
	 * @throws RuntimeException If the data file cannot be opened or its
	 *  "Here come the tests:" header line cannot be found
	 */
	public static function provideLines() {
		global $IP;

		$path = "$IP/tests/phpunit/data/normal/UTF-8-test.txt";
		$in = fopen( $path, "rt" );
		if ( $in === false ) {
			throw new RuntimeException( "Could not open $path" );
		}

		// The header line ends in a '|' whose byte offset is the column at
		// which every well-decoded test line must also place its '|'.
		$columns = 0;
		while ( false !== ( $line = fgets( $in ) ) ) {
			if ( preg_match( '/^(Here come the tests:\s*)\|$/', $line ) ) {
				$columns = strpos( $line, '|' );
				break;
			}
		}

		if ( !$columns ) {
			fclose( $in );
			// Throw instead of exit(): exiting from a data provider would
			// terminate the entire PHPUnit run, not just this test.
			throw new RuntimeException(
				"Something seems to be wrong; couldn't extract line length.\n" .
				"Check that UTF-8-test.txt was downloaded correctly from\n" .
				"http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt\n"
			);
		}

		$ignore = array(
			# These two lines actually seem to be corrupt
			'2.1.1', '2.2.1' );

		$exceptions = array(
			# Tests that should mark invalid characters due to using long
			# sequences beyond what is now considered legal.
			'2.1.5', '2.1.6', '2.2.4', '2.2.5', '2.2.6', '2.3.5',
			# Literal 0xffff, which is illegal
			'2.2.3' );

		$longTests = array(
			# These tests span multiple lines
			'3.1.9', '3.2.1', '3.2.2', '3.2.3', '3.2.4', '3.2.5',
			'3.4' );

		$testCases = array();
		while ( false !== ( $line = fgets( $in ) ) ) {
			// Lines that start with a bare integer followed by whitespace
			// are skipped (presumably top-level section headings).
			if ( preg_match( '/^(\d+)\s+(.*?)\s*\|/', $line ) ) {
				continue;
			}

			$matches = array();
			if ( !preg_match( '/^(\d+\.\d+\.\d+)\s*/', $line, $matches ) ) {
				continue;
			}

			$test = $matches[1];
			if ( in_array( $test, $ignore, true ) ) {
				continue;
			}

			if ( in_array( $test, $longTests, true ) ) {
				// Discard the line after the heading, then collect every
				// line up to the terminator (whitespace followed by '|').
				// Also stop at end of file: fgets() returns false there,
				// which would never match the terminator pattern.
				fgets( $in );
				$line = fgets( $in );
				while ( $line !== false && !preg_match( '/^\s+\|/', $line ) ) {
					$testCases[] = array( $test, $line, $columns, $exceptions );
					$line = fgets( $in );
				}
			} else {
				$testCases[] = array( $test, $line, $columns, $exceptions );
			}
		}

		fclose( $in );

		return $testCases;
	}

	/**
	 * Check one line of the stress test.
	 *
	 * Lines from sections 1 and 2 are expected to pass through unchanged and
	 * lines from section 3 onwards are expected to be altered; membership in
	 * $exceptions inverts that expectation. In every case the '|' marker
	 * must end up at character offset $columns after verification.
	 *
	 * @dataProvider provideLines
	 * @covers UtfNormal::quickIsNFCVerify
	 * @param string $test Dotted test id, e.g. "2.1.5"
	 * @param string $line Raw line from the data file
	 * @param int $columns Expected offset of the '|' marker
	 * @param string[] $exceptions Test ids whose expectation is inverted
	 */
	public function testLine( $test, $line, $columns, $exceptions ) {
		$stripped = $line;
		UtfNormal::quickIsNFCVerify( $stripped );
		$same = ( $line === $stripped );

		$barPos = strpos( $stripped, '|' );
		$prefix = ( $barPos === false ) ? '' : substr( $stripped, 0, $barPos );
		$len = mb_strlen( $prefix, 'UTF-8' );
		if ( $len == 0 ) {
			$len = strlen( $prefix );
		}

		// The leading integer of the id is the section number.
		$isException = in_array( $test, $exceptions, true );
		$expectUnchanged = ( ( (int)$test >= 3 ) === $isException );

		$this->assertEquals(
			$expectUnchanged,
			$same,
			"Test $test: line should " . ( $expectUnchanged ? 'not ' : '' ) . 'be altered'
		);
		$this->assertEquals(
			$columns,
			$len,
			"Test $test: '|' marker should be at column $columns"
		);
	}
}

View file

@ -1,160 +0,0 @@
<?php
/**
* Implements the conformance test at:
* http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
*
* Copyright © 2004 Brion Vibber <brion@pobox.com>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @group UtfNormal
* @group large
*/
class UtfNormalTest extends PHPUnit_Framework_TestCase {
	/**
	 * Characters that appear as the source column of some line in
	 * NormalizationTest.txt, as array( char => true ). Filled in by
	 * provideNormalizationTest() and read by provideUnicodeData().
	 *
	 * @var array
	 */
	protected static $testedChars = array();

	/**
	 * Parse NormalizationTest.txt into test cases.
	 *
	 * Each case is array( $columns, $comment ) where $columns[1]..$columns[5]
	 * are the five fields of the data line decoded to UTF-8 ($columns[0] is
	 * an empty filler so the indexes match the 1-based numbering used by the
	 * assert* methods below).
	 *
	 * @return array[] A single data set holding the list of all cases
	 * @throws RuntimeException If the data file cannot be opened
	 */
	public static function provideNormalizationTest() {
		global $IP;

		$path = "$IP/tests/phpunit/data/normal/NormalizationTest.txt";
		$in = fopen( $path, "rt" );
		if ( $in === false ) {
			throw new RuntimeException( "Could not open $path" );
		}

		$testCases = array();
		while ( false !== ( $line = fgets( $in ) ) ) {
			// Split off the trailing comment. Limit to two parts so a '#'
			// inside the comment is kept, and tolerate lines with no '#'.
			$parts = explode( '#', $line, 2 );
			$data = trim( $parts[0] );
			$comment = isset( $parts[1] ) ? $parts[1] : '';

			if ( $data === '' || preg_match( '/@Part([\d])/', $data ) ) {
				continue;
			}

			// Only the first five ';'-separated fields carry data.
			$fields = array_slice( explode( ";", $data ), 0, 5 );
			if ( count( $fields ) < 5 ) {
				continue;
			}

			$columns = array_map( "hexSequenceToUtf8", $fields );
			array_unshift( $columns, '' );

			self::$testedChars[$columns[1]] = true;
			$testCases[] = array( $columns, $comment );
		}
		fclose( $in );

		return array( array( $testCases ) );
	}

	/**
	 * Assert two strings are byte-for-byte equal using strcmp(), so that a
	 * failure reports $desc rather than dumping the raw strings.
	 *
	 * @param string $a
	 * @param string $b
	 * @param string $desc Failure description
	 */
	public function assertStringEquals( $a, $b, $desc ) {
		$this->assertEquals( 0, strcmp( $a, $b ), $desc );
	}

	/**
	 * Apply UtfNormal::$method to columns 1-5 of a test case and compare the
	 * results: columns 1-3 must yield $c[$canonical], columns 4-5 must yield
	 * $c[$compat].
	 *
	 * @param string $method Name of a static UtfNormal method
	 * @param array $c Test case columns, indexed 1..5
	 * @param int $canonical Index of the expected result for columns 1-3
	 * @param int $compat Index of the expected result for columns 4-5
	 * @param string $desc Failure description
	 */
	private function assertTransform( $method, array $c, $canonical, $compat, $desc ) {
		for ( $i = 1; $i <= 5; $i++ ) {
			$expected = $c[ $i <= 3 ? $canonical : $compat ];
			$actual = call_user_func( array( 'UtfNormal', $method ), $c[$i] );
			$this->assertStringEquals( $expected, $actual, $desc );
		}
	}

	/**
	 * toNFC: columns 1-3 give column 2, columns 4-5 give column 4.
	 *
	 * @param array $c Test case columns, indexed 1..5
	 * @param string $desc Failure description
	 */
	public function assertNFC( $c, $desc ) {
		$this->assertTransform( 'toNFC', $c, 2, 4, $desc );
	}

	/**
	 * toNFD: columns 1-3 give column 3, columns 4-5 give column 5.
	 *
	 * @param array $c Test case columns, indexed 1..5
	 * @param string $desc Failure description
	 */
	public function assertNFD( $c, $desc ) {
		$this->assertTransform( 'toNFD', $c, 3, 5, $desc );
	}

	/**
	 * toNFKC: every column gives column 4.
	 *
	 * @param array $c Test case columns, indexed 1..5
	 * @param string $desc Failure description
	 */
	public function assertNFKC( $c, $desc ) {
		$this->assertTransform( 'toNFKC', $c, 4, 4, $desc );
	}

	/**
	 * toNFKD: every column gives column 5.
	 *
	 * @param array $c Test case columns, indexed 1..5
	 * @param string $desc Failure description
	 */
	public function assertNFKD( $c, $desc ) {
		$this->assertTransform( 'toNFKD', $c, 5, 5, $desc );
	}

	/**
	 * cleanUp: same expectations as toNFC.
	 *
	 * @param array $c Test case columns, indexed 1..5
	 * @param string $desc Failure description
	 */
	public function assertCleanUp( $c, $desc ) {
		$this->assertTransform( 'cleanUp', $c, 2, 4, $desc );
	}

	/**
	 * The data provider for this intentionally returns all the
	 * test cases as one since PHPUnit is too slow otherwise
	 *
	 * @dataProvider provideNormalizationTest
	 * @param array[] $testCases List of array( $columns, $comment )
	 */
	public function testNormals( $testCases ) {
		foreach ( $testCases as $case ) {
			$c = $case[0];
			$desc = $case[1];
			$this->assertNFC( $c, $desc );
			$this->assertNFD( $c, $desc );
			$this->assertNFKC( $c, $desc );
			$this->assertNFKD( $c, $desc );
			$this->assertCleanUp( $c, $desc );
		}
	}

	/**
	 * Build the list of characters from UnicodeData.txt that were NOT used
	 * as a source column in NormalizationTest.txt.
	 *
	 * Reads self::$testedChars, so it only filters anything if
	 * provideNormalizationTest() has already run.
	 * NOTE(review): this presumably relies on PHPUnit evaluating data
	 * providers in declaration order -- confirm.
	 *
	 * @return array[] A single data set holding the list of array( $char, $desc )
	 * @throws RuntimeException If the data file cannot be opened
	 */
	public static function provideUnicodeData() {
		global $IP;

		$path = "$IP/tests/phpunit/data/normal/UnicodeData.txt";
		$in = fopen( $path, "rt" );
		if ( $in === false ) {
			throw new RuntimeException( "Could not open $path" );
		}

		$testCases = array();
		while ( false !== ( $line = fgets( $in ) ) ) {
			$cols = explode( ';', $line );
			if ( count( $cols ) < 2 ) {
				// Blank or malformed line: no code point / name pair.
				continue;
			}
			$char = codepointToUtf8( hexdec( $cols[0] ) );
			$desc = $cols[0] . ": " . $cols[1];
			if ( $char < "\x20" ||
				( $char >= UTF8_SURROGATE_FIRST && $char <= UTF8_SURROGATE_LAST )
			) {
				# Can't check NULL with the ICU plugin, as null bytes fail in C land.
				# Skip other control characters, as we strip them for XML safety.
				# Surrogates are illegal on their own or in UTF-8, ignore.
				continue;
			}
			if ( empty( self::$testedChars[$char] ) ) {
				$testCases[] = array( $char, $desc );
			}
		}
		fclose( $in );

		return array( array( $testCases ) );
	}

	/**
	 * Characters with no entry in the normalization test must be left
	 * unchanged by every normalization form and by cleanUp().
	 *
	 * The data provider for this intentionally returns all the
	 * test cases as one since PHPUnit is too slow otherwise
	 *
	 * @depends testNormals
	 * @dataProvider provideUnicodeData
	 * @param array[] $testCases List of array( $char, $desc )
	 */
	public function testInvariant( $testCases ) {
		$methods = array( 'toNFC', 'toNFD', 'toNFKC', 'toNFKD', 'cleanUp' );
		foreach ( $testCases as $case ) {
			$char = $case[0];
			$desc = $case[1];
			foreach ( $methods as $method ) {
				$this->assertStringEquals(
					$char,
					call_user_func( array( 'UtfNormal', $method ), $char ),
					$desc
				);
			}
		}
	}
}