2013-02-25 20:09:38 +00:00
|
|
|
#!/usr/bin/env php
|
2010-07-20 21:17:22 +00:00
|
|
|
<?php
|
2010-08-15 07:47:23 +00:00
|
|
|
/**
|
2012-06-05 22:58:54 +00:00
|
|
|
* Other tests for the unicode normalization module.
|
|
|
|
|
*
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
* http://www.gnu.org/copyleft/gpl.html
|
2010-08-15 07:47:23 +00:00
|
|
|
*
|
|
|
|
|
* @file
|
|
|
|
|
* @ingroup UtfNormal
|
|
|
|
|
*/
|
2010-07-20 21:17:22 +00:00
|
|
|
|
2014-04-24 19:33:40 +00:00
|
|
|
// This script is a command-line tool: stop immediately under any other SAPI.
if ( !( PHP_SAPI == 'cli' ) ) {
	exit( "Run me from the command line please.\n" );
}
|
|
|
|
|
|
|
|
|
|
// Semicolons are used to separate the columns
define( 'SEPARATOR', ';' );

// Anything after this character is a comment
define( 'COMMENT', '#' );

// Test data, from http://unicode.org/Public/UNIDATA/NormalizationTest.txt
// The path is relative, so it is resolved against the current working directory.
$file = "NormalizationTest.txt";

// Read-only handle on the data file; this is false when the file cannot be opened.
$f = fopen( $file, "r" );
|
2010-07-20 21:17:22 +00:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* The following section will be used for testing different normalization methods.
|
|
|
|
|
* - Pure PHP
|
2014-04-24 19:33:40 +00:00
|
|
|
* ~ no assertion errors
|
|
|
|
|
* ~ 6.25 minutes
|
2010-07-20 21:17:22 +00:00
|
|
|
* - php_utfnormal.so or intl extension: both are wrappers around
|
2014-04-24 19:33:40 +00:00
|
|
|
* libicu so we list the version of libicu when making the
|
|
|
|
|
* comparison
|
2010-07-20 21:17:22 +00:00
|
|
|
* - libicu Ubuntu 3.8.1-3ubuntu1.1 php 5.2.6-3ubuntu4.5
|
2014-04-24 19:33:40 +00:00
|
|
|
* ~ 2200 assertion errors
|
|
|
|
|
* ~ 5 seconds
|
|
|
|
|
* ~ output: http://paste2.org/p/921566
|
2010-07-20 21:17:22 +00:00
|
|
|
* - libicu Ubuntu 4.2.1-3 php 5.3.2-1ubuntu4.2
|
2014-04-24 19:33:40 +00:00
|
|
|
* ~ 1384 assertion errors
|
|
|
|
|
* ~ 15 seconds
|
|
|
|
|
* ~ output: http://paste2.org/p/921435
|
2010-07-20 21:17:22 +00:00
|
|
|
* - libicu Debian 4.4.1-5 php 5.3.2-1ubuntu4.2
|
2014-04-24 19:33:40 +00:00
|
|
|
* ~ no assertion errors
|
|
|
|
|
* ~ 13 seconds
|
2010-07-20 21:17:22 +00:00
|
|
|
* - Tests comparing pure PHP output with libicu output were added
|
2014-04-24 19:33:40 +00:00
|
|
|
* later and slow down the runtime.
|
2010-07-20 21:17:22 +00:00
|
|
|
*/
|
|
|
|
|
|
2013-05-07 23:00:15 +00:00
|
|
|
require_once './UtfNormal.php';
|
2014-04-24 19:33:40 +00:00
|
|
|
/**
 * Normalize a string with UtfNormal::toNFC().
 *
 * @param string $c Text to normalize
 * @return string Whatever UtfNormal::toNFC() returns for $c
 */
function normalize_form_c( $c ) {
	$normalized = UtfNormal::toNFC( $c );

	return $normalized;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a string with UtfNormal::toNFD().
 *
 * @param string $c Text to normalize
 * @return string Whatever UtfNormal::toNFD() returns for $c
 */
function normalize_form_d( $c ) {
	$normalized = UtfNormal::toNFD( $c );

	return $normalized;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a string with UtfNormal::toNFKC().
 *
 * @param string $c Text to normalize
 * @return string Whatever UtfNormal::toNFKC() returns for $c
 */
function normalize_form_kc( $c ) {
	$normalized = UtfNormal::toNFKC( $c );

	return $normalized;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a string with UtfNormal::toNFKD().
 *
 * @param string $c Text to normalize
 * @return string Whatever UtfNormal::toNFKD() returns for $c
 */
function normalize_form_kd( $c ) {
	$normalized = UtfNormal::toNFKD( $c );

	return $normalized;
}
|
2010-07-20 21:24:07 +00:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* This set of functions is only useful if you've added a param to the
|
|
|
|
|
* following functions to force pure PHP usage. I decided not to
|
|
|
|
|
* commit that code since it might produce a slowdown in the UTF
|
|
|
|
|
* normalization code just for the sake of these tests. -- hexmode
|
2012-02-09 21:35:05 +00:00
|
|
|
* @return string
|
2010-07-20 21:24:07 +00:00
|
|
|
*/
|
2014-04-24 19:33:40 +00:00
|
|
|
function normalize_form_c_php( $c ) {
	// The extra "php" argument is handed to UtfNormal::toNFC() as a second
	// parameter; what it does depends on the UtfNormal implementation in use.
	$normalized = UtfNormal::toNFC( $c, "php" );

	return $normalized;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a string with UtfNormal::toNFD(), passing "php" as a second
 * argument; what that argument does depends on the UtfNormal implementation.
 *
 * @param string $c Text to normalize
 * @return string Whatever UtfNormal::toNFD() returns for $c
 */
function normalize_form_d_php( $c ) {
	$normalized = UtfNormal::toNFD( $c, "php" );

	return $normalized;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a string with UtfNormal::toNFKC(), passing "php" as a second
 * argument; what that argument does depends on the UtfNormal implementation.
 *
 * @param string $c Text to normalize
 * @return string Whatever UtfNormal::toNFKC() returns for $c
 */
function normalize_form_kc_php( $c ) {
	$normalized = UtfNormal::toNFKC( $c, "php" );

	return $normalized;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Normalize a string with UtfNormal::toNFKD(), passing "php" as a second
 * argument; what that argument does depends on the UtfNormal implementation.
 *
 * @param string $c Text to normalize
 * @return string Whatever UtfNormal::toNFKD() returns for $c
 */
function normalize_form_kd_php( $c ) {
	$normalized = UtfNormal::toNFKD( $c, "php" );

	return $normalized;
}
|
2010-07-20 21:17:22 +00:00
|
|
|
|
2014-04-24 19:33:40 +00:00
|
|
|
// Configure assert(): enabled, no PHP warning on failure, and failures
// routed to my_assert() so they are printed in this script's own format.
assert_options( ASSERT_ACTIVE, 1 );
assert_options( ASSERT_WARNING, 0 );

// ASSERT_QUIET_EVAL was removed in PHP 8.0. Referencing the constant there
// is a fatal "Undefined constant" error, so only set it where it exists.
if ( defined( 'ASSERT_QUIET_EVAL' ) ) {
	assert_options( ASSERT_QUIET_EVAL, 1 );
}

assert_options( ASSERT_CALLBACK, 'my_assert' );
|
2010-07-20 21:17:22 +00:00
|
|
|
|
|
|
|
|
/**
 * Failure reporter: prints the failed expression together with the current
 * row number and the comment column of the row being tested.
 *
 * Reads the globals $lineNo and $col; $col[5] is the value that gets printed
 * in parentheses.
 *
 * @param string $file Unused
 * @param int $line Unused
 * @param string $code Text of the expression that failed
 */
function my_assert( $file, $line, $code ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $col, $lineNo;
	// @codingStandardsIgnoreEnd

	$rowComment = $col[5];

	echo "Assertion that '{$code}' failed on line {$lineNo} ({$rowComment})\n";
}
|
|
|
|
|
|
|
|
|
|
// Number of test rows processed so far, and number of rows read so far.
// $lineNo and $col are globals because my_assert() reads them when reporting.
$count = 0;
$lineNo = 0;

if ( $f !== false ) {
	// Normalization form => array( default normalizer, "php" normalizer ).
	// The order of the forms is the order in which mismatches are reported.
	$normalizers = array(
		'NFC' => array( 'normalize_form_c', 'normalize_form_c_php' ),
		'NFD' => array( 'normalize_form_d', 'normalize_form_d_php' ),
		'NFKD' => array( 'normalize_form_kd', 'normalize_form_kd_php' ),
		'NFKC' => array( 'normalize_form_kc', 'normalize_form_kc_php' ),
	);

	// Conformance invariants, as
	//   form => array( index of the expected column => indexes of the input columns )
	// (indexes are zero-based, the "cN" names below are one-based):
	//   c2 == NFC(c1) == NFC(c2) == NFC(c3)
	//   c4 == NFC(c4) == NFC(c5)
	//   c3 == NFD(c1) == NFD(c2) == NFD(c3)
	//   c5 == NFD(c4) == NFD(c5)
	//   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	//   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	$invariants = array(
		'NFC' => array( 1 => array( 0, 1, 2 ), 3 => array( 3, 4 ) ),
		'NFD' => array( 2 => array( 0, 1, 2 ), 4 => array( 3, 4 ) ),
		'NFKC' => array( 3 => array( 0, 1, 2, 3, 4 ) ),
		'NFKD' => array( 4 => array( 0, 1, 2, 3, 4 ) ),
	);

	while ( ( $col = getRow( $f ) ) !== false ) {
		$lineNo++;

		// A test row is five data columns plus the trailing comment column;
		// anything else (comments, part headers, blank lines) is skipped.
		if ( count( $col ) !== 6 ) {
			continue;
		}

		$count++;
		if ( $count % 100 === 0 ) {
			echo "Count: $count\n";
		}

		// The comparisons below are explicit instead of assert( '...' ):
		// string assertions are not evaluated on PHP 8 (the string is just
		// truthy), and assert() can be disabled altogether by configuration,
		// either of which would make every check pass silently. Failures are
		// still reported through my_assert() with the same expression text.

		# verify that the pure PHP version is correct
		$normalized = array();
		foreach ( $normalizers as $form => $functions ) {
			list( $defaultNormalizer, $phpNormalizer ) = $functions;
			for ( $i = 0; $i < 5; $i++ ) {
				$normalized[$form][$i] = call_user_func( $defaultNormalizer, $col[$i] );
				$fromPhp = call_user_func( $phpNormalizer, $col[$i] );
				if ( $normalized[$form][$i] !== $fromPhp ) {
					$name = '$' . $form . 'c' . ( $i + 1 );
					my_assert( __FILE__, __LINE__, $name . ' === ' . $name . 'p' );
				}
			}
		}

		# verify the conformance invariants listed above
		foreach ( $invariants as $form => $expectations ) {
			foreach ( $expectations as $expectedIndex => $inputIndexes ) {
				foreach ( $inputIndexes as $i ) {
					if ( $col[$expectedIndex] !== $normalized[$form][$i] ) {
						my_assert(
							__FILE__,
							__LINE__,
							'$col[' . $expectedIndex . '] === $' . $form . 'c' . ( $i + 1 )
						);
					}
				}
			}
		}
	}

	// Release the data file handle once every row has been read.
	fclose( $f );
}

echo "done.\n";
|
|
|
|
|
|
|
|
|
|
// Compare against http://en.wikipedia.org/wiki/UTF-8#Description
|
2014-04-24 19:33:40 +00:00
|
|
|
/**
 * Encode one code point number as a UTF-8 byte sequence.
 *
 * Values up to 0x7F give one byte, up to 0x7FF two bytes, up to 0xFFFF three
 * bytes and up to 0x10FFFF four bytes. No other validation is performed.
 *
 * @param int $c Code point number
 * @return string|bool The encoded bytes, or false if $c is above 0x10FFFF
 */
function unichr( $c ) {
	if ( $c <= 0x7F ) {
		return chr( $c );
	}

	if ( $c <= 0x7FF ) {
		$lead = 0xC0 | ( $c >> 6 );
		$last = 0x80 | ( $c & 0x3F );

		return chr( $lead ) . chr( $last );
	}

	if ( $c <= 0xFFFF ) {
		$lead = 0xE0 | ( $c >> 12 );
		$middle = 0x80 | ( ( $c >> 6 ) & 0x3F );
		$last = 0x80 | ( $c & 0x3F );

		return chr( $lead ) . chr( $middle ) . chr( $last );
	}

	if ( $c <= 0x10FFFF ) {
		$lead = 0xF0 | ( $c >> 18 );
		$second = 0x80 | ( ( $c >> 12 ) & 0x3F );
		$third = 0x80 | ( ( $c >> 6 ) & 0x3F );
		$last = 0x80 | ( $c & 0x3F );

		return chr( $lead ) . chr( $second ) . chr( $third ) . chr( $last );
	}

	// Beyond the last code point that can be encoded here
	return false;
}
|
|
|
|
|
|
2014-04-24 19:33:40 +00:00
|
|
|
/**
 * Turn a space-separated list of hexadecimal code point numbers into the
 * concatenation of their unichr() encodings.
 *
 * @param string $c Hexadecimal numbers separated by single spaces
 * @return string Concatenated result of unichr() for every number
 */
function unistr( $c ) {
	$encoded = "";

	foreach ( explode( " ", $c ) as $hex ) {
		// A false result from unichr() contributes nothing to the string
		$encoded .= unichr( hexdec( $hex ) );
	}

	return $encoded;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read the next line from the data file and split it into columns.
 *
 * - When fgets() returns false, false is returned.
 * - A line that starts with the COMMENT character is returned unparsed as a
 *   one-element array.
 * - Any other line is cut at the first COMMENT character (if any), split on
 *   SEPARATOR, and every non-blank field is converted with unistr(). The
 *   comment text is appended as the last element ("" when there is none).
 *
 * @param resource $f Open file handle to read from
 * @return array|bool Columns of the line, or false when nothing could be read
 */
function getRow( $f ) {
	$line = fgets( $f );
	if ( $line === false ) {
		return false;
	}

	$line = rtrim( $line );
	$commentAt = strpos( $line, COMMENT );

	// The whole line is a comment
	if ( $commentAt === 0 ) {
		return array( $line );
	}

	$comment = "";
	$data = $line;

	if ( $commentAt ) {
		// When the line contains a ")" at a non-zero offset, keep only the
		// text starting two characters after the first one; otherwise keep
		// everything from the comment character onwards.
		$parenAt = strpos( $line, ")" );
		$comment = $parenAt
			? substr( $line, $parenAt + 2 )
			: substr( $line, $commentAt );

		$data = substr( $line, 0, $commentAt );
	}

	$columns = array();
	foreach ( explode( SEPARATOR, $data ) as $field ) {
		if ( trim( $field ) === "" ) {
			continue;
		}
		$columns[] = unistr( $field );
	}

	// The comment always goes last, after the data columns
	$columns[] = $comment;

	return $columns;
}
|