2004-08-29 10:30:23 +00:00
|
|
|
<?php
|
2010-08-15 07:47:23 +00:00
|
|
|
/**
|
2011-03-27 12:21:45 +00:00
|
|
|
* Unicode normalization routines
|
|
|
|
|
*
|
|
|
|
|
* Copyright © 2004 Brion Vibber <brion@pobox.com>
|
|
|
|
|
* http://www.mediawiki.org/
|
|
|
|
|
*
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
|
|
|
*
|
|
|
|
|
* @file
|
|
|
|
|
* @ingroup UtfNormal
|
|
|
|
|
*/
|
2004-08-29 10:30:23 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
|
|
|
|
|
* @defgroup UtfNormal UtfNormal
|
|
|
|
|
*/
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed all @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitly declared for file descriptions, otherwise doxygen will think that the comment documents the first class, variable, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
# Feature flags, evaluated once when this file is loaded, recording which
# native normalization functions are available in this PHP build.

# True when the ICU wrapper function utf8_normalize() exists.
define( 'NORMALIZE_ICU', function_exists( 'utf8_normalize' ) );

# True when normalizer_normalize() exists.
define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
|
2004-10-07 05:59:10 +00:00
|
|
|
|
2004-09-03 23:00:01 +00:00
|
|
|
/**
|
2011-03-27 12:21:45 +00:00
|
|
|
* Unicode normalization routines for working with UTF-8 strings.
|
|
|
|
|
* Currently assumes that input strings are valid UTF-8!
|
|
|
|
|
*
|
|
|
|
|
* Not as fast as I'd like, but should be usable for most purposes.
|
|
|
|
|
* UtfNormal::toNFC() will bail early if given ASCII text or text
|
|
|
|
|
* it can quickly determine is already normalized.
|
|
|
|
|
*
|
|
|
|
|
* All functions can be called static.
|
|
|
|
|
*
|
|
|
|
|
* See description of forms at http://www.unicode.org/reports/tr15/
|
|
|
|
|
*
|
|
|
|
|
* @ingroup UtfNormal
|
|
|
|
|
*/
|
2004-08-29 10:30:23 +00:00
|
|
|
class UtfNormal {
|
2011-08-04 21:54:45 +00:00
|
|
|
/**
 * Normalization mode identifiers for the ICU wrapper.
 *
 * These are handed to utf8_normalize() as its second argument by the
 * cleanUp() and toNF*() methods of this class.
 */
const UNORM_NONE = 1;
const UNORM_NFD = 2;
const UNORM_NFKD = 3;
const UNORM_NFC = 4;
const UNORM_NFKC = 5;
const UNORM_FCD = 6;

/** Mode used when none is specified: canonical composition (NFC). */
const UNORM_DEFAULT = self::UNORM_NFC;
|
|
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
# Lookup tables used by the pure-PHP code paths. All of them start out
# unset; loadData() includes UtfNormalData.inc when $utfCombiningClass is
# not yet set (presumably that file fills these in -- it is not part of
# this file).

# UTF-8 character => combining class (used as a sort key).
static $utfCombiningClass = null;

# Pair of UTF-8 characters (start char . following char) => composed character.
static $utfCanonicalComp = null;

# UTF-8 character => canonical decomposition, as a UTF-8 string.
static $utfCanonicalDecomp = null;

# UTF-8 character => compatibility decomposition.
# Loaded on demand by NFKD() only if it is needed.
static $utfCompatibilityDecomp = null;

# Set of UTF-8 characters whose presence means a string cannot be
# quickly confirmed as NFC (the "NO or MAYBE" characters).
static $utfCheckNFC = null;
|
2011-03-24 20:51:38 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Sanitize a UTF-8 string and return it in normal form C (canonical
 * composition).
 *
 * One of three strategies is chosen from the load-time feature flags:
 *  - NORMALIZE_ICU: pre-process with replaceForNativeNormalize(), then
 *    normalize with utf8_normalize().
 *  - NORMALIZE_INTL: pre-process the same way, then normalize with
 *    normalizer_normalize(); if that rejects the input, repair it with
 *    quickIsNFCVerify() and try again where necessary.
 *  - otherwise: pure PHP, via quickIsNFCVerify() and NFC().
 *
 * @param $string String: a UTF-8 string, possibly containing invalid sequences
 * @return string a cleaned-up UTF-8 string in normal form C
 */
static function cleanUp( $string ) {
	if ( NORMALIZE_ICU ) {
		$string = self::replaceForNativeNormalize( $string );

		# The UnicodeString constructor fails when the input ends in a
		# head byte, so append a junk character and strip it afterwards.
		$normalized = utf8_normalize( $string . "\x01", self::UNORM_NFC );
		return rtrim( $normalized, "\x01" );
	}

	if ( NORMALIZE_INTL ) {
		$string = self::replaceForNativeNormalize( $string );
		$norm = normalizer_normalize( $string, Normalizer::FORM_C );
		if ( $norm !== null && $norm !== false ) {
			return $norm;
		}

		# normalizer_normalize() reports an invalid UTF-8 string with
		# false or null (depending on which documentation you read).
		# quickIsNFCVerify() repairs the invalid sequences in place.
		if ( self::quickIsNFCVerify( $string ) ) {
			# The repaired string is already normalized.
			return $string;
		}

		# The string is valid now, but not yet normalized.
		return normalizer_normalize( $string, Normalizer::FORM_C );
	}

	# Pure-PHP path. As a side effect quickIsNFCVerify() cleans up any
	# UTF-8 errors in $string.
	if ( self::quickIsNFCVerify( $string ) ) {
		return $string;
	}

	return self::NFC( $string );
}
|
|
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Convert a UTF-8 string to normal form C (canonical composition).
 *
 * A native normalizer is used when one is available (intl first, then
 * the ICU wrapper). The pure-PHP fallback returns the input untouched
 * when quickIsNFC() can confirm it is already normalized.
 *
 * @param $string String: a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form C
 */
static function toNFC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_C );
	}

	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFC );
	}

	return self::quickIsNFC( $string )
		? $string
		: self::NFC( $string );
}
|
|
|
|
|
|
2004-09-04 09:35:01 +00:00
|
|
|
/**
 * Convert a UTF-8 string to normal form D (canonical decomposition).
 *
 * A native normalizer is used when one is available (intl first, then
 * the ICU wrapper). The pure-PHP fallback returns pure ASCII input
 * unchanged.
 *
 * @param $string String: a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form D
 */
static function toNFD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_D );
	}

	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFD );
	}

	# Only strings containing at least one non-ASCII byte need work.
	return preg_match( '/[\x80-\xff]/', $string )
		? self::NFD( $string )
		: $string;
}
|
2006-01-07 13:31:29 +00:00
|
|
|
|
2004-09-04 09:35:01 +00:00
|
|
|
/**
 * Convert a UTF-8 string to normal form KC (compatibility composition).
 * This may lose information irreversibly, so use it judiciously.
 *
 * A native normalizer is used when one is available (intl first, then
 * the ICU wrapper). The pure-PHP fallback returns pure ASCII input
 * unchanged.
 *
 * @param $string String: a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KC
 */
static function toNFKC( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KC );
	}

	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFKC );
	}

	# Only strings containing at least one non-ASCII byte need work.
	return preg_match( '/[\x80-\xff]/', $string )
		? self::NFKC( $string )
		: $string;
}
|
2006-01-07 13:31:29 +00:00
|
|
|
|
2004-09-04 09:35:01 +00:00
|
|
|
/**
 * Convert a UTF-8 string to normal form KD (compatibility decomposition).
 * This may lose information irreversibly, so use it judiciously.
 *
 * A native normalizer is used when one is available (intl first, then
 * the ICU wrapper). The pure-PHP fallback returns pure ASCII input
 * unchanged.
 *
 * @param $string String: a valid UTF-8 string. Input is not validated.
 * @return string a UTF-8 string in normal form KD
 */
static function toNFKD( $string ) {
	if ( NORMALIZE_INTL ) {
		return normalizer_normalize( $string, Normalizer::FORM_KD );
	}

	if ( NORMALIZE_ICU ) {
		return utf8_normalize( $string, self::UNORM_NFKD );
	}

	# Only strings containing at least one non-ASCII byte need work.
	return preg_match( '/[\x80-\xff]/', $string )
		? self::NFKD( $string )
		: $string;
}
|
2006-01-07 13:31:29 +00:00
|
|
|
|
2004-10-09 08:08:26 +00:00
|
|
|
/**
 * Make sure the basic normalization tables are available.
 *
 * Includes UtfNormalData.inc from this file's own directory unless
 * self::$utfCombiningClass is already set.
 *
 * @private
 */
static function loadData() {
	if ( isset( self::$utfCombiningClass ) ) {
		# Tables are already in place; nothing to do.
		return;
	}

	require_once( dirname(__FILE__) . '/UtfNormalData.inc' );
}
|
2006-01-07 13:31:29 +00:00
|
|
|
|
2004-09-04 09:35:01 +00:00
|
|
|
/**
 * Quick check for normal form C.
 *
 * Returns true only if the string is _definitely_ in NFC; returns false
 * when it is not, or when that cannot be decided cheaply.
 *
 * @param $string String: a valid UTF-8 string. Input is not validated.
 * @return bool
 */
static function quickIsNFC( $string ) {
	# Pure ASCII is always valid NFC, so let it straight through.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	self::loadData();

	$length = strlen( $string );
	$pos = 0;
	while ( $pos < $length ) {
		$char = $string[$pos];
		$lead = ord( $char );

		if ( $lead < 0x80 ) {
			# ASCII byte; nothing to look up.
			$pos++;
			continue;
		}

		# Work out how many bytes the sequence occupies from its lead
		# byte. Bytes 0x80-0xbf are looked up on their own.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		if ( isset( self::$utfCheckNFC[$char] ) ) {
			# NO or MAYBE: bail out so the caller does the slow check.
			return false;
		}

		if ( isset( self::$utfCombiningClass[$char] ) ) {
			# A combining character; sorting may be needed, at least.
			return false;
		}
	}

	return true;
}
|
2004-09-03 05:39:30 +00:00
|
|
|
|
2004-09-04 09:35:01 +00:00
|
|
|
/**
 * Validate a UTF-8 string in place and report whether it is certainly
 * already in normal form C.
 *
 * Control characters that are not allowed in XML, and byte sequences
 * that are not legal UTF-8 (truncated or overlong sequences, surrogates,
 * U+FFFE/U+FFFF, sequences beyond UTF8_MAX, stray tail bytes), are
 * replaced with UTF8_REPLACEMENT directly in the caller's variable.
 *
 * Returns true if no character that might need normalizing was seen;
 * returns false if the string is not normalized, or if that is uncertain.
 *
 * @param $string String: a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 * @return bool
 */
static function quickIsNFCVerify( &$string ) {
	# Screen out some characters that, for example, XML will not accept.
	$string = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $string );

	# Plain ASCII is always valid NFC. Returning here also avoids the
	# cost of building the lookup tables below.
	if ( !preg_match( '/[\x80-\xff]/', $string ) ) {
		return true;
	}

	static $suspectHeads = null, $tailCount = null, $needsSlowPath = null;
	if ( !isset( $suspectHeads ) ) {
		# First non-ASCII call: load and build the lookup tables.
		self::loadData();

		# Any character listed in either table means the string cannot
		# be declared normalized by this quick check.
		$needsSlowPath = array_merge( self::$utfCheckNFC, self::$utfCombiningClass );

		# Head bytes whose sequences need the extra validity checks.
		$suspectHeads = array_flip( array_map( 'chr',
			array( 0xc0, 0xc1, 0xe0, 0xed, 0xef,
				0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
				0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ) ) );

		# Number of tail bytes that follows each possible byte value
		# when that byte is used as a sequence head.
		$tailCount = array();
		for ( $byteValue = 0; $byteValue < 256; $byteValue++ ) {
			if ( $byteValue < 0xc0 ) {
				$tails = 0;
			} elseif ( $byteValue < 0xe0 ) {
				$tails = 1;
			} elseif ( $byteValue < 0xf0 ) {
				$tails = 2;
			} elseif ( $byteValue < 0xf8 ) {
				$tails = 3;
			} elseif ( $byteValue < 0xfc ) {
				$tails = 4;
			} elseif ( $byteValue < 0xfe ) {
				$tails = 5;
			} else {
				$tails = 0;
			}
			$tailCount[chr( $byteValue )] = $tails;
		}
	}

	# Split the text into pure-ASCII runs and non-ASCII runs, since the
	# ASCII runs can be skipped wholesale. Punctuation does not end a
	# non-ASCII run; chopping there would only waste effort.
	$matches = array();
	preg_match_all(
		'/([\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*)/',
		$string, $matches );

	$looksNormal = true;
	$offset = 0;       # byte offset of the current run within $string
	$fixes = array();  # list of array( replacement, start, length )
	foreach ( $matches[1] as $segment ) {
		$segmentLen = strlen( $segment );

		if ( $segment[0] < "\x80" ) {
			# ASCII run: valid UTF-8 and in NFC by definition.
			$offset += $segmentLen;
			continue;
		}

		# Non-ASCII run: walk it byte by byte, validating each UTF-8
		# sequence and noting whether any might not be normalized.
		# $left counts down the bytes still to be read while $pos
		# counts up; this inner loop is deliberately kept tight.
		$head = '';
		$left = $segmentLen + 1;

		for ( $pos = -1; --$left; ) {
			$byte = $segment[++$pos];
			$tails = $tailCount[$byte];

			if ( $tails ) {
				# A UTF-8 head byte: gather its tail bytes.
				$seq = $head = $byte;
				do {
					if ( --$left && ( $byte = $segment[++$pos] ) >= "\x80" && $byte < "\xc0" ) {
						# Legal tail byte.
						$seq .= $byte;
					} elseif ( $left == 0 ) {
						# The run ended mid-sequence. Replace what
						# was collected and stop scanning this run.
						$fixes[] = array( UTF8_REPLACEMENT,
							$offset + $pos + 1 - strlen( $seq ),
							strlen( $seq ) );
						break 2;
					} else {
						# Illegal tail byte: give up on this sequence.
						$fixes[] = array( UTF8_REPLACEMENT,
							$offset + $pos - strlen( $seq ),
							strlen( $seq ) );
						# Step back so the offending byte is looked at
						# again; it may be ASCII or a new head byte.
						--$pos;
						++$left;
						continue 2;
					}
				} while ( --$tails );

				if ( isset( $suspectHeads[$head] ) ) {
					# Well-formed, but may still be a forbidden
					# character or an illegal encoding.
					if ( $head == "\xed" ) {
						# 0xed is common in Korean text and borders
						# the surrogate range, so it gets its own
						# cheap test. Surrogates belong to UTF-16
						# only and are not allowed in UTF-8.
						$invalid = ( $seq >= UTF8_SURROGATE_FIRST );
					} else {
						# Slower, rarer checks.
						$n = ord( $head );
						$invalid =
							# Overlong sequences are well-formed but use
							# more bytes than the character needs. They
							# can slip an ASCII character past a naive
							# string comparison, which is a security
							# hole if blacklist checks are in use.
							( $n < 0xc2 && $seq <= UTF8_OVERLONG_A )
							|| ( $n == 0xe0 && $seq <= UTF8_OVERLONG_B )
							|| ( $n == 0xf0 && $seq <= UTF8_OVERLONG_C )

							# U+FFFE and U+FFFF are explicitly forbidden
							# in Unicode. Note the grouping: the head
							# byte test applies to UTF8_FFFE only.
							|| ( ( $n == 0xef && $seq == UTF8_FFFE )
								|| $seq == UTF8_FFFF )

							# Unicode is limited to 21 bits; anything
							# longer is not allowed.
							|| ( $n >= 0xf0 && $seq > UTF8_MAX );
					}

					if ( $invalid ) {
						$fixes[] = array( UTF8_REPLACEMENT,
							$offset + $pos + 1 - strlen( $seq ),
							strlen( $seq ) );
						$head = '';
						continue;
					}
				}

				if ( isset( $needsSlowPath[$seq] ) ) {
					# NO or MAYBE: the caller will have to take the
					# string apart and rebuild it, which is slow.
					$looksNormal = false;
				}

				# The sequence is legal.
				$head = '';
			} elseif ( $byte < "\x80" ) {
				# ASCII byte.
				$head = '';
			} elseif ( $byte < "\xc0" ) {
				# Tail byte with no sequence to belong to.
				if ( $head == '' ) {
					# It appeared out of nowhere.
					$fixes[] = array( UTF8_REPLACEMENT, $offset + $pos, 1 );
				} else {
					# It continues a sequence that was already
					# replaced, so drop it without adding another
					# replacement character.
					$fixes[] = array( '', $offset + $pos, 1 );
				}
			} else {
				# Anything else that cannot start a sequence.
				$fixes[] = array( UTF8_REPLACEMENT, $offset + $pos, 1 );
				$head = '';
			}
		}
		$offset += $segmentLen;
	}

	if ( count( $fixes ) ) {
		# Illegal sequences were found: rebuild the string from the
		# untouched spans and the recorded replacements.
		$out = '';
		$last = 0;
		foreach ( $fixes as $fix ) {
			list( $replacement, $start, $length ) = $fix;
			if ( $last < $start ) {
				$out .= substr( $string, $last, $start - $last );
			}
			$out .= $replacement;
			$last = $start + $length;
		}
		if ( $last < strlen( $string ) ) {
			$out .= substr( $string, $last );
		}
		$string = $out;
	}

	return $looksNormal;
}
|
|
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
# These take a string and run the normalization on them, without
|
|
|
|
|
# checking for validity or any optimization etc. Input must be
|
|
|
|
|
# VALID UTF-8!
|
2004-09-04 09:35:01 +00:00
|
|
|
/**
 * Normalize to form C by decomposing canonically and then recomposing.
 * No validity check and no fast path; the input must be valid UTF-8.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFC( $string ) {
	$decomposed = self::NFD( $string );
	return self::fastCompose( $decomposed );
}
|
2011-03-24 20:51:38 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Normalize to form D: apply the canonical decomposition map, then sort
 * combining characters into canonical order.
 * No validity check and no fast path; the input must be valid UTF-8.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFD( $string ) {
	# The canonical decomposition map must be loaded before it is passed on.
	self::loadData();

	$decomposed = self::fastDecompose( $string, self::$utfCanonicalDecomp );
	return self::fastCombiningSort( $decomposed );
}
|
2011-03-24 20:51:38 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Normalize to form KC by applying the compatibility decomposition and
 * then recomposing.
 * No validity check and no fast path; the input must be valid UTF-8.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFKC( $string ) {
	$decomposed = self::NFKD( $string );
	return self::fastCompose( $decomposed );
}
|
2011-03-24 20:51:38 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Normalize to form KD: apply the compatibility decomposition map, then
 * sort combining characters into canonical order.
 * No validity check and no fast path; the input must be valid UTF-8.
 *
 * The compatibility map is loaded on first use. The data file is
 * addressed through this file's own directory, the same way loadData()
 * addresses UtfNormalData.inc, so the include does not depend on
 * include_path or on the current working directory.
 *
 * @param $string string
 * @return string
 * @private
 */
static function NFKD( $string ) {
	if ( !isset( self::$utfCompatibilityDecomp ) ) {
		require_once( dirname( __FILE__ ) . '/UtfNormalDataK.inc' );
	}

	$decomposed = self::fastDecompose( $string, self::$utfCompatibilityDecomp );
	return self::fastCombiningSort( $decomposed );
}
|
2011-03-24 20:51:38 +00:00
|
|
|
|
|
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Decompose a UTF-8 string into form D or KD, depending on which
 * decomposition map is supplied. Characters found in the map are
 * replaced by their mapping; Hangul syllables are decomposed into jamo
 * arithmetically; everything else is copied through.
 *
 * The input is assumed to be *valid* UTF-8. Invalid input will break.
 *
 * @private
 * @param $string String: valid UTF-8 string
 * @param $map Array: hash of expanded decomposition map
 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 */
static function fastDecompose( $string, $map ) {
	self::loadData();

	$length = strlen( $string );
	$result = '';
	$pos = 0;
	while ( $pos < $length ) {
		$char = $string[$pos];
		$lead = ord( $char );

		if ( $lead < 0x80 ) {
			# ASCII characters never decompose.
			$result .= $char;
			$pos++;
			continue;
		}

		# Take the whole sequence, sized from its lead byte. Bytes
		# 0x80-0xbf are handled as single bytes.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		if ( isset( $map[$char] ) ) {
			$result .= $map[$char];
			continue;
		}

		if ( !( $char >= UTF8_HANGUL_FIRST && $char <= UTF8_HANGUL_LAST ) ) {
			# No mapping and not a Hangul syllable: copy unchanged.
			$result .= $char;
			continue;
		}

		# Decompose a Hangul syllable into jamo. This is hardcoded for
		# a three-byte UTF-8 sequence. A lookup table would be slightly
		# faster but would cost a lot of memory and disk space.
		$codepoint = ( ord( $char[0] ) & 0x0f ) << 12
			| ( ord( $char[1] ) & 0x3f ) << 6
			| ( ord( $char[2] ) & 0x3f );
		$index = $codepoint - UNICODE_HANGUL_FIRST;

		$l = intval( $index / UNICODE_HANGUL_NCOUNT );
		$v = intval( ( $index % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT );
		$t = $index % UNICODE_HANGUL_TCOUNT;

		$result .= "\xe1\x84" . chr( 0x80 + $l ) . "\xe1\x85" . chr( 0xa1 + $v );
		if ( $t >= 25 ) {
			# The last byte would pass 0xbf here, so the trailing
			# jamo continues under the next second byte.
			$result .= "\xe1\x87" . chr( 0x80 + $t - 25 );
		} elseif ( $t ) {
			$result .= "\xe1\x86" . chr( 0xa7 + $t );
		}
	}

	return $result;
}
|
2011-03-24 20:51:38 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Put runs of combining characters into canonical order. This is the
 * last step in producing the decomposed normal forms D and KD.
 *
 * Combining characters are collected per combining class until the next
 * non-combining character arrives; they are then emitted in ascending
 * class order, keeping their original order within each class.
 *
 * @private
 * @param $string String: a valid, decomposed UTF-8 string. Input is not validated.
 * @return string a UTF-8 string with combining characters sorted in canonical order
 */
static function fastCombiningSort( $string ) {
	self::loadData();

	$length = strlen( $string );
	$result = '';
	$pending = array();   # combining class => combining characters seen so far
	$lastClass = -1;
	$pos = 0;
	while ( $pos < $length ) {
		$char = $string[$pos];
		$lead = ord( $char );

		if ( $lead < 0x80 ) {
			$pos++;
		} else {
			# Take the whole sequence, sized from its lead byte.
			if ( $lead >= 0xf0 ) {
				$width = 4;
			} elseif ( $lead >= 0xe0 ) {
				$width = 3;
			} elseif ( $lead >= 0xc0 ) {
				$width = 2;
			} else {
				$width = 1;
			}
			if ( $width > 1 ) {
				$char = substr( $string, $pos, $width );
			}
			$pos += $width;

			if ( isset( self::$utfCombiningClass[$char] ) ) {
				# Combining character: hold it back under its class.
				$lastClass = self::$utfCombiningClass[$char];
				$soFar = isset( $pending[$lastClass] ) ? $pending[$lastClass] : '';
				$pending[$lastClass] = $soFar . $char;
				continue;
			}
		}

		# Non-combining character: emit whatever is held back, in class
		# order, and then the character itself.
		if ( $lastClass ) {
			ksort( $pending );
			$result .= implode( '', $pending );
			$pending = array();
		}
		$result .= $char;
		$lastClass = 0;
	}

	# Emit combining characters left over at the end of the string.
	if ( $lastClass ) {
		ksort( $pending );
		$result .= implode( '', $pending );
	}

	return $result;
}
|
2006-01-07 13:31:29 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Recombines a decomposed string into canonically composed sequences,
 * i.e. normal form C or KC.
 *
 * The string is walked one UTF-8 sequence at a time while tracking a current
 * start character and the combining characters that could not be merged
 * into it. Pairs are merged through self::$utfCanonicalComp; Hangul jamo are
 * merged arithmetically instead of through that table.
 *
 * @private
 * @param $string String: a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
 * @return string a UTF-8 string with canonical precomposed characters used where possible
 */
static function fastCompose( $string ) {
	UtfNormal::loadData();

	$composed = '';
	// Current start character and the combining characters kept after it.
	$starter = '';
	$marks = '';
	// Class of the previous character: -1 before anything has been read.
	$prevClass = -1;
	// Set once a trailing jamo has been merged into $starter, so that a
	// further jamo is not merged into the same syllable.
	$hangulDone = 0;

	// Lead-byte range used as a cheap pre-check before the Hangul string
	// comparisons further down.
	$hangulLeadMin = ord( substr( UTF8_HANGUL_VBASE, 0, 1 ) );
	$hangulLeadMax = ord( substr( UTF8_HANGUL_TEND, 0, 1 ) );

	$length = strlen( $string );
	$pos = 0;
	while ( $pos < $length ) {
		$char = $string[$pos];
		$lead = ord( $char );

		if ( $lead < 0x80 ) {
			# ASCII never combines: emit what is pending and start afresh.
			$composed .= $starter . $marks;
			$starter = $char;
			$marks = '';
			$prevClass = 0;
			$pos++;
			continue;
		}

		// Multi-byte sequence: the lead byte gives its length. Bytes in
		// 0x80..0xbf are taken on their own.
		if ( $lead >= 0xf0 ) {
			$width = 4;
		} elseif ( $lead >= 0xe0 ) {
			$width = 3;
		} elseif ( $lead >= 0xc0 ) {
			$width = 2;
		} else {
			$width = 1;
		}
		if ( $width > 1 ) {
			$char = substr( $string, $pos, $width );
		}
		$pos += $width;

		$pair = $starter . $char;

		if ( $lead > 0x80 && isset( self::$utfCombiningClass[$char] ) ) {
			# A combining character: merge it into the start character when
			# the table has an entry for the pair and no character of equal
			# or higher class sits in between; otherwise keep it as-is.
			$class = self::$utfCombiningClass[$char];
			if ( !empty( $starter )
				&& $prevClass < $class
				&& $class > 0
				&& isset( self::$utfCanonicalComp[$pair] )
			) {
				$starter = self::$utfCanonicalComp[$pair];
				$prevClass = 0;
			} else {
				$marks .= $char;
				$prevClass = $class;
			}
			$hangulDone = 0;
			continue;
		}

		# Not a combining character. It may still merge with the current
		# start character if nothing unmerged stands between the two.
		if ( $prevClass == 0 ) {
			if ( isset( self::$utfCanonicalComp[$pair] ) ) {
				$starter = self::$utfCanonicalComp[$pair];
				$hangulDone = 0;
				continue;
			}

			if ( $lead >= $hangulLeadMin && $lead <= $hangulLeadMax ) {
				# Hangul is handled inline with byte arithmetic on purpose:
				# calling out to helper functions was measured as slower, and
				# lookup tables would take a lot of space.
				if ( $char >= UTF8_HANGUL_VBASE
					&& $char <= UTF8_HANGUL_VEND
					&& $starter >= UTF8_HANGUL_LBASE
					&& $starter <= UTF8_HANGUL_LEND
				) {
					# Shortcuts for
					#   utf8ToCodepoint( $starter ) - UNICODE_HANGUL_LBASE
					#   utf8ToCodepoint( $char ) - UNICODE_HANGUL_VBASE
					# that read only the third byte of each sequence.
					$lIndex = ord( $starter[2] ) - 0x80;
					$vIndex = ord( $char[2] ) - 0xa1;

					$syllable = UNICODE_HANGUL_FIRST
						+ UNICODE_HANGUL_TCOUNT
						* ( UNICODE_HANGUL_VCOUNT * $lIndex + $vIndex );

					# Hardcoded three-byte UTF-8 encoding of the code point.
					$starter = chr( ( ( $syllable >> 12 ) & 0x0f ) | 0xe0 )
						. chr( ( ( $syllable >> 6 ) & 0x3f ) | 0x80 )
						. chr( ( $syllable & 0x3f ) | 0x80 );
					$hangulDone = 0;
					continue;
				} elseif ( $char >= UTF8_HANGUL_TBASE
					&& $char <= UTF8_HANGUL_TEND
					&& $starter >= UTF8_HANGUL_FIRST
					&& $starter <= UTF8_HANGUL_LAST
					&& !$hangulDone
				) {
					# Shortcut for utf8ToCodepoint( $char ) - UNICODE_HANGUL_TBASE.
					# A negative first result means the third byte belongs to
					# the next 0x40-wide block (code points from 0x11c0 up),
					# so the index is rebuilt from that block's start.
					$tIndex = ord( $char[2] ) - 0xa7;
					if ( $tIndex < 0 ) {
						$tIndex = ord( $char[2] ) - 0x80 + ( 0x11c0 - 0x11a7 );
					}

					# Add $tIndex to the code point directly on the UTF-8
					# bytes, carrying into the second and first byte when a
					# continuation byte overflows past 0xbf. This avoids the
					# cost of decoding and re-encoding.
					$low = ord( $starter[2] ) + $tIndex;
					if ( $low > 0xbf ) {
						$low -= 0x40;
						$middle = ord( $starter[1] ) + 1;
						if ( $middle > 0xbf ) {
							$starter[0] = chr( ord( $starter[0] ) + 1 );
							$middle -= 0x40;
						}
						$starter[1] = chr( $middle );
					}
					$starter[2] = chr( $low );

					# If another jamo follows, it must *not* be merged in.
					$hangulDone = 1;
					continue;
				}
			}
		}

		# Nothing merged: emit what is pending and make this character the
		# new start character.
		$composed .= $starter . $marks;
		$starter = $char;
		$marks = '';
		$prevClass = 0;
		$hangulDone = 0;
	}

	$composed .= $starter . $marks;
	return $composed;
}
|
2011-03-24 20:51:38 +00:00
|
|
|
|
2011-03-27 12:21:45 +00:00
|
|
|
/**
 * Benchmark baseline: copies the input one byte at a time, to measure how
 * long it takes to iterate through a string without doing anything of
 * substance.
 *
 * @param $string string
 * @return string a byte-for-byte copy of the input
 */
static function placebo( $string ) {
	$copy = '';
	$total = strlen( $string );
	$pos = 0;
	while ( $pos < $total ) {
		$copy .= $string[$pos];
		$pos++;
	}
	return $copy;
}
|
2011-04-16 15:32:19 +00:00
|
|
|
/**
 * Replaces characters that we do not want to keep but that most of the
 * native normalize functions leave in place.
 *
 * Bytes 0x00-0x08, 0x0b, 0x0c and 0x0e-0x1f, as well as every occurrence of
 * UTF8_FFFE and UTF8_FFFF, are swapped for UTF8_REPLACEMENT.
 *
 * @param $string String The string
 * @return String String with the character codes replaced.
 */
private static function replaceForNativeNormalize( $string ) {
	// Control bytes below 0x20, leaving out 0x09, 0x0a and 0x0d.
	$controlBytes = '/[\x00-\x08\x0b\x0c\x0e-\x1f]/';
	$cleaned = preg_replace( $controlBytes, UTF8_REPLACEMENT, $string );

	// The search entries are applied in order: UTF8_FFFE first, then UTF8_FFFF.
	return str_replace(
		array( UTF8_FFFE, UTF8_FFFF ),
		UTF8_REPLACEMENT,
		$cleaned
	);
}
|
2004-08-29 10:30:23 +00:00
|
|
|
}
|