RFC 3629 defines the legal range of characters as U+0000..U+10FFFF and forbids overlong forms (encodings of a character that use more bytes than necessary). Let's make StringUtils::isUtf8() match the specification. * Changed the maximum value in the pure PHP code path and added a check for overlong forms. * Added another check, specific to PHP 5.3's mbstring extension, for values above U+10FFFF. * Fixed the mbstring test errors in PHP 5.4 using changes to StringUtilsTest by Platonides <platonides@gmail.com>. * Uncommented some other tests that could fail because of the missing check for overlong forms. * Added additional tests for extra continuation bytes, overlong sequences/forms, and values in the UTF-16 surrogate range. The changes to the function were so extensive that I might as well say I rewrote it. Bug: 43679 Change-Id: I56ae496d17ffc3747550e06a72dacab3ac55da61
158 lines
4.5 KiB
PHP
158 lines
4.5 KiB
PHP
<?php
|
|
|
|
class StringUtilsTest extends MediaWikiTestCase {
|
|
|
|
/**
|
|
* This test StringUtils::isUtf8 whenever we have mbstring extension
|
|
* loaded.
|
|
*
|
|
* @covers StringUtils::isUtf8
|
|
* @dataProvider provideStringsForIsUtf8Check
|
|
*/
|
|
function testIsUtf8WithMbstring( $expected, $string ) {
|
|
if ( !function_exists( 'mb_check_encoding' ) ) {
|
|
$this->markTestSkipped( 'Test requires the mbstring PHP extension' );
|
|
}
|
|
$this->assertEquals( $expected,
|
|
StringUtils::isUtf8( $string ),
|
|
'Testing string "' . $this->escaped( $string ) . '" with mb_check_encoding'
|
|
);
|
|
}
|
|
|
|
/**
|
|
* This test StringUtils::isUtf8 making sure we use the pure PHP
|
|
* implementation used as a fallback when mb_check_encoding() is
|
|
* not available.
|
|
*
|
|
* @covers StringUtils::isUtf8
|
|
* @dataProvider provideStringsForIsUtf8Check
|
|
*/
|
|
function testIsUtf8WithPhpFallbackImplementation( $expected, $string ) {
|
|
$this->assertEquals( $expected,
|
|
StringUtils::isUtf8( $string, /** disable mbstring: */true ),
|
|
'Testing string "' . $this->escaped( $string ) . '" with pure PHP implementation'
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Print high range characters as an hexadecimal
|
|
*/
|
|
function escaped( $string ) {
|
|
$escaped = '';
|
|
$length = strlen( $string );
|
|
for ( $i = 0; $i < $length; $i++ ) {
|
|
$char = $string[$i];
|
|
$val = ord( $char );
|
|
if ( $val > 127 ) {
|
|
$escaped .= '\x' . dechex( $val );
|
|
} else {
|
|
$escaped .= $char;
|
|
}
|
|
}
|
|
|
|
return $escaped;
|
|
}
|
|
|
|
/**
|
|
* See also "UTF-8 decoder capability and stress test" by
|
|
* Markus Kuhn:
|
|
* http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
|
*/
|
|
public static function provideStringsForIsUtf8Check() {
|
|
// Expected return values for StringUtils::isUtf8()
|
|
$PASS = true;
|
|
$FAIL = false;
|
|
|
|
return array(
|
|
array( $PASS, 'Some ASCII' ),
|
|
array( $PASS, "Euro sign €" ),
|
|
|
|
// First possible sequences
|
|
array( $PASS, "\x00" ),
|
|
array( $PASS, "\xc2\x80" ),
|
|
array( $PASS, "\xe0\xa0\x80" ),
|
|
array( $PASS, "\xf0\x90\x80\x80" ),
|
|
array( $FAIL, "\xf8\x88\x80\x80\x80" ),
|
|
array( $FAIL, "\xfc\x84\x80\x80\x80\x80" ),
|
|
|
|
// Last possible sequence
|
|
array( $PASS, "\x7f" ),
|
|
array( $PASS, "\xdf\xbf" ),
|
|
array( $PASS, "\xef\xbf\xbf" ),
|
|
array( $FAIL, "\xf7\xbf\xbf\xbf" ), // U+1FFFFF
|
|
array( $FAIL, "\xfb\xbf\xbf\xbf\xbf" ),
|
|
array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ),
|
|
|
|
// Boundaries
|
|
array( $PASS, "\xed\x9f\xbf" ),
|
|
array( $PASS, "\xee\x80\x80" ),
|
|
array( $PASS, "\xef\xbf\xbd" ),
|
|
array( $PASS, "\xf2\x80\x80\x80" ),
|
|
array( $PASS, "\xf3\xbf\xbf\xbf" ), // U+FFFFF
|
|
array( $PASS, "\xf4\x80\x80\x80" ), // U+100000
|
|
array( $PASS, "\xf4\x8f\xbf\xbf" ), // U+10FFFF
|
|
array( $FAIL, "\xf4\x90\x80\x80" ), // U+110000
|
|
|
|
// Malformed
|
|
array( $FAIL, "\x80" ),
|
|
array( $FAIL, "\xbf" ),
|
|
array( $FAIL, "\x80\xbf" ),
|
|
array( $FAIL, "\x80\xbf\x80" ),
|
|
array( $FAIL, "\x80\xbf\x80\xbf" ),
|
|
array( $FAIL, "\x80\xbf\x80\xbf\x80" ),
|
|
array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ),
|
|
array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ),
|
|
|
|
// Last byte missing
|
|
array( $FAIL, "\xc0" ),
|
|
array( $FAIL, "\xe0\x80" ),
|
|
array( $FAIL, "\xf0\x80\x80" ),
|
|
array( $FAIL, "\xf8\x80\x80\x80" ),
|
|
array( $FAIL, "\xfc\x80\x80\x80\x80" ),
|
|
array( $FAIL, "\xdf" ),
|
|
array( $FAIL, "\xef\xbf" ),
|
|
array( $FAIL, "\xf7\xbf\xbf" ),
|
|
array( $FAIL, "\xfb\xbf\xbf\xbf" ),
|
|
array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ),
|
|
|
|
// Extra continuation byte
|
|
array( $FAIL, "e\xaf" ),
|
|
array( $FAIL, "\xc3\x89\xaf" ),
|
|
array( $FAIL, "\xef\xbc\xa5\xaf" ),
|
|
array( $FAIL, "\xf0\x9d\x99\xb4\xaf" ),
|
|
|
|
// Impossible bytes
|
|
array( $FAIL, "\xfe" ),
|
|
array( $FAIL, "\xff" ),
|
|
array( $FAIL, "\xfe\xfe\xff\xff" ),
|
|
|
|
// Overlong sequences
|
|
array( $FAIL, "\xc0\xaf" ),
|
|
array( $FAIL, "\xc1\xaf" ),
|
|
array( $FAIL, "\xe0\x80\xaf" ),
|
|
array( $FAIL, "\xf0\x80\x80\xaf" ),
|
|
array( $FAIL, "\xf8\x80\x80\x80\xaf" ),
|
|
array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ),
|
|
|
|
// Maximum overlong sequences
|
|
array( $FAIL, "\xc1\xbf" ),
|
|
array( $FAIL, "\xe0\x9f\xbf" ),
|
|
array( $FAIL, "\xf0\x8f\xbf\xbf" ),
|
|
array( $FAIL, "\xf8\x87\xbf\xbf" ),
|
|
array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ),
|
|
|
|
// Surrogates
|
|
array( $PASS, "\xed\x9f\xbf" ), // U+D799
|
|
array( $PASS, "\xee\x80\x80" ), // U+E000
|
|
array( $FAIL, "\xed\xa0\x80" ), // U+D800
|
|
array( $FAIL, "\xed\xaf\xbf" ), // U+DBFF
|
|
array( $FAIL, "\xed\xb0\x80" ), // U+DC00
|
|
array( $FAIL, "\xed\xbf\xbf" ), // U+DFFF
|
|
array( $FAIL, "\xed\xa0\x80\xed\xb0\x80" ), // U+D800 U+DC00
|
|
|
|
// Noncharacters
|
|
array( $PASS, "\xef\xbf\xbe" ),
|
|
array( $PASS, "\xef\xbf\xbf" ),
|
|
);
|
|
}
|
|
}
|