2012-11-16 12:47:10 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
/**
 * Tests for StringUtils::isUtf8(), run once through the default code path
 * and once with mbstring explicitly disabled, against a shared data set.
 */
class StringUtilsTest extends MediaWikiTestCase {

	/**
	 * Tests StringUtils::isUtf8() whenever the mbstring extension is loaded.
	 *
	 * The test is skipped when mb_check_encoding() does not exist.
	 *
	 * @covers StringUtils::isUtf8
	 * @dataProvider provideStringsForIsUtf8Check
	 *
	 * @param bool $expected Expected return value of StringUtils::isUtf8()
	 * @param string $string Byte string to check
	 */
	public function testIsUtf8WithMbstring( $expected, $string ) {
		if ( !function_exists( 'mb_check_encoding' ) ) {
			$this->markTestSkipped( 'Test requires the mbstring PHP extension' );
		}
		$this->assertEquals( $expected,
			StringUtils::isUtf8( $string ),
			'Testing string "' . $this->escaped( $string ) . '" with mb_check_encoding'
		);
	}

	/**
	 * Tests StringUtils::isUtf8() making sure we use the pure PHP
	 * implementation used as a fallback when mb_check_encoding() is
	 * not available.
	 *
	 * @covers StringUtils::isUtf8
	 * @dataProvider provideStringsForIsUtf8Check
	 *
	 * @param bool $expected Expected return value of StringUtils::isUtf8()
	 * @param string $string Byte string to check
	 */
	public function testIsUtf8WithPhpFallbackImplementation( $expected, $string ) {
		$this->assertEquals( $expected,
			StringUtils::isUtf8( $string, /** disable mbstring: */true ),
			'Testing string "' . $this->escaped( $string ) . '" with pure PHP implementation'
		);
	}

	/**
	 * Render a byte string so it is safe to embed in an assertion message.
	 *
	 * High range bytes (above 127) and control bytes (below 32, plus 127)
	 * are printed as a two digit lowercase hexadecimal escape such as \xc2;
	 * every other byte is copied through unchanged.
	 *
	 * @param string $string Raw bytes, processed one byte at a time
	 * @return string Printable representation of $string
	 */
	public function escaped( $string ) {
		$escaped = '';
		$length = strlen( $string );
		for ( $i = 0; $i < $length; $i++ ) {
			$char = $string[$i];
			$val = ord( $char );
			if ( $val < 0x20 || $val >= 0x7f ) {
				# %02x keeps the escape two digits wide, so "\x00" is
				# rendered as \x00 instead of a raw NUL byte.
				$escaped .= sprintf( '\x%02x', $val );
			} else {
				$escaped .= $char;
			}
		}
		return $escaped;
	}

	/**
	 * Data set shared by both isUtf8 tests.
	 *
	 * See also "UTF-8 decoder capability and stress test" by
	 * Markus Kuhn:
	 * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
	 *
	 * @return array[] List of array( bool $expected, string $string ) cases
	 */
	public static function provideStringsForIsUtf8Check() {
		// Expected return values for StringUtils::isUtf8()
		$PASS = true;
		$FAIL = false;

		return array(
			array( $PASS, 'Some ASCII' ),
			array( $PASS, "Euro sign €" ),

			# First possible sequences
			# NOTE(review): RFC 3629 caps UTF-8 at U+10FFFF (4 bytes), yet
			# several 5/6 byte sequences and "\xf4\x90\x80\x80" (U+110000)
			# below are expected to pass. Confirm these expectations still
			# match StringUtils::isUtf8() and mb_check_encoding().
			array( $PASS, "\x00" ),
			array( $PASS, "\xc2\x80" ),
			array( $PASS, "\xe0\xa0\x80" ),
			array( $PASS, "\xf0\x90\x80\x80" ),
			array( $PASS, "\xf8\x88\x80\x80\x80" ),
			array( $PASS, "\xfc\x84\x80\x80\x80\x80" ),

			# Last possible sequence
			array( $PASS, "\x7f" ),
			array( $PASS, "\xdf\xbf" ),
			array( $PASS, "\xef\xbf\xbf" ),
			array( $PASS, "\xf7\xbf\xbf\xbf" ),
			array( $PASS, "\xfb\xbf\xbf\xbf\xbf" ),
			array( $FAIL, "\xfd\xbf\xbf\xbf\xbf\xbf" ),

			# boundaries:
			array( $PASS, "\xed\x9f\xbf" ),
			array( $PASS, "\xee\x80\x80" ),
			array( $PASS, "\xef\xbf\xbd" ),
			array( $PASS, "\xf4\x8f\xbf\xbf" ),
			array( $PASS, "\xf4\x90\x80\x80" ),

			# Malformed
			array( $FAIL, "\x80" ),
			array( $FAIL, "\xBF" ),
			array( $FAIL, "\x80\xbf" ),
			array( $FAIL, "\x80\xbf\x80" ),
			array( $FAIL, "\x80\xbf\x80\xbf" ),
			array( $FAIL, "\x80\xbf\x80\xbf\x80" ),
			array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf" ),
			array( $FAIL, "\x80\xbf\x80\xbf\x80\xbf\x80" ),

			# last byte missing
			array( $FAIL, "\xc0" ),
			array( $FAIL, "\xe0\x80" ),
			array( $FAIL, "\xf0\x80\x80" ),
			array( $FAIL, "\xf8\x80\x80\x80" ),
			array( $FAIL, "\xfc\x80\x80\x80\x80" ),
			array( $FAIL, "\xdf" ),
			array( $FAIL, "\xef\xbf" ),
			array( $FAIL, "\xf7\xbf\xbf" ),
			array( $FAIL, "\xfb\xbf\xbf\xbf" ),
			array( $FAIL, "\xfd\xbf\xbf\xbf\xbf" ),

			# impossible bytes
			array( $FAIL, "\xfe" ),
			array( $FAIL, "\xff" ),
			array( $FAIL, "\xfe\xfe\xff\xff" ),

			/*
			# The PHP implementation does not handle characters
			# being represented in a form which is too long :(

			# overlong sequences
			array( $FAIL, "\xc0\xaf" ),
			array( $FAIL, "\xe0\x80\xaf" ),
			array( $FAIL, "\xf0\x80\x80\xaf" ),
			array( $FAIL, "\xf8\x80\x80\x80\xaf" ),
			array( $FAIL, "\xfc\x80\x80\x80\x80\xaf" ),

			# Maximum overlong sequences
			array( $FAIL, "\xc1\xbf" ),
			array( $FAIL, "\xe0\x9f\xbf" ),
			array( $FAIL, "\xf0\x8F\xbf\xbf" ),
			array( $FAIL, "\xf8\x87\xbf\xbf\xbf" ),
			array( $FAIL, "\xfc\x83\xbf\xbf\xbf\xbf" ),
			*/

			# non characters
			array( $PASS, "\xef\xbf\xbe" ),
			array( $PASS, "\xef\xbf\xbf" ),
		);
	}
}
|