Merge "Title: Add byte class to unicode class conversion for js"

This commit is contained in:
jenkins-bot 2013-09-27 17:58:45 +00:00 committed by Gerrit Code Review
commit 2f0636db20
3 changed files with 171 additions and 0 deletions

View file

@ -491,6 +491,108 @@ class Title {
return $rxTc;
}
/**
* Utility method for converting a character sequence from bytes to Unicode.
*
* Primary usecase being converting $wgLegalTitleChars to a sequence usable in
* javascript, as PHP uses UTF-8 bytes where javascript uses Unicode code units.
*
* @param string $byteClass
* @return string
*/
public static function convertByteClassToUnicodeClass( $byteClass ) {
$length = strlen( $byteClass );
// Input token queue
$x0 = $x1 = $x2 = '';
// Decoded queue
$d0 = $d1 = $d2 = '';
// Decoded integer codepoints
$ord0 = $ord1 = $ord2 = 0;
// Re-encoded queue
$r0 = $r1 = $r2 = '';
// Output
$out = '';
// Flags
$allowUnicode = false;
for ( $pos = 0; $pos < $length; $pos++ ) {
// Shift the queues down
$x2 = $x1;
$x1 = $x0;
$d2 = $d1;
$d1 = $d0;
$ord2 = $ord1;
$ord1 = $ord0;
$r2 = $r1;
$r1 = $r0;
// Load the current input token and decoded values
$inChar = $byteClass[$pos];
if ( $inChar == '\\' ) {
if ( preg_match( '/x([0-9a-fA-F]{2})/A', $byteClass, $m, 0, $pos + 1 ) ) {
$x0 = $inChar . $m[0];
$d0 = chr( hexdec( $m[1] ) );
$pos += strlen( $m[0] );
} elseif ( preg_match( '/[0-7]{3}/A', $byteClass, $m, 0, $pos + 1 ) ) {
$x0 = $inChar . $m[0];
$d0 = chr( octdec( $m[0] ) );
$pos += strlen( $m[0] );
} elseif ( $pos + 1 >= $length ) {
$x0 = $d0 = '\\';
} else {
$d0 = $byteClass[$pos + 1];
$x0 = $inChar . $d0;
$pos += 1;
}
} else {
$x0 = $d0 = $inChar;
}
$ord0 = ord( $d0 );
// Load the current re-encoded value
if ( $ord0 < 32 || $ord0 == 0x7f ) {
$r0 = sprintf( '\x%02x', $ord0 );
} elseif ( $ord0 >= 0x80 ) {
// Allow unicode if a single high-bit character appears
$r0 = sprintf( '\x%02x', $ord0 );
$allowUnicode = true;
} elseif ( strpos( '-\\[]^', $d0 ) !== false ) {
$r0 = '\\' . $d0;
} else {
$r0 = $d0;
}
// Do the output
if ( $x0 !== '' && $x1 === '-' && $x2 !== '' ) {
// Range
if ( $ord2 > $ord0 ) {
// Empty range
} elseif ( $ord0 >= 0x80 ) {
// Unicode range
$allowUnicode = true;
if ( $ord2 < 0x80 ) {
// Keep the non-unicode section of the range
$out .= "$r2-\\x7F";
}
} else {
// Normal range
$out .= "$r2-$r0";
}
// Reset state to the initial value
$x0 = $x1 = $d0 = $d1 = $r0 = $r1 = '';
} elseif ( $ord2 < 0x80 ) {
// ASCII character
$out .= $r2;
}
}
if ( $ord1 < 0x80 ) {
$out .= $r1;
}
if ( $ord0 < 0x80 ) {
$out .= $r0;
}
if ( $allowUnicode ) {
$out .= '\u0080-\uFFFF';
}
return $out;
}
/**
* Get a string representation of a title suitable for
* including in a search index

View file

@ -95,6 +95,7 @@ class ResourceLoaderStartUpModule extends ResourceLoaderModule {
'wgCookiePrefix' => $wgCookiePrefix,
'wgResourceLoaderMaxQueryLength' => $wgResourceLoaderMaxQueryLength,
'wgCaseSensitiveNamespaces' => $caseSensitiveNamespaces,
'wgLegalTitleChars' => Title::convertByteClassToUnicodeClass( Title::legalChars() ),
);
wfRunHooks( 'ResourceLoaderGetConfigVars', array( &$vars ) );

View file

@ -32,6 +32,74 @@ class TitleTest extends MediaWikiTestCase {
}
}
public static function provideConvertByteClassToUnicodeClass() {
return array(
array(
' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+',
' %!"$&\'()*,\\-./0-9:;=?@A-Z\\\\\\^_`a-z~+\\u0080-\\uFFFF',
),
array(
'QWERTYf-\\xFF+',
'QWERTYf-\\x7F+\\u0080-\\uFFFF',
),
array(
'QWERTY\\x66-\\xFD+',
'QWERTYf-\\x7F+\\u0080-\\uFFFF',
),
array(
'QWERTYf-y+',
'QWERTYf-y+',
),
array(
'QWERTYf-\\x80+',
'QWERTYf-\\x7F+\\u0080-\\uFFFF',
),
array(
'QWERTY\\x66-\\x80+\\x23',
'QWERTYf-\\x7F+#\\u0080-\\uFFFF',
),
array(
'QWERTY\\x66-\\x80+\\xD3',
'QWERTYf-\\x7F+\\u0080-\\uFFFF',
),
array(
'\\\\\\x99',
'\\\\\\u0080-\\uFFFF',
),
array(
'-\\x99',
'\\-\\u0080-\\uFFFF',
),
array(
'QWERTY\\-\\x99',
'QWERTY\\-\\u0080-\\uFFFF',
),
array(
'\\\\x99',
'\\\\x99',
),
array(
'A-\\x9F',
'A-\\x7F\\u0080-\\uFFFF',
),
array(
'\\x66-\\x77QWERTY\\x88-\\x91FXZ',
'f-wQWERTYFXZ\\u0080-\\uFFFF',
),
array(
'\\x66-\\x99QWERTY\\xAA-\\xEEFXZ',
'f-\\x7FQWERTYFXZ\\u0080-\\uFFFF',
),
);
}
/**
* @dataProvider provideConvertByteClassToUnicodeClass
*/
function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass ) {
$this->assertEquals( $unicodeClass, Title::convertByteClassToUnicodeClass( $byteClass ) );
}
/**
* @dataProvider provideBug31100
*/