Title: Add byte class to unicode class conversion for js

The upcoming rewrite of mw.Title needs to use wgLegalTitleChars,
but for that to work, it needs to be converted into something
that can work in javascript.

Signed-off-by: Timo Tijhof <krinklemail@gmail.com>
Signed-off-by: David Chan <david@sheetmusic.org.uk>
Change-Id: I163f3d7e3a680d52640a93f4bd195d8209669918
This commit is contained in:
Timo Tijhof 2013-08-30 14:00:40 -07:00
parent aa6f866bd1
commit dc9c9ee7fc
3 changed files with 171 additions and 0 deletions

View file

@ -491,6 +491,108 @@ class Title {
return $rxTc;
}
/**
* Utility method for converting a character sequence from bytes to Unicode.
*
* The primary use case is converting $wgLegalTitleChars to a sequence usable in
* JavaScript, as PHP uses UTF-8 bytes where JavaScript uses Unicode code units.
*
* @param string $byteClass
* @return string
*/
public static function convertByteClassToUnicodeClass( $byteClass ) {
	// The input is the *inside* of a regex character class written in terms of
	// bytes (e.g. 'a-z\x80-\xFF'). It is tokenised one class member at a time
	// while remembering the two previous tokens, so that a "start - end" range
	// can be recognised when its end token is read. ASCII members are copied to
	// the output (re-escaped where needed); any member or range end that is
	// >= 0x80 is dropped and replaced by one trailing '\u0080-\uFFFF' range.
	$length = strlen( $byteClass );
	// Input token queue: raw source text of the current token (x0) and of the
	// two tokens before it (x1, x2). An empty string means "no token".
	$x0 = $x1 = $x2 = '';
	// Decoded byte of the current token
	$d0 = '';
	// Decoded integer byte values, same queue layout as the tokens
	$ord0 = $ord1 = $ord2 = 0;
	// Re-encoded queue: how each token is written to the output
	$r0 = $r1 = $r2 = '';
	// Output
	$out = '';
	// Flags: set once any high-bit (>= 0x80) byte has been seen
	$allowUnicode = false;
	for ( $pos = 0; $pos < $length; $pos++ ) {
		// Shift the queues down
		$x2 = $x1;
		$x1 = $x0;
		$ord2 = $ord1;
		$ord1 = $ord0;
		$r2 = $r1;
		$r1 = $r0;
		// Load the current input token and decoded values
		$inChar = $byteClass[$pos];
		if ( $inChar === '\\' ) {
			// The /A modifier anchors each match at the given offset, i.e.
			// directly after the backslash.
			if ( preg_match( '/x([0-9a-fA-F]{2})/A', $byteClass, $m, 0, $pos + 1 ) ) {
				// Hexadecimal escape: \xHH
				$x0 = $inChar . $m[0];
				$d0 = chr( hexdec( $m[1] ) );
				$pos += strlen( $m[0] );
			} elseif ( preg_match( '/[0-7]{3}/A', $byteClass, $m, 0, $pos + 1 ) ) {
				// Octal escape: \NNN
				$x0 = $inChar . $m[0];
				$d0 = chr( octdec( $m[0] ) );
				$pos += strlen( $m[0] );
			} elseif ( $pos + 1 >= $length ) {
				// Lone backslash at the very end of the input: keep it literally
				$x0 = $d0 = '\\';
			} else {
				// Backslash followed by any other byte: that byte is the literal
				$d0 = $byteClass[$pos + 1];
				$x0 = $inChar . $d0;
				$pos += 1;
			}
		} else {
			$x0 = $d0 = $inChar;
		}
		$ord0 = ord( $d0 );
		// Load the current re-encoded value
		if ( $ord0 < 32 || $ord0 === 0x7f ) {
			// Control characters are written as \xHH escapes
			$r0 = sprintf( '\x%02x', $ord0 );
		} elseif ( $ord0 >= 0x80 ) {
			// Allow unicode if a single high-bit character appears
			$r0 = sprintf( '\x%02x', $ord0 );
			$allowUnicode = true;
		} elseif ( strpos( '-\\[]^', $d0 ) !== false ) {
			// Characters that are special inside a character class get escaped
			$r0 = '\\' . $d0;
		} else {
			$r0 = $d0;
		}
		// Do the output
		if ( $x0 !== '' && $x1 === '-' && $x2 !== '' ) {
			// Range: an unescaped '-' between two tokens. An escaped '\-' has
			// the raw token '\-' and therefore never matches here.
			if ( $ord2 > $ord0 ) {
				// Empty range
			} elseif ( $ord0 >= 0x80 ) {
				// Unicode range
				$allowUnicode = true;
				if ( $ord2 < 0x80 ) {
					// Keep the non-unicode section of the range
					$out .= $r2 . '-\\x7F';
				}
			} else {
				// Normal range
				$out .= $r2 . '-' . $r0;
			}
			// Reset state to the initial value so the consumed tokens are not
			// emitted again and cannot start another range
			$x0 = $x1 = $d0 = $r0 = $r1 = '';
		} elseif ( $ord2 < 0x80 ) {
			// ASCII character: the token two positions back can no longer be
			// part of a range, so flush it
			$out .= $r2;
		}
	}
	// Flush the (up to two) tokens still waiting in the queue
	if ( $ord1 < 0x80 ) {
		$out .= $r1;
	}
	if ( $ord0 < 0x80 ) {
		$out .= $r0;
	}
	if ( $allowUnicode ) {
		$out .= '\u0080-\uFFFF';
	}
	return $out;
}
/**
* Get a string representation of a title suitable for
* including in a search index

View file

@ -95,6 +95,7 @@ class ResourceLoaderStartUpModule extends ResourceLoaderModule {
'wgCookiePrefix' => $wgCookiePrefix,
'wgResourceLoaderMaxQueryLength' => $wgResourceLoaderMaxQueryLength,
'wgCaseSensitiveNamespaces' => $caseSensitiveNamespaces,
'wgLegalTitleChars' => Title::convertByteClassToUnicodeClass( Title::legalChars() ),
);
wfRunHooks( 'ResourceLoaderGetConfigVars', array( &$vars ) );

View file

@ -32,6 +32,74 @@ class TitleTest extends MediaWikiTestCase {
}
}
public static function provideConvertByteClassToUnicodeClass() {
	// Every data set is array( byte class input, expected Unicode class output ).
	$cases = array();

	// Long mixed class: escaped punctuation, ASCII ranges and a high-byte range
	$cases[] = array(
		' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+',
		' %!"$&\'()*,\\-./0-9:;=?@A-Z\\\\\\^_`a-z~+\\u0080-\\uFFFF',
	);
	// Range from a literal ASCII char up to \xFF
	$cases[] = array(
		'QWERTYf-\\xFF+',
		'QWERTYf-\\x7F+\\u0080-\\uFFFF',
	);
	// Same range with the start written as a hex escape
	$cases[] = array(
		'QWERTY\\x66-\\xFD+',
		'QWERTYf-\\x7F+\\u0080-\\uFFFF',
	);
	// Pure ASCII input
	$cases[] = array(
		'QWERTYf-y+',
		'QWERTYf-y+',
	);
	// Range ending exactly at \x80
	$cases[] = array(
		'QWERTYf-\\x80+',
		'QWERTYf-\\x7F+\\u0080-\\uFFFF',
	);
	// Hex-escaped ASCII char after the range
	$cases[] = array(
		'QWERTY\\x66-\\x80+\\x23',
		'QWERTYf-\\x7F+#\\u0080-\\uFFFF',
	);
	// Single high byte after the range
	$cases[] = array(
		'QWERTY\\x66-\\x80+\\xD3',
		'QWERTYf-\\x7F+\\u0080-\\uFFFF',
	);
	// Escaped backslash followed by a high byte
	$cases[] = array(
		'\\\\\\x99',
		'\\\\\\u0080-\\uFFFF',
	);
	// Leading hyphen followed by a high byte
	$cases[] = array(
		'-\\x99',
		'\\-\\u0080-\\uFFFF',
	);
	// Escaped hyphen followed by a high byte
	$cases[] = array(
		'QWERTY\\-\\x99',
		'QWERTY\\-\\u0080-\\uFFFF',
	);
	// Escaped backslash followed by the literal text "x99"
	$cases[] = array(
		'\\\\x99',
		'\\\\x99',
	);
	// Range from ASCII into the high bytes with nothing after it
	$cases[] = array(
		'A-\\x9F',
		'A-\\x7F\\u0080-\\uFFFF',
	);
	// One ASCII range and one range lying entirely in the high bytes
	$cases[] = array(
		'\\x66-\\x77QWERTY\\x88-\\x91FXZ',
		'f-wQWERTYFXZ\\u0080-\\uFFFF',
	);
	// One range crossing 0x80 and one range lying entirely in the high bytes
	$cases[] = array(
		'\\x66-\\x99QWERTY\\xAA-\\xEEFXZ',
		'f-\\x7FQWERTYFXZ\\u0080-\\uFFFF',
	);

	return $cases;
}
/**
* @dataProvider provideConvertByteClassToUnicodeClass
*/
function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass ) {
	// Convert the supplied byte class, then compare the result with the
	// expected Unicode class from the data set.
	$actual = Title::convertByteClassToUnicodeClass( $byteClass );
	$this->assertEquals( $unicodeClass, $actual );
}
/**
* @dataProvider provideBug31100
*/