Title: Add byte class to unicode class conversion for js
The upcoming rewrite of mw.Title needs to use wgLegalTitleChars, but for that to work, the byte-based character class has to be converted into something that works in JavaScript. Signed-off-by: Timo Tijhof <krinklemail@gmail.com> Signed-off-by: David Chan <david@sheetmusic.org.uk> Change-Id: I163f3d7e3a680d52640a93f4bd195d8209669918
This commit is contained in:
parent
aa6f866bd1
commit
dc9c9ee7fc
3 changed files with 171 additions and 0 deletions
|
|
@ -491,6 +491,108 @@ class Title {
|
|||
return $rxTc;
|
||||
}
|
||||
|
||||
/**
 * Utility method for converting a character class body from bytes to Unicode.
 *
 * Primary use case is converting $wgLegalTitleChars to a sequence usable in
 * JavaScript, as PHP uses UTF-8 bytes where JavaScript uses Unicode code units.
 *
 * The input is read as a sequence of tokens, each decoding to one byte:
 *  - "\xHH" (two hex digits) and "\OOO" (three octal digits) are numeric escapes;
 *  - a backslash followed by any other character stands for that character;
 *  - a lone backslash at the very end of the input stands for itself;
 *  - everything else is a literal byte.
 *
 * The output is built as follows:
 *  - bytes below 0x20 and the byte 0x7F are written as "\xhh" (lowercase hex);
 *  - the characters - \ [ ] ^ are written backslash-escaped;
 *  - other ASCII bytes are written as-is;
 *  - bytes >= 0x80 are never written individually; if at least one appears,
 *    "\u0080-\uFFFF" is appended once at the end of the output;
 *  - a range "a-b" whose start is greater than its end is dropped; a range
 *    whose end is >= 0x80 is cut down to "a-\x7F" (or dropped entirely if
 *    its start is >= 0x80 too); any other range is written as "a-b".
 *
 * @param string $byteClass Body of a byte-oriented character class, without
 *   the surrounding square brackets
 * @return string Equivalent character class body for a JavaScript regex
 */
public static function convertByteClassToUnicodeClass( $byteClass ) {
	$length = strlen( $byteClass );
	// Input token queue: the raw text of the current token and the two
	// before it. Kept in raw form so that an unescaped "-" (range operator)
	// can be told apart from an escaped "\-" (literal hyphen).
	$x0 = $x1 = $x2 = '';
	// Decoded byte values of the same three tokens
	$ord0 = $ord1 = $ord2 = 0;
	// Re-encoded (output) form of the same three tokens
	$r0 = $r1 = $r2 = '';
	// Output
	$out = '';
	// Set as soon as any byte >= 0x80 is seen, alone or as a range end
	$allowUnicode = false;

	for ( $pos = 0; $pos < $length; $pos++ ) {
		// Shift the queues down
		$x2 = $x1;
		$x1 = $x0;
		$ord2 = $ord1;
		$ord1 = $ord0;
		$r2 = $r1;
		$r1 = $r0;

		// Load the current input token and its decoded byte
		$inChar = $byteClass[$pos];
		if ( $inChar === '\\' ) {
			// The /A modifier anchors each match at the given offset, i.e.
			// directly after the backslash.
			if ( preg_match( '/x([0-9a-fA-F]{2})/A', $byteClass, $m, 0, $pos + 1 ) ) {
				// Hexadecimal escape
				$x0 = $inChar . $m[0];
				$d0 = chr( hexdec( $m[1] ) );
				$pos += strlen( $m[0] );
			} elseif ( preg_match( '/[0-7]{3}/A', $byteClass, $m, 0, $pos + 1 ) ) {
				// Octal escape
				$x0 = $inChar . $m[0];
				$d0 = chr( octdec( $m[0] ) );
				$pos += strlen( $m[0] );
			} elseif ( $pos + 1 >= $length ) {
				// Backslash is the last byte of the input: take it literally
				$x0 = $d0 = '\\';
			} else {
				// Any other escaped character stands for itself
				$d0 = $byteClass[$pos + 1];
				$x0 = $inChar . $d0;
				$pos += 1;
			}
		} else {
			$x0 = $d0 = $inChar;
		}
		$ord0 = ord( $d0 );

		// Load the current re-encoded value
		if ( $ord0 < 32 || $ord0 == 0x7f ) {
			// Control characters are always written as hex escapes
			$r0 = sprintf( '\x%02x', $ord0 );
		} elseif ( $ord0 >= 0x80 ) {
			// Allow unicode if a single high-bit character appears.
			// $r0 is still filled in, but values >= 0x80 are never
			// appended to $out by the code below.
			$r0 = sprintf( '\x%02x', $ord0 );
			$allowUnicode = true;
		} elseif ( strpos( '-\\[]^', $d0 ) !== false ) {
			// Characters that are special inside a character class
			$r0 = '\\' . $d0;
		} else {
			$r0 = $d0;
		}

		// Do the output
		if ( $x0 !== '' && $x1 === '-' && $x2 !== '' ) {
			// Range
			if ( $ord2 > $ord0 ) {
				// Empty range
			} elseif ( $ord0 >= 0x80 ) {
				// Unicode range
				$allowUnicode = true;
				if ( $ord2 < 0x80 ) {
					// Keep the non-unicode section of the range
					$out .= "$r2-\\x7F";
				}
			} else {
				// Normal range
				$out .= "$r2-$r0";
			}
			// Reset state to the initial value so that none of the three
			// range tokens is written again. The $ord values are left
			// alone on purpose: the matching $r values are now empty,
			// so appending them later adds nothing.
			$x0 = $x1 = $r0 = $r1 = '';
		} elseif ( $ord2 < 0x80 ) {
			// ASCII character: the token two positions back can no longer
			// become part of a range, so it is safe to write it out.
			$out .= $r2;
		}
	}

	// Flush the two tokens still waiting in the queue
	if ( $ord1 < 0x80 ) {
		$out .= $r1;
	}
	if ( $ord0 < 0x80 ) {
		$out .= $r0;
	}
	if ( $allowUnicode ) {
		$out .= '\u0080-\uFFFF';
	}
	return $out;
}
|
||||
|
||||
/**
|
||||
* Get a string representation of a title suitable for
|
||||
* including in a search index
|
||||
|
|
|
|||
|
|
@ -95,6 +95,7 @@ class ResourceLoaderStartUpModule extends ResourceLoaderModule {
|
|||
'wgCookiePrefix' => $wgCookiePrefix,
|
||||
'wgResourceLoaderMaxQueryLength' => $wgResourceLoaderMaxQueryLength,
|
||||
'wgCaseSensitiveNamespaces' => $caseSensitiveNamespaces,
|
||||
'wgLegalTitleChars' => Title::convertByteClassToUnicodeClass( Title::legalChars() ),
|
||||
);
|
||||
|
||||
wfRunHooks( 'ResourceLoaderGetConfigVars', array( &$vars ) );
|
||||
|
|
|
|||
|
|
@ -32,6 +32,74 @@ class TitleTest extends MediaWikiTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Data sets for testConvertByteClassToUnicodeClass().
 *
 * Each entry is array( byte class given to the converter, expected result ).
 * All strings are single-quoted, so "\\" in the source is one backslash
 * in the actual value.
 *
 * @return array
 */
public static function provideConvertByteClassToUnicodeClass() {
	return array(
		// Class with escaped specials and a \x80-\xFF range
		array(
			' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+',
			' %!"$&\'()*,\\-./0-9:;=?@A-Z\\\\\\^_`a-z~+\\u0080-\\uFFFF',
		),
		// Range from a literal ASCII character into the high bytes
		array(
			'QWERTYf-\\xFF+',
			'QWERTYf-\\x7F+\\u0080-\\uFFFF',
		),
		// Same, with the range start written as a hex escape
		array(
			'QWERTY\\x66-\\xFD+',
			'QWERTYf-\\x7F+\\u0080-\\uFFFF',
		),
		// Pure ASCII range is passed through
		array(
			'QWERTYf-y+',
			'QWERTYf-y+',
		),
		// Range ending exactly on the first high byte
		array(
			'QWERTYf-\\x80+',
			'QWERTYf-\\x7F+\\u0080-\\uFFFF',
		),
		// Hex escape of a printable ASCII character after a range
		array(
			'QWERTY\\x66-\\x80+\\x23',
			'QWERTYf-\\x7F+#\\u0080-\\uFFFF',
		),
		// Single trailing high byte is dropped from the ASCII part
		array(
			'QWERTY\\x66-\\x80+\\xD3',
			'QWERTYf-\\x7F+\\u0080-\\uFFFF',
		),
		// Escaped backslash followed by a high byte
		array(
			'\\\\\\x99',
			'\\\\\\u0080-\\uFFFF',
		),
		// Leading hyphen is a literal, not a range operator
		array(
			'-\\x99',
			'\\-\\u0080-\\uFFFF',
		),
		// Escaped hyphen is a literal, not a range operator
		array(
			'QWERTY\\-\\x99',
			'QWERTY\\-\\u0080-\\uFFFF',
		),
		// Escaped backslash followed by the plain text "x99"
		array(
			'\\\\x99',
			'\\\\x99',
		),
		// Range at the very end of the input
		array(
			'A-\\x9F',
			'A-\\x7F\\u0080-\\uFFFF',
		),
		// ASCII range plus a range lying entirely in the high bytes
		array(
			'\\x66-\\x77QWERTY\\x88-\\x91FXZ',
			'f-wQWERTYFXZ\\u0080-\\uFFFF',
		),
		// Range crossing into the high bytes plus a high-byte-only range
		array(
			'\\x66-\\x99QWERTY\\xAA-\\xEEFXZ',
			'f-\\x7FQWERTYFXZ\\u0080-\\uFFFF',
		),
		// Empty input gives empty output
		array(
			'',
			'',
		),
		// Three-digit octal escapes are decoded like hex escapes
		array(
			'QWERTY\\146-\\171+',
			'QWERTYf-y+',
		),
		// A lone backslash at the end of the input is a literal backslash
		array(
			'a\\',
			'a\\\\',
		),
		// A reversed range is dropped entirely
		array(
			'z-a',
			'',
		),
		// Control characters and 0x7F come out as lowercase hex escapes
		array(
			'a\\x09\\x7F',
			'a\\x09\\x7f',
		),
	);
}
|
||||
|
||||
/**
 * Check that Title::convertByteClassToUnicodeClass() converts each byte
 * class from the data provider into the expected string.
 *
 * @dataProvider provideConvertByteClassToUnicodeClass
 * @param string $byteClass Character class body passed to the converter
 * @param string $unicodeClass Expected converted character class body
 */
public function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass ) {
	// Both values are strings, so compare with identity rather than
	// loose equality.
	$this->assertSame(
		$unicodeClass,
		Title::convertByteClassToUnicodeClass( $byteClass ),
		"Conversion of byte class '$byteClass'"
	);
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideBug31100
|
||||
*/
|
||||
|
|
|
|||
Loading…
Reference in a new issue