Merge "Title: Add byte class to unicode class conversion for js"
This commit is contained in:
commit
2f0636db20
3 changed files with 171 additions and 0 deletions
|
|
@ -491,6 +491,108 @@ class Title {
|
|||
return $rxTc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method for converting a character sequence from bytes to Unicode.
|
||||
*
|
||||
* Primary usecase being converting $wgLegalTitleChars to a sequence usable in
|
||||
* javascript, as PHP uses UTF-8 bytes where javascript uses Unicode code units.
|
||||
*
|
||||
* @param string $byteClass
|
||||
* @return string
|
||||
*/
|
||||
public static function convertByteClassToUnicodeClass( $byteClass ) {
|
||||
$length = strlen( $byteClass );
|
||||
// Input token queue
|
||||
$x0 = $x1 = $x2 = '';
|
||||
// Decoded queue
|
||||
$d0 = $d1 = $d2 = '';
|
||||
// Decoded integer codepoints
|
||||
$ord0 = $ord1 = $ord2 = 0;
|
||||
// Re-encoded queue
|
||||
$r0 = $r1 = $r2 = '';
|
||||
// Output
|
||||
$out = '';
|
||||
// Flags
|
||||
$allowUnicode = false;
|
||||
for ( $pos = 0; $pos < $length; $pos++ ) {
|
||||
// Shift the queues down
|
||||
$x2 = $x1;
|
||||
$x1 = $x0;
|
||||
$d2 = $d1;
|
||||
$d1 = $d0;
|
||||
$ord2 = $ord1;
|
||||
$ord1 = $ord0;
|
||||
$r2 = $r1;
|
||||
$r1 = $r0;
|
||||
// Load the current input token and decoded values
|
||||
$inChar = $byteClass[$pos];
|
||||
if ( $inChar == '\\' ) {
|
||||
if ( preg_match( '/x([0-9a-fA-F]{2})/A', $byteClass, $m, 0, $pos + 1 ) ) {
|
||||
$x0 = $inChar . $m[0];
|
||||
$d0 = chr( hexdec( $m[1] ) );
|
||||
$pos += strlen( $m[0] );
|
||||
} elseif ( preg_match( '/[0-7]{3}/A', $byteClass, $m, 0, $pos + 1 ) ) {
|
||||
$x0 = $inChar . $m[0];
|
||||
$d0 = chr( octdec( $m[0] ) );
|
||||
$pos += strlen( $m[0] );
|
||||
} elseif ( $pos + 1 >= $length ) {
|
||||
$x0 = $d0 = '\\';
|
||||
} else {
|
||||
$d0 = $byteClass[$pos + 1];
|
||||
$x0 = $inChar . $d0;
|
||||
$pos += 1;
|
||||
}
|
||||
} else {
|
||||
$x0 = $d0 = $inChar;
|
||||
}
|
||||
$ord0 = ord( $d0 );
|
||||
// Load the current re-encoded value
|
||||
if ( $ord0 < 32 || $ord0 == 0x7f ) {
|
||||
$r0 = sprintf( '\x%02x', $ord0 );
|
||||
} elseif ( $ord0 >= 0x80 ) {
|
||||
// Allow unicode if a single high-bit character appears
|
||||
$r0 = sprintf( '\x%02x', $ord0 );
|
||||
$allowUnicode = true;
|
||||
} elseif ( strpos( '-\\[]^', $d0 ) !== false ) {
|
||||
$r0 = '\\' . $d0;
|
||||
} else {
|
||||
$r0 = $d0;
|
||||
}
|
||||
// Do the output
|
||||
if ( $x0 !== '' && $x1 === '-' && $x2 !== '' ) {
|
||||
// Range
|
||||
if ( $ord2 > $ord0 ) {
|
||||
// Empty range
|
||||
} elseif ( $ord0 >= 0x80 ) {
|
||||
// Unicode range
|
||||
$allowUnicode = true;
|
||||
if ( $ord2 < 0x80 ) {
|
||||
// Keep the non-unicode section of the range
|
||||
$out .= "$r2-\\x7F";
|
||||
}
|
||||
} else {
|
||||
// Normal range
|
||||
$out .= "$r2-$r0";
|
||||
}
|
||||
// Reset state to the initial value
|
||||
$x0 = $x1 = $d0 = $d1 = $r0 = $r1 = '';
|
||||
} elseif ( $ord2 < 0x80 ) {
|
||||
// ASCII character
|
||||
$out .= $r2;
|
||||
}
|
||||
}
|
||||
if ( $ord1 < 0x80 ) {
|
||||
$out .= $r1;
|
||||
}
|
||||
if ( $ord0 < 0x80 ) {
|
||||
$out .= $r0;
|
||||
}
|
||||
if ( $allowUnicode ) {
|
||||
$out .= '\u0080-\uFFFF';
|
||||
}
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a string representation of a title suitable for
|
||||
* including in a search index
|
||||
|
|
|
|||
|
|
@ -95,6 +95,7 @@ class ResourceLoaderStartUpModule extends ResourceLoaderModule {
|
|||
'wgCookiePrefix' => $wgCookiePrefix,
|
||||
'wgResourceLoaderMaxQueryLength' => $wgResourceLoaderMaxQueryLength,
|
||||
'wgCaseSensitiveNamespaces' => $caseSensitiveNamespaces,
|
||||
'wgLegalTitleChars' => Title::convertByteClassToUnicodeClass( Title::legalChars() ),
|
||||
);
|
||||
|
||||
wfRunHooks( 'ResourceLoaderGetConfigVars', array( &$vars ) );
|
||||
|
|
|
|||
|
|
@ -32,6 +32,74 @@ class TitleTest extends MediaWikiTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public static function provideConvertByteClassToUnicodeClass() {
|
||||
return array(
|
||||
array(
|
||||
' %!"$&\'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+',
|
||||
' %!"$&\'()*,\\-./0-9:;=?@A-Z\\\\\\^_`a-z~+\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'QWERTYf-\\xFF+',
|
||||
'QWERTYf-\\x7F+\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'QWERTY\\x66-\\xFD+',
|
||||
'QWERTYf-\\x7F+\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'QWERTYf-y+',
|
||||
'QWERTYf-y+',
|
||||
),
|
||||
array(
|
||||
'QWERTYf-\\x80+',
|
||||
'QWERTYf-\\x7F+\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'QWERTY\\x66-\\x80+\\x23',
|
||||
'QWERTYf-\\x7F+#\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'QWERTY\\x66-\\x80+\\xD3',
|
||||
'QWERTYf-\\x7F+\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'\\\\\\x99',
|
||||
'\\\\\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'-\\x99',
|
||||
'\\-\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'QWERTY\\-\\x99',
|
||||
'QWERTY\\-\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'\\\\x99',
|
||||
'\\\\x99',
|
||||
),
|
||||
array(
|
||||
'A-\\x9F',
|
||||
'A-\\x7F\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'\\x66-\\x77QWERTY\\x88-\\x91FXZ',
|
||||
'f-wQWERTYFXZ\\u0080-\\uFFFF',
|
||||
),
|
||||
array(
|
||||
'\\x66-\\x99QWERTY\\xAA-\\xEEFXZ',
|
||||
'f-\\x7FQWERTYFXZ\\u0080-\\uFFFF',
|
||||
),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideConvertByteClassToUnicodeClass
|
||||
*/
|
||||
function testConvertByteClassToUnicodeClass( $byteClass, $unicodeClass ) {
|
||||
$this->assertEquals( $unicodeClass, Title::convertByteClassToUnicodeClass( $byteClass ) );
|
||||
}
|
||||
|
||||
/**
|
||||
* @dataProvider provideBug31100
|
||||
*/
|
||||
|
|
|
|||
Loading…
Reference in a new issue