Use PHP \u{xxxx} syntax

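Write Unicode characters in PHP string literals as \u{xxxx} code point escapes and let PHP produce the UTF-8 bytes, instead of spelling the bytes out by hand with \x escapes.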

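Also use the faster str_replace instead of preg_replace for the fixed-string apostrophe replacement in LanguageBe_tarask, which does not need a regular expression.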

Change-Id: I4e99de694a607e2b5df52c6efcd3d863bb42f76e
This commit is contained in:
Fomafix 2021-08-27 20:37:54 +00:00
parent a7786688dd
commit 3a322ef9b0
3 changed files with 10 additions and 10 deletions

View file

@ -47,7 +47,7 @@ class LanguageBe_tarask extends Language {
# need to fold cases and convert to hex
# Replacing apostrophe sign U+2019 with U+0027
$s = preg_replace( '/\xe2\x80\x99/', '\'', $string );
$s = str_replace( "\u{2019}", '\'', $string );
$s = parent::normalizeForSearch( $s );

View file

@ -104,7 +104,7 @@ class FormatJsonTest extends MediaWikiUnitTestCase {
/**
 * Encodes the single character U+20000 with FormatJson::encode(), lowercases
 * the result, and asserts that it is not equal to the literal text
 * '\ud840\udc00'.
 *
 * NOTE(review): presumably a regression check for PHP bug 46944, going by the
 * method name — confirm against the bug report.
 */
public function testEncodePhpBug46944() {
$this->assertNotEquals(
// Single-quoted, so this is the literal escape-sequence text, not a decoded character.
'\ud840\udc00',
// NOTE(review): the next two lines look like the before/after pair of a diff hunk.
// Both literals are the same four UTF-8 bytes (F0 A0 80 80) for U+20000 — confirm
// that only one of them belongs in the real file.
strtolower( FormatJson::encode( "\xf0\xa0\x80\x80" ) ),
strtolower( FormatJson::encode( "\u{20000}" ) ),
'Test encoding an broken json_encode character (U+20000)'
);
}
@ -286,12 +286,12 @@ class FormatJsonTest extends MediaWikiUnitTestCase {
'\\u00e9' => '\\\u00e9', // security check for Unicode unescaping
// Line terminators
"\xe2\x80\xa8" => '\u2028',
"\xe2\x80\xa9" => '\u2029',
"\u{2028}" => '\u2028',
"\u{2029}" => '\u2029',
],
'unicode' => [
"\xc3\xa9" => '\u00e9',
"\xf0\x9d\x92\x9e" => '\ud835\udc9e', // U+1D49E, outside the BMP
"\u{00E9}" => '\u00e9',
"\u{1D49E}" => '\ud835\udc9e', // U+1D49E, outside the BMP
],
'xmlmeta' => [
'<' => '\u003C', // JSON_HEX_TAG uses uppercase hex digits

View file

@ -16,19 +16,19 @@ class SanitizerUnitTest extends MediaWikiUnitTestCase {
public function provideDecodeCharReferences() {
return [
'decode named entities' => [
"\xc3\xa9cole",
"\u{00E9}cole",
'&eacute;cole',
],
'decode numeric entities' => [
"\xc4\x88io bonas dans l'\xc3\xa9cole!",
"\u{0108}io bonas dans l'\u{00E9}cole!",
"&#x108;io bonas dans l'&#233;cole!",
],
'decode mixed numeric/named entities' => [
"\xc4\x88io bonas dans l'\xc3\xa9cole!",
"\u{0108}io bonas dans l'\u{00E9}cole!",
"&#x108;io bonas dans l'&eacute;cole!",
],
'decode mixed complex entities' => [
"\xc4\x88io bonas dans l'\xc3\xa9cole! (mais pas &#x108;io dans l'&eacute;cole)",
"\u{0108}io bonas dans l'\u{00E9}cole! (mais pas &#x108;io dans l'&eacute;cole)",
"&#x108;io bonas dans l'&eacute;cole! (mais pas &amp;#x108;io dans l'&#38;eacute;cole)",
],
'Invalid ampersand' => [