From 3a322ef9b0d7d6b546927f3190a500f25cf9bbed Mon Sep 17 00:00:00 2001 From: Fomafix Date: Fri, 27 Aug 2021 20:37:54 +0000 Subject: [PATCH] Use PHP \u{xxxx} syntax Let PHP do the UTF-8 encoding of Unicode characters in PHP strings. Also use faster str_replace instead of preg_replace. Change-Id: I4e99de694a607e2b5df52c6efcd3d863bb42f76e --- includes/languages/LanguageBe_tarask.php | 2 +- tests/phpunit/unit/includes/json/FormatJsonTest.php | 10 +++++----- .../phpunit/unit/includes/parser/SanitizerUnitTest.php | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/includes/languages/LanguageBe_tarask.php b/includes/languages/LanguageBe_tarask.php index b7087f1f297..921009258cd 100644 --- a/includes/languages/LanguageBe_tarask.php +++ b/includes/languages/LanguageBe_tarask.php @@ -47,7 +47,7 @@ class LanguageBe_tarask extends Language { # need to fold cases and convert to hex # Replacing apostrophe sign U+2019 with U+0027 - $s = preg_replace( '/\xe2\x80\x99/', '\'', $string ); + $s = str_replace( "\u{2019}", '\'', $string ); $s = parent::normalizeForSearch( $s ); diff --git a/tests/phpunit/unit/includes/json/FormatJsonTest.php b/tests/phpunit/unit/includes/json/FormatJsonTest.php index 39de4825342..0ba9480f4e1 100644 --- a/tests/phpunit/unit/includes/json/FormatJsonTest.php +++ b/tests/phpunit/unit/includes/json/FormatJsonTest.php @@ -104,7 +104,7 @@ class FormatJsonTest extends MediaWikiUnitTestCase { public function testEncodePhpBug46944() { $this->assertNotEquals( '\ud840\udc00', - strtolower( FormatJson::encode( "\xf0\xa0\x80\x80" ) ), + strtolower( FormatJson::encode( "\u{20000}" ) ), 'Test encoding an broken json_encode character (U+20000)' ); } @@ -286,12 +286,12 @@ class FormatJsonTest extends MediaWikiUnitTestCase { '\\u00e9' => '\\\u00e9', // security check for Unicode unescaping // Line terminators - "\xe2\x80\xa8" => '\u2028', - "\xe2\x80\xa9" => '\u2029', + "\u{2028}" => '\u2028', + "\u{2029}" => '\u2029', ], 'unicode' => [ - "\xc3\xa9" => '\u00e9', - "\xf0\x9d\x92\x9e" => '\ud835\udc9e', // U+1D49E, outside the BMP + "\u{00E9}" => '\u00e9', + "\u{1D49E}" => '\ud835\udc9e', // U+1D49E, outside the BMP ], 'xmlmeta' => [ '<' => '\u003C', // JSON_HEX_TAG uses uppercase hex digits diff --git a/tests/phpunit/unit/includes/parser/SanitizerUnitTest.php b/tests/phpunit/unit/includes/parser/SanitizerUnitTest.php index 77771565295..82c73f76343 100644 --- a/tests/phpunit/unit/includes/parser/SanitizerUnitTest.php +++ b/tests/phpunit/unit/includes/parser/SanitizerUnitTest.php @@ -16,19 +16,19 @@ class SanitizerUnitTest extends MediaWikiUnitTestCase { public function provideDecodeCharReferences() { return [ 'decode named entities' => [ - "\xc3\xa9cole", + "\u{00E9}cole", 'école', ], 'decode numeric entities' => [ - "\xc4\x88io bonas dans l'\xc3\xa9cole!", + "\u{0108}io bonas dans l'\u{00E9}cole!", "Ĉio bonas dans l'école!", ], 'decode mixed numeric/named entities' => [ - "\xc4\x88io bonas dans l'\xc3\xa9cole!", + "\u{0108}io bonas dans l'\u{00E9}cole!", "Ĉio bonas dans l'école!", ], 'decode mixed complex entities' => [ - "\xc4\x88io bonas dans l'\xc3\xa9cole! (mais pas Ĉio dans l'école)", + "\u{0108}io bonas dans l'\u{00E9}cole! (mais pas Ĉio dans l'école)", "Ĉio bonas dans l'école! (mais pas &#x108;io dans l'&eacute;cole)", ], 'Invalid ampersand' => [