From a6739e066edc9290ee711d59348b051f1fc2408d Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Fri, 6 Dec 2024 15:53:20 -0500 Subject: [PATCH] Use Remex/HtmlHelper to implement Parser::replaceTableOfContents This is more robust and secure than the regular expression previously used to extract the tag. We also improve HtmlHelper slightly be adding the ability to replace an element with an 'outerHTML' string. Because our output is being run through Remex, there is a slightly larger degree of HTML normalization in the output than previously, which is visible in some small tweaks to test case outputs. Bug: T381617 Depends-On: I2712e0fa9272106e8cd686980f847ee7f6385b6f Change-Id: I4cb2f29cf890af90f295624c586d9e1eb1939b95 (cherry picked from commit 7ebd8034b54495f28f4c5583d4fa55071634b593) --- includes/Html/HtmlHelper.php | 3 +- includes/Html/HtmlHelperTrait.php | 6 ++- includes/parser/Parser.php | 41 +++++++++---------- tests/parser/headings.txt | 12 +++--- tests/parser/legacyHeadings.txt | 12 +++--- tests/parser/parserTests.txt | 2 +- .../CssContentHandlerIntegrationTest.php | 6 +-- ...avaScriptContentHandlerIntegrationTest.php | 6 +-- .../JsonContentHandlerIntegrationTest.php | 2 +- .../WikitextContentHandlerIntegrationTest.php | 2 +- .../phpunit/includes/page/WikiPageDbTest.php | 4 +- 11 files changed, 49 insertions(+), 47 deletions(-) diff --git a/includes/Html/HtmlHelper.php b/includes/Html/HtmlHelper.php index 66a8399dbe3..0ff50bf75c0 100644 --- a/includes/Html/HtmlHelper.php +++ b/includes/Html/HtmlHelper.php @@ -23,7 +23,8 @@ class HtmlHelper { * RemexHtml\Serializer\SerializerNode argument, and returns true if it should be modified. * @param callable $modifyCallback A callback which takes a single * RemexHtml\Serializer\SerializerNode argument and actually performs the modification on it. - * It must return the new node (which can be the original node object). + * It must return the new node (which can be the original node object) + * or a string, which is treated as the outerHTML of a replacement. * @param bool $html5format Defaults to true, which uses standard HTML5 * serialization for the parsed HTML. If set to false, uses a * serialization which is more compatible with the output of the diff --git a/includes/Html/HtmlHelperTrait.php b/includes/Html/HtmlHelperTrait.php index 270d84df9e1..1a938d8825e 100644 --- a/includes/Html/HtmlHelperTrait.php +++ b/includes/Html/HtmlHelperTrait.php @@ -34,7 +34,11 @@ trait HtmlHelperTrait { $node = clone $node; $node->attrs = clone $node->attrs; $newNode = ( $this->modifyCallback )( $node ); - Assert::parameterType( SerializerNode::class, $newNode, 'return value' ); + Assert::parameterType( [ SerializerNode::class, 'string' ], $newNode, 'return value' ); + if ( is_string( $newNode ) ) { + // Replace this element with an "outerHTML" string. + return $newNode; + } return parent::element( $parent, $newNode, $contents ); } else { return parent::element( $parent, $node, $contents ); diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index c9a3329e509..11a8dd0408e 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -42,6 +42,7 @@ use MediaWiki\Debug\DeprecationHelper; use MediaWiki\HookContainer\HookContainer; use MediaWiki\HookContainer\HookRunner; use MediaWiki\Html\Html; +use MediaWiki\Html\HtmlHelper; use MediaWiki\Http\HttpRequestFactory; use MediaWiki\Language\ILanguageConverter; use MediaWiki\Language\Language; @@ -97,6 +98,7 @@ use Wikimedia\Parsoid\DOM\Element; use Wikimedia\Parsoid\DOM\Node; use Wikimedia\Parsoid\Utils\DOMCompat; use Wikimedia\Parsoid\Utils\DOMUtils; +use Wikimedia\RemexHtml\Serializer\SerializerNode; use Wikimedia\ScopedCallback; /** @@ -225,15 +227,6 @@ class Parser { */ public const TOC_PLACEHOLDER = ''; - /** - * Permissive regexp matching TOC_PLACEHOLDER. This allows for some - * minor modifications to the placeholder to be made by extensions - * without breaking the TOC (T317857); note also that Parsoid's version - * of the placeholder might include additional attributes. - * @var string - */ - private const TOC_PLACEHOLDER_REGEX = '/]*\\bproperty\\s*=\\s*"mw:PageProp\\/toc"[^>]*>/'; - # Persistent: /** @var array */ private array $mTagHooks = []; @@ -4878,20 +4871,24 @@ class Parser { */ public static function replaceTableOfContentsMarker( $text, $toc ) { $replaced = false; - // remove the additional metas. while not strictly necessary, this also ensures idempotence if we run - // the pass more than once on a given content and TOC markers are not inserted by $toc. At the same time, - // if $toc inserts TOC markers (which, as of 2024-05, it shouldn't be able to), these are preserved by the - // fact that we run a single pass with a callback (rather than doing a first replacement with the $toc and - // a replacement of leftover markers as a second pass). - $callback = static function ( array $matches ) use( &$replaced, $toc ): string { - if ( !$replaced ) { + return HtmlHelper::modifyElements( + $text, + static function ( SerializerNode $node ): bool { + $prop = $node->attrs['property'] ?? ''; + return $node->name === 'meta' && $prop === 'mw:PageProp/toc'; + }, + static function ( SerializerNode $node ) use ( &$replaced, $toc ) { + if ( $replaced ) { + // Remove the additional metas. While not strictly + // necessary, this also ensures idempotence if we + // run the pass more than once on a given content. + return ''; + } $replaced = true; - return $toc; - } - return ''; - }; - - return preg_replace_callback( self::TOC_PLACEHOLDER_REGEX, $callback, $text ); + return $toc; // outerHTML replacement. + }, + false /* use legacy-compatible serialization */ + ); } /** diff --git a/tests/parser/headings.txt b/tests/parser/headings.txt index 271747addb0..4ddb94d2eac 100644 --- a/tests/parser/headings.txt +++ b/tests/parser/headings.txt @@ -1860,7 +1860,7 @@ section 5

text & text

[edit]

section 3

-

text ' text

[edit]
+

text ' text

[edit]

section 4

text " text

[edit]
@@ -2444,7 +2444,7 @@ wgParserEnableLegacyHeadingDOM=false !! wikitext ==   A   B   == !! html/php -

  A   B  

[edit]
+

  A   B  

[edit]
!! html/parsoid

  A   B  

!! end @@ -2690,7 +2690,7 @@ wgParserEnableLegacyHeadingDOM=false

Тест

[edit]

Тест

[edit]

тест

[edit]
-

Hey < # " > % : '

[edit]
+

Hey < # " > % : '

[edit]

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : '

💩

#啤酒 #啤酒 @@ -2757,7 +2757,7 @@ wgParserEnableLegacyHeadingDOM=false

Тест

[edit]

Тест

[edit]

тест

[edit]
-

Hey < # " > % : '

[edit]
+

Hey < # " > % : '

[edit]

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : '

.F0.9F.92.A9

#啤酒 #啤酒 @@ -2806,7 +2806,7 @@ wgParserEnableLegacyHeadingDOM=false

Тест

[edit]

Тест

[edit]

тест

[edit]
-

Hey < # " > % : '

[edit]
+

Hey < # " > % : '

[edit]

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : '

💩

#啤酒 #啤酒 @@ -2823,7 +2823,7 @@ wgParserEnableLegacyHeadingDOM=false [[#Foo bar]] !! html/php -

Foo bar

[edit]
+

Foo bar

[edit]

#Foo bar

!! html/parsoid diff --git a/tests/parser/legacyHeadings.txt b/tests/parser/legacyHeadings.txt index a4023e6dce1..ae3408d5047 100644 --- a/tests/parser/legacyHeadings.txt +++ b/tests/parser/legacyHeadings.txt @@ -1857,7 +1857,7 @@ section 5

text & text[edit]

section 3

-

text ' text[edit]

+

text ' text[edit]

section 4

text " text[edit]

@@ -2441,7 +2441,7 @@ wgParserEnableLegacyHeadingDOM=true !! wikitext ==   A   B   == !! html/php -

  A   B  [edit]

+

  A   B  [edit]

!! html/parsoid

  A   B  

!! end @@ -2686,7 +2686,7 @@ wgParserEnableLegacyHeadingDOM=true

Тест[edit]

Тест[edit]

тест[edit]

-

Hey < # " > % : '[edit]

+

Hey < # " > % : '[edit]

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : '

💩

#啤酒 #啤酒 @@ -2753,7 +2753,7 @@ wgParserEnableLegacyHeadingDOM=true

Тест[edit]

Тест[edit]

тест[edit]

-

Hey < # " > % : '[edit]

+

Hey < # " > % : '[edit]

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : '

.F0.9F.92.A9

#啤酒 #啤酒 @@ -2802,7 +2802,7 @@ wgParserEnableLegacyHeadingDOM=true

Тест[edit]

Тест[edit]

тест[edit]

-

Hey < # " > % : '[edit]

+

Hey < # " > % : '[edit]

#Foo bar #foo Bar #Тест #тест #Hey < # " > % : '

💩

#啤酒 #啤酒 @@ -2819,7 +2819,7 @@ wgParserEnableLegacyHeadingDOM=true [[#Foo bar]] !! html/php -

Foo bar[edit]

+

Foo bar[edit]

#Foo bar

!! html/parsoid diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index de840c1428c..256b4f4353d 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -11090,7 +11090,7 @@ parsoid={ "modes": ["wt2html","html2html"], "normalizePhp": true }
This is an okay multi-codepoint HTML5 entity
∾̳
This are valid wikitext-specific entities
-
‏ ‏
+
‏ ‏
Highest valid characters
􏿽 􏿾 􏿿
!! end diff --git a/tests/phpunit/includes/content/CssContentHandlerIntegrationTest.php b/tests/phpunit/includes/content/CssContentHandlerIntegrationTest.php index 6bb2fea2776..74e654dde08 100644 --- a/tests/phpunit/includes/content/CssContentHandlerIntegrationTest.php +++ b/tests/phpunit/includes/content/CssContentHandlerIntegrationTest.php @@ -12,7 +12,7 @@ class CssContentHandlerIntegrationTest extends TextContentHandlerIntegrationTest 'title' => 'MediaWiki:Test.css', 'model' => null, 'text' => "hello x\n", - 'expectedHtml' => "
\nhello <world>x\n\n
", + 'expectedHtml' => "
hello <world>x\n\n
", 'expectedFields' => [ 'Links' => [ ], @@ -24,7 +24,7 @@ class CssContentHandlerIntegrationTest extends TextContentHandlerIntegrationTest 'title' => 'MediaWiki:Test.css', 'model' => null, 'text' => "/* hello [[world]] */\n", - 'expectedHtml' => "
\n/* hello [[world]] */\n\n
", + 'expectedHtml' => "
/* hello [[world]] */\n\n
", 'expectedFields' => [ 'Links' => [ [ 'World' => 0, ], @@ -37,7 +37,7 @@ class CssContentHandlerIntegrationTest extends TextContentHandlerIntegrationTest 'title' => 'MediaWiki:Test.css', 'model' => null, 'text' => "==One==\n

Two

", - 'expectedHtml' => "
\n==One==\n<h2>Two</h2>\n
", + 'expectedHtml' => "
==One==\n<h2>Two</h2>\n
", 'expectedFields' => [ 'Links' => [ ], diff --git a/tests/phpunit/includes/content/JavaScriptContentHandlerIntegrationTest.php b/tests/phpunit/includes/content/JavaScriptContentHandlerIntegrationTest.php index 52fa08cbe06..2b04512f14c 100644 --- a/tests/phpunit/includes/content/JavaScriptContentHandlerIntegrationTest.php +++ b/tests/phpunit/includes/content/JavaScriptContentHandlerIntegrationTest.php @@ -12,7 +12,7 @@ class JavaScriptContentHandlerIntegrationTest extends TextContentHandlerIntegrat 'title' => 'MediaWiki:Test.js', 'model' => null, 'text' => "hello \n", - 'expectedHtml' => "
\nhello <world>\n\n
", + 'expectedHtml' => "
hello <world>\n\n
", 'expectedFields' => [ 'Links' => [ ], @@ -24,7 +24,7 @@ class JavaScriptContentHandlerIntegrationTest extends TextContentHandlerIntegrat 'title' => 'MediaWiki:Test.js', 'model' => null, 'text' => "hello(); // [[world]]\n", - 'expectedHtml' => "
\nhello(); // [[world]]\n\n
", + 'expectedHtml' => "
hello(); // [[world]]\n\n
", 'expectedFields' => [ 'Links' => [ [ 'World' => 0, ], @@ -37,7 +37,7 @@ class JavaScriptContentHandlerIntegrationTest extends TextContentHandlerIntegrat 'title' => 'MediaWiki:Test.js', 'model' => null, 'text' => "==One==\n

Two

", - 'expectedHtml' => "
\n==One==\n<h2>Two</h2>\n
", + 'expectedHtml' => "
==One==\n<h2>Two</h2>\n
", 'expectedFields' => [ 'Links' => [ ], diff --git a/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php b/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php index 04810575a05..6fea47ca7e5 100644 --- a/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php +++ b/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php @@ -54,7 +54,7 @@ class JsonContentHandlerIntegrationTest extends MediaWikiLangTestCase { [ (object)[ '' ], '
0"' . - '<script>alert("evil!")</script>"' . + '<script>alert("evil!")</script>"' . '
', ], [ diff --git a/tests/phpunit/includes/content/WikitextContentHandlerIntegrationTest.php b/tests/phpunit/includes/content/WikitextContentHandlerIntegrationTest.php index 3bf9415ff49..3ebb7364880 100644 --- a/tests/phpunit/includes/content/WikitextContentHandlerIntegrationTest.php +++ b/tests/phpunit/includes/content/WikitextContentHandlerIntegrationTest.php @@ -99,7 +99,7 @@ class WikitextContentHandlerIntegrationTest extends TextContentHandlerIntegratio 'title' => 'WikitextContentTest_testGetParserOutput', 'model' => CONTENT_MODEL_WIKITEXT, 'text' => "#REDIRECT [[Main Page]]", - 'expectedHtml' => "

Redirect to:

", + 'expectedHtml' => "

Redirect to:

", 'expectedFields' => [ 'Links' => [ [ 'Main_Page' => 0 ], diff --git a/tests/phpunit/includes/page/WikiPageDbTest.php b/tests/phpunit/includes/page/WikiPageDbTest.php index b3e114c937e..40c841fd8bc 100644 --- a/tests/phpunit/includes/page/WikiPageDbTest.php +++ b/tests/phpunit/includes/page/WikiPageDbTest.php @@ -926,12 +926,12 @@ class WikiPageDbTest extends MediaWikiLangTestCase { [ CONTENT_MODEL_JAVASCRIPT, "var test='

not really a heading

';", - "
\nvar test='<h2>not really a heading</h2>';\n
", + "
var test='<h2>not really a heading</h2>';\n
", ], [ CONTENT_MODEL_CSS, "/* Not ''wikitext'' */", - "
\n/* Not ''wikitext'' */\n
", + "
/* Not ''wikitext'' */\n
", ], // @todo more...? ];