From 82da9cf14be08e9458f58fa96be51966a2fe7cb1 Mon Sep 17 00:00:00 2001
From: "C. Scott Ananian"
Date: Mon, 27 Nov 2023 21:27:17 -0500
Subject: [PATCH] Use Remex for DeduplicateStyles transform
The previous implementation was using an ad-hoc regular expression which
was matching inside the data-mw attribute of Parsoid output, eg:
After substitution, the element inserted contained " instead of
" and so broke out of the attribute.
Instead use a proper HTML tokenizer (via wikimedia/remex-html) so that
we don't allow bogus matches inside attribute values.
To fix up tests:
* Don't deduplicate styles when parsing UX messages (also helps performance)
* Don't deduplicate styles in ContentHandler integration tests
* Don't deduplicate styles by default in parser tests
(unless explicit option is set)
Depends-On: Id9801a9ff540bd818a32bc6fa35c48a9cff12d3a
Depends-On: I5111f1fdb7140948b82113adbc774af286174ab3
Followup-To: Ic0b17e361bf6eb0e71c498abc17f5f67f82318f8
Change-Id: I32d3d1772243c3819e1e1486351d16871b6e21c4
---
.../Stages/DeduplicateStyles.php | 35 +++++++++++--------
includes/language/Message.php | 6 ++++
tests/parser/ParserTestRunner.php | 1 +
tests/parser/parserTests.txt | 1 +
.../Stages/DeduplicateStylesTest.php | 8 ++---
.../JsonContentHandlerIntegrationTest.php | 4 ++-
.../TextContentHandlerIntegrationTest.php | 4 ++-
.../phpunit/includes/page/WikiPageDbTest.php | 4 ++-
.../includes/parser/ParserOutputTest.php | 8 ++---
9 files changed, 45 insertions(+), 26 deletions(-)
diff --git a/includes/OutputTransform/Stages/DeduplicateStyles.php b/includes/OutputTransform/Stages/DeduplicateStyles.php
index 22a0f9ce428..b46208d24d1 100644
--- a/includes/OutputTransform/Stages/DeduplicateStyles.php
+++ b/includes/OutputTransform/Stages/DeduplicateStyles.php
@@ -2,11 +2,12 @@
namespace Mediawiki\OutputTransform\Stages;
-use MediaWiki\Html\Html;
+use MediaWiki\Html\HtmlHelper;
use Mediawiki\OutputTransform\ContentTextTransformStage;
use MediaWiki\Parser\ParserOutput;
-use MediaWiki\Parser\Sanitizer;
use ParserOptions;
+use Wikimedia\RemexHtml\Serializer\SerializerNode;
+use Wikimedia\RemexHtml\Tokenizer\PlainAttributes;
/**
* Generates a list of unique style links
@@ -20,28 +21,32 @@ class DeduplicateStyles extends ContentTextTransformStage {
protected function transformText( string $text, ParserOutput $po, ?ParserOptions $popts, array &$options ): string {
$seen = [];
- return preg_replace_callback( '##s',
- static function ( $m ) use ( &$seen ) {
- $attr = Sanitizer::decodeTagAttributes( $m[1] );
- if ( !isset( $attr['data-mw-deduplicate'] ) ) {
- return $m[0];
- }
-
- $key = $attr['data-mw-deduplicate'];
+ return HtmlHelper::modifyElements(
+ $text,
+ static function ( SerializerNode $node ): bool {
+ return $node->name === 'style' &&
+ ( $node->attrs['data-mw-deduplicate'] ?? '' ) !== '';
+ },
+ static function ( SerializerNode $node ) use ( &$seen ): SerializerNode {
+ $key = $node->attrs['data-mw-deduplicate'];
if ( !isset( $seen[$key] ) ) {
$seen[$key] = true;
-
- return $m[0];
+ return $node;
}
-
// We were going to use an empty
-
+
-
-
+
+
-
+
EOF;
diff --git a/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php b/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php
index 9b91214e172..7955ced731f 100644
--- a/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php
+++ b/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php
@@ -75,7 +75,9 @@ class JsonContentHandlerIntegrationTest extends MediaWikiLangTestCase {
true
);
$this->assertInstanceOf( ParserOutput::class, $parserOutput );
- $this->assertEquals( $expected, $parserOutput->getText() );
+ $this->assertEquals( $expected, $parserOutput->getText( [
+ 'deduplicateStyles' => false,
+ ] ) );
}
/**
diff --git a/tests/phpunit/includes/content/TextContentHandlerIntegrationTest.php b/tests/phpunit/includes/content/TextContentHandlerIntegrationTest.php
index 8d13f4b3a6f..230ff215d11 100644
--- a/tests/phpunit/includes/content/TextContentHandlerIntegrationTest.php
+++ b/tests/phpunit/includes/content/TextContentHandlerIntegrationTest.php
@@ -38,7 +38,9 @@ class TextContentHandlerIntegrationTest extends MediaWikiLangTestCase {
$contentRenderer = $this->getServiceContainer()->getContentRenderer();
$po = $contentRenderer->getParserOutput( $content, $title, null, $parserOptions );
- $html = $po->getText();
+ $html = $po->getText( [
+ 'deduplicateStyles' => false,
+ ] );
$html = preg_replace( '##sm', '', $html ); // strip comments
if ( $expectedHtml !== null ) {
diff --git a/tests/phpunit/includes/page/WikiPageDbTest.php b/tests/phpunit/includes/page/WikiPageDbTest.php
index 591d0a3fff9..a1f760f2827 100644
--- a/tests/phpunit/includes/page/WikiPageDbTest.php
+++ b/tests/phpunit/includes/page/WikiPageDbTest.php
@@ -937,7 +937,9 @@ class WikiPageDbTest extends MediaWikiLangTestCase {
$opt = $page->makeParserOptions( 'canonical' );
$po = $page->getParserOutput( $opt );
- $text = $po->getText();
+ $text = $po->getText( [
+ 'deduplicateStyles' => false,
+ ] );
$text = trim( preg_replace( '//sm', '', $text ) ); # strip injected comments
$text = preg_replace( '!\s*(
|)!m', '\1', $text ); # don't let tidy confuse us
diff --git a/tests/phpunit/includes/parser/ParserOutputTest.php b/tests/phpunit/includes/parser/ParserOutputTest.php
index a4ef882a99e..6469af5f5bd 100644
--- a/tests/phpunit/includes/parser/ParserOutputTest.php
+++ b/tests/phpunit/includes/parser/ParserOutputTest.php
@@ -449,12 +449,12 @@ EOF
[], $dedupText, <<This is a test document.
-
+
-
-
+
+
-
+
EOF