From 82da9cf14be08e9458f58fa96be51966a2fe7cb1 Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Mon, 27 Nov 2023 21:27:17 -0500 Subject: [PATCH] Use Remex for DeduplicateStyles transform The previous implementation was using an ad-hoc regular expression which was matching inside the data-mw attribute of Parsoid output, eg: After substitution, the element inserted contained " instead of " and so broke out of the attribute. Instead use a proper HTML tokenizer (via wikimedia/remex-html) so that we don't allow bogus matches inside attribute values. To fix up tests: * Don't deduplicate styles when parsing UX messages (also helps performance) * Don't deduplicate styles in ContentHandler integration tests * Don't deduplicate styles by default in parser tests (unless explicit option is set) Depends-On: Id9801a9ff540bd818a32bc6fa35c48a9cff12d3a Depends-On: I5111f1fdb7140948b82113adbc774af286174ab3 Followup-To: Ic0b17e361bf6eb0e71c498abc17f5f67f82318f8 Change-Id: I32d3d1772243c3819e1e1486351d16871b6e21c4 --- .../Stages/DeduplicateStyles.php | 35 +++++++++++-------- includes/language/Message.php | 6 ++++ tests/parser/ParserTestRunner.php | 1 + tests/parser/parserTests.txt | 1 + .../Stages/DeduplicateStylesTest.php | 8 ++--- .../JsonContentHandlerIntegrationTest.php | 4 ++- .../TextContentHandlerIntegrationTest.php | 4 ++- .../phpunit/includes/page/WikiPageDbTest.php | 4 ++- .../includes/parser/ParserOutputTest.php | 8 ++--- 9 files changed, 45 insertions(+), 26 deletions(-) diff --git a/includes/OutputTransform/Stages/DeduplicateStyles.php b/includes/OutputTransform/Stages/DeduplicateStyles.php index 22a0f9ce428..b46208d24d1 100644 --- a/includes/OutputTransform/Stages/DeduplicateStyles.php +++ b/includes/OutputTransform/Stages/DeduplicateStyles.php @@ -2,11 +2,12 @@ namespace Mediawiki\OutputTransform\Stages; -use MediaWiki\Html\Html; +use MediaWiki\Html\HtmlHelper; use Mediawiki\OutputTransform\ContentTextTransformStage; use MediaWiki\Parser\ParserOutput; -use MediaWiki\Parser\Sanitizer; use ParserOptions; +use Wikimedia\RemexHtml\Serializer\SerializerNode; +use Wikimedia\RemexHtml\Tokenizer\PlainAttributes; /** * Generates a list of unique style links @@ -20,28 +21,32 @@ class DeduplicateStyles extends ContentTextTransformStage { protected function transformText( string $text, ParserOutput $po, ?ParserOptions $popts, array &$options ): string { $seen = []; - return preg_replace_callback( '#]*data-mw-deduplicate\s*=[\'"][^>]*)>.*?#s', - static function ( $m ) use ( &$seen ) { - $attr = Sanitizer::decodeTagAttributes( $m[1] ); - if ( !isset( $attr['data-mw-deduplicate'] ) ) { - return $m[0]; - } - - $key = $attr['data-mw-deduplicate']; + return HtmlHelper::modifyElements( + $text, + static function ( SerializerNode $node ): bool { + return $node->name === 'style' && + ( $node->attrs['data-mw-deduplicate'] ?? '' ) !== ''; + }, + static function ( SerializerNode $node ) use ( &$seen ): SerializerNode { + $key = $node->attrs['data-mw-deduplicate']; if ( !isset( $seen[$key] ) ) { $seen[$key] = true; - - return $m[0]; + return $node; } - // We were going to use an empty - + - - + + - + EOF; diff --git a/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php b/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php index 9b91214e172..7955ced731f 100644 --- a/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php +++ b/tests/phpunit/includes/content/JsonContentHandlerIntegrationTest.php @@ -75,7 +75,9 @@ class JsonContentHandlerIntegrationTest extends MediaWikiLangTestCase { true ); $this->assertInstanceOf( ParserOutput::class, $parserOutput ); - $this->assertEquals( $expected, $parserOutput->getText() ); + $this->assertEquals( $expected, $parserOutput->getText( [ + 'deduplicateStyles' => false, + ] ) ); } /** diff --git a/tests/phpunit/includes/content/TextContentHandlerIntegrationTest.php b/tests/phpunit/includes/content/TextContentHandlerIntegrationTest.php index 8d13f4b3a6f..230ff215d11 100644 --- a/tests/phpunit/includes/content/TextContentHandlerIntegrationTest.php +++ b/tests/phpunit/includes/content/TextContentHandlerIntegrationTest.php @@ -38,7 +38,9 @@ class TextContentHandlerIntegrationTest extends MediaWikiLangTestCase { $contentRenderer = $this->getServiceContainer()->getContentRenderer(); $po = $contentRenderer->getParserOutput( $content, $title, null, $parserOptions ); - $html = $po->getText(); + $html = $po->getText( [ + 'deduplicateStyles' => false, + ] ); $html = preg_replace( '##sm', '', $html ); // strip comments if ( $expectedHtml !== null ) { diff --git a/tests/phpunit/includes/page/WikiPageDbTest.php b/tests/phpunit/includes/page/WikiPageDbTest.php index 591d0a3fff9..a1f760f2827 100644 --- a/tests/phpunit/includes/page/WikiPageDbTest.php +++ b/tests/phpunit/includes/page/WikiPageDbTest.php @@ -937,7 +937,9 @@ class WikiPageDbTest extends MediaWikiLangTestCase { $opt = $page->makeParserOptions( 'canonical' ); $po = $page->getParserOutput( $opt ); - $text = $po->getText(); + $text = $po->getText( [ + 'deduplicateStyles' => false, + ] ); $text = trim( preg_replace( '//sm', '', $text ) ); # strip injected comments $text = preg_replace( '!\s*(

|)!m', '\1', $text ); # don't let tidy confuse us diff --git a/tests/phpunit/includes/parser/ParserOutputTest.php b/tests/phpunit/includes/parser/ParserOutputTest.php index a4ef882a99e..6469af5f5bd 100644 --- a/tests/phpunit/includes/parser/ParserOutputTest.php +++ b/tests/phpunit/includes/parser/ParserOutputTest.php @@ -449,12 +449,12 @@ EOF [], $dedupText, <<This is a test document.

- + - - + + - + EOF