Use Remex for DeduplicateStyles transform

The previous implementation was using an ad-hoc regular expression which
was matching inside the data-mw attribute of Parsoid output, eg:

 <sup about="#mwt42" [...] typeof="mw:Extension/ref mw:Error" data-mw="{&quot;name&quot;:&quot;ref&quot;,&quot;attrs&quot;:{&quot;name&quot;:&quot;infobox_stats_ref_rail&quot;},&quot;body&quot;:{&quot;html&quot;:&quot;<style data-mw-deduplicate=\&quot;TemplateStyles:r1133582631\&quot; typeof=\&quot;...">

After substitution, the <link> element inserted contained " instead of
&quot; and so broke out of the attribute.

Instead use a proper HTML tokenizer (via wikimedia/remex-html) so that
we don't allow bogus matches inside attribute values.

To fix up tests:
* Don't deduplicate styles when parsing UX messages (also helps performance)
* Don't deduplicate styles in ContentHandler integration tests
* Don't deduplicate styles by default in parser tests
  (unless explicit option is set)

Depends-On: Id9801a9ff540bd818a32bc6fa35c48a9cff12d3a
Depends-On: I5111f1fdb7140948b82113adbc774af286174ab3
Followup-To: Ic0b17e361bf6eb0e71c498abc17f5f67f82318f8
Change-Id: I32d3d1772243c3819e1e1486351d16871b6e21c4
This commit is contained in:
C. Scott Ananian 2023-11-27 21:27:17 -05:00 committed by Bartosz Dziewoński
parent 5a95e4e702
commit 82da9cf14b
9 changed files with 45 additions and 26 deletions

View file

@ -2,11 +2,12 @@
namespace Mediawiki\OutputTransform\Stages;
use MediaWiki\Html\Html;
use MediaWiki\Html\HtmlHelper;
use Mediawiki\OutputTransform\ContentTextTransformStage;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Sanitizer;
use ParserOptions;
use Wikimedia\RemexHtml\Serializer\SerializerNode;
use Wikimedia\RemexHtml\Tokenizer\PlainAttributes;
/**
* Generates a list of unique style links
@ -20,28 +21,32 @@ class DeduplicateStyles extends ContentTextTransformStage {
protected function transformText( string $text, ParserOutput $po, ?ParserOptions $popts, array &$options ): string {
$seen = [];
return preg_replace_callback( '#<style\s+([^>]*data-mw-deduplicate\s*=[\'"][^>]*)>.*?</style>#s',
static function ( $m ) use ( &$seen ) {
$attr = Sanitizer::decodeTagAttributes( $m[1] );
if ( !isset( $attr['data-mw-deduplicate'] ) ) {
return $m[0];
}
$key = $attr['data-mw-deduplicate'];
return HtmlHelper::modifyElements(
$text,
static function ( SerializerNode $node ): bool {
return $node->name === 'style' &&
( $node->attrs['data-mw-deduplicate'] ?? '' ) !== '';
},
static function ( SerializerNode $node ) use ( &$seen ): SerializerNode {
$key = $node->attrs['data-mw-deduplicate'];
if ( !isset( $seen[$key] ) ) {
$seen[$key] = true;
return $m[0];
return $node;
}
// We were going to use an empty <style> here, but there
// was concern that would be too much overhead for browsers.
// So let's hope a <link> with a non-standard rel and href isn't
// going to be misinterpreted or mangled by any subsequent processing.
return Html::element( 'link', [
$node->name = 'link';
$node->attrs = new PlainAttributes( [
'rel' => 'mw-deduplicated-inline-style',
'href' => "mw-data:" . wfUrlencode( $key ),
] );
}, $text );
$node->children = [];
$node->void = true;
return $node;
},
$options['isParsoidContent'] ?? false
);
}
}

View file

@ -1447,6 +1447,12 @@ class Message implements MessageSpecifier, Serializable {
// they're inside they already are from the outer div.
'unwrap' => true,
'userLang' => $this->getLanguage(),
// Although we *could* have template styles in messages,
// they are rare enough that it's not worth the performance
// hit on all messages to run deduplication (and the
// deduplication would only extend within the message anyway,
// not to the article content)
'deduplicateStyles' => false,
] )
: $out;
}

View file

@ -1456,6 +1456,7 @@ class ParserTestRunner {
'allowTOC' => !isset( $opts['notoc'] ),
'unwrap' => !isset( $opts['wrap'] ),
'skin' => $this->getSkin( $opts['skin'] ?? 'fallback' ),
'deduplicateStyles' => isset( $opts['deduplicateStyles'] ),
] );
$out = preg_replace( '/\s+$/', '', $out );
}

View file

@ -46,6 +46,7 @@ version=2
# thumbsize=NNN set the default thumb size to NNNpx for this test
# wrap include the normal wrapper <div class="mw-parser-output"> (since 1.30)
# local format section links in edit comment text as local links
# deduplicateStyles apply style deduplication to the output
#
# Configuration globals:
#

View file

@ -33,12 +33,12 @@ class DeduplicateStylesTest extends OutputTransformStageTest {
$dedup = <<<EOF
<p>This is a test document.</p>
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
<style data-mw-deduplicate="duplicate2">.Duplicate2 {}</style>
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate2">
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate2" />
<style data-mw-not-deduplicate="duplicate1">.Duplicate1 {}</style>
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
<style data-mw-deduplicate="duplicate3">.Duplicate1 {}</style>
<style>.Duplicate1 {}</style>
EOF;

View file

@ -75,7 +75,9 @@ class JsonContentHandlerIntegrationTest extends MediaWikiLangTestCase {
true
);
$this->assertInstanceOf( ParserOutput::class, $parserOutput );
$this->assertEquals( $expected, $parserOutput->getText() );
$this->assertEquals( $expected, $parserOutput->getText( [
'deduplicateStyles' => false,
] ) );
}
/**

View file

@ -38,7 +38,9 @@ class TextContentHandlerIntegrationTest extends MediaWikiLangTestCase {
$contentRenderer = $this->getServiceContainer()->getContentRenderer();
$po = $contentRenderer->getParserOutput( $content, $title, null, $parserOptions );
$html = $po->getText();
$html = $po->getText( [
'deduplicateStyles' => false,
] );
$html = preg_replace( '#<!--.*?-->#sm', '', $html ); // strip comments
if ( $expectedHtml !== null ) {

View file

@ -937,7 +937,9 @@ class WikiPageDbTest extends MediaWikiLangTestCase {
$opt = $page->makeParserOptions( 'canonical' );
$po = $page->getParserOutput( $opt );
$text = $po->getText();
$text = $po->getText( [
'deduplicateStyles' => false,
] );
$text = trim( preg_replace( '/<!--.*?-->/sm', '', $text ) ); # strip injected comments
$text = preg_replace( '!\s*(</p>|</div>)!m', '\1', $text ); # don't let tidy confuse us

View file

@ -449,12 +449,12 @@ EOF
[], $dedupText, <<<EOF
<p>This is a test document.</p>
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
<style data-mw-deduplicate="duplicate2">.Duplicate2 {}</style>
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate2">
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate2" />
<style data-mw-not-deduplicate="duplicate1">.Duplicate1 {}</style>
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
<style data-mw-deduplicate="duplicate3">.Duplicate1 {}</style>
<style>.Duplicate1 {}</style>
EOF