DeduplicateStyles: Only transform possible style nodes

Why:

- DeduplicateStyles runs as a default post-cache output transformation
  for every backend pageview. It tokenizes the article HTML via Remex to
  deduplicate the style nodes within it.
- This is expensive for large pages. On the Barack Obama page, the
  transform takes 350+ ms on a parser cache hit.
- Some other transforms, like HandleSectionLinks, already use regexes to
  only run Remex-driven transforms on relevant elements to avoid a
  potentially expensive tokenization of the whole page.

What:

- Use a regular expression to limit this transform so that it only
  tokenizes potential <style> nodes. This takes ~2 ms to execute on a
  large page[1], compared to ~166 ms with the current full-page
  tokenization.
- Restrict this optimization to legacy parser output transformations,
  since the naïve regex used might otherwise match encoded style tags
  within data-parsoid attribute values, as described in
  I32d3d1772243c3819e1e1486351d16871b6e21c4.
  Add a test covering this Parsoid case.

[1] https://en.m.wikipedia.org/wiki/Democratic_Party_(United_States)?action=render

Bug: T394059
Change-Id: I33ebcc2da7685b4b6dafdad3ed3ef2a9edea9a00
(cherry picked from commit 02f69d5dc99a964981c57b597eedffa1f253a14c)
This commit is contained in:
Máté Szabó 2025-05-13 17:30:39 +02:00 committed by Reedy
parent 0699f46299
commit acb403ccfd
2 changed files with 78 additions and 35 deletions

View file

@ -21,32 +21,51 @@ class DeduplicateStyles extends ContentTextTransformStage {
protected function transformText( string $text, ParserOutput $po, ?ParserOptions $popts, array &$options ): string {
$seen = [];
return HtmlHelper::modifyElements(
$text,
static function ( SerializerNode $node ): bool {
return $node->name === 'style' &&
( $node->attrs['data-mw-deduplicate'] ?? '' ) !== '';
},
static function ( SerializerNode $node ) use ( &$seen ): SerializerNode {
$key = $node->attrs['data-mw-deduplicate'];
if ( !isset( $seen[$key] ) ) {
$seen[$key] = true;
$isParsoidContent = $options['isParsoidContent'] ?? false;
$transform = static function ( $fragment ) use ( &$seen, $isParsoidContent ) {
return HtmlHelper::modifyElements(
$fragment,
static function ( SerializerNode $node ): bool {
return $node->name === 'style' &&
( $node->attrs['data-mw-deduplicate'] ?? '' ) !== '';
},
static function ( SerializerNode $node ) use ( &$seen ): SerializerNode {
$key = $node->attrs['data-mw-deduplicate'];
if ( !isset( $seen[$key] ) ) {
$seen[$key] = true;
return $node;
}
// We were going to use an empty <style> here, but there
// was concern that would be too much overhead for browsers.
// So let's hope a <link> with a non-standard rel and href isn't
// going to be misinterpreted or mangled by any subsequent processing.
$node->name = 'link';
$node->attrs = new PlainAttributes( [
'rel' => 'mw-deduplicated-inline-style',
'href' => "mw-data:" . wfUrlencode( $key ),
] );
$node->children = [];
$node->void = true;
return $node;
}
// We were going to use an empty <style> here, but there
// was concern that would be too much overhead for browsers.
// So let's hope a <link> with a non-standard rel and href isn't
// going to be misinterpreted or mangled by any subsequent processing.
$node->name = 'link';
$node->attrs = new PlainAttributes( [
'rel' => 'mw-deduplicated-inline-style',
'href' => "mw-data:" . wfUrlencode( $key ),
] );
$node->children = [];
$node->void = true;
return $node;
},
$options['isParsoidContent'] ?? false
);
},
$isParsoidContent
);
};
if ( !$isParsoidContent ) {
// Optimization: Only transform possible style nodes to avoid having to tokenize the entire output,
// which is expensive for large pages (T394059).
// This is unsafe to do for Parsoid content, since the naïve regex below might match encoded style
// tags within data-parsoid attribute values, so only apply it to legacy parser output.
// Parsoid content transformations will be further optimized in T394005.
return preg_replace_callback(
'#<style\s+([^>]*data-mw-deduplicate\s*=[\'"][^>]*)>.*?</style>#s',
static fn ( array $matches ) => $transform( $matches[0] ),
$text
);
}
return $transform( $text );
}
}

View file

@ -35,8 +35,12 @@ class DeduplicateStylesTest extends OutputTransformStageTestBase {
] );
}
public function provideTransform(): array {
$dedup = <<<EOF
public function provideTransform(): iterable {
$testCases = [
'legacy parser output' => [
TestUtils::TEST_TO_DEDUP,
[],
<<<EOF
<p>This is a test document.</p>
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
@ -47,13 +51,33 @@ class DeduplicateStylesTest extends OutputTransformStageTestBase {
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
<style data-mw-deduplicate="duplicate3">.Duplicate1 {}</style>
<style>.Duplicate1 {}</style>
EOF;
$po = new ParserOutput( TestUtils::TEST_TO_DEDUP );
$expected = new ParserOutput( $dedup );
$opts = [];
return [
[ $po, null, $opts, $expected ]
EOF
],
'parsoid content with encoded style tags in data-mw attribute' => [
<<<EOF
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
<span data-mw="{&quot;name&quot;:&quot;ref&quot;,&quot;attrs&quot;:{&quot;name&quot;:&quot;blank&quot;},
&quot;body&quot;:{&quot;html&quot;:&quot;<style data-mw-deduplicate=\&quot;duplicate1\&quot;>.Duplicate1 {}</style>&quot;}"></span>
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
EOF
,
[ 'isParsoidContent' => true ],
<<<EOF
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
<span data-mw="{&quot;name&quot;:&quot;ref&quot;,&quot;attrs&quot;:{&quot;name&quot;:&quot;blank&quot;},
&quot;body&quot;:{&quot;html&quot;:&quot;<style data-mw-deduplicate=\&quot;duplicate1\&quot;>.Duplicate1 {}</style>&quot;}"></span>
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
EOF
]
];
foreach ( $testCases as $name => [ $input, $options, $expected ] ) {
yield $name => [
new ParserOutput( $input ),
null,
$options,
new ParserOutput( $expected )
];
}
}
}