DeduplicateStyles: Only transform possible style nodes
Why:
- DeduplicateStyles runs as a default post-cache output transformation for every backend pageview. It tokenizes the article HTML via Remex to deduplicate the style nodes within it.
- This is expensive for large pages. On the Barack Obama page, the transform takes 350+ ms on a parser cache hit.
- Some other transforms, like HandleSectionLinks, already use regexes to run Remex-driven transforms only on relevant elements, avoiding a potentially expensive tokenization of the whole page.

What:
- Use a regular expression to limit this transform so that it only tokenizes potential <style> nodes. This takes ~2ms to execute on a large page[1], compared to ~166ms currently.
- Restrict this optimization to legacy parser output transformations, since the naïve regex used might otherwise match encoded style tags within data-parsoid attribute values, as described in I32d3d1772243c3819e1e1486351d16871b6e21c4. Add a test for this.

[1] https://en.m.wikipedia.org/wiki/Democratic_Party_(United_States)?action=render

Bug: T394059
Change-Id: I33ebcc2da7685b4b6dafdad3ed3ef2a9edea9a00
(cherry picked from commit 02f69d5dc99a964981c57b597eedffa1f253a14c)
This commit is contained in:
parent
0699f46299
commit
acb403ccfd
2 changed files with 78 additions and 35 deletions
|
|
@ -21,32 +21,51 @@ class DeduplicateStyles extends ContentTextTransformStage {
|
|||
|
||||
protected function transformText( string $text, ParserOutput $po, ?ParserOptions $popts, array &$options ): string {
|
||||
$seen = [];
|
||||
return HtmlHelper::modifyElements(
|
||||
$text,
|
||||
static function ( SerializerNode $node ): bool {
|
||||
return $node->name === 'style' &&
|
||||
( $node->attrs['data-mw-deduplicate'] ?? '' ) !== '';
|
||||
},
|
||||
static function ( SerializerNode $node ) use ( &$seen ): SerializerNode {
|
||||
$key = $node->attrs['data-mw-deduplicate'];
|
||||
if ( !isset( $seen[$key] ) ) {
|
||||
$seen[$key] = true;
|
||||
$isParsoidContent = $options['isParsoidContent'] ?? false;
|
||||
|
||||
$transform = static function ( $fragment ) use ( &$seen, $isParsoidContent ) {
|
||||
return HtmlHelper::modifyElements(
|
||||
$fragment,
|
||||
static function ( SerializerNode $node ): bool {
|
||||
return $node->name === 'style' &&
|
||||
( $node->attrs['data-mw-deduplicate'] ?? '' ) !== '';
|
||||
},
|
||||
static function ( SerializerNode $node ) use ( &$seen ): SerializerNode {
|
||||
$key = $node->attrs['data-mw-deduplicate'];
|
||||
if ( !isset( $seen[$key] ) ) {
|
||||
$seen[$key] = true;
|
||||
return $node;
|
||||
}
|
||||
// We were going to use an empty <style> here, but there
|
||||
// was concern that would be too much overhead for browsers.
|
||||
// So let's hope a <link> with a non-standard rel and href isn't
|
||||
// going to be misinterpreted or mangled by any subsequent processing.
|
||||
$node->name = 'link';
|
||||
$node->attrs = new PlainAttributes( [
|
||||
'rel' => 'mw-deduplicated-inline-style',
|
||||
'href' => "mw-data:" . wfUrlencode( $key ),
|
||||
] );
|
||||
$node->children = [];
|
||||
$node->void = true;
|
||||
return $node;
|
||||
}
|
||||
// We were going to use an empty <style> here, but there
|
||||
// was concern that would be too much overhead for browsers.
|
||||
// So let's hope a <link> with a non-standard rel and href isn't
|
||||
// going to be misinterpreted or mangled by any subsequent processing.
|
||||
$node->name = 'link';
|
||||
$node->attrs = new PlainAttributes( [
|
||||
'rel' => 'mw-deduplicated-inline-style',
|
||||
'href' => "mw-data:" . wfUrlencode( $key ),
|
||||
] );
|
||||
$node->children = [];
|
||||
$node->void = true;
|
||||
return $node;
|
||||
},
|
||||
$options['isParsoidContent'] ?? false
|
||||
);
|
||||
},
|
||||
$isParsoidContent
|
||||
);
|
||||
};
|
||||
|
||||
if ( !$isParsoidContent ) {
|
||||
// Optimization: Only transform possible style nodes to avoid having to tokenize the entire output,
|
||||
// which is expensive for large pages (T394059).
|
||||
// This is unsafe to do for Parsoid content, since the naïve regex below might match encoded style
|
||||
// tags within data-parsoid attribute values, so only apply it to legacy parser output.
|
||||
// Parsoid content transformations will be further optimized in T394005.
|
||||
return preg_replace_callback(
|
||||
'#<style\s+([^>]*data-mw-deduplicate\s*=[\'"][^>]*)>.*?</style>#s',
|
||||
static fn ( array $matches ) => $transform( $matches[0] ),
|
||||
$text
|
||||
);
|
||||
}
|
||||
|
||||
return $transform( $text );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,8 +35,12 @@ class DeduplicateStylesTest extends OutputTransformStageTestBase {
|
|||
] );
|
||||
}
|
||||
|
||||
public function provideTransform(): array {
|
||||
$dedup = <<<EOF
|
||||
public function provideTransform(): iterable {
|
||||
$testCases = [
|
||||
'legacy parser output' => [
|
||||
TestUtils::TEST_TO_DEDUP,
|
||||
[],
|
||||
<<<EOF
|
||||
<p>This is a test document.</p>
|
||||
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
|
||||
|
|
@ -47,13 +51,33 @@ class DeduplicateStylesTest extends OutputTransformStageTestBase {
|
|||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
|
||||
<style data-mw-deduplicate="duplicate3">.Duplicate1 {}</style>
|
||||
<style>.Duplicate1 {}</style>
|
||||
EOF;
|
||||
|
||||
$po = new ParserOutput( TestUtils::TEST_TO_DEDUP );
|
||||
$expected = new ParserOutput( $dedup );
|
||||
$opts = [];
|
||||
return [
|
||||
[ $po, null, $opts, $expected ]
|
||||
EOF
|
||||
],
|
||||
'parsoid content with encoded style tags in data-mw attribute' => [
|
||||
<<<EOF
|
||||
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
|
||||
<span data-mw="{"name":"ref","attrs":{"name":"blank"},
|
||||
"body":{"html":"<style data-mw-deduplicate=\"duplicate1\">.Duplicate1 {}</style>"}"></span>
|
||||
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
|
||||
EOF
|
||||
,
|
||||
[ 'isParsoidContent' => true ],
|
||||
<<<EOF
|
||||
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
|
||||
<span data-mw="{"name":"ref","attrs":{"name":"blank"},
|
||||
"body":{"html":"<style data-mw-deduplicate=\"duplicate1\">.Duplicate1 {}</style>"}"></span>
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
|
||||
EOF
|
||||
]
|
||||
];
|
||||
|
||||
foreach ( $testCases as $name => [ $input, $options, $expected ] ) {
|
||||
yield $name => [
|
||||
new ParserOutput( $input ),
|
||||
null,
|
||||
$options,
|
||||
new ParserOutput( $expected )
|
||||
];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue