Use Remex for DeduplicateStyles transform
The previous implementation was using an ad-hoc regular expression which
was matching inside the data-mw attribute of Parsoid output, eg:
<sup about="#mwt42" [...] typeof="mw:Extension/ref mw:Error" data-mw="{"name":"ref","attrs":{"name":"infobox_stats_ref_rail"},"body":{"html":"<style data-mw-deduplicate=\"TemplateStyles:r1133582631\" typeof=\"...">
After substitution, the <link> element inserted contained " instead of
" and so broke out of the attribute.
Instead use a proper HTML tokenizer (via wikimedia/remex-html) so that
we don't allow bogus matches inside attribute values.
To fix up tests:
* Don't deduplicate styles when parsing UX messages (also helps performance)
* Don't deduplicate styles in ContentHandler integration tests
* Don't deduplicate styles by default in parser tests
(unless explicit option is set)
Depends-On: Id9801a9ff540bd818a32bc6fa35c48a9cff12d3a
Depends-On: I5111f1fdb7140948b82113adbc774af286174ab3
Followup-To: Ic0b17e361bf6eb0e71c498abc17f5f67f82318f8
Change-Id: I32d3d1772243c3819e1e1486351d16871b6e21c4
This commit is contained in:
parent
5a95e4e702
commit
82da9cf14b
9 changed files with 45 additions and 26 deletions
|
|
@ -2,11 +2,12 @@
|
|||
|
||||
namespace Mediawiki\OutputTransform\Stages;
|
||||
|
||||
use MediaWiki\Html\Html;
|
||||
use MediaWiki\Html\HtmlHelper;
|
||||
use Mediawiki\OutputTransform\ContentTextTransformStage;
|
||||
use MediaWiki\Parser\ParserOutput;
|
||||
use MediaWiki\Parser\Sanitizer;
|
||||
use ParserOptions;
|
||||
use Wikimedia\RemexHtml\Serializer\SerializerNode;
|
||||
use Wikimedia\RemexHtml\Tokenizer\PlainAttributes;
|
||||
|
||||
/**
|
||||
* Generates a list of unique style links
|
||||
|
|
@ -20,28 +21,32 @@ class DeduplicateStyles extends ContentTextTransformStage {
|
|||
|
||||
protected function transformText( string $text, ParserOutput $po, ?ParserOptions $popts, array &$options ): string {
|
||||
$seen = [];
|
||||
return preg_replace_callback( '#<style\s+([^>]*data-mw-deduplicate\s*=[\'"][^>]*)>.*?</style>#s',
|
||||
static function ( $m ) use ( &$seen ) {
|
||||
$attr = Sanitizer::decodeTagAttributes( $m[1] );
|
||||
if ( !isset( $attr['data-mw-deduplicate'] ) ) {
|
||||
return $m[0];
|
||||
}
|
||||
|
||||
$key = $attr['data-mw-deduplicate'];
|
||||
return HtmlHelper::modifyElements(
|
||||
$text,
|
||||
static function ( SerializerNode $node ): bool {
|
||||
return $node->name === 'style' &&
|
||||
( $node->attrs['data-mw-deduplicate'] ?? '' ) !== '';
|
||||
},
|
||||
static function ( SerializerNode $node ) use ( &$seen ): SerializerNode {
|
||||
$key = $node->attrs['data-mw-deduplicate'];
|
||||
if ( !isset( $seen[$key] ) ) {
|
||||
$seen[$key] = true;
|
||||
|
||||
return $m[0];
|
||||
return $node;
|
||||
}
|
||||
|
||||
// We were going to use an empty <style> here, but there
|
||||
// was concern that would be too much overhead for browsers.
|
||||
// So let's hope a <link> with a non-standard rel and href isn't
|
||||
// going to be misinterpreted or mangled by any subsequent processing.
|
||||
return Html::element( 'link', [
|
||||
$node->name = 'link';
|
||||
$node->attrs = new PlainAttributes( [
|
||||
'rel' => 'mw-deduplicated-inline-style',
|
||||
'href' => "mw-data:" . wfUrlencode( $key ),
|
||||
] );
|
||||
}, $text );
|
||||
$node->children = [];
|
||||
$node->void = true;
|
||||
return $node;
|
||||
},
|
||||
$options['isParsoidContent'] ?? false
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1447,6 +1447,12 @@ class Message implements MessageSpecifier, Serializable {
|
|||
// they're inside they already are from the outer div.
|
||||
'unwrap' => true,
|
||||
'userLang' => $this->getLanguage(),
|
||||
// Although we *could* have template styles in messages,
|
||||
// they are rare enough that it's not worth the performance
|
||||
// hit on all messages to run deduplication (and the
|
||||
// deduplication would only extend within the message anyway,
|
||||
// not to the article content)
|
||||
'deduplicateStyles' => false,
|
||||
] )
|
||||
: $out;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1456,6 +1456,7 @@ class ParserTestRunner {
|
|||
'allowTOC' => !isset( $opts['notoc'] ),
|
||||
'unwrap' => !isset( $opts['wrap'] ),
|
||||
'skin' => $this->getSkin( $opts['skin'] ?? 'fallback' ),
|
||||
'deduplicateStyles' => isset( $opts['deduplicateStyles'] ),
|
||||
] );
|
||||
$out = preg_replace( '/\s+$/', '', $out );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ version=2
|
|||
# thumbsize=NNN set the default thumb size to NNNpx for this test
|
||||
# wrap include the normal wrapper <div class="mw-parser-output"> (since 1.30)
|
||||
# local format section links in edit comment text as local links
|
||||
# deduplicateStyles apply style deduplication to the output
|
||||
#
|
||||
# Configuration globals:
|
||||
#
|
||||
|
|
|
|||
|
|
@ -33,12 +33,12 @@ class DeduplicateStylesTest extends OutputTransformStageTest {
|
|||
$dedup = <<<EOF
|
||||
<p>This is a test document.</p>
|
||||
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
|
||||
<style data-mw-deduplicate="duplicate2">.Duplicate2 {}</style>
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate2">
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate2" />
|
||||
<style data-mw-not-deduplicate="duplicate1">.Duplicate1 {}</style>
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
|
||||
<style data-mw-deduplicate="duplicate3">.Duplicate1 {}</style>
|
||||
<style>.Duplicate1 {}</style>
|
||||
EOF;
|
||||
|
|
|
|||
|
|
@ -75,7 +75,9 @@ class JsonContentHandlerIntegrationTest extends MediaWikiLangTestCase {
|
|||
true
|
||||
);
|
||||
$this->assertInstanceOf( ParserOutput::class, $parserOutput );
|
||||
$this->assertEquals( $expected, $parserOutput->getText() );
|
||||
$this->assertEquals( $expected, $parserOutput->getText( [
|
||||
'deduplicateStyles' => false,
|
||||
] ) );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -38,7 +38,9 @@ class TextContentHandlerIntegrationTest extends MediaWikiLangTestCase {
|
|||
$contentRenderer = $this->getServiceContainer()->getContentRenderer();
|
||||
$po = $contentRenderer->getParserOutput( $content, $title, null, $parserOptions );
|
||||
|
||||
$html = $po->getText();
|
||||
$html = $po->getText( [
|
||||
'deduplicateStyles' => false,
|
||||
] );
|
||||
$html = preg_replace( '#<!--.*?-->#sm', '', $html ); // strip comments
|
||||
|
||||
if ( $expectedHtml !== null ) {
|
||||
|
|
|
|||
|
|
@ -937,7 +937,9 @@ class WikiPageDbTest extends MediaWikiLangTestCase {
|
|||
|
||||
$opt = $page->makeParserOptions( 'canonical' );
|
||||
$po = $page->getParserOutput( $opt );
|
||||
$text = $po->getText();
|
||||
$text = $po->getText( [
|
||||
'deduplicateStyles' => false,
|
||||
] );
|
||||
|
||||
$text = trim( preg_replace( '/<!--.*?-->/sm', '', $text ) ); # strip injected comments
|
||||
$text = preg_replace( '!\s*(</p>|</div>)!m', '\1', $text ); # don't let tidy confuse us
|
||||
|
|
|
|||
|
|
@ -449,12 +449,12 @@ EOF
|
|||
[], $dedupText, <<<EOF
|
||||
<p>This is a test document.</p>
|
||||
<style data-mw-deduplicate="duplicate1">.Duplicate1 {}</style>
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
|
||||
<style data-mw-deduplicate="duplicate2">.Duplicate2 {}</style>
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate2">
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate2" />
|
||||
<style data-mw-not-deduplicate="duplicate1">.Duplicate1 {}</style>
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1">
|
||||
<link rel="mw-deduplicated-inline-style" href="mw-data:duplicate1" />
|
||||
<style data-mw-deduplicate="duplicate3">.Duplicate1 {}</style>
|
||||
<style>.Duplicate1 {}</style>
|
||||
EOF
|
||||
|
|
|
|||
Loading…
Reference in a new issue