The existing Sanitizer::removeHTMLtags() method, in addition to having dodgy capitalization, uses regular expressions to parse the HTML. That produces corner cases like T298401 and T67747 and is not guaranteed to yield balanced or well-formed HTML. Instead, introduce and use a new Sanitizer::removeSomeTags() method which is guaranteed to always return balanced and well-formed HTML. Note that Sanitizer::removeHTMLtags()/::removeSomeTags() take a callback argument which (as far as I can tell) is never used outside core. Mark that argument as @internal, and clean up the version used by ::removeSomeTags(). Use the new ::removeSomeTags() method in the two places where DISPLAYTITLE is handled (following up on T67747). The use by the legacy parser is more difficult to replace (and would have a performance cost), so leave the old ::removeHTMLtags() method in place for that call site for now: when the legacy parser is replaced by Parsoid the need for the old ::removeHTMLtags() will go away. In a follow-up patch we'll rename ::removeHTMLtags() and mark it @internal so that we can deprecate ::removeHTMLtags() for external use. Some benchmarking code is added. On my machine, with PHP 7.4, the new method tidies short 30-character title strings at a rate of about 6764/s while the tidy-based method being replaced here managed 6384/s. Sanitizer::removeHTMLtags blazes through short strings 20x faster (120,915/s); some of this difference is due to the setup cost of creating the tag whitelist and the Remex pipeline, so further optimizations could doubtless be done if Sanitizer::removeSomeTags() is more widely used. Bug: T299722 Bug: T67747 Change-Id: Ic864c01471c292f11799c4fbdac4d7d30b8bc50f
172 lines
5.2 KiB
PHP
172 lines
5.2 KiB
PHP
<?php
|
|
|
|
namespace MediaWiki\Parser;
|
|
|
|
use Sanitizer;
|
|
use Wikimedia\RemexHtml\Tokenizer\Attributes;
|
|
use Wikimedia\RemexHtml\Tokenizer\PlainAttributes;
|
|
use Wikimedia\RemexHtml\Tokenizer\RelayTokenHandler;
|
|
use Wikimedia\RemexHtml\Tokenizer\TokenHandler;
|
|
|
|
/**
|
|
* Helper class for Sanitizer::removeSomeTags().
|
|
* @internal
|
|
*/
|
|
class RemexRemoveTagHandler extends RelayTokenHandler {
	/**
	 * @var string The original HTML source string (used for fallback text
	 *   when rejecting an HTML tag).
	 */
	private $source;

	/**
	 * @var array<string,true> Set of HTML tags which can be self-closed.
	 */
	private $htmlsingle;

	/**
	 * @var array<string,true> Self-closed tags which are on $htmlsingle
	 *   but not on $htmlsingleonly will be emitted as an empty element.
	 */
	private $htmlsingleonly;

	/**
	 * @var array<string,true> Set of allowed HTML open/close tags.
	 */
	private $htmlelements;

	/**
	 * @var ?callable(Attributes,mixed...):Attributes Callback to mutate or
	 *   sanitize attributes.
	 */
	private $attrCallback;

	/**
	 * @var array Extra arguments to provide to the $attrCallback after the
	 *   Attributes object. Never null: the constructor replaces a null
	 *   argument with an empty array.
	 */
	private $callbackArgs;

	/**
	 * @param TokenHandler $nextHandler Handler to relay accepted tokens.
	 * @param string $source Input source string.
	 * @param array $tagData Information about allowed/rejected tags. The
	 *   keys 'htmlsingle', 'htmlsingleonly' and 'htmlelements' are read.
	 * @param ?callable $attrCallback Attribute handler callback.
	 *   The full signature is ?callable(Attributes,mixed...):Attributes
	 * @param ?array $callbackArgs Optional arguments to attribute handler.
	 */
	public function __construct(
		TokenHandler $nextHandler,
		string $source,
		array $tagData,
		?callable $attrCallback,
		?array $callbackArgs
	) {
		parent::__construct( $nextHandler );
		$this->source = $source;
		$this->htmlsingle = $tagData['htmlsingle'];
		$this->htmlsingleonly = $tagData['htmlsingleonly'];
		$this->htmlelements = $tagData['htmlelements'];
		$this->attrCallback = $attrCallback;
		// Normalize to an array so it can always be spread into the callback.
		$this->callbackArgs = $callbackArgs ?? [];
	}

	/**
	 * @inheritDoc
	 */
	public function comment( $text, $sourceStart, $sourceLength ) {
		// Don't relay comments: they are dropped from the output.
	}

	/**
	 * Takes attribute names and values for a tag and the tag name and
	 * validates that the tag is allowed to be present.
	 * This DOES NOT validate the attributes, nor does it validate the
	 * tags themselves. This method only handles the special circumstances
	 * where we may want to allow a tag within content but ONLY when it has
	 * specific attributes set.
	 *
	 * @param string $element Lower-cased tag name.
	 * @param Attributes $attrs Attributes present on the tag.
	 * @return bool False if the tag is a <meta> or <link> lacking its
	 *   required attributes, true otherwise.
	 *
	 * @see Sanitizer::validateTag()
	 */
	private static function validateTag( string $element, Attributes $attrs ): bool {
		if ( $element !== 'meta' && $element !== 'link' ) {
			// Only <meta> and <link> have attribute-dependent validity.
			return true;
		}

		$params = $attrs->getValues();
		if ( !isset( $params['itemprop'] ) ) {
			// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
			return false;
		}

		if ( $element === 'meta' ) {
			// <meta> must have a content="" for the itemprop
			return isset( $params['content'] );
		}

		// <link> must have an associated href=""
		return isset( $params['href'] );
	}

	/**
	 * Re-emit a span of the original source as a text node, in place of
	 * a tag that is not being relayed.
	 *
	 * @param int $sourceStart Offset of the span within the source string.
	 * @param int $sourceLength Length of the span.
	 */
	private function emitAsText( $sourceStart, $sourceLength ): void {
		$this->nextHandler->characters(
			$this->source, $sourceStart, $sourceLength, $sourceStart, $sourceLength
		);
	}

	/**
	 * Handle a start tag from the tokenizer: either relay it to the
	 * next stage, or re-emit it as raw text.
	 *
	 * @inheritDoc
	 */
	public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
		$t = strtolower( $name );
		if ( !isset( $this->htmlelements[$t] ) ) {
			// Not an allowed element.
			$this->emitAsText( $sourceStart, $sourceLength );
			return;
		}

		if ( $this->attrCallback ) {
			$attrs = ( $this->attrCallback )( $attrs, ...$this->callbackArgs );
		}

		if ( !self::validateTag( $t, $attrs ) ) {
			// Rejected tags are emitted verbatim from the source, so there
			// is no point in sanitizing attributes that are never used.
			$this->emitAsText( $sourceStart, $sourceLength );
			return;
		}

		if ( $selfClose && !( isset( $this->htmlsingle[$t] ) || isset( $this->htmlsingleonly[$t] ) ) ) {
			// Remove the self-closing slash, to be consistent with
			// HTML5 semantics. T134423
			$selfClose = false;
		}

		$fixedAttrs = Sanitizer::validateTagAttributes( $attrs->getValues(), $t );
		$attrs = new PlainAttributes( $fixedAttrs );

		if ( $selfClose && !isset( $this->htmlsingleonly[$t] ) ) {
			// Interpret self-closing tags as empty tags even when
			// HTML5 would interpret them as start tags. Such input
			// is commonly seen on Wikimedia wikis with this intention.
			$this->nextHandler->startTag( $name, $attrs, false, $sourceStart, $sourceLength );
			// Synthesize a zero-length end tag immediately after the start tag.
			$this->nextHandler->endTag( $name, $sourceStart + $sourceLength, 0 );
		} else {
			$this->nextHandler->startTag( $name, $attrs, $selfClose, $sourceStart, $sourceLength );
		}
	}

	/**
	 * Handle an end tag from the tokenizer: either relay it to the
	 * next stage, or re-emit it as raw text.
	 *
	 * @inheritDoc
	 */
	public function endTag( $name, $sourceStart, $sourceLength ) {
		$t = strtolower( $name );
		if ( isset( $this->htmlelements[$t] ) ) {
			// This is a good tag, relay it.
			$this->nextHandler->endTag( $name, $sourceStart, $sourceLength );
		} else {
			// Emit this as a text node instead.
			$this->emitAsText( $sourceStart, $sourceLength );
		}
	}
}
|