Merge "Use Remex in Sanitizer::stripAllTags()"

This commit is contained in:
jenkins-bot 2017-11-16 20:34:31 +00:00 committed by Gerrit Code Review
commit 2f15b22b91
4 changed files with 57 additions and 12 deletions

View file

@ -1219,6 +1219,7 @@ $wgAutoloadLocalClasses = [
'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php',
'RegexlikeReplacer' => __DIR__ . '/includes/libs/replacers/RegexlikeReplacer.php',
'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php',
'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php',
'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php',
'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',

View file

@ -0,0 +1,40 @@
<?php
use RemexHtml\Tokenizer\Attributes;
use RemexHtml\Tokenizer\TokenHandler;
use RemexHtml\Tokenizer\Tokenizer;
/**
 * Token handler that keeps only the text of character tokens.
 *
 * Every other tokenizer event (tags, comments, doctypes, errors and the
 * document start/end notifications) is ignored, so getResult() returns the
 * concatenation of all character data received so far.
 *
 * @internal
 */
class RemexStripTagHandler implements TokenHandler {
	/** @var string Character data collected so far */
	private $text = '';

	/**
	 * Get the accumulated text.
	 *
	 * @return string All character data passed to characters(), in call order
	 */
	public function getResult() {
		return $this->text;
	}

	/**
	 * Called at the start of the document; nothing to set up.
	 *
	 * @param Tokenizer $t
	 * @param mixed $fns Unused
	 * @param mixed $fn Unused
	 */
	public function startDocument( Tokenizer $t, $fns, $fn ) {
		// Do nothing.
	}

	/**
	 * Called at the end of the document; nothing to finalize.
	 *
	 * @param mixed $pos Unused
	 */
	public function endDocument( $pos ) {
		// Do nothing.
	}

	/**
	 * Errors reported by the tokenizer are deliberately discarded.
	 *
	 * @param mixed $text Unused
	 * @param mixed $pos Unused
	 */
	public function error( $text, $pos ) {
		// Do nothing.
	}

	/**
	 * Append a run of character data to the result.
	 *
	 * @param string $text Buffer holding the character data
	 * @param int $start Offset of the data within $text (passed to substr())
	 * @param int $length Length of the data within $text (passed to substr())
	 * @param mixed $sourceStart Unused
	 * @param mixed $sourceLength Unused
	 */
	public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
		// Only the [$start, $start + $length) slice belongs to this token.
		$this->text .= substr( $text, $start, $length );
	}

	/**
	 * Start tags contribute nothing to the result.
	 *
	 * @param mixed $name Unused
	 * @param Attributes $attrs Unused
	 * @param mixed $selfClose Unused
	 * @param mixed $sourceStart Unused
	 * @param mixed $sourceLength Unused
	 */
	public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
		// Do nothing.
	}

	/**
	 * End tags contribute nothing to the result.
	 *
	 * @param mixed $name Unused
	 * @param mixed $sourceStart Unused
	 * @param mixed $sourceLength Unused
	 */
	public function endTag( $name, $sourceStart, $sourceLength ) {
		// Do nothing.
	}

	/**
	 * Doctype declarations contribute nothing to the result.
	 *
	 * @param mixed $name Unused
	 * @param mixed $public Unused
	 * @param mixed $system Unused
	 * @param mixed $quirks Unused
	 * @param mixed $sourceStart Unused
	 * @param mixed $sourceLength Unused
	 */
	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
		// Do nothing.
	}

	/**
	 * Comments contribute nothing to the result.
	 *
	 * @param mixed $text Unused
	 * @param mixed $sourceStart Unused
	 * @param mixed $sourceLength Unused
	 */
	public function comment( $text, $sourceStart, $sourceLength ) {
		// Do nothing.
	}
}

View file

@ -1967,17 +1967,22 @@ class Sanitizer {
* Warning: this return value must be further escaped for literal
* inclusion in HTML output as of 1.10!
*
* @param string $html HTML fragment
* @return string
*/
public static function stripAllTags( $html ) {
	// Use RemexHtml to tokenize $html and extract the text.
	// RemexStripTagHandler keeps character data and drops every other token.
	$handler = new RemexStripTagHandler;
	$tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );
	$tokenizer->execute();
	$text = $handler->getResult();

	// Character references were already decoded by the tokenizer; decoding
	// again here would turn e.g. "&amp;lt;" into "<" instead of "&lt;".
	// Only whitespace normalization remains.
	$text = self::normalizeWhitespace( $text );
	return $text;
}

View file

@ -530,11 +530,10 @@ class SanitizerTest extends MediaWikiTestCase {
[ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
[ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
[ '<p>Hello &lt;strong&gt; wor&#x6c;&#100; caf&eacute;</p>', 'Hello <strong> world café' ],
// This one is broken, see T179978
//[
// '<p><small data-foo=\'bar"&lt;baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
// 'Bar Whee!'
//],
[
'<p><small data-foo=\'bar"&lt;baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
'Bar Whee!'
],
[ '1<span class="<?php">2</span>3', '123' ],
[ '1<span class="<?">2</span>3', '123' ],
];