Merge "Use Remex in Sanitizer::stripAllTags()"
This commit is contained in:
commit
2f15b22b91
4 changed files with 57 additions and 12 deletions
|
|
@ -1219,6 +1219,7 @@ $wgAutoloadLocalClasses = [
|
|||
'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
|
||||
'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php',
|
||||
'RegexlikeReplacer' => __DIR__ . '/includes/libs/replacers/RegexlikeReplacer.php',
|
||||
'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php',
|
||||
'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php',
|
||||
'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php',
|
||||
'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',
|
||||
|
|
|
|||
40
includes/parser/RemexStripTagHandler.php
Normal file
40
includes/parser/RemexStripTagHandler.php
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
<?php
|
||||
|
||||
use RemexHtml\Tokenizer\Attributes;
|
||||
use RemexHtml\Tokenizer\TokenHandler;
|
||||
use RemexHtml\Tokenizer\Tokenizer;
|
||||
|
||||
/**
 * Tokenizer handler that keeps only the character data of a document.
 *
 * Every character token delivered by the tokenizer is appended to an
 * internal buffer; all other events (tags, comments, doctypes, errors,
 * document start/end) are ignored. The collected text is available via
 * getResult() once tokenization has finished.
 *
 * @internal
 */
class RemexStripTagHandler implements TokenHandler {
	/** @var string Text collected from character tokens so far */
	private $text = '';

	/**
	 * Get the text accumulated from all character tokens seen so far.
	 *
	 * @return string
	 */
	public function getResult() {
		return $this->text;
	}

	/**
	 * Start-of-document event; nothing to set up.
	 *
	 * @param Tokenizer $t
	 * @param mixed $fns Unused
	 * @param mixed $fn Unused
	 */
	public function startDocument( Tokenizer $t, $fns, $fn ) {
		// Do nothing.
	}

	/**
	 * End-of-document event; nothing to finalize.
	 *
	 * @param int $pos Unused
	 */
	public function endDocument( $pos ) {
		// Do nothing.
	}

	/**
	 * Parse error event; errors are deliberately ignored.
	 *
	 * @param string $text Unused
	 * @param int $pos Unused
	 */
	public function error( $text, $pos ) {
		// Do nothing.
	}

	/**
	 * Character data event: append the reported slice to the result buffer.
	 *
	 * @param string $text String containing the character data
	 * @param int $start Offset within $text at which the data starts
	 * @param int $length Length of the data within $text
	 * @param int $sourceStart Unused
	 * @param int $sourceLength Unused
	 */
	public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
		// Only the [$start, $start + $length) window of $text belongs to this token.
		$this->text .= substr( $text, $start, $length );
	}

	/**
	 * Start tag event; tags contribute nothing to the result.
	 *
	 * @param string $name Unused
	 * @param Attributes $attrs Unused
	 * @param bool $selfClose Unused
	 * @param int $sourceStart Unused
	 * @param int $sourceLength Unused
	 */
	public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
		// Do nothing.
	}

	/**
	 * End tag event; tags contribute nothing to the result.
	 *
	 * @param string $name Unused
	 * @param int $sourceStart Unused
	 * @param int $sourceLength Unused
	 */
	public function endTag( $name, $sourceStart, $sourceLength ) {
		// Do nothing.
	}

	/**
	 * Doctype event; ignored.
	 *
	 * @param string $name Unused
	 * @param string $public Unused
	 * @param string $system Unused
	 * @param bool $quirks Unused
	 * @param int $sourceStart Unused
	 * @param int $sourceLength Unused
	 */
	public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
		// Do nothing.
	}

	/**
	 * Comment event; comment contents are not part of the result.
	 *
	 * @param string $text Unused
	 * @param int $sourceStart Unused
	 * @param int $sourceLength Unused
	 */
	public function comment( $text, $sourceStart, $sourceLength ) {
		// Do nothing.
	}
}
|
||||
|
|
@ -1967,17 +1967,22 @@ class Sanitizer {
|
|||
* Warning: this return value must be further escaped for literal
|
||||
* inclusion in HTML output as of 1.10!
|
||||
*
|
||||
* @param string $text HTML fragment
|
||||
* @param string $html HTML fragment
|
||||
* @return string
|
||||
*/
|
||||
static function stripAllTags( $text ) {
|
||||
# Actual <tags>
|
||||
$text = StringUtils::delimiterReplace( '<', '>', '', $text );
|
||||
public static function stripAllTags( $html ) {
	// Use RemexHtml to tokenize $html and extract the text: the handler
	// keeps character data only and drops tags, comments and doctypes.
	$handler = new RemexStripTagHandler;
	$tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	] );
	$tokenizer->execute();

	// Character references were already decoded by the tokenizer (see the
	// option comment above). Running decodeCharReferences() on the result
	// would decode a second time, turning e.g. "&amp;lt;" into "<" instead
	// of "&lt;", so only whitespace normalization is applied here.
	return self::normalizeWhitespace( $handler->getResult() );
}
|
||||
|
||||
|
|
|
|||
|
|
@ -530,11 +530,10 @@ class SanitizerTest extends MediaWikiTestCase {
|
|||
[ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
|
||||
[ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
|
||||
[ '<p>Hello <strong> world café</p>', 'Hello <strong> world café' ],
|
||||
// This one is broken, see T179978
|
||||
//[
|
||||
// '<p><small data-foo=\'bar"<baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
|
||||
// 'Bar Whee!'
|
||||
//],
|
||||
[
|
||||
'<p><small data-foo=\'bar"<baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
|
||||
'Bar Whee!'
|
||||
],
|
||||
[ '1<span class="<?php">2</span>3', '123' ],
|
||||
[ '1<span class="<?">2</span>3', '123' ],
|
||||
];
|
||||
|
|
|
|||
Loading…
Reference in a new issue