We use Sanitizer::stripAllTags primarily to remove formatting from HTML so that we can use the text in places like notifications, emails, search result blurbs, etc. It is very unlikely that we want the raw contents of CSS and/or JS tags in any of those places, so let's suppress that content to make the output more readable, as template styles are showing up in more and more places. Bug: T228856 Change-Id: I7930361068ddcf3a6c2fdebd0177d142f025b64f
127 lines
3.2 KiB
PHP
127 lines
3.2 KiB
PHP
<?php
|
|
|
|
use Wikimedia\RemexHtml\Tokenizer\Attributes;
|
|
use Wikimedia\RemexHtml\Tokenizer\NullTokenHandler;
|
|
|
|
/**
 * Token handler that reduces a stream of HTML tokens to plain text.
 *
 * Character data is appended to an internal buffer, except while the
 * handler is between the start and end tag of a non-visible element
 * (style or script). Each start or end tag of a block-level element
 * (plus br) appends a single space so neighbouring words stay separated.
 *
 * @internal
 */
class RemexStripTagHandler extends NullTokenHandler {
	/**
	 * Tags treated as block level, keyed by tag name so that membership
	 * can be tested with a constant-time key lookup.
	 *
	 * Taken from
	 * https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
	 * as retrieved on sept 12, 2018. That page lists every HTML block level
	 * element (even though "block-level" is not technically defined for
	 * elements introduced in HTML5). br is not block level, but it is
	 * included here anyway.
	 */
	private const BLOCK_LEVEL_TAGS = [
		'address' => true,
		'article' => true,
		'aside' => true,
		'blockquote' => true,
		'br' => true,
		'canvas' => true,
		'dd' => true,
		'div' => true,
		'dl' => true,
		'dt' => true,
		'fieldset' => true,
		'figcaption' => true,
		'figure' => true,
		'footer' => true,
		'form' => true,
		'h1' => true,
		'h2' => true,
		'h3' => true,
		'h4' => true,
		'h5' => true,
		'h6' => true,
		'header' => true,
		'hgroup' => true,
		'hr' => true,
		'li' => true,
		'main' => true,
		'nav' => true,
		'noscript' => true,
		'ol' => true,
		'output' => true,
		'p' => true,
		'pre' => true,
		'section' => true,
		'table' => true,
		'td' => true,
		'tfoot' => true,
		'th' => true,
		'tr' => true,
		'ul' => true,
		'video' => true,
	];

	/**
	 * Tags whose character content is dropped from the result,
	 * keyed by tag name.
	 */
	private const NON_VISIBLE_TAGS = [
		'style' => true,
		'script' => true,
	];

	/** @var bool True between the start and end tag of a non-visible element */
	private $insideNonVisibleTag = false;

	/** @var string Plain text collected so far */
	private $text = '';

	/**
	 * Fetch the text accumulated from the tokens seen so far.
	 *
	 * @return string
	 */
	public function getResult() {
		return $this->text;
	}

	/**
	 * Append a run of character data to the result, unless it sits inside
	 * a non-visible element.
	 *
	 * @param string $text String containing the character data
	 * @param int $start Offset of the data within $text
	 * @param int $length Length of the data
	 * @param int $sourceStart Not used by this handler
	 * @param int $sourceLength Not used by this handler
	 */
	public function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
		if ( $this->insideNonVisibleTag ) {
			// Contents of style/script are not wanted in the output.
			return;
		}

		$this->text .= substr( $text, $start, $length );
	}

	/**
	 * Handle a start tag: begin suppressing text for non-visible elements
	 * and add a separating space for block-level ones.
	 *
	 * @param string $name Tag name
	 * @param Attributes $attrs Not used by this handler
	 * @param bool $selfClose Not used by this handler
	 * @param int $sourceStart Not used by this handler
	 * @param int $sourceLength Not used by this handler
	 */
	public function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
		$this->recordTagBoundary( $name, true );
	}

	/**
	 * Handle an end tag: stop suppressing text for non-visible elements
	 * and add a separating space for block-level ones.
	 *
	 * @param string $name Tag name
	 * @param int $sourceStart Not used by this handler
	 * @param int $sourceLength Not used by this handler
	 */
	public function endTag( $name, $sourceStart, $sourceLength ) {
		$this->recordTagBoundary( $name, false );
	}

	/**
	 * Bookkeeping shared by start and end tags.
	 *
	 * @param string $name Tag name
	 * @param bool $opening True for a start tag, false for an end tag
	 */
	private function recordTagBoundary( $name, $opening ) {
		if ( $this->isNonVisibleTag( $name ) ) {
			$this->insideNonVisibleTag = $opening;
		}

		// A space at every block-level boundary keeps words that were
		// only separated by markup from being merged together.
		if ( $this->isBlockLevelTag( $name ) ) {
			$this->text .= ' ';
		}
	}

	/**
	 * Check whether a tag is one of the typical block-level elements.
	 * CSS can turn any element into a block, so this is only a
	 * best-effort guess, but it is better than nothing.
	 *
	 * @param string $tagName HTML tag name
	 * @return bool True when the tag is an HTML block level element
	 */
	private function isBlockLevelTag( $tagName ) {
		return isset( self::BLOCK_LEVEL_TAGS[ strtolower( trim( $tagName ) ) ] );
	}

	/**
	 * Check whether a tag is non-visible by default.
	 * CSS can hide any element, so this is only a best-effort guess,
	 * but it is better than nothing.
	 *
	 * Primarily used to keep TemplateStyles out of the text that ends up
	 * in notifications, emails and similar output.
	 *
	 * @param string $tagName HTML tag name
	 * @return bool True when the tag is an HTML element which should be filtered out
	 */
	private function isNonVisibleTag( $tagName ) {
		return isset( self::NON_VISIBLE_TAGS[ strtolower( trim( $tagName ) ) ] );
	}
}
|