Quoted attributes don't need to be followed by a space

Further, this splits up attribute parsing from filtering.

Change-Id: Ib4e0a808a6ca2ba032873e885837233e2f2feefe
This commit is contained in:
Arlo Breault 2018-11-02 19:20:52 -04:00
parent a95290576c
commit 59bb8864a2
2 changed files with 60 additions and 30 deletions

View file

@ -349,18 +349,18 @@ class Sanitizer {
/**
* Regular expression to match HTML/XML attribute pairs within a tag.
* Allows some... latitude. Based on,
* https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
* Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
* Used in Sanitizer::decodeTagAttributes
* @return string
*/
static function getAttribsRegex() {
if ( self::$attribsRegex === null ) {
$attribFirst = "[:_\p{L}\p{N}]";
$attrib = "[:_\.\-\p{L}\p{N}]";
$space = '[\x09\x0a\x0c\x0d\x20]';
$spaceChars = '\x09\x0a\x0c\x0d\x20';
$space = "[{$spaceChars}]";
$attrib = "[^{$spaceChars}\/>=]";
$attribFirst = "(?:{$attrib}|=)";
self::$attribsRegex =
"/(?:^|$space)({$attribFirst}{$attrib}*)
"/({$attribFirst}{$attrib}*)
($space*=$space*
(?:
# The attribute value: quoted or alone
@ -368,11 +368,29 @@ class Sanitizer {
| '([^']*)(?:'|\$)
| (((?!$space|>).)*)
)
)?(?=$space|\$)/sxu";
)?/sxu";
}
return self::$attribsRegex;
}
/**
 * Cached attribute name regex. Stays null until getAttribNameRegex()
 * builds the pattern on its first call and stores it here.
 *
 * @var string|null
 */
private static $attribNameRegex;
/**
 * Get the regular expression that validates a single attribute name.
 *
 * A name is accepted when its first character is a colon, an underscore,
 * a Unicode letter or a Unicode number, and every following character is
 * one of those or a "." or "-". The whole name is captured in group 1.
 * Used in Sanitizer::decodeTagAttributes to filter attributes.
 *
 * The pattern is built on first use and cached in self::$attribNameRegex.
 *
 * @return string PCRE pattern, including delimiters and modifiers
 */
static function getAttribNameRegex() {
	if ( self::$attribNameRegex === null ) {
		// Single-quoted so the backslash sequences reach PCRE verbatim
		// instead of depending on PHP leaving unknown escapes alone.
		$attribFirst = '[:_\p{L}\p{N}]';
		$attrib = '[:_\.\-\p{L}\p{N}]';
		// Anchor the end with \z, not $: without the D modifier, $ also
		// matches just before a trailing newline, which would let a name
		// such as "foo\n" through the filter.
		self::$attribNameRegex = '/^(' . $attribFirst . $attrib . '*)\z/sxu';
	}
	return self::$attribNameRegex;
}
/**
* Return the various lists of recognized tags
* @param array $extratags For any extra tags to include
@ -1433,18 +1451,24 @@ class Sanitizer {
return [];
}
$attribs = [];
$pairs = [];
if ( !preg_match_all(
self::getAttribsRegex(),
$text,
$pairs,
PREG_SET_ORDER ) ) {
return $attribs;
return [];
}
$attribs = [];
foreach ( $pairs as $set ) {
$attribute = strtolower( $set[1] );
// Filter attribute names with unacceptable characters
if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
continue;
}
$value = self::getTagAttributeCallback( $set );
// Normalize whitespace

View file

@ -6273,8 +6273,6 @@ parsoid=wt2html
!! end
# Note that the PHP parser output appears to be broken when the table
# end tag is not separated by a space from the style attribute
!! test
A table with stray table end tags on start tag line (wt2html)
!! options
@ -6294,13 +6292,13 @@ parsoid=wt2html
|foo
|}
!! html/php+tidy
<table style="&quot;color:">
<table style="color: red;">
</table><table style="color: red;">
<tbody><tr>
<td>foo
</td></tr></tbody></table>
<table style="&quot;color:" id="foo">
<table style="color: red;" id="foo">
<tbody><tr>
<td>foo
</td></tr></tbody></table>
@ -9648,17 +9646,14 @@ Handling html with a div self-closing tag
<div title=bar />
<div title=bar/>
<div title=bar/ >
!! html/php
<p>&lt;div title /&gt;
&lt;div title/&gt;
</p>
<div>
<p>&lt;div title=bar /&gt;
&lt;div title=bar/&gt;
</p>
<div title="bar/"></div>
</div>
!! html/php+tidy
<div title=""></div>
<div title=""></div>
<div title="">
<div title="bar"></div>
<div title="bar"></div>
<div title="bar/">
</div></div>
!! html/parsoid
<div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
<div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
@ -9699,10 +9694,10 @@ Handling html with a br self-closing tag
<br title=bar />
<br title=bar/>
<br title=bar/ >
!! html/php
!! html/php+tidy
<p><br title="" />
<br title="" />
<br />
<br title="" />
<br title="bar" />
<br title="bar" />
<br title="bar/" />
@ -9717,6 +9712,18 @@ Handling html with a br self-closing tag
</p>
!! end
!! test
Quoted attributes without spaces
!! options
parsoid=wt2html
!! wikitext
<div class="foo"style="color:red">red</div>
!! html/php+tidy
<div class="foo" style="color:red">red</div>
!! html/parsoid
<div class="foo" style="color:red">red</div>
!! end
!! test
Horizontal ruler (should it add that extra space?)
!! wikitext
@ -18177,8 +18184,7 @@ HTML tag with leading space is parsed as text
</p>
!! end
## Don't expect Parsoid and PHP to match, since PHP isn't exactly following
## the HTML5 parsing spec.
## FIXME: The untrimmed attribute in Parsoid is T205737
!! test
Element with broken attribute syntax
!! options
@ -18187,7 +18193,7 @@ parsoid=wt2html
<div style=" style="123">hi</div>
<div =>ho</div>
!! html/php
<div style="123">hi</div>
<div style="style=">hi</div>
<div>ho</div>
!! html/parsoid