Quoted attributes don't need to be followed by a space

Further, this splits up attribute parsing from filtering.

Change-Id: Ib4e0a808a6ca2ba032873e885837233e2f2feefe
This commit is contained in:
parent
a95290576c
commit
59bb8864a2
2 changed files with 60 additions and 30 deletions
|
|
@@ -349,18 +349,18 @@ class Sanitizer {
|
|||
|
||||
/**
|
||||
* Regular expression to match HTML/XML attribute pairs within a tag.
|
||||
* Allows some... latitude. Based on,
|
||||
* https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
|
||||
* Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
|
||||
* Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
|
||||
* Used in Sanitizer::decodeTagAttributes
|
||||
* @return string
|
||||
*/
|
||||
static function getAttribsRegex() {
|
||||
if ( self::$attribsRegex === null ) {
|
||||
$attribFirst = "[:_\p{L}\p{N}]";
|
||||
$attrib = "[:_\.\-\p{L}\p{N}]";
|
||||
$space = '[\x09\x0a\x0c\x0d\x20]';
|
||||
$spaceChars = '\x09\x0a\x0c\x0d\x20';
|
||||
$space = "[{$spaceChars}]";
|
||||
$attrib = "[^{$spaceChars}\/>=]";
|
||||
$attribFirst = "(?:{$attrib}|=)";
|
||||
self::$attribsRegex =
|
||||
"/(?:^|$space)({$attribFirst}{$attrib}*)
|
||||
"/({$attribFirst}{$attrib}*)
|
||||
($space*=$space*
|
||||
(?:
|
||||
# The attribute value: quoted or alone
|
||||
|
|
@@ -368,11 +368,29 @@ class Sanitizer {
|
|||
| '([^']*)(?:'|\$)
|
||||
| (((?!$space|>).)*)
|
||||
)
|
||||
)?(?=$space|\$)/sxu";
|
||||
)?/sxu";
|
||||
}
|
||||
return self::$attribsRegex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lazy-initialised attribute name regex, see getAttribNameRegex()
|
||||
*/
|
||||
private static $attribNameRegex;
|
||||
|
||||
/**
|
||||
* Used in Sanitizer::decodeTagAttributes to filter attributes.
|
||||
* @return string
|
||||
*/
|
||||
static function getAttribNameRegex() {
|
||||
if ( self::$attribNameRegex === null ) {
|
||||
$attribFirst = "[:_\p{L}\p{N}]";
|
||||
$attrib = "[:_\.\-\p{L}\p{N}]";
|
||||
self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
|
||||
}
|
||||
return self::$attribNameRegex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the various lists of recognized tags
|
||||
* @param array $extratags For any extra tags to include
|
||||
|
|
@@ -1433,18 +1451,24 @@ class Sanitizer {
|
|||
return [];
|
||||
}
|
||||
|
||||
$attribs = [];
|
||||
$pairs = [];
|
||||
if ( !preg_match_all(
|
||||
self::getAttribsRegex(),
|
||||
$text,
|
||||
$pairs,
|
||||
PREG_SET_ORDER ) ) {
|
||||
return $attribs;
|
||||
return [];
|
||||
}
|
||||
|
||||
$attribs = [];
|
||||
foreach ( $pairs as $set ) {
|
||||
$attribute = strtolower( $set[1] );
|
||||
|
||||
// Filter attribute names with unacceptable characters
|
||||
if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$value = self::getTagAttributeCallback( $set );
|
||||
|
||||
// Normalize whitespace
|
||||
|
|
|
|||
|
|
@@ -6273,8 +6273,6 @@ parsoid=wt2html
|
|||
|
||||
!! end
|
||||
|
||||
# Note that the PHP parser output appears to be broken when the table
|
||||
# end tag is not separated by a space from the style attribute
|
||||
!! test
|
||||
A table with stray table end tags on start tag line (wt2html)
|
||||
!! options
|
||||
|
|
@@ -6294,13 +6292,13 @@ parsoid=wt2html
|
|||
|foo
|
||||
|}
|
||||
!! html/php+tidy
|
||||
<table style=""color:">
|
||||
<table style="color: red;">
|
||||
|
||||
</table><table style="color: red;">
|
||||
<tbody><tr>
|
||||
<td>foo
|
||||
</td></tr></tbody></table>
|
||||
<table style=""color:" id="foo">
|
||||
<table style="color: red;" id="foo">
|
||||
<tbody><tr>
|
||||
<td>foo
|
||||
</td></tr></tbody></table>
|
||||
|
|
@@ -9648,17 +9646,14 @@ Handling html with a div self-closing tag
|
|||
<div title=bar />
|
||||
<div title=bar/>
|
||||
<div title=bar/ >
|
||||
!! html/php
|
||||
<p><div title />
|
||||
<div title/>
|
||||
</p>
|
||||
<div>
|
||||
<p><div title=bar />
|
||||
<div title=bar/>
|
||||
</p>
|
||||
<div title="bar/"></div>
|
||||
</div>
|
||||
|
||||
!! html/php+tidy
|
||||
<div title=""></div>
|
||||
<div title=""></div>
|
||||
<div title="">
|
||||
<div title="bar"></div>
|
||||
<div title="bar"></div>
|
||||
<div title="bar/">
|
||||
</div></div>
|
||||
!! html/parsoid
|
||||
<div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
|
||||
<div title="" data-parsoid='{"stx":"html","selfClose":true}'></div>
|
||||
|
|
@@ -9699,10 +9694,10 @@ Handling html with a br self-closing tag
|
|||
<br title=bar />
|
||||
<br title=bar/>
|
||||
<br title=bar/ >
|
||||
!! html/php
|
||||
!! html/php+tidy
|
||||
<p><br title="" />
|
||||
<br title="" />
|
||||
<br />
|
||||
<br title="" />
|
||||
<br title="bar" />
|
||||
<br title="bar" />
|
||||
<br title="bar/" />
|
||||
|
|
@@ -9717,6 +9712,18 @@ Handling html with a br self-closing tag
|
|||
</p>
|
||||
!! end
|
||||
|
||||
!! test
|
||||
Quoted attributes without spaces
|
||||
!! options
|
||||
parsoid=wt2html
|
||||
!! wikitext
|
||||
<div class="foo"style="color:red">red</div>
|
||||
!! html/php+tidy
|
||||
<div class="foo" style="color:red">red</div>
|
||||
!! html/parsoid
|
||||
<div class="foo" style="color:red">red</div>
|
||||
!! end
|
||||
|
||||
!! test
|
||||
Horizontal ruler (should it add that extra space?)
|
||||
!! wikitext
|
||||
|
|
@@ -18177,8 +18184,7 @@ HTML tag with leading space is parsed as text
|
|||
</p>
|
||||
!! end
|
||||
|
||||
## Don't expect Parsoid and PHP to match, since PHP isn't exactly following
|
||||
## the HTML5 parsing spec.
|
||||
## FIXME: The untrimmed attribute in Parsoid is T205737
|
||||
!! test
|
||||
Element with broken attribute syntax
|
||||
!! options
|
||||
|
|
@@ -18187,7 +18193,7 @@ parsoid=wt2html
|
|||
<div style=" style="123">hi</div>
|
||||
<div =>ho</div>
|
||||
!! html/php
|
||||
<div style="123">hi</div>
|
||||
<div style="style=">hi</div>
|
||||
<div>ho</div>
|
||||
|
||||
!! html/parsoid
|
||||
|
|
|
|||
Loading…
Reference in a new issue