Non-word characters shouldn't terminate tag names on the tidy side either
* Follow-up to Iceec404f46703065bf080dd2cbfed1f88c204fa5.
* The accepted charset is changed to match the HTML5 parsing spec at: http://dev.w3.org/html5/spec-preview/tokenization.html#tag-open-state
* Equivalent in parsoid at I462c336f9a00c8ccd11f3220a8738389e8ba7c7c.

Change-Id: I69cb000538fe195dd77273da5f91697fe1e7d283
This commit is contained in:
parent
d43e51a42c
commit
8e8b15afc6
2 changed files with 38 additions and 15 deletions
|
|
@ -39,6 +39,12 @@ class Sanitizer {
|
|||
|&\#[xX]([0-9A-Fa-f]+);
|
||||
|(&)/x';
|
||||
|
||||
/**
|
||||
* Acceptable tag name charset from HTML5 parsing spec
|
||||
* http://dev.w3.org/html5/spec-preview/tokenization.html#tag-open-state
|
||||
*/
|
||||
const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
|
||||
|
||||
/**
|
||||
* Blacklist for evil uris like javascript:
|
||||
* WARNING: DO NOT use this in any place that actually requires blacklisting
|
||||
|
|
@ -444,7 +450,7 @@ class Sanitizer {
|
|||
# $params: String between element name and >
|
||||
# $brace: Ending '>' or '/>'
|
||||
# $rest: Everything until the next element of $bits
|
||||
if ( preg_match( '!^(/?)([^\\s/>]+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
|
||||
if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
|
||||
list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
|
||||
} else {
|
||||
$slash = $t = $params = $brace = $rest = null;
|
||||
|
|
@ -567,11 +573,7 @@ class Sanitizer {
|
|||
} else {
|
||||
# this might be possible using tidy itself
|
||||
foreach ( $bits as $x ) {
|
||||
preg_match(
|
||||
'/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
|
||||
$x,
|
||||
$regs
|
||||
);
|
||||
preg_match( self::ELEMENT_BITS_REGEX, $x, $regs );
|
||||
|
||||
wfSuppressWarnings();
|
||||
list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
|
||||
|
|
|
|||
|
|
@ -1191,28 +1191,49 @@ Ruby markup (W3C-style)
|
|||
</p>
|
||||
!! end
|
||||
|
||||
# There is a tidy bug here: http://sourceforge.net/p/tidy/bugs/946/
|
||||
# The next two test different paths in the sanitizer.
|
||||
!! test
|
||||
Non-word characters don't terminate tag names (bug 17663, 40670, 52022)
|
||||
!! wikitext
|
||||
<b→> doesn't work! </b→>
|
||||
<b→> doesn't terminate </b→>
|
||||
|
||||
<bä> doesn't work! </bä>
|
||||
<bä> doesn't terminate </bä>
|
||||
|
||||
<boo> works fine </boo>
|
||||
<boo> doesn't terminate </boo>
|
||||
|
||||
<s.foo>s.foo</s.foo>
|
||||
<s.foo> doesn't terminate </s.foo>
|
||||
|
||||
<sub-ID#1>
|
||||
!! html
|
||||
<p><b→> doesn't work! </b→>
|
||||
</p><p><bä> doesn't work! </bä>
|
||||
</p><p><boo> works fine </boo>
|
||||
</p><p><s.foo>s.foo</s.foo>
|
||||
<p><b→> doesn't terminate </b→>
|
||||
</p><p><bä> doesn't terminate </bä>
|
||||
</p><p><boo> doesn't terminate </boo>
|
||||
</p><p><s.foo> doesn't terminate </s.foo>
|
||||
</p><p><sub-ID#1>
|
||||
</p>
|
||||
!! end
|
||||
|
||||
# There is a tidy bug here: http://sourceforge.net/p/tidy/bugs/946/
|
||||
!! test
|
||||
Non-word characters don't terminate tag names + tidy
|
||||
!! wikitext
|
||||
<b→> doesn't terminate </b→>
|
||||
|
||||
<bä> doesn't terminate </bä>
|
||||
|
||||
<boo> doesn't terminate </boo>
|
||||
|
||||
<s.foo> doesn't terminate </s.foo>
|
||||
|
||||
<sub-ID#1>
|
||||
!! html+tidy
|
||||
<p><b→> doesn't terminate </b→></p>
|
||||
<p><bä> doesn't terminate </bä></p>
|
||||
<p><boo> doesn't terminate </boo></p>
|
||||
<p><s.foo> doesn't terminate </s.foo></p>
|
||||
<p><sub-ID#1></p>
|
||||
!! end
|
||||
|
||||
!! test
|
||||
Isolated close tags should be treated as literal text (bug 52760)
|
||||
!! wikitext
|
||||
|
|
|
|||
Loading…
Reference in a new issue