Protect language converter markup in the preprocessor (take 2).

This revises 2877402276, which was
reverted in master due to unexpected issues with `-{{...}} ` markup
on translatewiki and enwiki.  Test cases are added to ensure that this
is parsed as a template, not as language converter markup.

https://www.mediawiki.org/wiki/Preprocessor_ABNF is the canonical
documentation for the preprocessor; this will be updated after this
patch is merged.  The basic principles described in that page are
maintained in this patch:

* Rightmost opening structure has precedence: `-{{` is parsed as a
dash followed by template opening.

* `{{{` has precedence over `{{` and `-{`: `-{{{{` is parsed as
`-{` `{{{` since we first grab the rightmost `{{{`.

A bunch of test cases were added to verify the "ideal precedence"
order described on that wiki page.

This patch introduced some minor incompatibilities in existing
markup, in particular with chemical formulae in templates.
Fixes for these are being tracked at
https://www.mediawiki.org/wiki/Parsoid/Language_conversion/Preprocessor_fixups

Bug: T146304
Bug: T153761
Change-Id: I2f0c186c75e392c95e1a3d89266cae2586349150
This commit is contained in:
C. Scott Ananian 2017-01-19 14:58:05 -05:00 committed by Reedy
parent 1928a85867
commit 186a182a15
5 changed files with 444 additions and 35 deletions

View file

@ -79,6 +79,10 @@ changes to languages because of Phabricator reports.
deprecated in 1.24) were removed.
* wfMemcKey() and wfGlobalCacheKey() were deprecated. ObjectCache::makeKey() and
ObjectCache::makeGlobalKey() should be used instead.
* (T146304) Preprocessor handling of LanguageConverter markup has been improved.
As a result of the new uniform handling, '-{' may need to be escaped
(for example, as '-<nowiki/>{') where it occurs inside template arguments
or wikilinks.
== Compatibility ==
MediaWiki 1.30 requires PHP 5.5.9 or later. There is experimental support for

View file

@ -51,9 +51,9 @@ abstract class Preprocessor {
],
'-{' => [
'end' => '}-',
'names' => [ 1 => null ],
'min' => 1,
'max' => 1,
'names' => [ 2 => null ],
'min' => 2,
'max' => 2,
],
];

View file

@ -223,8 +223,7 @@ class Preprocessor_DOM extends Preprocessor {
$searchBase = "[{<\n"; # }
if ( !$wgDisableLangConversion ) {
// FIXME: disabled due to T153761
// $searchBase .= '-';
$searchBase .= '-';
}
// For fast reverse searches
@ -277,6 +276,13 @@ class Preprocessor_DOM extends Preprocessor {
$search = $searchBase;
if ( $stack->top === false ) {
$currentClosing = '';
} elseif (
$stack->top->close === '}-' &&
$stack->top->count > 2
) {
# adjust closing for -{{{...{{
$currentClosing = '}';
$search .= $currentClosing;
} else {
$currentClosing = $stack->top->close;
$search .= $currentClosing;
@ -333,11 +339,15 @@ class Preprocessor_DOM extends Preprocessor {
} elseif ( isset( $this->rules[$curChar] ) ) {
$found = 'open';
$rule = $this->rules[$curChar];
} elseif ( $curChar == '-' ) {
$found = 'dash';
} else {
# Some versions of PHP have a strcspn which stops on null characters
# Ignore and continue
# Some versions of PHP have a strcspn which stops on
# null characters; ignore these and continue.
# We also may get '-' and '}' characters here which
# don't match -{ or $currentClosing. Add these to
# output and continue.
if ( $curChar == '-' || $curChar == '}' ) {
$accum .= $curChar;
}
++$i;
continue;
}
@ -615,7 +625,10 @@ class Preprocessor_DOM extends Preprocessor {
} elseif ( $found == 'open' ) {
# count opening brace characters
$curLen = strlen( $curChar );
$count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i );
$count = ( $curLen > 1 ) ?
# allow the final character to repeat
strspn( $text, $curChar[$curLen-1], $i+1 ) + 1 :
strspn( $text, $curChar, $i );
# we need to add to stack only if opening brace count is enough for one of the rules
if ( $count >= $rule['min'] ) {
@ -635,17 +648,25 @@ class Preprocessor_DOM extends Preprocessor {
# Add literal brace(s)
$accum .= htmlspecialchars( str_repeat( $curChar, $count ) );
}
$i += $curLen * $count;
$i += $count;
} elseif ( $found == 'close' ) {
$piece = $stack->top;
# lets check if there are enough characters for closing brace
$maxCount = $piece->count;
if ( $piece->close === '}-' && $curChar === '}' ) {
$maxCount--; # don't try to match closing '-' as a '}'
}
$curLen = strlen( $curChar );
$count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i, $maxCount );
$count = ( $curLen > 1 ) ? $curLen :
strspn( $text, $curChar, $i, $maxCount );
# check for maximum matching characters (if there are 5 closing
# characters, we will probably need only 3 - depending on the rules)
$rule = $this->rules[$piece->open];
if ( $piece->close === '}-' && $piece->count > 2 ) {
# tweak for -{..{{ }}..}-
$rule = $this->rules['{'];
}
if ( $count > $rule['max'] ) {
# The specified maximum exists in the callback array, unless the caller
# has made an error
@ -663,14 +684,16 @@ class Preprocessor_DOM extends Preprocessor {
if ( $matchingCount <= 0 ) {
# No matching element found in callback array
# Output a literal closing brace and continue
$accum .= htmlspecialchars( str_repeat( $curChar, $count ) );
$i += $curLen * $count;
$endText = substr( $text, $i, $count );
$accum .= htmlspecialchars( $endText );
$i += $count;
continue;
}
$name = $rule['names'][$matchingCount];
if ( $name === null ) {
// No element, just literal text
$element = $piece->breakSyntax( $matchingCount ) . str_repeat( $rule['end'], $matchingCount );
$endText = substr( $text, $i, $matchingCount );
$element = $piece->breakSyntax( $matchingCount ) . $endText;
} else {
# Create XML element
# Note: $parts is already XML, does not need to be encoded further
@ -703,7 +726,7 @@ class Preprocessor_DOM extends Preprocessor {
}
# Advance input pointer
$i += $curLen * $matchingCount;
$i += $matchingCount;
# Unwind the stack
$stack->pop();
@ -719,7 +742,12 @@ class Preprocessor_DOM extends Preprocessor {
$stack->push( $piece );
$accum =& $stack->getAccum();
} else {
$accum .= str_repeat( $piece->open, $piece->count );
$s = substr( $piece->open, 0, -1 );
$s .= str_repeat(
substr( $piece->open, -1 ),
$piece->count - strlen( $s )
);
$accum .= $s;
}
}
$flags = $stack->getFlags();
@ -924,7 +952,11 @@ class PPDStackElement {
if ( $openingCount === false ) {
$openingCount = $this->count;
}
$s = str_repeat( $this->open, $openingCount );
$s = substr( $this->open, 0, -1 );
$s .= str_repeat(
substr( $this->open, -1 ),
$openingCount - strlen( $s )
);
$first = true;
foreach ( $this->parts as $part ) {
if ( $first ) {

View file

@ -155,8 +155,7 @@ class Preprocessor_Hash extends Preprocessor {
$searchBase = "[{<\n";
if ( !$wgDisableLangConversion ) {
// FIXME: disabled due to T153761
// $searchBase .= '-';
$searchBase .= '-';
}
// For fast reverse searches
@ -208,6 +207,13 @@ class Preprocessor_Hash extends Preprocessor {
$search = $searchBase;
if ( $stack->top === false ) {
$currentClosing = '';
} elseif (
$stack->top->close === '}-' &&
$stack->top->count > 2
) {
# adjust closing for -{{{...{{
$currentClosing = '}';
$search .= $currentClosing;
} else {
$currentClosing = $stack->top->close;
$search .= $currentClosing;
@ -264,11 +270,15 @@ class Preprocessor_Hash extends Preprocessor {
} elseif ( isset( $this->rules[$curChar] ) ) {
$found = 'open';
$rule = $this->rules[$curChar];
} elseif ( $curChar == '-' ) {
$found = 'dash';
} else {
# Some versions of PHP have a strcspn which stops on null characters
# Ignore and continue
# Some versions of PHP have a strcspn which stops on
# null characters; ignore these and continue.
# We also may get '-' and '}' characters here which
# don't match -{ or $currentClosing. Add these to
# output and continue.
if ( $curChar == '-' || $curChar == '}' ) {
self::addLiteral( $accum, $curChar );
}
++$i;
continue;
}
@ -558,7 +568,10 @@ class Preprocessor_Hash extends Preprocessor {
} elseif ( $found == 'open' ) {
# count opening brace characters
$curLen = strlen( $curChar );
$count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i );
$count = ( $curLen > 1 ) ?
# allow the final character to repeat
strspn( $text, $curChar[$curLen-1], $i+1 ) + 1 :
strspn( $text, $curChar, $i );
# we need to add to stack only if opening brace count is enough for one of the rules
if ( $count >= $rule['min'] ) {
@ -577,17 +590,25 @@ class Preprocessor_Hash extends Preprocessor {
# Add literal brace(s)
self::addLiteral( $accum, str_repeat( $curChar, $count ) );
}
$i += $curLen * $count;
$i += $count;
} elseif ( $found == 'close' ) {
$piece = $stack->top;
# lets check if there are enough characters for closing brace
$maxCount = $piece->count;
if ( $piece->close === '}-' && $curChar === '}' ) {
$maxCount--; # don't try to match closing '-' as a '}'
}
$curLen = strlen( $curChar );
$count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i, $maxCount );
$count = ( $curLen > 1 ) ? $curLen :
strspn( $text, $curChar, $i, $maxCount );
# check for maximum matching characters (if there are 5 closing
# characters, we will probably need only 3 - depending on the rules)
$rule = $this->rules[$piece->open];
if ( $piece->close === '}-' && $piece->count > 2 ) {
# tweak for -{..{{ }}..}-
$rule = $this->rules['{'];
}
if ( $count > $rule['max'] ) {
# The specified maximum exists in the callback array, unless the caller
# has made an error
@ -605,15 +626,17 @@ class Preprocessor_Hash extends Preprocessor {
if ( $matchingCount <= 0 ) {
# No matching element found in callback array
# Output a literal closing brace and continue
self::addLiteral( $accum, str_repeat( $curChar, $count ) );
$i += $curLen * $count;
$endText = substr( $text, $i, $count );
self::addLiteral( $accum, $endText );
$i += $count;
continue;
}
$name = $rule['names'][$matchingCount];
if ( $name === null ) {
// No element, just literal text
$endText = substr( $text, $i, $matchingCount );
$element = $piece->breakSyntax( $matchingCount );
self::addLiteral( $element, str_repeat( $rule['end'], $matchingCount ) );
self::addLiteral( $element, $endText );
} else {
# Create XML element
$parts = $piece->parts;
@ -648,7 +671,7 @@ class Preprocessor_Hash extends Preprocessor {
}
# Advance input pointer
$i += $curLen * $matchingCount;
$i += $matchingCount;
# Unwind the stack
$stack->pop();
@ -664,7 +687,12 @@ class Preprocessor_Hash extends Preprocessor {
$stack->push( $piece );
$accum =& $stack->getAccum();
} else {
self::addLiteral( $accum, str_repeat( $piece->open, $piece->count ) );
$s = substr( $piece->open, 0, -1 );
$s .= str_repeat(
substr( $piece->open, -1 ),
$piece->count - strlen( $s )
);
self::addLiteral( $accum, $s );
}
}
@ -762,7 +790,12 @@ class PPDStackElement_Hash extends PPDStackElement {
if ( $openingCount === false ) {
$openingCount = $this->count;
}
$accum = [ str_repeat( $this->open, $openingCount ) ];
$s = substr( $this->open, 0, -1 );
$s .= str_repeat(
substr( $this->open, -1 ),
$openingCount - strlen( $s )
);
$accum = [ $s ];
$lastIndex = 0;
$first = true;
foreach ( $this->parts as $part ) {

View file

@ -11840,6 +11840,326 @@ parsoid
!!end
###
### Preprocessor precedence tests
### See: https://www.mediawiki.org/wiki/Preprocessor_ABNF
###
##{{[[-{{{{{{[[Foo|bar}}]]}-}}}}}]]
!! test
Preprocessor precedence 1: link is rightmost opening
!! wikitext
{{[[Foo|bar}}]]
But close-brace is not a valid character in a link title:
{{[[Foo}}|bar]]
However, we can still tell this was handled as a link in the preprocessor:
{{echo|[[Foo}}|bar]]|bat}}
!! html
<p>{{<a href="/wiki/Foo" title="Foo">bar}}</a>
</p><p>But close-brace is not a valid character in a link title:
{{[[Foo}}|bar]]
</p><p>However, we can still tell this was handled as a link in the preprocessor:
[[Foo}}|bar]]
</p>
!! end
!! test
Preprocessor precedence 2: template is rightmost opening
!! options
language=zh
!! wikitext
-{{echo|foo}-}}-
!! html
<p>-foo}--
</p>
!! end
!! test
Preprocessor precedence 3: language converter is rightmost opening
!! options
language=zh
!! wikitext
{{echo|hi}}
{{-{R|echo|hi}}}-
[[-{R|raw]]}-
!! html
<p>hi
</p><p>{{echo|hi}}
</p><p>[[raw]]
</p>
!! end
!! test
Preprocessor precedence 4: left-most angle bracket
!! options
language=zh
!! wikitext
<!--{raw}-->
!! html
!! end
!! article
Template:Precedence5
!! text
{{{{{1}}}}}
!! endarticle
!! test
Preprocessor precedence 5: tplarg takes precedence over template
!! wikitext
{{Precedence5|Bullet}}
!! html
<ul><li> Bar</li></ul>
!! end
!! test
Preprocessor precedence 6: broken link is rightmost opening
!! wikitext
{{echo|[[Foo}}
{{echo|[[Foo|bar|bat=baz}}
!! html
<p>{{echo|[[Foo}}
</p><p>{{echo|[[Foo|bar|bat=baz}}
</p>
!! end
# This next test exposes a difference between PHP and Parsoid:
# Given [[Foo|{{echo|Bar]]x}}y]]z:
# 1) Both PHP and Parsoid ignore the `]]` inside the `echo` in the
# "preprocessor" stage. The `{{echo` extends until the `x}}`, and the
# outer `[[Foo` extends until the `y]]`
# 2a) But then the PHP preprocessor emits `[[Foo|Bar]]xy]]z` as an
# intermediate result (after template expansion), and link processing
# happens on this intermediate result, which moves the wikilink
# boundary leftward to `[[Foo|Bar]]`
# 2b) Parsoid works in a single step, so it's going to keep the
# wikilink as extending to the `y]]`
# 3a) Then PHP does linktrail processing which slurps up the trailing
# `xy` inside the link.
# 3b) Parsoid will do linktrail processing to slurp up the trailing
# `z` inside the link.
# This is "correct" behavior. Parsoid's basic worldview is that the
# `]]` inside the template shouldn't be allowed to leak out to affect
# the surrounding wikilink. PHP may match Parsoid (in the future)
# if you use {{#balance}} (T114445).
!! test
Preprocessor precedence 7: broken template is rightmost opening
!! wikitext
[[Foo|{{echo|Bar]]
[[Foo|{{echo|Bar]]-x}}-y]]-z
Careful: linktrails can move the end of the wikilink:
[[Foo|{{echo|y']]a}}l]]l
!! html
<p><a href="/wiki/Foo" title="Foo">{{echo|Bar</a>
</p><p><a href="/wiki/Foo" title="Foo">Bar</a>-x-y]]-z
</p><p>Careful: linktrails can move the end of the wikilink:
<a href="/wiki/Foo" title="Foo">y'al</a>]]l
</p>
!! end
!! test
Preprocessor precedence 8: broken language converter is rightmost opening
!! options
language=zh
!! wikitext
[[Foo-{R|raw]]
!! html
<p>[[Foo-{R|raw]]
</p>
!! end
!! article
Template:Preprocessor_precedence_9
!! text
;4: {{{{1}}}}
;5: {{{{{2}}}}}
;6: {{{{{{3}}}}}}
;7: {{{{{{{4}}}}}}}
!! endarticle
!! test
Preprocessor precedence 9: groups of braces
!! wikitext
{{Preprocessor precedence 9|Four|Bullet|1|2}}
!! html
<dl><dt>4</dt>
<dd> {Four}</dd>
<dt>5</dt>
<dd> </dd></dl>
<ul><li> Bar</li></ul>
<dl><dt>6</dt>
<dd> Four</dd>
<dt>7</dt>
<dd> {Bullet}</dd></dl>
!! end
!! article
Template:Preprocessor_precedence_10
!! text
;1: -{R|raw}-
;2: -{{Bullet}}-
;3: -{{{1}}}-
;4: -{{{{2}}}}-
;5: -{{{{{3}}}}}-
;6: -{{{{{{4}}}}}}-
;7: -{{{{{{{5}}}}}}}-
!! endarticle
!! test
Preprocessor precedence 10: groups of braces with leading dash
!! options
language=zh
!! wikitext
{{Preprocessor precedence 10|Three|raw2|Bullet|1|2}}
!! html
<dl><dt>1</dt>
<dd> raw</dd>
<dt>2</dt>
<dd> -</dd></dl>
<ul><li> Bar-</li></ul>
<dl><dt>3</dt>
<dd> -Three-</dd>
<dt>4</dt>
<dd> raw2</dd>
<dt>5</dt>
<dd> -</dd></dl>
<ul><li> Bar-</li></ul>
<dl><dt>6</dt>
<dd> -Three-</dd>
<dt>7</dt>
<dd> raw2</dd></dl>
!! end
!! test
Preprocessor precedence 11: found during visual diff testing
!! wikitext
{{#tag:span|-{{#tag:span|-{{echo|x}}}}}}
{{echo|-{{echo|-{{echo|x}}}}}}
{{echo|-{{echo|x}}}}
!! html
<p><span>-<span>-x</span></span>
</p><p>--x
</p><p>-x
</p>
!! end
!! test
Preprocessor precedence 12: broken language converter closed by brace.
!! wikitext
This form breaks the template, which is unfortunate:
* {{echo|foo-{bar}bat}}
But if the broken language converter markup is inside an extension
tag, nothing bad happens:
* <nowiki>foo-{bar}bat</nowiki>
* {{echo|<nowiki>foo-{bar}bat</nowiki>}}
* <pre>foo-{bar}bat</pre>
* {{echo|<pre>foo-{bar}bat</pre>}}
<tag>foo-{bar}bat</tag>
{{echo|<tag>foo-{bar}bat</tag>}}
!! html+tidy
<p>This form breaks the template, which is unfortunate:</p>
<ul>
<li>{{echo|foo-{bar}bat}}</li>
</ul>
<p>But if the broken language converter markup is inside an extension tag, nothing bad happens:</p>
<ul>
<li>foo-{bar}bat</li>
<li>foo-{bar}bat</li>
<li>
<pre>
foo-{bar}bat
</pre></li>
<li>
<pre>
foo-{bar}bat
</pre></li>
</ul>
<pre>
'foo-{bar}bat'
array (
)
</pre>
<pre>
'foo-{bar}bat'
array (
)
</pre>
!! end
!! test
Preprocessor precedence, 13: broken language converter in external link
!! wikitext
* [http://example.com/-{foo Example in URL]
* [http://example.com Example in -{link} description]
* {{echo|[http://example.com/-{foo Breaks template, however]}}
!! html+tidy
<ul>
<li><a rel="nofollow" class="external text" href="http://example.com/-{foo">Example in URL</a></li>
<li><a rel="nofollow" class="external text" href="http://example.com">Example in -{link} description</a></li>
<li>{{echo|<a rel="nofollow" class="external text" href="http://example.com/-{foo">Breaks template, however</a>}}</li>
</ul>
!! end
!! test
Preprocessor precedence, 14: broken language converter in comment
!! wikitext
* <!--{{foo}}--> ...should be ok
* <!---{{foo}}--> ...extra dashes
* {{echo|foo<!-- -{bar} -->bat}} ...should be ok
!! html+tidy
<ul>
<li>...should be ok</li>
<li>...extra dashes</li>
<li>foobat ...should be ok</li>
</ul>
!! end
!! test
Preprocessor precedence, 15: broken brace markup in headings
!! wikitext
__NOTOC__ __NOEDITSECTION__
===1 foo[bar 1===
1
===2 foo[[bar 2===
2
===3 foo{bar 3===
3
===4 foo{{bar 4===
4
===5 foo{{{bar 5===
5
===6 foo-{bar 6===
6
!! html+tidy
<h3><span class="mw-headline" id="1_foo.5Bbar_1">1 foo[bar 1</span></h3>
<p>1</p>
<h3><span class="mw-headline" id="2_foo.5B.5Bbar_2">2 foo[[bar 2</span></h3>
<p>2</p>
<h3><span class="mw-headline" id="3_foo.7Bbar_3">3 foo{bar 3</span></h3>
<p>3</p>
<h3><span class="mw-headline" id="4_foo.7B.7Bbar_4">4 foo{{bar 4</span></h3>
<p>4</p>
<h3><span class="mw-headline" id="5_foo.7B.7B.7Bbar_5">5 foo{{{bar 5</span></h3>
<p>5</p>
<h3><span class="mw-headline" id="6_foo-.7Bbar_6">6 foo-{bar 6</span></h3>
<p>6</p>
!! end
###
### Token Stream Patcher tests
###
@ -20946,6 +21266,28 @@ Raw: -{R|zh:China;zh-tw:Taiwan}-
</p>
!! end
!! test
Nested markup inside raw output of variant escape tags (R flag)
!! options
language=zh variant=zh-tw
!! wikitext
Nested raw: -{R|nested -{zh:China;zh-tw:Taiwan}- nested}-
!! html
<p>Nested raw: nested Taiwan nested
</p>
!! end
!! test
Templates inside raw output of variant escape tags (R flag)
!! options
language=zh variant=zh-tw
!! wikitext
Nested raw: -{R|nested {{echo|hi}} templates}-
!! html
<p>Nested raw: nested hi templates
</p>
!! end
!! test
Strings evaluating false shouldn't be ignored by Language converter (T51072)
!! options
@ -21113,12 +21455,10 @@ language=sr variant=sr-ec
</p>
!! end
# FIXME: This test is currently broken in the PHP parser T153761
!! test
T146304: Don't break template parsing if language converter markup is in the parameter.
!! options
language=sr variant=sr-ec
disabled
!! wikitext
{{echo|-{R|foo}-}}
!! html/php