Normalize nbsp and similar characters in section IDs

Bug: T90902
Change-Id: I71bdb7dd43c3e532287290e3c691d9739da45475
This commit is contained in:
Max Semenik 2017-11-02 19:35:11 -07:00
parent 6f9738d832
commit 129067c907
3 changed files with 39 additions and 0 deletions

View file

@ -41,6 +41,7 @@ production.
* …
=== Bug fixes in 1.31 ===
* (T90902) Non-breaking space in header ID breaks anchor
* …
=== Action API changes in 1.31 ===

View file

@ -4206,6 +4206,9 @@ class Parser {
# Decode HTML entities
$safeHeadline = Sanitizer::decodeCharReferences( $safeHeadline );
$safeHeadline = $this->normalizeSectionName( $safeHeadline );
$fallbackHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_FALLBACK );
$linkAnchor = Sanitizer::escapeIdForLink( $safeHeadline );
$safeHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_PRIMARY );
@ -5767,6 +5770,8 @@ class Parser {
$text = $this->stripSectionName( $text );
$text = Sanitizer::normalizeSectionNameWhitespace( $text );
$text = Sanitizer::decodeCharReferences( $text );
$text = $this->normalizeSectionName( $text );
return '#' . Sanitizer::escapeIdForLink( $text );
}
@ -5786,6 +5791,7 @@ class Parser {
$text = $this->stripSectionName( $text );
$text = Sanitizer::normalizeSectionNameWhitespace( $text );
$text = Sanitizer::decodeCharReferences( $text );
$text = $this->normalizeSectionName( $text );
if ( isset( $wgFragmentMode[1] ) && $wgFragmentMode[1] === 'legacy' ) {
// ForAttribute() and ForLink() are the same for legacy encoding
@ -5797,6 +5803,24 @@ class Parser {
return "#$id";
}
/**
 * Normalize a section name the way code building links to that section
 * would, so that the section ID and the link fragment agree (T90902).
 *
 * The name is handed to the title parser as a bare fragment ("#name") and
 * the fragment the parser reports back is returned.
 *
 * @param string $text Section name to normalize
 * @return string The 'fragment' entry produced by the title parser, or
 *  $text unchanged when the title parser throws MalformedTitleException
 */
private function normalizeSectionName( $text ) {
	$parser = MediaWikiServices::getInstance()->getTitleParser();
	try {
		$fragment = $parser->splitTitleString( '#' . $text )['fragment'];
	} catch ( MalformedTitleException $e ) {
		# The title parser rejected the string: keep the name as given
		return $text;
	}
	return $fragment;
}
/**
* Strips a text string of wikitext for use in a section anchor
*

View file

@ -29536,3 +29536,17 @@ wgFragmentMode=[ 'html5' ]
</p><p><a href="#啤酒">#啤酒</a> <a href="#啤酒">#啤酒</a>
</p>
!! end
!! test
T90902: Normalize weird characters in section IDs
!! config
wgFragmentMode=[ 'html5', 'legacy' ]
!! wikitext
== Foo&nbsp;bar ==
[[#Foo&nbsp;bar]]
!! html/php
<h2><span class="mw-headline" id="Foo_bar">Foo&#160;bar</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: Foo bar">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
<p><a href="#Foo_bar">#Foo&#160;bar</a>
</p>
!! end