Reduce code duplication correctly this time, again

The test cases I thought up are at:

http://www.mediawiki.org/wiki/User:Simetrical/Id_tests

All of them pass with the patch, except for some that fail on current
code as well: the ones involving templates, multiply-occurring section
headers, or numeric id's (there seems to be a weird bug with those that
probably involves string and numeric id's being used in the same array).
This is true whether $wgEnforceHtmlIds is on or off.  (Actually, the
problem with numeric keys doesn't happen with $wgEnforceHtmlIds off,
because of course numeric ids aren't allowed then.)
This commit is contained in:
Aryeh Gregor 2009-01-08 23:59:25 +00:00
parent a445dd5e11
commit f7486d8642

View file

@ -3448,7 +3448,7 @@ class Parser
* @private
*/
function formatHeadings( $text, $isMain=true ) {
global $wgMaxTocLevel, $wgContLang, $wgEnforceHtmlIds;
global $wgMaxTocLevel, $wgContLang;
$doNumberHeadings = $this->mOptions->getNumberHeadings();
$showEditLink = $this->mOptions->getEditSection();
@ -3593,71 +3593,17 @@ class Parser
}
}
# The safe header is a version of the header text safe to use for links
# Avoid insertion of weird stuff like <math> by expanding the relevant sections
$safeHeadline = $this->mStripState->unstripBoth( $headline );
# Remove link placeholders by the link text.
# <!--LINK number-->
# turns into
# link text with suffix
$safeHeadline = $this->replaceLinkHoldersText( $safeHeadline );
# Strip out HTML (other than plain <sup> and <sub>: bug 8393)
$tocline = preg_replace(
array( '#<(?!/?(sup|sub)).*?'.'>#', '#<(/?(sup|sub)).*?'.'>#' ),
array( '', '<$1>'),
$safeHeadline
);
$tocline = trim( $tocline );
# For the anchor, strip out HTML-y stuff period
$safeHeadline = preg_replace( '/<.*?'.'>/', '', $safeHeadline );
$safeHeadline = trim( $safeHeadline );
# Save headline for section edit hint before it's escaped
$headlineHint = $safeHeadline;
if ( $wgEnforceHtmlIds ) {
$legacyHeadline = false;
$safeHeadline = Sanitizer::escapeId( $safeHeadline,
'noninitial' );
} else {
# For reverse compatibility, provide an id that's
# HTML4-compatible, like we used to.
#
# It may be worth noting, academically, that it's possible for
# the legacy anchor to conflict with a non-legacy headline
# anchor on the page. In this case likely the "correct" thing
# would be to either drop the legacy anchors or make sure
# they're numbered first. However, this would require people
# to type in section names like "abc_.D7.93.D7.90.D7.A4"
# manually, so let's not bother worrying about it.
$legacyHeadline = Sanitizer::escapeId( $safeHeadline,
'noninitial' );
$safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );
if ( $legacyHeadline == $safeHeadline ) {
# No reason to have both (in fact, we can't)
$legacyHeadline = false;
} elseif ( $legacyHeadline != Sanitizer::escapeId(
$legacyHeadline, 'xml' ) ) {
# The legacy id is invalid XML. We used to allow this, but
# there's no reason to do so anymore. Backward
# compatibility will fail slightly in this case, but it's
# no big deal.
$legacyHeadline = false;
}
}
list( $anchor, $legacyAnchor, $tocline, $headlineHint ) =
$this->processHeadingText( $headline );
# HTML names must be case-insensitively unique (bug 10721). FIXME:
# Does this apply to Unicode characters? Because we aren't
# handling those here.
$arrayKey = strtolower( $safeHeadline );
if ( $legacyHeadline === false ) {
$arrayKey = strtolower( $anchor );
if ( $legacyAnchor === false ) {
$legacyArrayKey = false;
} else {
$legacyArrayKey = strtolower( $legacyHeadline );
$legacyArrayKey = strtolower( $legacyAnchor );
}
# count how many in assoc. array so we can track dupes in anchors
@ -3679,12 +3625,10 @@ class Parser
}
# Create the anchor for linking from the TOC to the section
$anchor = $safeHeadline;
$legacyAnchor = $legacyHeadline;
if ( $refers[$arrayKey] > 1 ) {
$anchor .= '_' . $refers[$arrayKey];
}
if ( $legacyHeadline !== false && $refers[$legacyArrayKey] > 1 ) {
if ( $legacyAnchor !== false && $refers[$legacyArrayKey] > 1 ) {
$legacyAnchor .= '_' . $refers[$legacyArrayKey];
}
if( $enoughToc && ( !isset($wgMaxTocLevel) || $toclevel<$wgMaxTocLevel ) ) {
@ -3756,6 +3700,70 @@ class Parser
}
}
private function processHeadingText( $headline ) {
global $wgEnforceHtmlIds;
# The safe header is a version of the header text safe to use for links
# Avoid insertion of weird stuff like <math> by expanding the relevant sections
$safeHeadline = $this->mStripState->unstripBoth( $headline );
# Remove link placeholders by the link text.
# <!--LINK number-->
# turns into
# link text with suffix
$safeHeadline = $this->replaceLinkHoldersText( $safeHeadline );
# Strip out HTML (other than plain <sup> and <sub>: bug 8393)
$tocline = preg_replace(
array( '#<(?!/?(sup|sub)).*?'.'>#', '#<(/?(sup|sub)).*?'.'>#' ),
array( '', '<$1>'),
$safeHeadline
);
$tocline = trim( $tocline );
# For the anchor, strip out HTML-y stuff period
$safeHeadline = preg_replace( '/<.*?'.'>/', '', $safeHeadline );
$safeHeadline = trim( $safeHeadline );
# Save headline for section edit hint before it's escaped
$headlineHint = $safeHeadline;
if ( $wgEnforceHtmlIds ) {
$legacyHeadline = false;
$safeHeadline = Sanitizer::escapeId( $safeHeadline,
'noninitial' );
} else {
# For reverse compatibility, provide an id that's
# HTML4-compatible, like we used to.
#
# It may be worth noting, academically, that it's possible for
# the legacy anchor to conflict with a non-legacy headline
# anchor on the page. In this case likely the "correct" thing
# would be to either drop the legacy anchors or make sure
# they're numbered first. However, this would require people
# to type in section names like "abc_.D7.93.D7.90.D7.A4"
# manually, so let's not bother worrying about it.
$legacyHeadline = Sanitizer::escapeId( $safeHeadline,
'noninitial' );
$safeHeadline = Sanitizer::escapeId( $safeHeadline, 'xml' );
if ( $legacyHeadline == $safeHeadline ) {
# No reason to have both (in fact, we can't)
$legacyHeadline = false;
} elseif ( $legacyHeadline != Sanitizer::escapeId(
$legacyHeadline, 'xml' ) ) {
# The legacy id is invalid XML. We used to allow this, but
# there's no reason to do so anymore. Backward
# compatibility will fail slightly in this case, but it's
# no big deal.
$legacyHeadline = false;
}
}
return array( $safeHeadline, $legacyHeadline, $tocline,
$headlineHint );
}
/**
* Transform wiki markup when saving a page by doing \r\n -> \n
* conversion, substitting signatures, {{subst:}} templates, etc.
@ -4736,21 +4744,9 @@ class Parser
* "== Header ==".
*/
public function guessSectionNameFromWikiText( $text ) {
# Strip out wikitext links(they break the anchor)
$text = $this->stripSectionName( $text );
$headline = Sanitizer::decodeCharReferences( $text );
# strip out HTML
$headline = StringUtils::delimiterReplace( '<', '>', '', $headline );
$headline = trim( $headline );
$sectionanchor = '#' . urlencode( str_replace( ' ', '_', $headline ) );
$replacearray = array(
'%3A' => ':',
'%' => '.'
);
return str_replace(
array_keys( $replacearray ),
array_values( $replacearray ),
$sectionanchor );
list( $text, /* unneeded here */ ) = $this->processHeadingText( $text );
return "#$text";
}
/**