Optimised Sanitizer::removeHTMLtags, Parser::unstrip, Parser::doMagicLinks, Parser::replaceInternalLinks and Parser::replace_callback. Introduced fast abridged entry point Parser::recursiveTagParse, for recursive parsing called from Cite.php and similar extensions. Resolved a known fail parser test.

This commit is contained in:
Tim Starling 2006-08-06 14:01:47 +00:00
parent 6530b5eb12
commit 91006d5acb
3 changed files with 354 additions and 387 deletions

View file

@ -187,6 +187,7 @@ class Parser
* @private
*/
function clearState() {
wfProfileIn( __METHOD__ );
if ( $this->mFirstCall ) {
$this->firstCallInit();
}
@ -228,6 +229,7 @@ class Parser
$this->mForceTocPosition = false;
wfRunHooks( 'ParserClearState', array( &$this ) );
wfProfileOut( __METHOD__ );
}
/**
@ -235,7 +237,7 @@ class Parser
*
* @public
*/
function UniqPrefix() {
function uniqPrefix() {
return $this->mUniqPrefix;
}
@ -282,12 +284,6 @@ class Parser
$text = $this->strip( $text, $x );
wfRunHooks( 'ParserAfterStrip', array( &$this, &$text, &$x ) );
# Hook to suspend the parser in this state
if ( !wfRunHooks( 'ParserBeforeInternalParse', array( &$this, &$text, &$x ) ) ) {
wfProfileOut( $fname );
return $text ;
}
$text = $this->internalParse( $text );
$text = $this->unstrip( $text, $this->mStripState );
@ -358,6 +354,21 @@ class Parser
return $this->mOutput;
}
/**
* Recursive parser entry point that can be called from an extension tag
* hook.
*/
function recursiveTagParse( $text ) {
wfProfileIn( __METHOD__ );
$x =& $this->mStripState;
wfRunHooks( 'ParserBeforeStrip', array( &$this, &$text, &$x ) );
$text = $this->strip( $text, $x );
wfRunHooks( 'ParserAfterStrip', array( &$this, &$text, &$x ) );
$text = $this->internalParse( $text );
wfProfileOut( __METHOD__ );
return $text;
}
/**
* Get a random string
*
@ -395,8 +406,7 @@ class Parser
* @static
*/
function extractTagsAndParams($elements, $text, &$matches, $uniq_prefix = ''){
$rand = Parser::getRandomString();
$n = 1;
static $n = 1;
$stripped = '';
$matches = array();
@ -423,7 +433,7 @@ class Parser
$inside = $p[4];
}
$marker = "$uniq_prefix-$element-$rand" . sprintf('%08X', $n++) . '-QINU';
$marker = "$uniq_prefix-$element-" . sprintf('%08X', $n++) . '-QINU';
$stripped .= $marker;
if ( $close === '/>' ) {
@ -474,6 +484,7 @@ class Parser
* @private
*/
function strip( $text, &$state, $stripcomments = false , $dontstrip = array () ) {
wfProfileIn( __METHOD__ );
$render = ($this->mOutputType == OT_HTML);
# Replace any instances of the placeholders
@ -505,6 +516,7 @@ class Parser
list( $element, $content, $params, $tag ) = $data;
if( $render ) {
$tagName = strtolower( $element );
wfProfileIn( __METHOD__."-render-$tagName" );
switch( $tagName ) {
case '!--':
// Comment
@ -539,14 +551,22 @@ class Parser
throw new MWException( "Invalid call hook $element" );
}
}
wfProfileOut( __METHOD__."-render-$tagName" );
} else {
// Just stripping tags; keep the source
$output = $tag;
}
// Unstrip the output, because unstrip() is no longer recursive so
// it won't do it itself
$output = $this->unstrip( $output, $state );
if( !$stripcomments && $element == '!--' ) {
$commentState[$marker] = $output;
} elseif ( $element == 'html' || $element == 'nowiki' ) {
$state['nowiki'][$marker] = $output;
} else {
$state[$element][$marker] = $output;
$state['general'][$marker] = $output;
}
}
@ -559,6 +579,7 @@ class Parser
$text = strtr( $text, $commentState );
}
wfProfileOut( __METHOD__ );
return $text;
}
@ -569,20 +590,14 @@ class Parser
* @private
*/
function unstrip( $text, &$state ) {
if ( !is_array( $state ) ) {
if ( !isset( $state['general'] ) ) {
return $text;
}
$replacements = array();
foreach( $state as $tag => $contentDict ) {
if( $tag != 'nowiki' && $tag != 'html' ) {
foreach( $contentDict as $uniq => $content ) {
$replacements[$uniq] = $content;
}
}
}
$text = strtr( $text, $replacements );
wfProfileIn( __METHOD__ );
# TODO: good candidate for FSS
$text = strtr( $text, $state['general'] );
wfProfileOut( __METHOD__ );
return $text;
}
@ -592,20 +607,15 @@ class Parser
* @private
*/
function unstripNoWiki( $text, &$state ) {
if ( !is_array( $state ) ) {
if ( !isset( $state['nowiki'] ) ) {
return $text;
}
$replacements = array();
foreach( $state as $tag => $contentDict ) {
if( $tag == 'nowiki' || $tag == 'html' ) {
foreach( $contentDict as $uniq => $content ) {
$replacements[$uniq] = $content;
}
}
}
$text = strtr( $text, $replacements );
wfProfileIn( __METHOD__ );
# TODO: good candidate for FSS
$text = strtr( $text, $state['nowiki'] );
wfProfileOut( __METHOD__ );
return $text;
}
@ -621,7 +631,7 @@ class Parser
if ( !$state ) {
$state = array();
}
$state['item'][$rnd] = $text;
$state['general'][$rnd] = $text;
return $rnd;
}
@ -881,6 +891,12 @@ class Parser
$fname = 'Parser::internalParse';
wfProfileIn( $fname );
# Hook to suspend the parser in this state
if ( !wfRunHooks( 'ParserBeforeInternalParse', array( &$this, &$text, &$x ) ) ) {
wfProfileOut( $fname );
return $text ;
}
# Remove <noinclude> tags and <includeonly> sections
$text = strtr( $text, array( '<onlyinclude>' => '' , '</onlyinclude>' => '' ) );
$text = strtr( $text, array( '<noinclude>' => '', '</noinclude>' => '') );
@ -927,9 +943,52 @@ class Parser
* @private
*/
function &doMagicLinks( &$text ) {
$text = $this->magicISBN( $text );
$text = $this->magicRFC( $text, 'RFC ', 'rfcurl' );
$text = $this->magicRFC( $text, 'PMID ', 'pubmedurl' );
wfProfileIn( __METHOD__ );
$text = preg_replace_callback(
'!(?: # Start cases
<a.*?</a> # Skip link text
<.*?> | # Skip stuff inside HTML elements
(?:RFC|PMID)\s+([0-9]+) | # RFC or PMID, capture number as m[1]
ISBN\s+([0-9Xx-]+) # ISBN, capture number as m[2]
)!x', array( &$this, 'magicLinkCallback' ), $text );
wfProfileOut( __METHOD__ );
return $text;
}
function magicLinkCallback( $m ) {
if ( substr( $m[0], 0, 1 ) == '<' ) {
# Skip HTML element
return $m[0];
} elseif ( substr( $m[0], 0, 4 ) == 'ISBN' ) {
$isbn = $m[2];
$num = strtr( $isbn, array(
'-' => '',
' ' => '',
'x' => 'X',
));
$titleObj = Title::makeTitle( NS_SPECIAL, 'Booksources' );
$text = '<a href="' .
$titleObj->escapeLocalUrl( "isbn=$num" ) .
"\" class=\"internal\">ISBN $isbn</a>";
} else {
if ( substr( $m[0], 0, 3 ) == 'RFC' ) {
$keyword = 'RFC';
$urlmsg = 'rfcurl';
$id = $m[1];
} elseif ( substr( $m[0], 0, 4 ) == 'PMID' ) {
$keyword = 'PMID';
$urlmsg = 'pubmedurl';
$id = $m[1];
} else {
throw new MWException( __METHOD__.': unrecognised match type "' .
substr($m[0], 0, 20 ) . '"' );
}
$url = wfMsg( $urlmsg, $id);
$sk =& $this->mOptions->getSkin();
$la = $sk->getExternalLinkAttributes( $url, $keyword.$id );
$text = "<a href=\"{$url}\"{$la}>{$keyword} {$id}</a>";
}
return $text;
}
@ -1391,7 +1450,7 @@ class Parser
$useLinkPrefixExtension = $wgContLang->linkPrefixExtension();
if( is_null( $this->mTitle ) ) {
throw new MWException( 'nooo' );
throw new MWException( __METHOD__.": \$this->mTitle is null\n" );
}
$nottalk = !$this->mTitle->isTalkPage();
@ -1406,10 +1465,9 @@ class Parser
}
$selflink = $this->mTitle->getPrefixedText();
wfProfileOut( $fname.'-setup' );
$checkVariantLink = sizeof($wgContLang->getVariants())>1;
$useSubpages = $this->areSubpagesAllowed();
wfProfileOut( $fname.'-setup' );
# Loop for each link
for ($k = 0; isset( $a[$k] ); $k++) {
@ -1432,6 +1490,7 @@ class Parser
$might_be_img = false;
wfProfileIn( "$fname-e1" );
if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt
$text = $m[2];
# If we get a ] at the beginning of $m[3] that means we have a link that's something like:
@ -1443,27 +1502,33 @@ class Parser
# and no image is in sight. See bug 2095.
#
if( $text !== '' &&
preg_match( "/^\](.*)/s", $m[3], $n ) &&
substr( $m[3], 0, 1 ) === ']' &&
strpos($text, '[') !== false
)
{
$text .= ']'; # so that replaceExternalLinks($text) works later
$m[3] = $n[1];
$m[3] = substr( $m[3], 1 );
}
# fix up urlencoded title texts
if(preg_match('/%/', $m[1] ))
if( strpos( $m[1], '%' ) !== false ) {
# Should anchors '#' also be rejected?
$m[1] = str_replace( array('<', '>'), array('&lt;', '&gt;'), urldecode($m[1]) );
}
$trail = $m[3];
} elseif( preg_match($e1_img, $line, $m) ) { # Invalid, but might be an image with a link in its caption
$might_be_img = true;
$text = $m[2];
if(preg_match('/%/', $m[1] )) $m[1] = urldecode($m[1]);
if ( strpos( $m[1], '%' ) !== false ) {
$m[1] = urldecode($m[1]);
}
$trail = "";
} else { # Invalid form; output directly
$s .= $prefix . '[[' . $line ;
wfProfileOut( "$fname-e1" );
continue;
}
wfProfileOut( "$fname-e1" );
wfProfileIn( "$fname-misc" );
# Don't allow internal links to pages containing
# PROTO: where PROTO is a valid URL protocol; these
@ -1486,9 +1551,12 @@ class Parser
$link = substr($link, 1);
}
wfProfileOut( "$fname-misc" );
wfProfileIn( "$fname-title" );
$nt = Title::newFromText( $this->unstripNoWiki($link, $this->mStripState) );
if( !$nt ) {
$s .= $prefix . '[[' . $line;
wfProfileOut( "$fname-title" );
continue;
}
@ -1501,23 +1569,26 @@ class Parser
$ns = $nt->getNamespace();
$iw = $nt->getInterWiki();
wfProfileOut( "$fname-title" );
if ($might_be_img) { # if this is actually an invalid link
wfProfileIn( "$fname-might_be_img" );
if ($ns == NS_IMAGE && $noforce) { #but might be an image
$found = false;
while (isset ($a[$k+1]) ) {
#look at the next 'line' to see if we can close it there
$spliced = array_splice( $a, $k + 1, 1 );
$next_line = array_shift( $spliced );
if( preg_match("/^(.*?]].*?)]](.*)$/sD", $next_line, $m) ) {
# the first ]] closes the inner link, the second the image
$m = explode( ']]', $next_line, 3 );
if ( count( $m ) == 3 ) {
# the first ]] closes the inner link, the second the image
$found = true;
$text .= '[[' . $m[1];
$text .= "[[{$m[0]}]]{$m[1]}";
$trail = $m[2];
break;
} elseif( preg_match("/^.*?]].*$/sD", $next_line, $m) ) {
} elseif ( count( $m ) == 2 ) {
#if there's exactly one ]] that's fine, we'll keep looking
$text .= '[[' . $m[0];
$text .= "[[{$m[0]}]]{$m[1]}";
} else {
#if $next_line is invalid too, we need look no further
$text .= '[[' . $next_line;
@ -1528,31 +1599,36 @@ class Parser
# we couldn't find the end of this imageLink, so output it raw
#but don't ignore what might be perfectly normal links in the text we've examined
$text = $this->replaceInternalLinks($text);
$s .= $prefix . '[[' . $link . '|' . $text;
$s .= "{$prefix}[[$link|$text";
# note: no $trail, because without an end, there *is* no trail
wfProfileOut( "$fname-might_be_img" );
continue;
}
} else { #it's not an image, so output it raw
$s .= $prefix . '[[' . $link . '|' . $text;
$s .= "{$prefix}[[$link|$text";
# note: no $trail, because without an end, there *is* no trail
wfProfileOut( "$fname-might_be_img" );
continue;
}
wfProfileOut( "$fname-might_be_img" );
}
$wasblank = ( '' == $text );
if( $wasblank ) $text = $link;
# Link not escaped by : , create the various objects
if( $noforce ) {
# Interwikis
wfProfileIn( "$fname-interwiki" );
if( $iw && $this->mOptions->getInterwikiMagic() && $nottalk && $wgContLang->getLanguageName( $iw ) ) {
$this->mOutput->addLanguageLink( $nt->getFullText() );
$s = rtrim($s . "\n");
$s .= trim($prefix . $trail, "\n") == '' ? '': $prefix . $trail;
wfProfileOut( "$fname-interwiki" );
continue;
}
wfProfileOut( "$fname-interwiki" );
if ( $ns == NS_IMAGE ) {
wfProfileIn( "$fname-image" );
@ -1642,11 +1718,12 @@ class Parser
/**
* Make a link placeholder. The text returned can be later resolved to a real link with
* replaceLinkHolders(). This is done for two reasons: firstly to avoid further
* parsing of interwiki links, and secondly to allow all extistence checks and
* parsing of interwiki links, and secondly to allow all existence checks and
* article length checks (for stub links) to be bundled into a single query.
*
*/
function makeLinkHolder( &$nt, $text = '', $query = '', $trail = '', $prefix = '' ) {
wfProfileIn( __METHOD__ );
if ( ! is_object($nt) ) {
# Fail gracefully
$retVal = "<!-- ERROR -->{$prefix}{$text}{$trail}";
@ -1668,6 +1745,7 @@ class Parser
$retVal = '<!--LINK '. ($nr-1) ."-->{$trail}";
}
}
wfProfileOut( __METHOD__ );
return $retVal;
}
@ -2355,172 +2433,164 @@ class Parser
* '{' => array( # opening parentheses
* 'end' => '}', # closing parentheses
* 'cb' => array(2 => callback, # replacement callback to call if {{..}} is found
* 4 => callback # replacement callback to call if {{{{..}}}} is found
* 3 => callback # replacement callback to call if {{{..}}} is found
* )
* )
* 'min' => 2, # Minimum parenthesis count in cb
* 'max' => 3, # Maximum parenthesis count in cb
* @private
*/
function replace_callback ($text, $callbacks) {
wfProfileIn( __METHOD__ . '-self' );
wfProfileIn( __METHOD__ );
$openingBraceStack = array(); # this array will hold a stack of parentheses which are not closed yet
$lastOpeningBrace = -1; # last not closed parentheses
for ($i = 0; $i < strlen($text); $i++) {
# check for any opening brace
$validOpeningBraces = implode( '', array_keys( $callbacks ) );
$i = 0;
while ( $i < strlen( $text ) ) {
# Find next opening brace, closing brace or pipe
if ( $lastOpeningBrace == -1 ) {
$currentClosing = '';
$search = $validOpeningBraces;
} else {
$currentClosing = $openingBraceStack[$lastOpeningBrace]['braceEnd'];
$search = $validOpeningBraces . '|' . $currentClosing;
}
$rule = null;
$nextPos = -1;
foreach ($callbacks as $key => $value) {
$pos = strpos ($text, $key, $i);
if (false !== $pos && (-1 == $nextPos || $pos < $nextPos)) {
$rule = $value;
$nextPos = $pos;
$i += strcspn( $text, $search, $i );
if ( $i < strlen( $text ) ) {
if ( $text[$i] == '|' ) {
$found = 'pipe';
} elseif ( $text[$i] == $currentClosing ) {
$found = 'close';
} else {
$found = 'open';
$rule = $callbacks[$text[$i]];
}
}
if ($lastOpeningBrace >= 0) {
$pos = strpos ($text, $openingBraceStack[$lastOpeningBrace]['braceEnd'], $i);
if (false !== $pos && (-1 == $nextPos || $pos < $nextPos)){
$rule = null;
$nextPos = $pos;
}
$pos = strpos ($text, '|', $i);
if (false !== $pos && (-1 == $nextPos || $pos < $nextPos)){
$rule = null;
$nextPos = $pos;
}
}
if ($nextPos == -1)
} else {
# All done
break;
}
$i = $nextPos;
# found openning brace, lets add it to parentheses stack
if (null != $rule) {
if ( $found == 'open' ) {
# found opening brace, let's add it to parentheses stack
$piece = array('brace' => $text[$i],
'braceEnd' => $rule['end'],
'count' => 1,
'title' => '',
'parts' => null);
# count openning brace characters
while ($i+1 < strlen($text) && $text[$i+1] == $piece['brace']) {
$piece['count']++;
$i++;
# count opening brace characters
$piece['count'] = strspn( $text, $piece['brace'], $i );
$piece['startAt'] = $piece['partStart'] = $i + $piece['count'];
$i += $piece['count'];
# we need to add to stack only if opening brace count is enough for one of the rules
if ( $piece['count'] >= $rule['min'] ) {
$lastOpeningBrace ++;
$openingBraceStack[$lastOpeningBrace] = $piece;
}
} elseif ( $found == 'close' ) {
# lets check if it is enough characters for closing brace
$maxCount = $openingBraceStack[$lastOpeningBrace]['count'];
$count = strspn( $text, $text[$i], $i, $maxCount );
$piece['startAt'] = $i+1;
$piece['partStart'] = $i+1;
# we need to add to stack only if openning brace count is enough for any given rule
foreach ($rule['cb'] as $cnt => $fn) {
if ($piece['count'] >= $cnt) {
$lastOpeningBrace ++;
$openingBraceStack[$lastOpeningBrace] = $piece;
break;
# check for maximum matching characters (if there are 5 closing
# characters, we will probably need only 3 - depending on the rules)
$matchingCount = 0;
$matchingCallback = null;
$cbType = $callbacks[$openingBraceStack[$lastOpeningBrace]['brace']];
if ( $count > $cbType['max'] ) {
# The specified maximum exists in the callback array, unless the caller
# has made an error
$matchingCount = $cbType['max'];
} else {
# Count is less than the maximum
# Skip any gaps in the callback array to find the true largest match
# Need to use array_key_exists not isset because the callback can be null
$matchingCount = $count;
while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $cbType['cb'] ) ) {
--$matchingCount;
}
}
continue;
}
else if ($lastOpeningBrace >= 0) {
# first check if it is a closing brace
if ($openingBraceStack[$lastOpeningBrace]['braceEnd'] == $text[$i]) {
# lets check if it is enough characters for closing brace
$count = 1;
while ($i+$count < strlen($text) && $text[$i+$count] == $text[$i])
$count++;
# if there are more closing parentheses than opening ones, we parse less
if ($openingBraceStack[$lastOpeningBrace]['count'] < $count)
$count = $openingBraceStack[$lastOpeningBrace]['count'];
# check for maximum matching characters (if there are 5 closing characters, we will probably need only 3 - depending on the rules)
$matchingCount = 0;
$matchingCallback = null;
foreach ($callbacks[$openingBraceStack[$lastOpeningBrace]['brace']]['cb'] as $cnt => $fn) {
if ($count >= $cnt && $matchingCount < $cnt) {
$matchingCount = $cnt;
$matchingCallback = $fn;
}
}
if ($matchingCount == 0) {
$i += $count - 1;
continue;
}
# lets set a title or last part (if '|' was found)
if (null === $openingBraceStack[$lastOpeningBrace]['parts'])
$openingBraceStack[$lastOpeningBrace]['title'] = substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], $i - $openingBraceStack[$lastOpeningBrace]['partStart']);
else
$openingBraceStack[$lastOpeningBrace]['parts'][] = substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], $i - $openingBraceStack[$lastOpeningBrace]['partStart']);
$pieceStart = $openingBraceStack[$lastOpeningBrace]['startAt'] - $matchingCount;
$pieceEnd = $i + $matchingCount;
if( is_callable( $matchingCallback ) ) {
$cbArgs = array (
'text' => substr($text, $pieceStart, $pieceEnd - $pieceStart),
'title' => trim($openingBraceStack[$lastOpeningBrace]['title']),
'parts' => $openingBraceStack[$lastOpeningBrace]['parts'],
'lineStart' => (($pieceStart > 0) && ($text[$pieceStart-1] == "\n")),
);
# finally we can call a user callback and replace piece of text
wfProfileOut( __METHOD__ . '-self' );
$replaceWith = call_user_func( $matchingCallback, $cbArgs );
wfProfileIn( __METHOD__ . '-self' );
$text = substr($text, 0, $pieceStart) . $replaceWith . substr($text, $pieceEnd);
$i = $pieceStart + strlen($replaceWith) - 1;
}
else {
# null value for callback means that parentheses should be parsed, but not replaced
$i += $matchingCount - 1;
}
# reset last openning parentheses, but keep it in case there are unused characters
$piece = array('brace' => $openingBraceStack[$lastOpeningBrace]['brace'],
'braceEnd' => $openingBraceStack[$lastOpeningBrace]['braceEnd'],
'count' => $openingBraceStack[$lastOpeningBrace]['count'],
'title' => '',
'parts' => null,
'startAt' => $openingBraceStack[$lastOpeningBrace]['startAt']);
$openingBraceStack[$lastOpeningBrace--] = null;
if ($matchingCount < $piece['count']) {
$piece['count'] -= $matchingCount;
$piece['startAt'] -= $matchingCount;
$piece['partStart'] = $piece['startAt'];
# do we still qualify for any callback with remaining count?
foreach ($callbacks[$piece['brace']]['cb'] as $cnt => $fn) {
if ($piece['count'] >= $cnt) {
$lastOpeningBrace ++;
$openingBraceStack[$lastOpeningBrace] = $piece;
break;
}
}
}
if ($matchingCount <= 0) {
$i += $count;
continue;
}
$matchingCallback = $cbType['cb'][$matchingCount];
# lets set a title if it is a first separator, or next part otherwise
if ($text[$i] == '|') {
if (null === $openingBraceStack[$lastOpeningBrace]['parts']) {
$openingBraceStack[$lastOpeningBrace]['title'] = substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], $i - $openingBraceStack[$lastOpeningBrace]['partStart']);
$openingBraceStack[$lastOpeningBrace]['parts'] = array();
}
else
$openingBraceStack[$lastOpeningBrace]['parts'][] = substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'], $i - $openingBraceStack[$lastOpeningBrace]['partStart']);
$openingBraceStack[$lastOpeningBrace]['partStart'] = $i + 1;
# let's set a title or last part (if '|' was found)
if (null === $openingBraceStack[$lastOpeningBrace]['parts']) {
$openingBraceStack[$lastOpeningBrace]['title'] =
substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'],
$i - $openingBraceStack[$lastOpeningBrace]['partStart']);
} else {
$openingBraceStack[$lastOpeningBrace]['parts'][] =
substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'],
$i - $openingBraceStack[$lastOpeningBrace]['partStart']);
}
$pieceStart = $openingBraceStack[$lastOpeningBrace]['startAt'] - $matchingCount;
$pieceEnd = $i + $matchingCount;
if( is_callable( $matchingCallback ) ) {
$cbArgs = array (
'text' => substr($text, $pieceStart, $pieceEnd - $pieceStart),
'title' => trim($openingBraceStack[$lastOpeningBrace]['title']),
'parts' => $openingBraceStack[$lastOpeningBrace]['parts'],
'lineStart' => (($pieceStart > 0) && ($text[$pieceStart-1] == "\n")),
);
# finally we can call a user callback and replace piece of text
$replaceWith = call_user_func( $matchingCallback, $cbArgs );
$text = substr($text, 0, $pieceStart) . $replaceWith . substr($text, $pieceEnd);
$i = $pieceStart + strlen($replaceWith);
} else {
# null value for callback means that parentheses should be parsed, but not replaced
$i += $matchingCount;
}
# reset last opening parentheses, but keep it in case there are unused characters
$piece = array('brace' => $openingBraceStack[$lastOpeningBrace]['brace'],
'braceEnd' => $openingBraceStack[$lastOpeningBrace]['braceEnd'],
'count' => $openingBraceStack[$lastOpeningBrace]['count'],
'title' => '',
'parts' => null,
'startAt' => $openingBraceStack[$lastOpeningBrace]['startAt']);
$openingBraceStack[$lastOpeningBrace--] = null;
if ($matchingCount < $piece['count']) {
$piece['count'] -= $matchingCount;
$piece['startAt'] -= $matchingCount;
$piece['partStart'] = $piece['startAt'];
# do we still qualify for any callback with remaining count?
$currentCbList = $callbacks[$piece['brace']]['cb'];
while ( $piece['count'] ) {
if ( array_key_exists( $piece['count'], $currentCbList ) ) {
$lastOpeningBrace++;
$openingBraceStack[$lastOpeningBrace] = $piece;
break;
}
--$piece['count'];
}
}
} elseif ( $found == 'pipe' ) {
# lets set a title if it is a first separator, or next part otherwise
if (null === $openingBraceStack[$lastOpeningBrace]['parts']) {
$openingBraceStack[$lastOpeningBrace]['title'] =
substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'],
$i - $openingBraceStack[$lastOpeningBrace]['partStart']);
$openingBraceStack[$lastOpeningBrace]['parts'] = array();
} else {
$openingBraceStack[$lastOpeningBrace]['parts'][] =
substr($text, $openingBraceStack[$lastOpeningBrace]['partStart'],
$i - $openingBraceStack[$lastOpeningBrace]['partStart']);
}
$openingBraceStack[$lastOpeningBrace]['partStart'] = ++$i;
}
}
wfProfileOut( __METHOD__ . '-self' );
wfProfileOut( __METHOD__ );
return $text;
}
@ -2545,7 +2615,7 @@ class Parser
return $text;
}
$fname = 'Parser::replaceVariables';
$fname = __METHOD__ /*. '-L' . count( $this->mArgStack )*/;
wfProfileIn( $fname );
# This function is called recursively. To keep track of arguments we need a stack:
@ -2558,13 +2628,25 @@ class Parser
if ( $this->mOutputType == OT_HTML || $this->mOutputType == OT_WIKI ) {
$braceCallbacks[3] = array( &$this, 'argSubstitution' );
}
$callbacks = array();
$callbacks['{'] = array('end' => '}', 'cb' => $braceCallbacks);
$callbacks['['] = array('end' => ']', 'cb' => array(2=>null));
$text = $this->replace_callback ($text, $callbacks);
array_pop( $this->mArgStack );
if ( $braceCallbacks ) {
$callbacks = array(
'{' => array(
'end' => '}',
'cb' => $braceCallbacks,
'min' => $argsOnly ? 3 : 2,
'max' => isset( $braceCallbacks[3] ) ? 3 : 2,
),
'[' => array(
'end' => ']',
'cb' => array(2=>null),
'min' => 2,
'max' => 2,
)
);
$text = $this->replace_callback ($text, $callbacks);
array_pop( $this->mArgStack );
}
wfProfileOut( $fname );
return $text;
}
@ -2636,8 +2718,9 @@ class Parser
*/
function braceSubstitution( $piece ) {
global $wgContLang, $wgLang, $wgAllowDisplayTitle, $action;
$fname = 'Parser::braceSubstitution';
$fname = __METHOD__ /*. '-L' . count( $this->mArgStack )*/;
wfProfileIn( $fname );
wfProfileIn( __METHOD__.'-setup' );
# Flags
$found = false; # $text has been filled
@ -2671,8 +2754,10 @@ class Parser
$args = (null == $piece['parts']) ? array() : $piece['parts'];
$argc = count( $args );
wfProfileOut( __METHOD__.'-setup' );
# SUBST
wfProfileIn( __METHOD__.'-modifiers' );
if ( !$found ) {
$mwSubst =& MagicWord::get( 'subst' );
if ( $mwSubst->matchStartAndRemove( $part1 ) xor ($this->mOutputType == OT_WIKI) ) {
@ -2714,6 +2799,7 @@ class Parser
}
}
}
wfProfileOut( __METHOD__.'-modifiers' );
# Parser functions
if ( !$found ) {
@ -2776,7 +2862,7 @@ class Parser
$text = $linestart .
'{{' . $part1 . '}}' .
'<!-- WARNING: template loop detected -->';
wfDebug( "$fname: template loop broken at '$part1'\n" );
wfDebug( __METHOD__.": template loop broken at '$part1'\n" );
} else {
# set $text to cached message.
$text = $linestart . $this->mTemplates[$piece['title']];
@ -3374,137 +3460,6 @@ class Parser
}
}
/**
* Return an HTML link for the "ISBN 123456" text
* @private
*/
function magicISBN( $text ) {
$fname = 'Parser::magicISBN';
wfProfileIn( $fname );
$a = split( 'ISBN ', ' '.$text );
if ( count ( $a ) < 2 ) {
wfProfileOut( $fname );
return $text;
}
$text = substr( array_shift( $a ), 1);
$valid = '0123456789-Xx';
foreach ( $a as $x ) {
# hack: don't replace inside thumbnail title/alt
# attributes
if(preg_match('/<[^>]+(alt|title)="[^">]*$/', $text)) {
$text .= "ISBN $x";
continue;
}
$isbn = $blank = '' ;
while ( $x !== '' && ' ' == $x{0} ) {
$blank .= ' ';
$x = substr( $x, 1 );
}
if ( $x == '' ) { # blank isbn
$text .= "ISBN $blank";
continue;
}
while ( strstr( $valid, $x{0} ) != false ) {
$isbn .= $x{0};
$x = substr( $x, 1 );
}
$num = str_replace( '-', '', $isbn );
$num = str_replace( ' ', '', $num );
$num = str_replace( 'x', 'X', $num );
if ( '' == $num ) {
$text .= "ISBN $blank$x";
} else {
$titleObj = Title::makeTitle( NS_SPECIAL, 'Booksources' );
$text .= '<a href="' .
$titleObj->escapeLocalUrl( 'isbn='.$num ) .
"\" class=\"internal\">ISBN $isbn</a>";
$text .= $x;
}
}
wfProfileOut( $fname );
return $text;
}
/**
* Return an HTML link for the "RFC 1234" text
*
* @private
* @param string $text Text to be processed
* @param string $keyword Magic keyword to use (default RFC)
* @param string $urlmsg Interface message to use (default rfcurl)
* @return string
*/
function magicRFC( $text, $keyword='RFC ', $urlmsg='rfcurl' ) {
$valid = '0123456789';
$internal = false;
$a = split( $keyword, ' '.$text );
if ( count ( $a ) < 2 ) {
return $text;
}
$text = substr( array_shift( $a ), 1);
/* Check if keyword is preceed by [[.
* This test is made here cause of the array_shift above
* that prevent the test to be done in the foreach.
*/
if ( substr( $text, -2 ) == '[[' ) {
$internal = true;
}
foreach ( $a as $x ) {
/* token might be empty if we have RFC RFC 1234 */
if ( $x=='' ) {
$text.=$keyword;
continue;
}
# hack: don't replace inside thumbnail title/alt
# attributes
if(preg_match('/<[^>]+(alt|title)="[^">]*$/', $text)) {
$text .= $keyword . $x;
continue;
}
$id = $blank = '' ;
/** remove and save whitespaces in $blank */
while ( $x{0} == ' ' ) {
$blank .= ' ';
$x = substr( $x, 1 );
}
/** remove and save the rfc number in $id */
while ( strstr( $valid, $x{0} ) != false ) {
$id .= $x{0};
$x = substr( $x, 1 );
}
if ( $id == '' ) {
/* call back stripped spaces*/
$text .= $keyword.$blank.$x;
} elseif( $internal ) {
/* normal link */
$text .= $keyword.$id.$x;
} else {
/* build the external link*/
$url = wfMsg( $urlmsg, $id);
$sk =& $this->mOptions->getSkin();
$la = $sk->getExternalLinkAttributes( $url, $keyword.$id );
$text .= "<a href=\"{$url}\"{$la}>{$keyword}{$id}</a>{$x}";
}
/* Check if the next RFC keyword is preceed by [[ */
$internal = ( substr($x,-2) == '[[' );
}
return $text;
}
/**
* Transform wiki markup when saving a page by doing \r\n -> \n
* conversion, substitting signatures, {{subst:}} templates, etc.

View file

@ -329,75 +329,88 @@ class Sanitizer {
*/
static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
global $wgUseTidy, $wgUserHtml;
$fname = 'Parser::removeHTMLtags';
wfProfileIn( $fname );
if( $wgUserHtml ) {
$htmlpairs = array( # Tags that must be closed
'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
);
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
);
$htmlsingleonly = array( # Elements that cannot have close tags
'br', 'hr'
);
$htmlnest = array( # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
);
$tabletags = array( # Can only appear inside table
'td', 'th', 'tr',
);
$htmllist = array( # Tags used by list
'ul','ol',
);
$listtags = array( # Tags that can appear in a list
'li',
);
static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
$htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
wfProfileIn( __METHOD__ );
if ( !$staticInitialised ) {
if( $wgUserHtml ) {
$htmlpairs = array( # Tags that must be closed
'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
);
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
);
$htmlsingleonly = array( # Elements that cannot have close tags
'br', 'hr'
);
$htmlnest = array( # Tags that can be nested--??
'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
);
$tabletags = array( # Can only appear inside table
'td', 'th', 'tr',
);
$htmllist = array( # Tags used by list
'ul','ol',
);
$listtags = array( # Tags that can appear in a list
'li',
);
} else {
$htmlpairs = array();
$htmlsingle = array();
$htmlnest = array();
$tabletags = array();
} else {
$htmlpairs = array();
$htmlsingle = array();
$htmlnest = array();
$tabletags = array();
}
$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
# Convert them all to hashtables for faster lookup
$vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
foreach ( $vars as $var ) {
$$var = array_flip( $$var );
}
$staticInitialised = true;
}
$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
# Remove HTML comments
$text = Sanitizer::removeHTMLcomments( $text );
$bits = explode( '<', $text );
$text = array_shift( $bits );
if(!$wgUseTidy) {
$tagstack = array(); $tablestack = array();
$tagstack = $tablestack = array();
foreach ( $bits as $x ) {
$prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs );
list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
error_reporting( $prev );
$badtag = 0 ;
if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
# Check our stack
if ( $slash ) {
# Closing a tag...
if( in_array( $t, $htmlsingleonly ) ) {
if( isset( $htmlsingleonly[$t] ) ) {
$badtag = 1;
} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
if ( in_array($ot, $htmlsingleallowed) ) {
if ( isset( $htmlsingleallowed[$ot] ) ) {
# Pop all elements with an optional close tag
# and see if we find a match below them
$optstack = array();
array_push ($optstack, $ot);
while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
in_array($ot, $htmlsingleallowed) ) {
isset( $htmlsingleallowed[$ot] ) )
{
array_push ($optstack, $ot);
}
if ( $t != $ot ) {
@ -410,7 +423,7 @@ class Sanitizer {
} else {
@array_push( $tagstack, $ot );
# <li> can be nested in <ul> or <ol>, skip those cases:
if(!(in_array($ot, $htmllist) && in_array($t, $listtags) )) {
if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
$badtag = 1;
}
}
@ -422,20 +435,20 @@ class Sanitizer {
$newparams = '';
} else {
# Keep track for later
if ( in_array( $t, $tabletags ) &&
if ( isset( $tabletags[$t] ) &&
! in_array( 'table', $tagstack ) ) {
$badtag = 1;
} else if ( in_array( $t, $tagstack ) &&
! in_array ( $t , $htmlnest ) ) {
! isset( $htmlnest [$t ] ) ) {
$badtag = 1 ;
# Is it a self closed htmlpair ? (bug 5487)
} else if( $brace == '/>' &&
in_array($t, $htmlpairs) ) {
isset( $htmlpairs[$t] ) ) {
$badtag = 1;
} elseif( in_array( $t, $htmlsingleonly ) ) {
} elseif( isset( $htmlsingleonly[$t] ) ) {
# Hack to force empty tag for uncloseable elements
$brace = '/>';
} else if( in_array( $t, $htmlsingle ) ) {
} else if( isset( $htmlsingle[$t] ) ) {
# Hack to not close $htmlsingle tags
$brace = NULL;
} else {
@ -475,7 +488,7 @@ class Sanitizer {
preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
@list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
if( is_callable( $processCallback ) ) {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
@ -487,7 +500,7 @@ class Sanitizer {
}
}
}
wfProfileOut( $fname );
wfProfileOut( __METHOD__ );
return $text;
}
@ -502,8 +515,7 @@ class Sanitizer {
* @return string
*/
static function removeHTMLcomments( $text ) {
$fname='Parser::removeHTMLcomments';
wfProfileIn( $fname );
wfProfileIn( __METHOD__ );
while (($start = strpos($text, '<!--')) !== false) {
$end = strpos($text, '-->', $start + 4);
if ($end === false) {
@ -533,7 +545,7 @@ class Sanitizer {
$text = substr_replace($text, '', $start, $end - $start);
}
}
wfProfileOut( $fname );
wfProfileOut( __METHOD__ );
return $text;
}

View file

@ -2808,7 +2808,7 @@ Thumbnail image caption with a free URL
!! end
!! test
TODO: BUG 1887: A ISBN with a thumbnail
BUG 1887: A ISBN with a thumbnail
!! input
[[Image:foobar.jpg|thumb|ISBN 12354]]
!! result