wiki.techinc.nl/includes/parser/LinkHolderArray.php
Tim Starling a20350dd31 * Rewrote StripState to not use ReplacementArray. The memory usage of FSS was excessive when there were many (>10k) strip items. I used preg_replace_callback(), which is slower than strtr() in the simplest case, but much faster than it when the markers have different lengths, which they usually do.
* It was not necessary to preserve the $stripState->general->setPair() interface since it wasn't used by any extensions.
* Moved StripState to its own file.
* Refactored serialiseHalfParsedText() and unserialiseHalfParsedText() so that the bulk of the functionality is in the relevant modules, instead of using scary direct access to object member variables. Made it support the new StripState. It seemed like a lot of work to go to to support an "emergency optimisation" feature in Cite. Cite updates will be in a subsequent commit.
* Fixed spelling of serialiseHalfParsedText() and unserialiseHalfParsedText(), there is unavoidable interface breakage anyway, due to cache object versioning. 
* Moved transparent tags to their own function, as requested in a fixme comment.
* Added documentation for markerSkipCallback().
* Removed OnlyIncludeReplacer, unused since MW 1.12.
2011-02-23 06:58:15 +00:00

563 lines
16 KiB
PHP

<?php
/**
* Holder of replacement pairs for wiki links
*
* @file
*/
/**
* @ingroup Parser
*/
class LinkHolderArray {
	/**
	 * Internal (local wiki) link entries, indexed as
	 * $internals[$namespace][$linkId]. Entries created by makeHolder() carry
	 * the keys 'title', 'text', 'pdbk' and, optionally, 'query'.
	 */
	public $internals = array();

	/**
	 * Interwiki link entries, indexed by link ID. Entries have the same
	 * layout as those in $internals.
	 */
	public $interwikis = array();

	/** Number of holders counted by makeHolder() and merge(); read by isBig(). */
	public $size = 0;

	/**
	 * Owner object handed to the constructor. This class calls nextLinkID(),
	 * setLinkID(), getOptions() and getOutput() on it and reads its mTitle
	 * member (presumably a Parser -- confirm against callers).
	 */
	public $parent;

	/** ID offset shared with mergeForeignCallback() while mergeForeign() runs. */
	protected $tempIdOffset;

	/**
	 * @param $parent Object that hands out link IDs and provides options/output
	 */
	public function __construct( $parent ) {
		$this->parent = $parent;
	}

	/**
	 * Reduce memory usage to reduce the impact of circular references
	 */
	public function __destruct() {
		foreach ( $this as $name => $value ) {
			unset( $this->$name );
		}
	}

	/**
	 * Don't serialize the parent object, it is big, and not needed when it is
	 * a parameter to mergeForeign(), which is the only application of
	 * serializing at present.
	 *
	 * @return Array of property names to serialize
	 */
	public function __sleep() {
		return array( 'internals', 'interwikis', 'size' );
	}

	/**
	 * Merge another LinkHolderArray into this one. Keys are preserved; where
	 * both arrays hold the same key, the entry already present here wins
	 * (array union semantics).
	 *
	 * @param $other LinkHolderArray
	 */
	public function merge( $other ) {
		foreach ( $other->internals as $ns => $entries ) {
			$this->size += count( $entries );
			if ( !isset( $this->internals[$ns] ) ) {
				$this->internals[$ns] = $entries;
			} else {
				$this->internals[$ns] += $entries;
			}
		}
		$this->interwikis += $other->interwikis;
	}

	/**
	 * Merge a LinkHolderArray from another parser instance into this one. The
	 * keys will not be preserved. Any text which went with the old
	 * LinkHolderArray and needs to work with the new one should be passed in
	 * the $texts array. The strings in this array will have their link holders
	 * converted for use in the destination link holder. The resulting array of
	 * strings will be returned.
	 *
	 * @param $other LinkHolderArray
	 * @param $texts Array of strings
	 * @return Array
	 */
	public function mergeForeign( $other, $texts ) {
		$this->tempIdOffset = $idOffset = $this->parent->nextLinkID();
		# Start from the offset itself so that merging an empty holder can
		# never move the parent's ID counter backwards.
		$maxId = $idOffset;

		# Renumber internal links
		foreach ( $other->internals as $ns => $nsLinks ) {
			foreach ( $nsLinks as $key => $entry ) {
				$newKey = $idOffset + $key;
				$this->internals[$ns][$newKey] = $entry;
				$maxId = $newKey > $maxId ? $newKey : $maxId;
			}
		}
		# The pattern needs explicit delimiters: without them PCRE treats the
		# first parenthesised group as a bracket-delimited pattern and the
		# remainder as (invalid) modifiers.
		$texts = preg_replace_callback( '/(<!--LINK \d+:)(\d+)(-->)/',
			array( $this, 'mergeForeignCallback' ), $texts );

		# Renumber interwiki links
		foreach ( $other->interwikis as $key => $entry ) {
			$newKey = $idOffset + $key;
			$this->interwikis[$newKey] = $entry;
			$maxId = $newKey > $maxId ? $newKey : $maxId;
		}
		$texts = preg_replace_callback( '/(<!--IWLINK )(\d+)(-->)/',
			array( $this, 'mergeForeignCallback' ), $texts );

		# Set the parent link ID to be beyond the highest used ID, so that the
		# next holder made by the parent cannot collide with a merged one
		$this->parent->setLinkID( $maxId + 1 );
		$this->tempIdOffset = null;
		return $texts;
	}

	/**
	 * Callback for mergeForeign(): shift the numeric link ID captured in the
	 * second group by the offset of the merge in progress.
	 *
	 * @param $m Array of regex matches (prefix, ID, suffix in groups 1-3)
	 * @return String
	 */
	protected function mergeForeignCallback( $m ) {
		return $m[1] . ( $m[2] + $this->tempIdOffset ) . $m[3];
	}

	/**
	 * Get a subset of the current LinkHolderArray which is sufficient to
	 * interpret the given text.
	 *
	 * @param $text String containing link placeholders
	 * @return LinkHolderArray sharing this object's parent
	 */
	public function getSubArray( $text ) {
		$sub = new LinkHolderArray( $this->parent );

		# Internal links
		$pos = 0;
		while ( $pos < strlen( $text ) ) {
			if ( !preg_match( '/<!--LINK (\d+):(\d+)-->/',
				$text, $m, PREG_OFFSET_CAPTURE, $pos ) )
			{
				break;
			}
			$ns = $m[1][0];
			$key = $m[2][0];
			$sub->internals[$ns][$key] = $this->internals[$ns][$key];
			# Continue searching just after the placeholder that was found
			$pos = $m[0][1] + strlen( $m[0][0] );
		}

		# Interwiki links
		$pos = 0;
		while ( $pos < strlen( $text ) ) {
			if ( !preg_match( '/<!--IWLINK (\d+)-->/', $text, $m, PREG_OFFSET_CAPTURE, $pos ) ) {
				break;
			}
			$key = $m[1][0];
			$sub->interwikis[$key] = $this->interwikis[$key];
			$pos = $m[0][1] + strlen( $m[0][0] );
		}
		return $sub;
	}

	/**
	 * Returns true if the memory requirements of this object are getting large
	 *
	 * @return Boolean true when $size exceeds $wgLinkHolderBatchSize
	 */
	public function isBig() {
		global $wgLinkHolderBatchSize;
		return $this->size > $wgLinkHolderBatchSize;
	}

	/**
	 * Clear all stored link holders.
	 * Make sure you don't have any text left using these link holders, before you call this
	 */
	public function clear() {
		$this->internals = array();
		$this->interwikis = array();
		$this->size = 0;
	}

	/**
	 * Make a link placeholder. The text returned can be later resolved to a real link with
	 * replaceLinkHolders(). This is done for two reasons: firstly to avoid further
	 * parsing of interwiki links, and secondly to allow all existence checks and
	 * article length checks (for stub links) to be bundled into a single query.
	 *
	 * @param $nt Title
	 * @param $text String link text
	 * @param $query String stored with the entry when not empty
	 * @param $trail String text following the link; split by Linker::splitTrail()
	 * @param $prefix String text prepended to the link text
	 * @return String placeholder comment followed by the remaining trail
	 */
	public function makeHolder( $nt, $text = '', $query = '', $trail = '', $prefix = '' ) {
		wfProfileIn( __METHOD__ );
		if ( ! is_object($nt) ) {
			# Fail gracefully
			$retVal = "<!-- ERROR -->{$prefix}{$text}{$trail}";
		} else {
			# Separate the link trail from the rest of the link
			list( $inside, $trail ) = Linker::splitTrail( $trail );

			$entry = array(
				'title' => $nt,
				'text' => $prefix.$text.$inside,
				'pdbk' => $nt->getPrefixedDBkey(),
			);
			if ( $query !== '' ) {
				$entry['query'] = $query;
			}

			if ( $nt->isExternal() ) {
				// Use a globally unique ID to keep the objects mergable
				$key = $this->parent->nextLinkID();
				$this->interwikis[$key] = $entry;
				$retVal = "<!--IWLINK $key-->{$trail}";
			} else {
				$key = $this->parent->nextLinkID();
				$ns = $nt->getNamespace();
				$this->internals[$ns][$key] = $entry;
				$retVal = "<!--LINK $ns:$key-->{$trail}";
			}
			$this->size++;
		}
		wfProfileOut( __METHOD__ );
		return $retVal;
	}

	/**
	 * FIXME: update documentation. makeLinkObj() is deprecated.
	 * Replace <!--LINK--> link placeholders with actual links, in the buffer
	 * Placeholders created in Skin::makeLinkObj()
	 * Returns an array of link CSS classes, indexed by PDBK.
	 *
	 * @param $text String, modified in place
	 * @return Array of link CSS classes, indexed by PDBK
	 */
	public function replace( &$text ) {
		wfProfileIn( __METHOD__ );

		$colours = $this->replaceInternal( $text );
		$this->replaceInterwiki( $text );

		wfProfileOut( __METHOD__ );
		return $colours;
	}

	/**
	 * Replace internal links
	 *
	 * @param $text String, modified in place
	 * @return Array of link CSS classes, indexed by PDBK (empty when there
	 *   are no internal links)
	 */
	protected function replaceInternal( &$text ) {
		if ( !$this->internals ) {
			return array();
		}

		wfProfileIn( __METHOD__ );
		global $wgContLang;

		$colours = array();
		$sk = $this->parent->getOptions()->getSkin( $this->parent->mTitle );
		$linkCache = LinkCache::singleton();
		$output = $this->parent->getOutput();

		wfProfileIn( __METHOD__.'-check' );
		$dbr = wfGetDB( DB_SLAVE );
		$page = $dbr->tableName( 'page' );
		$threshold = $this->parent->getOptions()->getStubThreshold();

		# Sort by namespace
		ksort( $this->internals );

		$linkcolour_ids = array();

		# Generate query
		$query = false;
		$current = null;
		foreach ( $this->internals as $ns => $entries ) {
			foreach ( $entries as $entry ) {
				$title = $entry['title'];
				$pdbk = $entry['pdbk'];

				# Skip invalid entries.
				# Result will be ugly, but prevents crash.
				if ( is_null( $title ) ) {
					continue;
				}

				# Check if it's a static known link, e.g. interwiki
				if ( $title->isAlwaysKnown() ) {
					$colours[$pdbk] = '';
				} elseif ( $ns == NS_SPECIAL ) {
					$colours[$pdbk] = 'new';
				} elseif ( ( $id = $linkCache->getGoodLinkID( $pdbk ) ) != 0 ) {
					$colours[$pdbk] = $sk->getLinkColour( $title, $threshold );
					$output->addLink( $title, $id );
					$linkcolour_ids[$id] = $pdbk;
				} elseif ( $linkCache->isBadLink( $pdbk ) ) {
					$colours[$pdbk] = 'new';
				} else {
					# Not in the link cache, add it to the query
					if ( !isset( $current ) ) {
						$current = $ns;
						$query = "SELECT page_id, page_namespace, page_title, page_is_redirect, page_len, page_latest";
						$query .= " FROM $page WHERE (page_namespace=$ns AND page_title IN(";
					} elseif ( $current != $ns ) {
						# Namespaces are sorted, so a change of namespace
						# closes the previous group and opens a new one
						$current = $ns;
						$query .= ")) OR (page_namespace=$ns AND page_title IN(";
					} else {
						$query .= ', ';
					}

					$query .= $dbr->addQuotes( $title->getDBkey() );
				}
			}
		}
		if ( $query ) {
			$query .= '))';

			$res = $dbr->query( $query, __METHOD__ );

			# Fetch data and form into an associative array
			# non-existent = broken
			foreach ( $res as $s ) {
				$title = Title::makeTitle( $s->page_namespace, $s->page_title );
				$pdbk = $title->getPrefixedDBkey();
				$linkCache->addGoodLinkObj( $s->page_id, $title, $s->page_len, $s->page_is_redirect, $s->page_latest );
				$output->addLink( $title, $s->page_id );
				# FIXME: convoluted data flow
				# The redirect status and length is passed to getLinkColour via the LinkCache
				# Use formal parameters instead
				$colours[$pdbk] = $sk->getLinkColour( $title, $threshold );
				//add id to the extension todolist
				$linkcolour_ids[$s->page_id] = $pdbk;
			}
			unset( $res );
		}
		if ( count($linkcolour_ids) ) {
			//pass an array of page_ids to an extension
			wfRunHooks( 'GetLinkColours', array( $linkcolour_ids, &$colours ) );
		}
		wfProfileOut( __METHOD__.'-check' );

		# Do a second query for different language variants of links and categories
		if($wgContLang->hasVariants()) {
			$this->doVariants( $colours );
		}

		# Construct search and replace arrays
		wfProfileIn( __METHOD__.'-construct' );
		$replacePairs = array();
		foreach ( $this->internals as $ns => $entries ) {
			foreach ( $entries as $index => $entry ) {
				$pdbk = $entry['pdbk'];
				$title = $entry['title'];
				$query = isset( $entry['query'] ) ? $entry['query'] : '';
				$key = "$ns:$index";
				$searchkey = "<!--LINK $key-->";
				if ( !isset( $colours[$pdbk] ) || $colours[$pdbk] == 'new' ) {
					$linkCache->addBadLinkObj( $title );
					$colours[$pdbk] = 'new';
					$output->addLink( $title, 0 );
					// FIXME: replace deprecated makeBrokenLinkObj() by link()
					$replacePairs[$searchkey] = $sk->makeBrokenLinkObj( $title,
									$entry['text'],
									$query );
				} else {
					// FIXME: replace deprecated makeColouredLinkObj() by link()
					$replacePairs[$searchkey] = $sk->makeColouredLinkObj( $title, $colours[$pdbk],
									$entry['text'],
									$query );
				}
			}
		}
		$replacer = new HashtableReplacer( $replacePairs, 1 );
		wfProfileOut( __METHOD__.'-construct' );

		# Do the thing
		wfProfileIn( __METHOD__.'-replace' );
		$text = preg_replace_callback(
			'/(<!--LINK .*?-->)/',
			$replacer->cb(),
			$text);

		wfProfileOut( __METHOD__.'-replace' );
		wfProfileOut( __METHOD__ );

		# replace() hands this back to its caller
		return $colours;
	}

	/**
	 * Replace interwiki links
	 *
	 * @param $text String, modified in place
	 */
	protected function replaceInterwiki( &$text ) {
		if ( empty( $this->interwikis ) ) {
			return;
		}

		wfProfileIn( __METHOD__ );
		# Make interwiki link HTML
		$sk = $this->parent->getOptions()->getSkin( $this->parent->mTitle );
		$output = $this->parent->getOutput();
		$replacePairs = array();
		foreach( $this->interwikis as $key => $link ) {
			$replacePairs[$key] = $sk->link( $link['title'], $link['text'] );
			$output->addInterwikiLink( $link['title'] );
		}
		$replacer = new HashtableReplacer( $replacePairs, 1 );

		$text = preg_replace_callback(
			'/<!--IWLINK (.*?)-->/',
			$replacer->cb(),
			$text );
		wfProfileOut( __METHOD__ );
	}

	/**
	 * Modify $this->internals and $colours according to language variant linking rules
	 *
	 * @param $colours Array of link CSS classes indexed by PDBK, modified in place
	 */
	protected function doVariants( &$colours ) {
		global $wgContLang;
		$linkBatch = new LinkBatch();
		$variantMap = array(); // maps $pdbkey_Variant => $keys (of link holders)
		$output = $this->parent->getOutput();
		$linkCache = LinkCache::singleton();
		$sk = $this->parent->getOptions()->getSkin( $this->parent->mTitle );
		$threshold = $this->parent->getOptions()->getStubThreshold();
		$titlesToBeConverted = '';
		$titlesAttrs = array();

		// Concatenate titles to a single string, thus we only need auto convert the
		// single string to all variants. This would improve parser's performance
		// significantly.
		foreach ( $this->internals as $ns => $entries ) {
			foreach ( $entries as $index => $entry ) {
				$pdbk = $entry['pdbk'];
				// we only deal with new links (in its first query)
				if ( !isset( $colours[$pdbk] ) ) {
					$title = $entry['title'];
					$titleText = $title->getText();
					$titlesAttrs[] = array(
						'ns' => $ns,
						'key' => "$ns:$index",
						'titleText' => $titleText,
					);
					// separate titles with \0 because it would never appear
					// in a valid title
					$titlesToBeConverted .= $titleText . "\0";
				}
			}
		}

		// Now do the conversion and explode string to text of titles
		$titlesAllVariants = $wgContLang->autoConvertToAllVariants( $titlesToBeConverted );
		$allVariantsName = array_keys( $titlesAllVariants );
		foreach ( $titlesAllVariants as &$titlesVariant ) {
			$titlesVariant = explode( "\0", $titlesVariant );
		}
		// Break the reference so that later code cannot accidentally write
		// through it into the last element of $titlesAllVariants
		unset( $titlesVariant );
		$l = count( $titlesAttrs );
		// Then add variants of links to link batch
		for ( $i = 0; $i < $l; $i ++ ) {
			// Attributes recorded above for the i-th title to be converted
			$ns = $titlesAttrs[$i]['ns'];
			$key = $titlesAttrs[$i]['key'];
			$titleText = $titlesAttrs[$i]['titleText'];
			foreach ( $allVariantsName as $variantName ) {
				$textVariant = $titlesAllVariants[$variantName][$i];
				if($textVariant != $titleText){
					$variantTitle = Title::makeTitle( $ns, $textVariant );
					if( is_null( $variantTitle ) ) {
						continue;
					}
					$linkBatch->addObj( $variantTitle );
					$variantMap[$variantTitle->getPrefixedDBkey()][] = $key;
				}
			}
		}

		// process categories, check if a category exists in some variant
		$categoryMap = array(); // maps $category_variant => $category (dbkeys)
		$varCategories = array(); // category replacements oldDBkey => newDBkey
		foreach( $output->getCategoryLinks() as $category ){
			$variants = $wgContLang->autoConvertToAllVariants( $category );
			foreach($variants as $variant){
				if($variant != $category){
					$variantTitle = Title::newFromDBkey( Title::makeName(NS_CATEGORY,$variant) );
					if(is_null($variantTitle)) {
						continue;
					}
					$linkBatch->addObj( $variantTitle );
					$categoryMap[$variant] = $category;
				}
			}
		}

		if(!$linkBatch->isEmpty()){
			// construct query
			$dbr = wfGetDB( DB_SLAVE );
			$varRes = $dbr->select( 'page',
				array( 'page_id', 'page_namespace', 'page_title', 'page_is_redirect', 'page_len' ),
				$linkBatch->constructSet( 'page', $dbr ),
				__METHOD__
			);

			$linkcolour_ids = array();

			// for each found variants, figure out link holders and replace
			foreach ( $varRes as $s ) {

				$variantTitle = Title::makeTitle( $s->page_namespace, $s->page_title );
				$varPdbk = $variantTitle->getPrefixedDBkey();
				$vardbk = $variantTitle->getDBkey();

				$holderKeys = array();
				if( isset( $variantMap[$varPdbk] ) ) {
					$holderKeys = $variantMap[$varPdbk];
					$linkCache->addGoodLinkObj( $s->page_id, $variantTitle, $s->page_len, $s->page_is_redirect );
					$output->addLink( $variantTitle, $s->page_id );
				}

				// loop over link holders
				foreach( $holderKeys as $key ) {
					list( $ns, $index ) = explode( ':', $key, 2 );
					$entry =& $this->internals[$ns][$index];
					$pdbk = $entry['pdbk'];

					if(!isset($colours[$pdbk])){
						// found link in some of the variants, replace the link holder data
						$entry['title'] = $variantTitle;
						$entry['pdbk'] = $varPdbk;

						// set pdbk and colour
						# FIXME: convoluted data flow
						# The redirect status and length is passed to getLinkColour via the LinkCache
						# Use formal parameters instead
						$colours[$varPdbk] = $sk->getLinkColour( $variantTitle, $threshold );
						$linkcolour_ids[$s->page_id] = $pdbk;
					}
				}

				// check if the object is a variant of a category
				if(isset($categoryMap[$vardbk])){
					$oldkey = $categoryMap[$vardbk];
					if($oldkey != $vardbk) {
						$varCategories[$oldkey]=$vardbk;
					}
				}
			}
			wfRunHooks( 'GetLinkColours', array( $linkcolour_ids, &$colours ) );

			// rebuild the categories in original order (if there are replacements)
			if(count($varCategories)>0){
				$newCats = array();
				$originalCats = $output->getCategories();
				foreach($originalCats as $cat => $sortkey){
					// make the replacement
					if( array_key_exists($cat,$varCategories) ) {
						$newCats[$varCategories[$cat]] = $sortkey;
					} else {
						$newCats[$cat] = $sortkey;
					}
				}
				$output->setCategoryLinks($newCats);
			}
		}
	}

	/**
	 * Replace <!--LINK--> link placeholders with plain text of links
	 * (not HTML-formatted).
	 *
	 * @param $text String
	 * @return String
	 */
	public function replaceText( $text ) {
		wfProfileIn( __METHOD__ );

		# Objects are passed by handle, so no reference to $this is needed
		$text = preg_replace_callback(
			'/<!--(LINK|IWLINK) (.*?)-->/',
			array( $this, 'replaceTextCallback' ),
			$text );

		wfProfileOut( __METHOD__ );
		return $text;
	}

	/**
	 * Callback for replaceText()
	 *
	 * Placeholders with no stored text are returned unchanged.
	 *
	 * @param $matches Array
	 * @return string
	 * @private
	 */
	public function replaceTextCallback( $matches ) {
		$type = $matches[1];
		$key = $matches[2];
		if( $type == 'LINK' ) {
			list( $ns, $index ) = explode( ':', $key, 2 );
			if( isset( $this->internals[$ns][$index]['text'] ) ) {
				return $this->internals[$ns][$index]['text'];
			}
		} elseif( $type == 'IWLINK' ) {
			if( isset( $this->interwikis[$key]['text'] ) ) {
				return $this->interwikis[$key]['text'];
			}
		}
		return $matches[0];
	}
}