wiki.techinc.nl/includes/cache/BacklinkCache.php

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

586 lines
17 KiB
PHP
Raw Normal View History

<?php
/**
* Class for fetching backlink lists, approximate backlink counts and
* partitions.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @author Tim Starling
* @copyright © 2009, Tim Starling, Domas Mituzas
* @copyright © 2010, Max Sem
* @copyright © 2011, Antoine Musso
*/
namespace MediaWiki\Cache;
use Iterator;
use LogicException;
use MediaWiki\Config\ServiceOptions;
use MediaWiki\HookContainer\HookContainer;
use MediaWiki\HookContainer\HookRunner;
use MediaWiki\Linker\LinksMigration;
use MediaWiki\MainConfigNames;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Page\PageIdentityValue;
use MediaWiki\Page\PageReference;
use MediaWiki\Title\Title;
use MediaWiki\Title\TitleValue;
use Psr\Log\LoggerInterface;
use RuntimeException;
use stdClass;
use WANObjectCache;
use Wikimedia\Rdbms\Database;
use Wikimedia\Rdbms\IConnectionProvider;
use Wikimedia\Rdbms\IReadableDatabase;
use Wikimedia\Rdbms\IResultWrapper;
use Wikimedia\Rdbms\SelectQueryBuilder;
/**
* Class for fetching backlink lists, approximate backlink counts and
* partitions. This is a shared cache.
*
* Instances of this class should typically be fetched with the method
* ::getBacklinkCache() from the BacklinkCacheFactory service.
*
* Ideally you should only get your backlinks from here when you think
* there is some advantage in caching them. Otherwise, it's just a waste
* of memory.
*/
class BacklinkCache {
/**
 * @internal Used by ServiceWiring.php
 */
public const CONSTRUCTOR_OPTIONS = [
	MainConfigNames::UpdateRowsPerJob,
];

/**
 * Multi-dimensional array representing batches. Keys are:
 *  > (string) links table name
 *  > (int) batch size
 *  > 'numRows' : Number of rows for this link table
 *  > 'batches' : [ [ $start, $end ] ]
 *
 * @see BacklinkCache::partitionResult()
 * @var array[]
 */
private array $partitionCache = [];

/**
 * Contains the whole links from a database result, keyed by links table name.
 * This is raw data that will be partitioned in $partitionCache
 *
 * Initialized with BacklinkCache::queryLinks()
 *
 * @var IResultWrapper[]
 */
private array $fullResultCache = [];

/**
 * Cache for hasLinks(), keyed by links table name
 *
 * @var bool[]
 */
private array $hasLinksCache = [];

/** WAN cache used by partition() and getNumLinks() */
private WANObjectCache $wanCache;

/** Runner built from the HookContainer passed to the constructor */
private HookRunner $hookRunner;

/** Local copy of the PageReference this cache was created for */
private PageReference $page;

/** Expiry value handed to the WAN cache for backlink count and partition entries */
private const CACHE_EXPIRY = 3600;

private IConnectionProvider $dbProvider;
private ServiceOptions $options;
private LinksMigration $linksMigration;
private LoggerInterface $logger;
/**
 * Set up a backlink cache bound to a single page.
 *
 * The supplied options are checked against self::CONSTRUCTOR_OPTIONS before
 * anything is stored. A HookRunner is built from the given HookContainer.
 *
 * @param ServiceOptions $options
 * @param LinksMigration $linksMigration
 * @param WANObjectCache $wanCache
 * @param HookContainer $hookContainer
 * @param IConnectionProvider $dbProvider
 * @param LoggerInterface $logger
 * @param PageReference $page Page to create a backlink cache for
 */
public function __construct(
	ServiceOptions $options,
	LinksMigration $linksMigration,
	WANObjectCache $wanCache,
	HookContainer $hookContainer,
	IConnectionProvider $dbProvider,
	LoggerInterface $logger,
	PageReference $page
) {
	$options->assertRequiredOptions( self::CONSTRUCTOR_OPTIONS );

	// The page whose incoming links this instance answers questions about
	$this->page = $page;

	// Configuration and injected services
	$this->options = $options;
	$this->dbProvider = $dbProvider;
	$this->wanCache = $wanCache;
	$this->linksMigration = $linksMigration;
	$this->logger = $logger;

	$runner = new HookRunner( $hookContainer );
	$this->hookRunner = $runner;
}
/**
 * Return the page this cache was created for, i.e. the PageReference that
 * was passed to the constructor.
 *
 * @since 1.37
 * @return PageReference
 */
public function getPage(): PageReference {
return $this->page;
}
/**
 * Get the replica DB connection to the database.
 *
 * The handle is requested from the injected connection provider on every
 * call; it is not stored on this object.
 *
 * @return IReadableDatabase
 */
private function getDB() {
return $this->dbProvider->getReplicaDatabase();
}
/**
 * Iterate over the pages that link here through the given links table.
 *
 * Rows are obtained from queryLinks() and each one is turned into a local
 * PageIdentityValue. Because queryLinks() may hand back more rows than
 * requested, a finite $max is enforced here as well.
 *
 * @param string $table
 * @param int|bool $startId
 * @param int|bool $endId
 * @param int|float $max Integer, or INF for no max
 * @return Iterator<PageIdentity>
 * @since 1.37
 */
public function getLinkPages(
	string $table, $startId = false, $endId = false, $max = INF
): Iterator {
	$emitted = 0;
	$rows = $this->queryLinks( $table, $startId, $endId, $max );

	foreach ( $rows as $linkRow ) {
		yield PageIdentityValue::localIdentity(
			$linkRow->page_id,
			$linkRow->page_namespace,
			$linkRow->page_title
		);

		// No cap requested: keep going until the rows run out
		if ( !is_finite( $max ) ) {
			continue;
		}

		// queryLinks() may return too many rows, so count what was yielded
		$emitted++;
		if ( $emitted >= $max ) {
			break;
		}
	}
}
/**
 * Fetch backlink rows for a links table, reusing the in-process full result
 * when one is available.
 *
 * A result is stored in $fullResultCache only when it was selected with
 * 'all', no start or end ID was given, and fewer than $max rows came back.
 *
 * @param string $table
 * @param int|bool $startId
 * @param int|bool $endId
 * @param int|float $max A hint for the maximum number of rows to return.
 *   May return more rows if there is a previously cached result set.
 * @param string $select 'all' or 'ids'
 * @return IResultWrapper
 */
private function queryLinks( $table, $startId, $endId, $max, $select = 'all' ) {
	$isUnbounded = !$startId && !$endId;

	if ( $isUnbounded && isset( $this->fullResultCache[$table] ) ) {
		$this->logger->debug( __METHOD__ . ': got results from cache' );

		return $this->fullResultCache[$table];
	}

	$this->logger->debug( __METHOD__ . ': got results from DB' );

	$builder = $this->initQueryBuilderForTable( $table, $select );
	$fromField = $this->getPrefix( $table ) . '_from';

	// Use the from field in the condition rather than the joined page_id,
	// because databases are stupid and don't necessarily propagate indexes.
	$bounds = [ '>=' => $startId, '<=' => $endId ];
	foreach ( $bounds as $operator => $boundId ) {
		if ( $boundId ) {
			$builder->where( $this->getDB()->expr( $fromField, $operator, $boundId ) );
		}
	}

	$builder->orderBy( $fromField );
	if ( is_finite( $max ) && $max > 0 ) {
		$builder->limit( $max );
	}

	$result = $builder->caller( __METHOD__ )->fetchResultSet();

	$isComplete = $select === 'all' && $isUnbounded && $result->numRows() < $max;
	if ( !$isComplete ) {
		$this->logger->debug( __METHOD__ . ": results from DB were uncacheable" );

		return $result;
	}

	// The full results fit within the limit, so cache them
	$this->fullResultCache[$table] = $result;

	return $result;
}
/**
 * Resolve the column name prefix used by a links table.
 *
 * The built-in tables are looked up in a fixed map. For any other table the
 * BacklinkCacheGetPrefix hook is asked to supply the prefix.
 *
 * @param string $table
 * @return string
 * @throws LogicException If the table is not built in and the hook leaves the prefix empty
 */
private function getPrefix( $table ) {
	static $builtinPrefixes = [
		'pagelinks' => 'pl',
		'imagelinks' => 'il',
		'categorylinks' => 'cl',
		'templatelinks' => 'tl',
		'redirect' => 'rd',
	];

	if ( isset( $builtinPrefixes[$table] ) ) {
		return $builtinPrefixes[$table];
	}

	// Not a built-in table: let hook handlers fill the prefix in by reference
	$hookPrefix = null;
	$this->hookRunner->onBacklinkCacheGetPrefix( $table, $hookPrefix );

	if ( !$hookPrefix ) {
		throw new LogicException( "Invalid table \"$table\" in " . __CLASS__ );
	}

	return $hookPrefix;
}
/**
 * Build a SelectQueryBuilder that selects the backlinks of this page from
 * the given links table.
 *
 * With $select 'ids' only the `{prefix}_from` column is selected (aliased to
 * page_id) and the page table is left out. Any other value selects
 * page_namespace, page_title and page_id and brings the page table in.
 *
 * @param string $table
 * @param string $select
 * @return SelectQueryBuilder
 * @throws LogicException If the table is unknown and the hook supplies no conditions
 */
private function initQueryBuilderForTable( string $table, string $select ): SelectQueryBuilder {
	$prefix = $this->getPrefix( $table );
	$builder = $this->getDB()->newSelectQueryBuilder();
	$idsOnly = ( $select === 'ids' );

	$fields = $idsOnly
		? [ 'page_id' => $prefix . '_from' ]
		: [ 'page_namespace', 'page_title', 'page_id' ];
	$builder->select( $fields );
	$builder->from( $table );

	/*
	 * Tables known to this method get a proper join() on page_id={$prefix}_from
	 * further down. A table that is only supported through a hook yields a
	 * single $conds array, so for those the page table is merely added to the
	 * table list (traditional / ANSI-89 JOIN) and the join condition is
	 * expected to be part of the WHERE conditions.
	 */
	$builtinTables = [ 'pagelinks', 'templatelinks', 'redirect', 'imagelinks', 'categorylinks' ];
	$isBuiltin = in_array( $table, $builtinTables, true );

	if ( $table === 'pagelinks' || $table === 'templatelinks' ) {
		$builder->where(
			$this->linksMigration->getLinksConditions( $table, TitleValue::newFromPage( $this->page ) )
		);
	} elseif ( $table === 'redirect' ) {
		$builder->where( [
			"{$prefix}_namespace" => $this->page->getNamespace(),
			"{$prefix}_title" => $this->page->getDBkey(),
			"{$prefix}_interwiki" => [ '', null ],
		] );
	} elseif ( $table === 'imagelinks' || $table === 'categorylinks' ) {
		$builder->where( [
			"{$prefix}_to" => $this->page->getDBkey(),
		] );
	} else {
		$conds = null;
		$this->hookRunner->onBacklinkCacheGetConditions(
			$table,
			Title::newFromPageReference( $this->page ),
			$conds
		);
		if ( !$conds ) {
			throw new LogicException( "Invalid table \"$table\" in " . __CLASS__ );
		}

		if ( $idsOnly ) {
			// The page table is not part of this query, so drop any clause
			// that mentions page_id (kind of janky)
			$conds = array_filter( (array)$conds, static function ( $clause ) {
				return !preg_match( '/(\b|=)page_id(\b|=)/', (string)$clause );
			} );
		} else {
			$builder->table( 'page' ); // join condition in $conds
		}
		$builder->where( $conds );
	}

	if ( !$idsOnly ) {
		if ( $isBuiltin ) {
			$builder->join( 'page', null, "page_id={$prefix}_from" );
		}
		$builder->straightJoinOption();
	}

	return $builder;
}
/**
 * Check if there are any backlinks. Only use the process cache, since the
 * WAN cache is potentially stale (T368006).
 *
 * Lookup order: the hasLinks() cache, then any populated partition cache
 * entry for the table, then the cached full result, and finally a query
 * limited to one row whose outcome is remembered for later calls.
 *
 * @param string $table
 * @return bool
 */
public function hasLinks( $table ) {
	if ( isset( $this->hasLinksCache[$table] ) ) {
		return $this->hasLinksCache[$table];
	}

	if ( isset( $this->partitionCache[$table] ) ) {
		$entry = reset( $this->partitionCache[$table] );
		// Only trust an entry that was actually filled in. Anything that is
		// not an array carries no row count, so fall through instead of
		// reporting "no links".
		if ( is_array( $entry ) ) {
			return (bool)$entry['numRows'];
		}
	}

	if ( isset( $this->fullResultCache[$table] ) ) {
		return (bool)$this->fullResultCache[$table]->numRows();
	}

	$hasLinks = (bool)$this->queryLinks( $table, false, false, 1 )->numRows();
	$this->hasLinksCache[$table] = $hasLinks;

	return $hasLinks;
}
/**
 * Get the approximate number of backlinks
 *
 * Answers from a populated partition cache entry or the cached full result
 * when one exists in this process. Otherwise the count is fetched through
 * the WAN cache, whose callback partitions the table and reports the row
 * count that partition() recorded.
 *
 * @param string $table
 * @return int
 */
public function getNumLinks( $table ) {
	if ( isset( $this->partitionCache[$table] ) ) {
		$entry = reset( $this->partitionCache[$table] );
		// Only trust an entry that was actually filled in; a non-array value
		// carries no row count.
		if ( is_array( $entry ) ) {
			return $entry['numRows'];
		}
	}

	if ( isset( $this->fullResultCache[$table] ) ) {
		return $this->fullResultCache[$table]->numRows();
	}

	return $this->wanCache->getWithSetCallback(
		$this->wanCache->makeKey(
			'numbacklinks',
			CacheKeyHelper::getKeyForPage( $this->page ),
			$table
		),
		self::CACHE_EXPIRY,
		function ( $oldValue, &$ttl, array &$setOpts ) use ( $table ) {
			$setOpts += Database::getCacheSetOptions( $this->getDB() );

			// Use partition() since it will batch the query and skip the JOIN.
			// Use $wgUpdateRowsPerJob just to encourage cache reuse for jobs.
			$batchSize = $this->options->get( MainConfigNames::UpdateRowsPerJob );
			$this->partition( $table, $batchSize );

			return $this->partitionCache[$table][$batchSize]['numRows'];
		}
	);
}
/**
 * Partition the backlinks into batches.
 * Returns an array giving the start and end of each range. The first
 * batch has a start of false, and the last batch has an end of false.
 *
 * The result is remembered per table and batch size for the lifetime of
 * this object. The entry is written only once it has been fully built, so
 * an exception while building it leaves nothing behind and a later call
 * simply starts over.
 *
 * @param string $table The links table name
 * @param int $batchSize
 * @return array
 */
public function partition( $table, $batchSize ) {
	$cached = $this->partitionCache[$table][$batchSize] ?? null;
	if ( is_array( $cached ) ) {
		$this->logger->debug( __METHOD__ . ": got from partition cache" );

		return $cached['batches'];
	}

	if ( isset( $this->fullResultCache[$table] ) ) {
		// All rows are already in memory, so partition them directly
		$res = $this->fullResultCache[$table];
		$numRows = $res->numRows();
		$batches = $this->partitionResult( $res, $numRows, $batchSize );
		$this->openBatchEnds( $batches );
		$this->partitionCache[$table][$batchSize] = [ 'numRows' => $numRows, 'batches' => $batches ];
		$this->logger->debug( __METHOD__ . ": got from full result cache" );

		return $batches;
	}

	$cacheEntry = $this->wanCache->getWithSetCallback(
		$this->wanCache->makeKey(
			'backlinks',
			CacheKeyHelper::getKeyForPage( $this->page ),
			$table,
			$batchSize
		),
		self::CACHE_EXPIRY,
		function ( $oldValue, &$ttl, array &$setOpts ) use ( $table, $batchSize ) {
			$setOpts += Database::getCacheSetOptions( $this->getDB() );

			$value = [ 'numRows' => 0, 'batches' => [] ];

			// Do the selects in batches to avoid client-side OOMs (T45452).
			// Use a LIMIT that plays well with $batchSize to keep equal sized partitions.
			$selectSize = max( $batchSize, 200_000 - ( 200_000 % $batchSize ) );

			$start = false;
			do {
				$res = $this->queryLinks( $table, $start, false, $selectSize, 'ids' );
				$numRows = $res->numRows();
				$batches = $this->partitionResult( $res, $numRows, $batchSize );
				// Merge the link count and range partitions for this chunk
				$value['numRows'] += $numRows;
				$value['batches'] = array_merge( $value['batches'], $batches );
				if ( count( $batches ) ) {
					// pick up after this inclusive range
					$start = end( $batches )[1] + 1;
				}
				// A chunk that came back full may have more rows behind it
			} while ( $numRows >= $selectSize );

			// Make sure the first range has start=false and the last one has end=false
			$this->openBatchEnds( $value['batches'] );

			return $value;
		}
	);

	$this->partitionCache[$table][$batchSize] = $cacheEntry;

	return $cacheEntry['batches'];
}
/**
 * Open up the outer edges of a list of batches in place.
 *
 * The start of the first batch and the end of the last batch are set to
 * false, so that the complete set of batches covers the entire ID range
 * from 0 to infinity. An empty list becomes a single [ false, false ] batch.
 *
 * @param array &$batches
 */
private function openBatchEnds( array &$batches ) {
	if ( $batches === [] ) {
		$batches = [ [ false, false ] ];

		return;
	}

	$batches[0][0] = false;

	// Looked up after the assignment above, exactly as the last key stands then
	$lastKey = array_key_last( $batches );
	$batches[$lastKey][1] = false;
}
/**
 * Partition a DB result with backlinks in it into batches
 *
 * For every batch the first and last row of its slice are fetched by
 * seeking in the result set, and their page_id values become the inclusive
 * [ start, end ] pair of that batch.
 *
 * @param IResultWrapper $res Database result whose rows expose page_id
 * @param int $numRows The number of rows to use from the result set
 * @param int $batchSize
 * @return int[][]
 * @throws RuntimeException If a batch starts at a higher page_id than it ends
 */
private function partitionResult( $res, $numRows, $batchSize ) {
	$numBatches = (int)ceil( $numRows / $batchSize );
	$batches = [];

	for ( $i = 0; $i < $numBatches; $i++ ) {
		// First row of this batch
		$rowNum = $i * $batchSize;
		$res->seek( $rowNum );
		$row = $res->fetchObject();
		$start = (int)$row->page_id;

		// Last row of this batch; the final batch may be shorter than $batchSize
		$rowNum = min( $numRows - 1, ( $i + 1 ) * $batchSize - 1 );
		$res->seek( $rowNum );
		$row = $res->fetchObject();
		$end = (int)$row->page_id;

		// Check order
		if ( $start && $end && $start > $end ) {
			throw new RuntimeException( __METHOD__ . ': Internal error: query result out of order' );
		}

		$batches[] = [ $start, $end ];
	}

	return $batches;
}
/**
 * Iterate over the cascade-protected template/file use backlinks as
 * PageIdentity objects.
 *
 * Each row returned by getCascadeProtectedLinksInternal() is converted into
 * a local PageIdentityValue.
 *
 * @return Iterator<PageIdentity>
 * @since 1.37
 */
public function getCascadeProtectedLinkPages(): Iterator {
	$protectedRows = $this->getCascadeProtectedLinksInternal();

	foreach ( $protectedRows as $protectedRow ) {
		yield PageIdentityValue::localIdentity(
			$protectedRow->page_id,
			$protectedRow->page_namespace,
			$protectedRow->page_title
		);
	}
}
/**
 * Collect the rows of cascade-protected pages that use this page as a
 * template and, when this page is in the file namespace, as a file.
 *
 * The template and file queries are run separately and merged afterwards,
 * keeping one row per page_id.
 *
 * @return stdClass[]
 */
private function getCascadeProtectedLinksInternal(): array {
	$dbr = $this->getDB();
	$pageFields = [ 'page_namespace', 'page_title', 'page_id' ];

	// @todo: use UNION without breaking tests that use temp tables
	$templateConds = $this->linksMigration->getLinksConditions(
		'templatelinks', TitleValue::newFromPage( $this->page )
	);
	$resultSets = [
		$dbr->newSelectQueryBuilder()
			->select( $pageFields )
			->from( 'templatelinks' )
			->join( 'page_restrictions', null, 'tl_from = pr_page' )
			->join( 'page', null, 'page_id = tl_from' )
			->where( $templateConds )
			->andWhere( [ 'pr_cascade' => 1 ] )
			->distinct()
			->caller( __METHOD__ )->fetchResultSet(),
	];

	// File usage is only looked up for pages in the file namespace
	if ( $this->page->getNamespace() === NS_FILE ) {
		$fileConds = [
			'il_to' => $this->page->getDBkey(),
			'pr_cascade' => 1,
		];
		$resultSets[] = $dbr->newSelectQueryBuilder()
			->select( $pageFields )
			->from( 'imagelinks' )
			->join( 'page_restrictions', null, 'il_from = pr_page' )
			->join( 'page', null, 'page_id = il_from' )
			->where( $fileConds )
			->distinct()
			->caller( __METHOD__ )->fetchResultSet();
	}

	// Combine the result sets; indexing by page_id removes duplicates
	$rowsByPageId = [];
	foreach ( $resultSets as $resultSet ) {
		foreach ( $resultSet as $row ) {
			$rowsByPageId[$row->page_id] = $row;
		}
	}

	// Now that we've de-duplicated, throw away the keys
	return array_values( $rowsByPageId );
}
}
/**
 * Register the un-namespaced name 'BacklinkCache' as an alias of this class,
 * so code that still refers to the old global name resolves to it.
 *
 * @deprecated class alias since 1.42
 */
class_alias( BacklinkCache::class, 'BacklinkCache' );