refreshLinks.php: allow refreshing by categories, tracking or not

Needed for selective updates of pages using a particular feature.
Intended to be run in production, so needs to scale.

Bug: T149723
Change-Id: If20fb1f91de8d4227def5b07d6d52b91161ed3fd
This commit is contained in:
Max Semenik 2016-11-29 15:04:07 -08:00
parent c06e055f4f
commit 13054a4c70
6 changed files with 239 additions and 104 deletions

View file

@ -206,6 +206,8 @@ changes to languages because of Phabricator reports.
* Article::doEditContent() was marked as deprecated, to be removed in 1.30
or later.
* ContentHandler::runLegacyHooks() was removed.
* refreshLinks.php can now be limited to a particular category with --category=...
or to a tracking category with --tracking-category=...
== Compatibility ==

View file

@ -1459,6 +1459,7 @@ $wgAutoloadLocalClasses = [
'TitlePrefixSearch' => __DIR__ . '/includes/PrefixSearch.php',
'TitleValue' => __DIR__ . '/includes/title/TitleValue.php',
'TrackBlobs' => __DIR__ . '/maintenance/storage/trackBlobs.php',
'TrackingCategories' => __DIR__ . '/includes/TrackingCategories.php',
'TraditionalImageGallery' => __DIR__ . '/includes/gallery/TraditionalImageGallery.php',
'TransactionProfiler' => __DIR__ . '/includes/libs/rdbms/TransactionProfiler.php',
'TransformParameterError' => __DIR__ . '/includes/media/MediaTransformOutput.php',

View file

@ -0,0 +1,130 @@
<?php
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Categories
*/
/**
 * This class performs some operations related to tracking categories, such as creating
 * a list of all such categories.
 */
class TrackingCategories {
	/** @var Config */
	private $config;

	/**
	 * Tracking categories that exist in core
	 *
	 * @var string[] Message keys
	 */
	private static $coreTrackingCategories = [
		'index-category',
		'noindex-category',
		'duplicate-args-category',
		'expensive-parserfunction-category',
		'post-expand-template-argument-category',
		'post-expand-template-inclusion-category',
		'hidden-category-category',
		'broken-file-category',
		'node-count-exceeded-category',
		'expansion-depth-exceeded-category',
		'restricted-displaytitle-ignored',
		'deprecated-self-close-category',
	];

	/**
	 * @param Config $config Used to read 'TrackingCategories' and 'EnableMagicLinks'
	 */
	public function __construct( Config $config ) {
		$this->config = $config;
	}

	/**
	 * Collect all tracking category message keys (core, extension registry and
	 * configuration) and resolve each of them to title objects.
	 *
	 * Message keys for which no title in the MediaWiki namespace can be built
	 * are left out of the result.
	 *
	 * @return array[] Map of message key => [ 'cats' => Title[], 'msg' => Title ],
	 *  where 'msg' is the title of the message page and 'cats' are the category
	 *  titles the message resolves to (empty if the category is disabled)
	 */
	public function getTrackingCategories() {
		$categories = array_merge(
			self::$coreTrackingCategories,
			ExtensionRegistry::getInstance()->getAttribute( 'TrackingCategories' ),
			$this->config->get( 'TrackingCategories' ) // deprecated
		);

		// Only show magic link tracking categories if they are enabled
		$enableMagicLinks = $this->config->get( 'EnableMagicLinks' );
		$magicLinkCategories = [
			'ISBN' => 'magiclink-tracking-isbn',
			'RFC' => 'magiclink-tracking-rfc',
			'PMID' => 'magiclink-tracking-pmid',
		];
		foreach ( $magicLinkCategories as $type => $catMsg ) {
			if ( $enableMagicLinks[$type] ) {
				$categories[] = $catMsg;
			}
		}

		$trackingCategories = [];
		foreach ( $categories as $catMsg ) {
			$catMsgTitle = Title::makeTitleSafe( NS_MEDIAWIKI, $catMsg );
			if ( !$catMsgTitle ) {
				continue;
			}

			$trackingCategories[$catMsg] = [
				'cats' => $this->getCategoryTitles( $catMsg ),
				'msg' => $catMsgTitle,
			];
		}

		return $trackingCategories;
	}

	/**
	 * Resolve one tracking category message to the category titles it can produce.
	 *
	 * The list is not de-duplicated: a message that contains '{{' contributes one
	 * entry per valid namespace, even if all of them name the same category.
	 *
	 * @param string $catMsg Message key
	 * @return Title[]
	 */
	private function getCategoryTitles( $catMsg ) {
		$msgObj = wfMessage( $catMsg )->inContentLanguage();

		/*
		 * Check if the tracking category varies by namespace
		 * Otherwise only pages in the current namespace will be displayed
		 * If it does vary, show pages considering all namespaces
		 */
		$catNames = [];
		// Match things like {{NAMESPACE}} and {{NAMESPACENUMBER}}.
		// False positives are ok, this is just an efficiency shortcut
		if ( strpos( $msgObj->plain(), '{{' ) !== false ) {
			foreach ( MWNamespace::getValidNamespaces() as $namesp ) {
				$tempTitle = Title::makeTitleSafe( $namesp, $catMsg );
				if ( $tempTitle ) {
					$catNames[] = $msgObj->title( $tempTitle )->text();
				}
			}
		} else {
			$catNames[] = $msgObj->text();
		}

		$titles = [];
		foreach ( $catNames as $catName ) {
			# Allow tracking categories to be disabled by setting them to "-"
			if ( $catName === '-' ) {
				continue;
			}
			$catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName );
			if ( $catTitle ) {
				$titles[] = $catTitle;
			}
		}

		return $titles;
	}
}

View file

@ -696,6 +696,8 @@ class ParserOutput extends CacheTime {
* to TrackingCategories::$coreTrackingCategories, and extensions
* should add to "TrackingCategories" in their extension.json.
*
* @todo Migrate some code to TrackingCategories
*
* @param string $msg Message key
* @param Title $title title of the page which is being tracked
* @return bool Whether the addition was successful

View file

@ -36,26 +36,6 @@ class SpecialTrackingCategories extends SpecialPage {
parent::__construct( 'TrackingCategories' );
}
/**
* Tracking categories that exist in core
*
* @var array
*/
private static $coreTrackingCategories = [
'index-category',
'noindex-category',
'duplicate-args-category',
'expensive-parserfunction-category',
'post-expand-template-argument-category',
'post-expand-template-inclusion-category',
'hidden-category-category',
'broken-file-category',
'node-count-exceeded-category',
'expansion-depth-exceeded-category',
'restricted-displaytitle-ignored',
'deprecated-self-close-category',
];
function execute( $par ) {
$this->setHeaders();
$this->outputHeader();
@ -76,10 +56,11 @@ class SpecialTrackingCategories extends SpecialPage {
</tr></thead>"
);
$trackingCategories = $this->prepareTrackingCategoriesData();
$trackingCategories = new TrackingCategories( $this->getConfig() );
$categoryList = $trackingCategories->getTrackingCategories();
$batch = new LinkBatch();
foreach ( $trackingCategories as $catMsg => $data ) {
foreach ( $categoryList as $catMsg => $data ) {
$batch->addObj( $data['msg'] );
foreach ( $data['cats'] as $catTitle ) {
$batch->addObj( $catTitle );
@ -87,11 +68,11 @@ class SpecialTrackingCategories extends SpecialPage {
}
$batch->execute();
Hooks::run( 'SpecialTrackingCategories::preprocess', [ $this, $trackingCategories ] );
Hooks::run( 'SpecialTrackingCategories::preprocess', [ $this, $categoryList ] );
$linkRenderer = $this->getLinkRenderer();
foreach ( $trackingCategories as $catMsg => $data ) {
foreach ( $categoryList as $catMsg => $data ) {
$allMsgs = [];
$catDesc = $catMsg . '-desc';
@ -143,80 +124,6 @@ class SpecialTrackingCategories extends SpecialPage {
$this->getOutput()->addHTML( Html::closeElement( 'table' ) );
}
/**
* Read the global and extract title objects from the corresponding messages
* @return array Array( 'msg' => Title, 'cats' => Title[] )
*/
private function prepareTrackingCategoriesData() {
$categories = array_merge(
self::$coreTrackingCategories,
ExtensionRegistry::getInstance()->getAttribute( 'TrackingCategories' ),
$this->getConfig()->get( 'TrackingCategories' ) // deprecated
);
// Only show magic link tracking categories if they are enabled
$enableMagicLinks = $this->getConfig()->get( 'EnableMagicLinks' );
if ( $enableMagicLinks['ISBN'] ) {
$categories[] = 'magiclink-tracking-isbn';
}
if ( $enableMagicLinks['RFC'] ) {
$categories[] = 'magiclink-tracking-rfc';
}
if ( $enableMagicLinks['PMID'] ) {
$categories[] = 'magiclink-tracking-pmid';
}
$trackingCategories = [];
foreach ( $categories as $catMsg ) {
/*
* Check if the tracking category varies by namespace
* Otherwise only pages in the current namespace will be displayed
* If it does vary, show pages considering all namespaces
*/
$msgObj = $this->msg( $catMsg )->inContentLanguage();
$allCats = [];
$catMsgTitle = Title::makeTitleSafe( NS_MEDIAWIKI, $catMsg );
if ( !$catMsgTitle ) {
continue;
}
// Match things like {{NAMESPACE}} and {{NAMESPACENUMBER}}.
// False positives are ok, this is just an efficiency shortcut
if ( strpos( $msgObj->plain(), '{{' ) !== false ) {
$ns = MWNamespace::getValidNamespaces();
foreach ( $ns as $namesp ) {
$tempTitle = Title::makeTitleSafe( $namesp, $catMsg );
if ( !$tempTitle ) {
continue;
}
$catName = $msgObj->title( $tempTitle )->text();
# Allow tracking categories to be disabled by setting them to "-"
if ( $catName !== '-' ) {
$catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName );
if ( $catTitle ) {
$allCats[] = $catTitle;
}
}
}
} else {
$catName = $msgObj->text();
# Allow tracking categories to be disabled by setting them to "-"
if ( $catName !== '-' ) {
$catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName );
if ( $catTitle ) {
$allCats[] = $catTitle;
}
}
}
$trackingCategories[$catMsg] = [
'cats' => $allCats,
'msg' => $catMsgTitle,
];
}
return $trackingCategories;
}
protected function getGroupName() {
return 'pages';
}

View file

@ -29,6 +29,8 @@ require_once __DIR__ . '/Maintenance.php';
* @ingroup Maintenance
*/
class RefreshLinks extends Maintenance {
const REPORTING_INTERVAL = 100;
/** @var int|bool */
protected $namespace = false;
@ -43,6 +45,8 @@ class RefreshLinks extends Maintenance {
$this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
'query, default 100000', false, true );
$this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
$this->addOption( 'category', 'Only fix pages in this category', false, true );
$this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
$this->addArg( 'start', 'Page_id to start from, default 1', false );
$this->setBatchSize( 100 );
}
@ -61,7 +65,15 @@ class RefreshLinks extends Maintenance {
} else {
$this->namespace = (int)$ns;
}
if ( !$this->hasOption( 'dfn-only' ) ) {
if ( ( $category = $this->getOption( 'category', false ) ) !== false ) {
$title = Title::makeTitleSafe( NS_CATEGORY, $category );
if ( !$title ) {
$this->error( "'$category' is an invalid category name!\n", true );
}
$this->refreshCategory( $category );
} elseif ( ( $category = $this->getOption( 'tracking-category', false ) ) !== false ) {
$this->refreshTrackingCategory( $category );
} elseif ( !$this->hasOption( 'dfn-only' ) ) {
$new = $this->getOption( 'new-only', false );
$redir = $this->getOption( 'redirects-only', false );
$oldRedir = $this->getOption( 'old-redirects-only', false );
@ -89,7 +101,6 @@ class RefreshLinks extends Maintenance {
private function doRefreshLinks( $start, $newOnly = false,
$end = null, $redirectsOnly = false, $oldRedirectsOnly = false
) {
$reportingInterval = 100;
$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
if ( $start === null ) {
@ -124,7 +135,7 @@ class RefreshLinks extends Maintenance {
$i = 0;
foreach ( $res as $row ) {
if ( !( ++$i % $reportingInterval ) ) {
if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
$this->output( "$i\n" );
wfWaitForSlaves();
}
@ -145,7 +156,7 @@ class RefreshLinks extends Maintenance {
$i = 0;
foreach ( $res as $row ) {
if ( !( ++$i % $reportingInterval ) ) {
if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
$this->output( "$i\n" );
wfWaitForSlaves();
}
@ -166,7 +177,7 @@ class RefreshLinks extends Maintenance {
for ( $id = $start; $id <= $end; $id++ ) {
if ( !( $id % $reportingInterval ) ) {
if ( !( $id % self::REPORTING_INTERVAL ) ) {
$this->output( "$id\n" );
wfWaitForSlaves();
}
@ -179,7 +190,7 @@ class RefreshLinks extends Maintenance {
for ( $id = $start; $id <= $end; $id++ ) {
if ( !( $id % $reportingInterval ) ) {
if ( !( $id % self::REPORTING_INTERVAL ) ) {
$this->output( "$id\n" );
wfWaitForSlaves();
}
@ -379,6 +390,7 @@ class RefreshLinks extends Maintenance {
* @param string $var Field name
* @param mixed $start First value to include or null
* @param mixed $end Last value to include or null
* @return string
*/
private static function intervalCond( IDatabase $db, $var, $start, $end ) {
if ( $start === null && $end === null ) {
@ -391,6 +403,87 @@ class RefreshLinks extends Maintenance {
return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
}
}
/**
 * Refreshes links for pages in a tracking category
 *
 * A tracking category key can resolve to several categories, and the list
 * returned by getPossibleCategories() may name the same category more than
 * once (one entry per namespace when the message contains '{{'). Each
 * distinct category is refreshed only once.
 *
 * @param string $category Category key
 */
private function refreshTrackingCategory( $category ) {
	$cats = $this->getPossibleCategories( $category );

	if ( !$cats ) {
		// Output to stderr but don't bail out; the loop below is then a no-op.
		$this->error( "Tracking category '$category' is disabled\n" );
	}

	$seen = [];
	foreach ( $cats as $cat ) {
		// All entries are category titles, so the DB key identifies them uniquely
		$dbKey = $cat->getDBkey();
		if ( isset( $seen[$dbKey] ) ) {
			continue;
		}
		$seen[$dbKey] = true;

		$this->refreshCategory( $cat );
	}
}
/**
 * Refreshes links for the pages that are members of a category
 *
 * Members are read in batches of $this->mBatchSize rows ordered by
 * (cl_timestamp, cl_from); every batch continues after the last row seen in
 * the previous one. When a namespace has been set on this script, only
 * pages in that namespace are selected.
 *
 * @param Title $category
 */
private function refreshCategory( Title $category ) {
	$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

	$dbr = $this->getDB( DB_REPLICA );

	$baseConds = [
		'page_id=cl_from',
		'cl_to' => $category->getDBkey(),
	];
	if ( $this->namespace !== false ) {
		$baseConds['page_namespace'] = $this->namespace;
	}

	$queryOptions = [
		'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
		'LIMIT' => $this->mBatchSize,
	];

	$processed = 0;
	// Position of the last processed row; the next batch starts right after it
	$lastTimestamp = '';
	$lastId = 0;

	do {
		$quotedTimestamp = $dbr->addQuotes( $lastTimestamp );
		$batchConds = $baseConds;
		$batchConds[] = "(cl_timestamp > $quotedTimestamp OR " .
			"(cl_timestamp = $quotedTimestamp AND cl_from > $lastId))";

		$res = $dbr->select(
			[ 'page', 'categorylinks' ],
			[ 'page_id', 'cl_timestamp' ],
			$batchConds,
			__METHOD__,
			$queryOptions
		);

		foreach ( $res as $row ) {
			$processed++;
			if ( $processed % self::REPORTING_INTERVAL === 0 ) {
				// Report progress and wait before continuing
				$this->output( "$processed\n" );
				wfWaitForSlaves();
			}

			$lastId = $row->page_id;
			$lastTimestamp = $row->cl_timestamp;
			self::fixLinksFromArticle( $row->page_id );
		}
		// A batch that is not full means there is nothing left to fetch
	} while ( $res->numRows() == $this->mBatchSize );
}
/**
 * Looks up the category titles a tracking category key can resolve to
 *
 * An unknown key is reported through error() with its second argument set
 * to true (presumably fatal - confirm against Maintenance::error()).
 *
 * @param string $categoryKey Tracking category message key
 * @return Title[] Possibly empty
 */
private function getPossibleCategories( $categoryKey ) {
	$registry = new TrackingCategories( $this->getConfig() );
	$known = $registry->getTrackingCategories();

	if ( !isset( $known[$categoryKey] ) ) {
		$this->error( "Unknown tracking category {$categoryKey}\n", true );
		// Only reached if error() returns
		return null;
	}

	return $known[$categoryKey]['cats'];
}
}
$maintClass = 'RefreshLinks';