2004-02-23 07:43:20 +00:00
|
|
|
<?php
|
2004-09-03 20:33:01 +00:00
|
|
|
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @ingroup Maintenance
 */
|
2004-02-23 07:43:20 +00:00
|
|
|
|
2009-08-03 21:56:41 +00:00
|
|
|
require_once( dirname(__FILE__) . '/Maintenance.php' );
|
2007-11-26 17:58:08 +00:00
|
|
|
|
2009-08-02 19:35:17 +00:00
|
|
|
/**
 * Maintenance script that refreshes the link tables.
 *
 * Depending on the options given it rebuilds redirect table entries and/or
 * re-runs LinksUpdate for a range of pages, and afterwards removes rows from
 * the link tables whose source page no longer exists.
 *
 * @ingroup Maintenance
 */
class RefreshLinks extends Maintenance {

	/**
	 * Declare the description, options, positional argument and default
	 * batch size of the script.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Refresh link tables";
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
		$this->addOption( 'm', 'Maximum replication lag', false, true );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Entry point: refresh links unless 'dfn-only' was given, then always
	 * delete link rows that point from nonexistent pages.
	 */
	public function execute() {
		// Read the lag limit up front: it is needed by the cleanup step below
		// even when the refresh step is skipped via 'dfn-only'.
		$max = $this->getOption( 'm', false );

		if ( !$this->hasOption( 'dfn-only' ) ) {
			$start = $this->getArg( 0, 1 );
			$new = $this->getOption( 'new-only', false );
			$end = $this->getOption( 'e', 0 );
			$redir = $this->getOption( 'redirects-only', false );
			$oldRedir = $this->getOption( 'old-redirects-only', false );
			$this->doRefreshLinks( $start, $new, $max, $end, $redir, $oldRedir );
		}

		$this->deleteLinksFromNonexistent( $max, $this->mBatchSize );
	}

	/**
	 * Do the actual link refreshing.
	 *
	 * Exactly one of three modes runs, in this order of precedence:
	 * old redirects only, new pages only, or the full page_id range.
	 *
	 * @param $start int Page_id to start from
	 * @param $newOnly bool Only do pages with 1 edit
	 * @param $maxLag int Max DB replication lag
	 * @param $end int Page_id to stop at
	 * @param $redirectsOnly bool Only fix redirects
	 * @param $oldRedirectsOnly bool Only fix redirects without redirect entries
	 */
	private function doRefreshLinks( $start, $newOnly = false, $maxLag = false,
						$end = 0, $redirectsOnly = false, $oldRedirectsOnly = false ) {
		global $wgUser, $wgParser, $wgUseTidy;

		$reportingInterval = 100;
		$dbr = wfGetDB( DB_SLAVE );

		// Both bounds come from the command line and are interpolated into
		// SQL below, so force them to integers.
		$start = intval( $start );
		$end = intval( $end );

		# Don't generate TeX PNGs (lack of a sensible current directory causes errors anyway)
		$wgUser->setOption( 'math', MW_MATH_SOURCE );

		# Don't generate extension images (e.g. Timeline)
		if ( method_exists( $wgParser, "clearTagHooks" ) ) {
			$wgParser->clearTagHooks();
		}

		# Don't use HTML tidy
		$wgUseTidy = false;

		$what = $redirectsOnly ? "redirects" : "links";

		if ( $oldRedirectsOnly ) {
			# Pages flagged as redirects that have no row in the redirect table
			$idCond = ( $end == 0 )
				? "page_id >= $start"
				: "page_id BETWEEN $start AND $end";
			$res = $dbr->query(
				"SELECT page_id ".
				"FROM page ".
				"LEFT JOIN redirect ON page_id=rd_from ".
				"WHERE page_is_redirect=1 AND rd_from IS NULL AND ".
				$idCond,
				__METHOD__
			);
			$num = $dbr->numRows( $res );
			$this->output( "Refreshing $num old redirects from $start...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				if ( !( ++$i % $reportingInterval ) ) {
					$this->output( "$i\n" );
					wfWaitForSlaves( $maxLag );
				}
				$this->fixRedirect( $row->page_id );
			}
		} elseif ( $newOnly ) {
			$this->output( "Refreshing $what from " );
			$res = $dbr->select( 'page',
				array( 'page_id' ),
				array(
					'page_is_new' => 1,
					"page_id >= $start" ),
				__METHOD__
			);
			$num = $dbr->numRows( $res );
			$this->output( "$num new articles...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				if ( !( ++$i % $reportingInterval ) ) {
					$this->output( "$i\n" );
					wfWaitForSlaves( $maxLag );
				}
				if ( $redirectsOnly ) {
					$this->fixRedirect( $row->page_id );
				} else {
					$this->fixLinksFromArticle( $row->page_id );
				}
			}
		} else {
			if ( !$end ) {
				// No explicit end: go up to the highest id known to either
				// the page table or the redirect table.
				$maxPage = $dbr->selectField( 'page', 'max(page_id)', false );
				$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', false );
				$end = max( $maxPage, $maxRD );
			}
			$this->output( "Refreshing redirects table.\n" );
			$this->output( "Starting from page_id $start of $end.\n" );

			for ( $id = $start; $id <= $end; $id++ ) {
				if ( !( $id % $reportingInterval ) ) {
					$this->output( "$id\n" );
					wfWaitForSlaves( $maxLag );
				}
				$this->fixRedirect( $id );
			}

			if ( !$redirectsOnly ) {
				$this->output( "Refreshing links table.\n" );
				$this->output( "Starting from page_id $start of $end.\n" );

				for ( $id = $start; $id <= $end; $id++ ) {
					if ( !( $id % $reportingInterval ) ) {
						$this->output( "$id\n" );
						wfWaitForSlaves( $maxLag );
					}
					$this->fixLinksFromArticle( $id );
				}
			}
		}
	}

	/**
	 * Update the redirect entry for a given page.
	 *
	 * The redirect table row is deleted when the page does not exist or
	 * when following the redirect does not yield an object; otherwise the
	 * row is updated through Article::updateRedirectOn().
	 *
	 * @param $id int The page_id of the redirect
	 */
	private function fixRedirect( $id ) {
		global $wgTitle, $wgArticle;

		$wgTitle = Title::newFromID( $id );
		$dbw = wfGetDB( DB_MASTER );

		if ( is_null( $wgTitle ) ) {
			// This page doesn't exist (any more)
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', array( 'rd_from' => $id ),
				__METHOD__ );
			return;
		}

		$wgArticle = new Article( $wgTitle );
		$rt = $wgArticle->followRedirect();

		if ( !is_object( $rt ) ) {
			// $wgTitle is not a redirect
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', array( 'rd_from' => $id ),
				__METHOD__ );
		} else {
			$wgArticle->updateRedirectOn( $dbw, $rt );
		}
	}

	/**
	 * Run LinksUpdate for all links on a given page_id.
	 *
	 * The link cache is cleared on every call. Nothing else happens when
	 * the page or its revision cannot be loaded.
	 *
	 * @param $id int The page_id
	 */
	private function fixLinksFromArticle( $id ) {
		global $wgTitle, $wgParser;

		$wgTitle = Title::newFromID( $id );
		$dbw = wfGetDB( DB_MASTER );

		$linkCache = LinkCache::singleton();
		$linkCache->clear();

		if ( is_null( $wgTitle ) ) {
			return;
		}

		// Load the revision before opening the transaction so that the
		// early return below cannot leave a transaction open.
		$revision = Revision::newFromTitle( $wgTitle );
		if ( !$revision ) {
			return;
		}

		$dbw->begin();

		$options = new ParserOptions;
		$parserOutput = $wgParser->parse( $revision->getText(), $wgTitle, $options, true, true, $revision->getId() );
		$update = new LinksUpdate( $wgTitle, $parserOutput, false );
		$update->doUpdate();

		$dbw->commit();
	}

	/**
	 * Removes non-existing links from pages from pagelinks, imagelinks,
	 * categorylinks, templatelinks and externallinks tables.
	 *
	 * @param $maxLag Lag limit passed to wfWaitForSlaves() once before the
	 *  cleanup starts
	 * @param $batchSize The size of deletion batches
	 *
	 * @author Merlijn van Deen <valhallasw@arctus.nl>
	 */
	private function deleteLinksFromNonexistent( $maxLag = 0, $batchSize = 100 ) {
		wfWaitForSlaves( $maxLag );

		$dbw = wfGetDB( DB_MASTER );

		// Read through a separate load balancer connection with result
		// buffering turned off; it is closed again at the end.
		$lb = wfGetLBFactory()->newMainLB();
		$dbr = $lb->getConnection( DB_SLAVE );
		$dbr->bufferResults( false );

		$linksTables = array( // table name => page_id field
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
		);

		foreach ( $linksTables as $table => $field ) {
			$this->output( "Retrieving illegal entries from $table... " );

			// SELECT DISTINCT( $field ) FROM $table LEFT JOIN page ON $field=page_id WHERE page_id IS NULL;
			$results = $dbr->select( array( $table, 'page' ),
				$field,
				array( 'page_id' => null ),
				__METHOD__,
				'DISTINCT',
				array( 'page' => array( 'LEFT JOIN', "$field=page_id" ) )
			);

			$counter = 0;
			$list = array();
			$this->output( "0.." );

			foreach ( $results as $row ) {
				$counter++;
				$list[] = $row->$field;
				if ( ( $counter % $batchSize ) == 0 ) {
					// Fixed lag limit of 5 between batches, independent of $maxLag
					wfWaitForSlaves( 5 );
					$dbw->delete( $table, array( $field => $list ), __METHOD__ );

					$this->output( $counter . ".." );
					$list = array();
				}
			}

			$this->output( $counter );
			// Flush the final, partially filled batch
			if ( count( $list ) > 0 ) {
				$dbw->delete( $table, array( $field => $list ), __METHOD__ );
			}
			$this->output( "\n" );
		}

		$lb->closeAll();
	}
}
|
2009-08-02 19:35:17 +00:00
|
|
|
|
|
|
|
|
// Name the maintenance class to run, then include the file named by the
// DO_MAINTENANCE constant (defined outside this file).
$maintClass = "RefreshLinks";
require_once DO_MAINTENANCE;
|