2005-10-16 17:33:41 +00:00
|
|
|
<?php
|
|
|
|
|
/**
|
2010-10-03 09:25:28 +00:00
|
|
|
* Script that postprocesses XML dumps from dumpBackup.php to add page text
|
|
|
|
|
*
|
2011-07-20 23:06:24 +00:00
|
|
|
* Copyright (C) 2005 Brion Vibber <brion@pobox.com>
|
2005-10-16 17:33:41 +00:00
|
|
|
* http://www.mediawiki.org/
|
2006-01-07 13:09:30 +00:00
|
|
|
*
|
2005-10-16 17:33:41 +00:00
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
2006-01-07 13:09:30 +00:00
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
2005-10-16 17:33:41 +00:00
|
|
|
* (at your option) any later version.
|
2006-01-07 13:09:30 +00:00
|
|
|
*
|
2005-10-16 17:33:41 +00:00
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
2006-01-07 13:09:30 +00:00
|
|
|
*
|
2005-10-16 17:33:41 +00:00
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
2006-04-05 07:43:17 +00:00
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
2005-10-16 17:33:41 +00:00
|
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
|
|
|
*
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
* @file
|
|
|
|
|
* @ingroup Maintenance
|
2005-10-16 17:33:41 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
// Remember the directory the script was started from. This is captured
// before commandLine.inc is loaded below; the variable is not referenced
// again in the visible part of this file.
// NOTE(review): presumably kept because the bootstrap may change the
// working directory -- confirm against commandLine.inc.
$originalDir = getcwd();
|
|
|
|
|
|
2010-05-22 16:50:39 +00:00
|
|
|
require_once( dirname( __FILE__ ) . '/commandLine.inc' );
|
2008-01-08 23:58:16 +00:00
|
|
|
require_once( 'backup.inc' );
|
2006-05-12 11:14:40 +00:00
|
|
|
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
/**
 * Second pass of an XML dump: reads a stub dump (XML whose <text> elements
 * carry only an "id" attribute) and writes it back out with the revision
 * text filled in.
 *
 * Text is taken, in order of preference, from a prefetch dump (if one was
 * configured and its text length matches rev_len in the database), and
 * otherwise from the text table, either directly or through a spawned
 * fetchText.php subprocess.
 *
 * @ingroup Maintenance
 */
class TextPassDumper extends BackupDumper {
	/** Prefetch source (BaseDump) or null when --prefetch was not given */
	public $prefetch = null;
	/** URI of the stub dump to read */
	public $input = "php://stdin";
	/** WikiExporter::FULL or WikiExporter::CURRENT, set by --full / --current */
	public $history = WikiExporter::FULL;
	/** Number of texts requested so far */
	public $fetchCount = 0;
	/** Number of those requests that were satisfied from the prefetch dump */
	public $prefetchCount = 0;

	// Snapshot of the counters at the time of the previous progress report,
	// used to compute the "curr" (interval) rates in showReport().
	public $lastTime = 0;
	public $pageCountLast = 0;
	public $revCountLast = 0;
	public $prefetchCountLast = 0;
	public $fetchCountLast = 0;
	// Pages / revisions handled since the previous report (set by showReport()).
	public $pageCountPart = 0;
	public $revCountPart = 0;

	/** Failed attempts for the text currently being fetched */
	public $failures = 0;
	/** Attempts allowed per text before giving up on it */
	public $maxFailures = 5;
	/** Number of texts in a row that could not be retrieved at all */
	public $failedTextRetrievals = 0;
	/** Abort the whole dump after this many consecutive unretrievable texts */
	public $maxConsecutiveFailedTextRetrievals = 200;
	public $failureTimeout = 5; // Seconds to sleep after db failure

	/** PHP binary used for the subprocess; overridden by --spawn=<php> */
	public $php = "php";
	/** Whether text is fetched through a subprocess (--spawn) */
	public $spawn = false;
	// Subprocess handle and its pipes; false while no subprocess is open.
	public $spawnProc = false;
	public $spawnWrite = false;
	public $spawnRead = false;
	public $spawnErr = false;

	/** Process id of this dumper, shown in progress reports */
	public $ID = 0;

	// XML pass-through state, (re)initialised by readDump().
	/** Output XML accumulated since the last write to the egress filter */
	public $buffer = "";
	/** array( name, attribs ) of a start tag not yet written, or false */
	public $openElement = false;
	/** True until the first <page> has been seen */
	public $atStart = true;
	/** "page" or "revision": which kind of element was opened last */
	public $state = "";
	/** Name of the most recently opened element */
	public $lastName = "";
	/** Character data of the current page's <id> */
	public $thisPage = 0;
	/** Character data of the current revision's <id> */
	public $thisRev = 0;

	/**
	 * Initialise progress tracking and remember our process id.
	 *
	 * NOTE(review): $history is accepted but not forwarded to the parent;
	 * confirm whether BackupDumper::initProgress() should receive it so that
	 * --current affects the ETA base.
	 *
	 * @param $history Mixed: unused here
	 */
	function initProgress( $history ) {
		parent::initProgress();
		$this->ID = getmypid();
		// startTime is not set in this class; presumably parent::initProgress()
		// sets it -- TODO confirm.
		$this->lastTime = $this->startTime;
	}

	/**
	 * Run the text pass: read the stub dump from $this->input and write the
	 * completed dump through the egress filter.
	 *
	 * @param $history Mixed: ignored; $this->history (set from the command
	 *   line) is used instead
	 * @param $text Mixed: unused
	 */
	function dump( $history, $text = WikiExporter::TEXT ) {
		# This shouldn't happen if on console... ;)
		header( 'Content-type: text/html; charset=UTF-8' );

		# Notice messages will foul up your XML output even if they're
		# relatively harmless.
		if ( ini_get( 'display_errors' ) ) {
			ini_set( 'display_errors', 'stderr' );
		}

		$this->initProgress( $this->history );

		$this->db = $this->backupDb();

		$this->egress = new ExportProgressFilter( $this->sink, $this );

		$input = fopen( $this->input, "rt" );
		if ( $input === false ) {
			// Without this check an unreadable stub produced an empty dump
			// and a normal exit.
			wfDie( "Unable to open stub dump {$this->input}\n" );
			return;
		}
		$result = $this->readDump( $input );
		fclose( $input );

		if ( WikiError::isError( $result ) ) {
			wfDie( $result->getMessage() );
		}

		if ( $this->spawnProc ) {
			$this->closeSpawn();
		}

		$this->report( true );
	}

	/**
	 * Handle the command line options specific to the text pass.
	 *
	 * @param $opt String: option name without the leading dashes
	 * @param $val Mixed: option value (stream type for file options, PHP
	 *   binary for --spawn)
	 * @param $param Mixed: option parameter (file URI list for file options)
	 */
	function processOption( $opt, $val, $param ) {
		global $IP;

		switch ( $opt ) {
		case 'prefetch':
			require_once "$IP/maintenance/backupPrefetch.inc";
			$this->prefetch = new BaseDump( $this->processFileOpt( $val, $param ) );
			break;
		case 'stub':
			$this->input = $this->processFileOpt( $val, $param );
			break;
		case 'current':
			$this->history = WikiExporter::CURRENT;
			break;
		case 'full':
			$this->history = WikiExporter::FULL;
			break;
		case 'spawn':
			$this->spawn = true;
			if ( $val ) {
				$this->php = $val;
			}
			break;
		}
	}

	/**
	 * Turn a "<type>:<file>[;<file>...]" option into stream URIs.
	 *
	 * @param $val String: stream type; "gzip", "bzip2" and "7zip" select a
	 *   decompressing stream wrapper, anything else leaves the URI as is
	 * @param $param String: one or more file URIs separated by ";"
	 * @return String: the URIs with the wrapper prefix applied, joined by ";"
	 */
	function processFileOpt( $val, $param ) {
		$wrappers = array(
			'gzip' => 'compress.zlib://',
			'bzip2' => 'compress.bzip2://',
			'7zip' => 'mediawiki.compress.7z://',
		);
		$prefix = ( is_string( $val ) && isset( $wrappers[$val] ) ) ? $wrappers[$val] : '';

		$newFileURIs = array();
		foreach ( explode( ';', (string)$param ) as $URI ) {
			$newFileURIs[] = $prefix . $URI;
		}
		return implode( ';', $newFileURIs );
	}

	/**
	 * Overridden to include prefetch ratio if enabled.
	 *
	 * Reports cumulative ("all") and since-last-report ("curr") rates, then
	 * snapshots the counters for the next interval.
	 */
	function showReport() {
		if ( !$this->prefetch ) {
			return parent::showReport();
		}

		if ( $this->reporting ) {
			$now = wfTimestamp( TS_DB );
			// One clock reading for both deltas and for the snapshot below.
			$nowTime = wfTime();
			$deltaAll = $nowTime - $this->startTime;
			$deltaPart = $nowTime - $this->lastTime;
			$this->pageCountPart = $this->pageCount - $this->pageCountLast;
			$this->revCountPart = $this->revCount - $this->revCountLast;
			$fetchCountPart = $this->fetchCount - $this->fetchCountLast;
			$prefetchCountPart = $this->prefetchCount - $this->prefetchCountLast;

			if ( $deltaAll ) {
				// No ETA can be extrapolated before the first revision or
				// when the total is unknown; avoid dividing by zero.
				if ( $this->revCount && $this->maxCount ) {
					$portion = $this->revCount / $this->maxCount;
					$eta = $this->startTime + $deltaAll / $portion;
					$etats = wfTimestamp( TS_DB, intval( $eta ) );
				} else {
					$etats = '-';
				}
				$fetchRate = $this->fetchCount
					? 100.0 * $this->prefetchCount / $this->fetchCount
					: '-';
				$pageRate = $this->pageCount / $deltaAll;
				$revRate = $this->revCount / $deltaAll;
			} else {
				$pageRate = '-';
				$revRate = '-';
				$etats = '-';
				$fetchRate = '-';
			}

			if ( $deltaPart ) {
				// Ratio over this interval only, matching the page and
				// revision "curr" rates.
				$fetchRatePart = $fetchCountPart
					? 100.0 * $prefetchCountPart / $fetchCountPart
					: '-';
				$pageRatePart = $this->pageCountPart / $deltaPart;
				$revRatePart = $this->revCountPart / $deltaPart;
			} else {
				$fetchRatePart = '-';
				$pageRatePart = '-';
				$revRatePart = '-';
			}

			$this->progress( sprintf(
				"%s: %s (ID %d) %d pages (%0.1f|%0.1f/sec all|curr), %d revs (%0.1f|%0.1f/sec all|curr), %0.1f%%|%0.1f%% prefetched (all|curr), ETA %s [max %d]",
				$now, wfWikiID(), $this->ID,
				$this->pageCount, $pageRate, $pageRatePart,
				$this->revCount, $revRate, $revRatePart,
				$fetchRate, $fetchRatePart,
				$etats, $this->maxCount
			) );

			// lastTime must be a wfTime() value, not the TS_DB string,
			// because it is subtracted from wfTime() on the next report.
			$this->lastTime = $nowTime;
			$this->pageCountLast = $this->pageCount;
			$this->revCountLast = $this->revCount;
			$this->prefetchCountLast = $this->prefetchCount;
			$this->fetchCountLast = $this->fetchCount;
		}
	}

	/**
	 * Stream the stub dump through an XML parser whose handlers
	 * (startElement, endElement, characterData) produce the output.
	 *
	 * @param $input Resource: readable stream containing the stub dump
	 * @return Mixed: true on success, a WikiXmlError on a parse failure
	 */
	function readDump( $input ) {
		$this->buffer = "";
		$this->openElement = false;
		$this->atStart = true;
		$this->state = "";
		$this->lastName = "";
		$this->thisPage = 0;
		$this->thisRev = 0;

		$parser = xml_parser_create( "UTF-8" );
		xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );

		xml_set_element_handler( $parser, array( $this, 'startElement' ), array( $this, 'endElement' ) );
		xml_set_character_data_handler( $parser, array( $this, 'characterData' ) );

		$offset = 0; // for context extraction on error reporting
		$bufferSize = 512 * 1024;
		do {
			$chunk = fread( $input, $bufferSize );
			if ( !xml_parse( $parser, $chunk, feof( $input ) ) ) {
				wfDebug( __METHOD__ . " encountered XML parsing error\n" );
				// The parser is handed to the error object, so it is not
				// freed on this path.
				return new WikiXmlError( $parser, 'XML import parse failure', $chunk, $offset );
			}
			$offset += strlen( $chunk );
		} while ( $chunk !== false && !feof( $input ) );
		xml_parser_free( $parser );

		return true;
	}

	/**
	 * Get the text for the revision currently being processed.
	 *
	 * @param $id Mixed: text id taken from the stub's <text id="...">
	 * @return String: revision text
	 */
	function getText( $id ) {
		$this->fetchCount++;
		if ( isset( $this->prefetch ) ) {
			$text = $this->prefetch->prefetch( $this->thisPage, $this->thisRev );
			// presumably null means the entry is missing from the prefetch
			// dump -- see BaseDump::prefetch()
			if ( $text !== null ) {
				$dbr = wfGetDB( DB_SLAVE );
				$revID = intval( $this->thisRev );
				$revLength = $dbr->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
				// if length of rev text in file doesn't match length in db, we reload
				// this avoids carrying forward broken data from previous xml dumps
				if ( strlen( $text ) == $revLength ) {
					$this->prefetchCount++;
					return $text;
				}
			}
		}
		return $this->doGetText( $id );
	}

	/**
	 * Fetch a text from storage, retrying up to $maxFailures times with a
	 * pause of $failureTimeout seconds between attempts.
	 *
	 * @param $id Mixed: text id
	 * @return String: the text, or "" when every attempt failed
	 * @throws MWException once more than $maxConsecutiveFailedTextRetrievals
	 *   texts in a row could not be retrieved
	 */
	private function doGetText( $id ) {
		$id = intval( $id );
		$this->failures = 0;

		while ( true ) {
			if ( $this->spawn ) {
				if ( $this->failures ) {
					// we don't know why it failed, could be the child process
					// borked, could be db entry busted, could be db server out to lunch,
					// so cover all bases
					$this->closeSpawn();
					$this->openSpawn();
				}
				$text = $this->getTextSpawned( $id );
			} else {
				$text = $this->getTextDbSafe( $id );
			}

			if ( $text !== false ) {
				$this->failedTextRetrievals = 0;
				return $text;
			}

			$this->failures++;
			if ( $this->failures <= $this->maxFailures ) {
				$this->progress( "Error $this->failures " .
					"of allowed $this->maxFailures retrieving revision text for text id $id! " .
					"Pausing $this->failureTimeout seconds before retry..." );
				sleep( $this->failureTimeout );
				continue;
			}

			$this->progress( "Failed to retrieve revision text for text id " .
				"$id after $this->maxFailures tries, giving up" );
			// were there so many bad retrievals in a row we want to bail?
			// at some point we have to declare the dump irretrievably broken
			$this->failedTextRetrievals++;
			if ( $this->failedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
				throw new MWException( "Graceful storage failure" );
			}
			// would be nice to return something better to the caller someday,
			// log what we know about the failure and about the revision
			return "";
		}
	}

	/**
	 * Fetch a text revision from the database, converting a DBQueryError
	 * into a false return. This makes a single attempt; retrying is done by
	 * doGetText().
	 *
	 * @param $id Integer: text id
	 * @return Mixed: normalized text, or false on failure
	 */
	private function getTextDbSafe( $id ) {
		try {
			return $this->getTextDb( $id );
		} catch ( DBQueryError $ex ) {
			return false;
		}
	}

	/**
	 * Load and normalize one row of the text table.
	 * May throw a database error if, say, the server dies during query.
	 *
	 * @param $id Integer: old_id of the row
	 * @return Mixed: text with "\r" removed and passed through
	 *   $wgContLang->normalize(), or false if Revision::getRevisionText()
	 *   returned false
	 */
	private function getTextDb( $id ) {
		global $wgContLang;
		$row = $this->db->selectRow(
			'text',
			array( 'old_text', 'old_flags' ),
			array( 'old_id' => $id ),
			__METHOD__
		);
		$text = Revision::getRevisionText( $row );
		if ( $text === false ) {
			return false;
		}
		$stripped = str_replace( "\r", "", $text );
		return $wgContLang->normalize( $stripped );
	}

	/**
	 * Fetch a text through the subprocess, starting it on first use.
	 * PHP warnings are suppressed for the duration of the call.
	 *
	 * @param $id Integer: text id
	 * @return Mixed: normalized text, or false on failure
	 */
	private function getTextSpawned( $id ) {
		wfSuppressWarnings();
		if ( !$this->spawnProc ) {
			// First time?
			$this->openSpawn();
		}
		// If the spawn failed there are no pipes to talk to; report a
		// failure so that doGetText() retries.
		$text = $this->spawnProc ? $this->getTextSpawnedOnce( $id ) : false;
		wfRestoreWarnings();
		return $text;
	}

	/**
	 * Start the fetchText.php subprocess with pipes on its stdin and stdout;
	 * its stderr is appended to /dev/null.
	 *
	 * @return Boolean: true if the subprocess was started
	 */
	function openSpawn() {
		global $IP;

		$cmd = implode( " ",
			array_map( 'wfEscapeShellArg',
				array(
					$this->php,
					"$IP/maintenance/fetchText.php",
					'--wiki', wfWikiID() ) ) );
		$spec = array(
			0 => array( "pipe", "r" ),
			1 => array( "pipe", "w" ),
			2 => array( "file", "/dev/null", "a" ) );
		$pipes = array();

		$this->progress( "Spawning database subprocess: $cmd" );
		$this->spawnProc = proc_open( $cmd, $spec, $pipes );
		if ( !$this->spawnProc ) {
			$this->progress( "Subprocess spawn failed." );
			return false;
		}
		$this->spawnWrite = $pipes[0]; // -> stdin
		$this->spawnRead = $pipes[1]; // <- stdout

		return true;
	}

	/**
	 * Close the subprocess pipes and then the subprocess itself, resetting
	 * all handles to false. Warnings are suppressed while doing so.
	 */
	private function closeSpawn() {
		wfSuppressWarnings();
		if ( $this->spawnRead ) {
			fclose( $this->spawnRead );
		}
		$this->spawnRead = false;
		if ( $this->spawnWrite ) {
			fclose( $this->spawnWrite );
		}
		$this->spawnWrite = false;
		if ( $this->spawnErr ) {
			fclose( $this->spawnErr );
		}
		$this->spawnErr = false;
		if ( $this->spawnProc ) {
			// The handle comes from proc_open(), so it has to be released
			// with proc_close(); pclose() only accepts popen() handles.
			proc_close( $this->spawnProc );
		}
		$this->spawnProc = false;
		wfRestoreWarnings();
	}

	/**
	 * One request/response exchange with the subprocess.
	 *
	 * Writes "<id>\n", then reads back a line holding the text id, a line
	 * holding the byte length, and finally that many bytes of text.
	 *
	 * @param $id Integer: text id
	 * @return Mixed: normalized text, or false if the write failed, the
	 *   echoed id differs, the length is negative or the text is incomplete
	 */
	private function getTextSpawnedOnce( $id ) {
		global $wgContLang;

		$ok = fwrite( $this->spawnWrite, "$id\n" );
		// $this->progress( ">> $id" );
		if ( !$ok ) {
			return false;
		}

		$ok = fflush( $this->spawnWrite );
		// $this->progress( ">> [flush]" );
		if ( !$ok ) {
			return false;
		}

		// check that the text id they are sending is the one we asked for
		// this avoids out of sync revision text errors we have encountered in the past
		$newId = fgets( $this->spawnRead );
		if ( $newId === false ) {
			return false;
		}
		if ( $id != intval( $newId ) ) {
			return false;
		}

		$len = fgets( $this->spawnRead );
		// $this->progress( "<< " . trim( $len ) );
		if ( $len === false ) {
			return false;
		}

		$nbytes = intval( $len );
		// actual error, not zero-length text
		if ( $nbytes < 0 ) {
			return false;
		}

		$text = "";

		// Subprocess may not send everything at once, we have to loop.
		while ( $nbytes > strlen( $text ) ) {
			$buffer = fread( $this->spawnRead, $nbytes - strlen( $text ) );
			// fread() returns "" rather than false at end of file, so an
			// empty read has to end the loop as well; otherwise a child
			// that died mid-record would make this spin forever.
			if ( $buffer === false || $buffer === '' ) {
				break;
			}
			$text .= $buffer;
		}

		$gotbytes = strlen( $text );
		if ( $gotbytes != $nbytes ) {
			$this->progress( "Expected $nbytes bytes from database subprocess, got $gotbytes " );
			return false;
		}

		// Do normalization in the dump thread...
		$stripped = str_replace( "\r", "", $text );
		return $wgContLang->normalize( $stripped );
	}

	/**
	 * XML parser callback for a start tag.
	 *
	 * The tag is not written immediately but kept in $openElement, so that
	 * endElement() can tell an empty element from one with content. A
	 * <text> element carrying an "id" attribute is replaced by a <text
	 * xml:space="preserve"> element holding the fetched text.
	 *
	 * @param $parser Resource: the XML parser
	 * @param $name String: element name
	 * @param $attribs Array: element attributes
	 */
	function startElement( $parser, $name, $attribs ) {
		$this->clearOpenElement( null );
		$this->lastName = $name;

		if ( $name == 'revision' ) {
			$this->state = $name;
			$this->egress->writeOpenPage( null, $this->buffer );
			$this->buffer = "";
		} elseif ( $name == 'page' ) {
			$this->state = $name;
			if ( $this->atStart ) {
				// Everything buffered before the first page is the stream header.
				$this->egress->writeOpenStream( $this->buffer );
				$this->buffer = "";
				$this->atStart = false;
			}
		}

		if ( $name == "text" && isset( $attribs['id'] ) ) {
			$text = $this->getText( $attribs['id'] );
			$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
			if ( strlen( $text ) > 0 ) {
				$this->characterData( $parser, $text );
			}
		} else {
			$this->openElement = array( $name, $attribs );
		}
	}

	/**
	 * XML parser callback for an end tag: emits the closing markup and
	 * hands the buffer to the egress filter when a revision, a page or the
	 * whole document ends.
	 *
	 * @param $parser Resource: the XML parser
	 * @param $name String: element name
	 */
	function endElement( $parser, $name ) {
		if ( $this->openElement ) {
			// Start tag still pending, i.e. the element had no content.
			$this->clearOpenElement( "" );
		} else {
			$this->buffer .= "</$name>";
		}

		if ( $name == 'revision' ) {
			$this->egress->writeRevision( null, $this->buffer );
			$this->buffer = "";
			$this->thisRev = "";
		} elseif ( $name == 'page' ) {
			$this->egress->writeClosePage( $this->buffer );
			$this->buffer = "";
			$this->thisPage = "";
		} elseif ( $name == 'mediawiki' ) {
			$this->egress->writeCloseStream( $this->buffer );
			$this->buffer = "";
		}
	}

	/**
	 * XML parser callback for character data; also called directly by
	 * startElement() to insert fetched revision text.
	 *
	 * Data inside an <id> element is additionally collected as the current
	 * revision or page id, depending on $state.
	 *
	 * @param $parser Resource: the XML parser
	 * @param $data String: unescaped character data
	 */
	function characterData( $parser, $data ) {
		$this->clearOpenElement( null );
		if ( $this->lastName == "id" ) {
			if ( $this->state == "revision" ) {
				$this->thisRev .= $data;
			} elseif ( $this->state == "page" ) {
				$this->thisPage .= $data;
			}
		}
		$this->buffer .= htmlspecialchars( $data );
	}

	/**
	 * Write out the pending start tag, if there is one.
	 *
	 * @param $style Mixed: passed to Xml::element() as the element content;
	 *   callers use null when content follows and "" for an empty element
	 */
	function clearOpenElement( $style ) {
		if ( $this->openElement ) {
			$this->buffer .= Xml::element( $this->openElement[0], $this->openElement[1], $style );
			$this->openElement = false;
		}
	}
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Entry point: run the text pass, or print usage when --help was given.
// $argv and $options are not defined in this file; presumably they come
// from commandLine.inc -- TODO confirm.
$dumper = new TextPassDumper( $argv );

if ( isset( $options['help'] ) ) {
	// The usage text lists every option handled by
	// TextPassDumper::processOption(), including --full and the stream
	// types understood by processFileOpt().
	$dumper->progress( <<<ENDS
This script postprocesses XML dumps from dumpBackup.php to add
page text which was stubbed out (using --stub).

XML input is accepted on stdin.
XML output is sent to stdout; progress reports are sent to stderr.

Usage: php dumpTextPass.php [<options>]
Options:
  --stub=<type>:<file> To load a compressed stub dump instead of stdin
  --prefetch=<type>:<file> Use a prior dump file as a text source, to save
              pressure on the database.
              (Requires the XMLReader extension)
              <type> is one of: file, gzip, bzip2, 7zip
  --quiet     Don't dump status reports to stderr.
  --report=n  Report position and speed after every n pages processed.
              (Default: 100)
  --server=h  Force reading from MySQL server h
  --current   Base ETA on number of pages in database instead of all revisions
  --full      Base ETA on number of revisions in database (default)
  --spawn     Spawn a subprocess for loading text records
  --help      Display this help message
ENDS
	);
} else {
	// The argument is ignored by TextPassDumper::dump(), which uses the
	// history mode selected on the command line.
	$dumper->dump( true );
}
|
|
|
|
|
|
2007-06-29 01:19:14 +00:00
|
|
|
|