Add findBadBlobs script.

This script scans for content blobs that can't be loaded due to
database corruption, and can change their entry in the content table
to an address starting with "bad:". Such addresses cause the content
to be read as empty, with no log entry. This is useful to avoid
errors and log spam due to known bad revisions.

The script is designed to scan a limited number of revisions from a
given start date. The assumption is that database corruption is
generally caused by an intermittent bug or system failure which will
affect many revisions over a short period of time.

Bug: T205936
Change-Id: I6f513133e90701bee89d63efa618afc3f91c2d2b
This commit is contained in:
daniel 2020-03-30 22:20:27 +02:00
parent 412d9c8bbc
commit 071ce36abd
5 changed files with 727 additions and 97 deletions

View file

@ -531,6 +531,7 @@ $wgAutoloadLocalClasses = [
'FileOpBatch' => __DIR__ . '/includes/libs/filebackend/FileOpBatch.php',
'FileOpPerfTest' => __DIR__ . '/maintenance/fileOpPerfTest.php',
'FileRepo' => __DIR__ . '/includes/filerepo/FileRepo.php',
'FindBadBlobs' => __DIR__ . '/maintenance/findBadBlobs.php',
'FindDeprecated' => __DIR__ . '/maintenance/findDeprecated.php',
'FindHooks' => __DIR__ . '/maintenance/findHooks.php',
'FindMissingFiles' => __DIR__ . '/maintenance/findMissingFiles.php',

View file

@ -1315,6 +1315,57 @@ class RevisionStore
$queryFlags = 0,
Title $title = null,
array $overrides = []
) {
return $this->newRevisionFromArchiveRowAndSlots( $row, null, $queryFlags, $title, $overrides );
}
/**
* @see RevisionFactory::newRevisionFromRow
*
* MCR migration note: this replaces Revision::newFromRow
*
* @param object $row A database row generated from a query based on getQueryInfo()
* @param int $queryFlags
* @param Title|null $title Preloaded title object based on Title::newFromRow from database row
* when query was build with option 'page' on getQueryInfo
* @param bool $fromCache if true, the returned RevisionRecord will ensure that no stale
* data is returned from getters, by querying the database as needed
* @return RevisionRecord
*/
public function newRevisionFromRow(
$row,
$queryFlags = 0,
Title $title = null,
$fromCache = false
) {
return $this->newRevisionFromRowAndSlots( $row, null, $queryFlags, $title, $fromCache );
}
/**
* @see newRevisionFromArchiveRow()
* @since 1.35
*
* @param object $row
* @param null|object[]|RevisionSlots $slots
* - Database rows generated from a query based on getSlotsQueryInfo
* with the 'content' flag set. Or
* - RevisionSlots instance
* @param int $queryFlags
* @param Title|null $title
* @param array $overrides associative array with fields of $row to override. This may be
* used e.g. to force the parent revision ID or page ID. Keys in the array are fields
* names from the archive table without the 'ar_' prefix, i.e. use 'parent_id' to
* override ar_parent_id.
*
* @return RevisionRecord
* @throws MWException
*/
public function newRevisionFromArchiveRowAndSlots(
$row,
$slots,
$queryFlags = 0,
Title $title = null,
array $overrides = []
) {
Assert::parameterType( 'object', $row, '$row' );
@ -1366,38 +1417,20 @@ class RevisionStore
// Legacy because $row may have come from self::selectFields()
$comment = $this->commentStore->getCommentLegacy( $db, 'ar_comment', $row, true );
$slots = $this->newRevisionSlots( $row->ar_rev_id, $row, null, $queryFlags, $title );
if ( !( $slots instanceof RevisionSlots ) ) {
$slots = $this->newRevisionSlots( $row->ar_rev_id, $row, $slots, $queryFlags, $title );
}
return new RevisionArchiveRecord( $title, $user, $comment, $row, $slots, $this->dbDomain );
}
/**
* @see RevisionFactory::newRevisionFromRow
* @see newFromRevisionRow()
*
* MCR migration note: this replaces Revision::newFromRow
*
* @param object $row A database row generated from a query based on getQueryInfo()
* @param int $queryFlags
* @param Title|null $title Preloaded title object based on Title::newFromRow from database row
* when query was build with option 'page' on getQueryInfo
* @param bool $fromCache if true, the returned RevisionRecord will ensure that no stale
* data is returned from getters, by querying the database as needed
* @return RevisionRecord
*/
public function newRevisionFromRow(
$row,
$queryFlags = 0,
Title $title = null,
$fromCache = false
) {
return $this->newRevisionFromRowAndSlots( $row, null, $queryFlags, $title, $fromCache );
}
/**
* @param object $row A database row generated from a query based on getQueryInfo()
* @param null|object[]|RevisionSlots $slots
* - Database rows generated from a query based on getSlotsQueryInfo
* with the 'content' flag set. Or
* - Database rows generated from a query based on getSlotsQueryInfo
* with the 'content' flag set. Or
* - RevisionSlots instance
* @param int $queryFlags
* @param Title|null $title
@ -1407,8 +1440,6 @@ class RevisionStore
* @return RevisionRecord
* @throws MWException
* @see RevisionFactory::newRevisionFromRow
*
* MCR migration note: this replaces Revision::newFromRow
*/
public function newRevisionFromRowAndSlots(
$row,
@ -1475,14 +1506,19 @@ class RevisionStore
/**
* Construct a RevisionRecord instance for each row in $rows,
* and return them as an associative array indexed by revision ID.
* Use getQueryInfo() or getArchiveQueryInfo() to construct the
* query that produces the rows.
*
* @param Traversable|array $rows the rows to construct revision records from
* @param array $options Supports the following options:
* 'slots' - whether metadata about revision slots should be
* loaded immediately. Supports falsy or truthy value as well
* as an explicit list of slot role names. The main slot will
* always be loaded.
* 'content'- whether the actual content of the slots should be
* 'content' - whether the actual content of the slots should be
* preloaded.
* 'archive' - whether the rows were generated using getArchiveQueryInfo(),
* rather than getQueryInfo.
* @param int $queryFlags
* @param Title|null $title The title to which all the revision rows belong, if there
* is such a title and the caller has it handy, so we don't have to look it up again.
@ -1499,33 +1535,63 @@ class RevisionStore
Title $title = null
) {
$result = new StatusValue();
$archiveMode = $options['archive'] ?? false;
if ( $archiveMode ) {
$revIdField = 'ar_rev_id';
} else {
$revIdField = 'rev_id';
}
$rowsByRevId = [];
$pageIdsToFetchTitles = [];
$titlesByPageId = [];
$titlesByPageKey = [];
foreach ( $rows as $row ) {
if ( isset( $rowsByRevId[$row->rev_id] ) ) {
if ( isset( $rowsByRevId[$row->$revIdField] ) ) {
$result->warning(
'internalerror',
"Duplicate rows in newRevisionsFromBatch, rev_id {$row->rev_id}"
"Duplicate rows in newRevisionsFromBatch, $revIdField {$row->$revIdField}"
);
}
if ( $title && $row->rev_page != $title->getArticleID() ) {
throw new InvalidArgumentException(
"Revision {$row->rev_id} doesn't belong to page {$title->getArticleID()}"
);
} elseif ( !$title && !isset( $titlesByPageId[ $row->rev_page ] ) ) {
if ( isset( $row->page_namespace ) && isset( $row->page_title ) &&
// This should not happen, but just in case we don't have a page_id
// set or it doesn't match rev_page, let's fetch the title again.
isset( $row->page_id ) && $row->rev_page === $row->page_id
// Attach a page key to the row, so we can find and reuse Title objects easily.
$row->_page_key =
$archiveMode ? $row->ar_namespace . ':' . $row->ar_title : $row->rev_page;
if ( $title ) {
if ( !$archiveMode && $row->rev_page != $title->getArticleID() ) {
throw new InvalidArgumentException(
"Revision {$row->$revIdField} doesn't belong to page "
. $title->getArticleID()
);
}
if ( $archiveMode
&& ( $row->ar_namespace != $title->getNamespace()
|| $row->ar_title !== $title->getDBkey() )
) {
$titlesByPageId[ $row->rev_page ] = Title::newFromRow( $row );
throw new InvalidArgumentException(
"Revision {$row->$revIdField} doesn't belong to page "
. $title->getPrefixedDBkey()
);
}
} elseif ( !isset( $titlesByPageKey[ $row->_page_key ] ) ) {
if ( isset( $row->page_namespace ) && isset( $row->page_title )
// This should always be true, but just in case we don't have a page_id
// set or it doesn't match rev_page, let's fetch the title again.
&& isset( $row->page_id ) && isset( $row->rev_page )
&& $row->rev_page === $row->page_id
) {
$titlesByPageKey[ $row->_page_key ] = Title::newFromRow( $row );
} elseif ( $archiveMode ) {
// Can't look up deleted pages by ID, but we have namespace and title
$titlesByPageKey[ $row->_page_key ] =
Title::makeTitle( $row->ar_namespace, $row->ar_title );
} else {
$pageIdsToFetchTitles[] = $row->rev_page;
}
}
$rowsByRevId[$row->rev_id] = $row;
$rowsByRevId[$row->$revIdField] = $row;
}
if ( empty( $rowsByRevId ) ) {
@ -1535,28 +1601,45 @@ class RevisionStore
// If the title is not supplied, batch-fetch Title objects.
if ( $title ) {
$titlesByPageId[$title->getArticleID()] = $title;
// same logic as for $row->_page_key above
$pageKey = $archiveMode
? $title->getNamespace() . ':' . $title->getDBkey()
: $title->getArticleID();
$titlesByPageKey[$pageKey] = $title;
} elseif ( !empty( $pageIdsToFetchTitles ) ) {
// Note: when we fetch titles by ID, the page key is also the ID.
// We should never get here if $archiveMode is true.
Assert::invariant( !$archiveMode, 'Titles are not loaded by ID in archive mode.' );
$pageIdsToFetchTitles = array_unique( $pageIdsToFetchTitles );
foreach ( Title::newFromIDs( $pageIdsToFetchTitles ) as $t ) {
$titlesByPageId[$t->getArticleID()] = $t;
$titlesByPageKey[$t->getArticleID()] = $t;
}
}
// which method to use for creating RevisionRecords
$newRevisionRecord = [
$this,
$archiveMode ? 'newRevisionFromArchiveRowAndSlots' : 'newRevisionFromRowAndSlots'
];
if ( !isset( $options['slots'] ) ) {
$result->setResult( true,
array_map( function ( $row ) use ( $queryFlags, $titlesByPageId, $result ) {
try {
return $this->newRevisionFromRow(
$row,
$queryFlags,
$titlesByPageId[$row->rev_page]
);
} catch ( MWException $e ) {
$result->warning( 'internalerror', $e->getMessage() );
return null;
}
}, $rowsByRevId )
$result->setResult(
true,
array_map(
function ( $row )
use ( $queryFlags, $titlesByPageKey, $result, $newRevisionRecord ) {
try {
return $newRevisionRecord( $row, null, $queryFlags,
$titlesByPageKey[ $row->_page_key ] ?? null );
} catch ( MWException $e ) {
$result->warning( 'internalerror', $e->getMessage() );
return null;
}
},
$rowsByRevId
)
);
return $result;
}
@ -1578,34 +1661,42 @@ class RevisionStore
$result->merge( $slotRowsStatus );
$slotRowsByRevId = $slotRowsStatus->getValue();
$result->setResult( true, array_map( function ( $row ) use
( $slotRowsByRevId, $queryFlags, $titlesByPageId, $result ) {
if ( !isset( $slotRowsByRevId[$row->rev_id] ) ) {
$result->warning(
'internalerror',
"Couldn't find slots for rev {$row->rev_id}"
);
return null;
}
try {
return $this->newRevisionFromRowAndSlots(
$row,
new RevisionSlots(
$this->constructSlotRecords(
$row->rev_id,
$slotRowsByRevId[$row->rev_id],
$queryFlags,
$titlesByPageId[$row->rev_page]
)
),
$queryFlags,
$titlesByPageId[$row->rev_page]
);
} catch ( MWException $e ) {
$result->warning( 'internalerror', $e->getMessage() );
return null;
}
}, $rowsByRevId ) );
$result->setResult(
true,
array_map(
function ( $row )
use ( $slotRowsByRevId, $queryFlags, $titlesByPageKey, $result,
$revIdField, $newRevisionRecord
) {
if ( !isset( $slotRowsByRevId[$row->$revIdField] ) ) {
$result->warning(
'internalerror',
"Couldn't find slots for rev {$row->$revIdField}"
);
return null;
}
try {
return $newRevisionRecord(
$row,
new RevisionSlots(
$this->constructSlotRecords(
$row->$revIdField,
$slotRowsByRevId[$row->$revIdField],
$queryFlags,
$titlesByPageKey[$row->_page_key] ?? null
)
),
$queryFlags,
$titlesByPageKey[$row->_page_key]
);
} catch ( MWException $e ) {
$result->warning( 'internalerror', $e->getMessage() );
return null;
}
},
$rowsByRevId
)
);
return $result;
}
@ -1615,11 +1706,12 @@ class RevisionStore
* Callers are responsible for unserializing and interpreting the content blobs
* based on the model_name and role_name fields.
*
* @param Traversable|array $rowsOrIds list of revision ids, or revision rows from a db query.
* @param Traversable|array $rowsOrIds list of revision ids, or revision or archive rows
* from a db query.
* @param array $options Supports the following options:
* 'slots' - a list of slot role names to fetch. If omitted or true or null,
* all slots are fetched
* 'blobs'- whether the serialized content of each slot should be loaded.
* 'blobs' - whether the serialized content of each slot should be loaded.
* If true, the serialized content will be present in the slot row
* in the blob_data field.
* @param int $queryFlags
@ -1640,7 +1732,11 @@ class RevisionStore
$revIds = [];
foreach ( $rowsOrIds as $row ) {
$revIds[] = is_object( $row ) ? (int)$row->rev_id : (int)$row;
if ( is_object( $row ) ) {
$revIds[] = isset( $row->ar_rev_id ) ? (int)$row->ar_rev_id : (int)$row->rev_id;
} else {
$revIds[] = (int)$row;
}
}
// Nothing to do.
@ -2847,7 +2943,7 @@ class RevisionStore
*
* @param int $pageId The id of the page
* @param RevisionRecord|null $old Old revision.
* If null is provided, count starting from the first revision (inclusive).
* If null is provided, count starting from the first revision (inclusive).
* @param RevisionRecord|null $new New revision.
* If null is provided, count until the last revision (inclusive).
* @param User|null $user the user who's access rights to apply
@ -2857,7 +2953,7 @@ class RevisionStore
* 'include_new' Include $new in the range; $old is excluded.
* 'include_both' Include both $old and $new in the range.
* @throws InvalidArgumentException in case either revision is unsaved or
* the revisions do not belong to the same page or unknown option is passed.
* the revisions do not belong to the same page or unknown option is passed.
* @return UserIdentity[] Names of revision authors in the range
*/
public function getAuthorsBetween(
@ -2918,7 +3014,7 @@ class RevisionStore
*
* @param int $pageId The id of the page
* @param RevisionRecord|null $old Old revision .
* If null is provided, count starting from the first revision (inclusive).
* If null is provided, count starting from the first revision (inclusive).
* @param RevisionRecord|null $new New revision.
* If null is provided, count until the last revision (inclusive).
* @param User|null $user the user who's access rights to apply
@ -2928,7 +3024,7 @@ class RevisionStore
* 'include_new' Include $new in the range; $old is excluded.
* 'include_both' Include both $old and $new in the range.
* @throws InvalidArgumentException in case either revision is unsaved or
* the revisions do not belong to the same page or unknown option is passed.
* the revisions do not belong to the same page or unknown option is passed.
* @return int Number of revisions authors in the range.
*/
public function countAuthorsBetween(
@ -2952,7 +3048,7 @@ class RevisionStore
*
* @param int $pageId The id of the page
* @param RevisionRecord|null $old Old revision.
* If null is provided, count starting from the first revision (inclusive).
* If null is provided, count starting from the first revision (inclusive).
* @param RevisionRecord|null $new New revision.
* If null is provided, count until the last revision (inclusive).
* @param int|null $max Limit of Revisions to count, will be incremented to detect truncations.
@ -2961,7 +3057,7 @@ class RevisionStore
* 'include_new' Include $new in the range; $old is excluded.
* 'include_both' Include both $old and $new in the range.
* @throws InvalidArgumentException in case either revision is unsaved or
* the revisions do not belong to the same page.
* the revisions do not belong to the same page.
* @return int Number of revisions between these revisions.
*/
public function countRevisionsBetween(

View file

@ -345,7 +345,12 @@ class SqlBlobStore implements IDBAccessObject, BlobStore {
$result = [];
$errors = [];
foreach ( $blobAddresses as $blobAddress ) {
list( $schema, $id ) = self::splitBlobAddress( $blobAddress );
try {
list( $schema, $id ) = self::splitBlobAddress( $blobAddress );
} catch ( InvalidArgumentException $ex ) {
throw new BlobAccessException( $ex->getMessage(), 0, $ex );
}
//TODO: MCR: also support 'ex' schema with ExternalStore URLs, plus flags encoded in the URL!
if ( $schema === 'bad' ) {
// Database row was marked as "known bad", no need to trigger an error.

View file

@ -0,0 +1,371 @@
<?php
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\RevisionArchiveRecord;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\RevisionStore;
use MediaWiki\Revision\RevisionStoreRecord;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Storage\BlobAccessException;
use MediaWiki\Storage\BlobStore;
use Wikimedia\Rdbms\LBFactory;
use Wikimedia\Rdbms\LoadBalancer;
require_once __DIR__ . '/cleanupTable.inc';
/**
* Maintenance script for finding and marking bad content blobs.
*
* @ingroup Maintenance
*/
class FindBadBlobs extends Maintenance {
/**
* @var RevisionStore|null
*/
private $revisionStore;
/**
* @var BlobStore|null
*/
private $blobStore;
/**
* @var LoadBalancer|null
*/
private $loadBalancer;
/**
* @var LBFactory
*/
private $lbFactory;
/**
 * Sets the default batch size, the script description, and declares the
 * 'from-date', 'limit' and 'mark' command line options.
 */
public function __construct() {
	parent::__construct();

	// Help texts, kept apart from the option declarations for readability.
	$fromDateHelp = 'Start scanning revisions at the given date. '
		. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DD_HH:MM:SS';
	$limitHelp = 'Maximum number of revisions to scan. Default: 1000';
	$markHelp = 'Mark the blob as "known bad", to avoid errors when '
		. 'attempting to read it. The value given is the reason for marking the blob as bad, '
		. 'typically a ticket ID';

	$this->setBatchSize( 1000 );
	$this->addDescription( 'Scan for bad content blobs' );

	$this->addOption( 'from-date', $fromDateHelp, true, true );
	$this->addOption( 'limit', $limitHelp, false, true );
	$this->addOption( 'mark', $markHelp, false, true );
}
/**
 * Sets up the services this script works with.
 *
 * For each service, an explicitly passed instance takes precedence, then an
 * instance already stored on this object, and only if neither is available
 * the default from MediaWikiServices is fetched.
 *
 * @param RevisionStore|null $revisionStore
 * @param BlobStore|null $blobStore
 * @param LoadBalancer|null $loadBalancer
 * @param LBFactory|null $lbFactory
 */
public function initializeServices(
	?RevisionStore $revisionStore = null,
	?BlobStore $blobStore = null,
	?LoadBalancer $loadBalancer = null,
	?LBFactory $lbFactory = null
) {
	$services = MediaWikiServices::getInstance();

	if ( $revisionStore !== null ) {
		$this->revisionStore = $revisionStore;
	} elseif ( $this->revisionStore === null ) {
		$this->revisionStore = $services->getRevisionStore();
	}

	if ( $blobStore !== null ) {
		$this->blobStore = $blobStore;
	} elseif ( $this->blobStore === null ) {
		$this->blobStore = $services->getBlobStore();
	}

	if ( $loadBalancer !== null ) {
		$this->loadBalancer = $loadBalancer;
	} elseif ( $this->loadBalancer === null ) {
		$this->loadBalancer = $services->getDBLoadBalancer();
	}

	if ( $lbFactory !== null ) {
		$this->lbFactory = $lbFactory;
	} elseif ( $this->lbFactory === null ) {
		$this->lbFactory = $services->getDBLoadBalancerFactory();
	}
}
/**
 * Reads the 'from-date' option and converts it to TS_MW format.
 *
 * Reports a fatal error if the given value is shorter than 14 characters
 * or if wfTimestamp() cannot convert it.
 *
 * @return string
 */
private function getStartTimestamp() {
	$fromDate = $this->getOption( 'from-date' );

	// Anything shorter than 14 characters cannot specify date and time down to the second.
	$tooShort = strlen( $fromDate ) < 14;
	if ( $tooShort ) {
		$this->fatalError(
			"Bad timestamp: {$fromDate}, please provide time and date down to the second."
		);
	}

	$converted = wfTimestamp( TS_MW, $fromDate );
	if ( !$converted ) {
		$this->fatalError( "Bad timestamp: {$fromDate}" );
	}

	return $converted;
}
/**
 * Entry point: sets up the services, validates the options and runs the scan.
 *
 * @inheritDoc
 */
public function execute() {
	$this->initializeServices();

	$fromTimestamp = $this->getStartTimestamp();

	// The limit is not validated anywhere else, and the scan does integer
	// arithmetic on it, so reject anything that is not a non-negative number
	// and hand over a real integer.
	$limit = $this->getOption( 'limit', 1000 );
	if ( !is_numeric( $limit ) || (int)$limit < 0 ) {
		$this->fatalError( 'Bad limit: ' . $limit . ', please provide a non-negative integer.' );
	}

	$this->scanRevisionsByTimestamp( $fromTimestamp, (int)$limit );
}
/**
 * Scans up to $total rows of the revision table in batches, starting at $fromTimestamp,
 * then scans the archive rows whose ar_rev_id falls into the range of revision IDs
 * that was covered by the first scan. Every revision found is passed to checkRevision().
 *
 * @param string $fromTimestamp
 * @param int $total
 *
 * @return int The sum of the values returned by checkRevision() for all scanned revisions
 */
private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
	$count = 0;
	$lastRevId = 0;
	$firstRevId = 0;
	$lastTimestamp = $fromTimestamp;
	// Paging cursor: ID of the last row of the previous batch. This must not be the
	// maximum ID seen so far ($lastRevId): rows are ordered by (rev_timestamp, rev_id),
	// so a row with the same timestamp as the last row but an ID below an earlier,
	// higher ID would otherwise be skipped by the next batch.
	$afterRevId = 0;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( "Scanning revisions table, "
		. "$total rows starting at rev_timestamp $fromTimestamp\n" );

	while ( $revisionRowsScanned < $total ) {
		$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadRevisionsByTimestamp( $afterRevId, $lastTimestamp, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
			$lastRevId = max( $lastRevId, $rev->getId() );

			$count += $this->checkRevision( $rev );
		}

		// $rev now is the last revision of this batch; continue right after it.
		$lastTimestamp = $rev->getTimestamp();
		$afterRevId = $rev->getId();

		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	// If $firstRevId is 0, the loop body above didn't execute,
	// so we should skip the one below as well.

	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	while ( $firstRevId > 0 && $fromArchived < $maxArchived ) {
		$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}

		/** @var RevisionRecord $rev */
		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}

		// Archive rows are ordered by ar_rev_id, so the last one has the highest ID.
		$fromArchived = $rev->getId();
		// Report the timestamp of the archived revision itself, not the one left over
		// from the scan of the revision table.
		$archivedTimestamp = $rev->getTimestamp();

		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($archivedTimestamp)\n"
		);

		$this->waitForReplication();
	}

	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "Marked $count bad revisions\n" );
	} else {
		$this->output( "Found $count bad revisions\n" );
	}

	$this->output( "The range of archive rows scanned is based on the range of revision IDs "
		. "scanned in the revision table.\n" );

	return $count;
}
/**
 * Loads a batch of revisions from the revision table, ordered by rev_timestamp and rev_id,
 * starting right after the position given by $fromTimestamp and $afterId.
 *
 * Rows for which no revision record could be constructed are reported on the
 * output and left out of the result.
 *
 * @param int $afterId
 * @param string $fromTimestamp
 * @param int $batchSize
 *
 * @return RevisionStoreRecord[]
 */
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
	$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
	$queryInfo = $this->revisionStore->getQueryInfo();
	$quotedTimestamp = $db->addQuotes( $fromTimestamp );
	$rows = $db->select(
		$queryInfo['tables'],
		$queryInfo['fields'],
		"rev_timestamp > $quotedTimestamp OR "
		. "(rev_timestamp = $quotedTimestamp AND rev_id > $afterId )",
		__METHOD__,
		[ 'LIMIT' => $batchSize, 'ORDER BY' => 'rev_timestamp, rev_id' ],
		$queryInfo['joins']
	);

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	if ( !$result->isOK() ) {
		$this->fatalError( Status::wrap( $result )->getMessage( false, false, 'en' )->text() );
	}

	if ( !$result->isGood() ) {
		// Warnings do not make the status fail, but they must not go unnoticed either.
		$this->output(
			"\t! Problems while loading revisions: "
			. Status::wrap( $result )->getMessage( false, false, 'en' )->text() . "\n"
		);
	}

	// newRevisionsFromBatch() puts null into the result for every row it could not
	// turn into a revision record; drop those so callers only get actual records.
	return array_filter( $result->value ?: [] );
}
/**
 * Loads a batch of archived revisions with $afterId < ar_rev_id <= $uptoId,
 * ordered by ar_rev_id.
 *
 * Rows for which no revision record could be constructed are reported on the
 * output and left out of the result.
 *
 * @param int $afterId
 * @param int $uptoId
 * @param int $batchSize
 *
 * @return RevisionArchiveRecord[]
 */
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	$db = $this->loadBalancer->getConnectionRef( DB_REPLICA );
	$queryInfo = $this->revisionStore->getArchiveQueryInfo();
	$rows = $db->select(
		$queryInfo['tables'],
		$queryInfo['fields'],
		[ "ar_rev_id > $afterId", "ar_rev_id <= $uptoId" ],
		__METHOD__,
		[ 'LIMIT' => $batchSize, 'ORDER BY' => 'ar_rev_id' ],
		$queryInfo['joins']
	);

	$result = $this->revisionStore->newRevisionsFromBatch(
		$rows,
		[ 'archive' => true, 'slots' => true ]
	);

	if ( !$result->isOK() ) {
		$this->fatalError( Status::wrap( $result )->getMessage( false, false, 'en' )->text() );
	}

	if ( !$result->isGood() ) {
		// Warnings do not make the status fail, but they must not go unnoticed either.
		$this->output(
			"\t! Problems while loading archived revisions: "
			. Status::wrap( $result )->getMessage( false, false, 'en' )->text() . "\n"
		);
	}

	// newRevisionsFromBatch() puts null into the result for every row it could not
	// turn into a revision record; drop those so callers only get actual records.
	return array_filter( $result->value ?: [] );
}
/**
 * Looks up the ID of the revision adjacent to $revId in the revision table,
 * in the direction given by $comp and $dir.
 *
 * @param int $revId
 * @param string $comp the comparator, either '<' or '>', to go with $dir
 * @param string $dir the sort direction to go with $comp, either 'ASC' or 'DESC'
 *
 * @return int The query result cast to int (presumably 0 if no row matches; to be confirmed)
 */
private function getNextRevision( int $revId, string $comp, string $dir ) {
	$condition = 'rev_id ' . $comp . ' ' . $revId;
	$options = [ 'ORDER BY' => 'rev_id ' . $dir ];

	$replica = $this->loadBalancer->getConnectionRef( DB_REPLICA );

	return (int)$replica->selectField( 'revision', 'rev_id', $condition, __METHOD__, $options );
}
/**
 * Checks every slot of the given revision.
 *
 * @param RevisionRecord $rev
 *
 * @return int The sum of the checkSlot() results over all slots of the revision
 */
private function checkRevision( RevisionRecord $rev ) {
	$perSlot = array_map(
		function ( SlotRecord $slot ) use ( $rev ) {
			return $this->checkSlot( $rev, $slot );
		},
		$rev->getSlots()->getSlots()
	);

	return array_sum( $perSlot );
}
/**
 * Tries to load the blob of the given slot. If loading fails, the failure is
 * reported on the output, and the blob is marked as bad if the 'mark' option is set.
 *
 * @param RevisionRecord $rev
 * @param SlotRecord $slot
 *
 * @return int 0 if the blob could be loaded, 1 otherwise
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	$address = $slot->getAddress();

	try {
		$this->blobStore->getBlob( $address );
		// The blob is readable, there is nothing to report.
		return 0;
	} catch ( BlobAccessException | ExternalStoreException $ex ) {
		$error = $ex->getMessage();
	}

	$revId = $rev->getId();
	$role = $slot->getRole();
	$contentId = $slot->getContentId();
	$this->output(
		"\t! Found bad blob on revision {$revId} ({$role} slot): "
		. "content_id={$contentId}, address=<{$address}>, error='{$error}'\n"
	);

	if ( !$this->hasOption( 'mark' ) ) {
		return 1;
	}

	$newAddress = $this->markBlob( $rev, $slot, $error );
	$this->output( "\tChanged address to <$newAddress>\n" );

	return 1;
}
/**
 * Replaces the address of the slot's content row with a "bad:" address.
 *
 * The new address consists of the URL-encoded old address (or "empty"), followed by
 * a query string holding the reason given via the 'mark' option and the error message,
 * as far as they are available.
 *
 * @param RevisionRecord $rev
 * @param SlotRecord $slot
 * @param string|null $error
 *
 * @return false|string The address written to the content table
 */
private function markBlob( RevisionRecord $rev, SlotRecord $slot, string $error = null ) {
	$query = [];

	if ( $this->hasOption( 'mark' ) ) {
		$query['reason'] = $this->getOption( 'mark' );
	}

	if ( $error ) {
		$query['error'] = $error;
	}

	$oldAddress = $slot->getAddress();
	$marked = 'bad:' . urlencode( $oldAddress ?: 'empty' );

	if ( $query ) {
		$marked .= '?' . wfArrayToCgi( $query );
	}

	// Cut off after 255 bytes (presumably the size limit of content_address; to be confirmed).
	$marked = substr( $marked, 0, 255 );

	$this->loadBalancer->getConnectionRef( DB_MASTER )->update(
		'content',
		[ 'content_address' => $marked ],
		[ 'content_id' => $slot->getContentId() ],
		__METHOD__
	);

	return $marked;
}
/**
 * Delegates to waitForReplication() of the load balancer factory and passes its result on.
 *
 * @return mixed Whatever LBFactory::waitForReplication() returns
 */
private function waitForReplication() {
	$outcome = $this->lbFactory->waitForReplication();

	return $outcome;
}
}
$maintClass = FindBadBlobs::class;
require_once RUN_MAINTENANCE_IF_MAIN;

View file

@ -907,7 +907,7 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromRowAndSlots
* @covers \MediaWiki\Revision\RevisionStore::getQueryInfo
*/
public function testNewRevisionFromRowAndSlot_getQueryInfo() {
public function testNewRevisionFromRowAndSlots_getQueryInfo() {
$page = $this->getTestPage();
$text = __METHOD__ . 'o-ö';
/** @var Revision $rev */
@ -1053,8 +1053,49 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
$this->assertSame( $text, $rev->getContent()->serialize() );
}
/**
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRowAndSlots
* @covers \MediaWiki\Revision\RevisionStore::getArchiveQueryInfo
*/
public function testNewRevisionFromArchiveRowAndSlots_getArchiveQueryInfo() {
$store = MediaWikiServices::getInstance()->getRevisionStore();
$title = Title::newFromText( __METHOD__ );
$text = __METHOD__ . '-bä';
$page = WikiPage::factory( $title );
/** @var Revision $orig */
$orig = $page->doEditContent( new WikitextContent( $text ), __METHOD__ )
->value['revision'];
$page->doDeleteArticleReal( __METHOD__, $this->getTestSysop()->getUser() );
$db = wfGetDB( DB_MASTER );
$arQuery = $store->getArchiveQueryInfo();
$res = $db->select(
$arQuery['tables'], $arQuery['fields'], [ 'ar_rev_id' => $orig->getId() ],
__METHOD__, [], $arQuery['joins']
);
$this->assertIsObject( $res, 'query failed' );
$info = $store->getSlotsQueryInfo( [ 'content' ] );
$slotRows = $this->db->select(
$info['tables'],
$info['fields'],
[ 'slot_revision_id' => $orig->getId() ],
__METHOD__,
[],
$info['joins']
);
$row = $res->fetchObject();
$res->free();
$record = $store->newRevisionFromArchiveRowAndSlots( $row, iterator_to_array( $slotRows ) );
$this->assertRevisionRecordMatchesRevision( $orig, $record );
$this->assertSame( $text, $record->getContent( SlotRecord::MAIN )->serialize() );
}
/**
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRow
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRowAndSlots
* @covers \MediaWiki\Revision\RevisionStore::getArchiveQueryInfo
*/
public function testNewRevisionFromArchiveRow_getArchiveQueryInfo() {
@ -1085,6 +1126,7 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
/**
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRow
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRowAndSlots
*/
public function testNewRevisionFromArchiveRow_legacyEncoding() {
$this->setMwGlobals( 'wgLegacyEncoding', 'windows-1252' );
@ -1115,6 +1157,7 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
/**
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRow
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRowAndSlots
*/
public function testNewRevisionFromArchiveRow_no_user() {
$store = MediaWikiServices::getInstance()->getRevisionStore();
@ -1155,6 +1198,7 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
* Test for T236624.
*
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRow
* @covers \MediaWiki\Revision\RevisionStore::newRevisionFromArchiveRowAndSlots
*/
public function testNewRevisionFromArchiveRow_empty_actor() {
$store = MediaWikiServices::getInstance()->getRevisionStore();
@ -2007,7 +2051,7 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
/**
* @dataProvider provideGetContentBlobsForBatchOptions
* @covers \MediaWiki\Revision\RevisionStore::newRevisionsFromBatch
* @covers \MediaWiki\Revision\RevisionStore::getContentBlobsForBatch
* @param array|null $slots
* @throws \MWException
*/
@ -2055,6 +2099,50 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
$this->assertSame( $text . '1', $mainSlotRow1->blob_data );
$this->assertSame( $text . '2', $mainSlotRow2->blob_data );
}
// try again, with objects instead of ids:
$result2 = $store->getContentBlobsForBatch( [
(object)[ 'rev_id' => $rev1->getId() ],
(object)[ 'rev_id' => $rev2->getId() ],
], $slots );
$this->assertTrue( $result2->isGood() );
$exp1 = var_export( $result->getValue(), true );
$exp2 = var_export( $result2->getValue(), true );
$this->assertSame( $exp1, $exp2 );
}
/**
* @covers \MediaWiki\Revision\RevisionStore::getContentBlobsForBatch
* @throws \MWException
*/
public function testGetContentBlobsForBatch_archive() {
$page1 = $this->getTestPage( __METHOD__ );
$text = __METHOD__ . 'b-ä';
$editStatus = $this->editPage( $page1->getTitle()->getPrefixedDBkey(), $text . '1' );
$this->assertTrue( $editStatus->isGood(), 'Sanity: must create revision 1' );
/** @var Revision $rev1 */
$rev1 = $editStatus->getValue()['revision'];
$page1->doDeleteArticleReal( __METHOD__, $this->getTestSysop()->getUser() );
$page2 = $this->getTestPage( $page1->getTitle()->getPrefixedText() . '_other' );
$editStatus = $this->editPage( $page2->getTitle()->getPrefixedDBkey(), $text . '2' );
$this->assertTrue( $editStatus->isGood(), 'Sanity: must create revision 2' );
/** @var Revision $rev2 */
$rev2 = $editStatus->getValue()['revision'];
$page2->doDeleteArticleReal( __METHOD__, $this->getTestSysop()->getUser() );
$store = MediaWikiServices::getInstance()->getRevisionStore();
$result = $store->getContentBlobsForBatch( [
(object)[ 'ar_rev_id' => $rev1->getId() ],
(object)[ 'ar_rev_id' => $rev2->getId() ],
] );
$this->assertTrue( $result->isGood() );
$this->assertSame( [], $result->getErrors() );
$rowSetsByRevId = $result->getValue();
$this->assertArrayHasKey( $rev1->getId(), $rowSetsByRevId );
$this->assertArrayHasKey( $rev2->getId(), $rowSetsByRevId );
}
/**
@ -2143,7 +2231,8 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
$this->revisionToRow( $rev1, $queryOptions ),
$this->revisionToRow( $rev2, $queryOptions )
],
$options
$options,
0, $otherPageTitle ? null : $page1->getTitle()
);
$this->assertTrue( $result->isGood() );
$this->assertSame( [], $result->getErrors() );
@ -2162,6 +2251,74 @@ abstract class RevisionStoreDbTestBase extends MediaWikiTestCase {
$records[$rev2->getId()]->getPageAsLinkTarget()->getDBkey() );
}
/**
* @dataProvider provideNewRevisionsFromBatchOptions
* @covers \MediaWiki\Revision\RevisionStore::newRevisionsFromBatch
* @param array|null $queryOptions options to provide to revisionToRow
* @param string|null $otherPageTitle
* @param array|null $options
* @throws \MWException
*/
public function testNewRevisionsFromBatch_archive(
$queryOptions,
$otherPageTitle = null,
array $options = []
) {
$title1 = Title::newFromText( __METHOD__ );
$text1 = __METHOD__ . '-bä';
$page1 = WikiPage::factory( $title1 );
$title2 = $otherPageTitle ? Title::newFromText( $otherPageTitle ) : $title1;
$text2 = __METHOD__ . '-bö';
$page2 = $otherPageTitle ? WikiPage::factory( $title2 ) : $page1;
/** @var Revision $rev1 */
/** @var Revision $rev2 */
$rev1 = $page1->doEditContent( new WikitextContent( $text1 ), __METHOD__ )
->value['revision'];
$rev2 = $page2->doEditContent( new WikitextContent( $text2 ), __METHOD__ )
->value['revision'];
$page1->doDeleteArticleReal( __METHOD__, $this->getTestSysop()->getUser() );
if ( $page2 !== $page1 ) {
$page2->doDeleteArticleReal( __METHOD__, $this->getTestSysop()->getUser() );
}
$store = MediaWikiServices::getInstance()->getRevisionStore();
$queryInfo = $store->getArchiveQueryInfo();
$rows = $this->db->select(
$queryInfo['tables'],
$queryInfo['fields'],
[ 'ar_rev_id' => [ $rev1->getId(), $rev2->getId() ] ],
__METHOD__,
[],
$queryInfo['joins']
);
$options['archive'] = true;
$rows = iterator_to_array( $rows );
$result = $store->newRevisionsFromBatch(
$rows, $options, 0, $otherPageTitle ? null : $title1 );
$this->assertTrue( $result->isGood() );
$this->assertSame( [], $result->getErrors() );
/** @var RevisionRecord[] $records */
$records = $result->getValue();
$this->assertCount( 2, $records );
$this->assertRevisionRecordMatchesRevision( $rev1, $records[$rev1->getId()] );
$this->assertRevisionRecordMatchesRevision( $rev2, $records[$rev2->getId()] );
$this->assertSame( $text1,
ContentHandler::getContentText( $records[$rev1->getId()]->getContent( SlotRecord::MAIN ) ) );
$this->assertSame( $text2,
ContentHandler::getContentText( $records[$rev2->getId()]->getContent( SlotRecord::MAIN ) ) );
$this->assertEquals( $page1->getTitle()->getDBkey(),
$records[$rev1->getId()]->getPageAsLinkTarget()->getDBkey() );
$this->assertEquals( $page2->getTitle()->getDBkey(),
$records[$rev2->getId()]->getPageAsLinkTarget()->getDBkey() );
}
/**
* @covers \MediaWiki\Revision\RevisionStore::newRevisionsFromBatch
*/