2020-03-30 20:20:27 +00:00
|
|
|
<?php
|
|
|
|
|
/**
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
|
|
|
*
|
|
|
|
|
* @file
|
|
|
|
|
* @ingroup Maintenance
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
use MediaWiki\Revision\RevisionArchiveRecord;
|
|
|
|
|
use MediaWiki\Revision\RevisionRecord;
|
|
|
|
|
use MediaWiki\Revision\RevisionStore;
|
|
|
|
|
use MediaWiki\Revision\RevisionStoreRecord;
|
|
|
|
|
use MediaWiki\Revision\SlotRecord;
|
2023-08-25 12:29:41 +00:00
|
|
|
use MediaWiki\Status\Status;
|
2020-03-30 20:20:27 +00:00
|
|
|
use MediaWiki\Storage\BlobStore;
|
|
|
|
|
|
2020-09-23 00:34:11 +00:00
|
|
|
require_once __DIR__ . '/Maintenance.php';
|
2020-03-30 20:20:27 +00:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Maintenance script for finding and marking bad content blobs.
|
|
|
|
|
*
|
|
|
|
|
* @ingroup Maintenance
|
|
|
|
|
*/
|
|
|
|
|
class FindBadBlobs extends Maintenance {
|
|
|
|
|
|
2024-01-23 16:09:20 +00:00
|
|
|
private RevisionStore $revisionStore;
|
|
|
|
|
private BlobStore $blobStore;
|
2020-03-30 20:20:27 +00:00
|
|
|
|
|
|
|
|
public function __construct() {
	parent::__construct();

	// Default number of revisions handled per batch.
	$this->setBatchSize( 1000 );

	$this->addDescription(
		'Find and mark bad content blobs. Marked blobs will be read as empty. '
		. 'Use --scan-from to find revisions with bad blobs, use --mark to mark them.'
	);

	// Option name => help text, in the order the options are registered.
	$optionHelp = [
		'scan-from' => 'Start scanning revisions at the given date. '
			. 'Format: Anything supported by MediaWiki, e.g. YYYYMMDDHHMMSS or YYYY-MM-DDTHH:MM:SS',
		'revisions' => 'A list of revision IDs to process, separated by comma or '
			. 'colon or whitespace. Revisions belonging to deleted pages will work. '
			. 'If set to "-" IDs are read from stdin, one per line.',
		'limit' => 'Maximum number of revisions for --scan-from to scan. '
			. 'Default: 1000',
		'mark' => 'Mark the blob as "known bad", to avoid errors when '
			. 'attempting to read it. The value given is the reason for marking the blob as bad, '
			. 'typically a ticket ID. Requires --revisions to also be set.',
	];

	// Every option is registered with the same trailing flags ( false, true );
	// presumably "not required" and "takes a value" - see Maintenance::addOption().
	foreach ( $optionHelp as $name => $help ) {
		$this->addOption( $name, $help, false, true );
	}
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return string
|
|
|
|
|
*/
|
|
|
|
|
private function getStartTimestamp() {
	// Reads --scan-from and converts it to TS_MW format; aborts on unusable input.
	$rawTimestamp = $this->getOption( 'scan-from' );

	// A full date plus time down to the second needs at least 14 characters
	// (YYYYMMDDHHMMSS), so anything shorter is rejected up front.
	$tooShort = strlen( $rawTimestamp ) < 14;
	if ( $tooShort ) {
		$this->fatalError(
			'Bad timestamp: ' . $rawTimestamp
			. ', please provide time and date down to the second.'
		);
	}

	$normalized = wfTimestamp( TS_MW, $rawTimestamp );
	if ( !$normalized ) {
		$this->fatalError( 'Bad timestamp: ' . $rawTimestamp );
	}

	return $normalized;
}
|
|
|
|
|
|
2020-03-31 12:29:23 +00:00
|
|
|
/**
|
|
|
|
|
* @return int[]
|
|
|
|
|
*/
|
|
|
|
|
private function getRevisionIds() {
	// Resolves --revisions into a list of IDs, reading stdin when the value is "-".
	$idList = $this->getOption( 'revisions' );

	if ( $idList !== '-' ) {
		return $this->parseIntList( $idList );
	}

	$piped = stream_get_contents( STDIN );

	// Nothing usable on stdin (empty or read failure) means there is nothing to process.
	return $piped ? $this->parseIntList( $piped ) : [];
}
|
|
|
|
|
|
2020-03-30 20:20:27 +00:00
|
|
|
/**
|
|
|
|
|
* @inheritDoc
|
|
|
|
|
*/
|
|
|
|
|
public function execute() {
	// Entry point. Runs in one of two modes:
	//  - --revisions: check (and, with --mark, mark) an explicit list of revision IDs;
	//  - --scan-from: scan revisions starting at a timestamp, report only.
	// Any other combination of options is rejected with a fatal error.
	$services = $this->getServiceContainer();
	$this->revisionStore = $services->getRevisionStore();
	$this->blobStore = $services->getBlobStore();
	$this->setDBProvider( $services->getConnectionProvider() );

	if ( $this->hasOption( 'revisions' ) ) {
		if ( $this->hasOption( 'scan-from' ) ) {
			$this->fatalError( 'Cannot use --revisions together with --scan-from' );
		}

		$ids = $this->getRevisionIds();

		$count = $this->scanRevisionsById( $ids );
	} elseif ( $this->hasOption( 'scan-from' ) ) {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'Cannot use --mark with --scan-from, '
				. 'use --revisions to specify revisions to mark.' );
		}

		$fromTimestamp = $this->getStartTimestamp();

		// The option value arrives as a command-line string. Validate it and convert
		// it to an integer here, so the scan loop never does arithmetic on a
		// non-numeric string.
		$limit = $this->getOption( 'limit', 1000 );
		if ( !is_numeric( $limit ) ) {
			$this->fatalError( 'Bad limit: ' . $limit . ', please provide an integer.' );
		}
		$total = (int)$limit;

		$count = $this->scanRevisionsByTimestamp( $fromTimestamp, $total );

		$this->output( "The range of archive rows scanned is based on the range of revision IDs "
			. "scanned in the revision table.\n" );
	} else {
		if ( $this->hasOption( 'mark' ) ) {
			$this->fatalError( 'The --mark must be used together with --revisions' );
		} else {
			$this->fatalError( 'Must specify one of --revisions or --scan-from' );
		}
	}

	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "Marked $count bad revisions.\n" );
	} else {
		$this->output( "Found $count bad revisions.\n" );

		if ( $count > 0 ) {
			// Hint on how to turn the scan report into input for a later --revisions run.
			$this->output( "On a unix/linux environment, you can use grep and cut to list of IDs\n" );
			$this->output( "that can then be used with the --revisions option. E.g.\n" );
			$this->output( " grep '! Found bad blob' | cut -s -f 3\n" );
		}
	}
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param string $fromTimestamp
|
|
|
|
|
* @param int $total
|
|
|
|
|
*
|
|
|
|
|
* @return int
|
|
|
|
|
*/
|
|
|
|
|
private function scanRevisionsByTimestamp( $fromTimestamp, $total ) {
	// Scans up to $total rows of the revision table starting at $fromTimestamp, then
	// scans the archive table over the matching range of revision IDs. Returns the
	// accumulated result of checkRevision() over everything scanned.
	$count = 0;
	// Highest and lowest revision ID seen; these bound the archive scan below.
	$lastRevId = 0;
	$firstRevId = 0;
	// Paging cursor: timestamp and ID of the LAST ROW of the previous batch.
	$lastTimestamp = $fromTimestamp;
	$cursorRevId = 0;
	$revisionRowsScanned = 0;
	$archiveRowsScanned = 0;

	$this->output( "Scanning revisions table, "
		. "$total rows starting at rev_timestamp $fromTimestamp\n" );

	while ( $revisionRowsScanned < $total ) {
		$batchSize = min( $total - $revisionRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadRevisionsByTimestamp( $cursorRevId, $lastTimestamp, $batchSize );
		if ( !$revisions ) {
			break;
		}

		foreach ( $revisions as $rev ) {
			// we are sorting by timestamp, so we may encounter revision IDs out of sequence
			$firstRevId = $firstRevId ? min( $firstRevId, $rev->getId() ) : $rev->getId();
			$lastRevId = max( $lastRevId, $rev->getId() );

			$count += $this->checkRevision( $rev );
		}

		// Continue after the last row of this batch. The cursor ID must be that row's
		// own ID, not the maximum ID seen so far: a larger ID from an earlier
		// timestamp would otherwise skip rows that share this final timestamp.
		$lastTimestamp = $rev->getTimestamp();
		$cursorRevId = (int)$rev->getId();

		$batchSize = count( $revisions );
		$revisionRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize revisions, "
			. "up to revision $lastRevId ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	// NOTE: the archive table isn't indexed by timestamp, so the best we can do is use the
	// revision ID just before the first revision ID we found above as the starting point
	// of the scan, and scan up to one revision after the last revision ID we found above.
	// If $firstRevId is 0, the loop body above didn't execute,
	// so we should skip the one below as well.
	$fromArchived = $this->getNextRevision( $firstRevId, '<', 'DESC' );
	$maxArchived = $this->getNextRevision( $lastRevId, '>', 'ASC' );
	// No later revision exists: leave the upper bound open.
	$maxArchived = $maxArchived ?: PHP_INT_MAX;

	$this->output( "Scanning archive table by ar_rev_id, $fromArchived to $maxArchived\n" );
	// The row budget is checked explicitly so that no query is issued with a limit of 0.
	while ( $firstRevId > 0 && $fromArchived < $maxArchived && $archiveRowsScanned < $total ) {
		$batchSize = min( $total - $archiveRowsScanned, $this->getBatchSize() );
		$revisions = $this->loadArchiveByRevisionId( $fromArchived, $maxArchived, $batchSize );
		if ( !$revisions ) {
			break;
		}
		/** @var RevisionRecord $rev */
		foreach ( $revisions as $rev ) {
			$count += $this->checkRevision( $rev );
		}
		// Archive rows are loaded ordered by ar_rev_id, so the last one is the new lower bound.
		$fromArchived = $rev->getId();
		$batchSize = count( $revisions );
		$archiveRowsScanned += $batchSize;
		$this->output(
			"\t- Scanned a batch of $batchSize archived revisions, "
			. "up to revision $fromArchived ($lastTimestamp)\n"
		);

		$this->waitForReplication();
	}

	return $count;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param int $afterId
|
|
|
|
|
* @param string $fromTimestamp
|
|
|
|
|
* @param int $batchSize
|
|
|
|
|
*
|
|
|
|
|
* @return RevisionStoreRecord[]
|
|
|
|
|
*/
|
|
|
|
|
private function loadRevisionsByTimestamp( int $afterId, string $fromTimestamp, $batchSize ) {
	// Loads up to $batchSize revision records, with slots, from the revision table,
	// ordered by ( rev_timestamp, rev_id ).
	$dbr = $this->getReplicaDB();

	// Multi-column "greater than" condition over ( rev_timestamp, rev_id );
	// see IDatabase::buildComparison() for the exact semantics.
	$afterCursor = $dbr->buildComparison( '>', [
		'rev_timestamp' => $fromTimestamp,
		'rev_id' => $afterId,
	] );

	$rows = $this->revisionStore->newSelectQueryBuilder( $dbr )
		->joinComment()
		->where( $afterCursor )
		->useIndex( [ 'revision' => 'rev_timestamp' ] )
		->orderBy( [ 'rev_timestamp', 'rev_id' ] )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$status = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );
	$this->handleStatus( $status );

	// Drop empty entries from the batch result.
	$records = array_filter( $status->value );

	'@phan-var RevisionStoreRecord[] $records';
	return $records;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param int $afterId
|
|
|
|
|
* @param int $uptoId
|
|
|
|
|
* @param int $batchSize
|
|
|
|
|
*
|
|
|
|
|
* @return RevisionArchiveRecord[]
|
|
|
|
|
*/
|
|
|
|
|
private function loadArchiveByRevisionId( int $afterId, int $uptoId, $batchSize ) {
	// Loads up to $batchSize archived revision records, with slots, whose ar_rev_id
	// lies in the half-open range ( $afterId, $uptoId ], ordered by ar_rev_id.
	$dbr = $this->getReplicaDB();

	// Both bounds are int-typed parameters, so interpolating them is safe.
	$idRange = [ "ar_rev_id > $afterId", "ar_rev_id <= $uptoId" ];

	$archiveQuery = $this->revisionStore->newArchiveSelectQueryBuilder( $dbr );
	$rows = $archiveQuery
		->joinComment()
		->where( $idRange )
		->orderBy( 'ar_rev_id' )
		->limit( $batchSize )
		->caller( __METHOD__ )
		->fetchResultSet();

	$loadOptions = [ 'archive' => true, 'slots' => true ];
	$status = $this->revisionStore->newRevisionsFromBatch( $rows, $loadOptions );
	$this->handleStatus( $status );

	// Drop empty entries from the batch result.
	$records = array_filter( $status->value );

	'@phan-var RevisionArchiveRecord[] $records';
	return $records;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Returns the revision ID next to $revId, according to $comp and $dir
|
|
|
|
|
*
|
|
|
|
|
* @param int $revId
|
|
|
|
|
* @param string $comp the comparator, either '<' or '>', to go with $dir
|
|
|
|
|
* @param string $dir the sort direction to go with $comp, either 'ASC' or 'DESC'
|
|
|
|
|
*
|
|
|
|
|
* @return int
|
|
|
|
|
*/
|
|
|
|
|
private function getNextRevision( int $revId, string $comp, string $dir ) {
	// Looks up the rev_id adjacent to $revId in the revision table, in the direction
	// given by $comp / $dir. Both are spliced into the query as-is, so callers must
	// only pass the fixed values '<' / '>' and 'ASC' / 'DESC'.
	$condition = "rev_id $comp $revId";

	$neighbourId = $this->getReplicaDB()
		->newSelectQueryBuilder()
		->select( 'rev_id' )
		->from( 'revision' )
		->where( $condition )
		->orderBy( [ "rev_id" ], $dir )
		->caller( __METHOD__ )
		->fetchField();

	// The cast normalises the result to int; presumably a lookup that matches
	// no row ends up as 0 here (see fetchField()).
	return (int)$neighbourId;
}
|
|
|
|
|
|
|
|
|
|
/**
|
2020-03-31 12:29:23 +00:00
|
|
|
* @param array $ids
|
|
|
|
|
*
|
|
|
|
|
* @return int
|
|
|
|
|
*/
|
|
|
|
|
private function scanRevisionsById( array $ids ) {
	// Checks the given revision IDs in batches of getBatchSize() and returns the
	// accumulated result of checkRevision().
	$total = count( $ids );
	$this->output( "Scanning $total ids\n" );

	$count = 0;
	$chunks = array_chunk( $ids, $this->getBatchSize() );

	foreach ( $chunks as $chunk ) {
		$loaded = $this->loadRevisionsById( $chunk );

		// A chunk for which nothing could be loaded is skipped silently.
		if ( $loaded ) {
			/** @var RevisionRecord $revision */
			foreach ( $loaded as $revision ) {
				$count += $this->checkRevision( $revision );
			}

			$batchSize = count( $loaded );
			$this->output( "\t- Scanned a batch of $batchSize revisions\n" );
		}
	}

	return $count;
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param int[] $ids
|
|
|
|
|
*
|
|
|
|
|
* @return RevisionRecord[]
|
|
|
|
|
*/
|
|
|
|
|
private function loadRevisionsById( array $ids ) {
	// Loads revision records, with slots, for the given IDs. IDs that are not found in
	// the revision table are looked up in the archive table, so revisions of deleted
	// pages are covered as well.
	if ( !$ids ) {
		// Avoid building a query with an empty IN-list.
		return [];
	}

	$db = $this->getReplicaDB();
	$queryBuilder = $this->revisionStore->newSelectQueryBuilder( $db );

	$rows = $queryBuilder
		->joinComment()
		->where( [ 'rev_id' => $ids ] )
		->caller( __METHOD__ )->fetchResultSet();

	$result = $this->revisionStore->newRevisionsFromBatch( $rows, [ 'slots' => true ] );

	$this->handleStatus( $result );

	$revisions = array_filter( $result->value );
	'@phan-var RevisionRecord[] $revisions';

	// if not all revisions were found, check the archive table.
	// This is decided on the actual set of missing IDs rather than by comparing
	// counts: with duplicate input IDs the counts differ even when nothing is
	// missing, and the archive query would then get an empty IN-list.
	// NOTE(review): relies on the batch result being keyed by revision ID, as the
	// existing array_keys() usage already assumed.
	$missingIds = array_diff( $ids, array_keys( $revisions ) );
	if ( $missingIds ) {
		$rows = $this->revisionStore->newArchiveSelectQueryBuilder( $db )
			->joinComment()
			->where( [ 'ar_rev_id' => $missingIds ] )
			->caller( __METHOD__ )->fetchResultSet();

		$archiveResult = $this->revisionStore->newRevisionsFromBatch(
			$rows,
			[ 'slots' => true, 'archive' => true ]
		);

		$this->handleStatus( $archiveResult );

		// don't use array_merge, since it will re-index
		$revisions += array_filter( $archiveResult->value );
	}

	return $revisions;
}
|
|
|
|
|
|
|
|
|
|
/**
|
2020-03-30 20:20:27 +00:00
|
|
|
* @param RevisionRecord $rev
|
|
|
|
|
*
|
|
|
|
|
* @return int
|
|
|
|
|
*/
|
|
|
|
|
private function checkRevision( RevisionRecord $rev ) {
	// Runs checkSlot() on every slot of the revision and returns the summed result.
	$slotRecords = $rev->getSlots()->getSlots();

	$badSlots = 0;
	foreach ( $slotRecords as $slotRecord ) {
		$badSlots += $this->checkSlot( $rev, $slotRecord );
	}

	if ( $badSlots !== 0 ) {
		return $badSlots;
	}

	// In --mark mode, tell the operator that this revision had nothing to mark.
	if ( $this->hasOption( 'mark' ) ) {
		$this->output( "\t# No bad blob found on revision {$rev->getId()}, skipped!\n" );
	}

	return $badSlots;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tries to load the blob behind a single slot and reports the slot if loading fails.
 *
 * When the blob store throws an Exception, a line describing the revision, slot,
 * content ID, address and error is written to the output. If the --mark option is
 * set, the slot's address is additionally rewritten via markBlob().
 *
 * @param RevisionRecord $rev Revision the slot belongs to; used for reporting.
 * @param SlotRecord $slot Slot whose blob should be checked.
 *
 * @return int 0 if the blob could be loaded, 1 if loading it threw an Exception.
 */
private function checkSlot( RevisionRecord $rev, SlotRecord $slot ) {
	// Resolved outside the try block, so a failure here is not treated as a bad blob.
	$blobAddress = $slot->getAddress();

	try {
		// Only whether loading succeeds matters; the blob data itself is discarded.
		$this->blobStore->getBlob( $blobAddress );
	} catch ( Exception $failure ) {
		$error = $failure->getMessage();
		$type = get_class( $failure );

		// NOTE: output the revision ID again at the end in a separate column for easy processing
		// via the "cut" shell command.
		$this->output( "\t! Found bad blob on revision {$rev->getId()} "
			. "from {$rev->getTimestamp()} ({$slot->getRole()} slot): "
			. "content_id={$slot->getContentId()}, address=<{$slot->getAddress()}>, "
			. "error='$error', type='$type'. ID:\t{$rev->getId()}\n" );

		if ( $this->hasOption( 'mark' ) ) {
			$newAddress = $this->markBlob( $slot, $error );
			$this->output( "\tChanged address to <$newAddress>\n" );
		}

		return 1;
	}

	// The blob loaded without an exception: nothing to do.
	return 0;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Rewrites the content address of the given slot to a "bad:" address.
 *
 * The new address is "bad:" followed by the URL-encoded old address ("empty" is used
 * when the slot has no address). If available, the value of the --mark option and the
 * given error message are appended after a "?" as the "reason" and "error" fields of
 * a query string. The result is cut to 255 bytes and written to the content_address
 * field of the row in the content table that matches the slot's content ID.
 *
 * @param SlotRecord $slot Slot whose content row should be updated.
 * @param string|null $error Message of the error hit while loading the blob, if any.
 *
 * @return false|string The address that was written. The value built here always
 *  starts with "bad:"; NOTE(review): "false" is presumably only listed because of
 *  how substr() is typed on older PHP versions - confirm before narrowing.
 */
private function markBlob( SlotRecord $slot, ?string $error = null ) {
	$args = [];

	if ( $this->hasOption( 'mark' ) ) {
		$args['reason'] = $this->getOption( 'mark' );
	}

	if ( $error ) {
		$args['error'] = $error;
	}

	$address = $slot->getAddress() ?: 'empty';
	$badAddress = 'bad:' . urlencode( $address );

	if ( $args ) {
		$badAddress .= '?' . wfArrayToCgi( $args );
	}

	// NOTE(review): 255 is presumably the size limit of content_address - confirm
	// against the schema. The cut is byte-wise and may end in the middle of a
	// percent-escape of a long error message.
	$badAddress = substr( $badAddress, 0, 255 );

	$dbw = $this->getPrimaryDB();
	$dbw->newUpdateQueryBuilder()
		->update( 'content' )
		->set( [ 'content_address' => $badAddress ] )
		->where( [ 'content_id' => $slot->getContentId() ] )
		->caller( __METHOD__ )->execute();

	return $badAddress;
}
|
|
|
|
|
|
2020-04-28 19:28:59 +00:00
|
|
|
private function handleStatus( StatusValue $status ) {
	// Renders the status as English text. Invoked lazily, so nothing is rendered
	// unless one of the checks below actually needs the message.
	$renderMessage = static fn () => Status::wrap( $status )->getMessage( false, false, 'en' )->text();

	// A status that is not OK is reported through fatalError().
	if ( !$status->isOK() ) {
		$this->fatalError( $renderMessage() );
	}

	// A status that is not good is reported through error(), prefixed with "\t! ".
	if ( !$status->isGood() ) {
		$this->error( "\t! " . $renderMessage() );
	}
}
|
|
|
|
|
|
2020-03-30 20:20:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Name the class implementing this script. NOTE(review): presumably read by the file
// behind RUN_MAINTENANCE_IF_MAIN when this script is executed directly - confirm.
$maintClass = FindBadBlobs::class;
|
|
|
|
|
require_once RUN_MAINTENANCE_IF_MAIN;
|