wiki.techinc.nl/maintenance/populateContentTables.php
daniel 135673b98e NameTableStore: ensure consistency upon rollback.
This ensures consistent behavior when an ID for a name is first acquired
within a transaction that is rolled back. The documentation for acquireId
now reads:

@note If called within a transaction, there is a chance for the acquired ID to be lost
if the transaction is rolled back. A best effort is made to re-insert the mapping
after a rollback, and consistency of the cache with the database table is ensured
by re-loading the map after a failed transaction. However, there is no guarantee
that an ID returned by this method is valid outside the transaction in which it
was produced. This means that calling code should not retain the return value beyond
the scope of a transaction, but rather call acquireId() again after the transaction
is complete. In some rare cases, this may produce an ID different from the first call.

Bug: T224949
Change-Id: I6d05e4112a649675bfb9083cab2d1bbe394e65b3
2019-10-10 12:03:29 +02:00

402 lines
12 KiB
PHP

<?php
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Storage\BlobStore;
use MediaWiki\Storage\NameTableStore;
use MediaWiki\Storage\SqlBlobStore;
use Wikimedia\Assert\Assert;
use Wikimedia\Rdbms\IDatabase;
use Wikimedia\Rdbms\IResultWrapper;
require_once __DIR__ . '/Maintenance.php';
/**
* Populate the content and slot tables.
* @since 1.32
*/
class PopulateContentTables extends Maintenance {
/** @var IDatabase */
private $dbw;
/** @var NameTableStore */
private $contentModelStore;
/** @var NameTableStore */
private $slotRoleStore;
/** @var BlobStore */
private $blobStore;
/** @var int */
private $mainRoleId;
/** @var array|null Map "{$modelId}:{$address}" to content_id */
private $contentRowMap = null;
private $count = 0, $totalCount = 0;
public function __construct() {
parent::__construct();
$this->addDescription( 'Populate content and slot tables' );
$this->addOption( 'table', 'revision or archive table, or `all` to populate both', false,
true );
$this->addOption( 'reuse-content',
'Reuse content table rows when the address and model are the same. '
. 'This will increase the script\'s time and memory usage, perhaps significantly.',
false, false );
$this->addOption( 'start-revision', 'The rev_id to start at', false, true );
$this->addOption( 'start-archive', 'The ar_rev_id to start at', false, true );
$this->setBatchSize( 500 );
}
private function initServices() {
$this->dbw = $this->getDB( DB_MASTER );
$this->contentModelStore = MediaWikiServices::getInstance()->getContentModelStore();
$this->slotRoleStore = MediaWikiServices::getInstance()->getSlotRoleStore();
$this->blobStore = MediaWikiServices::getInstance()->getBlobStore();
// Don't trust the cache for the NameTableStores, in case something went
// wrong during a previous run (see T224949#5325895).
$this->contentModelStore->reloadMap();
$this->slotRoleStore->reloadMap();
$this->mainRoleId = $this->slotRoleStore->acquireId( SlotRecord::MAIN );
}
public function execute() {
$multiContentRevisionSchemaMigrationStage =
$this->getConfig()->get( 'MultiContentRevisionSchemaMigrationStage' );
$t0 = microtime( true );
if ( ( $multiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_NEW ) === 0 ) {
$this->writeln(
'...cannot update while \$wgMultiContentRevisionSchemaMigrationStage '
. 'does not have the SCHEMA_COMPAT_WRITE_NEW bit set.'
);
return false;
}
$this->initServices();
if ( $this->getOption( 'reuse-content', false ) ) {
$this->loadContentMap();
}
foreach ( $this->getTables() as $table ) {
$this->populateTable( $table );
}
$elapsed = microtime( true ) - $t0;
$this->writeln( "Done. Processed $this->totalCount rows in $elapsed seconds" );
return true;
}
/**
* @return string[]
*/
private function getTables() {
$table = $this->getOption( 'table', 'all' );
$validTableOptions = [ 'all', 'revision', 'archive' ];
if ( !in_array( $table, $validTableOptions ) ) {
$this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
}
if ( $table === 'all' ) {
$tables = [ 'revision', 'archive' ];
} else {
$tables = [ $table ];
}
return $tables;
}
private function loadContentMap() {
$t0 = microtime( true );
$this->writeln( "Loading existing content table rows..." );
$this->contentRowMap = [];
$dbr = $this->getDB( DB_REPLICA );
$from = false;
while ( true ) {
$res = $dbr->select(
'content',
[ 'content_id', 'content_address', 'content_model' ],
$from ? "content_id > $from" : '',
__METHOD__,
[ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
);
if ( !$res || !$res->numRows() ) {
break;
}
foreach ( $res as $row ) {
$from = $row->content_id;
$this->contentRowMap["{$row->content_model}:{$row->content_address}"] = $row->content_id;
}
}
$elapsed = microtime( true ) - $t0;
$this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" );
}
/**
* @param string $table
*/
private function populateTable( $table ) {
$t0 = microtime( true );
$this->count = 0;
$this->writeln( "Populating $table..." );
if ( $table === 'revision' ) {
$idField = 'rev_id';
$tables = [ 'revision', 'slots', 'page' ];
$fields = [
'rev_id',
'len' => 'rev_len',
'sha1' => 'rev_sha1',
'text_id' => 'rev_text_id',
'content_model' => 'rev_content_model',
'namespace' => 'page_namespace',
'title' => 'page_title',
];
$joins = [
'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
];
$startOption = 'start-revision';
} else {
$idField = 'ar_rev_id';
$tables = [ 'archive', 'slots' ];
$fields = [
'rev_id' => 'ar_rev_id',
'len' => 'ar_len',
'sha1' => 'ar_sha1',
'text_id' => 'ar_text_id',
'content_model' => 'ar_content_model',
'namespace' => 'ar_namespace',
'title' => 'ar_title',
];
$joins = [
'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
];
$startOption = 'start-archive';
}
if ( !$this->dbw->fieldExists( $table, $fields['text_id'], __METHOD__ ) ) {
$this->writeln( "No need to populate, $table.{$fields['text_id']} field does not exist" );
return;
}
$minmax = $this->dbw->selectRow(
$table,
[ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
'',
__METHOD__
);
if ( $this->hasOption( $startOption ) ) {
$minmax->min = (int)$this->getOption( $startOption );
}
if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
// No rows?
$minmax = (object)[ 'min' => 1, 'max' => 0 ];
}
$batchSize = $this->getBatchSize();
for ( $startId = $minmax->min; $startId <= $minmax->max; $startId += $batchSize ) {
$endId = min( $startId + $batchSize - 1, $minmax->max );
$rows = $this->dbw->select(
$tables,
$fields,
[
"$idField >= $startId",
"$idField <= $endId",
'slot_revision_id IS NULL',
],
__METHOD__,
[ 'ORDER BY' => 'rev_id' ],
$joins
);
if ( $rows->numRows() !== 0 ) {
$this->populateContentTablesForRowBatch( $rows, $startId, $table );
}
$elapsed = microtime( true ) - $t0;
$this->writeln(
"... $table processed up to revision id $endId of {$minmax->max}"
. " ($this->count rows in $elapsed seconds)"
);
}
$elapsed = microtime( true ) - $t0;
$this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
}
/**
* @param IResultWrapper $rows
* @param int $startId
* @param string $table
* @return int|null
*/
private function populateContentTablesForRowBatch( IResultWrapper $rows, $startId, $table ) {
$this->beginTransaction( $this->dbw, __METHOD__ );
if ( $this->contentRowMap === null ) {
$map = [];
} else {
$map = &$this->contentRowMap;
}
$contentKeys = [];
try {
// Step 1: Figure out content rows needing insertion.
$contentRows = [];
foreach ( $rows as $row ) {
$revisionId = $row->rev_id;
Assert::invariant( $revisionId !== null, 'rev_id must not be null' );
$model = $this->getContentModel( $row );
$modelId = $this->contentModelStore->acquireId( $model );
$address = SqlBlobStore::makeAddressFromTextId( $row->text_id );
$key = "{$modelId}:{$address}";
$contentKeys[$revisionId] = $key;
if ( !isset( $map[$key] ) ) {
$this->fillMissingFields( $row, $model, $address );
$map[$key] = false;
$contentRows[] = [
'content_size' => (int)$row->len,
'content_sha1' => $row->sha1,
'content_model' => $modelId,
'content_address' => $address,
];
}
}
// Step 2: Insert them, then read them back in for use in the next step.
if ( $contentRows ) {
$id = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
$this->dbw->insert( 'content', $contentRows, __METHOD__ );
$res = $this->dbw->select(
'content',
[ 'content_id', 'content_model', 'content_address' ],
'content_id > ' . (int)$id,
__METHOD__
);
foreach ( $res as $row ) {
$key = $row->content_model . ':' . $row->content_address;
$map[$key] = $row->content_id;
}
}
// Step 3: Insert the slot rows.
$slotRows = [];
foreach ( $rows as $row ) {
$revisionId = $row->rev_id;
$contentId = $map[$contentKeys[$revisionId]] ?? false;
if ( $contentId === false ) {
throw new \RuntimeException( "Content row for $revisionId not found after content insert" );
}
$slotRows[] = [
'slot_revision_id' => $revisionId,
'slot_role_id' => $this->mainRoleId,
'slot_content_id' => $contentId,
// There's no way to really know the previous revision, so assume no inheriting.
// rev_parent_id can get changed on undeletions, and deletions can screw up
// rev_timestamp ordering.
'slot_origin' => $revisionId,
];
}
$this->dbw->insert( 'slots', $slotRows, __METHOD__ );
$this->count += count( $slotRows );
$this->totalCount += count( $slotRows );
} catch ( \Exception $e ) {
$this->rollbackTransaction( $this->dbw, __METHOD__ );
$this->fatalError( "Failed to populate content table $table row batch starting at $startId "
. "due to exception: " . $e->__toString() );
}
$this->commitTransaction( $this->dbw, __METHOD__ );
}
/**
* @param \stdClass $row
* @return string
*/
private function getContentModel( $row ) {
if ( isset( $row->content_model ) ) {
return $row->content_model;
}
$title = Title::makeTitle( $row->namespace, $row->title );
return ContentHandler::getDefaultModelFor( $title );
}
/**
* @param string $msg
*/
private function writeln( $msg ) {
$this->output( "$msg\n" );
}
/**
* Compute any missing fields in $row.
* The way the missing values are computed must correspond to the way this is done in SlotRecord.
*
* @param object $row to be modified
* @param string $model
* @param string $address
*/
private function fillMissingFields( $row, $model, $address ) {
if ( !isset( $row->content_model ) ) {
// just for completeness
$row->content_model = $model;
}
if ( isset( $row->len ) && isset( $row->sha1 ) && $row->sha1 !== '' ) {
// No need to load the content, quite now.
return;
}
$blob = $this->blobStore->getBlob( $address );
if ( !isset( $row->len ) ) {
// NOTE: The nominal size of the content may not be the length of the raw blob.
$handler = ContentHandler::getForModelID( $model );
$content = $handler->unserializeContent( $blob );
$row->len = $content->getSize();
}
if ( !isset( $row->sha1 ) || $row->sha1 === '' ) {
$row->sha1 = SlotRecord::base36Sha1( $blob );
}
}
}
$maintClass = 'PopulateContentTables';
require_once RUN_MAINTENANCE_IF_MAIN;