This does not include use of MediaWiki\Maintenance\Maintenance, assuming the maintenance scripts going into the same namespace Change-Id: I488f95b537ce86eb5e463be7bce3653610dd13d9
204 lines
6.1 KiB
PHP
204 lines
6.1 KiB
PHP
<?php
|
|
use MediaWiki\Page\PageLookup;
|
|
use MediaWiki\Page\PageRecord;
|
|
use MediaWiki\Page\ParserOutputAccess;
|
|
use MediaWiki\Parser\ParserOptions;
|
|
use MediaWiki\Parser\Parsoid\Config\SiteConfig as ParsoidSiteConfig;
|
|
use MediaWiki\Revision\RevisionLookup;
|
|
use MediaWiki\Revision\RevisionRecord;
|
|
use MediaWiki\Revision\SlotRecord;
|
|
use MediaWiki\Status\Status;
|
|
use Wikimedia\Parsoid\Core\ClientError;
|
|
use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
|
|
use Wikimedia\Rdbms\SelectQueryBuilder;
|
|
|
|
// @codeCoverageIgnoreStart
|
|
require_once __DIR__ . '/Maintenance.php';
|
|
// @codeCoverageIgnoreEnd
|
|
|
|
/**
|
|
* Maintenance script for populating parser cache with parsoid output.
|
|
*
|
|
* @since 1.41
|
|
*
|
|
* @license GPL-2.0-or-later
|
|
* @author Richika Rana
|
|
*/
|
|
class PrewarmParsoidParserCache extends Maintenance {
|
|
private int $forceParse = 0;
|
|
private ParserOutputAccess $parserOutputAccess;
|
|
private PageLookup $pageLookup;
|
|
private RevisionLookup $revisionLookup;
|
|
private ParsoidSiteConfig $parsoidSiteConfig;
|
|
|
|
public function __construct() {
|
|
parent::__construct();
|
|
|
|
$this->addDescription(
|
|
'Populate parser cache with parsoid output. By default, script attempt to run' .
|
|
'for supported content model pages (in a specified batch if provided)'
|
|
);
|
|
$this->addOption(
|
|
'force',
|
|
'Re-parse pages even if the cached entry seems up to date',
|
|
false,
|
|
false
|
|
);
|
|
$this->addOption( 'start-from', 'Start from this page ID', false, true );
|
|
$this->addOption( 'namespace', 'Filter pages in this namespace', false, true );
|
|
$this->setBatchSize( 100 );
|
|
}
|
|
|
|
private function getPageLookup(): PageLookup {
|
|
$this->pageLookup = $this->getServiceContainer()->getPageStore();
|
|
return $this->pageLookup;
|
|
}
|
|
|
|
private function getRevisionLookup(): RevisionLookup {
|
|
$this->revisionLookup = $this->getServiceContainer()->getRevisionLookup();
|
|
return $this->revisionLookup;
|
|
}
|
|
|
|
private function getParserOutputAccess(): ParserOutputAccess {
|
|
$this->parserOutputAccess = $this->getServiceContainer()->getParserOutputAccess();
|
|
return $this->parserOutputAccess;
|
|
}
|
|
|
|
private function getParsoidSiteConfig(): ParsoidSiteConfig {
|
|
$this->parsoidSiteConfig = $this->getServiceContainer()->getParsoidSiteConfig();
|
|
return $this->parsoidSiteConfig;
|
|
}
|
|
|
|
private function getQueryBuilder(): SelectQueryBuilder {
|
|
$dbr = $this->getReplicaDB();
|
|
|
|
return $dbr->newSelectQueryBuilder()
|
|
->select( [ 'page_id' ] )
|
|
->from( 'page' )
|
|
->caller( __METHOD__ )
|
|
->orderBy( 'page_id', SelectQueryBuilder::SORT_ASC );
|
|
}
|
|
|
|
private function parse(
|
|
PageRecord $page,
|
|
RevisionRecord $revision
|
|
): Status {
|
|
$popts = ParserOptions::newFromAnon();
|
|
$popts->setUseParsoid();
|
|
try {
|
|
return $this->getParserOutputAccess()->getParserOutput(
|
|
$page,
|
|
$popts,
|
|
$revision,
|
|
$this->forceParse
|
|
);
|
|
} catch ( ClientError $e ) {
|
|
return Status::newFatal( 'parsoid-client-error', $e->getMessage() );
|
|
} catch ( ResourceLimitExceededException $e ) {
|
|
return Status::newFatal( 'parsoid-resource-limit-exceeded', $e->getMessage() );
|
|
}
|
|
}
|
|
|
|
/*
|
|
* NamespaceInfo::getCanonicalIndex() requires the namespace to be in lowercase,
|
|
* so let's do some normalization and return its canonical index.
|
|
*
|
|
* @param string $namespace The namespace string from the command line
|
|
* @return int The canonical index of the namespace
|
|
*/
|
|
private function normalizeNamespace( string $namespace ): int {
|
|
return $this->getServiceContainer()->getNamespaceInfo()
|
|
->getCanonicalIndex( strtolower( $namespace ) );
|
|
}
|
|
|
|
/**
|
|
* Populate parser cache with parsoid output.
|
|
*
|
|
* @return bool
|
|
*/
|
|
public function execute() {
|
|
$force = $this->getOption( 'force' );
|
|
$startFrom = $this->getOption( 'start-from' );
|
|
|
|
// We need the namespace index instead of the name to perform the query
|
|
// on, because that's what the page table stores (in the page_namespace field).
|
|
$namespaceIndex = null;
|
|
$namespace = $this->getOption( 'namespace' );
|
|
if ( $namespace !== null ) {
|
|
$namespaceIndex = $this->normalizeNamespace( $namespace );
|
|
}
|
|
|
|
if ( $force !== null ) {
|
|
// If --force is supplied, for a parse for supported pages or supported
|
|
// pages in the specified batch.
|
|
$this->forceParse = ParserOutputAccess::OPT_FORCE_PARSE;
|
|
}
|
|
|
|
$startFrom = (int)$startFrom;
|
|
|
|
$this->output( "\nWarming parsoid parser cache with Parsoid output...\n\n" );
|
|
while ( true ) {
|
|
$query = $this->getQueryBuilder();
|
|
if ( $namespaceIndex !== null ) {
|
|
$query = $query->where( [ 'page_namespace' => $namespaceIndex ] );
|
|
}
|
|
$query = $query->where( $this->getReplicaDB()->expr( 'page_id', '>=', $startFrom ) )
|
|
->limit( $this->getBatchSize() );
|
|
|
|
$result = $query->fetchResultSet();
|
|
|
|
if ( !$result->numRows() ) {
|
|
break;
|
|
}
|
|
|
|
$currentBatch = $startFrom + ( $this->getBatchSize() - 1 );
|
|
$this->output( "\n\nBatch: $startFrom - $currentBatch\n----\n" );
|
|
|
|
// Look through pages by pageId and populate the parserCache
|
|
foreach ( $result as $row ) {
|
|
$page = $this->getPageLookup()->getPageById( $row->page_id );
|
|
$startFrom = ( (int)$row->page_id + 1 );
|
|
|
|
if ( $page === null ) {
|
|
$this->output( "\n[Skipped] Page ID: $row->page_id not found.\n" );
|
|
continue;
|
|
}
|
|
|
|
$latestRevision = $page->getLatest();
|
|
$revision = $this->getRevisionLookup()->getRevisionById( $latestRevision );
|
|
$mainSlot = $revision->getSlot( SlotRecord::MAIN );
|
|
|
|
// POA will write a dummy output to PC, but we don't want that here. Just skip!
|
|
if ( !$this->getParsoidSiteConfig()->supportsContentModel( $mainSlot->getModel() ) ) {
|
|
$this->output(
|
|
'[Skipped] Content model "' .
|
|
$mainSlot->getModel() .
|
|
"\" not supported for page ID: $row->page_id.\n"
|
|
);
|
|
continue;
|
|
}
|
|
|
|
$status = $this->parse( $page, $revision );
|
|
if ( !$status->isOK() ) {
|
|
$this->output(
|
|
__METHOD__ .
|
|
": Error parsing page ID: $row->page_id or writing to parser cache\n"
|
|
);
|
|
continue;
|
|
}
|
|
|
|
$this->output( "[Done] Page ID: $row->page_id ✔️\n" );
|
|
}
|
|
$this->waitForReplication();
|
|
}
|
|
|
|
$this->output( "\nDone pre-warming parsoid parser cache...\n" );
|
|
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// @codeCoverageIgnoreStart
|
|
$maintClass = PrewarmParsoidParserCache::class;
|
|
require_once RUN_MAINTENANCE_IF_MAIN;
|
|
// @codeCoverageIgnoreEnd
|