wiki.techinc.nl/maintenance/renderDump.php

158 lines
5.2 KiB
PHP
Raw Normal View History

<?php
/**
* Take page text out of an XML dump file and render basic HTML out to files.
* This is *NOT* suitable for publishing or offline use; it's intended for
* running comparative tests of parsing behavior using real-world data.
*
* Templates etc are pulled from the local wiki database, not from the dump.
*
* Copyright (C) 2006 Brooke Vibber <bvibber@wikimedia.org>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
use MediaWiki\Parser\ParserOptions;
use MediaWiki\Permissions\UltimateAuthority;
Add ParserOutput::{get,set}RenderId() and set render id in ContentRenderer Set the render ID for each parse stored into cache so that we are able to identify a specific parse when there are dependencies (for example in an edit based on that parse). This is recorded as a property added to the ParserOutput, not the parent CacheTime interface. Even though the render ID is /related/ to the CacheTime interface, CacheTime is also used directly as a parser cache key, and the UUID should not be part of the lookup key. In general we are trying to move the location where these cache properties are set as early as possible, so we check at each location to ensure we don't overwrite a previously-set value. Eventually we can convert most of these checks into assertions that the cache properties have already been set (T350538). The primary location for setting cache properties is the ContentRenderer. Moved setting the revision timestamp into ContentRenderer as well, as it was set along the same code paths. An extra parameter was added to ContentRenderer::getParserOutput() to support this. Added merge code to ParserOutput::mergeInternalMetaDataFrom() which should ensure that cache time, revision, timestamp, and render id are all set properly when multiple slots are combined together in MCR. In order to ensure the render ID is set on all codepaths we needed to plumb the GlobalIdGenerator service into ContentRenderer, ParserCache, ParserCacheFactory, and RevisionOutputCache. Eventually (T350538) it should only be necessary in the ContentRenderer. Bug: T350538 Bug: T349868 Followup-To: Ic9b7cc0fcf365e772b7d080d76a065e3fd585f80 Change-Id: I72c5e6f86b7f081ab5ce7a56f5365d2f75067a78
2023-09-14 16:11:20 +00:00
use MediaWiki\Revision\MutableRevisionRecord;
use MediaWiki\User\User;
// @codeCoverageIgnoreStart
require_once __DIR__ . '/Maintenance.php';
// @codeCoverageIgnoreEnd
/**
 * Maintenance script that takes page text out of an XML dump file
 * and renders basic HTML out to files.
 *
 * The dump is read from standard input. For every revision found, one HTML
 * file is written into the directory given by --output-dir, and a
 * "filename<TAB>title" line is printed through the normal output channel.
 *
 * @ingroup Maintenance
 */
class DumpRenderer extends Maintenance {
	/** @var int Number of revisions handled so far; also used to number the output files */
	private $count = 0;
	/** Directory the rendered HTML files are written to (the --output-dir option) */
	private string $outputDirectory;
	/** Value of microtime( true ) taken when execute() started, used for the final statistics */
	private float $startTime;
	/** @var string Filename prefix for the rendered files (the --prefix option, 'wiki' by default) */
	private $prefix;

	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Take page text out of an XML dump file and render basic HTML out to files' );
		$this->addOption( 'output-dir', 'The directory to output the HTML files to', true, true );
		$this->addOption( 'prefix', 'Prefix for the rendered files (defaults to wiki)', false, true );
		$this->addOption( 'parser', 'Use an alternative parser class', false, true );
	}

	/**
	 * Read an XML dump from stdin, render every revision through
	 * handleRevision(), then report how many pages were rendered and how
	 * fast via the error channel.
	 */
	public function execute() {
		$this->outputDirectory = $this->getOption( 'output-dir' );
		$this->prefix = $this->getOption( 'prefix', 'wiki' );
		$this->startTime = microtime( true );

		if ( $this->hasOption( 'parser' ) ) {
			$this->prefix .= '-' . $this->getOption( 'parser' );
			// T236809: We'll need to provide an alternate ParserFactory
			// service to make this work.
			$this->fatalError( 'Parser class configuration temporarily disabled.' );
		}

		// Fail fast on an unusable target instead of importing the whole dump
		// and failing the write of every single page.
		if ( !is_dir( $this->outputDirectory ) ) {
			$this->fatalError(
				"Output directory '{$this->outputDirectory}' does not exist or is not a directory." );
		}

		$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );

		$source = new ImportStreamSource( $this->getStdin() );
		$importer = $this->getServiceContainer()
			->getWikiImporterFactory()
			->getWikiImporter( $source, new UltimateAuthority( $user ) );

		// Render each revision as it is read, rather than importing it.
		$importer->setRevisionCallback(
			[ $this, 'handleRevision' ] );
		$importer->setNoticeCallback( static function ( $msg, $params ) {
			echo wfMessage( $msg, $params )->text() . "\n";
		} );

		$importer->doImport();

		$delta = microtime( true ) - $this->startTime;
		$this->error( "Rendered {$this->count} pages in " . round( $delta, 2 ) . " seconds " );
		// Guard against division by zero when the run took no measurable time.
		if ( $delta > 0 ) {
			$this->error( round( $this->count / $delta, 2 ) . " pages/sec" );
		}
		$this->error( "\n" );
	}

	/**
	 * Callback function for each revision, turn into HTML and save
	 *
	 * Revisions without a title are reported and skipped. A failed file
	 * write is reported through the error channel and processing continues
	 * with the next revision.
	 *
	 * @param WikiRevision $rev
	 */
	public function handleRevision( WikiRevision $rev ) {
		$title = $rev->getTitle();
		if ( !$title ) {
			$this->error( "Got bogus revision with null title!" );
			return;
		}
		$display = $title->getPrefixedText();

		$this->count++;

		// URL-encode the title so it is safe to embed in a filename.
		$sanitized = rawurlencode( $display );
		$filename = sprintf( "%s/%s-%07d-%s.html",
			$this->outputDirectory,
			$this->prefix,
			$this->count,
			$sanitized );
		$this->output( sprintf( "%s\t%s\n", $filename, $display ) );

		$user = new User();
		$options = ParserOptions::newFromUser( $user );

		$content = $rev->getContent();
		$contentRenderer = $this->getServiceContainer()->getContentRenderer();
		// ContentRenderer expects a RevisionRecord, and all we have is a
		// WikiRevision from the dump. Make a fake MutableRevisionRecord to
		// satisfy it -- the only thing ::getParserOutput actually needs is
		// the revision ID and revision timestamp.
		$mutableRev = new MutableRevisionRecord( $title );
		$mutableRev->setId( $rev->getID() );
		$mutableRev->setTimestamp( $rev->getTimestamp() );
		$output = $contentRenderer->getParserOutput(
			$content, $title, $mutableRev, $options
		);

		$html = "<!DOCTYPE html>\n" .
			"<html lang=\"en\" dir=\"ltr\">\n" .
			"<head>\n" .
			"<meta charset=\"UTF-8\" />\n" .
			"<meta name=\"color-scheme\" content=\"light dark\">" .
			"<title>" . htmlspecialchars( $display, ENT_COMPAT ) . "</title>\n" .
			"</head>\n" .
			"<body>\n" .
			// TODO T371004 move runOutputPipeline out of $parserOutput
			$output->runOutputPipeline( $options, [] )->getContentHolderText() .
			"</body>\n" .
			"</html>";

		// file_put_contents() returns false on failure; surface that instead
		// of silently losing the page.
		if ( file_put_contents( $filename, $html ) === false ) {
			$this->error( "Failed to write rendered page to $filename" );
		}
	}
}
// Script entry point: name DumpRenderer as the maintenance class, then
// include the file named by the RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): presumably that include runs $maintClass only when this file
// is the script being executed directly -- confirm against Maintenance.php.
// @codeCoverageIgnoreStart
$maintClass = DumpRenderer::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd