2006-06-03 08:50:22 +00:00
|
|
|
<?php
|
|
|
|
|
/**
|
|
|
|
|
* Take page text out of an XML dump file and render basic HTML out to files.
|
|
|
|
|
* This is *NOT* suitable for publishing or offline use; it's intended for
|
2007-06-06 16:01:14 +00:00
|
|
|
* running comparative tests of parsing behavior using real-world data.
|
2006-06-03 08:50:22 +00:00
|
|
|
*
|
|
|
|
|
* Templates etc are pulled from the local wiki database, not from the dump.
|
|
|
|
|
*
|
2024-02-09 01:02:16 +00:00
|
|
|
* Copyright (C) 2006 Brooke Vibber <bvibber@wikimedia.org>
|
2014-03-20 15:45:01 +00:00
|
|
|
* https://www.mediawiki.org/
|
2006-06-03 08:50:22 +00:00
|
|
|
*
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
|
|
|
*
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
* @file
|
|
|
|
|
* @ingroup Maintenance
|
2006-06-03 08:50:22 +00:00
|
|
|
*/
|
2010-12-04 03:20:14 +00:00
|
|
|
|
2024-10-21 17:06:13 +00:00
|
|
|
use MediaWiki\Parser\ParserOptions;
|
2023-12-08 21:07:22 +00:00
|
|
|
use MediaWiki\Permissions\UltimateAuthority;
|
Add ParserOutput::{get,set}RenderId() and set render id in ContentRenderer
Set the render ID for each parse stored into cache so that we are able
to identify a specific parse when there are dependencies (for example
in an edit based on that parse). This is recorded as a property added
to the ParserOutput, not the parent CacheTime interface. Even though
the render ID is /related/ to the CacheTime interface, CacheTime is
also used directly as a parser cache key, and the UUID should not be
part of the lookup key.
In general we are trying to move the location where these cache
properties are set as early as possible, so we check at each location
to ensure we don't overwrite a previously-set value. Eventually we
can convert most of these checks into assertions that the cache
properties have already been set (T350538). The primary location for
setting cache properties is the ContentRenderer.
Moved setting the revision timestamp into ContentRenderer as well, as
it was set along the same code paths. An extra parameter was added to
ContentRenderer::getParserOutput() to support this.
Added merge code to ParserOutput::mergeInternalMetaDataFrom() which
should ensure that cache time, revision, timestamp, and render id are
all set properly when multiple slots are combined together in MCR.
In order to ensure the render ID is set on all codepaths we needed to
plumb the GlobalIdGenerator service into ContentRenderer, ParserCache,
ParserCacheFactory, and RevisionOutputCache. Eventually (T350538) it
should only be necessary in the ContentRenderer.
Bug: T350538
Bug: T349868
Followup-To: Ic9b7cc0fcf365e772b7d080d76a065e3fd585f80
Change-Id: I72c5e6f86b7f081ab5ce7a56f5365d2f75067a78
2023-09-14 16:11:20 +00:00
|
|
|
use MediaWiki\Revision\MutableRevisionRecord;
|
2023-09-19 12:13:45 +00:00
|
|
|
use MediaWiki\User\User;
|
|
|
|
|
|
2024-08-27 12:00:25 +00:00
|
|
|
// @codeCoverageIgnoreStart
|
2013-05-17 00:16:59 +00:00
|
|
|
require_once __DIR__ . '/Maintenance.php';
|
2024-08-27 12:00:25 +00:00
|
|
|
// @codeCoverageIgnoreEnd
|
2006-06-03 08:50:22 +00:00
|
|
|
|
2012-08-20 14:55:28 +00:00
|
|
|
/**
 * Maintenance script that takes page text out of an XML dump file
 * and renders basic HTML out to files.
 *
 * The dump is read from standard input. For every revision that carries a
 * title, the rendered HTML is written to its own file inside the directory
 * given by the required --output-dir option.
 *
 * @ingroup Maintenance
 */
class DumpRenderer extends Maintenance {

	/** Number of revisions handled so far; also used to number the output files. */
	private int $count = 0;

	/** Directory the HTML files are written to (value of --output-dir). */
	private string $outputDirectory;

	/** microtime( true ) value captured when execute() starts, used for the summary. */
	private float $startTime;

	/** File name prefix (value of --prefix, "wiki" when the option is absent). */
	private string $prefix;

	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Take page text out of an XML dump file and render basic HTML out to files' );
		$this->addOption( 'output-dir', 'The directory to output the HTML files to', true, true );
		$this->addOption( 'prefix', 'Prefix for the rendered files (defaults to wiki)', false, true );
		$this->addOption( 'parser', 'Use an alternative parser class', false, true );
	}

	/**
	 * Read the dump from standard input, render every revision through
	 * handleRevision(), then report the page count and throughput via error().
	 */
	public function execute() {
		$this->outputDirectory = $this->getOption( 'output-dir' );
		$this->prefix = $this->getOption( 'prefix', 'wiki' );
		$this->startTime = microtime( true );

		if ( $this->hasOption( 'parser' ) ) {
			$this->prefix .= '-' . $this->getOption( 'parser' );
			// T236809: We'll need to provide an alternate ParserFactory
			// service to make this work.
			$this->fatalError( 'Parser class configuration temporarily disabled.' );
		}

		$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );

		$source = new ImportStreamSource( $this->getStdin() );
		$importer = $this->getServiceContainer()
			->getWikiImporterFactory()
			->getWikiImporter( $source, new UltimateAuthority( $user ) );

		$importer->setRevisionCallback(
			[ $this, 'handleRevision' ] );
		$importer->setNoticeCallback( static function ( $msg, $params ) {
			echo wfMessage( $msg, $params )->text() . "\n";
		} );

		$importer->doImport();

		$delta = microtime( true ) - $this->startTime;
		$this->error( "Rendered {$this->count} pages in " . round( $delta, 2 ) . " seconds " );
		// Guard against a division by zero when the run took no measurable time.
		if ( $delta > 0 ) {
			$this->error( round( $this->count / $delta, 2 ) . " pages/sec" );
		}
		$this->error( "\n" );
	}

	/**
	 * Callback function for each revision, turn into HTML and save
	 *
	 * Revisions without a title are reported and skipped. A failed file write
	 * is reported through error() and does not stop the import.
	 *
	 * @param WikiRevision $rev
	 */
	public function handleRevision( WikiRevision $rev ) {
		$title = $rev->getTitle();
		if ( !$title ) {
			$this->error( "Got bogus revision with null title!" );

			return;
		}
		$display = $title->getPrefixedText();

		$this->count++;

		// The title is URL-encoded for use in the file name; the running
		// counter in front of it keeps the names distinct.
		$sanitized = rawurlencode( $display );
		$filename = sprintf( "%s/%s-%07d-%s.html",
			$this->outputDirectory,
			$this->prefix,
			$this->count,
			$sanitized );
		$this->output( sprintf( "%s\t%s\n", $filename, $display ) );

		$user = new User();
		$options = ParserOptions::newFromUser( $user );

		$content = $rev->getContent();
		$contentRenderer = $this->getServiceContainer()->getContentRenderer();
		// ContentRenderer expects a RevisionRecord, and all we have is a
		// WikiRevision from the dump. Make a fake MutableRevisionRecord to
		// satisfy it -- the only thing ::getParserOutput actually needs is
		// the revision ID and revision timestamp.
		$mutableRev = new MutableRevisionRecord( $title );
		$mutableRev->setId( $rev->getID() );
		$mutableRev->setTimestamp( $rev->getTimestamp() );
		$output = $contentRenderer->getParserOutput(
			$content, $title, $mutableRev, $options
		);

		// TODO T371004 move runOutputPipeline out of $parserOutput
		$bodyHtml = $output->runOutputPipeline( $options, [] )->getContentHolderText();

		if ( file_put_contents( $filename, $this->buildHtmlDocument( $display, $bodyHtml ) ) === false ) {
			// Name the page that could not be written and carry on with the
			// remaining revisions.
			$this->error( "Failed to write rendered HTML for \"$display\" to $filename" );
		}
	}

	/**
	 * Wrap rendered body HTML in the minimal HTML document written to disk.
	 *
	 * @param string $display Page title as plain text; escaped here for the <title> element
	 * @param string $bodyHtml Already rendered HTML, inserted into <body> unchanged
	 * @return string
	 */
	private function buildHtmlDocument( string $display, string $bodyHtml ): string {
		return "<!DOCTYPE html>\n" .
			"<html lang=\"en\" dir=\"ltr\">\n" .
			"<head>\n" .
			"<meta charset=\"UTF-8\" />\n" .
			"<meta name=\"color-scheme\" content=\"light dark\">" .
			"<title>" . htmlspecialchars( $display, ENT_COMPAT ) . "</title>\n" .
			"</head>\n" .
			"<body>\n" .
			$bodyHtml .
			"</body>\n" .
			"</html>";
	}
}
|
|
|
|
|
|
2024-08-27 12:00:25 +00:00
|
|
|
// @codeCoverageIgnoreStart
|
2018-01-13 00:02:09 +00:00
|
|
|
// Name of the maintenance class defined in this file.
// NOTE(review): presumably read by the script that RUN_MAINTENANCE_IF_MAIN points to -- confirm.
$maintClass = DumpRenderer::class;
|
2013-05-07 23:00:15 +00:00
|
|
|
require_once RUN_MAINTENANCE_IF_MAIN;
|
2024-08-27 12:00:25 +00:00
|
|
|
// @codeCoverageIgnoreEnd
|