wiki.techinc.nl/maintenance/compareLanguageConverterOutput.php
C. Scott Ananian 16de2c0851 [ParsoidParser] Remove unneeded code to set render ID
Since I72c5e6f86b7f081ab5ce7a56f5365d2f75067a78 it is part of the
contract of ContentRenderer::getParserOutput() that the render ID (and
other cache parameters) will be set when it returns.
(ContentHandler::getParserOutput() can set them even earlier if it has
custom content-based overrides.)  We had a lot of temporary
backward-compatibility code "later" in the parse process to try to close
the barn door if some code path "forgot" to set them, but these are
unnecessary now.

This patch removes that backward-compatibility code in ParsoidParser;
there is similar remaining code in ParserCache etc. which can be
addressed in follow-ups.

(For compatibility we do have to temporarily copy the render ID code
inside ParsoidOutputAccess::parseUncachable, but that class is
deprecated and will be removed.)

The HtmlOutputRendererHelper path which used to call
ParsoidParser::parseFakeRevision() is now replaced with a codepath that
goes through RevisionRenderer.  In order to maintain the same behavior
of the ParsoidHandler, we have also added 'useParsoid' handling to the
JsonContentHandler.  This support can perhaps be deprecated eventually.

Bug: T350538
Change-Id: I0853624cf785f72fd956c6c2336f979f4402a68f
2024-07-19 16:09:32 -04:00

283 lines
9 KiB
PHP

<?php
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
use MediaWiki\Config\ServiceOptions;
use MediaWiki\Content\TextContent;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Rest\Handler\Helper\PageRestHelperFactory;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Title\Title;
use MediaWiki\User\User;
use Wikimedia\Bcp47Code\Bcp47Code;
use Wikimedia\Diff\ArrayDiffFormatter;
use Wikimedia\Diff\ComplexityException;
use Wikimedia\Diff\Diff;
require_once __DIR__ . '/Maintenance.php';
/**
 * Maintenance script that compares variant conversion output between Parser and
 * HtmlOutputRendererHelper.
 *
 * The script renders one page twice (once through the Parser service, once
 * through HtmlOutputRendererHelper), reduces both results to word lists, and
 * prints word counts, a similar_text() score and a word-level diff table.
 *
 * @ingroup Maintenance
 */
class CompareLanguageConverterOutput extends Maintenance {
	/**
	 * Declare the script description and its two required positional arguments.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );
		$this->addArg(
			'page-title',
			'Name of the page to be parsed and compared',
			true
		);
		$this->addArg(
			'target-variant',
			'Target variant language code to transform the content to',
			true
		);
	}

	/**
	 * Validate the arguments, render the page through both code paths and
	 * print the comparison.
	 *
	 * Aborts through fatalError() when the title does not exist or the variant
	 * code is rejected by LanguageNameUtils::isValidBuiltInCode().
	 *
	 * @return bool Always true when the comparison ran to completion
	 */
	public function execute() {
		$services = $this->getServiceContainer();

		$pageName = $this->getArg( 'page-title' );
		$pageTitle = Title::newFromText( $pageName );
		if ( !$pageTitle || !$pageTitle->exists() ) {
			$this->fatalError( "Title with name $pageName not found" );
		}

		$targetVariantCode = $this->getArg( 'target-variant' );
		if ( !$services->getLanguageNameUtils()->isValidBuiltInCode( $targetVariantCode ) ) {
			$this->fatalError( "$targetVariantCode is not a supported variant" );
		}
		$targetVariant = $services->getLanguageFactory()->getLanguage( $targetVariantCode );

		$user = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );
		$baseLanguage = $pageTitle->getPageLanguage();

		$parserOutput = $this->getParserOutput( $pageTitle, $baseLanguage, $targetVariant );
		$parsoidOutput = $this->getParsoidOutput( $pageTitle, $targetVariant, $user );
		$converterUsed = $this->getConverterUsed( $parsoidOutput );

		$this->compareOutput( $parserOutput, $parsoidOutput, $converterUsed );
		return true;
	}

	/**
	 * Build a PageRestHelperFactory by hand from the service container.
	 *
	 * A NullStatsdDataFactory is passed in place of a stats service; every
	 * other dependency comes from the service container.
	 *
	 * @return PageRestHelperFactory
	 */
	private function newPageRestHelperFactory(): PageRestHelperFactory {
		$services = $this->getServiceContainer();

		return new PageRestHelperFactory(
			new ServiceOptions( PageRestHelperFactory::CONSTRUCTOR_OPTIONS, $services->getMainConfig() ),
			$services->getRevisionLookup(),
			$services->getRevisionRenderer(),
			$services->getTitleFormatter(),
			$services->getPageStore(),
			$services->getParsoidOutputStash(),
			new NullStatsdDataFactory(),
			$services->getParserOutputAccess(),
			$services->getParsoidSiteConfig(),
			$services->getHtmlTransformFactory(),
			$services->getContentHandlerFactory(),
			$services->getLanguageFactory(),
			$services->getRedirectStore(),
			$services->getLanguageConverterFactory()
		);
	}

	/**
	 * Create anonymous-user parser options targeting $language, with content
	 * and title conversion explicitly left enabled.
	 *
	 * @param Language $language Target language for the parse
	 * @return ParserOptions
	 */
	private function getParserOptions( Language $language ): ParserOptions {
		$parserOpts = ParserOptions::newFromAnon();
		$parserOpts->setTargetLanguage( $language );
		// Passing false means "do not disable": conversion stays on.
		$parserOpts->disableContentConversion( false );
		$parserOpts->disableTitleConversion( false );
		return $parserOpts;
	}

	/**
	 * Parse the main-slot text of the page's revision with the Parser service.
	 *
	 * Side effect: overwrites the global $wgDefaultLanguageVariant with the
	 * target variant code and does not restore it.
	 *
	 * @param Title $pageTitle Page to parse
	 * @param Language $baseLanguage Page language; its parent language is used
	 *   as the parser target language
	 * @param Language $targetVariant Variant the Parser should convert to
	 * @return ParserOutput
	 */
	private function getParserOutput(
		Title $pageTitle,
		Language $baseLanguage,
		Language $targetVariant
	): ParserOutput {
		// We update the default language variant because we want Parser to
		// perform variant conversion to it.
		global $wgDefaultLanguageVariant;
		$wgDefaultLanguageVariant = $targetVariant->getCode();

		$services = $this->getServiceContainer();
		$parser = $services->getParser();
		$parserOptions = $this->getParserOptions(
			$services->getLanguageFactory()->getParentLanguage( $baseLanguage )
		);

		$revision = $services->getRevisionLookup()->getRevisionByTitle( $pageTitle );
		if ( $revision === null ) {
			// Fail with a readable message instead of a fatal "call to a member
			// function on null" if no revision can be loaded for the title.
			$this->fatalError( "No revision found for page {$pageTitle->getPrefixedText()}" );
		}

		$content = $revision->getContent( SlotRecord::MAIN );
		// Anything that is not TextContent (including null) is parsed as empty text.
		$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

		return $parser->parse( $wikiContent, $pageTitle, $parserOptions );
	}

	/**
	 * Render the page through HtmlOutputRendererHelper with variant conversion
	 * to $targetVariant.
	 *
	 * @param Title $pageTitle Page to render
	 * @param Bcp47Code $targetVariant Variant to convert the output to
	 * @param User $user User on whose behalf the helper is created
	 * @return ParserOutput
	 */
	private function getParsoidOutput(
		Title $pageTitle,
		Bcp47Code $targetVariant,
		User $user
	): ParserOutput {
		$helperParams = [
			'stash' => false,
			'flavor' => 'view',
		];
		$helper = $this->newPageRestHelperFactory()
			->newHtmlOutputRendererHelper( $pageTitle, $helperParams, $user );
		$helper->setVariantConversionLanguage( $targetVariant );

		return $helper->getHtml();
	}

	/**
	 * Strip tags from $output and split what remains on whitespace.
	 *
	 * @param string $output HTML or plain text
	 * @return string[] Non-empty words in document order
	 */
	private function getWords( string $output ): array {
		$plainText = trim( strip_tags( $output ) );
		$words = preg_split( '/\s+/', $plainText, -1, PREG_SPLIT_NO_EMPTY );

		// preg_split() returns false on a PCRE error; keep the declared array
		// return type intact in that case.
		return $words === false ? [] : $words;
	}

	/**
	 * Return the text content of the <body> element of an HTML string.
	 *
	 * @param string $output HTML document or fragment
	 * @return string Text content of <body>, or $output unchanged when no
	 *   <body> element is found
	 */
	private function getBody( string $output ): string {
		if ( $output === '' ) {
			// DOMDocument::loadHTML() throws a ValueError for an empty string on
			// PHP 8, which the @ operator does not suppress. There is no body to
			// extract anyway.
			return '';
		}

		$dom = new DOMDocument();
		// Line-scoped ignore: "phpcs:disable" would turn the sniff off for the
		// remainder of the file.
		// phpcs:ignore Generic.PHP.NoSilencedErrors.Discouraged
		@$dom->loadHTML( $output );

		$body = $dom->getElementsByTagName( 'body' )->item( 0 );
		if ( $body === null ) {
			// Body element not present
			return $output;
		}

		return $body->textContent;
	}

	/**
	 * Print word counts, similarity and a word diff for the two outputs.
	 *
	 * The Parsoid text is reduced to its <body> text before splitting into
	 * words; the Parser text is split directly.
	 *
	 * @param ParserOutput $parserOutput Output of the Parser run
	 * @param ParserOutput $parsoidOutput Output of the HtmlOutputRendererHelper run
	 * @param string $converterUsed Label shown in the diff table header
	 */
	private function compareOutput(
		ParserOutput $parserOutput,
		ParserOutput $parsoidOutput,
		string $converterUsed
	): void {
		$textOptions = [ 'deduplicateStyles' => false ];
		$parsoidText = $parsoidOutput->getText( $textOptions );
		$parserText = $parserOutput->getText( $textOptions );

		$parsoidWords = $this->getWords( $this->getBody( $parsoidText ) );
		$parserWords = $this->getWords( $parserText );

		$parserWordCount = count( $parserWords );
		$parsoidWordCount = count( $parsoidWords );
		$this->output( "Word count: Parsoid: $parsoidWordCount; Parser: $parserWordCount\n" );

		$this->outputSimilarity( $parsoidWords, $parserWords );
		$this->output( "\n" );
		$this->outputDiff( $parsoidWords, $parserWords, $converterUsed );
	}

	/**
	 * Report which language converter produced the Parsoid output, based on a
	 * marker string in its raw text.
	 *
	 * @param ParserOutput $parsoidOutput
	 * @return string 'Core LanguageConverter' when the marker is present,
	 *   'Parsoid LanguageConverter' otherwise
	 */
	private function getConverterUsed( ParserOutput $parsoidOutput ): string {
		$markerPosition = strpos(
			$parsoidOutput->getRawText(),
			'Variant conversion performed using the core LanguageConverter'
		);

		// strpos() returns 0 for a match at the very start of the string, so
		// compare against false explicitly instead of relying on truthiness.
		if ( $markerPosition !== false ) {
			return 'Core LanguageConverter';
		}

		return 'Parsoid LanguageConverter';
	}

	/**
	 * sprintf() wrapper that pads "%Ns" / "%-Ns" fields by character count
	 * rather than byte count.
	 *
	 * Each width is increased by the difference between strlen() and
	 * mb_strlen() of the corresponding argument, so sprintf()'s byte-based
	 * padding comes out right for multibyte text. Arguments are matched to
	 * width specifiers in order, so every argument is expected to have a
	 * width specifier in $format.
	 *
	 * Inspired from: https://stackoverflow.com/a/55927237/903324
	 *
	 * @param string $format sprintf() format string
	 * @param mixed ...$args Values to format
	 * @return string
	 */
	private function mb_sprintf( string $format, ...$args ): string {
		$remaining = $args;

		$adjustedFormat = preg_replace_callback(
			'/(?<=%|%-)\d+(?=s)/',
			static function ( array $matches ) use ( &$remaining ) {
				// Explicit casts: no reliance on implicit numeric-string
				// arithmetic, and no null passed to strlen() if the arguments
				// run out.
				$value = (string)array_shift( $remaining );
				$extraBytes = strlen( $value ) - mb_strlen( $value );
				return (string)( $extraBytes + (int)$matches[0] );
			},
			$format
		);

		return sprintf( $adjustedFormat, ...$args );
	}

	/**
	 * Print the size of both outputs and their similar_text() similarity.
	 *
	 * The totals come from strlen() and the similarity from similar_text(),
	 * both of which operate on bytes, so for multibyte text the figures are
	 * byte counts. No trailing newline is printed.
	 *
	 * @param string[] $parsoidWords
	 * @param string[] $parserWords
	 */
	private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
		$parsoidOutput = implode( ' ', $parsoidWords );
		$parserOutput = implode( ' ', $parserWords );

		$this->output(
			'Total characters: Parsoid: ' . strlen( $parsoidOutput ) .
			'; Parser: ' . strlen( $parserOutput ) . "\n"
		);

		$similarityPercent = 0;
		$similarCharacters = similar_text( $parsoidOutput, $parserOutput, $similarityPercent );
		$similarityPercent = round( $similarityPercent, 2 );

		$this->output(
			"Similarity via similar_text(): $similarityPercent%; Similar characters: $similarCharacters"
		);
	}

	/**
	 * Print a table with the word-level differences between the two outputs.
	 *
	 * Aborts through fatalError() when the diff cannot be computed.
	 *
	 * @param string[] $parsoidWords Words from the Parsoid output ("old" side)
	 * @param string[] $parserWords Words from the Parser output ("new" side)
	 * @param string $converterUsed Label shown under the Parsoid column header
	 */
	private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
		$rule = str_repeat( '-', 96 ) . "\n";
		$rowFormat = "| %5s | %-35s | %-35s | %-8s |\n";

		// Table header.
		$out = $rule;
		$out .= sprintf( $rowFormat, 'Line', 'Parsoid', 'Parser', 'Diff' );
		$out .= sprintf( $rowFormat, '', "($converterUsed)", '', '' );
		$out .= $rule;

		try {
			$diff = new Diff( $parsoidWords, $parserWords );
		} catch ( ComplexityException $e ) {
			$this->output( $e->getMessage() );
			// Abort here: carrying on would use $diff, which is never assigned
			// when the constructor throws.
			$this->fatalError( 'Encountered ComplexityException while computing diff' );
		}

		// Print the difference between the words
		$wordDiffFormat = ( new ArrayDiffFormatter() )->format( $diff );
		foreach ( $wordDiffFormat as $index => $wordDiff ) {
			$action = $wordDiff['action'];
			// One side may be missing from an entry; show a placeholder for it.
			$old = $wordDiff['old'] ?? '- N/A -';
			$new = $wordDiff['new'] ?? '- N/A -';

			$out .= $this->mb_sprintf(
				$rowFormat,
				str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT ),
				mb_strimwidth( $old, 0, 35, '…' ),
				mb_strimwidth( $new, 0, 35, '…' ),
				$action
			);
		}

		// Print the footer.
		$out .= $rule;
		$this->output( "\n" . $out );
	}
}
// Name the class defined in this file as the maintenance script class, then
// include the file named by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): per the constant's name this presumably runs the script only
// when this file is the entry point; confirm against Maintenance.php.
$maintClass = CompareLanguageConverterOutput::class;
require_once RUN_MAINTENANCE_IF_MAIN;