wiki.techinc.nl/includes/ParserOutputTransform/DefaultOutputTransform.php
C. Scott Ananian 0e1b889a0f [parsoid] Fix Parsoid relative links
Bug: T350952
Change-Id: I60165a9946a35cfb42a78ed2f833c34570fefffc
2023-11-16 16:28:55 -05:00

434 lines
14 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace Mediawiki\ParserOutputTransform;
use Language;
use Linker;
use MediaWiki\HookContainer\HookContainer;
use MediaWiki\HookContainer\HookRunner;
use MediaWiki\Html\Html;
use MediaWiki\Html\HtmlHelper;
use MediaWiki\Languages\LanguageFactory;
use MediaWiki\Parser\Parsoid\PageBundleParserOutputConverter;
use MediaWiki\Tidy\TidyDriverBase;
use Parser;
use ParserOutput;
use Psr\Log\LoggerInterface;
use RawMessage;
use RequestContext;
use Sanitizer;
use Skin;
use Title;
use Wikimedia\RemexHtml\Serializer\SerializerNode;
/**
* This class contains the default output transformation pipeline for wikitext. It is a postprocessor for
* ParserOutput objects either directly resulting from a parse or fetched from ParserCache.
* @unstable
*/
class DefaultOutputTransform {
/** @var HookRunner Runs the ParserOutputPostCacheTransform and ParserLimitReportFormat hooks */
private HookRunner $hookRunner;
/** @var TidyDriverBase Used to tidy the generated table of contents HTML */
private TidyDriverBase $tidy;
/** @var LanguageFactory Turns the language reported by a ParserOutput into a Language object */
private LanguageFactory $langFactory;
/** @var Language Fallback used when the ParserOutput does not report a language */
private Language $contentLang;
/** @var LoggerInterface Receives an error when a section edit placeholder names a bad title */
private LoggerInterface $logger;
/**
* Matches a section edit link placeholder. The capture groups are, in order,
* the page, the section and the placeholder content.
*/
private const EDITSECTION_REGEX =
'#<mw:editsection page="(.*?)" section="(.*?)">(.*?)</mw:editsection>#s';
/**
 * @param HookContainer $hc Wrapped in a HookRunner used to fire hooks
 * @param TidyDriverBase $tidy
 * @param LanguageFactory $langFactory
 * @param Language $contentLang
 * @param LoggerInterface $logger
 */
public function __construct(
	HookContainer $hc,
	TidyDriverBase $tidy,
	LanguageFactory $langFactory,
	Language $contentLang,
	LoggerInterface $logger
) {
	// Collaborators that are stored as-is.
	$this->logger = $logger;
	$this->contentLang = $contentLang;
	$this->langFactory = $langFactory;
	$this->tidy = $tidy;
	// The container is not kept; only the runner built on top of it is.
	$this->hookRunner = new HookRunner( $hc );
}
/**
* Transforms the content of the ParserOutput object from "parsed HTML" to "output HTML" and returns it.
*
* The steps run in a fixed order: body extraction, redirect header, debug info,
* the ParserOutputPostCacheTransform hook, wrapper div, section edit links,
* table of contents, style deduplication, URL expansion, and slot header hydration.
* The resulting text is stored on the ParserOutput with setTransformedText().
* @internal
* @param ParserOutput $po - may be mutated in place
* @param array $options Transformations to apply to the HTML
* - allowTOC: (bool) Show the TOC, assuming there were enough headings
* to generate one and `__NOTOC__` wasn't used. Default is true,
* but might be statefully overridden.
* - injectTOC: (bool) Replace the TOC_PLACEHOLDER with TOC contents;
* otherwise the marker will be left in the article (and the skin
* will be responsible for replacing or removing it). Default is
* true.
* - enableSectionEditLinks: (bool) Include section edit links, assuming
* section edit link tokens are present in the HTML. Default is true,
* but might be statefully overridden.
* - userLang: (Language) Language object used for localizing UX messages,
* for example the heading of the table of contents. If omitted, will
* use the language of the skin (if given), and otherwise the language
* of the main request context.
* - skin: (Skin) Skin object used for transforming section edit links.
* If omitted, the skin of the main request context is used.
* - unwrap: (bool) Return text without a wrapper div. Default is false,
* meaning a wrapper div will be added if getWrapperDivClass() returns
* a non-empty string.
* - wrapperDivClass: (string) Wrap the output in a div and apply the given
* CSS class to that div. This overrides the output of getWrapperDivClass().
* Setting this to an empty string has the same effect as 'unwrap' => true.
* - deduplicateStyles: (bool) When true, which is the default, `<style>`
* tags with the `data-mw-deduplicate` attribute set are deduplicated by
* value of the attribute: all but the first will be replaced by `<link
* rel="mw-deduplicated-inline-style" href="mw-data:..."/>` tags, where
* the scheme-specific-part of the href is the (percent-encoded) value
* of the `data-mw-deduplicate` attribute.
* - absoluteURLs: (bool) use absolute URLs in all links. Default: false
* - includeDebugInfo: (bool) render PP limit report in HTML. Default: false
* - bodyContentOnly: (bool) When the ParserOutput carries a page bundle,
* keep only the contents of the `<body>` element of the raw text.
* Default: true
* @return ParserOutput - may be an in-place mutation of the argument input
*/
public function transform( ParserOutput $po, array $options = [] ): ParserOutput {
// Fill in defaults; keys supplied by the caller win over these.
$options += [
'allowTOC' => true,
'injectTOC' => true,
'enableSectionEditLinks' => true,
'userLang' => null,
'skin' => null,
'unwrap' => false,
'wrapperDivClass' => $po->getWrapperDivClass(),
'deduplicateStyles' => true,
'absoluteURLs' => false,
'includeDebugInfo' => false,
'bodyContentOnly' => true,
];
$text = $po->getRawText();
// Output with a page bundle is a full HTML document; reduce it to its body.
if (
$options['bodyContentOnly'] &&
PageBundleParserOutputConverter::hasPageBundle( $po )
) {
$text = $this->extractBody( $text );
}
// The redirect header, if any, goes in front of the content.
$redirectHeader = $po->getRedirectHeader();
if ( $redirectHeader ) {
$text = $redirectHeader . $text;
}
// Debug info is appended as HTML comments after the content.
if ( $options['includeDebugInfo'] ) {
$text .= $this->renderDebugInfo( $po );
}
// NOTE(review): handlers presumably receive $text and $options by reference
// and may change them - confirm against the hook interface.
$this->hookRunner->onParserOutputPostCacheTransform( $po, $text, $options );
// Wrap in a div carrying the page language and direction, unless disabled.
if ( $options['wrapperDivClass'] !== '' && !$options['unwrap'] ) {
$pageLang = $this->getLanguageWithFallbackGuess( $po );
$text = Html::rawElement( 'div', [
'class' => 'mw-content-' . $pageLang->getDir() . ' ' . $options['wrapperDivClass'],
'lang' => $pageLang->toBcp47Code(),
'dir' => $pageLang->getDir(),
], $text );
}
// Bare string statement: a hint for static analysis with no runtime effect.
'@phan-var string $text';
if ( $options['enableSectionEditLinks'] ) {
// TODO: Skin should not be required.
// It would be better to define one or more narrow interfaces to use here,
// so this code doesn't have to depend on all of Skin.
// See OutputPage::addParserOutputText()
$text = $this->addSectionLinks( $text, $po, $this->resolveSkin( $options ) );
} else {
$text = $this->removeSectionLinks( $text );
}
// With allowTOC set but injectTOC unset, the TOC marker is left untouched.
if ( $options['allowTOC'] ) {
if ( $options['injectTOC'] ) {
$text = $this->injectToc( $text, $this->resolveUserLanguage( $options ), $po );
}
} else {
$text = Parser::replaceTableOfContentsMarker( $text, '' );
}
if ( $options['deduplicateStyles'] ) {
$text = $this->deduplicateStyles( $text );
}
// Expand all relative URLs
if ( $options['absoluteURLs'] && $text ) {
$text = Linker::expandLocalLinks( $text );
}
$text = $this->hydrateHeaderPlaceholders( $text );
$po->setTransformedText( $text );
return $po;
}
/**
 * Pick the language of the given output, falling back on the content
 * language when the output does not report one.
 * @param ParserOutput $po
 * @return Language
 */
private function getLanguageWithFallbackGuess( ParserOutput $po ): Language {
	$reportedLang = $po->getLanguage();
	return $reportedLang
		? $this->langFactory->getLanguage( $reportedLang )
		: $this->contentLang;
}
/**
 * Strip everything but the contents of the <body> from a full HTML document,
 * then expand the href of every <a> element whose href starts with "./".
 * @param string $text Full HTML document, as generated by Parsoid
 * @return string
 */
private function extractBody( string $text ): string {
	// This is a full HTML document, generated by Parsoid.
	// T350952: temporary fix for subpage paths: use Parsoid's
	// <base href> to expand relative links
	// The tail is `[^>]*` rather than `[^>]+` so that a tag closing right
	// after the attribute value (<base href="...">) is matched as well as
	// the self-closing form (<base href="..."/>).
	$baseHref = '';
	if ( preg_match( '{<base href=["\']([^"\']+)["\'][^>]*>}', $text, $matches ) === 1 ) {
		$baseHref = $matches[1];
	}
	// Strip everything but the <body>
	// Probably would be better to process this as a DOM.
	$text = preg_replace( '!^.*?<body[^>]*>!s', '', $text, 1 );
	$text = preg_replace( '!</body>\s*</html>\s*$!', '', $text, 1 );
	// T350952: Expand relative links
	// What we should be doing here is parsing as a title and then
	// using Title::getLocalURL()
	$text = HtmlHelper::modifyElements(
		$text,
		// Select only <a> elements with a "./"-relative href.
		static function ( SerializerNode $node ): bool {
			return $node->name === 'a' &&
				str_starts_with( $node->attrs['href'] ?? '', './' );
		},
		// Prefix the base href (empty if none was found) and expand the result.
		static function ( SerializerNode $node ) use ( $baseHref ): SerializerNode {
			$href = $baseHref . $node->attrs['href'];
			$node->attrs['href'] =
				wfExpandUrl( $href, PROTO_RELATIVE );
			return $node;
		}
	);
	return $text;
}
/**
 * Keep the first <style> tag for each `data-mw-deduplicate` value and swap
 * every later one for a <link rel="mw-deduplicated-inline-style"> tag.
 * @param string $text
 * @return string
 */
private function deduplicateStyles( string $text ): string {
	$pattern = '#<style\s+([^>]*data-mw-deduplicate\s*=[^>]*)>.*?</style>#s';
	// Keys already emitted as a real <style> tag.
	$emittedKeys = [];
	$replaceDuplicate = static function ( $match ) use ( &$emittedKeys ) {
		$attribs = Sanitizer::decodeTagAttributes( $match[1] );
		$dedupKey = $attribs['data-mw-deduplicate'] ?? null;
		if ( $dedupKey === null ) {
			// The attribute did not survive decoding: leave the tag alone.
			return $match[0];
		}
		if ( isset( $emittedKeys[$dedupKey] ) ) {
			// We were going to use an empty <style> here, but there
			// was concern that would be too much overhead for browsers.
			// So let's hope a <link> with a non-standard rel and href isn't
			// going to be misinterpreted or mangled by any subsequent processing.
			return Html::element( 'link', [
				'rel' => 'mw-deduplicated-inline-style',
				'href' => "mw-data:" . wfUrlencode( $dedupKey ),
			] );
		}
		// First occurrence: remember it and keep the original tag.
		$emittedKeys[$dedupKey] = true;
		return $match[0];
	};
	return preg_replace_callback( $pattern, $replaceDuplicate, $text );
}
/**
 * Replace the table of contents marker with the generated table of
 * contents, or with an empty string when the output has no sections.
 * @param string $text
 * @param Language $userLang
 * @param ParserOutput $po
 * @return string
 */
private function injectToc( string $text, Language $userLang, ParserOutput $po ): string {
	$tocHtml = '';
	if ( count( $po->getSections() ) !== 0 ) {
		$tocHtml = $this->tidy->tidy(
			Linker::generateTOC( $po->getTOCData(), $userLang ),
			[ Sanitizer::class, 'armorFrenchSpaces' ]
		);
	}
	return Parser::replaceTableOfContentsMarker( $text, $tocHtml );
}
/**
 * Swap each slot section header placeholder generated by RevisionRenderer
 * for its header text.
 * @param string $text
 * @return string
 */
private function hydrateHeaderPlaceholders( string $text ) {
	$placeholderPattern = '#<mw:slotheader>(.*?)</mw:slotheader>#';
	// TODO: map to message, using the interface language. Set lang="xyz" accordingly.
	// Until then the header text is just the entity-decoded role.
	$toHeaderText = static fn ( $match ) => htmlspecialchars_decode( $match[1] );
	return preg_replace_callback( $placeholderPattern, $toHeaderText, $text );
}
/**
 * Turn every section edit placeholder into the edit link produced by the
 * skin. A placeholder whose page is not a valid title is logged and dropped.
 * @param string $text
 * @param ParserOutput $po
 * @param Skin $skin
 * @return string
 */
private function addSectionLinks( string $text, ParserOutput $po, Skin $skin ): string {
	// Not static: the closure logs through $this->logger.
	$expandPlaceholder = function ( $match ) use ( $po, $skin ) {
		[ $placeholder, $rawPage, $rawSection, $rawContent ] = $match;
		$targetTitle = Title::newFromText( htmlspecialchars_decode( $rawPage ) );
		if ( !is_object( $targetTitle ) ) {
			$this->logger->error(
				'DefaultOutputTransform::transform: bad title in editsection placeholder',
				[
					'placeholder' => $placeholder,
					'editsectionPage' => $rawPage,
					'titletext' => $po->getTitleText(),
					'phab' => 'T261347',
				]
			);
			return '';
		}
		return $skin->doEditSectionLink(
			$targetTitle,
			htmlspecialchars_decode( $rawSection ),
			Sanitizer::decodeCharReferences( $rawContent ),
			$skin->getLanguage()
		);
	};
	return preg_replace_callback( self::EDITSECTION_REGEX, $expandPlaceholder, $text );
}
/**
 * Remove every section edit link placeholder from the text.
 * @param string $text
 * @return string
 */
private function removeSectionLinks( string $text ): string {
	$withoutPlaceholders = preg_replace( self::EDITSECTION_REGEX, '', $text );
	return $withoutPlaceholders;
}
/**
 * Work out the user language: the 'userLang' option if set, else the
 * language of the 'skin' option, else the main request context language.
 * @param array $options
 * @return Language
 */
private function resolveUserLanguage( array $options ): Language {
	$requestedLang = $options['userLang'];
	$skin = $options['skin'];
	if ( $requestedLang ) {
		return $requestedLang;
	}
	if ( $skin ) {
		// TODO: See above comment about replacing the use of 'skin' here
		$skinLang = $skin->getLanguage();
		if ( $skinLang ) {
			return $skinLang;
		}
	}
	// T348853 passing either userLang or skin will be mandatory in the future
	return RequestContext::getMain()->getLanguage();
}
/**
 * Work out the skin: the 'skin' option if set, else the skin of the main
 * request context.
 * @param array $options
 * @return Skin
 */
private function resolveSkin( array $options ): Skin {
	// T348853 passing $skin will be mandatory in the future
	return ( $options[ 'skin' ] ?? null ) ?: RequestContext::getMain()->getSkin();
}
/**
 * Render the limit report, the timing profile, the cache message and the
 * Parsoid version of the given output as HTML comments.
 * @param ParserOutput $po
 * @return string HTML comments; empty when the output carries none of these
 */
private function renderDebugInfo( ParserOutput $po ): string {
	$text = '';
	$limitReportData = $po->getLimitReportData();
	// If nothing set it, we can't get it.
	if ( $limitReportData ) {
		$limitReport = "NewPP limit report\n";
		if ( array_key_exists( 'cachereport-origin', $limitReportData ) ) {
			$limitReport .= "Parsed by {$limitReportData['cachereport-origin']}\n";
		}
		if ( array_key_exists( 'cachereport-timestamp', $limitReportData ) ) {
			$limitReport .= "Cached time: {$limitReportData['cachereport-timestamp']}\n";
		}
		if ( array_key_exists( 'cachereport-ttl', $limitReportData ) ) {
			$limitReport .= "Cache expiry: {$limitReportData['cachereport-ttl']}\n";
		}
		if ( array_key_exists( 'cachereport-transientcontent', $limitReportData ) ) {
			$transient = $limitReportData['cachereport-transientcontent'] ? 'true' : 'false';
			$limitReport .= "Reduced expiry: $transient\n";
		}
		// TODO: flags should go into limit report too.
		$limitReport .= 'Complications: [' . implode( ', ', $po->getAllFlags() ) . "]\n";
		foreach ( $limitReportData as $key => $value ) {
			// Strict comparison, so that only these exact keys are skipped.
			if ( in_array( $key, [
				'cachereport-origin',
				'cachereport-timestamp',
				'cachereport-ttl',
				'cachereport-transientcontent',
				'limitreport-timingprofile',
			], true ) ) {
				// These keys are processed separately.
				continue;
			}
			// A handler returning false suppresses the default formatting below.
			if ( $this->hookRunner->onParserLimitReportFormat(
				$key, $value, $limitReport, false, false )
			) {
				$keyMsg = wfMessage( $key )->inLanguage( 'en' )->useDatabase( false );
				$valueMsg = wfMessage( [ "$key-value-text", "$key-value" ] )
					->inLanguage( 'en' )->useDatabase( false );
				if ( !$valueMsg->exists() ) {
					$valueMsg = new RawMessage( '$1' );
				}
				if ( !$keyMsg->isDisabled() && !$valueMsg->isDisabled() ) {
					$valueMsg->params( $value );
					$limitReport .= "{$keyMsg->text()}: {$valueMsg->text()}\n";
				}
			}
		}
		// Since we're not really outputting HTML, decode the entities and
		// then re-encode the things that need hiding inside HTML comments.
		$limitReport = htmlspecialchars_decode( $limitReport );
		// Sanitize for comment. The replacement for '-' is U+2010 (HYPHEN),
		// which looks much like the problematic '-'. It is written as an
		// escape sequence so the character cannot be lost or confused in
		// the source file.
		$limitReport = str_replace( [ '-', '&' ], [ "\u{2010}", '&amp;' ], $limitReport );
		$text = "\n<!-- \n$limitReport-->\n";
		$profileReport = $limitReportData['limitreport-timingprofile'] ?? null;
		if ( $profileReport ) {
			$text .= "<!--\nTransclusion expansion time report (%,ms,calls,template)\n";
			$text .= implode( "\n", $profileReport ) . "\n-->\n";
		}
	}
	if ( $po->getCacheMessage() ) {
		$text .= "\n<!-- " . $po->getCacheMessage() . "\n -->\n";
	}
	$parsoidVersion = $po->getExtensionData( 'core:parsoid-version' );
	if ( $parsoidVersion ) {
		$text .= "\n<!--Parsoid $parsoidVersion-->\n";
	}
	return $text;
}
}