[parsoid] Fix Parsoid relative links

Bug: T350952
Change-Id: I60165a9946a35cfb42a78ed2f833c34570fefffc
This commit is contained in:
C. Scott Ananian 2023-11-16 12:19:21 -05:00
parent 7328f958cd
commit 0e1b889a0f
4 changed files with 27 additions and 8 deletions

View file

@ -7,6 +7,7 @@ use Linker;
use MediaWiki\HookContainer\HookContainer;
use MediaWiki\HookContainer\HookRunner;
use MediaWiki\Html\Html;
use MediaWiki\Html\HtmlHelper;
use MediaWiki\Languages\LanguageFactory;
use MediaWiki\Parser\Parsoid\PageBundleParserOutputConverter;
use MediaWiki\Tidy\TidyDriverBase;
@ -18,6 +19,7 @@ use RequestContext;
use Sanitizer;
use Skin;
use Title;
use Wikimedia\RemexHtml\Serializer\SerializerNode;
/**
* This class contains the default output transformation pipeline for wikitext. It is a postprocessor for
@ -177,11 +179,34 @@ class DefaultOutputTransform {
*/
private function extractBody( string $text ): string {
	// This is a full HTML document, generated by Parsoid.

	// T350952: temporary fix for subpage paths: use Parsoid's
	// <base href> to expand relative links.
	// The tail of the pattern is `[^>]*` so that both `<base href="..."/>`
	// and a plain `<base href="...">` (nothing between the closing quote
	// and `>`) are recognised.
	$baseHref = '';
	if ( preg_match( '{<base href=["\']([^"\']+)["\'][^>]*>}', $text, $matches ) === 1 ) {
		$baseHref = $matches[1];
	}

	// Strip everything but the <body>
	// Probably would be better to process this as a DOM.
	// preg_replace() returns null on a PCRE error; in that case keep the
	// text as it was rather than handing null to a string parameter.
	$text = preg_replace( '!^.*?<body[^>]*>!s', '', $text, 1 ) ?? $text;
	$text = preg_replace( '!</body>\s*</html>\s*$!', '', $text, 1 ) ?? $text;

	// T350952: Expand relative links
	// What we should be doing here is parsing as a title and then
	// using Title::getLocalURL()
	return HtmlHelper::modifyElements(
		$text,
		// Select only <a> elements whose href is page-relative ("./...").
		static function ( SerializerNode $node ): bool {
			return $node->name === 'a' &&
				str_starts_with( $node->attrs['href'] ?? '', './' );
		},
		// Prefix the base href and expand to a protocol-relative URL.
		static function ( SerializerNode $node ) use ( $baseHref ): SerializerNode {
			$expanded = wfExpandUrl( $baseHref . $node->attrs['href'], PROTO_RELATIVE );
			// wfExpandUrl() yields false when no valid URL can be built
			// (e.g. no <base href> was found, leaving a bare "./Foo").
			// Keep the original href instead of storing a non-string.
			if ( is_string( $expanded ) ) {
				$node->attrs['href'] = $expanded;
			}
			return $node;
		}
	);
}

View file

@ -181,12 +181,6 @@ class ParsoidParser /* eventually this will extend \Parser */ {
$this->makeLimitReport( $options, $parserOutput );
// T350952: temporary fix for subpage paths: copy over Parsoid's
// <base href> into ParserOutput's head items.
if ( preg_match( '{<base href=[^>]+>}', $parserOutput->getRawText(), $matches ) === 1 ) {
$parserOutput->addHeadItem( $matches[0] );
}
// Record Parsoid version in extension data; this allows
// us to use the onRejectParserCacheValue hook to selectively
// expire "bad" generated content in the event of a rollback.

View file

@ -593,7 +593,7 @@ class ApiParseTest extends ApiTestCase {
yield [ false, false, $expected ];
yield [ false, true, $expected ];
// Parsoid parses, with and without pre-existing content.
$expected = '!^<section[^>]*><p[^>]*><a rel="mw:WikiLink" href="./Foo" title="Foo"[^>]*>Foo</a></p></section>!';
$expected = '!^<section[^>]*><p[^>]*><a rel="mw:WikiLink" href="[^"]*Foo" title="Foo"[^>]*>Foo</a></p></section>!';
yield [ true, false, $expected ];
yield [ true, true, $expected ];
}

View file

@ -77,7 +77,7 @@ class WikitextContentHandlerIntegrationTest extends TextContentHandlerIntegratio
'title' => 'WikitextContentTest_testGetParserOutput',
'model' => CONTENT_MODEL_WIKITEXT,
'text' => "#REDIRECT [[Main Page]]",
'expectedHtml' => '<div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr">' . "<div class=\"redirectMsg\"><p>Redirect to:</p><ul class=\"redirectText\"><li><a href=\"/index.php?title=Main_Page&amp;action=edit&amp;redlink=1\" class=\"new\" title=\"Main Page (page does not exist)\">Main Page</a></li></ul></div><section data-mw-section-id=\"0\" id=\"mwAQ\"><link rel=\"mw:PageProp/redirect\" href=\"./Main_Page\" id=\"mwAg\"/></section></div>",
'expectedHtml' => '<div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr">' . "<div class=\"redirectMsg\"><p>Redirect to:</p><ul class=\"redirectText\"><li><a href=\"/index.php?title=Main_Page&amp;action=edit&amp;redlink=1\" class=\"new\" title=\"Main Page (page does not exist)\">Main Page</a></li></ul></div><section data-mw-section-id=\"0\" id=\"mwAQ\"><link rel=\"mw:PageProp/redirect\" href=\"./Main_Page\" id=\"mwAg\"></section></div>",
'expectedFields' => [
'Links' => [
[ 'Main_Page' => 0 ],