ExtractBody: Convert page-internal link fragments to pure fragment urls

* This ensures that when you have query params like (?useparsoid=1),
  all cite links no longer take you to the non-Parsoid page but
  resolve internally.

* Additionally, this also unbreaks reference previews in local testing
  - not yet sure if this will fix all breakage in production.

* We don't have ready access to the title string and so this patch
  extracts it from a link tag in the <head> of Parsoid HTML. That is
  guaranteed to be correct and reliably present.

  But, if in the future, this changes (whether by adding it to
  ParserOptions, ParserOutput, or the $opts array), we can use that
  directly.

* Added new unit tests that verify the new expectations.

Bug: T358242
Change-Id: Iaf482cc9803564b4cf4ae04f975573f61ff3b0e4
This commit is contained in:
Subramanya Sastry 2024-04-09 18:12:14 +05:30
parent cddfa25d5a
commit 9a46631029
2 changed files with 65 additions and 13 deletions

View file

@ -23,7 +23,7 @@ class ExtractBody extends ContentTextTransformStage {
'a' => true, 'img' => true, 'video' => true, 'audio' => true,
];
private static function expandRelativeAttrs( string $text, string $baseHref ): string {
private static function expandRelativeAttrs( string $text, string $baseHref, string $pageFragmentPrefix ): string {
// T350952: Expand relative links
// What we should be doing here is parsing as a title and then
// using Title::getLocalURL()
@ -36,11 +36,19 @@ class ExtractBody extends ContentTextTransformStage {
$attr = $node->name === 'a' ? 'href' : 'resource';
return str_starts_with( $node->attrs[$attr] ?? '', './' );
},
static function ( SerializerNode $node ) use ( $baseHref ): SerializerNode {
static function ( SerializerNode $node ) use ( $baseHref, $pageFragmentPrefix ): SerializerNode {
$attr = $node->name === 'a' ? 'href' : 'resource';
$href = $baseHref . $node->attrs[$attr];
$node->attrs[$attr] =
wfExpandUrl( $href, PROTO_RELATIVE );
$href = $node->attrs[$attr];
// Convert page fragment urls to true fragment urls
// This ensures that those fragments include any URL query params
// and resolve internally. (Ex: on pages with ?useparsoid=1,
// cite link fragments should not take you to a different page).
if ( $pageFragmentPrefix && str_starts_with( $href, $pageFragmentPrefix ) ) {
$node->attrs[$attr] = substr( $href, strlen( $pageFragmentPrefix ) - 1 );
} else {
$href = $baseHref . $href;
$node->attrs[$attr] = wfExpandUrl( $href, PROTO_RELATIVE );
}
return $node;
}
);
@ -50,13 +58,23 @@ class ExtractBody extends ContentTextTransformStage {
// T350952: temporary fix for subpage paths: use Parsoid's
// <base href> to expand relative links
$baseHref = '';
$pageFragmentPrefix = '';
if ( preg_match( '{<base href=["\']([^"\']+)["\'][^>]+>}', $text, $matches ) === 1 ) {
$baseHref = $matches[1];
// Since we don't have easy access to the title, extract it from the
// Parsoid HTML where the title is embedded in the <head>.
if ( preg_match(
'%<link rel="dc:isVersionOf" href=(?:"([^"]+)"|\'[^\']+\')\s*/>%',
$text, $matches ) === 1
) {
$title = substr( $matches[1], strlen( $baseHref ) );
$pageFragmentPrefix = "./" . urldecode( $title ) . "#";
}
}
foreach ( $po->getIndicators() as $name => $html ) {
$po->setIndicator( $name, self::expandRelativeAttrs( $html, $baseHref ) );
$po->setIndicator( $name, self::expandRelativeAttrs( $html, $baseHref, $pageFragmentPrefix ) );
}
$text = Parser::extractBody( $text );
return self::expandRelativeAttrs( $text, $baseHref );
return self::expandRelativeAttrs( $text, $baseHref, $pageFragmentPrefix );
}
}

View file

@ -30,15 +30,49 @@ class ExtractBodyTest extends OutputTransformStageTestBase {
}
public function provideTransform(): array {
$text = <<<EOF
$pageTemplate = <<<EOF
<!DOCTYPE html>
<html><head><title>Main Page</title><base href="https://www.example.com/w/"/></head><body data-parsoid="{'dsr':[0,6,0,0]} "lang="en"><p data-parsoid="{'dsr':[0,5,0,0]}"><a href="./Hello">hello</a></p>
<html><head><title>__TITLE__</title><link rel="dc:isVersionOf" href="https://www.example.com/w/__TITLE__"/><base href="https://www.example.com/w/"/></head><body>__BODY__
</body></html>
EOF;
$body = "<p data-parsoid=\"{'dsr':[0,5,0,0]}\"><a href=\"https://www.example.com/w/Hello\">hello</a></p>\n";
$opts = [];
return [
[ new ParserOutput( $text ), null, $opts, new ParserOutput( $body ) ]
$testData = [
[
"title" => "Foo",
"body" => "<p><a href=\"./Hello\">hello</a></p>",
"result" => "<p><a href=\"https://www.example.com/w/Hello\">hello</a></p>\n"
],
// Rest of these tests ensure that self-page cite fragments are converted to fragment urls
// Tests different title scenarios to exercise edge cases
[
"title" => "Foo",
"body" => "<p><a href=\"./Foo#cite1\">hello</a><a href=\"./Foo/Bar#cite1\">boo</a></p>",
"result" => "<p><a href=\"#cite1\">hello</a><a href=\"https://www.example.com/w/Foo/Bar#cite1\">boo</a></p>\n"
],
[
"title" => "Foo/Bar",
"body" => "<p><a href=\"./Foo/Bar#cite1\">hello</a></p>",
"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
],
[
"title" => "Foo/Bar'Baz",
"body" => "<p><a href=\"./Foo/Bar'Baz#cite1\">hello</a></p>",
"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
],
[
"title" => "Foo/Bar%22Baz",
"body" => "<p><a href='./Foo/Bar\"Baz#cite1'>hello</a></p>",
"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
]
];
$opts = [];
$tests = [];
foreach ( $testData as $t ) {
// Strictly speaking, __TITLE__ in the <title> tag will have entites decoded
// but since we aren't testing that or using that tag, we are ignoring that detail.
$text = str_replace( "__TITLE__", $t['title'], $pageTemplate );
$text = str_replace( "__BODY__", $t['body'], $text );
$tests[] = [ new ParserOutput( $text ), null, $opts, new ParserOutput( $t['result'] ) ];
}
return $tests;
}
}