ExtractBody: Convert page-internal link fragments to pure fragment urls
* This ensures that when you have query params like (?useparsoid=1), all cite links no longer take you to the non-Parsoid page but resolve internally. * Additionally, this also unbreaks reference previews in local testing - not yet sure if this will fix all breakage in production. * We don't have ready access to the title string and so this patch extracts it from a link tag in the <head> of Parsoid HTML. That is guaranteed to be correct and reliably present. But, if in the future, this changes (whether by adding it to ParserOptions, ParserOutput, or the $opts array), we can use that directly. * Added new unit tests that verify the new expectations. Bug: T358242 Change-Id: Iaf482cc9803564b4cf4ae04f975573f61ff3b0e4
This commit is contained in:
parent
cddfa25d5a
commit
9a46631029
2 changed files with 65 additions and 13 deletions
|
|
@ -23,7 +23,7 @@ class ExtractBody extends ContentTextTransformStage {
|
|||
'a' => true, 'img' => true, 'video' => true, 'audio' => true,
|
||||
];
|
||||
|
||||
private static function expandRelativeAttrs( string $text, string $baseHref ): string {
|
||||
private static function expandRelativeAttrs( string $text, string $baseHref, string $pageFragmentPrefix ): string {
|
||||
// T350952: Expand relative links
|
||||
// What we should be doing here is parsing as a title and then
|
||||
// using Title::getLocalURL()
|
||||
|
|
@ -36,11 +36,19 @@ class ExtractBody extends ContentTextTransformStage {
|
|||
$attr = $node->name === 'a' ? 'href' : 'resource';
|
||||
return str_starts_with( $node->attrs[$attr] ?? '', './' );
|
||||
},
|
||||
static function ( SerializerNode $node ) use ( $baseHref ): SerializerNode {
|
||||
static function ( SerializerNode $node ) use ( $baseHref, $pageFragmentPrefix ): SerializerNode {
|
||||
$attr = $node->name === 'a' ? 'href' : 'resource';
|
||||
$href = $baseHref . $node->attrs[$attr];
|
||||
$node->attrs[$attr] =
|
||||
wfExpandUrl( $href, PROTO_RELATIVE );
|
||||
$href = $node->attrs[$attr];
|
||||
// Convert page fragment urls to true fragment urls
|
||||
// This ensures that those fragments include any URL query params
|
||||
// and resolve internally. (Ex: on pages with ?useparsoid=1,
|
||||
// cite link fragments should not take you to a different page).
|
||||
if ( $pageFragmentPrefix && str_starts_with( $href, $pageFragmentPrefix ) ) {
|
||||
$node->attrs[$attr] = substr( $href, strlen( $pageFragmentPrefix ) - 1 );
|
||||
} else {
|
||||
$href = $baseHref . $href;
|
||||
$node->attrs[$attr] = wfExpandUrl( $href, PROTO_RELATIVE );
|
||||
}
|
||||
return $node;
|
||||
}
|
||||
);
|
||||
|
|
@ -50,13 +58,23 @@ class ExtractBody extends ContentTextTransformStage {
|
|||
// T350952: temporary fix for subpage paths: use Parsoid's
|
||||
// <base href> to expand relative links
|
||||
$baseHref = '';
|
||||
$pageFragmentPrefix = '';
|
||||
if ( preg_match( '{<base href=["\']([^"\']+)["\'][^>]+>}', $text, $matches ) === 1 ) {
|
||||
$baseHref = $matches[1];
|
||||
// Since we don't have easy access to the title, extract it from the
|
||||
// Parsoid HTML where the title is embedded in the <head>.
|
||||
if ( preg_match(
|
||||
'%<link rel="dc:isVersionOf" href=(?:"([^"]+)"|\'[^\']+\')\s*/>%',
|
||||
$text, $matches ) === 1
|
||||
) {
|
||||
$title = substr( $matches[1], strlen( $baseHref ) );
|
||||
$pageFragmentPrefix = "./" . urldecode( $title ) . "#";
|
||||
}
|
||||
}
|
||||
foreach ( $po->getIndicators() as $name => $html ) {
|
||||
$po->setIndicator( $name, self::expandRelativeAttrs( $html, $baseHref ) );
|
||||
$po->setIndicator( $name, self::expandRelativeAttrs( $html, $baseHref, $pageFragmentPrefix ) );
|
||||
}
|
||||
$text = Parser::extractBody( $text );
|
||||
return self::expandRelativeAttrs( $text, $baseHref );
|
||||
return self::expandRelativeAttrs( $text, $baseHref, $pageFragmentPrefix );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -30,15 +30,49 @@ class ExtractBodyTest extends OutputTransformStageTestBase {
|
|||
}
|
||||
|
||||
public function provideTransform(): array {
|
||||
$text = <<<EOF
|
||||
$pageTemplate = <<<EOF
|
||||
<!DOCTYPE html>
|
||||
<html><head><title>Main Page</title><base href="https://www.example.com/w/"/></head><body data-parsoid="{'dsr':[0,6,0,0]} "lang="en"><p data-parsoid="{'dsr':[0,5,0,0]}"><a href="./Hello">hello</a></p>
|
||||
<html><head><title>__TITLE__</title><link rel="dc:isVersionOf" href="https://www.example.com/w/__TITLE__"/><base href="https://www.example.com/w/"/></head><body>__BODY__
|
||||
</body></html>
|
||||
EOF;
|
||||
$body = "<p data-parsoid=\"{'dsr':[0,5,0,0]}\"><a href=\"https://www.example.com/w/Hello\">hello</a></p>\n";
|
||||
$opts = [];
|
||||
return [
|
||||
[ new ParserOutput( $text ), null, $opts, new ParserOutput( $body ) ]
|
||||
$testData = [
|
||||
[
|
||||
"title" => "Foo",
|
||||
"body" => "<p><a href=\"./Hello\">hello</a></p>",
|
||||
"result" => "<p><a href=\"https://www.example.com/w/Hello\">hello</a></p>\n"
|
||||
],
|
||||
// Rest of these tests ensure that self-page cite fragments are converted to fragment urls
|
||||
// Tests different title scenarios to exercise edge cases
|
||||
[
|
||||
"title" => "Foo",
|
||||
"body" => "<p><a href=\"./Foo#cite1\">hello</a><a href=\"./Foo/Bar#cite1\">boo</a></p>",
|
||||
"result" => "<p><a href=\"#cite1\">hello</a><a href=\"https://www.example.com/w/Foo/Bar#cite1\">boo</a></p>\n"
|
||||
],
|
||||
[
|
||||
"title" => "Foo/Bar",
|
||||
"body" => "<p><a href=\"./Foo/Bar#cite1\">hello</a></p>",
|
||||
"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
|
||||
],
|
||||
[
|
||||
"title" => "Foo/Bar'Baz",
|
||||
"body" => "<p><a href=\"./Foo/Bar'Baz#cite1\">hello</a></p>",
|
||||
"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
|
||||
],
|
||||
[
|
||||
"title" => "Foo/Bar%22Baz",
|
||||
"body" => "<p><a href='./Foo/Bar\"Baz#cite1'>hello</a></p>",
|
||||
"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
|
||||
]
|
||||
];
|
||||
$opts = [];
|
||||
$tests = [];
|
||||
foreach ( $testData as $t ) {
|
||||
// Strictly speaking, __TITLE__ in the <title> tag will have entites decoded
|
||||
// but since we aren't testing that or using that tag, we are ignoring that detail.
|
||||
$text = str_replace( "__TITLE__", $t['title'], $pageTemplate );
|
||||
$text = str_replace( "__BODY__", $t['body'], $text );
|
||||
$tests[] = [ new ParserOutput( $text ), null, $opts, new ParserOutput( $t['result'] ) ];
|
||||
}
|
||||
return $tests;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue