ExtractBody: Convert page-internal link fragments to pure fragment urls

* This ensures that when you have query params like (?useparsoid=1), all cite links no longer take you to the non-Parsoid page but resolve internally. * Additionally, this also unbreaks reference previews in local testing - not yet sure if this will fix all breakage in production. * We don't have ready access to the title string and so this patch extracts it from a link tag in the <head> of Parsoid HTML. That is guaranteed to be correct and reliably present. But, if in the future, this changes (whether by adding it to ParserOptions, ParserOutput, or the $opts array), we can use that directly. * Added new unit tests that verify the new expectations. Bug: T358242 Change-Id: Iaf482cc9803564b4cf4ae04f975573f61ff3b0e4
2024-04-09 18:12:14 +05:30 · 2024-04-09 18:12:14 +05:30 · 9a46631029
commit 9a46631029
parent cddfa25d5a
2 changed files with 65 additions and 13 deletions
--- a/includes/OutputTransform/Stages/ExtractBody.php
+++ b/includes/OutputTransform/Stages/ExtractBody.php
@ -23,7 +23,7 @@ class ExtractBody extends ContentTextTransformStage {
 		'a' => true, 'img' => true, 'video' => true, 'audio' => true,
 	];

-	private static function expandRelativeAttrs( string $text, string $baseHref ): string {
+	private static function expandRelativeAttrs( string $text, string $baseHref, string $pageFragmentPrefix ): string {
 		// T350952: Expand relative links
 		// What we should be doing here is parsing as a title and then
 		// using Title::getLocalURL()
@ -36,11 +36,19 @@ class ExtractBody extends ContentTextTransformStage {
 				$attr = $node->name === 'a' ? 'href' : 'resource';
 				return str_starts_with( $node->attrs[$attr] ?? '', './' );
 			},
-			static function ( SerializerNode $node ) use ( $baseHref ): SerializerNode {
+			static function ( SerializerNode $node ) use ( $baseHref, $pageFragmentPrefix ): SerializerNode {
 				$attr = $node->name === 'a' ? 'href' : 'resource';
-				$href = $baseHref . $node->attrs[$attr];
-				$node->attrs[$attr] =
-					wfExpandUrl( $href, PROTO_RELATIVE );
+				$href = $node->attrs[$attr];
+				// Convert page fragment urls to true fragment urls
+				// This ensures that those fragments include any URL query params
+				// and resolve internally. (Ex: on pages with ?useparsoid=1,
+				// cite link fragments should not take you to a different page).
+				if ( $pageFragmentPrefix && str_starts_with( $href, $pageFragmentPrefix ) ) {
+					$node->attrs[$attr] = substr( $href, strlen( $pageFragmentPrefix ) - 1 );
+				} else {
+					$href = $baseHref . $href;
+					$node->attrs[$attr] = wfExpandUrl( $href, PROTO_RELATIVE );
+				}
 				return $node;
 			}
 		);
@ -50,13 +58,23 @@ class ExtractBody extends ContentTextTransformStage {
 		// T350952: temporary fix for subpage paths: use Parsoid's
 		// <base href> to expand relative links
 		$baseHref = '';
+		$pageFragmentPrefix = '';
 		if ( preg_match( '{<base href=["\']([^"\']+)["\'][^>]+>}', $text, $matches ) === 1 ) {
 			$baseHref = $matches[1];
+			// Since we don't have easy access to the title, extract it from the
+			// Parsoid HTML where the title is embedded in the <head>.
+			if ( preg_match(
+				'%<link rel="dc:isVersionOf" href=(?:"([^"]+)"|\'[^\']+\')\s*/>%',
+				$text, $matches ) === 1
+			) {
+				$title = substr( $matches[1], strlen( $baseHref ) );
+				$pageFragmentPrefix = "./" . urldecode( $title ) . "#";
+			}
 		}
 		foreach ( $po->getIndicators() as $name => $html ) {
-			$po->setIndicator( $name, self::expandRelativeAttrs( $html, $baseHref ) );
+			$po->setIndicator( $name, self::expandRelativeAttrs( $html, $baseHref, $pageFragmentPrefix ) );
 		}
 		$text = Parser::extractBody( $text );
-		return self::expandRelativeAttrs( $text, $baseHref );
+		return self::expandRelativeAttrs( $text, $baseHref, $pageFragmentPrefix );
 	}
 }
--- a/tests/phpunit/includes/OutputTransform/Stages/ExtractBodyTest.php
+++ b/tests/phpunit/includes/OutputTransform/Stages/ExtractBodyTest.php
@ -30,15 +30,49 @@ class ExtractBodyTest extends OutputTransformStageTestBase {
 	}

 	public function provideTransform(): array {
-		$text = <<<EOF
+		$pageTemplate = <<<EOF
 <!DOCTYPE html>
-<html><head><title>Main Page</title><base href="https://www.example.com/w/"/></head><body data-parsoid="{'dsr':[0,6,0,0]} "lang="en"><p data-parsoid="{'dsr':[0,5,0,0]}"><a href="./Hello">hello</a></p>
+<html><head><title>__TITLE__</title><link rel="dc:isVersionOf" href="https://www.example.com/w/__TITLE__"/><base href="https://www.example.com/w/"/></head><body>__BODY__
 </body></html>
 EOF;
-		$body = "<p data-parsoid=\"{'dsr':[0,5,0,0]}\"><a href=\"https://www.example.com/w/Hello\">hello</a></p>\n";
-		$opts = [];
-		return [
-			[ new ParserOutput( $text ), null, $opts, new ParserOutput( $body ) ]
+		$testData = [
+			[
+				"title" => "Foo",
+				"body" => "<p><a href=\"./Hello\">hello</a></p>",
+				"result" => "<p><a href=\"https://www.example.com/w/Hello\">hello</a></p>\n"
+			],
+			// Rest of these tests ensure that self-page cite fragments are converted to fragment urls
+			// Tests different title scenarios to exercise edge cases
+			[
+				"title" => "Foo",
+				"body" => "<p><a href=\"./Foo#cite1\">hello</a><a href=\"./Foo/Bar#cite1\">boo</a></p>",
+				"result" => "<p><a href=\"#cite1\">hello</a><a href=\"https://www.example.com/w/Foo/Bar#cite1\">boo</a></p>\n"
+			],
+			[
+				"title" => "Foo/Bar",
+				"body" => "<p><a href=\"./Foo/Bar#cite1\">hello</a></p>",
+				"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
+			],
+			[
+				"title" => "Foo/Bar'Baz",
+				"body" => "<p><a href=\"./Foo/Bar'Baz#cite1\">hello</a></p>",
+				"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
+			],
+			[
+				"title" => "Foo/Bar%22Baz",
+				"body" => "<p><a href='./Foo/Bar\"Baz#cite1'>hello</a></p>",
+				"result" => "<p><a href=\"#cite1\">hello</a></p>\n"
+			]
 		];
+		$opts = [];
+		$tests = [];
+		foreach ( $testData as $t ) {
+			// Strictly speaking, __TITLE__ in the <title> tag will have entites decoded
+			// but since we aren't testing that or using that tag, we are ignoring that detail.
+			$text = str_replace( "__TITLE__", $t['title'], $pageTemplate );
+			$text = str_replace( "__BODY__", $t['body'], $text );
+			$tests[] = [ new ParserOutput( $text ), null, $opts, new ParserOutput( $t['result'] ) ];
+		}
+		return $tests;
 	}
 }