SECURITY: Ensure emitted HTML is safe against Unicode NFC normalization

CVE-2025-32699

Ensure that Unicode NFC normalization can be applied to our HTML
output safely.  Even though the W3C officially recommends against
normalizing HTML

https://www.w3.org/International/questions/qa-html-css-normalization#converting

this is still easily done inadvertently, especially when using the
MediaWiki action API which normalizes parameters and results by
default.

See also I671648603c4635a35585c860b4857f5ea085e47f in Parsoid, and
T266140 / I2e78e660ba1867744e34eda7d00ea527ec016b71 for another similar
issue.

The following changes are made:

* The various HTML serializers (Remex/Tidy-derived, as well as the
  Html::* helpers) are tweaked to entity-escape U+0338 wherever it
  appears.

* Similarly, Message::escaped() is tweaked to entity-escape U+0338.

* Finally, a post-processing pass is added to the OutputTransform
  pipeline to catch any remaining U+0338 and entity-escape them.
  This catches U+0338 added during any of the previous OutputTransform
  stages (like TOC insertion, section edit links, etc).
  *When backporting* this code will likely need to be moved to
  ParserOutput::getText(), as the OutputTransform pipeline wasn't added
  until MW 1.42.

Bug: T387130
Change-Id: I66564e14e730f5393f4fa5780b80f24de6075af5
This commit is contained in:
C. Scott Ananian 2025-02-25 12:02:34 -05:00 committed by Reedy
parent 8702751d5e
commit 94f193a894
14 changed files with 116 additions and 9 deletions

View file

@ -1873,6 +1873,7 @@ $wgAutoloadLocalClasses = [
'MediaWiki\\OutputTransform\\Stages\\HandleParsoidSectionLinks' => __DIR__ . '/includes/OutputTransform/Stages/HandleParsoidSectionLinks.php', 'MediaWiki\\OutputTransform\\Stages\\HandleParsoidSectionLinks' => __DIR__ . '/includes/OutputTransform/Stages/HandleParsoidSectionLinks.php',
'MediaWiki\\OutputTransform\\Stages\\HandleSectionLinks' => __DIR__ . '/includes/OutputTransform/Stages/HandleSectionLinks.php', 'MediaWiki\\OutputTransform\\Stages\\HandleSectionLinks' => __DIR__ . '/includes/OutputTransform/Stages/HandleSectionLinks.php',
'MediaWiki\\OutputTransform\\Stages\\HandleTOCMarkers' => __DIR__ . '/includes/OutputTransform/Stages/HandleTOCMarkers.php', 'MediaWiki\\OutputTransform\\Stages\\HandleTOCMarkers' => __DIR__ . '/includes/OutputTransform/Stages/HandleTOCMarkers.php',
'MediaWiki\\OutputTransform\\Stages\\HardenNFC' => __DIR__ . '/includes/OutputTransform/Stages/HardenNFC.php',
'MediaWiki\\OutputTransform\\Stages\\HydrateHeaderPlaceholders' => __DIR__ . '/includes/OutputTransform/Stages/HydrateHeaderPlaceholders.php', 'MediaWiki\\OutputTransform\\Stages\\HydrateHeaderPlaceholders' => __DIR__ . '/includes/OutputTransform/Stages/HydrateHeaderPlaceholders.php',
'MediaWiki\\OutputTransform\\Stages\\ParsoidLocalization' => __DIR__ . '/includes/OutputTransform/Stages/ParsoidLocalization.php', 'MediaWiki\\OutputTransform\\Stages\\ParsoidLocalization' => __DIR__ . '/includes/OutputTransform/Stages/ParsoidLocalization.php',
'MediaWiki\\OutputTransform\\Stages\\RenderDebugInfo' => __DIR__ . '/includes/OutputTransform/Stages/RenderDebugInfo.php', 'MediaWiki\\OutputTransform\\Stages\\RenderDebugInfo' => __DIR__ . '/includes/OutputTransform/Stages/RenderDebugInfo.php',

View file

@ -28,6 +28,7 @@ namespace MediaWiki\Html;
use MediaWiki\Json\FormatJson; use MediaWiki\Json\FormatJson;
use MediaWiki\MainConfigNames; use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices; use MediaWiki\MediaWikiServices;
use MediaWiki\Parser\Sanitizer;
use MediaWiki\Request\ContentSecurityPolicy; use MediaWiki\Request\ContentSecurityPolicy;
use UnexpectedValueException; use UnexpectedValueException;
@ -193,6 +194,7 @@ class Html {
if ( isset( self::$voidElements[$element] ) ) { if ( isset( self::$voidElements[$element] ) ) {
return $start; return $start;
} else { } else {
$contents = Sanitizer::escapeCombiningChar( $contents ?? '' );
return $start . $contents . self::closeElement( $element ); return $start . $contents . self::closeElement( $element );
} }
} }

View file

@ -3,6 +3,7 @@
namespace MediaWiki\Html; namespace MediaWiki\Html;
use Wikimedia\Assert\Assert; use Wikimedia\Assert\Assert;
use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
use Wikimedia\RemexHtml\Serializer\SerializerNode; use Wikimedia\RemexHtml\Serializer\SerializerNode;
/** /**
@ -23,6 +24,9 @@ trait HtmlHelperTrait {
parent::__construct( $options ); parent::__construct( $options );
$this->shouldModifyCallback = $shouldModifyCallback; $this->shouldModifyCallback = $shouldModifyCallback;
$this->modifyCallback = $modifyCallback; $this->modifyCallback = $modifyCallback;
// Escape U+0338 (T387130)
'@phan-var HtmlFormatter $this';
$this->textEscapes["\u{0338}"] = '̸';
} }
public function element( SerializerNode $parent, SerializerNode $node, $contents ) { public function element( SerializerNode $parent, SerializerNode $node, $contents ) {

View file

@ -34,6 +34,7 @@ use MediaWiki\Page\PageReference;
use MediaWiki\Page\PageReferenceValue; use MediaWiki\Page\PageReferenceValue;
use MediaWiki\Parser\Parser; use MediaWiki\Parser\Parser;
use MediaWiki\Parser\ParserOutput; use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Sanitizer;
use MediaWiki\StubObject\StubUserLang; use MediaWiki\StubObject\StubUserLang;
use MediaWiki\Title\Title; use MediaWiki\Title\Title;
use RuntimeException; use RuntimeException;
@ -1042,7 +1043,7 @@ class Message implements Stringable, MessageSpecifier, Serializable {
// '⧼' is used instead of '<' to side-step any // '⧼' is used instead of '<' to side-step any
// double-escaping issues. // double-escaping issues.
// (Keep synchronised with mw.Message#toString in JS.) // (Keep synchronised with mw.Message#toString in JS.)
return '⧼' . htmlspecialchars( $this->key ) . '⧽'; return '⧼' . Sanitizer::escapeCombiningChar( htmlspecialchars( $this->key ) ) . '⧽';
} }
if ( in_array( $this->getLanguage()->getCode(), [ 'qqx', 'x-xss' ] ) ) { if ( in_array( $this->getLanguage()->getCode(), [ 'qqx', 'x-xss' ] ) ) {
@ -1078,6 +1079,7 @@ class Message implements Stringable, MessageSpecifier, Serializable {
} elseif ( $format === self::FORMAT_ESCAPED ) { } elseif ( $format === self::FORMAT_ESCAPED ) {
$string = $this->transformText( $string ); $string = $this->transformText( $string );
$string = htmlspecialchars( $string, ENT_QUOTES, 'UTF-8', false ); $string = htmlspecialchars( $string, ENT_QUOTES, 'UTF-8', false );
$string = Sanitizer::escapeCombiningChar( $string );
} }
# Raw parameter replacement # Raw parameter replacement
@ -1591,7 +1593,7 @@ class Message implements Stringable, MessageSpecifier, Serializable {
case self::FORMAT_BLOCK_PARSE: case self::FORMAT_BLOCK_PARSE:
case self::FORMAT_ESCAPED: case self::FORMAT_ESCAPED:
default: default:
return htmlspecialchars( $plaintext, ENT_QUOTES ); return Sanitizer::escapeCombiningChar( htmlspecialchars( $plaintext, ENT_QUOTES ) );
} }
} }

View file

@ -14,6 +14,7 @@ use MediaWiki\OutputTransform\Stages\ExtractBody;
use MediaWiki\OutputTransform\Stages\HandleParsoidSectionLinks; use MediaWiki\OutputTransform\Stages\HandleParsoidSectionLinks;
use MediaWiki\OutputTransform\Stages\HandleSectionLinks; use MediaWiki\OutputTransform\Stages\HandleSectionLinks;
use MediaWiki\OutputTransform\Stages\HandleTOCMarkers; use MediaWiki\OutputTransform\Stages\HandleTOCMarkers;
use MediaWiki\OutputTransform\Stages\HardenNFC;
use MediaWiki\OutputTransform\Stages\HydrateHeaderPlaceholders; use MediaWiki\OutputTransform\Stages\HydrateHeaderPlaceholders;
use MediaWiki\OutputTransform\Stages\ParsoidLocalization; use MediaWiki\OutputTransform\Stages\ParsoidLocalization;
use MediaWiki\OutputTransform\Stages\RenderDebugInfo; use MediaWiki\OutputTransform\Stages\RenderDebugInfo;
@ -98,6 +99,10 @@ class DefaultOutputPipelineFactory {
'HydrateHeaderPlaceholders' => [ 'HydrateHeaderPlaceholders' => [
'class' => HydrateHeaderPlaceholders::class, 'class' => HydrateHeaderPlaceholders::class,
], ],
# This should be last, in order to ensure final output is hardened
'HardenNFC' => [
'class' => HardenNFC::class,
],
]; ];
public function __construct( public function __construct(

View file

@ -0,0 +1,23 @@
<?php
namespace MediaWiki\OutputTransform\Stages;
use MediaWiki\OutputTransform\ContentTextTransformStage;
use MediaWiki\Parser\ParserOptions;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Sanitizer;
/**
* Hardens the output against NFC normalization (T387130).
* @internal
*/
class HardenNFC extends ContentTextTransformStage {
public function shouldRun( ParserOutput $po, ?ParserOptions $popts, array $options = [] ): bool {
return true;
}
protected function transformText( string $text, ParserOutput $po, ?ParserOptions $popts, array &$options ): string {
return Sanitizer::escapeCombiningChar( $text );
}
}

View file

@ -1038,6 +1038,12 @@ class Sanitizer {
$class ), '_' ); $class ), '_' );
} }
public static function escapeCombiningChar( string $html ): string {
return strtr( $html, [
"\u{0338}" => '&#x338;', # T387130
] );
}
/** /**
* Given HTML input, escape with htmlspecialchars but un-escape entities. * Given HTML input, escape with htmlspecialchars but un-escape entities.
* This allows (generally harmless) entities like &#160; to survive. * This allows (generally harmless) entities like &#160; to survive.
@ -1053,7 +1059,7 @@ class Sanitizer {
# hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
# don't cause the entire string to disappear. # don't cause the entire string to disappear.
$html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE ); $html = htmlspecialchars( $html, ENT_QUOTES | ENT_SUBSTITUTE );
return $html; return self::escapeCombiningChar( $html );
} }
/** /**

View file

@ -28,6 +28,8 @@ class RemexCompatFormatter extends HtmlFormatter {
// Escape non-breaking space // Escape non-breaking space
$this->attributeEscapes["\u{00A0}"] = '&#160;'; $this->attributeEscapes["\u{00A0}"] = '&#160;';
$this->textEscapes["\u{00A0}"] = '&#160;'; $this->textEscapes["\u{00A0}"] = '&#160;';
// Escape U+0338 (T387130)
$this->textEscapes["\u{0338}"] = '&#x338;';
// Disable escaping of '&', because we expect to see entities, due to 'ignoreCharRefs' // Disable escaping of '&', because we expect to see entities, due to 'ignoreCharRefs'
unset( $this->attributeEscapes["&"] ); unset( $this->attributeEscapes["&"] );
unset( $this->textEscapes["&"] ); unset( $this->textEscapes["&"] );

Binary file not shown.

View file

@ -96,6 +96,18 @@ class HtmlTest extends MediaWikiIntegrationTestCase {
Html::element( 'element', [], '' ), Html::element( 'element', [], '' ),
'Close tag for empty element (array, string)' 'Close tag for empty element (array, string)'
); );
$this->assertEquals(
"<p test=\"\u{0338}&quot;&amp;\">&#x338; &amp; &lt; ></p>",
Html::element( 'p', [ 'test' => "\u{0338}\"&" ], "\u{0338} & < >" ),
'Attribute and content escaping'
);
$this->assertEquals(
'<p>&#x338; &amp;</p>',
Html::rawElement( 'p', [], "\u{0338} &amp;" ),
"Combining characters escaped even in raw contents (T387130)"
);
} }
public function dataXmlMimeType() { public function dataXmlMimeType() {

View file

@ -0,0 +1,41 @@
<?php
namespace MediaWiki\Tests\OutputTransform\Stages;
use MediaWiki\Config\ServiceOptions;
use MediaWiki\OutputTransform\OutputTransformStage;
use MediaWiki\OutputTransform\Stages\HardenNFC;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Tests\OutputTransform\OutputTransformStageTestBase;
use Psr\Log\NullLogger;
/**
* @covers \MediaWiki\OutputTransform\Stages\HardenNFC
*/
class HardenNFCTest extends OutputTransformStageTestBase {
public function createStage(): OutputTransformStage {
return new HardenNFC(
new ServiceOptions( [] ),
new NullLogger()
);
}
public function provideShouldRun(): array {
return [
[ new ParserOutput(), null, [] ]
];
}
public function provideShouldNotRun(): array {
$this->markTestSkipped( 'HydrateHeaderPlaceHolders should always run' );
}
public function provideTransform(): array {
$text = "<h1>\u{0338}</h1>";
$expectedText = "<h1>&#x338;</h1>";
return [
[ new ParserOutput( $text ), null, [], new ParserOutput( $expectedText ) ],
];
}
}

View file

@ -282,6 +282,7 @@ class MessageTest extends MediaWikiLangTestCase {
'⧼script&gt;alert(1)&lt;/script⧽' ], '⧼script&gt;alert(1)&lt;/script⧽' ],
[ 'script>alert(1)</script', 'plain', '⧼script&gt;alert(1)&lt;/script⧽', [ 'script>alert(1)</script', 'plain', '⧼script&gt;alert(1)&lt;/script⧽',
'⧼script&gt;alert(1)&lt;/script⧽' ], '⧼script&gt;alert(1)&lt;/script⧽' ],
[ "\u{0338}isolated combining char", 'escaped', '⧼&#x338;isolated combining char⧽', '⧼&#x338;isolated combining char⧽' ],
]; ];
} }
@ -309,6 +310,7 @@ class MessageTest extends MediaWikiLangTestCase {
'&lt;script&gt;alert(1)&lt;/script&gt;' ], '&lt;script&gt;alert(1)&lt;/script&gt;' ],
[ '<script>alert(1)</script>', 'plain', '<script>alert(1)</script>', [ '<script>alert(1)</script>', 'plain', '<script>alert(1)</script>',
'&lt;script&gt;alert(1)&lt;/script&gt;' ], '&lt;script&gt;alert(1)&lt;/script&gt;' ],
[ "\u{0338}isolated combining char", 'escaped', '&#x338;isolated combining char', '&#x338;isolated combining char' ],
]; ];
} }
@ -584,28 +586,28 @@ class MessageTest extends MediaWikiLangTestCase {
public static function providePlaintextParams() { public static function providePlaintextParams() {
return [ return [
[ [
'one $2 <div>foo</div> [[Bar]] {{Baz}} &lt;', "one $2 <div>\u{0338}foo</div> [[Bar]] {{Baz}} &lt;",
'plain', 'plain',
], ],
[ [
// expect // expect
'one $2 <div>foo</div> [[Bar]] {{Baz}} &lt;', "one $2 <div>\u{0338}foo</div> [[Bar]] {{Baz}} &lt;",
// format // format
'text', 'text',
], ],
[ [
'one $2 &lt;div&gt;foo&lt;/div&gt; [[Bar]] {{Baz}} &amp;lt;', 'one $2 &lt;div&gt;&#x338;foo&lt;/div&gt; [[Bar]] {{Baz}} &amp;lt;',
'escaped', 'escaped',
], ],
[ [
'one $2 &lt;div&gt;foo&lt;/div&gt; [[Bar]] {{Baz}} &amp;lt;', 'one $2 &lt;div&gt;&#x338;foo&lt;/div&gt; [[Bar]] {{Baz}} &amp;lt;',
'parse', 'parse',
], ],
[ [
"<p>one $2 &lt;div&gt;foo&lt;/div&gt; [[Bar]] {{Baz}} &amp;lt;\n</p>", "<p>one $2 &lt;div&gt;&#x338;foo&lt;/div&gt; [[Bar]] {{Baz}} &amp;lt;\n</p>",
'parseAsBlock', 'parseAsBlock',
], ],
]; ];
@ -618,7 +620,7 @@ class MessageTest extends MediaWikiLangTestCase {
$msg = new RawMessage( '$1 $2' ); $msg = new RawMessage( '$1 $2' );
$params = [ $params = [
'one $2', 'one $2',
'<div>foo</div> [[Bar]] {{Baz}} &lt;', "<div>\u{0338}foo</div> [[Bar]] {{Baz}} &lt;",
]; ];
$this->assertSame( $this->assertSame(
$expect, $expect,

View file

@ -204,6 +204,8 @@ class SanitizerUnitTest extends MediaWikiUnitTestCase {
[ 'a¡b', 'a&#161;b' ], [ 'a¡b', 'a&#161;b' ],
[ 'foo&#039;bar', "foo'bar" ], [ 'foo&#039;bar', "foo'bar" ],
[ '&lt;script&gt;foo&lt;/script&gt;', '<script>foo</script>' ], [ '&lt;script&gt;foo&lt;/script&gt;', '<script>foo</script>' ],
[ '&#x338;', "\u{0338}" ],
[ '&#x338;', '&#x338;' ],
]; ];
} }

View file

@ -309,6 +309,11 @@ class RemexDriverTest extends MediaWikiUnitTestCase {
'<meta foo="bar"/>foo', '<meta foo="bar"/>foo',
"<meta foo=\"bar\" /><p>foo</p>", "<meta foo=\"bar\" /><p>foo</p>",
], ],
[
'Unicode combining characters (T387130)',
"<p>\u{0338} <!--comment-->\u{0338}</p>",
'<p>&#x338; <!--comment-->&#x338;</p>',
],
]; ];
public static function provider() { public static function provider() {