CVE-2025-32699: Ensure that Unicode NFC normalization can be applied to our HTML output safely.

Even though the W3C officially recommends against normalizing HTML (https://www.w3.org/International/questions/qa-html-css-normalization#converting), this is still easily done inadvertently, especially when using the MediaWiki action API, which normalizes parameters and results by default. See also I671648603c4635a35585c860b4857f5ea085e47f in Parsoid, and T266140 / I2e78e660ba1867744e34eda7d00ea527ec016b71 for another similar issue.

The following changes are made:
* The various HTML serializers (Remex/Tidy-derived, as well as the Html::* helpers) are tweaked to entity-escape U+0338 wherever it appears.
* Similarly, Message::escaped() is tweaked to entity-escape U+0338.
* Finally, a post-processing pass is added to the OutputTransform pipeline to catch any remaining U+0338 and entity-escape them. This catches U+0338 added during any of the previous OutputTransform stages (like TOC insertion, section edit links, etc).

*When backporting*, this code will likely need to be moved to ParserOutput::getText(), as the OutputTransform pipeline wasn't added until MW 1.42.

Bug: T387130
Change-Id: I66564e14e730f5393f4fa5780b80f24de6075af5
106 lines
3.1 KiB
PHP
106 lines
3.1 KiB
PHP
<?php
|
|
|
|
namespace MediaWiki\Tidy;
|
|
|
|
use MediaWiki\Parser\Sanitizer;
|
|
use Wikimedia\RemexHtml\HTMLData;
|
|
use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
|
|
use Wikimedia\RemexHtml\Serializer\SerializerNode;
|
|
|
|
/**
 * Serializer formatter built on RemexHtml's HtmlFormatter, with the following
 * differences visible in this class:
 *
 *  - U+00A0 (non-breaking space) is written as a numeric character reference in
 *    text and attribute values, and U+0338 in text, instead of as a raw character.
 *  - '&' is never escaped, because the input is expected to still contain
 *    unexpanded character references.
 *  - An optional 'textProcessor' callback is applied to text nodes outside of
 *    raw text elements.
 *  - Attribute-less, whitespace-only <li>, <p> and <tr> elements are tagged with
 *    class="mw-empty-elt".
 *  - Void elements are closed with " />".
 *
 * @internal
 *
 * WATCH OUT! Unlike normal HtmlFormatter, this class requires the 'ignoreCharRefs' option
 * in Tokenizer to be used. If that option is not used, it will produce wrong results (T354361).
 */
class RemexCompatFormatter extends HtmlFormatter {
	/**
	 * Element names that receive class="mw-empty-elt" when they have no
	 * attributes and contain nothing but whitespace. Keyed by name for isset().
	 */
	private const MARKED_EMPTY_ELEMENTS = [
		'li' => true,
		'p' => true,
		'tr' => true,
	];

	/** @var ?callable Callback applied to text outside raw text elements, or null for none */
	private $textProcessor;

	/**
	 * @param array $options Options forwarded to HtmlFormatter. Additionally:
	 *   - textProcessor: optional callable that takes the serialized text of a
	 *     text node and returns the replacement text.
	 */
	public function __construct( $options = [] ) {
		parent::__construct( $options );

		// Escape non-breaking space. The replacement must be the character
		// reference itself, not the literal character.
		$this->attributeEscapes["\u{00A0}"] = '&#160;';
		$this->textEscapes["\u{00A0}"] = '&#160;';

		// Escape U+0338 (T387130). If left raw, NFC normalization of the output
		// could compose this combining character with the character before it.
		$this->textEscapes["\u{0338}"] = '&#x338;';

		// Disable escaping of '&', because we expect to see entities, due to 'ignoreCharRefs'
		unset( $this->attributeEscapes["&"] );
		unset( $this->textEscapes["&"] );

		$this->textProcessor = $options['textProcessor'] ?? null;
	}

	/**
	 * Emit nothing at the start of the document.
	 *
	 * @inheritDoc
	 */
	public function startDocument( $fragmentNamespace, $fragmentName ) {
		return '';
	}

	/**
	 * WATCH OUT! Unlike normal HtmlFormatter, this class expects that the $text argument contains
	 * unexpanded character references (entities), as a result of using the 'ignoreCharRefs' option
	 * in Tokenizer. If that option is not used, this method will produce wrong results (T354361).
	 *
	 * The text is first serialized by the parent formatter, then passed through
	 * the text processor (unless $parent is an HTML raw text element), and
	 * finally has its character references normalized.
	 *
	 * @inheritDoc
	 */
	public function characters( SerializerNode $parent, $text, $start, $length ) {
		$text = parent::characters( $parent, $text, $start, $length );

		$inRawTextElement = $parent->namespace === HTMLData::NS_HTML
			&& isset( $this->rawTextElements[$parent->name] );
		if ( !$inRawTextElement && $this->textProcessor !== null ) {
			$text = ( $this->textProcessor )( $text );
		}

		// Ensure a consistent representation for all entities
		return Sanitizer::normalizeCharReferences( $text );
	}

	/**
	 * Serialize an element together with its already-serialized contents.
	 *
	 * @inheritDoc
	 */
	public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
		$data = $node->snData;
		if ( $data && $data->isPWrapper ) {
			// Nodes flagged as p-wrappers only produce a <p> when they hold
			// non-blank content; otherwise the contents are emitted unwrapped.
			return $data->nonblankNodeCount ? "<p>$contents</p>" : $contents;
		}

		$name = $node->name;
		$attrs = $node->attrs;

		// An empty string also counts as whitespace-only here.
		$isBlank = strspn( $contents, "\t\n\f\r " ) === strlen( $contents );
		if ( isset( self::MARKED_EMPTY_ELEMENTS[$name] ) && $attrs->count() === 0 && $isBlank ) {
			return "<{$name} class=\"mw-empty-elt\">$contents</{$name}>";
		}

		$s = "<$name";
		foreach ( $attrs->getValues() as $attrName => $attrValue ) {
			$encValue = strtr( $attrValue, $this->attributeEscapes );
			// '&' is left alone by the escape table, so bring the character
			// references in the value into a consistent form instead.
			$encValue = Sanitizer::normalizeCharReferences( $encValue );
			$s .= " $attrName=\"$encValue\"";
		}

		$isHtml = $node->namespace === HTMLData::NS_HTML;
		if ( $isHtml && isset( $this->voidElements[$name] ) ) {
			// Void elements have no contents and no end tag
			return $s . ' />';
		}

		$s .= '>';
		if ( $isHtml
			&& isset( $contents[0] ) && $contents[0] === "\n"
			&& isset( $this->prefixLfElements[$name] )
		) {
			// Double the leading newline for these elements; presumably because
			// an HTML parser discards the first newline after their start tag.
			$s .= "\n";
		}

		return $s . "$contents</$name>";
	}
}
|