Parser: Move Sanitizer::normalizeCharReferences into RemexCompatFormatter

Choosing a particular encoding of HTML entities is logically a task
of the Remex formatter (which serializes HTML).  Move it out of the
Parser so that it is part of the serialization specification.

This is a follow-up to Ic8965e81882d7cf024bdced437f684064a30ac86.

Change-Id: If45907baf24d60987b39cd1f7709c5f7caf19f37
This commit is contained in:
C. Scott Ananian 2021-02-18 10:51:57 -05:00
parent 5c25863bd9
commit 5d317c25be
3 changed files with 7 additions and 2 deletions

View file

@ -1682,8 +1682,6 @@ class Parser {
$text = $this->mStripState->unstripGeneral( $text );
$text = Sanitizer::normalizeCharReferences( $text );
$text = $this->remexDriver->tidy( $text, [ Sanitizer::class, 'armorFrenchSpaces' ] );
if ( $isMain ) {

View file

@ -5,6 +5,7 @@ namespace MediaWiki\Tidy;
use RemexHtml\HTMLData;
use RemexHtml\Serializer\HtmlFormatter;
use RemexHtml\Serializer\SerializerNode;
use Sanitizer;
/**
* @internal
@ -34,6 +35,7 @@ class RemexCompatFormatter extends HtmlFormatter {
public function characters( SerializerNode $parent, $text, $start, $length ) {
$text = parent::characters( $parent, $text, $start, $length );
if ( $parent->namespace !== HTMLData::NS_HTML
|| !isset( $this->rawTextElements[$parent->name] )
) {
@ -41,6 +43,9 @@ class RemexCompatFormatter extends HtmlFormatter {
$text = call_user_func( $this->textProcessor, $text );
}
}
// Ensure a consistent representation for all entities
$text = Sanitizer::normalizeCharReferences( $text );
return $text;
}
@ -65,6 +70,7 @@ class RemexCompatFormatter extends HtmlFormatter {
$s = "<$name";
foreach ( $attrs->getValues() as $attrName => $attrValue ) {
$encValue = strtr( $attrValue, $this->attributeEscapes );
$encValue = Sanitizer::normalizeCharReferences( $encValue );
$s .= " $attrName=\"$encValue\"";
}
if ( $node->namespace === HTMLData::NS_HTML && isset( $this->voidElements[$name] ) ) {

View file

@ -37,6 +37,7 @@ class TidyTest extends \MediaWikiUnitTestCase {
</mrow>
</math>
MathML;
$testMathML = Sanitizer::normalizeCharReferences( $testMathML );
return [
[
'<mw:editsection page="foo" section="bar">foo</mw:editsection>',