wiki.techinc.nl/includes/Html/HtmlHelper.php
C. Scott Ananian a6739e066e Use Remex/HtmlHelper to implement Parser::replaceTableOfContents
This is more robust and secure than the regular expression previously
used to extract the <meta> tag.

We also improve HtmlHelper slightly by adding the ability to replace
an element with an 'outerHTML' string.

Because our output is being run through Remex, there is a slightly
larger degree of HTML normalization in the output than previously,
which is visible in some small tweaks to test case outputs.

Bug: T381617
Depends-On: I2712e0fa9272106e8cd686980f847ee7f6385b6f
Change-Id: I4cb2f29cf890af90f295624c586d9e1eb1939b95
(cherry picked from commit 7ebd8034b54495f28f4c5583d4fa55071634b593)
2025-09-29 22:01:08 +00:00

76 lines
2.7 KiB
PHP

<?php
namespace MediaWiki\Html;
use MediaWiki\Tidy\RemexCompatFormatter;
use Wikimedia\RemexHtml\HTMLData;
use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
use Wikimedia\RemexHtml\Serializer\Serializer;
use Wikimedia\RemexHtml\Tokenizer\Tokenizer;
use Wikimedia\RemexHtml\TreeBuilder\Dispatcher;
use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
/**
 * Collection of static helpers for transforming HTML held in strings.
 */
class HtmlHelper {
	/**
	 * Parse an HTML fragment with Remex and let callbacks rewrite chosen elements
	 * while the fragment is being serialized again.
	 *
	 * @param string $htmlFragment The HTML fragment to process. It must be valid
	 *   (i.e. produced by the parser rather than supplied by the user).
	 * @param callable $shouldModifyCallback Predicate that is given one
	 *   RemexHtml\Serializer\SerializerNode and returns true when that node should be modified.
	 * @param callable $modifyCallback Given one RemexHtml\Serializer\SerializerNode, carries
	 *   out the actual modification. It must return either the resulting node (handing back
	 *   the same object it received is fine) or a string, which is then used as the
	 *   outerHTML of a replacement.
	 * @param bool $html5format When true (the default), the parsed HTML is serialized as
	 *   standard HTML5. When false, a serialization closer to the output of the legacy
	 *   parser is used instead; see RemexCompatFormatter for details. In that mode,
	 *   attributes and text nodes contain unexpanded character references (entities).
	 * @return string
	 */
	public static function modifyElements(
		string $htmlFragment,
		callable $shouldModifyCallback,
		callable $modifyCallback,
		bool $html5format = true
	) {
		// Standard HTML5 mode passes no extra options to the tree builder or tokenizer.
		$treeBuilderOptions = [];
		$tokenizerOptions = [];

		if ( !$html5format ) {
			$formatter = new class( [], $shouldModifyCallback, $modifyCallback ) extends RemexCompatFormatter {
				use HtmlHelperTrait;
			};
			$treeBuilderOptions = [
				'ignoreErrors' => true,
				'ignoreNulls' => true,
			];
			// 'ignoreCharRefs' is what RemexCompatFormatter expects (T354361). The remaining
			// options are there for consistency with RemexDriver and supposedly improve
			// performance.
			$tokenizerOptions = [
				'ignoreErrors' => true,
				'ignoreCharRefs' => true,
				'ignoreNulls' => true,
				'skipPreprocess' => true,
			];
		} else {
			$formatter = new class( [], $shouldModifyCallback, $modifyCallback ) extends HtmlFormatter {
				use HtmlHelperTrait;
			};
		}

		// Pipeline: tokenizer -> dispatcher -> tree builder -> serializer (with our formatter).
		$serializer = new Serializer( $formatter );
		$tokenizer = new Tokenizer(
			new Dispatcher( new TreeBuilder( $serializer, $treeBuilderOptions ) ),
			$htmlFragment,
			$tokenizerOptions
		);

		// Parse the input as a fragment in the context of an HTML <body> element.
		$fragmentContext = [
			'fragmentNamespace' => HTMLData::NS_HTML,
			'fragmentName' => 'body',
		];
		$tokenizer->execute( $fragmentContext );

		return $serializer->getResult();
	}
}
/**
 * Makes HtmlHelper also reachable under the name MediaWiki\HtmlHelper.
 * That alias is deprecated; use MediaWiki\Html\HtmlHelper instead.
 *
 * @deprecated class alias since 1.40
 */
class_alias( HtmlHelper::class, 'MediaWiki\\HtmlHelper' );