This is more robust and secure than the regular expression previously used to extract the <meta> tag. We also improve HtmlHelper slightly by adding the ability to replace an element with an 'outerHTML' string. Because our output is being run through Remex, there is a slightly larger degree of HTML normalization in the output than previously, which is visible in some small tweaks to test case outputs. Bug: T381617 Depends-On: I2712e0fa9272106e8cd686980f847ee7f6385b6f Change-Id: I4cb2f29cf890af90f295624c586d9e1eb1939b95 (cherry picked from commit 7ebd8034b54495f28f4c5583d4fa55071634b593)
76 lines
2.7 KiB
PHP
76 lines
2.7 KiB
PHP
<?php
|
|
|
|
namespace MediaWiki\Html;
|
|
|
|
use MediaWiki\Tidy\RemexCompatFormatter;
|
|
use Wikimedia\RemexHtml\HTMLData;
|
|
use Wikimedia\RemexHtml\Serializer\HtmlFormatter;
|
|
use Wikimedia\RemexHtml\Serializer\Serializer;
|
|
use Wikimedia\RemexHtml\Tokenizer\Tokenizer;
|
|
use Wikimedia\RemexHtml\TreeBuilder\Dispatcher;
|
|
use Wikimedia\RemexHtml\TreeBuilder\TreeBuilder;
|
|
|
|
/**
 * Collection of static helpers for transforming strings of HTML.
 */
class HtmlHelper {

	/**
	 * Rewrite selected elements of an HTML fragment using caller-supplied callbacks.
	 *
	 * The fragment is tokenized and tree-built as the content of a <body> element
	 * in the HTML namespace, then re-serialized through a formatter that receives
	 * both callbacks.
	 *
	 * @param string $htmlFragment The HTML fragment to process. It must be valid
	 *   (i.e. produced by the parser rather than supplied by the user).
	 * @param callable $shouldModifyCallback Predicate taking one
	 *   RemexHtml\Serializer\SerializerNode; returns true when that node should be modified.
	 * @param callable $modifyCallback Takes one RemexHtml\Serializer\SerializerNode
	 *   and performs the modification. It must return either the new node (the
	 *   original node object is acceptable) or a string, which is used as the
	 *   outerHTML of the replacement.
	 * @param bool $html5format True (the default) selects standard HTML5
	 *   serialization of the parsed HTML. False selects a serialization closer to
	 *   the legacy parser's output (see RemexCompatFormatter); in that mode
	 *   attributes and text nodes contain unexpanded character references (entities).
	 * @return string The serialized result.
	 */
	public static function modifyElements(
		string $htmlFragment,
		callable $shouldModifyCallback,
		callable $modifyCallback,
		bool $html5format = true
	) {
		if ( !$html5format ) {
			// Legacy-compatible output.
			$outputFormatter = new class( [], $shouldModifyCallback, $modifyCallback )
				extends RemexCompatFormatter
			{
				use HtmlHelperTrait;
			};
			$treeOptions = [
				'ignoreErrors' => true,
				'ignoreNulls' => true,
			];
			// 'ignoreCharRefs' is what RemexCompatFormatter expects (T354361). The
			// remaining flags mirror RemexDriver for consistency and supposedly
			// improve performance.
			$lexerOptions = [
				'ignoreErrors' => true,
				'ignoreCharRefs' => true,
				'ignoreNulls' => true,
				'skipPreprocess' => true,
			];
		} else {
			// Standard HTML5 output: no special tree builder or tokenizer options.
			$outputFormatter = new class( [], $shouldModifyCallback, $modifyCallback )
				extends HtmlFormatter
			{
				use HtmlHelperTrait;
			};
			$treeOptions = [];
			$lexerOptions = [];
		}

		// Assemble the pipeline: tokenizer -> dispatcher -> tree builder -> serializer.
		$htmlSerializer = new Serializer( $outputFormatter );
		$builder = new TreeBuilder( $htmlSerializer, $treeOptions );
		$lexer = new Tokenizer( new Dispatcher( $builder ), $htmlFragment, $lexerOptions );

		// Parse the input as a fragment whose context element is <body>.
		$fragmentContext = [
			'fragmentNamespace' => HTMLData::NS_HTML,
			'fragmentName' => 'body',
		];
		$lexer->execute( $fragmentContext );

		return $htmlSerializer->getResult();
	}

}
|
|
|
|
/**
 * Registers the former class name MediaWiki\HtmlHelper as an alias of this class.
 *
 * @deprecated class alias since 1.40
 */
class_alias(
	HtmlHelper::class,
	'MediaWiki\\HtmlHelper'
);
|