2016-08-29 22:27:54 +00:00
|
|
|
<?php
|
2022-01-05 01:41:28 +00:00
|
|
|
|
2016-08-29 22:27:54 +00:00
|
|
|
/**
 * Normalizes a fragment of parser output by loading it into an XML DOM,
 * applying a caller-selected list of normalization passes, and serializing
 * the result again so that two outputs can be compared.
 *
 * @ingroup Testing
 */
class ParserTestResultNormalizer {
	/** @var DOMDocument Document wrapping the input as <html><body>…</body></html> */
	protected $doc;

	/** @var DOMNode|null The <body> element, or null if the input failed to parse */
	protected $body;

	/** @var DOMXPath XPath evaluator bound to $doc */
	protected $xpath;

	/** @var bool True if the input could not be parsed as XML */
	protected $invalid = false;

	/**
	 * Normalize $text by running each of the named normalization methods
	 * of this class over it, in order.
	 *
	 * If $text cannot be parsed as XML it is returned unchanged.
	 *
	 * @param string $text
	 * @param string[] $funcs Names of methods of this class to call, e.g.
	 *   'removeTbody' or 'trimWhitespace'
	 * @return string
	 */
	public static function normalize( $text, $funcs ) {
		$norm = new self( $text );
		if ( $norm->invalid ) {
			return $text;
		}
		foreach ( $funcs as $func ) {
			$norm->$func();
		}
		return $norm->serialize();
	}

	/**
	 * Parse $text as the content of a <body> element.
	 *
	 * On parse failure, $this->invalid is set and $this->body is left null.
	 *
	 * @param string $text
	 */
	protected function __construct( $text ) {
		$this->doc = new DOMDocument( '1.0', 'utf-8' );

		// Parsing a supposedly-XHTML document with an XML parser is not
		// guaranteed to give accurate results. For example, it may introduce
		// differences in the number of line breaks in <pre> tags.

		// Collect libxml errors internally instead of silencing warnings with
		// the @ operator; malformed input is expected here and is reported
		// through $this->invalid.
		$previous = libxml_use_internal_errors( true );
		try {
			$loaded = $this->doc->loadXML( '<html><body>' . $text . '</body></html>' );
		} finally {
			if ( !$previous ) {
				// Only discard the buffer if the caller was not collecting errors itself
				libxml_clear_errors();
			}
			libxml_use_internal_errors( $previous );
		}
		$this->invalid = !$loaded;

		$this->xpath = new DOMXPath( $this->doc );
		$this->body = $this->xpath->query( '//body' )->item( 0 );
	}

	/**
	 * Replace every <tbody> element with its own children, so that the rows
	 * become direct children of the element that contained the <tbody>.
	 */
	protected function removeTbody() {
		foreach ( $this->xpath->query( '//tbody' ) as $tbody ) {
			$parent = $tbody->parentNode;
			// insertBefore() detaches the node from its current parent before
			// inserting it, so each iteration consumes one child of $tbody.
			while ( $tbody->firstChild ) {
				$parent->insertBefore( $tbody->firstChild, $tbody );
			}
			$parent->removeChild( $tbody );
		}
	}

	/**
	 * The point of this function is to produce a normalized DOM in which
	 * Tidy's output matches the output of RemexHtml. Tidy both trims
	 * and pretty-prints, so this requires fairly aggressive treatment.
	 *
	 * In particular, note that Tidy converts <pre>x</pre> to <pre>\nx\n</pre>,
	 * which theoretically affects display since the second line break is not
	 * ignored by compliant HTML parsers.
	 *
	 * Text nodes that are empty after trimming are removed from the DOM.
	 * Elements themselves are never removed.
	 */
	protected function trimWhitespace() {
		foreach ( $this->xpath->query( '//text()' ) as $textNode ) {
			$data = $textNode->data;
			if ( strtolower( $textNode->parentNode->nodeName ) === 'pre' ) {
				// Just trim one line break from the start and end
				if ( str_starts_with( $data, "\n" ) ) {
					$data = substr( $data, 1 );
				}
				if ( str_ends_with( $data, "\n" ) ) {
					$data = substr( $data, 0, -1 );
				}
			} else {
				// Trim all whitespace
				$data = trim( $data );
			}

			if ( $data === '' ) {
				$textNode->parentNode->removeChild( $textNode );
			} else {
				$textNode->data = $data;
			}
		}
	}

	/**
	 * Serialize the XML DOM for comparison purposes. This does not generate HTML.
	 *
	 * The wrapping <body> tags added by the constructor are stripped from the
	 * result, including the self-closing form that libxml emits when the body
	 * has no children.
	 *
	 * @return string
	 */
	protected function serialize() {
		return strtr(
			$this->doc->saveXML( $this->body ),
			[ '<body>' => '', '</body>' => '', '<body/>' => '' ]
		);
	}
}
|