All the other ways of doing it were ridiculous and much harder to read, and usually required repeating the needle expression (to get its length). I found these occurrences by grepping for various expressions, but I undoubtedly missed some. I didn't try replacing the many instances of strpos(...) === 0 with str_starts_with(...), because I think they're readable enough as-is (although less efficient). Likewise I didn't try porting strpos(...) !== false to str_contains(...). For case-insensitive comparisons, Tim Starling requested that we stick with substr_compare() because it's more efficient than calling strtolower(). On PHP < 8 these functions will be included with a polyfill via vendor/autoload.php. This is included at the beginning of includes/AutoLoader.php, so if our autoloader has been included the polyfill will be available. This means it should be safe to call these functions from any code that would not be usable without our autoloader. Three uses that Tim Starling identified as being performance-sensitive have been split out to a separate commit for porting after the switch to PHP 8. Change-Id: I113a8d052b6845852c15969a2f0e6fbbe3e9f8d9
84 lines
2.4 KiB
PHP
84 lines
2.4 KiB
PHP
<?php
|
|
|
|
/**
 * Normalizes parser test output so that results produced by different
 * HTML post-processors can be compared as strings.
 *
 * The input fragment is parsed as XML, a caller-selected list of
 * normalization passes is applied to the DOM, and the result is
 * serialized back to a string.
 *
 * @ingroup Testing
 */
class ParserTestResultNormalizer {
	/** @var DOMDocument Document holding the wrapped input fragment */
	protected $doc;

	/** @var DOMXPath XPath evaluator bound to $doc */
	protected $xpath;

	/** @var bool True if the input could not be parsed as XML */
	protected $invalid = false;

	/** @var DOMNode|null The synthetic <body> wrapper, or null if parsing failed */
	protected $body;

	/**
	 * Normalize a fragment by applying the named passes in order.
	 *
	 * @param string $text Fragment to normalize
	 * @param string[] $funcs Names of methods of this class to invoke, in
	 *   order (for example 'removeTbody', 'trimWhitespace')
	 * @return string The normalized serialization, or $text unchanged if it
	 *   could not be parsed as XML
	 */
	public static function normalize( $text, $funcs ) {
		$norm = new self( $text );
		if ( $norm->invalid ) {
			// Nothing sensible can be done with unparseable input
			return $text;
		}
		foreach ( $funcs as $func ) {
			$norm->$func();
		}
		return $norm->serialize();
	}

	/**
	 * @param string $text Fragment to parse; it is wrapped in
	 *   <html><body>...</body></html> before being loaded
	 */
	protected function __construct( $text ) {
		$this->doc = new DOMDocument( '1.0', 'utf-8' );

		// Parsing a supposedly-XHTML document with an XML parser is not
		// guaranteed to give accurate results. For example, it may introduce
		// differences in the number of line breaks in <pre> tags.

		// Collect libxml errors internally instead of silencing PHP warnings
		// with "@", then restore whatever mode the caller had configured.
		$oldErrorMode = libxml_use_internal_errors( true );
		$loaded = $this->doc->loadXML( '<html><body>' . $text . '</body></html>' );
		libxml_clear_errors();
		libxml_use_internal_errors( $oldErrorMode );

		if ( !$loaded ) {
			$this->invalid = true;
		}

		$this->xpath = new DOMXPath( $this->doc );
		$this->body = $this->xpath->query( '//body' )->item( 0 );
	}

	/**
	 * Unwrap every <tbody> element: its children are moved, in order, to
	 * the position the <tbody> occupied, and the <tbody> itself is removed.
	 */
	protected function removeTbody() {
		foreach ( $this->xpath->query( '//tbody' ) as $tbody ) {
			$parent = $tbody->parentNode;
			while ( $tbody->firstChild ) {
				$child = $tbody->firstChild;
				$tbody->removeChild( $child );
				// Inserting before the <tbody> keeps the original child order
				$parent->insertBefore( $child, $tbody );
			}
			$parent->removeChild( $tbody );
		}
	}

	/**
	 * The point of this function is to produce a normalized DOM in which
	 * Tidy's output matches the output of RemexHtml. Tidy both trims
	 * and pretty-prints, so this requires fairly aggressive treatment.
	 *
	 * In particular, note that Tidy converts <pre>x</pre> to <pre>\nx\n</pre>,
	 * which theoretically affects display since the second line break is not
	 * ignored by compliant HTML parsers.
	 *
	 * Text nodes whose parent is a <pre> lose at most one leading and one
	 * trailing line break; all other text nodes are fully trimmed. Text
	 * nodes that end up empty are removed from the document.
	 */
	protected function trimWhitespace() {
		foreach ( $this->xpath->query( '//text()' ) as $child ) {
			$data = $child->data;
			if ( strtolower( $child->parentNode->nodeName ) === 'pre' ) {
				// Just trim one line break from the start and end
				if ( str_starts_with( $data, "\n" ) ) {
					$data = substr( $data, 1 );
				}
				if ( str_ends_with( $data, "\n" ) ) {
					$data = substr( $data, 0, -1 );
				}
			} else {
				// Trim all whitespace
				$data = trim( $data );
			}
			$child->data = $data;
			if ( $data === '' ) {
				$child->parentNode->removeChild( $child );
			}
		}
	}

	/**
	 * Serialize the XML DOM for comparison purposes. This does not generate HTML.
	 *
	 * The synthetic <body> wrapper added by the constructor is stripped
	 * from the output.
	 *
	 * @return string
	 */
	protected function serialize() {
		return strtr(
			$this->doc->saveXML( $this->body ),
			[
				'<body>' => '',
				'</body>' => '',
				// An empty body is serialized as a self-closed tag
				'<body/>' => '',
			]
		);
	}
}
|