Provide method to merge a ParserOutput into a ContentMetadataCollector

ContentMetadataCollector is a write-only interface defined by Parsoid
that performs the metadata collection functions of ParserOutput.  In
order to support asynchronous and out-of-order parses,
ContentMetadataCollector is write-only and merges of fragments are
defined to be independent of merge order.

This provides an initial implementation of ParserOutput::collectMetadata()
which transfers metadata from a ParserOutput to a ContentMetadataCollector.
It is intended that the flags and accumulators in ParserOutput will be
(incrementally) made more regular so that ::collectMetadata() grows
simpler over time.

An optional $strategy argument is added to ::appendExtensionData() and
::appendJsConfigVars() to allow future expansion of merge strategies,
although only `union` is supported for the moment.

The MW_MERGE_STRATEGY_UNION constant will be upstreamed into Parsoid's
ContentMetadataCollector class as MERGE_STRATEGY_UNION; we've added a
prefix to ParserOutput's copy for now to avoid a conflict with the
constant which Parsoid will define.

Bug: T300979
Change-Id: I4e20b84eb590296fb3c011bb4d658d7a65082a11
This commit is contained in:
C. Scott Ananian 2022-02-07 11:28:23 -05:00
parent 24d897aadc
commit 7f849e965b
2 changed files with 143 additions and 13 deletions

View file

@ -8,6 +8,7 @@ use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageReference;
use MediaWiki\Parser\ParserOutputFlags;
use Wikimedia\Parsoid\Core\ContentMetadataCollector;
use Wikimedia\Reflection\GhostFieldAccessTrait;
/**
@ -53,7 +54,19 @@ class ParserOutput extends CacheTime {
* @internal
* @since 1.38
*/
public const MERGE_STRATEGY_KEY = '_mw-strategy';
public const MW_MERGE_STRATEGY_KEY = '_mw-strategy';
/**
* Merge strategy to use for ParserOutput accumulators: "union"
* means that values are strings, stored as a set, and exposed as
* a PHP associative array mapping from values to `true`.
*
* This constant should be treated as @internal until we expose
* alternative merge strategies for external use.
* @internal
* @since 1.38
*/
public const MW_MERGE_STRATEGY_UNION = 'union';
/**
* @var string|null The output text
@ -173,6 +186,13 @@ class ParserOutput extends CacheTime {
*/
private $mWarnings = [];
/**
* @var array<string,array> *Unformatted* warning messages and
* arguments to be returned to the user. This is for internal use
* when merging ParserOutputs and are not serialized/deserialized.
*/
private $mWarningMsgs = [];
/**
* @var array Table of contents
*/
@ -752,7 +772,7 @@ class ParserOutput extends CacheTime {
// Don't expose the internal strategy key
foreach ( $result as $key => &$value ) {
if ( is_array( $value ) ) {
unset( $value[self::MERGE_STRATEGY_KEY] );
unset( $value[self::MW_MERGE_STRATEGY_KEY] );
}
}
return $result;
@ -913,6 +933,9 @@ class ParserOutput extends CacheTime {
* @since 1.38
*/
public function addWarningMsg( string $msg, ...$args ): void {
// preserve original arguments in $mWarningMsgs to allow merge
// @todo: these aren't serialized/unserialized
$this->mWarningMsgs[$msg] = $args;
$s = wfMessage( $msg, ...$args )
// some callers set the title here?
->inContentLanguage() // because this ends up in cache
@ -1174,16 +1197,28 @@ class ParserOutput extends CacheTime {
*
* @param string $key Key to use under mw.config
* @param string $value Value to append to the configuration variable.
* @param string $strategy Merge strategy:
* only MW_MERGE_STRATEGY_UNION is currently supported and external callers
* should treat this parameter as @internal at this time and omit it.
* @since 1.38
*/
public function appendJsConfigVar( string $key, string $value ): void {
public function appendJsConfigVar(
string $key,
string $value,
string $strategy = self::MW_MERGE_STRATEGY_UNION
): void {
if ( $strategy !== self::MW_MERGE_STRATEGY_UNION ) {
throw new InvalidArgumentException( "Unknown merge strategy $strategy." );
}
if ( !array_key_exists( $key, $this->mJsConfigVars ) ) {
$this->mJsConfigVars[$key] = [
// Indicate how these values are to be merged.
self::MERGE_STRATEGY_KEY => 'union',
self::MW_MERGE_STRATEGY_KEY => $strategy,
];
} elseif ( !is_array( $this->mJsConfigVars[$key] ) ) {
throw new InvalidArgumentException( "Mixing set and append for $key" );
} elseif ( ( $this->mJsConfigVars[$key][self::MW_MERGE_STRATEGY_KEY] ?? null ) !== $strategy ) {
throw new InvalidArgumentException( "Conflicting merge strategies for $key" );
}
$this->mJsConfigVars[$key][$value] = true;
}
@ -1634,16 +1669,28 @@ class ParserOutput extends CacheTime {
* conflicts in naming keys. It is suggested to use the extension's name as a prefix.
*
* @param int|string $value The value to append to the list.
* @param string $strategy Merge strategy:
* only MW_MERGE_STRATEGY_UNION is currently supported and external callers
* should treat this parameter as @internal at this time and omit it.
* @since 1.38
*/
public function appendExtensionData( string $key, $value ): void {
public function appendExtensionData(
string $key,
$value,
string $strategy = self::MW_MERGE_STRATEGY_UNION
): void {
if ( $strategy !== self::MW_MERGE_STRATEGY_UNION ) {
throw new InvalidArgumentException( "Unknown merge strategy $strategy." );
}
if ( !array_key_exists( $key, $this->mExtensionData ) ) {
$this->mExtensionData[$key] = [
// Indicate how these values are to be merged.
self::MERGE_STRATEGY_KEY => 'union',
self::MW_MERGE_STRATEGY_KEY => $strategy,
];
} elseif ( !is_array( $this->mExtensionData[$key] ) ) {
throw new InvalidArgumentException( "Mixing set and append for $key" );
} elseif ( ( $this->mExtensionData[$key][self::MW_MERGE_STRATEGY_KEY] ?? null ) !== $strategy ) {
throw new InvalidArgumentException( "Conflicting merge strategies for $key" );
}
$this->mExtensionData[$key][$value] = true;
}
@ -1663,7 +1710,7 @@ class ParserOutput extends CacheTime {
$value = $this->mExtensionData[$key] ?? null;
if ( is_array( $value ) ) {
// Don't expose our internal merge strategy key.
unset( $value[self::MERGE_STRATEGY_KEY] );
unset( $value[self::MW_MERGE_STRATEGY_KEY] );
}
return $value;
}
@ -1974,7 +2021,7 @@ class ParserOutput extends CacheTime {
public function __sleep() {
return array_filter( array_keys( get_object_vars( $this ) ),
static function ( $field ) {
if ( $field === 'mParseStartTime' ) {
if ( $field === 'mParseStartTime' || $field === 'mWarningMsgs' ) {
return false;
}
// Unserializing unknown private fields in HHVM causes
@ -2125,6 +2172,69 @@ class ParserOutput extends CacheTime {
);
}
/**
* Adds the metadata collected in this ParserOutput to the supplied
* ContentMetadataCollector. This is similar to ::mergeHtmlMetaDataFrom()
* but in the opposite direction, since ParserOutput is read/write while
* ContentMetadataCollector is write-only.
*
* @param ContentMetadataCollector $metadata
* @since 1.38
*/
public function collectMetadata( ContentMetadataCollector $metadata ): void {
// Uniform handling of all boolean flags: they are OR'ed together.
$flags = array_keys(
$this->mFlags + array_flip( ParserOutputFlags::cases() )
);
foreach ( $flags as $name ) {
if ( $this->getOutputFlag( $name ) ) {
$metadata->setOutputFlag( $name );
}
}
// @todo: Accumulators should also be handled uniformly
foreach ( $this->mCategories as $cat => $key ) {
$metadata->addCategory( $cat, $key );
}
$metadata->addModules( $this->mModules );
$metadata->addModuleStyles( $this->mModuleStyles );
foreach ( $this->mJsConfigVars as $key => $value ) {
if ( is_array( $value ) && isset( $value[self::MW_MERGE_STRATEGY_KEY] ) ) {
$strategy = $value[self::MW_MERGE_STRATEGY_KEY];
foreach ( $value as $item => $ignore ) {
if ( $item !== self::MW_MERGE_STRATEGY_KEY ) {
$metadata->appendJsConfigVar( $key, $item, $strategy );
}
}
} else {
$metadata->setJsConfigVar( $key, $value );
}
}
foreach ( $this->mExtensionData as $key => $value ) {
if ( is_array( $value ) && isset( $value[self::MW_MERGE_STRATEGY_KEY] ) ) {
$strategy = $value[self::MW_MERGE_STRATEGY_KEY];
foreach ( $value as $item => $ignore ) {
if ( $item !== self::MW_MERGE_STRATEGY_KEY ) {
$metadata->appendExtensionData( $key, $item, $strategy );
}
}
} else {
$metadata->setExtensionData( $key, $value );
}
}
foreach ( $this->mExternalLinks as $url => $ignore ) {
$metadata->addExternalLink( $url );
}
foreach ( $this->mProperties as $prop => $value ) {
$metadata->setPageProperty( $prop, $value );
}
foreach ( $this->mWarningMsgs as $msg => $args ) {
$metadata->addWarningMsg( $msg, ...$args );
}
foreach ( $this->mLimitReportData as $key => $value ) {
$metadata->setLimitReportData( $key, $value );
}
}
private static function mergeMixedList( array $a, array $b ): array {
return array_unique( array_merge( $a, $b ), SORT_REGULAR );
}
@ -2142,14 +2252,14 @@ class ParserOutput extends CacheTime {
if ( !array_key_exists( $key, $a ) ) {
$a[$key] = $bValue;
} elseif (
isset( $a[$key][self::MERGE_STRATEGY_KEY] ) &&
isset( $bValue[self::MERGE_STRATEGY_KEY] )
isset( $a[$key][self::MW_MERGE_STRATEGY_KEY] ) &&
isset( $bValue[self::MW_MERGE_STRATEGY_KEY] )
) {
$strategy = $bValue[self::MERGE_STRATEGY_KEY];
if ( $strategy !== $a[$key][self::MERGE_STRATEGY_KEY] ) {
$strategy = $bValue[self::MW_MERGE_STRATEGY_KEY];
if ( $strategy !== $a[$key][self::MW_MERGE_STRATEGY_KEY] ) {
throw new InvalidArgumentException( "Conflicting merge strategy for $key" );
}
if ( $strategy === 'union' ) {
if ( $strategy === self::MW_MERGE_STRATEGY_UNION ) {
// Note the array_merge is *not* safe to use here, because
// the $bValue is expected to be a map from items to `true`.
// If the item is a numeric string like '1' then array_merge

View file

@ -146,4 +146,24 @@ class ParserOutputFlags {
* similar (T230652).
*/
public const USER_SIGNATURE = 'user-signature';
public static function cases(): array {
return [
self::NO_GALLERY,
self::ENABLE_OOUI,
self::INDEX_POLICY,
self::NO_INDEX_POLICY,
self::NEW_SECTION,
self::HIDE_NEW_SECTION,
self::PREVENT_CLICKJACKING,
self::VARY_REVISION,
self::VARY_REVISION_ID,
self::VARY_REVISION_TIMESTAMP,
self::VARY_REVISION_SHA1,
self::VARY_REVISION_EXISTS,
self::VARY_PAGE_ID,
self::VARY_USER,
self::USER_SIGNATURE,
];
}
}