wiki.techinc.nl/includes/parser/Parsoid/LanguageVariantConverter.php
daniel 89e0a7130b Work around brittle parsoid test (fix CI)
Parsoid doesn't expect a newline to be added to the output of a variant
conversion. So don't.

Change-Id: I7fc74534c77e2747d20df87ff4b850f679c55f3b
2023-01-12 20:42:57 +01:00

311 lines
9.9 KiB
PHP

<?php
namespace MediaWiki\Parser\Parsoid;
use MediaWiki\Languages\LanguageConverterFactory;
use MediaWiki\Languages\LanguageFactory;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
use MediaWiki\Rest\HttpException;
use MediaWiki\Revision\RevisionAccessException;
use MediaWiki\Title\TitleFactory;
use ParserOutput;
use Title;
use Wikimedia\Parsoid\Config\PageConfig;
use Wikimedia\Parsoid\Config\SiteConfig;
use Wikimedia\Parsoid\Core\PageBundle;
use Wikimedia\Parsoid\Parsoid;
/**
 * Performs language variant conversion on Parsoid output (PageBundle or ParserOutput).
 *
 * Conversion is delegated to Parsoid. When Parsoid does not implement conversion for the
 * requested target variant, the core LanguageConverter is used as a fallback, unless that
 * fallback has been turned off with disableFallbackLanguageConverter().
 *
 * @since 1.40
 * @unstable should be marked stable before 1.40 release
 */
class LanguageVariantConverter {
	/** @var PageConfigFactory */
	private $pageConfigFactory;

	/** @var PageConfig|null Injected via setPageConfig(), or lazily created by getPageConfig() */
	private $pageConfig;

	/** @var PageIdentity */
	private $pageIdentity;

	/** @var Title */
	private $pageTitle;

	/** @var Parsoid */
	private $parsoid;

	/** @var array */
	private $parsoidSettings;

	/** @var SiteConfig */
	private $siteConfig;

	/** @var TitleFactory */
	private $titleFactory;

	/** @var LanguageConverterFactory */
	private $languageConverterFactory;

	/** @var LanguageFactory */
	private $languageFactory;

	/** @var string|null Unset until setPageLanguageOverride() has been called */
	private $pageLanguageOverride;

	/** @var bool Whether the core LanguageConverter may be used when Parsoid can't convert */
	private $isFallbackLanguageConverterEnabled = true;

	/**
	 * @param PageIdentity $pageIdentity The page whose content is being converted
	 * @param PageConfigFactory $pageConfigFactory Used to create a PageConfig on demand
	 * @param Parsoid $parsoid
	 * @param array $parsoidSettings Passed through to PageConfigFactory::create()
	 * @param SiteConfig $siteConfig
	 * @param TitleFactory $titleFactory Used to derive a Title from $pageIdentity
	 * @param LanguageConverterFactory $languageConverterFactory
	 * @param LanguageFactory $languageFactory
	 */
	public function __construct(
		PageIdentity $pageIdentity,
		PageConfigFactory $pageConfigFactory,
		Parsoid $parsoid,
		array $parsoidSettings,
		SiteConfig $siteConfig,
		TitleFactory $titleFactory,
		LanguageConverterFactory $languageConverterFactory,
		LanguageFactory $languageFactory
	) {
		$this->pageConfigFactory = $pageConfigFactory;
		$this->pageIdentity = $pageIdentity;
		$this->parsoid = $parsoid;
		$this->parsoidSettings = $parsoidSettings;
		$this->siteConfig = $siteConfig;
		$this->titleFactory = $titleFactory;
		// @phan-suppress-next-line PhanPossiblyNullTypeMismatchProperty
		$this->pageTitle = $this->titleFactory->castFromPageIdentity( $this->pageIdentity );
		$this->languageConverterFactory = $languageConverterFactory;
		$this->languageFactory = $languageFactory;
	}

	/**
	 * Set the PageConfig object to be used during language variant conversion.
	 * If not provided, the object will be created.
	 *
	 * @param PageConfig $pageConfig
	 * @return void
	 */
	public function setPageConfig( PageConfig $pageConfig ) {
		$this->pageConfig = $pageConfig;
	}

	/**
	 * Set the page content language override.
	 *
	 * Once set, this value takes precedence over every other source when the
	 * page language is determined.
	 *
	 * @param string $language
	 * @return void
	 */
	public function setPageLanguageOverride( string $language ) {
		$this->pageLanguageOverride = $language;
	}

	/**
	 * Perform variant conversion on a PageBundle object.
	 *
	 * The bundle passed in is returned unmodified if the page language has no
	 * language converter enabled, or if Parsoid does not implement conversion to
	 * the target variant and the fallback converter has been disabled.
	 *
	 * @param PageBundle $pageBundle
	 * @param string $targetVariantCode
	 * @param string|null $sourceVariantCode
	 *
	 * @return PageBundle The converted PageBundle, or the object passed in as
	 *         $pageBundle if the conversion is not supported.
	 * @throws HttpException
	 */
	public function convertPageBundleVariant(
		PageBundle $pageBundle,
		string $targetVariantCode,
		?string $sourceVariantCode = null
	): PageBundle {
		[ $pageLanguageCode, $sourceVariantCode ] =
			$this->getBaseAndSourceLanguageCode( $pageBundle, $sourceVariantCode );

		if ( !$this->siteConfig->langConverterEnabledForLanguage( $pageLanguageCode ) ) {
			// If the language doesn't support variants, just return the content unmodified.
			return $pageBundle;
		}

		$pageConfig = $this->getPageConfig( $pageLanguageCode, $sourceVariantCode );

		if ( !$this->parsoid->implementsLanguageConversion( $pageConfig, $targetVariantCode ) ) {
			if ( !$this->isFallbackLanguageConverterEnabled ) {
				// Fallback variant conversion is not enabled, return the page bundle as is.
				return $pageBundle;
			}

			$baseLanguage = $this->languageFactory->getParentLanguage( $targetVariantCode );
			$languageConverter = $this->languageConverterFactory->getLanguageConverter( $baseLanguage );
			$convertedHtml = $languageConverter->convertTo( $pageBundle->html, $targetVariantCode );

			// Add a note so that we can identify what was used to perform the variant conversion.
			// Deliberately no newline between the note and the HTML: Parsoid doesn't expect
			// a newline to be added to the output of a variant conversion.
			$msg = "<!-- Variant conversion performed using the core LanguageConverter -->";
			$convertedHtml = $msg . $convertedHtml;

			// Hack: Pass the HTML to parsoid for variant conversion in order to add metadata that is
			// missing when we use the core LanguageConverter directly.

			// Replace the original page bundle, so Parsoid gets the converted HTML as input.
			$pageBundle = new PageBundle(
				$convertedHtml,
				[],
				[],
				$pageBundle->version,
				[ 'content-language' => $targetVariantCode ]
			);
		}

		return $this->parsoid->pb2pb(
			$pageConfig, 'variant', $pageBundle,
			[
				'variant' => [
					'source' => $sourceVariantCode,
					'target' => $targetVariantCode,
				]
			]
		);
	}

	/**
	 * Perform variant conversion on a ParserOutput object.
	 *
	 * The ParserOutput is turned into a PageBundle, converted with
	 * convertPageBundleVariant(), and turned back into a ParserOutput.
	 *
	 * @param ParserOutput $parserOutput
	 * @param string $targetVariantCode
	 * @param string|null $sourceVariantCode
	 *
	 * @return ParserOutput
	 * @throws HttpException
	 */
	public function convertParserOutputVariant(
		ParserOutput $parserOutput,
		string $targetVariantCode,
		?string $sourceVariantCode = null
	): ParserOutput {
		$pageBundle = PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput );
		$modifiedPageBundle = $this->convertPageBundleVariant(
			$pageBundle,
			$targetVariantCode,
			$sourceVariantCode
		);

		return PageBundleParserOutputConverter::parserOutputFromPageBundle( $modifiedPageBundle );
	}

	/**
	 * Disable fallback language variant converter
	 * @return void
	 */
	public function disableFallbackLanguageConverter(): void {
		$this->isFallbackLanguageConverterEnabled = false;
	}

	/**
	 * Get the PageConfig to use for conversion, creating and caching it on first use.
	 *
	 * If a PageConfig was injected via setPageConfig(), or was created by an earlier
	 * call, it is returned as is and both arguments are ignored.
	 *
	 * @param string $pageLanguageCode Page language passed to the PageConfigFactory
	 * @param string|null $sourceVariantCode If given, set as the variant of a newly
	 *        created PageConfig
	 *
	 * @return PageConfig
	 * @throws HttpException With code 404 if a RevisionAccessException is raised
	 *         while creating the PageConfig
	 */
	private function getPageConfig( string $pageLanguageCode, ?string $sourceVariantCode ): PageConfig {
		if ( $this->pageConfig ) {
			return $this->pageConfig;
		}

		try {
			$this->pageConfig = $this->pageConfigFactory->create(
				$this->pageIdentity,
				null,
				null,
				null,
				$pageLanguageCode,
				$this->parsoidSettings
			);

			if ( $sourceVariantCode ) {
				$this->pageConfig->setVariant( $sourceVariantCode );
			}
		} catch ( RevisionAccessException $exception ) {
			// TODO: Throw a different exception, this class should not know
			//       about HTTP status codes.
			throw new HttpException( 'The specified revision is deleted or suppressed.', 404 );
		}

		return $this->pageConfig;
	}

	/**
	 * Try to determine the page's language code as follows:
	 *
	 * First consider any value set by calling setPageLanguageOverride.
	 * If setPageLanguageOverride() has not been called, check for a content-language header in $pageBundle.
	 * If that is not given, use the $default if given.
	 *
	 * If we don't have $default, but we do have a PageConfig in $this->pageConfig,
	 * return $this->pageConfig->getPageLanguage().
	 * Finally, fall back to $this->pageTitle->getPageLanguage()->getCode();
	 *
	 * @param PageBundle $pageBundle
	 * @param string|null $default
	 *
	 * @return string A language code. May be a variant.
	 */
	private function getPageLanguageCode( PageBundle $pageBundle, ?string $default = null ): string {
		// If a language was set by calling setPageLanguageOverride(), always use it!
		if ( $this->pageLanguageOverride ) {
			return $this->pageLanguageOverride;
		}

		// If the page bundle contains a language code, use that.
		$pageBundleLanguage = $pageBundle->headers[ 'content-language' ] ?? null;
		if ( $pageBundleLanguage ) {
			return $pageBundleLanguage;
		}

		// NOTE: Use explicit default *before* we try PageConfig, because PageConfig::getPageLanguage()
		//       falls back to Title::getPageLanguage(). If we did that first, $default would never be used.
		if ( $default ) {
			return $default;
		}

		// If we have a PageConfig, we can ask it for the page's language. Note that this will fall back to
		// Title::getPageLanguage(), so it has to be the last thing we try.
		if ( $this->pageConfig ) {
			return $this->pageConfig->getPageLanguage();
		}

		// Finally, just go by the code associated with the title. This may come from the database or
		// it may be determined based on the title itself.
		return $this->pageTitle->getPageLanguage()->getCode();
	}

	/**
	 * Determine the codes of the base language and the source variant.
	 *
	 * The base language will be used to find the appropriate LanguageConverter. It should never be a variant.
	 * The source variant will be used to instruct the LanguageConverter. It should always be a variant (or
	 * null to trigger auto-detection of the source variant).
	 *
	 * @param PageBundle $pageBundle
	 * @param string|null $sourceLanguageCode
	 *
	 * @return array<string> [ string $baseLanguageCode, ?string $sourceLanguageCode ]
	 */
	private function getBaseAndSourceLanguageCode( PageBundle $pageBundle, ?string $sourceLanguageCode ): array {
		// Try to determine the language code associated with the content of the page.
		// The result may be a variant code.
		$baseLanguageCode = $this->getPageLanguageCode( $pageBundle, $sourceLanguageCode );

		// To find out if $baseLanguageCode is actually a variant, get the parent language and compare.
		$parentLang = $this->languageFactory->getParentLanguage( $baseLanguageCode );

		// If $parentLang is not the same language as $baseLanguageCode, this means that
		// $baseLanguageCode is a variant. In that case, set $sourceLanguageCode to that
		// variant (unless $sourceLanguageCode is already set), and set $baseLanguageCode
		// to the code of $parentLang.
		if ( $parentLang && $parentLang->getCode() !== $baseLanguageCode ) {
			if ( !$sourceLanguageCode ) {
				$sourceLanguageCode = $baseLanguageCode;
			}
			$baseLanguageCode = $parentLang->getCode();
		}

		// If the source variant isn't actually a variant, trigger auto-detection
		if ( $sourceLanguageCode === $baseLanguageCode ) {
			$sourceLanguageCode = null;
		}

		// Invalid phan error: Returning array{0:string,1:?non-empty-string|?string} but declared to
		// return string[]
		// @phan-suppress-next-line PhanTypeMismatchReturn
		return [ $baseLanguageCode, $sourceLanguageCode ];
	}
}