329 lines
12 KiB
PHP
329 lines
12 KiB
PHP
<?php
|
|
|
|
namespace MediaWiki\Parser\Parsoid;
|
|
|
|
use LanguageCode;
|
|
use MediaWiki\Languages\LanguageConverterFactory;
|
|
use MediaWiki\Languages\LanguageFactory;
|
|
use MediaWiki\Page\PageIdentity;
|
|
use MediaWiki\Parser\ParserOutput;
|
|
use MediaWiki\Parser\Parsoid\Config\PageConfigFactory;
|
|
use MediaWiki\Rest\HttpException;
|
|
use MediaWiki\Rest\LocalizedHttpException;
|
|
use MediaWiki\Revision\RevisionAccessException;
|
|
use MediaWiki\Title\Title;
|
|
use MediaWiki\Title\TitleFactory;
|
|
use Wikimedia\Bcp47Code\Bcp47Code;
|
|
use Wikimedia\Bcp47Code\Bcp47CodeValue;
|
|
use Wikimedia\Message\MessageValue;
|
|
use Wikimedia\Parsoid\Config\PageConfig;
|
|
use Wikimedia\Parsoid\Config\SiteConfig;
|
|
use Wikimedia\Parsoid\Core\PageBundle;
|
|
use Wikimedia\Parsoid\DOM\Element;
|
|
use Wikimedia\Parsoid\Parsoid;
|
|
use Wikimedia\Parsoid\Utils\DOMCompat;
|
|
use Wikimedia\Parsoid\Utils\DOMUtils;
|
|
|
|
/**
|
|
* @since 1.40
|
|
* @unstable should be marked stable before 1.40 release
|
|
*/
|
|
class LanguageVariantConverter {
|
|
/** Factory used to build a PageConfig on demand when none was injected. */
private PageConfigFactory $pageConfigFactory;

/** PageConfig supplied via setPageConfig(), or created and cached by getPageConfig(). */
private ?PageConfig $pageConfig = null;

/** The page whose content is being converted. */
private PageIdentity $pageIdentity;

/** Title derived from $pageIdentity; last-resort source of the page language. */
private Title $pageTitle;

/** Parsoid instance that performs the conversion when it implements the target variant. */
private Parsoid $parsoid;

/** Consulted to find out whether language conversion is enabled for the page language. */
private SiteConfig $siteConfig;

/** Provides core LanguageConverter instances (fallback conversion, variant checks). */
private LanguageConverterFactory $languageConverterFactory;

/** Resolves parent languages and Language objects for BCP-47 codes. */
private LanguageFactory $languageFactory;

/**
 * Page language override from the Content-Language header.
 */
private ?Bcp47Code $pageLanguageOverride = null;

/**
 * Whether the core LanguageConverter may be used when Parsoid does not
 * implement the requested conversion. Switched off by
 * disableFallbackLanguageConverter().
 */
private bool $isFallbackLanguageConverterEnabled = true;

/**
 * @param PageIdentity $pageIdentity Page whose content will be converted
 * @param PageConfigFactory $pageConfigFactory Used to lazily create the PageConfig
 * @param Parsoid $parsoid
 * @param SiteConfig $siteConfig
 * @param TitleFactory $titleFactory Only used here, to derive the Title of $pageIdentity
 * @param LanguageConverterFactory $languageConverterFactory
 * @param LanguageFactory $languageFactory
 */
public function __construct(
	PageIdentity $pageIdentity,
	PageConfigFactory $pageConfigFactory,
	Parsoid $parsoid,
	SiteConfig $siteConfig,
	TitleFactory $titleFactory,
	LanguageConverterFactory $languageConverterFactory,
	LanguageFactory $languageFactory
) {
	$this->pageIdentity = $pageIdentity;
	$this->pageTitle = $titleFactory->newFromPageIdentity( $pageIdentity );
	$this->pageConfigFactory = $pageConfigFactory;
	$this->parsoid = $parsoid;
	$this->siteConfig = $siteConfig;
	$this->languageConverterFactory = $languageConverterFactory;
	$this->languageFactory = $languageFactory;
}
|
|
|
|
/**
 * Inject the PageConfig to use for language variant conversion.
 *
 * When this is never called, a PageConfig is created on first use.
 *
 * @param PageConfig $pageConfig
 * @return void
 */
public function setPageConfig( PageConfig $pageConfig ) {
	$this->pageConfig = $pageConfig;
}
|
|
|
|
/**
 * Override the page content language.
 *
 * Once set, the override takes precedence over every other source of the
 * page language when the language is determined for a conversion.
 *
 * @param Bcp47Code $language
 * @return void
 */
public function setPageLanguageOverride( Bcp47Code $language ) {
	$this->pageLanguageOverride = $language;
}
|
|
|
|
/**
 * Convert the content of a PageBundle to the given language variant.
 *
 * Parsoid performs the conversion when it implements the target variant.
 * Otherwise, unless the fallback has been disabled, the core
 * LanguageConverter is applied to the bundle's HTML.
 *
 * @param PageBundle $pageBundle
 * @param Bcp47Code $targetVariant
 * @param ?Bcp47Code $sourceVariant
 *
 * @return PageBundle The converted PageBundle, or the object passed in as
 *         $pageBundle if the conversion is not supported.
 * @throws HttpException
 */
public function convertPageBundleVariant(
	PageBundle $pageBundle,
	Bcp47Code $targetVariant,
	?Bcp47Code $sourceVariant = null
): PageBundle {
	[ $pageLanguage, $sourceVariant ] =
		$this->getBaseAndSourceLanguage( $pageBundle, $sourceVariant );

	// Languages without variant support are returned untouched.
	if ( !$this->siteConfig->langConverterEnabledBcp47( $pageLanguage ) ) {
		return $pageBundle;
	}

	$pageConfig = $this->getPageConfig( $pageLanguage, $sourceVariant );

	// Preferred path: let Parsoid do the conversion.
	if ( $this->parsoid->implementsLanguageConversionBcp47( $pageConfig, $targetVariant ) ) {
		$options = [
			'variant' => [
				'source' => $sourceVariant,
				'target' => $targetVariant,
			]
		];

		return $this->parsoid->pb2pb( $pageConfig, 'variant', $pageBundle, $options );
	}

	// Parsoid can't do it; without the fallback there is nothing more to try.
	if ( !$this->isFallbackLanguageConverterEnabled ) {
		return $pageBundle;
	}

	// If LanguageConverter::hasVariant and LanguageConverter::convertTo ever
	// accept string|Bcp47Code, the $targetVariantCode conversion below can go.
	$baseLanguage = $this->languageFactory->getParentLanguage( $targetVariant );
	$languageConverter = $this->languageConverterFactory->getLanguageConverter( $baseLanguage );
	$targetVariantCode = $this->languageFactory->getLanguage( $targetVariant )->getCode();

	if ( $languageConverter->hasVariant( $targetVariantCode ) ) {
		// NOTE: convertTo() rather than convert(): the exact desired variant is
		// known, so no preferred variant has to be computed from a base language.
		// See T267067 for why convert() should be avoided.
		$convertedHtml = $languageConverter->convertTo( $pageBundle->html, $targetVariantCode );
		$pageVariant = $targetVariant;
	} else {
		// No conversion possible: keep the original HTML and report the page's own language.
		$convertedHtml = $pageBundle->html;
		$pageVariant = $pageConfig->getPageLanguageBcp47();
	}

	// Leave a marker identifying which converter handled the request.
	$msg = "<!-- Variant conversion performed using the core LanguageConverter -->";
	$convertedHtml = $msg . $convertedHtml;

	// NOTE: Keep this in sync with code in Parsoid.php in Parsoid repo.
	// Add the meta information that Parsoid would normally add itself.
	$headers = [
		'content-language' => $pageVariant->toBcp47Code(),
		'vary' => [ 'Accept', 'Accept-Language' ]
	];
	$doc = DOMUtils::parseHTML( '' );
	$doc->appendChild( $doc->createElement( 'head' ) );
	DOMUtils::addHttpEquivHeaders( $doc, $headers );
	$docElt = $doc->documentElement;
	'@phan-var Element $docElt';
	$docHtml = DOMCompat::getOuterHTML( $docElt );
	$convertedHtml = preg_replace( "#</body>#", $docHtml, "$convertedHtml</body>" );

	return new PageBundle( $convertedHtml, [], [], $pageBundle->version, $headers );
}
|
|
|
|
/**
 * Convert the content of a ParserOutput to the given language variant.
 *
 * The ParserOutput is turned into a PageBundle, run through
 * convertPageBundleVariant(), and turned back into a ParserOutput; the
 * original $parserOutput is handed to the back-conversion as well.
 *
 * @param ParserOutput $parserOutput
 * @param Bcp47Code $targetVariant
 * @param ?Bcp47Code $sourceVariant
 *
 * @return ParserOutput
 */
public function convertParserOutputVariant(
	ParserOutput $parserOutput,
	Bcp47Code $targetVariant,
	?Bcp47Code $sourceVariant = null
): ParserOutput {
	$convertedBundle = $this->convertPageBundleVariant(
		PageBundleParserOutputConverter::pageBundleFromParserOutput( $parserOutput ),
		$targetVariant,
		$sourceVariant
	);

	return PageBundleParserOutputConverter::parserOutputFromPageBundle( $convertedBundle, $parserOutput );
}
|
|
|
|
/**
 * Turn off the fallback to the core LanguageConverter.
 *
 * After this call, convertPageBundleVariant() returns the page bundle
 * unchanged whenever Parsoid does not implement the requested conversion.
 *
 * @return void
 */
public function disableFallbackLanguageConverter(): void {
	$this->isFallbackLanguageConverterEnabled = false;
}
|
|
|
|
/**
 * Get the PageConfig to use for conversion, creating and caching it if needed.
 *
 * If a PageConfig is already present (injected through setPageConfig() or
 * created by an earlier call), it is returned as is and both parameters
 * are ignored.
 *
 * @param Bcp47Code $pageLanguage Page language for a newly created PageConfig
 * @param ?Bcp47Code $sourceVariant If given, set as the variant of a newly
 *        created PageConfig
 *
 * @return PageConfig
 * @throws LocalizedHttpException with status 404 if the revision cannot be accessed
 */
private function getPageConfig( Bcp47Code $pageLanguage, ?Bcp47Code $sourceVariant ): PageConfig {
	if ( $this->pageConfig === null ) {
		try {
			$this->pageConfig = $this->pageConfigFactory->create(
				$this->pageIdentity, null, null, null, $pageLanguage
			);

			if ( $sourceVariant !== null ) {
				$this->pageConfig->setVariantBcp47( $sourceVariant );
			}
		} catch ( RevisionAccessException $exception ) {
			// TODO: Throw a different exception, this class should not know
			// about HTTP status codes.
			throw new LocalizedHttpException( new MessageValue( "rest-specified-revision-unavailable" ), 404 );
		}
	}

	return $this->pageConfig;
}
|
|
|
|
/**
 * Determine the page's language code, trying these sources in order:
 *
 * 1. The value set through ::setPageLanguageOverride(); this would have
 *    come from a Content-Language header.
 * 2. The content-language header stored in $pageBundle, which should be
 *    equivalent. Sources 1 and 2 are used when the title/article doesn't
 *    (yet) exist.
 * 3. $default, if given; this lets additional request parameters act as
 *    fallbacks.
 * 4. $this->pageConfig->getPageLanguageBcp47(), if a PageConfig is present.
 * 5. $this->pageTitle->getPageLanguage().
 *
 * @param PageBundle $pageBundle
 * @param Bcp47Code|null $default A default language, used after
 *        Content-Language but before PageConfig/Title lookup.
 *
 * @return Bcp47Code the page language; may be a variant.
 */
private function getPageLanguage( PageBundle $pageBundle, ?Bcp47Code $default = null ): Bcp47Code {
	// An explicit override always wins.
	if ( $this->pageLanguageOverride !== null ) {
		return $this->pageLanguageOverride;
	}

	// Next, a language code carried by the page bundle itself. The HTTP header
	// holds a BCP-47 code, not a MediaWiki-internal one.
	$headerLanguage = $pageBundle->headers[ 'content-language' ] ?? null;
	if ( $headerLanguage ) {
		return new Bcp47CodeValue( $headerLanguage );
	}

	// NOTE: The explicit default must be tried *before* the PageConfig, because
	// PageConfig::getPageLanguage() falls back to Title::getPageLanguage(). If the
	// PageConfig were asked first, $default would never be used.
	if ( $default !== null ) {
		return $default;
	}

	// Last resort: the PageConfig if there is one, otherwise the code associated
	// with the title, which may come from the database or be derived from the
	// title itself.
	return $this->pageConfig !== null
		? $this->pageConfig->getPageLanguageBcp47()
		: $this->pageTitle->getPageLanguage();
}
|
|
|
|
/**
 * Determine the codes of the base language and the source variant.
 *
 * The base language will be used to find the appropriate LanguageConverter.
 * It should never be a variant.
 *
 * The source variant will be used to instruct the LanguageConverter.
 * It should always be a variant (or null to trigger auto-detection of
 * the source variant).
 *
 * If LanguageFactory::getParentLanguage() yields no parent language for the
 * page language, the page language is returned as the base language and the
 * source variant is always null.
 *
 * @param PageBundle $pageBundle
 * @param ?Bcp47Code $sourceLanguage
 *
 * @return array{0:Bcp47Code,1:?Bcp47Code} [ Bcp47Code $pageLanguage, ?Bcp47Code $sourceLanguage ]
 */
private function getBaseAndSourceLanguage( PageBundle $pageBundle, ?Bcp47Code $sourceLanguage ): array {
	// Try to determine the language code associated with the content of the page.
	// The result may be a variant code.
	$baseLanguage = $this->getPageLanguage( $pageBundle, $sourceLanguage );

	// To find out if $baseLanguage is actually a variant, get the parent language and compare.
	$parentLang = $this->languageFactory->getParentLanguage( $baseLanguage );

	if ( !$parentLang ) {
		// Without a parent language there is nothing to compare $sourceLanguage
		// against, so it cannot be confirmed as a variant. Bail out here rather
		// than dereferencing a null $parentLang below.
		return [ $baseLanguage, null ];
	}

	$parentCode = $parentLang->toBcp47Code();

	// If $parentLang is not the same language as $baseLanguage, $baseLanguage is
	// a variant. In that case it becomes the source variant (unless one was given
	// explicitly), and the parent language becomes the base language.
	if ( strcasecmp( $parentCode, $baseLanguage->toBcp47Code() ) !== 0 ) {
		if ( !$sourceLanguage ) {
			$sourceLanguage = $baseLanguage;
		}
		$baseLanguage = $parentLang;
	}

	if ( $sourceLanguage !== null ) {
		$parentConverter = $this->languageConverterFactory->getLanguageConverter( $parentLang );
		$sourceCode = $sourceLanguage->toBcp47Code();

		// If the source variant isn't actually a variant, trigger auto-detection.
		$sourceIsVariant = strcasecmp( $parentCode, $sourceCode ) !== 0
			&& $parentConverter->hasVariant( LanguageCode::bcp47ToInternal( $sourceCode ) );

		if ( !$sourceIsVariant ) {
			$sourceLanguage = null;
		}
	}

	return [ $baseLanguage, $sourceLanguage ];
}
|
|
}
|