* @author fdcn
 * @author shinjiman
 * @author PhiLiP
 */

use MediaWiki\Linker\LinkTarget;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageReference;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\SlotRecord;

/**
 * Base class for multi-variant language conversion.
 *
 * @ingroup Language
 */
abstract class LanguageConverter implements ILanguageConverter {
	use DeprecationHelper;

	/**
	 * languages supporting variants
	 * @since 1.20
	 * @var array
	 */
	public static $languagesWithVariants = [
		'ban',
		'en',
		'crh',
		'gan',
		'iu',
		'kk',
		'ku',
		'shi',
		'sr',
		'tg',
		'uz',
		'zh',
	];

	private $mTablesLoaded = false;

	/**
	 * @var ReplacementArray[]|bool[]
	 */
	protected $mTables;

	/**
	 * @var Language
	 */
	private $mLangObj;

	private $mUcfirst = false;
	private $mConvRuleTitle = false;
	private $mURLVariant;
	private $mUserVariant;
	private $mHeaderVariant;
	private $mMaxDepth = 10;
	private $mVarSeparatorPattern;

	private const CACHE_VERSION_KEY = 'VERSION 7';

	/**
	 * Stores the language object and registers the deprecation notices for
	 * the properties that used to be public.
	 *
	 * @param Language $langobj
	 */
	public function __construct( $langobj ) {
		// Properties whose public access is deprecated since 1.35.
		$deprecatedProperties = [
			'mUcfirst',
			'mConvRuleTitle',
			'mUserVariant',
			'mHeaderVariant',
			// Must be the bare property name: registering the declaration text
			// ('mMaxDepth = 10') matches no property at all.
			'mMaxDepth',
			'mVarSeparatorPattern',
			'mLangObj',
			// NOTE(review): registered again below with a getter fallback (1.36);
			// presumably the later registration supersedes this one - confirm.
			'mVariantFallbacks',
			'mTablesLoaded',
			'mTables',
		];
		foreach ( $deprecatedProperties as $property ) {
			$this->deprecatePublicProperty( $property, '1.35', __CLASS__ );
		}

		$this->mLangObj = $langobj;

		// Properties replaced by getter methods in 1.36; reads are forwarded
		// to the corresponding getter.
		$this->deprecatePublicPropertyFallback( 'mVariants', '1.36', function () {
			return $this->getVariants();
		} );

		$this->deprecatePublicPropertyFallback( 'mMainLanguageCode', '1.36', function () {
			return $this->getMainCode();
		} );

		$this->deprecatePublicPropertyFallback( 'mVariantFallbacks', '1.36', function () {
			return $this->getVariantsFallbacks();
		} );

		$this->deprecatePublicPropertyFallback( 'mFlags', '1.36', function () {
			return $this->getFlags();
		} );

		$this->deprecatePublicPropertyFallback( 'mVariantNames', '1.36', function () {
			return $this->getVariantNames();
		} );

		$this->deprecatePublicPropertyFallback( 'mDescCodeSep', '1.36', function () {
			return $this->getDescCodeSeparator();
		} );

		$this->deprecatePublicPropertyFallback( 'mDescVarSep', '1.36', function () {
			return $this->getDescVarSeparator();
		} );
	}

	/**
	 * Get main language code.
	 * @since 1.36
	 *
	 * @return string
	 */
	abstract public function getMainCode(): string;

	/**
	 * Get supported variants of the language.
	 * @since 1.36
	 *
	 * @return array
	 */
	abstract protected function getLanguageVariants(): array;

	/**
	 * Get language variants fallbacks.
	 * @since 1.36
	 *
	 * @return array
	 */
	abstract public function getVariantsFallbacks(): array;

	/**
	 * Get the strings that map to the flags.
	 *
	 * The result is the default flags merged with getAdditionalFlags(), plus
	 * one entry per enabled variant code (mapped to itself).
	 * @since 1.36
	 *
	 * @return array
	 */
	final public function getFlags(): array {
		$defaultflags = [
			// 'S' show converted text
			// '+' add rules for all text
			// 'E' the given flags are erroneous
			// the flags above are reserved for the program
			'A' => 'A', // add rule for convert code (all text convert)
			'T' => 'T', // title convert
			'R' => 'R', // raw content
			'D' => 'D', // convert description (subclass implement)
			'-' => '-', // remove convert (not implemented)
			'H' => 'H', // add rule for convert code (but no display in placed code)
			'N' => 'N', // current variant name
		];
		$flags = array_merge( $defaultflags, $this->getAdditionalFlags() );
		foreach ( $this->getVariants() as $v ) {
			$flags[$v] = $v;
		}
		return $flags;
	}

	/**
	 * Provides additional flags for the converter. By default it returns an empty
	 * array and typically should be overridden by the converter implementation.
*
	 * @return array
	 */
	protected function getAdditionalFlags(): array {
		return [];
	}

	/**
	 * Build the manual level for every enabled variant.
	 *
	 * Each variant returned by getVariants() is mapped to the value supplied
	 * by getAdditionalManualLevel() when that array has a key for it, and to
	 * 'bidirectional' otherwise.
	 * @since 1.36
	 *
	 * @return array
	 */
	final public function getManualLevel() {
		$overrides = $this->getAdditionalManualLevel();
		$levels = [];
		foreach ( $this->getVariants() as $variant ) {
			// array_key_exists() rather than ??, so that an explicit null
			// override is kept as-is.
			$levels[$variant] = array_key_exists( $variant, $overrides )
				? $overrides[$variant]
				: 'bidirectional';
		}
		return $levels;
	}

	/**
	 * Per-variant manual level overrides consumed by getManualLevel().
	 * Returns an empty array here; converter implementations can override it.
	 * @since 1.36
	 *
	 * @return array
	 */
	protected function getAdditionalManualLevel(): array {
		return [];
	}

	/**
	 * Separator for the description code. Returns ":" here; converter
	 * implementations can override it.
	 * @since 1.36
	 *
	 * @return string
	 */
	public function getDescCodeSeparator(): string {
		return ':';
	}

	/**
	 * Separator for the description variants. Returns ";" here; converter
	 * implementations can override it.
	 * @since 1.36
	 *
	 * @return string
	 */
	public function getDescVarSeparator(): string {
		return ';';
	}

	/**
	 * Get the variant names, as reported by the LanguageNameUtils service.
	 *
	 * @return array
	 */
	public function getVariantNames(): array {
		$services = MediaWikiServices::getInstance();
		$nameUtils = $services->getLanguageNameUtils();
		return $nameUtils->getLanguageNames();
	}

	/**
	 * Get all valid variants for the current converter: the variants from the
	 * abstract getLanguageVariants() minus those in $wgDisabledVariants.
	 *
	 * @return string[] Contains all valid variants
	 */
	final public function getVariants() {
		global $wgDisabledVariants;

		$supported = $this->getLanguageVariants();
		return array_diff( $supported, $wgDisabledVariants );
	}

	/**
	 * In case some variant is not defined in the markup, we need
	 * to have some fallback. For example, in zh, normally people
	 * will define zh-hans and zh-hant, but less so for zh-sg or zh-hk.
	 * when zh-sg is preferred but not defined, we will pick zh-hans
	 * in this case. Right now this is only used by zh.
*
	 * @param string $variant The language code of the variant
	 * @return string|array The code of the fallback language or the
	 *   main code if there is no fallback
	 */
	public function getVariantFallbacks( $variant ) {
		return $this->getVariantsFallbacks()[$variant] ?? $this->getMainCode();
	}

	/**
	 * Get the title produced by the conversion rule.
	 * @return string|bool The converted title text; the backing property is
	 *   initialised to false
	 */
	public function getConvRuleTitle() {
		return $this->mConvRuleTitle;
	}

	/**
	 * Get preferred language variant.
	 *
	 * Sources are tried in this order: the URL, then the GetLangPreferredVariant
	 * hook (which receives the URL value), then the registered user's preference
	 * or, failing that, the Accept-Language header, then $wgDefaultLanguageVariant,
	 * and finally the main language code.
	 *
	 * @return string The preferred language code
	 */
	public function getPreferredVariant() {
		global $wgDefaultLanguageVariant;

		$req = $this->getURLVariant();

		Hooks::runner()->onGetLangPreferredVariant( $req );

		$user = RequestContext::getMain()->getUser();
		// NOTE: For some calls there may not be a context user or session that is safe
		// to use, see (T235360)
		// Use case: During autocreation, UserNameUtils::isUsable is called which uses interface
		// messages for reserved usernames.
		if ( $user->isSafeToLoad() && $user->isRegistered() && !$req ) {
			$req = $this->getUserVariant( $user );
		} elseif ( !$req ) {
			$req = $this->getHeaderVariant();
		}

		if ( $wgDefaultLanguageVariant && !$req ) {
			$req = $this->validateVariant( $wgDefaultLanguageVariant );
		}

		// Validate once more: the value may have been replaced by the hook above.
		$req = $this->validateVariant( $req );

		// This function, unlike the other get*Variant functions, is
		// not memoized (i.e. its return value is not cached) since
		// new information might appear during processing after this
		// is first called.
		if ( $req ) {
			return $req;
		}
		return $this->getMainCode();
	}

	/**
	 * This function would not be affected by user's settings
	 * @return string The default variant code
	 */
	public function getDefaultVariant() {
		global $wgDefaultLanguageVariant;

		$req = $this->getURLVariant();

		if ( !$req ) {
			$req = $this->getHeaderVariant();
		}

		if ( $wgDefaultLanguageVariant && !$req ) {
			$req = $this->validateVariant( $wgDefaultLanguageVariant );
		}

		if ( $req ) {
			return $req;
		}
		return $this->getMainCode();
	}

	/**
	 * Validate the variant and return an appropriate strict internal
	 * variant code if one exists. Compare to Language::hasVariant()
	 * which does a strict test.
	 *
	 * @param string|null $variant The variant to validate
	 * @return mixed Returns an equivalent valid variant code if possible,
	 *   null otherwise
	 */
	public function validateVariant( $variant = null ) {
		if ( $variant === null ) {
			return null;
		}
		// Our internal variants are always lower-case; the variant we
		// are validating may have mixed case.
		$variant = LanguageCode::replaceDeprecatedCodes( strtolower( $variant ) );

		// Compute the variant list once; it is used by both lookups below.
		$variants = $this->getVariants();

		// Strict comparison so numeric-looking strings can never match loosely.
		if ( in_array( $variant, $variants, true ) ) {
			return $variant;
		}
		// Browsers are supposed to use BCP 47 standard in the
		// Accept-Language header, but not all of our internal
		// mediawiki variant codes are BCP 47. Map BCP 47 code
		// to our internal code.
		foreach ( $variants as $v ) {
			// Case-insensitive match (BCP 47 is mixed case)
			if ( strtolower( LanguageCode::bcp47( $v ) ) === $variant ) {
				return $v;
			}
		}
		return null;
	}

	/**
	 * Get the variant specified in the URL
	 *
	 * Reads the 'variant' request parameter and falls back to 'uselang'.
	 * A non-empty result is memoized in $this->mURLVariant.
	 *
	 * @return mixed Variant if one found, null otherwise
	 */
	public function getURLVariant() {
		global $wgRequest;

		if ( $this->mURLVariant ) {
			return $this->mURLVariant;
		}

		// see if the preference is set in the request
		$ret = $wgRequest->getText( 'variant' );

		if ( !$ret ) {
			$ret = $wgRequest->getVal( 'uselang' );
		}

		$this->mURLVariant = $this->validateVariant( $ret );
		return $this->mURLVariant;
	}

	/**
	 * Determine if the user has a variant set.
*
	 * @param User $user
	 * @return mixed Variant if one found, null otherwise; false when the
	 *   user is not safe to load
	 */
	protected function getUserVariant( User $user ) {
		// Callers inside this class are expected to have checked this already;
		// the guard is repeated here just in case.
		if ( !$user->isSafeToLoad() ) {
			return false;
		}

		if ( !$user->isRegistered() ) {
			// figure out user lang without constructing wgLang to avoid
			// infinite recursion
			$optionName = 'language';
		} else {
			// Logged-in users: the option is 'variant' when this converter's main
			// code is the content language code, 'variant-<code>' otherwise.
			$mainCode = $this->getMainCode();
			$contentCode = MediaWikiServices::getInstance()->getContentLanguage()->getCode();
			$optionName = $mainCode == $contentCode ? 'variant' : 'variant-' . $mainCode;
		}

		$this->mUserVariant = $this->validateVariant( $user->getOption( $optionName ) );
		return $this->mUserVariant;
	}

	/**
	 * Determine the language variant from the Accept-Language header.
	 *
	 * The accepted languages are tried in order; the first one that validates
	 * wins. If none does, the fallbacks collected for those languages are
	 * tried in turn. A non-empty result is memoized in $this->mHeaderVariant.
	 *
	 * @return mixed Variant if one found, null otherwise
	 */
	protected function getHeaderVariant() {
		global $wgRequest;

		if ( $this->mHeaderVariant ) {
			return $this->mHeaderVariant;
		}

		// See if some supported language variant is set in the
		// HTTP header.
		$accepted = array_keys( $wgRequest->getAcceptLang() );
		if ( !$accepted ) {
			return null;
		}

		$candidates = [];
		foreach ( $accepted as $code ) {
			$this->mHeaderVariant = $this->validateVariant( $code );
			if ( $this->mHeaderVariant ) {
				return $this->mHeaderVariant;
			}

			// No direct match: remember the fallbacks of this language so
			// they can be tried once every accepted language has failed.
			$fallbacks = $this->getVariantFallbacks( $code );
			if ( is_array( $fallbacks ) ) {
				$candidates = array_merge( $candidates, $fallbacks );
			} elseif ( is_string( $fallbacks ) && $fallbacks !== $this->getMainCode() ) {
				$candidates[] = $fallbacks;
			}
		}

		// process fallback languages now
		foreach ( array_unique( $candidates ) as $code ) {
			$this->mHeaderVariant = $this->validateVariant( $code );
			if ( $this->mHeaderVariant ) {
				break;
			}
		}

		return $this->mHeaderVariant;
	}

	/**
	 * Dictionary-based conversion.
	 * This function would not parse the conversion rules.
	 * If you want to parse rules, try to use convert() or
	 * convertTo().
	 *
	 * @param string $text The text to be converted
	 * @param bool|string $toVariant The target language code
	 * @return string The converted text
	 */
	public function autoConvert( $text, $toVariant = false ) {
		$this->loadTables();

		if ( !$toVariant ) {
			$toVariant = $this->getPreferredVariant();
			if ( !$toVariant ) {
				return $text;
			}
		}

		if ( $this->guessVariant( $text, $toVariant ) ) {
			return $text;
		}
		/* we convert everything except:
		 1. HTML markups (anything between < and >)
		 2. HTML entities
		 3. placeholders created by the parser
		 IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
		 Minimize use of backtracking where possible.
		*/
		static $reg;
		if ( $reg === null ) {
			$marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';

			// this one is needed when the text is inside an HTML markup
			$htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';

			// Optimize for the common case where these tags have
			// few or no children. Thus try and possesively get as much as
			// possible, and only engage in backtracking when we hit a '<'.

			// disable convert to variants between tags
			$codefix = '[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
			// disable conversion of