wiki.techinc.nl/tests/phpunit/unit/includes/language/LanguageCodeTest.php

<?php

use MediaWiki\Config\ServiceOptions;
use MediaWiki\HookContainer\HookContainer;
use MediaWiki\Languages\LanguageNameUtils;
use MediaWiki\MainConfigNames;
use Psr\Container\ContainerInterface;
use Wikimedia\ObjectFactory\ObjectFactory;
use Wikimedia\Services\NoSuchServiceException;

/**
 * Unit tests for the static helpers on LanguageCode: the deprecated-code
 * mapping, conversion between MediaWiki-internal codes and BCP 47 tags, and
 * the well-formedness check for language tags.
 *
 * @covers LanguageCode
 * @group Language
 *
 * @author Thiemo Kreuz
 */
class LanguageCodeTest extends MediaWikiUnitTestCase {

	/**
	 * The class must be instantiable without any constructor arguments.
	 */
	public function testConstructor(): void {
		$instance = new LanguageCode();

		$this->assertInstanceOf( LanguageCode::class, $instance );
	}

	/**
	 * Sanity-checks the shape and content of the deprecated code map:
	 * non-empty string keys and values, no MediaWiki-special codes, and no
	 * codes that are still considered valid.
	 */
	public function testGetDeprecatedCodeMapping(): void {
		$map = LanguageCode::getDeprecatedCodeMapping();

		$this->assertIsArray( $map );
		$this->assertContainsOnly( 'string', array_keys( $map ) );
		$this->assertArrayNotHasKey( '', $map );
		$this->assertContainsOnly( 'string', $map );
		$this->assertNotContains( '', $map );

		// Codes special to MediaWiki should never appear in a map of "deprecated" codes
		$this->assertArrayNotHasKey( 'qqq', $map, 'documentation' );
		$this->assertNotContains( 'qqq', $map, 'documentation' );
		$this->assertArrayNotHasKey( 'qqx', $map, 'debug code' );
		$this->assertNotContains( 'qqx', $map, 'debug code' );

		// Valid language codes that are currently not "deprecated"
		$this->assertArrayNotHasKey( 'bh', $map, 'family of Bihari languages' );
		$this->assertArrayNotHasKey( 'no', $map, 'family of Norwegian languages' );
		$this->assertArrayNotHasKey( 'simple', $map );
	}

	/**
	 * A deprecated code is replaced, a current code is returned unchanged,
	 * and null is passed through as null.
	 */
	public function testReplaceDeprecatedCodes(): void {
		// assertSame: the result must be exactly this string, not merely loosely equal.
		$this->assertSame( 'gsw', LanguageCode::replaceDeprecatedCodes( 'als' ) );
		$this->assertSame( 'gsw', LanguageCode::replaceDeprecatedCodes( 'gsw' ) );
		$this->assertNull( LanguageCode::replaceDeprecatedCodes( null ) );
	}

	/**
	 * Tests LanguageCode::bcp47().
	 *
	 * BCP 47 states that language tags are case-insensitive, so every
	 * provided code is checked three times: as given, fully lower-cased and
	 * fully upper-cased. All three inputs must produce the same expected,
	 * canonically-cased tag.
	 *
	 * @see https://tools.ietf.org/html/bcp47
	 * @dataProvider provideLanguageCodes
	 *
	 * @param string $code Input language code
	 * @param string $expected Expected BCP 47 formatting of $code
	 */
	public function testBcp47( string $code, string $expected ): void {
		$this->assertSame( $expected, LanguageCode::bcp47( $code ),
			"Applying BCP 47 standard to '$code'"
		);

		$code = strtolower( $code );
		$this->assertSame( $expected, LanguageCode::bcp47( $code ),
			"Applying BCP 47 standard to lower case '$code'"
		);

		$code = strtoupper( $code );
		$this->assertSame( $expected, LanguageCode::bcp47( $code ),
			"Applying BCP 47 standard to upper case '$code'"
		);
	}

	/**
	 * Data for testBcp47(). Array format is ($code, $expected).
	 *
	 * @return string[][]
	 */
	public static function provideLanguageCodes(): array {
		return [
			// Extracted from BCP 47 (list not exhaustive)
			# 2.1.1
			[ 'en-ca-x-ca', 'en-CA-x-ca' ],
			[ 'sgn-be-fr', 'sgn-BE-FR' ],
			[ 'az-latn-x-latn', 'az-Latn-x-latn' ],
			# 2.2
			[ 'sr-Latn-RS', 'sr-Latn-RS' ],
			[ 'az-arab-ir', 'az-Arab-IR' ],

			# 2.2.5
			[ 'sl-nedis', 'sl-nedis' ],
			[ 'de-ch-1996', 'de-CH-1996' ],

			# 2.2.6
			[
				'en-latn-gb-boont-r-extended-sequence-x-private',
				'en-Latn-GB-boont-r-extended-sequence-x-private'
			],

			// Examples from BCP 47 Appendix A
			# Simple language subtag:
			[ 'DE', 'de' ],
			[ 'fR', 'fr' ],
			[ 'ja', 'ja' ],

			# Language subtag plus script subtag:
			[ 'zh-hans', 'zh-Hans' ],
			[ 'sr-cyrl', 'sr-Cyrl' ],
			[ 'sr-latn', 'sr-Latn' ],

			# Extended language subtags and their primary language subtag
			# counterparts:
			[ 'zh-cmn-hans-cn', 'zh-cmn-Hans-CN' ],
			[ 'cmn-hans-cn', 'cmn-Hans-CN' ],
			[ 'zh-yue-hk', 'zh-yue-HK' ],
			[ 'yue-hk', 'yue-HK' ],

			# Language-Script-Region:
			[ 'zh-hans-cn', 'zh-Hans-CN' ],
			[ 'sr-latn-RS', 'sr-Latn-RS' ],

			# Language-Variant:
			[ 'sl-rozaj', 'sl-rozaj' ],
			[ 'sl-rozaj-biske', 'sl-rozaj-biske' ],
			[ 'sl-nedis', 'sl-nedis' ],

			# Language-Region-Variant:
			[ 'de-ch-1901', 'de-CH-1901' ],
			[ 'sl-it-nedis', 'sl-IT-nedis' ],

			# Language-Script-Region-Variant:
			[ 'hy-latn-it-arevela', 'hy-Latn-IT-arevela' ],

			# Language-Region:
			[ 'de-de', 'de-DE' ],
			[ 'en-us', 'en-US' ],
			[ 'es-419', 'es-419' ],

			# Private use subtags:
			[ 'de-ch-x-phonebk', 'de-CH-x-phonebk' ],
			[ 'az-arab-x-aze-derbend', 'az-Arab-x-aze-derbend' ],
			/**
			 * The previous case does not match the spelling used in BCP 47
			 * itself, which writes this example as:
			 *  az-Arab-x-AZE-derbend
			 * i.e. with the private-use subtag "AZE" in upper case, whereas
			 * the expectation above has it in lower case. The case above
			 * should therefore probably be:
			 * [ 'az-arab-x-aze-derbend', 'az-Arab-x-AZE-derbend' ],
			 */

			# Private use registry values:
			[ 'x-whatever', 'x-whatever' ],
			[ 'qaa-qaaa-qm-x-southern', 'qaa-Qaaa-QM-x-southern' ],
			[ 'de-qaaa', 'de-Qaaa' ],
			[ 'sr-latn-qm', 'sr-Latn-QM' ],
			[ 'sr-qaaa-rs', 'sr-Qaaa-RS' ],

			# Tags that use extensions
			[ 'en-us-u-islamcal', 'en-US-u-islamcal' ],
			[ 'zh-cn-a-myext-x-private', 'zh-CN-a-myext-x-private' ],
			[ 'en-a-myext-b-another', 'en-a-myext-b-another' ],

			# Invalid:
			// de-419-DE
			// a-DE
			// ar-a-aaa-b-bbb-a-ccc

			# Non-standard and deprecated language codes used by MediaWiki
			[ 'als', 'gsw' ],
			[ 'bat-smg', 'sgs' ],
			[ 'be-x-old', 'be-tarask' ],
			[ 'fiu-vro', 'vro' ],
			[ 'roa-rup', 'rup' ],
			[ 'zh-classical', 'lzh' ],
			[ 'zh-min-nan', 'nan' ],
			[ 'zh-yue', 'yue' ],
			[ 'cbk-zam', 'cbk' ],
			[ 'de-formal', 'de-x-formal' ],
			[ 'eml', 'egl' ],
			[ 'en-rtl', 'en-x-rtl' ],
			[ 'es-formal', 'es-x-formal' ],
			[ 'hu-formal', 'hu-x-formal' ],
			[ 'kk-Arab', 'kk-Arab' ],
			[ 'kk-Cyrl', 'kk-Cyrl' ],
			[ 'kk-Latn', 'kk-Latn' ],
			[ 'map-bms', 'jv-x-bms' ],
			[ 'mo', 'ro-Cyrl-MD' ],
			[ 'nrm', 'nrf' ],
			[ 'nl-informal', 'nl-x-informal' ],
			[ 'roa-tara', 'nap-x-tara' ],
			[ 'simple', 'en-simple' ],
			[ 'sr-ec', 'sr-Cyrl' ],
			[ 'sr-el', 'sr-Latn' ],
			[ 'zh-cn', 'zh-Hans-CN' ],
			[ 'zh-sg', 'zh-Hans-SG' ],
			[ 'zh-my', 'zh-Hans-MY' ],
			[ 'zh-tw', 'zh-Hant-TW' ],
			[ 'zh-hk', 'zh-Hant-HK' ],
			[ 'zh-mo', 'zh-Hant-MO' ],
			[ 'zh-hans', 'zh-Hans' ],
			[ 'zh-hant', 'zh-Hant' ],
		];
	}

	/**
	 * Round-trips every supported internal code through bcp47() and
	 * bcp47ToInternal(), and checks that the reverse conversion ignores the
	 * letter case of its input.
	 *
	 * @covers LanguageCode::bcp47
	 * @covers LanguageCode::bcp47ToInternal
	 * @dataProvider provideSupportedLanguageCodes
	 *
	 * @param string $internalCode MediaWiki-internal language code
	 */
	public function testBcp47ToInternal( string $internalCode ): void {
		if ( $internalCode === 'egl' ) {
			# 'egl' was added as an internal code prematurely; 'eml' hasn't
			# been added to the deprecated list yet (T36217) and so only
			# 'eml' is a "real" internal code.
			$internalCode = 'eml';
		}
		// Test that ::bcp47 and ::bcp47ToInternal are inverses.
		$bcp47 = LanguageCode::bcp47( $internalCode );
		$result = LanguageCode::bcp47ToInternal( $bcp47 );
		$this->assertSame( $internalCode, $result );
		// Verify case-insensitivity
		$result = LanguageCode::bcp47ToInternal( strtolower( $bcp47 ) );
		$this->assertSame( $internalCode, $result );
		$result = LanguageCode::bcp47ToInternal( strtoupper( $bcp47 ) );
		$this->assertSame( $internalCode, $result );
	}

	/**
	 * Data for testBcp47ToInternal(): yields one single-element dataset per
	 * code returned by LanguageNameUtils::getLanguageNames() for the
	 * AUTONYMS / SUPPORTED combination.
	 *
	 * The LanguageNameUtils instance is assembled by hand from fixed options
	 * and a HookContainer whose service container reports that it has no
	 * services (presumably because a static data provider in a unit test
	 * cannot rely on the global service container - TODO confirm).
	 *
	 * @return iterable<array{0:string}>
	 */
	public static function provideSupportedLanguageCodes(): iterable {
		$lnu = new LanguageNameUtils(
			new ServiceOptions(
				LanguageNameUtils::CONSTRUCTOR_OPTIONS,
				[
					MainConfigNames::ExtraLanguageNames => [],
					MainConfigNames::LanguageCode => 'en',
					MainConfigNames::UsePigLatinVariant => true,
				]
			),
			new HookContainer(
				new \MediaWiki\HookContainer\StaticHookRegistry(),
				new ObjectFactory( new class implements ContainerInterface {
					public function has( string $id ): bool {
						return false;
					}

					public function get( string $id ) {
						throw new NoSuchServiceException( $id );
					}
				} )
			)
		);
		$languages = $lnu->getLanguageNames(
			LanguageNameUtils::AUTONYMS, LanguageNameUtils::SUPPORTED
		);
		// Only the codes (array keys) are needed; the autonyms are ignored.
		foreach ( array_keys( $languages ) as $code ) {
			yield [ $code ];
		}
	}

	/**
	 * Positive test for LanguageCode::isWellFormedLanguageTag()
	 * @dataProvider provideWellFormedLanguageTags
	 * @covers LanguageCode::isWellFormedLanguageTag
	 *
	 * @param string $code Language tag expected to be well-formed
	 * @param string $message Description appended to the failure message
	 */
	public function testWellFormedLanguageTag( string $code, string $message = '' ): void {
		$this->assertTrue(
			LanguageCode::isWellFormedLanguageTag( $code ),
			"validating code $code $message"
		);
	}

	/**
	 * The test cases are based on the tests in the GaBuZoMeu parser
	 * written by Stéphane Bortzmeyer <bortzmeyer@nic.fr>
	 * and distributed as free software, under the GNU General Public Licence.
	 * https://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html
	 *
	 * Array format is ($code, $message).
	 *
	 * @return string[][]
	 */
	public static function provideWellFormedLanguageTags(): array {
		return [
			[ 'fr', 'two-letter code' ],
			[ 'fr-latn', 'two-letter code with lower case script code' ],
			[ 'fr-Latn-FR', 'two-letter code with title case script code and uppercase country code' ],
			[ 'fr-Latn-419', 'two-letter code with title case script code and region number' ],
			[ 'fr-FR', 'two-letter code with uppercase' ],
			[ 'ax-TZ', 'Not in the registry, but well-formed' ],
			[ 'fr-shadok', 'two-letter code with variant' ],
			[ 'fr-y-myext-myext2', 'non-x singleton' ],
			[ 'fra-Latn', 'ISO 639 can be 3-letters' ],
			[ 'fra', 'three-letter language code' ],
			[ 'fra-FX', 'three-letter language code with country code' ],
			[ 'i-klingon', 'grandfathered with singleton' ],
			[ 'I-kLINgon', 'tags are case-insensitive...' ],
			[ 'no-bok', 'grandfathered without singleton' ],
			[ 'i-enochian', 'Grandfathered' ],
			[ 'x-fr-CH', 'private use' ],
			[ 'es-419', 'two-letter code with region number' ],
			[ 'en-Latn-GB-boont-r-extended-sequence-x-private', 'weird, but well-formed' ],
			[ 'ab-x-abc-x-abc', 'anything goes after x' ],
			[ 'ab-x-abc-a-a', 'anything goes after x, including several non-x singletons' ],
			[ 'i-default', 'grandfathered' ],
			[ 'abcd-Latn', 'Language of 4 chars reserved for future use' ],
			[ 'AaBbCcDd-x-y-any-x', 'Language of 5-8 chars, registered' ],
			[ 'de-CH-1901', 'with country and year' ],
			[ 'en-US-x-twain', 'with country and singleton' ],
			[ 'zh-cmn', 'three-letter variant' ],
			[ 'zh-cmn-Hant', 'three-letter variant and script' ],
			[ 'zh-cmn-Hant-HK', 'three-letter variant, script and country' ],
			[ 'xr-p-lze', 'Extension' ],
		];
	}

	/**
	 * Negative test for LanguageCode::isWellFormedLanguageTag()
	 * @dataProvider provideMalformedLanguageTags
	 * @covers LanguageCode::isWellFormedLanguageTag
	 *
	 * @param string $code Language tag expected to be rejected
	 * @param string $message Description appended to the failure message
	 */
	public function testMalformedLanguageTag( string $code, string $message = '' ): void {
		$this->assertFalse(
			LanguageCode::isWellFormedLanguageTag( $code ),
			"validating that code $code is a malformed language tag - $message"
		);
	}

	/**
	 * The test cases are based on the tests in the GaBuZoMeu parser
	 * written by Stéphane Bortzmeyer <bortzmeyer@nic.fr>
	 * and distributed as free software, under the GNU General Public Licence.
	 * https://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html
	 *
	 * Array format is ($code, $message).
	 *
	 * @return string[][]
	 */
	public static function provideMalformedLanguageTags(): array {
		return [
			[ 'f', 'language too short' ],
			[ 'f-Latn', 'language too short with script' ],
			[ 'xr-lxs-qut', 'variants too short' ], # extlangS
			[ 'fr-Latn-F', 'region too short' ],
			[ 'a-value', 'language too short with region' ],
			[ 'tlh-a-b-foo', 'valid three-letter with wrong variant' ],
			[
				'i-notexist',
				'grandfathered but not registered: invalid, even if we only test well-formedness'
			],
			[ 'abcdefghi-012345678', 'numbers too long' ],
			[ 'ab-abc-abc-abc-abc', 'invalid extensions' ],
			[ 'ab-abcd-abc', 'invalid extensions' ],
			[ 'ab-ab-abc', 'invalid extensions' ],
			[ 'ab-123-abc', 'invalid extensions' ],
			[ 'a-Hant-ZH', 'short language with valid extensions' ],
			[ 'a1-Hant-ZH', 'invalid character in language' ],
			[ 'ab-abcde-abc', 'invalid extensions' ],
			[ 'ab-1abc-abc', 'invalid characters in extensions' ],
			[ 'ab-ab-abcd', 'invalid order of extensions' ],
			[ 'ab-123-abcd', 'invalid order of extensions' ],
			[ 'ab-abcde-abcd', 'invalid extensions' ],
			[ 'ab-1abc-abcd', 'invalid characters in extensions' ],
			[ 'ab-a-b', 'extensions too short' ],
			[ 'ab-a-x', 'extensions too short, even with singleton' ],
			[ 'ab--ab', 'two separators' ],
			[ 'ab-abc-', 'separator in the end' ],
			[ '-ab-abc', 'separator in the beginning' ],
			[ 'abcd-efg', 'language too long' ],
			[ 'aabbccddE', 'tag too long' ],
			[ 'pa_guru', 'A tag with underscore is invalid in strict mode' ],
			[ 'de-f', 'subtag too short' ],
			[ 'zh-classical', 'internal language code zh-classical is not a well-formed language tag' ],
		];
	}

	/**
	 * Positive test for the lenient mode of
	 * LanguageCode::isWellFormedLanguageTag(): a tag using an underscore as
	 * separator, which provideMalformedLanguageTags() lists as invalid in
	 * strict mode, is accepted when the second argument is true.
	 *
	 * @covers LanguageCode::isWellFormedLanguageTag
	 */
	public function testLenientLanguageTag(): void {
		$this->assertTrue(
			LanguageCode::isWellFormedLanguageTag( 'pa_guru', true ),
			'pa_guru is a well-formed language tag in lenient mode'
		);
	}

}