MediaWiki uses a number of nonstandard codes which do not validate according to the IANA language subtag registry. Some of them have the wrong semantics entirely: MediaWiki's `sr-ec` variant maps to BCP 47 `sr-EC` which is "Serbian as used in Ecuador" (!). Extend LanguageCode::bcp47() to map our nonstandard codes to valid BCP 47 language codes. Export the mapping so that it can be used in JavaScript's corresponding mw.language.bcp47() implementation as well, and return the standard BCP 47 codes in the siteinfo API. Thanks to TheDJ (I10b4473c7e53f027812bbccf26bb47aec15fddfd) and Fomafix (I93efc190714ba76247d30ba49fc21ae872fc3555) for previous attempts at this! Also removed a fixme for the name of 'Twi', dating back to 2004 (f59c3be23b) -- checking tw.wikipedia.org it certainly appears that the autonym of 'Twi' is correctly 'Twi'. Tracking bugs for invalid language codes are T125073 and T145535. Discussion of zh-XX => zh-HanX-XX mapping is at T198419. This is a replay of an earlier merged patch, 8380f0173e, which had to be reverted because it caused regressions in the Babel extension (T199941). Bug: T34483 Bug: T106367 Bug: T120847 Depends-On: I27a5b8e45b34c6b57c1b612b11548001c88cd483 Change-Id: Iebbc604af21d7f2af9c1f1ab2574cb5f309bf6ed
200 lines
5.7 KiB
PHP
200 lines
5.7 KiB
PHP
<?php
|
|
|
|
/**
 * Unit tests for the static helpers of LanguageCode:
 * getDeprecatedCodeMapping(), replaceDeprecatedCodes() and bcp47().
 *
 * @covers LanguageCode
 * @group Language
 *
 * @author Thiemo Kreuz
 */
class LanguageCodeTest extends PHPUnit\Framework\TestCase {

	use MediaWikiCoversValidator;

	/**
	 * LanguageCode can be instantiated without constructor arguments.
	 */
	public function testConstructor() {
		$instance = new LanguageCode();

		$this->assertInstanceOf( LanguageCode::class, $instance );
	}

	/**
	 * The deprecated-code map is an array of non-empty string keys and
	 * non-empty string values, and it must not mention MediaWiki's special
	 * codes or codes that are still considered valid.
	 */
	public function testGetDeprecatedCodeMapping() {
		$map = LanguageCode::getDeprecatedCodeMapping();

		// NOTE(review): assertInternalType() is deprecated in newer PHPUnit
		// releases in favour of assertIsArray() (PHPUnit >= 7.5); switch once
		// the minimum supported PHPUnit version allows it.
		$this->assertInternalType( 'array', $map );
		$this->assertContainsOnly( 'string', array_keys( $map ) );
		$this->assertArrayNotHasKey( '', $map );
		$this->assertContainsOnly( 'string', $map );
		$this->assertNotContains( '', $map );

		// Codes special to MediaWiki should never appear in a map of "deprecated" codes,
		// neither as a source (key) nor as a replacement (value).
		$this->assertArrayNotHasKey( 'qqq', $map, 'documentation' );
		$this->assertNotContains( 'qqq', $map, 'documentation' );
		$this->assertArrayNotHasKey( 'qqx', $map, 'debug code' );
		$this->assertNotContains( 'qqx', $map, 'debug code' );

		// Valid language codes that are currently not "deprecated"
		$this->assertArrayNotHasKey( 'bh', $map, 'family of Bihari languages' );
		$this->assertArrayNotHasKey( 'no', $map, 'family of Norwegian languages' );
		$this->assertArrayNotHasKey( 'simple', $map );
	}

	/**
	 * A deprecated code is replaced, a current code is returned unchanged,
	 * and null is passed through as null.
	 */
	public function testReplaceDeprecatedCodes() {
		// Strict assertions: a loose comparison would also accept e.g. '' or false for null.
		$this->assertSame( 'gsw', LanguageCode::replaceDeprecatedCodes( 'als' ) );
		$this->assertSame( 'gsw', LanguageCode::replaceDeprecatedCodes( 'gsw' ) );
		$this->assertNull( LanguageCode::replaceDeprecatedCodes( null ) );
	}

	/**
	 * Test @see LanguageCode::bcp47().
	 *
	 * BCP 47 states that language tags are case-insensitive (the original
	 * author notes there are some exceptions to the rule). Every data set is
	 * therefore checked three times: as given, converted to all lower case
	 * and converted to all upper case. All three must produce $expected.
	 *
	 * @see https://tools.ietf.org/html/bcp47
	 * @dataProvider provideLanguageCodes
	 *
	 * @param string $code Language code passed to LanguageCode::bcp47()
	 * @param string $expected Expected BCP 47 formatted result
	 */
	public function testBcp47( $code, $expected ) {
		$this->assertSame( $expected, LanguageCode::bcp47( $code ),
			"Applying BCP 47 standard to '$code'"
		);

		$lowerCode = strtolower( $code );
		$this->assertSame( $expected, LanguageCode::bcp47( $lowerCode ),
			"Applying BCP 47 standard to lower case '$lowerCode'"
		);

		$upperCode = strtoupper( $code );
		$this->assertSame( $expected, LanguageCode::bcp47( $upperCode ),
			"Applying BCP 47 standard to upper case '$upperCode'"
		);
	}

	/**
	 * Data provider for testBcp47().
	 *
	 * @return array[] List of data sets, each in the format [ $code, $expected ]
	 */
	public static function provideLanguageCodes() {
		return [
			// Extracted from BCP 47 (list not exhaustive)
			// 2.1.1
			[ 'en-ca-x-ca', 'en-CA-x-ca' ],
			[ 'sgn-be-fr', 'sgn-BE-FR' ],
			[ 'az-latn-x-latn', 'az-Latn-x-latn' ],
			// 2.2
			[ 'sr-Latn-RS', 'sr-Latn-RS' ],
			[ 'az-arab-ir', 'az-Arab-IR' ],

			// 2.2.5
			[ 'sl-nedis', 'sl-nedis' ],
			[ 'de-ch-1996', 'de-CH-1996' ],

			// 2.2.6
			[
				'en-latn-gb-boont-r-extended-sequence-x-private',
				'en-Latn-GB-boont-r-extended-sequence-x-private'
			],

			// Examples from BCP 47 Appendix A
			// Simple language subtag:
			[ 'DE', 'de' ],
			[ 'fR', 'fr' ],
			[ 'ja', 'ja' ],

			// Language subtag plus script subtag:
			[ 'zh-hans', 'zh-Hans' ],
			[ 'sr-cyrl', 'sr-Cyrl' ],
			[ 'sr-latn', 'sr-Latn' ],

			// Extended language subtags and their primary language subtag
			// counterparts:
			[ 'zh-cmn-hans-cn', 'zh-cmn-Hans-CN' ],
			[ 'cmn-hans-cn', 'cmn-Hans-CN' ],
			[ 'zh-yue-hk', 'zh-yue-HK' ],
			[ 'yue-hk', 'yue-HK' ],

			// Language-Script-Region:
			[ 'zh-hans-cn', 'zh-Hans-CN' ],
			[ 'sr-latn-RS', 'sr-Latn-RS' ],

			// Language-Variant:
			[ 'sl-rozaj', 'sl-rozaj' ],
			[ 'sl-rozaj-biske', 'sl-rozaj-biske' ],
			[ 'sl-nedis', 'sl-nedis' ],

			// Language-Region-Variant:
			[ 'de-ch-1901', 'de-CH-1901' ],
			[ 'sl-it-nedis', 'sl-IT-nedis' ],

			// Language-Script-Region-Variant:
			[ 'hy-latn-it-arevela', 'hy-Latn-IT-arevela' ],

			// Language-Region:
			[ 'de-de', 'de-DE' ],
			[ 'en-us', 'en-US' ],
			[ 'es-419', 'es-419' ],

			// Private use subtags:
			[ 'de-ch-x-phonebk', 'de-CH-x-phonebk' ],
			[ 'az-arab-x-aze-derbend', 'az-Arab-x-aze-derbend' ],
			// NOTE(review): the data set above does not match the spelling used in
			// BCP 47 itself, which writes this example as
			//   az-Arab-x-AZE-derbend
			// The original author suspected the expectation should therefore be
			//   [ 'az-arab-x-aze-derbend', 'az-Arab-x-AZE-derbend' ],
			// As it stands, this test expects the private use subtags in lower case.

			// Private use registry values:
			[ 'x-whatever', 'x-whatever' ],
			[ 'qaa-qaaa-qm-x-southern', 'qaa-Qaaa-QM-x-southern' ],
			[ 'de-qaaa', 'de-Qaaa' ],
			[ 'sr-latn-qm', 'sr-Latn-QM' ],
			[ 'sr-qaaa-rs', 'sr-Qaaa-RS' ],

			// Tags that use extensions
			[ 'en-us-u-islamcal', 'en-US-u-islamcal' ],
			[ 'zh-cn-a-myext-x-private', 'zh-CN-a-myext-x-private' ],
			[ 'en-a-myext-b-another', 'en-a-myext-b-another' ],

			// Invalid (listed for reference only, not tested):
			// de-419-DE
			// a-DE
			// ar-a-aaa-b-bbb-a-ccc

			// Non-standard and deprecated language codes used by MediaWiki
			[ 'als', 'gsw' ],
			[ 'bat-smg', 'sgs' ],
			[ 'be-x-old', 'be-tarask' ],
			[ 'fiu-vro', 'vro' ],
			[ 'roa-rup', 'rup' ],
			[ 'zh-classical', 'lzh' ],
			[ 'zh-min-nan', 'nan' ],
			[ 'zh-yue', 'yue' ],
			[ 'cbk-zam', 'cbk' ],
			[ 'de-formal', 'de-x-formal' ],
			[ 'eml', 'egl' ],
			[ 'en-rtl', 'en-x-rtl' ],
			[ 'es-formal', 'es-x-formal' ],
			[ 'hu-formal', 'hu-x-formal' ],
			[ 'kk-Arab', 'kk-Arab' ],
			[ 'kk-Cyrl', 'kk-Cyrl' ],
			[ 'kk-Latn', 'kk-Latn' ],
			[ 'map-bms', 'jv-x-bms' ],
			[ 'mo', 'ro-Cyrl-MD' ],
			[ 'nrm', 'nrf' ],
			[ 'nl-informal', 'nl-x-informal' ],
			[ 'roa-tara', 'nap-x-tara' ],
			[ 'simple', 'en-simple' ],
			[ 'sr-ec', 'sr-Cyrl' ],
			[ 'sr-el', 'sr-Latn' ],
			[ 'zh-cn', 'zh-Hans-CN' ],
			[ 'zh-sg', 'zh-Hans-SG' ],
			[ 'zh-my', 'zh-Hans-MY' ],
			[ 'zh-tw', 'zh-Hant-TW' ],
			[ 'zh-hk', 'zh-Hant-HK' ],
			[ 'zh-mo', 'zh-Hant-MO' ],
			[ 'zh-hans', 'zh-Hans' ],
			[ 'zh-hant', 'zh-Hant' ],
		];
	}

}
|