wiki.techinc.nl/includes/collation/IcuCollation.php

<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

use MediaWiki\Languages\LanguageFactory;

/**
 * @since 1.16.3
 */
class IcuCollation extends Collation {
	/**
	 * Version number mixed into the first-letter cache key. Changing it makes
	 * previously cached first-letter data unreachable, forcing a rebuild.
	 */
	private const FIRST_LETTER_VERSION = 4;

	/**
	 * Collator for the locale with its strength set to Collator::PRIMARY;
	 * used for first-letter (heading) lookups.
	 *
	 * @var Collator
	 */
	private $primaryCollator;

	/**
	 * Collator for the locale at its default strength; produces the sort keys
	 * returned by getSortKey().
	 *
	 * @var Collator
	 */
	private $mainCollator;

	/**
	 * ICU locale name as passed to the constructor, minus a trailing '-u-kn'
	 * numeric-collation suffix if one was present.
	 *
	 * @var string
	 */
	private $locale;

	/**
	 * Language object whose formatNumNoSeparators() supplies the digit
	 * characters added to the first-letter list.
	 *
	 * @var Language
	 */
	protected $digitTransformLanguage;

	/**
	 * True when the locale given to the constructor ended in '-u-kn'.
	 *
	 * @var bool
	 */
	private $useNumericCollation = false;

	/**
	 * Lazily loaded first-letter data with the keys 'chars' and 'keys';
	 * null until getFirstLetterData() has been called.
	 *
	 * @var array|null
	 */
	private $firstLetterData;

	/**
	 * Unified CJK blocks.
	 *
	 * Each entry is a pair [ first code point, last code point ]; both ends
	 * are inclusive. isCjk() checks a code point against every pair.
	 *
	 * The same definition of a CJK block must be used for both Collation and
	 * generateCollationData.php. These blocks are omitted from the first
	 * letter data, as an optimisation measure and because the default UCA table
	 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
	 * blocks are not included here, because they are smaller and more useful.
	 */
	private const CJK_BLOCKS = [
		[ 0x2E80, 0x2EFF ], // CJK Radicals Supplement
		[ 0x2F00, 0x2FDF ], // Kangxi Radicals
		[ 0x2FF0, 0x2FFF ], // Ideographic Description Characters
		[ 0x3000, 0x303F ], // CJK Symbols and Punctuation
		[ 0x31C0, 0x31EF ], // CJK Strokes
		[ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months
		[ 0x3300, 0x33FF ], // CJK Compatibility
		[ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A
		[ 0x4E00, 0x9FFF ], // CJK Unified Ideographs
		[ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs
		[ 0xFE30, 0xFE4F ], // CJK Compatibility Forms
		[ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B
		[ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C
		[ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D
		[ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement
	];

	/**
	 * Additional characters (or character groups) to be considered separate
	 * letters for given languages, or to be removed from the list of such
	 * letters (denoted by keys starting with '-').
	 *
	 * These are additions to (or subtractions from) the data stored in the
	 * first-letters-root.php data file (which among others includes full basic Latin,
	 * Cyrillic and Greek alphabets).
	 *
	 * "Separate letter" is a letter that would have a separate heading/section
	 * for it in a dictionary or a phone book in this language. This data isn't
	 * used for sorting (the ICU library handles that), only for deciding which
	 * characters (or character groups) to use as headings.
	 *
	 * Initially generated based on the primary level of Unicode collation
	 * tailorings available at http://developer.mimer.com/charts/tailorings.htm ,
	 * later modified.
	 *
	 * Empty arrays are intended; this signifies that the data for the language is
	 * available and that there are, in fact, no additional letters to consider.
	 */
	private const TAILORING_FIRST_LETTERS = [
		'af' => [],
		'am' => [],
		'ar' => [],
		'as' => [ "\u{0982}", "\u{0981}", "\u{0983}", "\u{09CE}", "ক্ষ " ],
		'ast' => [ "Ch", "Ll", "Ñ" ], // not in libicu
		'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ],
		'be' => [ "Ё" ],
		'be-tarask' => [ "Ё" ],
		'bg' => [],
		'bn' => [ 'ং', 'ঃ', 'ঁ' ],
		'bn@collation=traditional' => [
			'ং', 'ঃ', 'ঁ', 'ক্', 'খ্', 'গ্', 'ঘ্', 'ঙ্', 'চ্', 'ছ্', 'জ্', 'ঝ্',
			'ঞ্', 'ট্', 'ঠ্', 'ড্', 'ঢ্', 'ণ্', 'ৎ', 'থ্', 'দ্', 'ধ্', 'ন্', 'প্',
			'ফ্', 'ব্', 'ভ্', 'ম্', 'য্', 'র্', 'ৰ্', 'ল্', 'ৱ্', 'শ্', 'ষ্', 'স্', 'হ্'
		],
		'bo' => [],
		'br' => [ "Ch", "C'h" ],
		'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
		'bs-Cyrl' => [],
		'ca' => [],
		'chr' => [],
		'co' => [], // not in libicu
		'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ],
		'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ],
		'da' => [ "Æ", "Ø", "Å" ],
		'de' => [],
		'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ],
		'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ],
		'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ],
		'el' => [],
		'en' => [],
		'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ],
		'es' => [ "Ñ" ],
		'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü" ],
		'eu' => [ "Ñ" ], // not in libicu
		'fa' => [
			// RTL, let's put each letter on a new line
			"آ",
			"ء",
			"ه",
			"ا",
			"و"
		],
		'fi' => [ "Å", "Ä", "Ö" ],
		'fil' => [ "Ñ", "Ng" ],
		'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ],
		'fr' => [],
		'fr-CA' => [], // fr-CA sorts accents slightly different from fr.
		'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu
		'fy' => [], // not in libicu
		'ga' => [],
		'gd' => [], // not in libicu
		'gl' => [ "Ch", "Ll", "Ñ" ],
		'gu' => [ "\u{0A82}", "\u{0A83}", "\u{0A81}", "\u{0AB3}" ],
		'ha' => [ 'Ɓ', 'Ɗ', 'Ƙ', 'Sh', 'Ts', 'Ƴ' ],
		'haw' => [ 'ʻ' ],
		'he' => [],
		'hi' => [ "\u{0902}", "\u{0903}" ],
		'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
		'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ],
		'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ],
		'hy' => [ "և" ],
		'id' => [],
		'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ],
		'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ],
		'it' => [],
		'ka' => [],
		'kk' => [ "Ү", "І" ],
		'kl' => [ "Æ", "Ø", "Å" ],
		'km' => [
			"រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\u{17BB}\u{17C6}",
			"\u{17C6}", "\u{17B6}\u{17C6}", "\u{17C7}",
			"\u{17B7}\u{17C7}", "\u{17BB}\u{17C7}",
			"\u{17C1}\u{17C7}", "\u{17C4}\u{17C7}",
		],
		'kn' => [ "\u{0C81}", "\u{0C83}", "\u{0CF1}", "\u{0CF2}" ],
		'kok' => [ "\u{0902}", "\u{0903}", "ळ", "क्ष" ],
		'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // not in libicu
		'ky' => [ "Ё" ],
		'la' => [], // not in libicu
		'lb' => [],
		'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ],
		'ln' => [ 'Ɛ' ],
		'lo' => [],
		'lt' => [ "Č", "Š", "Ž" ],
		'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ],
		'mk' => [ "Ѓ", "Ќ" ],
		'ml' => [],
		'mn' => [],
		'mo' => [ "Ă", "Â", "Î", "Ș", "Ț" ], // not in libicu
		'mr' => [ "\u{0902}", "\u{0903}", "ळ", "क्ष", "ज्ञ" ],
		'ms' => [],
		'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ],
		'nb' => [ "Æ", "Ø", "Å" ],
		'ne' => [],
		'nl' => [],
		'nn' => [ "Æ", "Ø", "Å" ],
		'no' => [ "Æ", "Ø", "Å" ], // not in libicu. You should probably use nb or nn instead.
		'oc' => [], // not in libicu
		'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ],
		'or' => [ "\u{0B01}", "\u{0B02}", "\u{0B03}", "କ୍ଷ" ],
		'pa' => [ "\u{0A4D}" ],
		'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ],
		'pt' => [],
		'rm' => [], // not in libicu
		'ro' => [ "Ă", "Â", "Î", "Ș", "Ț" ],
		'ru' => [],
		'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ș", "Ț" ], // not in libicu
		'sco' => [],
		'se' => [
			'Á', 'Č', 'Ʒ', 'Ǯ', 'Đ', 'Ǧ', 'Ǥ', 'Ǩ', 'Ŋ',
			'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö'
		],
		'si' => [ "\u{0D82}", "\u{0D83}", "\u{0DA4}" ],
		'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ],
		'sl' => [ "Č", "Š", "Ž" ],
		'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ],
		'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ],
		'sr' => [],
		'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
		'sv' => [ "Å", "Ä", "Ö" ],
		'sv@collation=standard' => [ "Å", "Ä", "Ö" ],
		'sw' => [],
		'ta' => [
			"\u{0B82}", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்",
			"ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்",
			"ஸ்", "ஹ்", "க்ஷ்"
		],
		'te' => [ "\u{0C01}", "\u{0C02}", "\u{0C03}" ],
		'th' => [ "ฯ", "\u{0E46}", "\u{0E4D}", "\u{0E3A}" ],
		'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ],
		'tl' => [ "Ñ", "Ng" ], // not in libicu
		'to' => [ "Ng", "ʻ" ],
		'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ],
		'-tr' => [ "ı" ],
		'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu
		'uk' => [ "Ґ", "Ь" ],
		'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu
		'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ],
		'vo' => [ "Ä", "Ö", "Ü" ],
		'yi' => [
			"\u{05D1}\u{05BF}", "\u{05DB}\u{05BC}", "\u{05E4}\u{05BC}",
			"\u{05E9}\u{05C2}", "\u{05EA}\u{05BC}"
		],
		'yo' => [ "Ẹ", "Gb", "Ọ", "Ṣ" ],
		'zu' => [],
	];

	/**
	 * Set up the main and primary-strength collators for an ICU locale.
	 *
	 * A locale ending in '-u-kn' switches on numeric collation for both
	 * collators; the suffix is then removed from the stored locale name.
	 *
	 * @param LanguageFactory $languageFactory Used to obtain the language
	 *  whose digits are added to the first-letter list
	 * @param string $locale ICU locale name, optionally with an '@keyword'
	 *  section and/or a trailing '-u-kn'
	 * @throws MWException If ICU cannot create a collator for the locale
	 */
	public function __construct(
		LanguageFactory $languageFactory,
		$locale
	) {
		$this->locale = $locale;

		// The digit-transform language is named by whatever precedes the
		// first '@' in the locale; the 'root' locale uses English.
		if ( $locale === 'root' ) {
			$languageCode = 'en';
		} else {
			$languageCode = explode( '@', $locale )[0];
		}
		$this->digitTransformLanguage = $languageFactory->getLanguage( $languageCode );

		$fullStrength = Collator::create( $locale );
		if ( !$fullStrength ) {
			throw new MWException( "Invalid ICU locale specified for collation: $locale" );
		}
		$this->mainCollator = $fullStrength;

		// Second collator for the same locale, restricted to primary differences.
		$primaryOnly = Collator::create( $locale );
		$primaryOnly->setStrength( Collator::PRIMARY );
		$this->primaryCollator = $primaryOnly;

		// Without the special numeric-collation suffix there is nothing more to do.
		if ( substr( $locale, -5 ) !== '-u-kn' ) {
			return;
		}

		$this->useNumericCollation = true;
		// Strip off the special suffix so it doesn't trip up fetchFirstLetterData().
		$this->locale = substr( $this->locale, 0, -5 );
		foreach ( [ $this->mainCollator, $this->primaryCollator ] as $collator ) {
			$collator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
		}
	}

	/**
	 * Get the sort key of a string from the main (default strength) collator.
	 *
	 * @param string $string
	 * @return string|false Whatever Collator::getSortKey() returns
	 */
	public function getSortKey( $string ) {
		$collator = $this->mainCollator;
		$sortKey = $collator->getSortKey( $string );
		return $sortKey;
	}

	/**
	 * Get the sort key of a string from the primary-strength collator.
	 *
	 * @param string $string
	 * @return string|false Whatever Collator::getSortKey() returns
	 */
	public function getPrimarySortKey( $string ) {
		$collator = $this->primaryCollator;
		$primaryKey = $collator->getSortKey( $string );
		return $primaryKey;
	}

	/**
	 * Determine the heading ("first letter") a string should be listed under.
	 *
	 * @param string $string
	 * @return string The heading; '' for an empty string or a string that
	 *  sorts before every known first letter
	 */
	public function getFirstLetter( $string ) {
		$text = strval( $string );
		if ( $text === '' ) {
			return '';
		}

		// A leading CJK character is its own heading. Only a multi-byte
		// character (lead byte above 0x7f) needs the code point check.
		$initial = mb_substr( $text, 0, 1, 'UTF-8' );
		if ( ord( $initial ) > 0x7f ) {
			$codepoint = UtfNormal\Utils::utf8ToCodepoint( $initial );
			if ( self::isCjk( $codepoint ) ) {
				return $initial;
			}
		}

		$primaryKey = $this->getPrimarySortKey( $text );

		// Binary search over the sorted first-letter keys for the letter
		// this string sorts under.
		$position = ArrayUtils::findLowerBound(
			[ $this, 'getSortKeyByLetterIndex' ],
			$this->getFirstLetterCount(),
			'strcmp',
			$primaryKey
		);
		if ( $position === false ) {
			// Sorts before the first letter
			return '';
		}

		$heading = $this->getLetterByIndex( $position );
		if ( !$this->useNumericCollation ) {
			return $heading;
		}

		// With numeric collation, a heading that starts with an ASCII digit
		// (byte values 48 to 57) is replaced by '0–9' (or its localized
		// equivalent). This also covers non-Arabic numerals, since they are
		// mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
		$leadByte = ord( $heading );
		$startsWithDigit = $leadByte >= 48 && $leadByte <= 57;

		return $startsWithDigit
			? wfMessage( 'category-header-numerals' )->numParams( 0, 9 )->text()
			: $heading;
	}

	/**
	 * Get the first-letter data, loading it on first use.
	 *
	 * The data is read from the local server cache, or built with
	 * fetchFirstLetterData() and stored there for a week. The cache key
	 * depends on the class, the locale, the digit transform language code,
	 * the ICU version and FIRST_LETTER_VERSION.
	 *
	 * @since 1.16.3
	 * @return array Map with the keys 'chars' and 'keys'
	 */
	public function getFirstLetterData() {
		if ( $this->firstLetterData !== null ) {
			// Already loaded for this object
			return $this->firstLetterData;
		}

		$localCache = ObjectCache::getLocalServerInstance( CACHE_ANYTHING );
		$key = $localCache->makeKey(
			'first-letters',
			static::class,
			$this->locale,
			$this->digitTransformLanguage->getCode(),
			INTL_ICU_VERSION,
			self::FIRST_LETTER_VERSION
		);
		$builder = function () {
			return $this->fetchFirstLetterData();
		};
		$this->firstLetterData = $localCache->getWithSetCallback( $key, $localCache::TTL_WEEK, $builder );

		return $this->firstLetterData;
	}

	/**
	 * Build the first-letter (section heading) data for the current locale.
	 *
	 * The candidate letters come from one of three places:
	 * - for a locale listed in TAILORING_FIRST_LETTERS: the root first-letters
	 *   file, plus the locale's additions, minus the locale's '-' removals,
	 *   with the ASCII digits replaced by what
	 *   $digitTransformLanguage->formatNumNoSeparators() returns for them;
	 * - for the 'root' locale: the root first-letters file as is;
	 * - otherwise: a precompiled "first-letters-<locale>.ser" file.
	 *
	 * The letters are then indexed by primary sort key, sorted by that key,
	 * and stripped of primary collisions and of expansions.
	 *
	 * @return array Map with two parallel lists, 'chars' (the letters) and
	 *  'keys' (their primary sort keys), both in sort key order
	 * @throws MWException If no first-letter data exists for the locale
	 */
	private function fetchFirstLetterData() {
		global $IP;
		// Locale with tailoring data: start from the root letter list
		if ( isset( self::TAILORING_FIRST_LETTERS[$this->locale] ) ) {
			$letters = require "$IP/includes/collation/data/first-letters-root.php";
			// Append additional characters
			$letters = array_merge( $letters, self::TAILORING_FIRST_LETTERS[$this->locale] );
			// Remove unnecessary ones, if any (entries stored under the '-' prefixed key)
			if ( isset( self::TAILORING_FIRST_LETTERS['-' . $this->locale] ) ) {
				$letters = array_diff( $letters, self::TAILORING_FIRST_LETTERS['-' . $this->locale] );
			}
			// Apply digit transforms: drop the ASCII digits and add the
			// digit transform language's rendering of each one instead
			$digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
			$letters = array_diff( $letters, $digits );
			foreach ( $digits as $digit ) {
				$letters[] = $this->digitTransformLanguage->formatNumNoSeparators( $digit );
			}
		} elseif ( $this->locale === 'root' ) {
			$letters = require "$IP/includes/collation/data/first-letters-root.php";
		} else {
			// FIXME: Is this still used?
			$letters = $this->getPrecompiledData( "first-letters-{$this->locale}.ser" );
			if ( $letters === false ) {
				throw new MWException( "MediaWiki does not support ICU locale " .
					"\"{$this->locale}\"" );
			}
		}

		/* Sort the letters.
		 *
		 * It's impossible to have the precompiled data file properly sorted,
		 * because the sort order changes depending on ICU version. If the
		 * array is not properly sorted, the binary search will return random
		 * results.
		 *
		 * We also take this opportunity to remove primary collisions.
		 */
		// Maps primary sort key => the letter chosen to represent that key
		$letterMap = [];
		foreach ( $letters as $letter ) {
			$key = $this->getPrimarySortKey( $letter );
			if ( isset( $letterMap[$key] ) ) {
				// Primary collision (two characters with the same sort position).
				// Keep whichever one sorts first in the main collator.
				$comp = $this->mainCollator->compare( $letter, $letterMap[$key] );
				wfDebug( "Primary collision '$letter' '{$letterMap[$key]}' (comparison: $comp)" );
				// If that also has a collision, use codepoint as a tiebreaker.
				if ( $comp === 0 ) {
					$comp = UtfNormal\Utils::utf8ToCodepoint( $letter ) <=>
						UtfNormal\Utils::utf8ToCodepoint( $letterMap[$key] );
				}
				// Replace the stored letter only if the new one sorts strictly first
				if ( $comp < 0 ) {
					$letterMap[$key] = $letter;
				}
			} else {
				$letterMap[$key] = $letter;
			}
		}
		// Order the map by sort key, compared as strings
		ksort( $letterMap, SORT_STRING );

		/* Remove duplicate prefixes. Basically if something has a sortkey
		 * which is a prefix of some other sortkey, then it is an
		 * expansion and probably should not be considered a section
		 * header.
		 *
		 * For example 'þ' is sometimes sorted as if it is the letters
		 * 'th'. Other times it is its own primary element. Another
		 * example is '₨'. Sometimes it's a currency symbol. Sometimes it
		 * is an 'R' followed by an 's'.
		 *
		 * Additionally an expanded element should always sort directly
		 * after its first element due to the way sortkeys work.
		 *
		 * UCA sortkey elements are of variable length but no collation
		 * element should be a prefix of some other element, so I think
		 * this is safe. See:
		 * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
		 * - https://icu.unicode.org/design/collation/uca-weight-allocation
		 *
		 * Additionally, there is something called primary compression to
		 * worry about. Basically, if you have two primary elements that
		 * are more than one byte and both start with the same byte then
		 * the first byte is dropped on the second primary. Additionally
		 * either \x03 or \xFF may be added to mean that the next primary
		 * does not start with the first byte of the first primary.
		 *
		 * This shouldn't matter much, as the first primary is not
		 * changed, and that is what we are comparing against.
		 *
		 * tl;dr: This makes some assumptions about how ICU implements
		 * collations. It seems incredibly unlikely these assumptions
		 * will change, but nonetheless they are assumptions.
		 */

		// $prev holds the trimmed key of the most recent non-expansion
		// element; false until the first element has been seen.
		$prev = false;
		$duplicatePrefixes = [];
		foreach ( $letterMap as $key => $value ) {
			// Remove terminator byte. Otherwise the prefix
			// comparison will get hung up on that.
			$trimmedKey = rtrim( $key, "\0" );
			if ( $prev === false || $prev === '' ) {
				$prev = $trimmedKey;
				// We don't yet have a collation element
				// to compare against, so continue.
				continue;
			}

			// Due to the fact the array is sorted, we only have
			// to compare with the element directly previous
			// to the current element (skipping expansions).
			// An element "X" will always sort directly
			// before "XZ" (Unless we have "XY", but we
			// do not update $prev in that case).
			if ( substr( $trimmedKey, 0, strlen( $prev ) ) === $prev ) {
				$duplicatePrefixes[] = $key;
				// If this is an expansion, we don't want to
				// compare the next element to this element,
				// but to what is currently $prev
				continue;
			}
			$prev = $trimmedKey;
		}
		// Drop the expansions collected above
		foreach ( $duplicatePrefixes as $badKey ) {
			wfDebug( "Removing '{$letterMap[$badKey]}' from first letters." );
			unset( $letterMap[$badKey] );
			// This code assumes that unsetting does not change sort order.
		}
		// Split the sorted map into two parallel, zero-indexed lists
		$data = [
			'chars' => array_values( $letterMap ),
			'keys' => array_keys( $letterMap ),
		];

		// Reduce memory usage before caching
		unset( $letterMap );

		return $data;
	}

	/**
	 * Load and unserialize a file from the "$IP/serialized" directory.
	 *
	 * Replaced use of wfGetPrecompiledData
	 *
	 * @param string $name File name inside the serialized directory
	 * @return mixed The unserialized value on success; false if the file is
	 *  missing, or is empty or unreadable
	 */
	private function getPrecompiledData( $name ) {
		global $IP;

		$path = "$IP/serialized/$name";
		if ( !file_exists( $path ) ) {
			return false;
		}

		$contents = file_get_contents( $path );
		return $contents ? unserialize( $contents ) : false;
	}

	/**
	 * Get the first letter stored at a position in the sorted letter list.
	 *
	 * @param int $index Position in the 'chars' list of the first-letter data
	 * @return string
	 * @since 1.16.3
	 */
	public function getLetterByIndex( $index ) {
		$letters = $this->getFirstLetterData()['chars'];
		return $letters[$index];
	}

	/**
	 * Get the primary sort key stored at a position in the sorted key list.
	 *
	 * @param int $index Position in the 'keys' list of the first-letter data
	 * @return string
	 * @since 1.16.3
	 */
	public function getSortKeyByLetterIndex( $index ) {
		$sortKeys = $this->getFirstLetterData()['keys'];
		return $sortKeys[$index];
	}

	/**
	 * Count the entries in the 'chars' list of the first-letter data.
	 *
	 * @return int
	 * @since 1.16.3
	 */
	public function getFirstLetterCount() {
		$letters = $this->getFirstLetterData()['chars'];
		return count( $letters );
	}

	/**
	 * Test if a code point is a CJK (Chinese, Japanese, Korean) character,
	 * i.e. whether it lies inside one of the ranges listed in CJK_BLOCKS.
	 *
	 * @param int $codepoint
	 * @return bool
	 * @since 1.16.3
	 */
	public static function isCjk( $codepoint ) {
		foreach ( self::CJK_BLOCKS as [ $first, $last ] ) {
			// Both ends of a block are inclusive
			$insideBlock = $codepoint >= $first && $codepoint <= $last;
			if ( $insideBlock ) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Return the version of Unicode appropriate for the version of ICU library
	 * currently in use, or false when it can't be determined.
	 *
	 * The lookup key is the first three characters of INTL_ICU_VERSION: "NN."
	 * for ICU 49 and later, "N.N" for the older 3.x and 4.x releases.
	 *
	 * @since 1.21
	 * @return string|bool Unicode version such as "13.0", or false when the
	 *  ICU version is unavailable or not listed in the table
	 */
	public static function getUnicodeVersionForICU() {
		$icuVersion = INTL_ICU_VERSION;
		if ( !$icuVersion ) {
			return false;
		}

		$versionPrefix = substr( $icuVersion, 0, 3 );
		// Source: https://icu.unicode.org/download
		$map = [
			// ICU 70 and later were missing, so current systems got false
			'77.' => '16.0',
			'76.' => '16.0',
			'75.' => '15.1',
			'74.' => '15.1',
			'73.' => '15.0',
			'72.' => '15.0',
			'71.' => '14.0',
			'70.' => '14.0',
			'69.' => '13.0',
			'68.' => '13.0',
			'67.' => '13.0',
			'66.' => '13.0',
			'65.' => '12.0',
			'64.' => '12.0',
			'63.' => '11.0',
			'62.' => '11.0',
			'61.' => '10.0',
			'60.' => '10.0',
			'59.' => '9.0',
			'58.' => '9.0',
			'57.' => '8.0',
			'56.' => '8.0',
			'55.' => '7.0',
			'54.' => '7.0',
			'53.' => '6.3',
			'52.' => '6.3',
			'51.' => '6.2',
			'50.' => '6.2',
			'49.' => '6.1',
			'4.8' => '6.0',
			'4.6' => '6.0',
			'4.4' => '5.2',
			'4.2' => '5.1',
			'4.0' => '5.1',
			'3.8' => '5.0',
			'3.6' => '5.0',
			'3.4' => '4.1',
		];

		return $map[$versionPrefix] ?? false;
	}
}
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+								<?php
-												Added missing GPLv2 headers in some places.

Also made file/class documentation more consistent.

Change-Id: Ibe7815124d6915792dcbb150d01df21d9b22b0b0

											
										
										
											2012-05-21 19:56:04 +00:00
+								/**
 								 * This program is free software; you can redistribute it and/or modify
 								 * it under the terms of the GNU General Public License as published by
 								 * the Free Software Foundation; either version 2 of the License, or
 								 * (at your option) any later version.
 								 *
 								 * This program is distributed in the hope that it will be useful,
 								 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 								 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 								 * GNU General Public License for more details.
 								 *
 								 * You should have received a copy of the GNU General Public License along
 								 * with this program; if not, write to the Free Software Foundation, Inc.,
 								 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 								 * http://www.gnu.org/copyleft/gpl.html
 								 *
 								 * @file
 								 */
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
-												Inject services into Collation classes

Might be worth converting Collation::singleton/::factory
to a service at some point...

Change-Id: Ifc96f851e6091ce834dbaf0e91695c648a42169c

											
										
										
											2021-03-30 19:02:21 +00:00
+								use MediaWiki\Languages\LanguageFactory;
-												Remove Language::factory and getParentLanguage use

Change-Id: I11f8801ef47ec1a1f63d840116e69667e6f3ae3c

											
										
										
											2019-08-26 12:24:37 +00:00
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+								/**
 								 * @since 1.16.3
 								 */
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+								class IcuCollation extends Collation {
-												Fix numerous PSR12.Properties.ConstantVisibility.NotFound

Change-Id: I9b08bde11727f47e262f5f7f422eac5585ea7fca

											
										
										
											2020-05-11 00:40:56 +00:00
+									private const FIRST_LETTER_VERSION = 4;
-												Allow first letter data to be invalidated

Just a class constant for now, but that should suffice to deal with the
current emergency. Proper dependency tracking via the CacheDependency
hierarchy would be pretty cool in the long term.

Change-Id: Ibbe7fa2814434d4869aba20f628bd43269e611fa

											
										
										
											2013-03-13 03:53:20 +00:00
-												Make phpcs-strict pass on includes/ (6/~10)

Change-Id: I566183b5d660a55bb3b2aa7186aaed5355ead2c6

											
										
										
											2014-05-12 14:42:51 +00:00
+									/** @var Collator */
 									private $primaryCollator;
 									/** @var Collator */
 									private $mainCollator;
-												Cleanup some docs (includes/*.php)

- Swap "$variable type" to "type $variable"
- Added missing types
- Fixed spacing inside docs
- Makes beginning of @param/@return/@var/@throws in capital
- Changed some types to match the more common spelling

Change-Id: I783e4dbfe5f6f98b32b9a03ccf6439e13e132bcc

											
										
										
											2014-07-24 17:42:24 +00:00
+									/** @var string */
-												Make phpcs-strict pass on includes/ (6/~10)

Change-Id: I566183b5d660a55bb3b2aa7186aaed5355ead2c6

											
										
										
											2014-05-12 14:42:51 +00:00
+									private $locale;
 									/** @var Language */
 									protected $digitTransformLanguage;
-												Use short type bool/int in param documentation

Enable the phpcs sniffs for this and used phpcbf

Change-Id: Iaa36687154ddd2bf663b9dd519f5c99409d37925

											
										
										
											2017-08-20 11:20:59 +00:00
+									/** @var bool */
-												Adding support for numeric collation when using UCA collations

To use, add '-u-kn' to the end of a collation name and set it as
the value for $wgCategoryCollation.

Bug: T8948
Change-Id: Ica7908daf80624fa2648127114d01665e96234c0

											
										
										
											2016-07-15 03:47:52 +00:00
+									private $useNumericCollation = false;
-												Make phpcs-strict pass on includes/ (6/~10)

Change-Id: I566183b5d660a55bb3b2aa7186aaed5355ead2c6

											
										
										
											2014-05-12 14:42:51 +00:00
+									/** @var array */
 									private $firstLetterData;
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
 									/**
 									 * Unified CJK blocks.
 									 *
-												Remove a bunch of trailing spaces and unneeded newlines

Change-Id: I00f369641320acd7f087427ef031f3ee7efa0997

											
										
										
											2012-10-10 18:13:40 +00:00
+									 * The same definition of a CJK block must be used for both Collation and
 									 * generateCollationData.php. These blocks are omitted from the first
 									 * letter data, as an optimisation measure and because the default UCA table
 									 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+									 * blocks are not included here, because they are smaller and more useful.
 									 */
-												Use consts in IcuCollation class

Change-Id: I664e7ea57b98975a3ff1c0c78477c18eb56837b4

											
										
										
											2020-11-21 00:41:58 +00:00
+									private const CJK_BLOCKS = [
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										[ 0x2E80, 0x2EFF ], // CJK Radicals Supplement
 										[ 0x2F00, 0x2FDF ], // Kangxi Radicals
 										[ 0x2FF0, 0x2FFF ], // Ideographic Description Characters
 										[ 0x3000, 0x303F ], // CJK Symbols and Punctuation
 										[ 0x31C0, 0x31EF ], // CJK Strokes
 										[ 0x3200, 0x32FF ], // Enclosed CJK Letters and Months
 										[ 0x3300, 0x33FF ], // CJK Compatibility
 										[ 0x3400, 0x4DBF ], // CJK Unified Ideographs Extension A
 										[ 0x4E00, 0x9FFF ], // CJK Unified Ideographs
 										[ 0xF900, 0xFAFF ], // CJK Compatibility Ideographs
 										[ 0xFE30, 0xFE4F ], // CJK Compatibility Forms
 										[ 0x20000, 0x2A6DF ], // CJK Unified Ideographs Extension B
 										[ 0x2A700, 0x2B73F ], // CJK Unified Ideographs Extension C
 										[ 0x2B740, 0x2B81F ], // CJK Unified Ideographs Extension D
 										[ 0x2F800, 0x2FA1F ], // CJK Compatibility Ideographs Supplement
 									];
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
-												(bug 43799) create language-specific collations for category sorting

This allows one to *finally* get articles to be correctly sorted on
category pages for 67 languages based in latin, greek and cyrillic
alphabets.

Fixes bug 29788, bug 41040, and bug 42412 (implementing collations for
Swedish, Polish, Ukrainian).

Full list of language codes this adds support for: af, ast, az, be,
bg, br, bs, ca, co, cs, cy, da, de, dsb, el, en, eo, es, et, eu, fi,
fo, fr, fur, fy, ga, gd, gl, hr, hsb, hu, is, it, kk, kl, ku, ky, la,
lb, lt, lv, mk, mo, mt, nl, no, oc, pl, pt, rm, ro, ru, rup, sco, sk,
sl, smn, sq, sr, sv, tk, tl, tr, tt, uk, uz, vi.

* Include data about first-letter characters for 67 language
  tailorings. This data was generated based on
  http://developer.mimer.com/charts/tailorings.htm by a Ruby script
  (https://www.mediawiki.org/wiki/User:Matma_Rex/generateCollationTailoringData.rb),
  then adjusted by hand (removed duplicate definitions for Spanish and
  German, changed code fil -> tl (Filipino -> Tagalog)).

* Mark languages verified by native speakers (currently only pl
  (Polish) I verified by myself and fi (Finnish) checked by Niklas).

* Allow for collations named like 'uca-<langcode>', mapping them to
  IcuCollation with appropriate parameter. The code doesn't check if
  we actually have data for given language, as it's checked after the
  IcuCollation class instance is constructed.

* Add the tailoring data to the default first-letter file (for root
  collation) before it's cached for given locale.

Change-Id: I838484b9aaf23945fe7880fef2e3da5f5c06877f

											
										
										
											2013-02-18 21:09:16 +00:00
+									/**
-												adjusted comments for IcuCollation::$tailoringFirstLetters

More information about what actually sits in that array.

Summary of modifications to the Mimer data so far:
* removed data for "traditional" variants of de (German) and es (Spanish)
* used code 'tl' instead of 'fil' for Tagalog/Filipino
* added be-tarask (Belarusian Taraškievica)

Change-Id: I97273c52599a5eda3f63366d697b077d6b17ba81

											
										
										
											2013-02-28 15:44:26 +00:00
+									 * Additional characters (or character groups) to be considered separate
-												IcuCollation::$tailoringFirstLetters: implement letter removal

This is necessary for Swedish, where 'Þ' ("thorn") - considered a
separate letter by default in the first-letters-root.ser file - is
sorted as 'th', causing unexpected output on category pages - words
starting with 'th'..'u' were placed under a heading with the thorn.

There were three obvious ways to do this:
* somehow include information that this letter is to be removed in the
  string itself, as in 'sv' => array( "Å", "Ä", "Ö", "-Þ" ) - could
  potentially clash with valid uses
* create a separate array other than $tailoringFirstLetters to store
  this information - would cause the data to be fragmented all over
  the file
* include information about letters to be removed in a separate key
  "linked" to the regular one, as in '-sv' => array( "Þ" ) - I see no
  obvious downsides, so this is what I ended up doing

Bug: 45446
Change-Id: I57e07a2027c391c5baa767a68f4409b9de7b4618

											
										
										
											2013-03-11 21:24:09 +00:00
+									 * letters for given languages, or to be removed from the list of such
 									 * letters (denoted by keys starting with '-').
 									 *
 									 * These are additions to (or subtractions from) the data stored in the
-												Write Latin and other scripts with capital letter

Change-Id: I16c660e54191b63cd6eb3407cb00504665930c4e

											
										
										
											2018-10-05 16:37:53 +00:00
+									 * first-letters-root.php data file (which among others includes full basic Latin,
 									 * Cyrillic and Greek alphabets).
-												(bug 43799) create language-specific collations for category sorting

This allows one to *finally* get articles to be correctly sorted on
category pages for 67 languages based in latin, greek and cyrillic
alphabets.

Fixes bug 29788, bug 41040, and bug 42412 (implementing collations for
Swedish, Polish, Ukrainian).

Full list of language codes this adds support for: af, ast, az, be,
bg, br, bs, ca, co, cs, cy, da, de, dsb, el, en, eo, es, et, eu, fi,
fo, fr, fur, fy, ga, gd, gl, hr, hsb, hu, is, it, kk, kl, ku, ky, la,
lb, lt, lv, mk, mo, mt, nl, no, oc, pl, pt, rm, ro, ru, rup, sco, sk,
sl, smn, sq, sr, sv, tk, tl, tr, tt, uk, uz, vi.

* Include data about first-letter characters for 67 language
  tailorings. This data was generated based on
  http://developer.mimer.com/charts/tailorings.htm by a Ruby script
  (https://www.mediawiki.org/wiki/User:Matma_Rex/generateCollationTailoringData.rb),
  then adjusted by hand (removed duplicate definitions for Spanish and
  German, changed code fil -> tl (Filipino -> Tagalog)).

* Mark languages verified by native speakers (currently only pl
  (Polish) I verified by myself and fi (Finnish) checked by Niklas).

* Allow for collations named like 'uca-<langcode>', mapping them to
  IcuCollation with appropriate parameter. The code doesn't check if
  we actually have data for given language, as it's checked after the
  IcuCollation class instance is constructed.

* Add the tailoring data to the default first-letter file (for root
  collation) before it's cached for given locale.

Change-Id: I838484b9aaf23945fe7880fef2e3da5f5c06877f

											
										
										
											2013-02-18 21:09:16 +00:00
+									 *
-												adjusted comments for IcuCollation::$tailoringFirstLetters

More information about what actually sits in that array.

Summary of modifications to the Mimer data so far:
* removed data for "traditional" variants of de (German) and es (Spanish)
* used code 'tl' instead of 'fil' for Tagalog/Filipino
* added be-tarask (Belarusian Taraškievica)

Change-Id: I97273c52599a5eda3f63366d697b077d6b17ba81

											
										
										
											2013-02-28 15:44:26 +00:00
+									 * "Separate letter" is a letter that would have a separate heading/section
 									 * for it in a dictionary or a phone book in this language. This data isn't
 									 * used for sorting (the ICU library handles that), only for deciding which
 									 * characters (or character groups) to use as headings.
 									 *
 									 * Initially generated based on the primary level of Unicode collation
 									 * tailorings available at http://developer.mimer.com/charts/tailorings.htm ,
 									 * later modified.
-												(bug 43799) create language-specific collations for category sorting

This allows one to *finally* get articles to be correctly sorted on
category pages for 67 languages based in latin, greek and cyrillic
alphabets.

Fixes bug 29788, bug 41040, and bug 42412 (implementing collations for
Swedish, Polish, Ukrainian).

Full list of language codes this adds support for: af, ast, az, be,
bg, br, bs, ca, co, cs, cy, da, de, dsb, el, en, eo, es, et, eu, fi,
fo, fr, fur, fy, ga, gd, gl, hr, hsb, hu, is, it, kk, kl, ku, ky, la,
lb, lt, lv, mk, mo, mt, nl, no, oc, pl, pt, rm, ro, ru, rup, sco, sk,
sl, smn, sq, sr, sv, tk, tl, tr, tt, uk, uz, vi.

* Include data about first-letter characters for 67 language
  tailorings. This data was generated based on
  http://developer.mimer.com/charts/tailorings.htm by a Ruby script
  (https://www.mediawiki.org/wiki/User:Matma_Rex/generateCollationTailoringData.rb),
  then adjusted by hand (removed duplicate definitions for Spanish and
  German, changed code fil -> tl (Filipino -> Tagalog)).

* Mark languages verified by native speakers (currently only pl
  (Polish) I verified by myself and fi (Finnish) checked by Niklas).

* Allow for collations named like 'uca-<langcode>', mapping them to
  IcuCollation with appropriate parameter. The code doesn't check if
  we actually have data for given language, as it's checked after the
  IcuCollation class instance is constructed.

* Add the tailoring data to the default first-letter file (for root
  collation) before it's cached for given locale.

Change-Id: I838484b9aaf23945fe7880fef2e3da5f5c06877f

											
										
										
											2013-02-18 21:09:16 +00:00
+									 *
 									 * Empty arrays are intended; this signifies that the data for the language is
 									 * available and that there are, in fact, no additional letters to consider.
 									 */
-												Use consts in IcuCollation class

Change-Id: I664e7ea57b98975a3ff1c0c78477c18eb56837b4

											
										
										
											2020-11-21 00:41:58 +00:00
+									private const TAILORING_FIRST_LETTERS = [
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'af' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'am' => [],
 										'ar' => [],
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+										'as' => [ "\u{0982}", "\u{0981}", "\u{0983}", "\u{09CE}", "ক্ষ " ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'ast' => [ "Ch", "Ll", "Ñ" ], // not in libicu
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'az' => [ "Ç", "Ə", "Ğ", "İ", "Ö", "Ş", "Ü" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'be' => [ "Ё" ],
 										'be-tarask' => [ "Ё" ],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'bg' => [],
-												Add first letter data for bn collation (Standard and Traditional)

This is based solely on looking at the bn.txt collation data
file. It has not been tested by native speakers.

Bug: T148885
Change-Id: Ide926bc5ee8752269ef6a1bfe972e19b7188d193

											
										
										
											2016-10-27 08:09:11 +00:00
+										'bn' => [ 'ং', 'ঃ', 'ঁ' ],
 										'bn@collation=traditional' => [
 											'ং', 'ঃ', 'ঁ', 'ক্', 'খ্', 'গ্', 'ঘ্', 'ঙ্', 'চ্', 'ছ্', 'জ্', 'ঝ্',
 											'ঞ্', 'ট্', 'ঠ্', 'ড্', 'ঢ্', 'ণ্', 'ৎ', 'থ্', 'দ্', 'ধ্', 'ন্', 'প্',
 											'ফ্', 'ব্', 'ভ্', 'ম্', 'য্', 'র্', 'ৰ্', 'ল্', 'ৱ্', 'শ্', 'ষ্', 'স্', 'হ্'
 										],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'bo' => [],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'br' => [ "Ch", "C'h" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'bs' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'bs-Cyrl' => [],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'ca' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'chr' => [],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'co' => [], // not in libicu
 										'cs' => [ "Č", "Ch", "Ř", "Š", "Ž" ],
 										'cy' => [ "Ch", "Dd", "Ff", "Ng", "Ll", "Ph", "Rh", "Th" ],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'da' => [ "Æ", "Ø", "Å" ],
 										'de' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'de-AT@collation=phonebook' => [ 'ä', 'ö', 'ü', 'ß' ],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'dsb' => [ "Č", "Ć", "Dź", "Ě", "Ch", "Ł", "Ń", "Ŕ", "Š", "Ś", "Ž", "Ź" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'ee' => [ "Dz", "Ɖ", "Ɛ", "Ƒ", "Gb", "Ɣ", "Kp", "Ny", "Ŋ", "Ɔ", "Ts", "Ʋ" ],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'el' => [],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'en' => [],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'eo' => [ "Ĉ", "Ĝ", "Ĥ", "Ĵ", "Ŝ", "Ŭ" ],
 										'es' => [ "Ñ" ],
-												Remove xx-uca-et collation workaround

Remove workaround introduced in I3e8031b9. No longer needed.

Bug: T202977
Change-Id: I39921ef83cddc33535b99bd9c0b75f8afb52ea9a

											
										
										
											2018-09-11 13:30:06 +00:00
+										'et' => [ "Š", "Ž", "Õ", "Ä", "Ö", "Ü" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'eu' => [ "Ñ" ], // not in libicu
 										'fa' => [
 											// RTL, let's put each letter on a new line
 											"آ",
 											"ء",
 											"ه",
 											"ا",
 											"و"
 										],
 										'fi' => [ "Å", "Ä", "Ö" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'fil' => [ "Ñ", "Ng" ],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'fo' => [ "Á", "Ð", "Í", "Ó", "Ú", "Ý", "Æ", "Ø", "Å" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'fr' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'fr-CA' => [], // fr-CA sorts accents slightly different from fr.
 										'fur' => [ "À", "Á", "Â", "È", "Ì", "Ò", "Ù" ], // not in libicu
 										'fy' => [], // not in libicu
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'ga' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'gd' => [], // not in libicu
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'gl' => [ "Ch", "Ll", "Ñ" ],
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+										'gu' => [ "\u{0A82}", "\u{0A83}", "\u{0A81}", "\u{0AB3}" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'ha' => [ 'Ɓ', 'Ɗ', 'Ƙ', 'Sh', 'Ts', 'Ƴ' ],
 										'haw' => [ 'ʻ' ],
 										'he' => [],
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+										'hi' => [ "\u{0902}", "\u{0903}" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'hr' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
 										'hsb' => [ "Č", "Dź", "Ě", "Ch", "Ł", "Ń", "Ř", "Š", "Ć", "Ž" ],
 										'hu' => [ "Cs", "Dz", "Dzs", "Gy", "Ly", "Ny", "Ö", "Sz", "Ty", "Ü", "Zs" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'hy' => [ "և" ],
 										'id' => [],
 										'ig' => [ "Ch", "Gb", "Gh", "Gw", "Ị", "Kp", "Kw", "Ṅ", "Nw", "Ny", "Ọ", "Sh", "Ụ" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'is' => [ "Á", "Ð", "É", "Í", "Ó", "Ú", "Ý", "Þ", "Æ", "Ö", "Å" ],
 										'it' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'ka' => [],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'kk' => [ "Ү", "І" ],
 										'kl' => [ "Æ", "Ø", "Å" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This did the relatively easy languages in CLDR 29 (Which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'km' => [
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+											"រ", "ឫ", "ឬ", "ល", "ឭ", "ឮ", "\u{17BB}\u{17C6}",
 											"\u{17C6}", "\u{17B6}\u{17C6}", "\u{17C7}",
 											"\u{17B7}\u{17C7}", "\u{17BB}\u{17C7}",
 											"\u{17C1}\u{17C7}", "\u{17C4}\u{17C7}",
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										],
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+										'kn' => [ "\u{0C81}", "\u{0C83}", "\u{0CF1}", "\u{0CF2}" ],
 										'kok' => [ "\u{0902}", "\u{0903}", "ळ", "क्ष" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'ku' => [ "Ç", "Ê", "Î", "Ş", "Û" ], // not in libicu
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'ky' => [ "Ё" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'la' => [], // not in libicu
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'lb' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'lkt' => [ 'Č', 'Ǧ', 'Ȟ', 'Š', 'Ž' ],
 										'ln' => [ 'Ɛ' ],
 										'lo' => [],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'lt' => [ "Č", "Š", "Ž" ],
 										'lv' => [ "Č", "Ģ", "Ķ", "Ļ", "Ņ", "Š", "Ž" ],
 										'mk' => [ "Ѓ", "Ќ" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'ml' => [],
 										'mn' => [],
-												IcuCollation: Fix diacritic characters for Aromanian (rup) and Moldovan (mo) headings

They should be Ș, Ț (comma-below) and instead they were cedilla-below (Ş, Ţ).
Same as for Romanian (ro) in 486f64f28302ecceed04977180fd21470cb54c81.

Both of these languages are unsupported by libicu and so the collations
are unlikely to have been used in practice.

Bug: T171043
Bug: T171044
Change-Id: Idd0d593e73cd784fbef7b75e8985f988f5555e26

											
										
										
											2017-07-19 19:48:56 +00:00
+										'mo' => [ "Ă", "Â", "Î", "Ș", "Ț" ], // not in libicu
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+										'mr' => [ "\u{0902}", "\u{0903}", "ळ", "क्ष", "ज्ञ" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'ms' => [],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'mt' => [ "Ċ", "Ġ", "Għ", "Ħ", "Ż" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'nb' => [ "Æ", "Ø", "Å" ],
 										'ne' => [],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'nl' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'nn' => [ "Æ", "Ø", "Å" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'no' => [ "Æ", "Ø", "Å" ], // not in libicu. You should probably use nb or nn instead.
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'oc' => [], // not in libicu
 										'om' => [ 'Ch', 'Dh', 'Kh', 'Ny', 'Ph', 'Sh' ],
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+										'or' => [ "\u{0B01}", "\u{0B02}", "\u{0B03}", "କ୍ଷ" ],
 										'pa' => [ "\u{0A4D}" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'pl' => [ "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż" ],
 										'pt' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'rm' => [], // not in libicu
-												IcuCollation: Fix diacritic characters for Romanian (ro) headings

They should be Ș, Ț (comma-below) and instead they were cedilla-below (Ş, Ţ).

Bug: T168711
Change-Id: I6dc873c3ce93bca3e425439f70d0fb30aecc9533

											
										
										
											2017-07-19 14:27:50 +00:00
+										'ro' => [ "Ă", "Â", "Î", "Ș", "Ț" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'ru' => [],
-												IcuCollation: Fix diacritic characters for Aromanian (rup) and Moldovan (mo) headings

They should be Ș, Ț (comma-below) and instead they were cedilla-below (Ş, Ţ).
Same as for Romanian (ro) in 486f64f28302ecceed04977180fd21470cb54c81.

Both of these languages are unsupported by libicu and so the collations
are unlikely to have been used in practice.

Bug: T171043
Bug: T171044
Change-Id: Idd0d593e73cd784fbef7b75e8985f988f5555e26

											
										
										
											2017-07-19 19:48:56 +00:00
+										'rup' => [ "Ă", "Â", "Î", "Ľ", "Ń", "Ș", "Ț" ], // not in libicu
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'sco' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'se' => [
 											'Á', 'Č', 'Ʒ', 'Ǯ', 'Đ', 'Ǧ', 'Ǥ', 'Ǩ', 'Ŋ',
 											'Š', 'Ŧ', 'Ž', 'Ø', 'Æ', 'Ȧ', 'Ä', 'Ö'
 										],
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+										'si' => [ "\u{0D82}", "\u{0D83}", "\u{0DA4}" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'sk' => [ "Ä", "Č", "Ch", "Ô", "Š", "Ž" ],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'sl' => [ "Č", "Š", "Ž" ],
 										'smn' => [ "Á", "Č", "Đ", "Ŋ", "Š", "Ŧ", "Ž", "Æ", "Ø", "Å", "Ä", "Ö" ],
 										'sq' => [ "Ç", "Dh", "Ë", "Gj", "Ll", "Nj", "Rr", "Sh", "Th", "Xh", "Zh" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'sr' => [],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'sr-Latn' => [ "Č", "Ć", "Dž", "Đ", "Lj", "Nj", "Š", "Ž" ],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'sv' => [ "Å", "Ä", "Ö" ],
 										'sv@collation=standard' => [ "Å", "Ä", "Ö" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'sw' => [],
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'ta' => [
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+											"\u{0B82}", "ஃ", "க்ஷ", "க்", "ங்", "ச்", "ஞ்", "ட்", "ண்", "த்", "ந்",
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+											"ப்", "ம்", "ய்", "ர்", "ல்", "வ்", "ழ்", "ள்", "ற்", "ன்", "ஜ்", "ஶ்", "ஷ்",
 											"ஸ்", "ஹ்", "க்ஷ்"
 										],
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+										'te' => [ "\u{0C01}", "\u{0C02}", "\u{0C03}" ],
 										'th' => [ "ฯ", "\u{0E46}", "\u{0E4D}", "\u{0E3A}" ],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'tk' => [ "Ç", "Ä", "Ž", "Ň", "Ö", "Ş", "Ü", "Ý" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'tl' => [ "Ñ", "Ng" ], // not in libicu
 										'to' => [ "Ng", "ʻ" ],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										'tr' => [ "Ç", "Ğ", "İ", "Ö", "Ş", "Ü" ],
-												Make uca-tr use I as uppercase of dotless ı instead of reverse

The primary collision resolution makes the wrong choice.

Bug: T203158
Change-Id: Id677476937cc6575950496767b50c1e8c21f2fbc

											
										
										
											2019-02-20 16:55:44 +00:00
+										'-tr' => [ "ı" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'tt' => [ "Ә", "Ө", "Ү", "Җ", "Ң", "Һ" ], // not in libicu
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'uk' => [ "Ґ", "Ь" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'uz' => [ "Ch", "G'", "Ng", "O'", "Sh" ], // not in libicu
-												IcuCollation: Do not split $tailoringFirstLetters into verified/not verified

At this point I think it's safe to assume that these mostly work well,
and the split makes maintenance of the alphabetical list more difficult
(some entries were already in wrong order). We've been enabling these
collations for more and more Wikimedia wikis and not hearing about any
problems. Mistakes, if any are present, should be treated like any
other bug.

Also made some comments consistent.

Change-Id: I4b5fbcf4dbbdd4dc194ed821341296171fa64bb0

											
										
										
											2016-10-31 15:47:05 +00:00
+										'vi' => [ "Ă", "Â", "Đ", "Ê", "Ô", "Ơ", "Ư" ],
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										'vo' => [ "Ä", "Ö", "Ü" ],
 										'yi' => [
-												Use PHP 7 "\u{NNNN}" Unicode codepoint escapes in string literals

In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.

This is not enforced by PHP, but I think we should write those in
uppercase and zero-padded to at least four characters, like the
Unicode standard does.

Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
  (e.g. in code converting from legacy encodings or testing the
  handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
  are actually handled by PCRE and have to be dealt with carefully
  (those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
  probably be left as-is.

The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:

  chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
  puts chars.split('').map{|char|
    '\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
  }.join('')

Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a

											
										
										
											2017-10-07 00:26:23 +00:00
+											"\u{05D1}\u{05BF}", "\u{05DB}\u{05BC}", "\u{05E4}\u{05BC}",
 											"\u{05E9}\u{05C2}", "\u{05EA}\u{05BC}"
-												Add firstLetter data for ~50 additional languages

Based on CLDR 29 data files.

This covers the relatively easy languages in CLDR 29 (which is most
of them). I skipped languages with complicated tailoring files.

Change-Id: I8367604f7d3a1cdef9cb4e15813893c8cbfff1ff

											
										
										
											2016-10-29 11:55:27 +00:00
+										],
 										'yo' => [ "Ẹ", "Gb", "Ọ", "Ṣ" ],
 										'zu' => [],
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+									];
-												(bug 43799) create language-specific collations for category sorting

This allows one to *finally* get articles to be correctly sorted on
category pages for 67 languages based on the Latin, Greek and Cyrillic
alphabets.

Fixes bug 29788, bug 41040, and bug 42412 (implementing collations for
Swedish, Polish, Ukrainian).

Full list of language codes this adds support for: af, ast, az, be,
bg, br, bs, ca, co, cs, cy, da, de, dsb, el, en, eo, es, et, eu, fi,
fo, fr, fur, fy, ga, gd, gl, hr, hsb, hu, is, it, kk, kl, ku, ky, la,
lb, lt, lv, mk, mo, mt, nl, no, oc, pl, pt, rm, ro, ru, rup, sco, sk,
sl, smn, sq, sr, sv, tk, tl, tr, tt, uk, uz, vi.

* Include data about first-letter characters for 67 language
  tailorings. This data was generated from
  http://developer.mimer.com/charts/tailorings.htm by a Ruby script
  (https://www.mediawiki.org/wiki/User:Matma_Rex/generateCollationTailoringData.rb),
  then adjusted by hand (removed duplicate definitions for Spanish and
  German, changed code fil -> tl (Filipino -> Tagalog)).

* Mark languages verified by native speakers (currently only pl
  (Polish), which I verified myself, and fi (Finnish), checked by Niklas).

* Allow for collations named like 'uca-<langcode>', mapping them to
  IcuCollation with appropriate parameter. The code doesn't check if
  we actually have data for given language, as it's checked after the
  IcuCollation class instance is constructed.

* Add the tailoring data to the default first-letter file (for root
  collation) before it's cached for given locale.

Change-Id: I838484b9aaf23945fe7880fef2e3da5f5c06877f

											
										
										
											2013-02-18 21:09:16 +00:00
-												Inject services into Collation classes

Might be worth converting Collation::singleton/::factory
to a service at some point...

Change-Id: Ifc96f851e6091ce834dbaf0e91695c648a42169c

											
										
										
											2021-03-30 19:02:21 +00:00
+									/**
 									 * @param LanguageFactory $languageFactory
 									 * @param string $locale
 									 */
 									public function __construct(
 										LanguageFactory $languageFactory,
 										$locale
 									) {
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+										$this->locale = $locale;
-												IcuCollation: Sort digits under localised digits' headings

Previously both '1' and '۱' ("DIGIT ONE" and "EXTENDED ARABIC-INDIC
DIGIT ONE") were sorted under '1' heading, regardless of collation
locale.

Now they will be both sorted under localised heading name (transformed
using Language#formatNum), for example '1' for 'uca-en' collation or
'۱' for 'uca-fa' collation.

Bug: 55630
Change-Id: I0b745d955a6e72f53873c95648087aa5f90a8852

											
										
										
											2013-10-12 22:21:01 +00:00
+										// Drop everything after the '@' in locale's name
 										$localeParts = explode( '@', $locale );
-												Inject services into Collation classes

Might be worth converting Collation::singleton/::factory
to a service at some point...

Change-Id: Ifc96f851e6091ce834dbaf0e91695c648a42169c

											
										
										
											2021-03-30 19:02:21 +00:00
+										$this->digitTransformLanguage = $languageFactory->getLanguage( $locale === 'root' ? 'en' : $localeParts[0] );
-												IcuCollation: Sort digits under localised digits' headings

Previously both '1' and '۱' ("DIGIT ONE" and "EXTENDED ARABIC-INDIC
DIGIT ONE") were sorted under '1' heading, regardless of collation
locale.

Now they will be both sorted under localised heading name (transformed
using Language#formatNum), for example '1' for 'uca-en' collation or
'۱' for 'uca-fa' collation.

Bug: 55630
Change-Id: I0b745d955a6e72f53873c95648087aa5f90a8852

											
										
										
											2013-10-12 22:21:01 +00:00
-												collation: Improve IcuCollation for static code analyzer

phan says that $this->mainCollator is not documented to get null
assigned.
Use a local variable to check for null and then set the class property.

Change-Id: I000c935da8d99184f2ae0382fc5caac81e80c8d7

											
										
										
											2021-11-08 22:11:19 +00:00
+										$mainCollator = Collator::create( $locale );
 										if ( !$mainCollator ) {
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+											throw new MWException( "Invalid ICU locale specified for collation: $locale" );
 										}
-												collation: Improve IcuCollation for static code analyzer

phan says that $this->mainCollator is not documented to get null
assigned.
Use a local variable to check for null and then set the class property.

Change-Id: I000c935da8d99184f2ae0382fc5caac81e80c8d7

											
										
										
											2021-11-08 22:11:19 +00:00
+										$this->mainCollator = $mainCollator;
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
 										$this->primaryCollator = Collator::create( $locale );
 										$this->primaryCollator->setStrength( Collator::PRIMARY );
-												Adding support for numeric collation when using UCA collations

To use, add '-u-kn' to the end of a collation name and set it as
the value for $wgCategoryCollation.

Bug: T8948
Change-Id: Ica7908daf80624fa2648127114d01665e96234c0

											
										
										
											2016-07-15 03:47:52 +00:00
 										// If the special suffix for numeric collation is present, turn on numeric collation.
 										if ( substr( $locale, -5, 5 ) === '-u-kn' ) {
 											$this->useNumericCollation = true;
 											// Strip off the special suffix so it doesn't trip up fetchFirstLetterData().
 											$this->locale = substr( $this->locale, 0, -5 );
 											$this->mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
 											$this->primaryCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
 										}
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+									}
-												Split Collation.php

Change-Id: I6abfecf91cdce83dd34b1e8aa8e0b35315f62742

											
										
										
											2016-04-03 08:23:20 +00:00
+									public function getSortKey( $string ) {
-												collation: Remove suppressWarnings() for getSortKey()

Small optimization to IcuCollation::fetchFirstLetterData().

This used to suppress / restore warnings once per every letter of
every alphabet. The workaround for string casting and error
suppression is no longer needed as of PHP 5.3, in which the
bug was fixed.

Change-Id: Idd41a509858c0887df4f632b480b387bd74027b2

											
										
										
											2016-05-23 22:43:16 +00:00
+										return $this->mainCollator->getSortKey( $string );
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+									}
-												Split Collation.php

Change-Id: I6abfecf91cdce83dd34b1e8aa8e0b35315f62742

											
										
										
											2016-04-03 08:23:20 +00:00
+									public function getPrimarySortKey( $string ) {
-												collation: Remove suppressWarnings() for getSortKey()

Small optimization to IcuCollation::fetchFirstLetterData().

This used to suppress / restore warnings once per every letter of
every alphabet. The workaround for string casting and error
suppression is no longer needed as of PHP 5.3, in which the
bug was fixed.

Change-Id: Idd41a509858c0887df4f632b480b387bd74027b2

											
										
										
											2016-05-23 22:43:16 +00:00
+										return $this->primaryCollator->getSortKey( $string );
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+									}
-												Split Collation.php

Change-Id: I6abfecf91cdce83dd34b1e8aa8e0b35315f62742

											
										
										
											2016-04-03 08:23:20 +00:00
+									public function getFirstLetter( $string ) {
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+										$string = strval( $string );
 										if ( $string === '' ) {
 											return '';
 										}
 										$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
-												Adding support for numeric collation when using UCA collations

To use, add '-u-kn' to the end of a collation name and set it as
the value for $wgCategoryCollation.

Bug: T8948
Change-Id: Ica7908daf80624fa2648127114d01665e96234c0

											
										
										
											2016-07-15 03:47:52 +00:00
 										// If the first character is a CJK character, just return that character.
-												Use wikimedia/utfnormal library, add backwards-compatibility layer

This drops support for the custom utf8 normal PHP extension in favor
of the intl extension.

Bug: T90825
Change-Id: Ifbaeb2ef684217cf6187ccc4fb4d303f89608300

											
										
										
											2015-03-07 09:27:42 +00:00
+										if ( ord( $firstChar ) > 0x7f && self::isCjk( UtfNormal\Utils::utf8ToCodepoint( $firstChar ) ) ) {
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+											return $firstChar;
 										}
 										$sortKey = $this->getPrimarySortKey( $string );
 										// Do a binary search to find the correct letter to sort under
-												Create and move some functions for class ArrayUtils

Change-Id: Id9ca20925f49e314918810fb54b3819ba9cf9c39

											
										
										
											2012-10-18 09:33:15 +00:00
+										$min = ArrayUtils::findLowerBound(
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+											[ $this, 'getSortKeyByLetterIndex' ],
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+											$this->getFirstLetterCount(),
 											'strcmp',
 											$sortKey );
 										if ( $min === false ) {
 											// Before the first letter
 											return '';
 										}
-												Adding support for numeric collation when using UCA collations

To use, add '-u-kn' to the end of a collation name and set it as
the value for $wgCategoryCollation.

Bug: T8948
Change-Id: Ica7908daf80624fa2648127114d01665e96234c0

											
										
										
											2016-07-15 03:47:52 +00:00
 										$sortLetter = $this->getLetterByIndex( $min );
 										if ( $this->useNumericCollation ) {
 											// If the sort letter is a number, return '0–9' (or localized equivalent).
 											// ASCII value of 0 is 48. ASCII value of 9 is 57.
 											// Note that this also applies to non-Arabic numerals since they are
 											// mapped to Arabic numeral sort letters. For example, ২ sorts as 2.
 											if ( ord( $sortLetter ) >= 48 && ord( $sortLetter ) <= 57 ) {
 												$sortLetter = wfMessage( 'category-header-numerals' )->numParams( 0, 9 )->text();
 											}
 										}
 										return $sortLetter;
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+									}
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									/**
 									 * @since 1.16.3
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+									 * @return array
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									 */
-												Split Collation.php

Change-Id: I6abfecf91cdce83dd34b1e8aa8e0b35315f62742

											
										
										
											2016-04-03 08:23:20 +00:00
+									public function getFirstLetterData() {
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+										if ( $this->firstLetterData === null ) {
 											$cache = ObjectCache::getLocalServerInstance( CACHE_ANYTHING );
 											$cacheKey = $cache->makeKey(
 												'first-letters',
-												Clean up remaining get_class() uses

* get_class()        -> __CLASS__ (same as self::class)
* get_called_class() -> static::class
* get_class($this)   -> static::class

Change-Id: I1888a1897ecf4548a2e5a67a942e5c080dd7e3d3

											
										
										
											2017-03-07 02:14:14 +00:00
+												static::class,
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+												$this->locale,
 												$this->digitTransformLanguage->getCode(),
-												IcuCollation: Deprecate getICUVersion(), no need for PHP53 back-compat

Change-Id: If8dfdaf187b32b7b9a2c09a240416b9f481593f1

											
										
										
											2018-05-24 20:11:11 +00:00
+												INTL_ICU_VERSION,
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+												self::FIRST_LETTER_VERSION
 											);
 											$this->firstLetterData = $cache->getWithSetCallback( $cacheKey, $cache::TTL_WEEK, function () {
 												return $this->fetchFirstLetterData();
 											} );
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+										}
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+										return $this->firstLetterData;
 									}
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+									/**
 									 * @return array
 									 * @throws MWException
 									 */
 									private function fetchFirstLetterData() {
-												collation: Move first-letters-root to includes/collation/data

For consistency with other data files. Also, like the other data files:

* For automated fetching of the Unicode files,
  move the steps from Makefile to a bash script.

* Switch to a static array file format.

Change-Id: If07487950a270283b8eaeda9a507e723ed2d89c4

											
										
										
											2018-07-20 03:23:50 +00:00
+										global $IP;
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+										// Generate data from serialized data file
-												Use consts in IcuCollation class

Change-Id: I664e7ea57b98975a3ff1c0c78477c18eb56837b4

											
										
										
											2020-11-21 00:41:58 +00:00
+										if ( isset( self::TAILORING_FIRST_LETTERS[$this->locale] ) ) {
-												collation: Move first-letters-root to includes/collation/data

For consistency with other data files. Also, like the other data files:

* For automated fetching of the Unicode files,
  move the steps from Makefile to a bash script.

* Switch to a static array file format.

Change-Id: If07487950a270283b8eaeda9a507e723ed2d89c4

											
										
										
											2018-07-20 03:23:50 +00:00
+											$letters = require "$IP/includes/collation/data/first-letters-root.php";
-												IcuCollation::$tailoringFirstLetters: implement letter removal

This is necessary for Swedish, where 'Þ' ("thorn") - considered a
separate letter by default in the first-letters-root.ser file - is
sorted as 'th', causing unexpected output on category pages - words
starting with 'th'..'u' were placed under a heading with the thorn.

There were three obvious ways to do this:
* somehow include information that this letter is to be removed in the
  string itself, as in 'sv' => array( "Å", "Ä", "Ö", "-Þ" ) - could
  potentially clash with valid uses
* create a separate array other than $tailoringFirstLetters to store
  this information - would cause the data to be fragmented all over
  the file
* include information about letters to be removed in a separate key
  "linked" to the regular one, as in '-sv' => array( "Þ" ) - I see no
  obvious downsides, so this is what I ended up doing

Bug: 45446
Change-Id: I57e07a2027c391c5baa767a68f4409b9de7b4618

											
										
										
											2013-03-11 21:24:09 +00:00
+											// Append additional characters
-												Use consts in IcuCollation class

Change-Id: I664e7ea57b98975a3ff1c0c78477c18eb56837b4

											
										
										
											2020-11-21 00:41:58 +00:00
+											$letters = array_merge( $letters, self::TAILORING_FIRST_LETTERS[$this->locale] );
-												IcuCollation::$tailoringFirstLetters: implement letter removal

This is necessary for Swedish, where 'Þ' ("thorn") - considered a
separate letter by default in the first-letters-root.ser file - is
sorted as 'th', causing unexpected output on category pages - words
starting with 'th'..'u' were placed under a heading with the thorn.

There were three obvious ways to do this:
* somehow include information that this letter is to be removed in the
  string itself, as in 'sv' => array( "Å", "Ä", "Ö", "-Þ" ) - could
  potentially clash with valid uses
* create a separate array other than $tailoringFirstLetters to store
  this information - would cause the data to be fragmented all over
  the file
* include information about letters to be removed in a separate key
  "linked" to the regular one, as in '-sv' => array( "Þ" ) - I see no
  obvious downsides, so this is what I ended up doing

Bug: 45446
Change-Id: I57e07a2027c391c5baa767a68f4409b9de7b4618

											
										
										
											2013-03-11 21:24:09 +00:00
+											// Remove unnecessary ones, if any
-												Use consts in IcuCollation class

Change-Id: I664e7ea57b98975a3ff1c0c78477c18eb56837b4

											
										
										
											2020-11-21 00:41:58 +00:00
+											if ( isset( self::TAILORING_FIRST_LETTERS['-' . $this->locale] ) ) {
 												$letters = array_diff( $letters, self::TAILORING_FIRST_LETTERS['-' . $this->locale] );
-												IcuCollation::$tailoringFirstLetters: implement letter removal

This is necessary for Swedish, where 'Þ' ("thorn") - considered a
separate letter by default in the first-letters-root.ser file - is
sorted as 'th', causing unexpected output on category pages - words
starting with 'th'..'u' were placed under a heading with the thorn.

There were three obvious ways to do this:
* somehow include information that this letter is to be removed in the
  string itself, as in 'sv' => array( "Å", "Ä", "Ö", "-Þ" ) - could
  potentially clash with valid uses
* create a separate array other than $tailoringFirstLetters to store
  this information - would cause the data to be fragmented all over
  the file
* include information about letters to be removed in a separate key
  "linked" to the regular one, as in '-sv' => array( "Þ" ) - I see no
  obvious downsides, so this is what I ended up doing

Bug: 45446
Change-Id: I57e07a2027c391c5baa767a68f4409b9de7b4618

											
										
										
											2013-03-11 21:24:09 +00:00
+											}
-												IcuCollation: Sort digits under localised digits' headings

Previously both '1' and '۱' ("DIGIT ONE" and "EXTENDED ARABIC-INDIC
DIGIT ONE") were sorted under '1' heading, regardless of collation
locale.

Now they will be both sorted under localised heading name (transformed
using Language#formatNum), for example '1' for 'uca-en' collation or
'۱' for 'uca-fa' collation.

Bug: 55630
Change-Id: I0b745d955a6e72f53873c95648087aa5f90a8852

											
										
										
											2013-10-12 22:21:01 +00:00
+											// Apply digit transforms
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+											$digits = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ];
-												IcuCollation: Sort digits under localised digits' headings

Previously both '1' and '۱' ("DIGIT ONE" and "EXTENDED ARABIC-INDIC
DIGIT ONE") were sorted under '1' heading, regardless of collation
locale.

Now they will be both sorted under localised heading name (transformed
using Language#formatNum), for example '1' for 'uca-en' collation or
'۱' for 'uca-fa' collation.

Bug: 55630
Change-Id: I0b745d955a6e72f53873c95648087aa5f90a8852

											
										
										
											2013-10-12 22:21:01 +00:00
+											$letters = array_diff( $letters, $digits );
 											foreach ( $digits as $digit ) {
-												Language: hard deprecate the `noSeparators` parameter to ::formatNum

Code should use Language::formatNumNoSeparators() instead, which has
existed since MW 1.21.

Code search:
https://codesearch.wmcloud.org/search/?q=formatNum%5C%28%5B%5E%29%5D*%2C&i=nope&files=&repos=

Depends-On: I95c365e2535bb3c47bed69a9b702c8f13d9fab87
Depends-On: I012434d5f6c749fec45a6c160e8d5d03686192e9
Depends-On: If3de5645a92514f605d4117fea3a820ed6c86624
Change-Id: I58a66975e505f16d8db5d663a9ca225535277983

											
										
										
											2020-09-09 18:04:26 +00:00
+												$letters[] = $this->digitTransformLanguage->formatNumNoSeparators( $digit );
-												IcuCollation: Sort digits under localised digits' headings

Previously both '1' and '۱' ("DIGIT ONE" and "EXTENDED ARABIC-INDIC
DIGIT ONE") were sorted under '1' heading, regardless of collation
locale.

Now they will be both sorted under localised heading name (transformed
using Language#formatNum), for example '1' for 'uca-en' collation or
'۱' for 'uca-fa' collation.

Bug: 55630
Change-Id: I0b745d955a6e72f53873c95648087aa5f90a8852

											
										
										
											2013-10-12 22:21:01 +00:00
+											}
-												Collapse some nested if statements

Change-Id: I9a97325d738d09370d29d35d5254bc0dadc57ff4

											
										
										
											2019-03-29 20:12:24 +00:00
+										} elseif ( $this->locale === 'root' ) {
 											$letters = require "$IP/includes/collation/data/first-letters-root.php";
-												(bug 43799) create language-specific collations for category sorting

This allows one to *finally* get articles to be correctly sorted on
category pages for 67 languages based in latin, greek and cyrillic
alphabets.

Fixes bug 29788, bug 41040, and bug 42412 (implementing collations for
Swedish, Polish, Ukrainian).

Full list of language codes this adds support for: af, ast, az, be,
bg, br, bs, ca, co, cs, cy, da, de, dsb, el, en, eo, es, et, eu, fi,
fo, fr, fur, fy, ga, gd, gl, hr, hsb, hu, is, it, kk, kl, ku, ky, la,
lb, lt, lv, mk, mo, mt, nl, no, oc, pl, pt, rm, ro, ru, rup, sco, sk,
sl, smn, sq, sr, sv, tk, tl, tr, tt, uk, uz, vi.

* Include data about first-letter characters for 67 language
  tailorings. This data was generated based on
  http://developer.mimer.com/charts/tailorings.htm by a Ruby script
  (https://www.mediawiki.org/wiki/User:Matma_Rex/generateCollationTailoringData.rb),
  then adjusted by hand (removed duplicate definitions for Spanish and
  German, changed code fil -> tl (Filipino -> Tagalog)).

* Mark languages verified by native speakers (currently only pl
  (Polish) I verified by myself and fi (Finnish) checked by Niklas).

* Allow for collations named like 'uca-<langcode>', mapping them to
  IcuCollation with appropriate parameter. The code doesn't check if
  we actually have data for given language, as it's checked after the
  IcuCollation class instance is constructed.

* Add the tailoring data to the default first-letter file (for root
  collation) before it's cached for given locale.

Change-Id: I838484b9aaf23945fe7880fef2e3da5f5c06877f

											
										
										
											2013-02-18 21:09:16 +00:00
+										} else {
-												Collapse some nested if statements

Change-Id: I9a97325d738d09370d29d35d5254bc0dadc57ff4

											
										
										
											2019-03-29 20:12:24 +00:00
+											// FIXME: Is this still used?
-												Deprecate a bunch of global functions

* wfAcceptToPrefs
* wfClearOutputBuffers
* wfConfiguredReadOnlyReason
* wfDebugMem
* wfGetPrecompiledData
* wfNegotiateType

Bug: T264976
Bug: T264979
Bug: T264981
Bug: T264983
Bug: T264984
Bug: T264985
Change-Id: Ia05bc84e4d1be7c8a02472f32e2c009e4bb32032

											
										
										
											2020-12-18 19:57:09 +00:00
+											$letters = $this->getPrecompiledData( "first-letters-{$this->locale}.ser" );
-												Collapse some nested if statements

Change-Id: I9a97325d738d09370d29d35d5254bc0dadc57ff4

											
										
										
											2019-03-29 20:12:24 +00:00
+											if ( $letters === false ) {
 												throw new MWException( "MediaWiki does not support ICU locale " .
 													"\"{$this->locale}\"" );
-												(bug 43799) create language-specific collations for category sorting

This allows one to *finally* get articles to be correctly sorted on
category pages for 67 languages based in latin, greek and cyrillic
alphabets.

Fixes bug 29788, bug 41040, and bug 42412 (implementing collations for
Swedish, Polish, Ukrainian).

Full list of language codes this adds support for: af, ast, az, be,
bg, br, bs, ca, co, cs, cy, da, de, dsb, el, en, eo, es, et, eu, fi,
fo, fr, fur, fy, ga, gd, gl, hr, hsb, hu, is, it, kk, kl, ku, ky, la,
lb, lt, lv, mk, mo, mt, nl, no, oc, pl, pt, rm, ro, ru, rup, sco, sk,
sl, smn, sq, sr, sv, tk, tl, tr, tt, uk, uz, vi.

* Include data about first-letter characters for 67 language
  tailorings. This data was generated based on
  http://developer.mimer.com/charts/tailorings.htm by a Ruby script
  (https://www.mediawiki.org/wiki/User:Matma_Rex/generateCollationTailoringData.rb),
  then adjusted by hand (removed duplicate definitions for Spanish and
  German, changed code fil -> tl (Filipino -> Tagalog)).

* Mark languages verified by native speakers (currently only pl
  (Polish) I verified by myself and fi (Finnish) checked by Niklas).

* Allow for collations named like 'uca-<langcode>', mapping them to
  IcuCollation with appropriate parameter. The code doesn't check if
  we actually have data for given language, as it's checked after the
  IcuCollation class instance is constructed.

* Add the tailoring data to the default first-letter file (for root
  collation) before it's cached for given locale.

Change-Id: I838484b9aaf23945fe7880fef2e3da5f5c06877f

											
										
										
											2013-02-18 21:09:16 +00:00
+											}
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+										}
-												Change some line comments to multi line comments

This allows empty lines inside the comment as found by the
MediaWiki.WhiteSpace.SpaceBeforeSingleLineComment.EmptyComment sniff

Change-Id: Iac155bbda4a84562db2b452baeae9b8973899453

											
										
										
											2015-10-14 07:40:50 +00:00
+										/* Sort the letters.
 										 *
 										 * It's impossible to have the precompiled data file properly sorted,
 										 * because the sort order changes depending on ICU version. If the
 										 * array is not properly sorted, the binary search will return random
 										 * results.
 										 *
 										 * We also take this opportunity to remove primary collisions.
 										 */
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										$letterMap = [];
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+										foreach ( $letters as $letter ) {
 											$key = $this->getPrimarySortKey( $letter );
 											if ( isset( $letterMap[$key] ) ) {
-												IcuCollation: Use codepoint as tiebreaker when getting first-letters

This prevents unexpected cuneiform digits from acting as headings for
2 and 3 on category pages.

Bug: T187645
Change-Id: I0424a24769899cb23b28704f97e1002fa44999fd

											
										
										
											2018-05-08 11:43:10 +00:00
+												// Primary collision (two characters with the same sort position).
 												// Keep whichever one sorts first in the main collator.
 												$comp = $this->mainCollator->compare( $letter, $letterMap[$key] );
-												Remove terminating line breaks from debug messages

A terminating line break has not been required in wfDebug() since 2014,
however no migration was done. Some of these line breaks found their way
into LoggerInterface::debug() calls, where they mess up the formatting
of the debug log.

So, remove terminating line breaks from wfDebug() and
LoggerInterface::debug() calls.

Also:
* Fix the stripping of leading line breaks from the log header emitted
  by Setup.php. This feature, accidentally broken in 2014, allows
  requests to be distinguished in the log file.
* Avoid using the global variable $self.
* Move the logging of the client IP back to Setup.php. It was moved to
  WebRequest in the hopes that it would not always be needed, however
  $wgRequest->getIP() is now called unconditionally a few lines up in
  Setup.php. This means that it is put in its proper place after the
  "start request" message.
* Wrap the log header code in a closure so that variables like $name do
  not leak into global scope.
* In Linker.php, remove a few instances of an unnecessary second
  parameter to wfDebug().

Change-Id: I96651d3044a95b9d210b51cb8368edc76bebbb9e

											
										
										
											2020-06-01 05:00:39 +00:00
+												wfDebug( "Primary collision '$letter' '{$letterMap[$key]}' (comparison: $comp)" );
-												IcuCollation: Use codepoint as tiebreaker when getting first-letters

This prevents unexpected cuneiform digits from acting as headings for
2 and 3 on category pages.

Bug: T187645
Change-Id: I0424a24769899cb23b28704f97e1002fa44999fd

											
										
										
											2018-05-08 11:43:10 +00:00
+												// If that also has a collision, use codepoint as a tiebreaker.
 												if ( $comp === 0 ) {
-												Use PHP 7 '<=>' operator in 'sort()' callbacks

`$a <=> $b` returns `-1` if `$a` is lesser, `1` if `$b` is lesser,
and `0` if they are equal, which are exactly the values 'sort()'
callbacks are supposed to return.

It also enables the neat idiom `$a[x] <=> $b[x] ?: $a[y] <=> $b[y]`
to sort arrays of objects first by 'x', and by 'y' if they are equal.

* Replace a common pattern like `return $a < $b ? -1 : 1` with the
  new operator (and similar patterns with the variables, the numbers
  or the comparison inverted). Some of the uses were previously not
  correctly handling the variables being equal; this is now
  automatically fixed.
* Also replace `return $a - $b`, which is equivalent to `return
  $a <=> $b` if both variables are integers but less intuitive.
* (Do not replace `return strcmp( $a, $b )`. It is also equivalent
  when both variables are strings, but if any of the variables is not,
  'strcmp()' converts it to a string before comparison, which could
  give different results than '<=>', so changing this would require
  careful review and isn't worth it.)
* Also replace `return $a > $b`, which presumably sort of works most
  of the time (returns `1` if `$b` is lesser, and `0` if they are
  equal or `$a` is lesser) but is erroneous.

Change-Id: I19a3d2fc8fcdb208c10330bd7a42c4e05d7f5cf3

											
										
										
											2017-10-06 20:39:13 +00:00
+													$comp = UtfNormal\Utils::utf8ToCodepoint( $letter ) <=>
-												IcuCollation: Use codepoint as tiebreaker when getting first-letters

This prevents unexpected cuneiform digits from acting as headings for
2 and 3 on category pages.

Bug: T187645
Change-Id: I0424a24769899cb23b28704f97e1002fa44999fd

											
										
										
											2018-05-08 11:43:10 +00:00
+														UtfNormal\Utils::utf8ToCodepoint( $letterMap[$key] );
 												}
 												if ( $comp < 0 ) {
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+													$letterMap[$key] = $letter;
 												}
 											} else {
 												$letterMap[$key] = $letter;
 											}
 										}
 										ksort( $letterMap, SORT_STRING );
-												Change some line comments to multi line comments

This allows empty lines inside the comment as found by the
MediaWiki.WhiteSpace.SpaceBeforeSingleLineComment.EmptyComment sniff

Change-Id: Iac155bbda4a84562db2b452baeae9b8973899453

											
										
										
											2015-10-14 07:40:50 +00:00
 										/* Remove duplicate prefixes. Basically if something has a sortkey
 										 * which is a prefix of some other sortkey, then it is an
 										 * expansion and probably should not be considered a section
 										 * header.
 										 *
 										 * For example 'þ' is sometimes sorted as if it is the letters
 										 * 'th'. Other times it is its own primary element. Another
 										 * example is '₨'. Sometimes it's a currency symbol. Sometimes it
 										 * is an 'R' followed by an 's'.
 										 *
 										 * Additionally an expanded element should always sort directly
 										 * after its first element due to the way sortkeys work.
 										 *
 										 * UCA sortkey elements are of variable length but no collation
 										 * element should be a prefix of some other element, so I think
 										 * this is safe. See:
 										 * - https://ssl.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
-												Update link target

Link redirected on website

Change-Id: Ib7dafdf63dfe1240b2f78c9296999db6ef454ed3

											
										
										
											2021-10-13 13:29:13 +00:00
+										 * - https://icu.unicode.org/design/collation/uca-weight-allocation
-												Change some line comments to multi line comments

This allows empty lines inside the comment as found by the
MediaWiki.WhiteSpace.SpaceBeforeSingleLineComment.EmptyComment sniff

Change-Id: Iac155bbda4a84562db2b452baeae9b8973899453

											
										
										
											2015-10-14 07:40:50 +00:00
+										 *
 										 * Additionally, there is something called primary compression to
 										 * worry about. Basically, if you have two primary elements that
 										 * are more than one byte and both start with the same byte then
 										 * the first byte is dropped on the second primary. Additionally
 										 * either \x03 or \xFF may be added to mean that the next primary
 										 * does not start with the first byte of the first primary.
 										 *
 										 * This shouldn't matter much, as the first primary is not
 										 * changed, and that is what we are comparing against.
 										 *
 										 * tl;dr: This makes some assumptions about how icu implements
 										 * collations. It seems incredibly unlikely these assumptions
 										 * will change, but nonetheless they are assumptions.
 										 */
-												Remove first letters that have an overlapping prefix.

First letters are supposed to be primary collation elements.
However, we do not want expansions to be considered
as firstletters (aka thorn "þ" -> "th" which isn't
the same as any other first letter (since "t" !== "th" )
however if þ was a first letter, the word "the" and
even worse the word "too" would be sorted under it, which
is wrong.

Looking for feedback if this all sounds sane. I have tested
it, it got rid of the contractions while at the same time
not removing any letter it wasn't supposed to.

Once this is merged, we could get rid of all the
-<langcode> entries. The other firstLetter array
entries for tailorings could be merged into
generateCollationData.php too, since incorrect
things would get pruned automatically, which
would probably make the logic in Collation.php
simpler.

Bug: 43740
Change-Id: I4bd3d39ec2938a53e2c6728adc48ee6cf9778d74

											
										
										
											2013-03-24 03:09:43 +00:00
 										$prev = false;
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										$duplicatePrefixes = [];
-												Fixed spacing in files direct in includes folder

Added spaces before if, foreach
Added some braces for one line statements

Change-Id: Ibb8dd102db045522d12ff939075ba7420d95ab6b

											
										
										
											2013-04-20 22:49:30 +00:00
+										foreach ( $letterMap as $key => $value ) {
-												Remove first letters that have an overlapping prefix.

First letters are supposed to be primary collation elements.
However, we do not want expansions to be considered
as firstletters (aka thorn "þ" -> "th" which isn't
the same as any other first letter (since "t" !== "th" )
however if þ was a first letter, the word "the" and
even worse the word "too" would be sorted under it, which
is wrong.

Looking for feedback if this all sounds sane. I have tested
it, it got rid of the contractions while at the same time
not removing any letter it wasn't supposed to.

Once this is merged, we could get rid of all the
-<langcode> entries. The other firstLetter array
entries for tailorings could be merged into
generateCollationData.php too, since incorrect
things would get pruned automatically, which
would probably make the logic in Collation.php
simpler.

Bug: 43740
Change-Id: I4bd3d39ec2938a53e2c6728adc48ee6cf9778d74

											
										
										
											2013-03-24 03:09:43 +00:00
+											// Remove terminator byte. Otherwise the prefix
 											// comparison will get hung up on that.
 											$trimmedKey = rtrim( $key, "\0" );
 											if ( $prev === false || $prev === '' ) {
 												$prev = $trimmedKey;
 												// We don't yet have a collation element
 												// to compare against, so continue.
 												continue;
 											}
 											// Due to the fact the array is sorted, we only have
 											// to compare with the element directly previous
 											// to the current element (skipping expansions).
 											// An element "X" will always sort directly
 											// before "XZ" (Unless we have "XY", but we
 											// do not update $prev in that case).
 											if ( substr( $trimmedKey, 0, strlen( $prev ) ) === $prev ) {
 												$duplicatePrefixes[] = $key;
 												// If this is an expansion, we don't want to
 												// compare the next element to this element,
 												// but to what is currently $prev
 												continue;
 											}
 											$prev = $trimmedKey;
 										}
-												Fixed spacing in files direct in includes folder

Added spaces before if, foreach
Added some braces for one line statements

Change-Id: Ibb8dd102db045522d12ff939075ba7420d95ab6b

											
										
										
											2013-04-20 22:49:30 +00:00
+										foreach ( $duplicatePrefixes as $badKey ) {
-												Remove terminating line breaks from debug messages

A terminating line break has not been required in wfDebug() since 2014,
however no migration was done. Some of these line breaks found their way
into LoggerInterface::debug() calls, where they mess up the formatting
of the debug log.

So, remove terminating line breaks from wfDebug() and
LoggerInterface::debug() calls.

Also:
* Fix the stripping of leading line breaks from the log header emitted
  by Setup.php. This feature, accidentally broken in 2014, allows
  requests to be distinguished in the log file.
* Avoid using the global variable $self.
* Move the logging of the client IP back to Setup.php. It was moved to
  WebRequest in the hopes that it would not always be needed, however
  $wgRequest->getIP() is now called unconditionally a few lines up in
  Setup.php. This means that it is put in its proper place after the
  "start request" message.
* Wrap the log header code in a closure so that variables like $name do
  not leak into global scope.
* In Linker.php, remove a few instances of an unnecessary second
  parameter to wfDebug().

Change-Id: I96651d3044a95b9d210b51cb8368edc76bebbb9e

											
										
										
											2020-06-01 05:00:39 +00:00
+											wfDebug( "Removing '{$letterMap[$badKey]}' from first letters." );
-												Remove first letters that have an overlapping prefix.

First letters are supposed to be primary collation elements.
However, we do not want expansions to be considered
as firstletters (aka thorn "þ" -> "th" which isn't
the same as any other first letter (since "t" !== "th" )
however if þ was a first letter, the word "the" and
even worse the word "too" would be sorted under it, which
is wrong.

Looking for feedback if this all sounds sane. I have tested
it, it got rid of the contractions while at the same time
not removing any letter it wasn't supposed to.

Once this is merged, we could get rid of all the
-<langcode> entries. The other firstLetter array
entries for tailorings could be merged into
generateCollationData.php too, since incorrect
things would get pruned automatically, which
would probably make the logic in Collation.php
simpler.

Bug: 43740
Change-Id: I4bd3d39ec2938a53e2c6728adc48ee6cf9778d74

											
										
										
											2013-03-24 03:09:43 +00:00
+											unset( $letterMap[$badKey] );
 											// This code assumes that unsetting does not change sort order.
 										}
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										$data = [
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+											'chars' => array_values( $letterMap ),
-												Allow first letter data to be invalidated

Just a class constant for now, but that should suffice to deal with the
current emergency. Proper dependency tracking via the CacheDependency
hierarchy would be pretty cool in the long term.

Change-Id: Ibbe7fa2814434d4869aba20f628bd43269e611fa

											
										
										
											2013-03-13 03:53:20 +00:00
+											'keys' => array_keys( $letterMap ),
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										];
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
 										// Reduce memory usage before caching
 										unset( $letterMap );
 										return $data;
 									}
-												Deprecate a bunch of global functions

* wfAcceptToPrefs
* wfClearOutputBuffers
* wfConfiguredReadOnlyReason
* wfDebugMem
* wfGetPrecompiledData
* wfNegotiateType

Bug: T264976
Bug: T264979
Bug: T264981
Bug: T264983
Bug: T264984
Bug: T264985
Change-Id: Ia05bc84e4d1be7c8a02472f32e2c009e4bb32032

											
										
										
											2020-12-18 19:57:09 +00:00
+									/**
 									 * Get an object from the precompiled serialized directory
 									 *
 									 * Replaced use of wfGetPrecompiledData
 									 *
 									 * @param string $name
 									 * @return mixed The variable on success, false on failure
 									 */
 									private function getPrecompiledData( $name ) {
 										global $IP;
 										// Precompiled blobs are looked up under the "serialized" directory of $IP.
 										$path = "$IP/serialized/$name";
 										if ( !file_exists( $path ) ) {
 											return false;
 										}
 										$contents = file_get_contents( $path );
 										// A failed read (false) and an empty/falsy blob are both reported as failure.
 										return $contents ? unserialize( $contents ) : false;
 									}
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									/**
-												Improve some parameter docs

Add missing @return and @param to function docs and fixed some @param

Change-Id: I810727961057cfdcc274428b239af5975c57468d

											
										
										
											2017-09-09 20:47:04 +00:00
+									 * @param int $index
 									 * @return string
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									 * @since 1.16.3
 									 */
-												Split Collation.php

Change-Id: I6abfecf91cdce83dd34b1e8aa8e0b35315f62742

											
										
										
											2016-04-03 08:23:20 +00:00
+									public function getLetterByIndex( $index ) {
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+										return $this->getFirstLetterData()['chars'][$index];
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+									}
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									/**
-												Improve some parameter docs

Add missing @return and @param to function docs and fixed some @param

Change-Id: I810727961057cfdcc274428b239af5975c57468d

											
										
										
											2017-09-09 20:47:04 +00:00
+									 * @param int $index
 									 * @return string
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									 * @since 1.16.3
 									 */
-												Split Collation.php

Change-Id: I6abfecf91cdce83dd34b1e8aa8e0b35315f62742

											
										
										
											2016-04-03 08:23:20 +00:00
+									public function getSortKeyByLetterIndex( $index ) {
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+										return $this->getFirstLetterData()['keys'][$index];
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+									}
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									/**
-												Avoid PHP scalar type juggling in includes/ (part 2)

Continuation of e5444ea55a8000f0040.

Change-Id: I9f95e7de4e219dee3abcdd210bb708d949f378d0

											
										
										
											2019-09-15 15:12:06 +00:00
+									 * @return int
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									 * @since 1.16.3
 									 */
-												Split Collation.php

Change-Id: I6abfecf91cdce83dd34b1e8aa8e0b35315f62742

											
										
										
											2016-04-03 08:23:20 +00:00
+									public function getFirstLetterCount() {
-												collation: Refactor getFirstLetterData() cache handling

* Factor out fetchFirstLetterData() as a separate method.
* Move 'version' into the key instead of checking afterwards.
* Use getWithSetCallback() for the cache handling.
  (Depends on version being in the key).

Change-Id: I15bddf5d1dabcdcef47a938447ba59436bd8a294

											
										
										
											2016-04-19 21:27:22 +00:00
+										return count( $this->getFirstLetterData()['chars'] );
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+									}
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									/**
-												Adding support for numeric collation when using UCA collations

To use, add '-u-kn' to the end of a collation name and set it as
the value for $wgCategoryCollation.

Bug: T8948
Change-Id: Ica7908daf80624fa2648127114d01665e96234c0

											
										
										
											2016-07-15 03:47:52 +00:00
+									 * Test if a code point is a CJK (Chinese, Japanese, Korean) character
-												Improve some parameter docs

Add missing @return and @param to function docs and fixed some @param

Change-Id: I810727961057cfdcc274428b239af5975c57468d

											
										
										
											2017-09-09 20:47:04 +00:00
+									 * @param int $codepoint
 									 * @return bool
-												Add @since tags to Collation stuff

Change-Id: Iec56ac4d1418737d171f8faa9c8f498fba5383ee

											
										
										
											2016-04-03 08:36:49 +00:00
+									 * @since 1.16.3
 									 */
 									public static function isCjk( $codepoint ) {
-												Use consts in IcuCollation class

Change-Id: I664e7ea57b98975a3ff1c0c78477c18eb56837b4

											
										
										
											2020-11-21 00:41:58 +00:00
+										foreach ( self::CJK_BLOCKS as $block ) {
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+											if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
 												return true;
 											}
 										}
 										return false;
 									}
-												(bug 43801) add a getter for ICU version to ICUCollation

It will be necessary to be able to use correct version of Unicode
data files.

The constant INTL_ICU_VERSION this getter returns isn't really
documented. It is available since PHP 5.3.7 (see PHP bug 54561),
the getter will fail gracefully on older PHPs. It should be possible to
determine the ICU version on these by grepping the output of phpinfo(),
but I don't think such a minor improvement is worth such a huge hack.

Change-Id: Iee4b8380406ae71c980dfdd7b9fdd0b58ecb9cd0

											
										
										
											2013-01-18 21:36:32 +00:00
-												(bug 43801) add a getter for ICU version to ICUCollation

It will be necessary to be able to use correct version of Unicode
data files.

The constant INTL_ICU_VERSION this getter returns isn't really
documented. It is available since PHP 5.3.7 (see PHP bug 54561),
the getter will fail gracefully on older PHPs. It should be possible to
determine the ICU version on these by grepping the output of phpinfo(),
but I don't think such a minor improvement is worth such a huge hack.

Change-Id: I85353559439bfddee7c5ba90894d30dd8ef0e0e8

											
										
										
											2013-01-18 21:36:32 +00:00
+									/**
 									 * Return the version of Unicode appropriate for the version of ICU library
 									 * currently in use, or false when it can't be determined.
 									 *
 									 * @since 1.21
-												Follow-Ups to "Fixed some @params documentation"

Fix of inline comments of the following patch sets:
Follow-Up: I0056b4a8df243cfc0c5f25378de48f7a35170aca
Follow-Up: I7f605aa9e117b5fd80d9b1440864fe526d2b14a5
Follow-Up: I3622f216a2ca8ac1b5e51892be9f98665f65bc36
Follow-Up: I6627ba0e76d3577c40bf2473e0f78a5ad7368634
Follow-Up: Id75b5ecf648ca50f955b3bde3307c82c4366b102
Follow-Up: I4ca5231119f33039d91da3b57a41cd40719a576b

Change-Id: Id9bbe84b2820e9db44af5783411e955f55f643d4

											
										
										
											2014-04-23 11:39:49 +00:00
+									 * @return string|bool
-												(bug 43801) add a getter for ICU version to ICUCollation

It will be necessary to be able to use correct version of Unicode
data files.

The constant INTL_ICU_VERSION this getter returns isn't really
documented. It is available since PHP 5.3.7 (see PHP bug 54561),
the getter will fail gracefully on older PHPs. It should be possible to
determine the ICU version on these by grepping the output of phpinfo(),
but I don't think such a minor improvement is worth such a huge hack.

Change-Id: I85353559439bfddee7c5ba90894d30dd8ef0e0e8

											
										
										
											2013-01-18 21:36:32 +00:00
+									 */
-												Add missing public visibility on some methods

RSSFeed::formatTime and AtomFeed::formatTime are private

Change-Id: I6bf081c31c92e7130ae0ae527ba4a8f4635c7de2

											
										
										
											2020-02-29 20:07:39 +00:00
+									public static function getUnicodeVersionForICU() {
-												IcuCollation: Deprecate getICUVersion(), no need for PHP53 back-compat

Change-Id: If8dfdaf187b32b7b9a2c09a240416b9f481593f1

											
										
										
											2018-05-24 20:11:11 +00:00
+										$icuVersion = INTL_ICU_VERSION;
-												(bug 43801) add a getter for ICU version to ICUCollation

It will be necessary to be able to use correct version of Unicode
data files.

The constant INTL_ICU_VERSION this getter returns isn't really
documented. It is available since PHP 5.3.7 (see PHP bug 54561),
the getter will fail gracefully on older PHPs. It should be possible to
determine the ICU version on these by grepping the output of phpinfo(),
but I don't think such a minor improvement is worth such a huge hack.

Change-Id: I85353559439bfddee7c5ba90894d30dd8ef0e0e8

											
										
										
											2013-01-18 21:36:32 +00:00
+										if ( !$icuVersion ) {
 											return false;
 										}
 										$versionPrefix = substr( $icuVersion, 0, 3 );
-												Update link target

Link redirected on website

Change-Id: Ib7dafdf63dfe1240b2f78c9296999db6ef454ed3

											
										
										
											2021-10-13 13:29:13 +00:00
+										// Source: https://icu.unicode.org/download
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										$map = [
-												IcuCollation: Add some more icu to unicode version mappings

Change-Id: I08cf93e45a6422e819ba333e01a5b34e1c03a398

											
										
										
											2021-10-01 00:09:45 +00:00
+											'69.' => '13.0',
 											'68.' => '13.0',
-												collation: Add 64-67 ICU->Unicode mappings

67 not released yet, but due next month according to schedule

Change-Id: I3dedc025e9800bc46040fc606af2b16eb52841a0

											
										
										
											2020-03-21 00:50:57 +00:00
+											'67.' => '13.0',
 											'66.' => '13.0',
 											'65.' => '12.0',
 											'64.' => '12.0',
-												Add ICU mapping for versions 62 and 63

Change-Id: I5e1238e856d4149c30806e6b2cb3619c0c9c1dbf

											
										
										
											2018-10-18 19:03:59 +00:00
+											'63.' => '11.0',
 											'62.' => '11.0',
-												Add unicode mapping for ICU 60 and 61

Change-Id: Ifbbc8d7ecc788bc2c6b07a8ebba46a9648545786

											
										
										
											2018-05-24 21:28:19 +00:00
+											'61.' => '10.0',
 											'60.' => '10.0',
-												Add Unicode to ICU mappings for versions 58 and 59

Change-Id: I87a5e6ce3a44a2be1e6bf8adf2f98cd0a4745574

											
										
										
											2017-10-25 22:42:28 +00:00
+											'59.' => '9.0',
 											'58.' => '9.0',
-												Add Unicode to ICU mappings for versions 51-57

Change-Id: I35c2cdd2c56b491229f1f6d8b69b1de21af23aab

											
										
										
											2016-07-20 19:47:22 +00:00
+											'57.' => '8.0',
 											'56.' => '8.0',
 											'55.' => '7.0',
 											'54.' => '7.0',
 											'53.' => '6.3',
 											'52.' => '6.3',
 											'51.' => '6.2',
-												(bug 43801) add a getter for ICU version to ICUCollation

It will be necessary to be able to use correct version of Unicode
data files.

The constant INTL_ICU_VERSION this getter returns isn't really
documented. It is available since PHP 5.3.7 (see PHP bug 54561),
the getter will fail gracefully on older PHPs. It should be possible to
determine the ICU version on these by grepping the output of phpinfo(),
but I don't think such a minor improvement is worth such a huge hack.

Change-Id: I85353559439bfddee7c5ba90894d30dd8ef0e0e8

											
										
										
											2013-01-18 21:36:32 +00:00
+											'50.' => '6.2',
 											'49.' => '6.1',
 											'4.8' => '6.0',
 											'4.6' => '6.0',
 											'4.4' => '5.2',
 											'4.2' => '5.1',
 											'4.0' => '5.1',
 											'3.8' => '5.0',
 											'3.6' => '5.0',
 											'3.4' => '4.1',
-												Convert all array() syntax to []

Per wikitech-l consensus:
 https://lists.wikimedia.org/pipermail/wikitech-l/2016-February/084821.html

Notes:
* Disabled CallTimePassByReference due to false positives (T127163)

Change-Id: I2c8ce713ce6600a0bb7bf67537c87044c7a45c4b

											
										
										
											2016-02-17 09:09:32 +00:00
+										];
-												(bug 43801) add a getter for ICU version to ICUCollation

It will be necessary to be able to use correct version of Unicode
data files.

The constant INTL_ICU_VERSION this getter returns isn't really
documented. It is available since PHP 5.3.7 (see PHP bug 54561),
the getter will fail gracefully on older PHPs. It should be possible to
determine the ICU version on these by grepping the output of phpinfo(),
but I don't think such a minor improvement is worth such a huge hack.

Change-Id: I85353559439bfddee7c5ba90894d30dd8ef0e0e8

											
										
										
											2013-01-18 21:36:32 +00:00
-												Use PHP 7 '??' operator instead of if-then-else

Change-Id: I790b86e2e9e3e41386144637659516a4bfca1cfe

											
										
										
											2018-06-12 20:44:33 +00:00
+										return $map[$versionPrefix] ?? false;
-												(bug 43801) add a getter for ICU version to ICUCollation

It will be necessary to be able to use correct version of Unicode
data files.

The constant INTL_ICU_VERSION this getter returns isn't really
documented. It is available since PHP 5.3.7 (see PHP bug 54561),
the getter will fail gracefully on older PHPs. It should be possible to
determine the ICU version on these by grepping the output of phpinfo(),
but I don't think such a minor improvement is worth such a huge hack.

Change-Id: I85353559439bfddee7c5ba90894d30dd8ef0e0e8

											
										
										
											2013-01-18 21:36:32 +00:00
+									}
-												* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables. 
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.


											
										
										
											2011-01-17 14:02:22 +00:00
+								}