* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables.

* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters". * Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. * Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers. * Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work. * Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.
2011-01-17 14:02:22 +00:00 · 2011-01-17 14:02:22 +00:00 · eaeea84b44
commit eaeea84b44
parent 14d576cd86
11 changed files with 727 additions and 73 deletions
--- a/includes/AutoLoader.php
+++ b/includes/AutoLoader.php
@ -43,6 +43,7 @@ $wgAutoloadLocalClasses = array(
 	'ChangesFeed' => 'includes/ChangesFeed.php',
 	'ChangeTags' => 'includes/ChangeTags.php',
 	'ChannelFeed' => 'includes/Feed.php',
 	'Collation' => 'includes/Collation.php',
 	'Cookie' => 'includes/HttpFunctions.php',
 	'CookieJar' => 'includes/HttpFunctions.php',
 	'ConcatenatedGzipHistoryBlob' => 'includes/HistoryBlob.php',
@ -127,6 +128,7 @@ $wgAutoloadLocalClasses = array(
 	'HTMLInfoField' => 'includes/HTMLForm.php',
 	'Http' => 'includes/HttpFunctions.php',
 	'HttpRequest' => 'includes/HttpFunctions.old.php',
 	'IcuCollation' => 'includes/Collation.php',
 	'ImageGallery' => 'includes/ImageGallery.php',
 	'ImageHistoryList' => 'includes/ImagePage.php',
 	'ImageHistoryPseudoPager' => 'includes/ImagePage.php',
@ -243,6 +245,7 @@ $wgAutoloadLocalClasses = array(
 	'TitleListDependency' => 'includes/CacheDependency.php',
 	'Token' => 'includes/Token.php',
 	'UnlistedSpecialPage' => 'includes/SpecialPage.php',
 	'UppercaseCollation' => 'includes/Collation.php',
 	'User' => 'includes/User.php',
 	'UserArray' => 'includes/UserArray.php',
 	'UserArrayFromResult' => 'includes/UserArray.php',
--- a/includes/CategoryPage.php
+++ b/includes/CategoryPage.php
@ -90,7 +90,7 @@ class CategoryViewer {
 		$children, $children_start_char,
 		$showGallery, $gallery,
 		$imgsNoGalley, $imgsNoGallery_start_char,
-		$skin;
+		$skin, $collation;
 	# Category object for this page
 	private $cat;
 	# The original query array, to be used in generating paging links.
@ -104,6 +104,7 @@ class CategoryViewer {
 		$this->limit = $wgCategoryPagingLimit;
 		$this->cat = Category::newFromTitle( $title );
 		$this->query = $query;
 		$this->collation = Collation::singleton();
 		unset( $this->query['title'] );
 	}
@ -212,7 +213,7 @@ class CategoryViewer {
 			$word = $sortkey;
 		}
-		$firstChar = $wgContLang->firstLetterForLists( $word );
+		$firstChar = $this->collation->getFirstLetter( $word );
 		return $wgContLang->convert( $firstChar );
 	}
@ -241,7 +242,8 @@ class CategoryViewer {
 				) . '</span>'
 			: $this->getSkin()->link( $title );
-			$this->imgsNoGallery_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
+			$this->imgsNoGallery_start_char[] = $wgContLang->convert( 
 				$this->collation->getFirstLetter( $sortkey ) );
 		}
 	}
@ -261,7 +263,8 @@ class CategoryViewer {
 				) . '</span>'
 			: $this->getSkin()->link( $title );
-		$this->articles_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
+		$this->articles_start_char[] = $wgContLang->convert( 
 			$this->collation->getFirstLetter( $sortkey ) );
 	}
 	function finaliseCategoryState() {
@ -280,8 +283,6 @@ class CategoryViewer {
 	}
 	function doCategoryQuery() {
 		global $wgContLang;
 		$dbr = wfGetDB( DB_SLAVE, 'category' );
 		$this->nextPage = array(
@ -294,14 +295,14 @@ class CategoryViewer {
 		foreach ( array( 'page', 'subcat', 'file' ) as $type ) {
 			# Get the sortkeys for start/end, if applicable.  Note that if
 			# the collation in the database differs from the one
-			# $wgContLang is using, pagination might go totally haywire.
+			# set in $wgCategoryCollation, pagination might go totally haywire.
 			$extraConds = array( 'cl_type' => $type );
 			if ( $this->from[$type] !== null ) {
 				$extraConds[] = 'cl_sortkey >= '
-					. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->from[$type] ) );
+					. $dbr->addQuotes( $this->collation->getSortKey( $this->from[$type] ) );
 			} elseif ( $this->until[$type] !== null ) {
 				$extraConds[] = 'cl_sortkey < '
-					. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->until[$type] ) );
+					. $dbr->addQuotes( $this->collation->getSortKey( $this->until[$type] ) );
 				$this->flip[$type] = true;
 			}
--- a/includes/Collation.php
+++ b/includes/Collation.php
@ -0,0 +1,304 @@
 <?php
 /**
  * Base class for the category collations selectable through
  * $wgCategoryCollation. A collation turns strings into binary sort keys
  * and decides which "first letter" heading a string is listed under.
  */
 abstract class Collation {
 	static $instance;
 	/**
 	 * Get the shared collation object for the configured
 	 * $wgCategoryCollation, creating it on the first call.
 	 *
 	 * @return Collation
 	 */
 	static function singleton() {
 		if ( self::$instance ) {
 			return self::$instance;
 		}
 		global $wgCategoryCollation;
 		self::$instance = self::factory( $wgCategoryCollation );
 		return self::$instance;
 	}
 	/**
 	 * Create a collation object from its configuration name.
 	 *
 	 * @param string $collationName Either 'uppercase' or 'uca-default'
 	 * @return Collation
 	 * @throws MWException if the name is not one of the known collations
 	 */
 	static function factory( $collationName ) {
 		if ( $collationName == 'uppercase' ) {
 			return new UppercaseCollation;
 		}
 		if ( $collationName == 'uca-default' ) {
 			return new IcuCollation( 'root' );
 		}
 		throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" );
 	}
 	/**
 	 * Given a string, convert it to a (hopefully short) key that can be used
 	 * for efficient sorting.  A binary sort according to the sortkeys
 	 * corresponds to a logical sort of the corresponding strings.
 	 *
 	 * Historically, calling code expected a null character to sort before
 	 * all others, with no other particular expectations.
 	 * NOTE(review): Title::getCategorySortkey() joins prefix and title with
 	 * a line break, so that expectation may be obsolete -- confirm.
 	 *
 	 * @param string $string UTF-8 string
 	 * @return string Binary sortkey
 	 */
 	abstract function getSortKey( $string );
 	/**
 	 * Given a string, return the logical "first letter" to be used for
 	 * grouping on category pages and so on.  This has to be coordinated
 	 * carefully with getSortKey(), or else the sorted list might jump
 	 * back and forth between the same "initial letters" or other pathological
 	 * behavior.  For instance, if you just return the first character, but "a"
 	 * sorts the same as "A" based on getSortKey(), then you might get a
 	 * list like
 	 *
 	 * == A ==
 	 * * [[Aardvark]]
 	 *
 	 * == a ==
 	 * * [[antelope]]
 	 *
 	 * == A ==
 	 * * [[Ape]]
 	 *
 	 * etc., assuming for the sake of argument that $wgCapitalLinks is false.
 	 *
 	 * @param string $string UTF-8 string
 	 * @return string UTF-8 string corresponding to the first letter of input
 	 */
 	abstract function getFirstLetter( $string );
 }
 /**
  * Collation whose sort key is the upper-cased input string and whose
  * first letter is the upper-cased first character.
  */
 class UppercaseCollation extends Collation {
 	var $lang;
 	function __construct() {
 		// Get a language object so that we can use the generic UTF-8 uppercase
 		// function there
 		$this->lang = Language::factory( 'en' );
 	}
 	/**
 	 * @param string $string UTF-8 string
 	 * @return string The string converted with Language::uc()
 	 */
 	function getSortKey( $string ) {
 		return $this->lang->uc( $string );
 	}
 	/**
 	 * @param string $string UTF-8 string, optionally starting with a null byte
 	 * @return string Upper-cased first character, ignoring one leading null byte
 	 */
 	function getFirstLetter( $string ) {
 		// isset() guards the offset read: indexing an empty string raises an
 		// "uninitialized string offset" notice/warning.
 		if ( isset( $string[0] ) && $string[0] == "\0" ) {
 			$string = substr( $string, 1 );
 		}
 		return $this->lang->ucfirst( $this->lang->firstChar( $string ) );
 	}
 }
 class IcuCollation extends Collation {
 	var $primaryCollator, $mainCollator, $locale;
 	var $firstLetterData;
 	/**
 	 * Unified CJK blocks.
 	 *
 	 * The same definition of a CJK block must be used for both Collation and 
 	 * generateCollationData.php. These blocks are omitted from the first 
 	 * letter data, as an optimisation measure and because the default UCA table 
 	 * is pretty useless for sorting Chinese text anyway. Japanese and Korean 
 	 * blocks are not included here, because they are smaller and more useful.
 	 */
 	static $cjkBlocks = array(
 		array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement
 		array( 0x2F00, 0x2FDF ), // Kangxi Radicals
 		array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters
 		array( 0x3000, 0x303F ), // CJK Symbols and Punctuation
 		array( 0x31C0, 0x31EF ), // CJK Strokes
 		array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months
 		array( 0x3300, 0x33FF ), // CJK Compatibility
 		array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A
 		array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs
 		array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs
 		array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms
 		array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B
 		array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C
 		array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D
 		array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement
 	);
 	const RECORD_LENGTH = 14;
 	/**
 	 * Create the main collator and a second, primary-strength collator
 	 * for the given locale.
 	 *
 	 * @param string $locale ICU locale name passed to Collator::create()
 	 * @throws MWException if the intl extension is not loaded, or if
 	 *     Collator::create() does not return a collator for $locale
 	 */
 	function __construct( $locale ) {
 		if ( !extension_loaded( 'intl' ) ) {
 			throw new MWException(
 				'An ICU collation was requested, but the intl extension is not available.' );
 		}
 		$this->locale = $locale;
 		$main = Collator::create( $locale );
 		$this->mainCollator = $main;
 		if ( !$main ) {
 			throw new MWException( "Invalid ICU locale specified for collation: $locale" );
 		}
 		// Same locale, but compared at primary strength only
 		$primary = Collator::create( $locale );
 		$primary->setStrength( Collator::PRIMARY );
 		$this->primaryCollator = $primary;
 	}
 	/**
 	 * Get the binary sort key of a string from the main collator.
 	 *
 	 * @param string $string UTF-8 string
 	 * @return string Binary sortkey (always a string, see the cast below)
 	 */
 	function getSortKey( $string ) {
 		wfSuppressWarnings();
 		$rawKey = $this->mainCollator->getSortKey( $string );
 		wfRestoreWarnings();
 		// Appending '' turns a non-string result such as false into ''
 		return $rawKey . '';
 	}
 	/**
 	 * Get the binary sort key of a string from the primary-strength
 	 * collator.
 	 *
 	 * @param string $string UTF-8 string
 	 * @return string Binary sortkey (always a string, see the cast below)
 	 */
 	function getPrimarySortKey( $string ) {
 		wfSuppressWarnings();
 		$rawKey = $this->primaryCollator->getSortKey( $string );
 		wfRestoreWarnings();
 		// Appending '' turns a non-string result such as false into ''
 		return $rawKey . '';
 	}
 	/**
 	 * Find the heading letter a string should be listed under.
 	 *
 	 * Characters in self::$cjkBlocks are returned as their own heading.
 	 * Everything else is looked up in the sorted first-letter table by
 	 * primary sort key.
 	 *
 	 * @param string $string UTF-8 string
 	 * @return string The heading letter, or '' for an empty string or a
 	 *     string that sorts before the first letter in the table
 	 */
 	function getFirstLetter( $string ) {
 		$string = strval( $string );
 		if ( $string === '' ) {
 			return '';
 		}
 		// Check for CJK; ASCII lead bytes can be ruled out without decoding
 		$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
 		$isMultibyte = ord( $firstChar ) > 0x7f;
 		if ( $isMultibyte && self::isCjk( utf8ToCodepoint( $firstChar ) ) ) {
 			return $firstChar;
 		}
 		$primaryKey = $this->getPrimarySortKey( $string );
 		// Do a binary search to find the correct letter to sort under
 		$index = $this->findLowerBound(
 			array( $this, 'getSortKeyByLetterIndex' ),
 			$this->getFirstLetterCount(),
 			'strcmp',
 			$primaryKey );
 		// false means the string sorts before the first letter
 		return $index === false ? '' : $this->getLetterByIndex( $index );
 	}
 	/**
 	 * Load the table of first letters and their primary sort keys,
 	 * ordered by sort key.
 	 *
 	 * The table is taken from $this->firstLetterData if already loaded,
 	 * then from the object cache, and is otherwise built from the
 	 * precompiled first-letters-<locale>.ser file and cached for a week.
 	 *
 	 * @return array With 'chars' (list of letters) and 'keys' (list of
 	 *     the matching primary sort keys, in the same order)
 	 * @throws MWException if there is no precompiled data for the locale
 	 */
 	function getFirstLetterData() {
 		if ( $this->firstLetterData !== null ) {
 			return $this->firstLetterData;
 		}
 		$cache = wfGetCache( CACHE_ANYTHING );
 		// The cached sort keys and their order are only valid for the ICU
 		// version that generated them, so the version is part of the key.
 		// Otherwise an ICU upgrade would leave a stale table in the cache
 		// and the binary search would compare against outdated keys.
 		$icuVersion = defined( 'INTL_ICU_VERSION' ) ? INTL_ICU_VERSION : 'unknown';
 		$cacheKey = wfMemcKey( 'first-letters', $this->locale, $icuVersion );
 		$cacheEntry = $cache->get( $cacheKey );
 		if ( $cacheEntry ) {
 			$this->firstLetterData = $cacheEntry;
 			return $this->firstLetterData;
 		}
 		// Generate data from serialized data file
 		$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
 		if ( $letters === false ) {
 			throw new MWException( "MediaWiki does not support ICU locale " .
 				"\"{$this->locale}\"" );
 		}
 		// Sort the letters.
 		//
 		// It's impossible to have the precompiled data file properly sorted,
 		// because the sort order changes depending on ICU version. If the 
 		// array is not properly sorted, the binary search will return random 
 		// results. 
 		//
 		// We also take this opportunity to remove primary collisions.
 		$letterMap = array();
 		foreach ( $letters as $letter ) {
 			$key = $this->getPrimarySortKey( $letter );
 			if ( isset( $letterMap[$key] ) ) {
 				// Primary collision
 				// Keep whichever one sorts first in the main collator
 				if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
 					$letterMap[$key] = $letter;
 				}
 			} else {
 				$letterMap[$key] = $letter;
 			}
 		}
 		ksort( $letterMap, SORT_STRING );
 		$data = array(
 			'chars' => array_values( $letterMap ),
 			'keys' => array_keys( $letterMap )
 		);
 		// Reduce memory usage before caching
 		unset( $letterMap );
 		// Save to cache
 		$this->firstLetterData = $data;
 		$cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ );
 		return $data;
 	}
 	/**
 	 * Get the letter at a position in the sorted first-letter table,
 	 * loading the table if necessary.
 	 *
 	 * @param int $index Position in the table
 	 * @return string The letter stored at that position
 	 */
 	function getLetterByIndex( $index ) {
 		$table = $this->getFirstLetterData();
 		return $table['chars'][$index];
 	}
 	/**
 	 * Get the primary sort key at a position in the sorted first-letter
 	 * table, loading the table if necessary.
 	 *
 	 * @param int $index Position in the table
 	 * @return string The sort key stored at that position
 	 */
 	function getSortKeyByLetterIndex( $index ) {
 		$table = $this->getFirstLetterData();
 		return $table['keys'][$index];
 	}
 	/**
 	 * Count the entries of the first-letter table, loading the table if
 	 * necessary.
 	 *
 	 * @return int Number of letters in the table
 	 */
 	function getFirstLetterCount() {
 		$table = $this->getFirstLetterData();
 		return count( $table['chars'] );
 	}
 	/**
 	 * Do a binary search, and return the index of the largest item that sorts 
 	 * less than or equal to the target value.
 	 *
 	 * @param $valueCallback A function to call to get the value with 
 	 *     a given array index.
 	 * @param $valueCount The number of items accessible via $valueCallback, 
 	 *     indexed from 0 to $valueCount - 1
 	 * @param $comparisonCallback A callback to compare two values, returning 
 	 *     -1, 0 or 1 in the style of strcmp().
 	 * @param $target The target value to find.
 	 *
 	 * @return The item index of the lower bound, or false if the target value
 	 *     sorts before all items.
 	 */
 	function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) {
 		// Nothing to search: every target sorts "before all items"
 		if ( $valueCount <= 0 ) {
 			return false;
 		}
 		// Invariant: the answer, if any, lies in [$min, $max). $max is an
 		// exclusive bound so that the last item can be returned when the
 		// target sorts after it.
 		$min = 0;
 		$max = $valueCount;
 		do {
 			$mid = $min + ( ( $max - $min ) >> 1 );
 			$item = call_user_func( $valueCallback, $mid );
 			$comparison = call_user_func( $comparisonCallback, $target, $item );
 			if ( $comparison > 0 ) {
 				$min = $mid;
 			} elseif ( $comparison == 0 ) {
 				$min = $mid;
 				break;
 			} else {
 				$max = $mid;
 			}
 		} while ( $min < $max - 1 );
 		// The loop never proves that item 0 is <= target, so check it
 		// explicitly when the search ended there.
 		if ( $min == 0 ) {
 			$item = call_user_func( $valueCallback, $min );
 			$comparison = call_user_func( $comparisonCallback, $target, $item );
 			if ( $comparison < 0 ) {
 				// Before the first item
 				return false;
 			}
 		}
 		return $min;
 	}
 	/**
 	 * Test whether a codepoint falls inside one of the ranges listed in
 	 * self::$cjkBlocks (both range ends inclusive).
 	 *
 	 * @param int $codepoint Unicode codepoint
 	 * @return bool
 	 */
 	static function isCjk( $codepoint ) {
 		foreach ( self::$cjkBlocks as $range ) {
 			list( $first, $last ) = $range;
 			if ( $codepoint < $first || $codepoint > $last ) {
 				continue;
 			}
 			return true;
 		}
 		return false;
 	}
 }
--- a/includes/DefaultSettings.php
+++ b/includes/DefaultSettings.php
@ -4611,15 +4611,26 @@ $wgCategoryMagicGallery = true;
 $wgCategoryPagingLimit = 200;
 /**
- * A version indicator for collations that will be stored in cl_collation for
+ * Specify how category names should be sorted, when listed on a category page. 
- * all new rows.  Used when the collation algorithm changes: a script checks
+ * A sorting scheme is also known as a collation.
 * for all rows where cl_collation != $wgCategoryCollation and regenerates
 * cl_sortkey based on the page name and cl_sortkey_prefix.
 *
- * Currently only supports 'uppercase2', which just uppercases the string.  This
+ * Available values are:
- * is a dummy collation, to be replaced later by real ones.
+ *
 *   - uppercase: Converts the category name to upper case, and sorts by that.
 *
 *   - uca-default: Provides access to the Unicode Collation Algorithm with 
 *     the default element table. This is a compromise collation which sorts
 *     all languages in a mediocre way. However, it is better than "uppercase".
 *
 * To use the uca-default collation, you must have PHP's intl extension 
 * installed. See http://php.net/manual/en/intl.setup.php . The details of the 
 * resulting collation will depend on the version of ICU installed on the 
 * server.
 *
 * After you change this, you must run maintenance/updateCollation.php to fix
 * the sort keys in the database. 
 */
-$wgCategoryCollation = 'uppercase2';
+$wgCategoryCollation = 'uppercase';
 /** @} */ # End categories }
--- a/includes/LinksUpdate.php
+++ b/includes/LinksUpdate.php
@ -454,14 +454,14 @@ class LinksUpdate {
 			# (Title::moveTo() has had the same issue for a long time).
 			if ( $this->mTitle->getCategorySortkey() == $sortkey ) {
 				$prefix = '';
-				$sortkey = $wgContLang->convertToSortkey( $sortkey );
+				$sortkey = Collation::singleton()->getSortKey( $sortkey );
 			} else {
 				# Treat custom sortkeys as a prefix, so that if multiple
 				# things are forced to sort as '*' or something, they'll
 				# sort properly in the category rather than in page_id
 				# order or such.
 				$prefix = $sortkey;
-				$sortkey = $wgContLang->convertToSortkey(
+				$sortkey = Collation::singleton()->getSortKey(
 					$this->mTitle->getCategorySortkey( $prefix ) );
 			}
--- a/includes/Title.php
+++ b/includes/Title.php
@ -3088,8 +3088,6 @@ class Title {
 	 * @return Mixed true on success, getUserPermissionsErrors()-like array on failure
 	 */
 	public function moveTo( &$nt, $auth = true, $reason = '', $createRedirect = true ) {
 		global $wgContLang;
 		$err = $this->isValidMoveOperation( $nt, $auth, $reason );
 		if ( is_array( $err ) ) {
 			return $err;
@ -3129,7 +3127,8 @@ class Title {
 		);
 		$dbw->update( 'categorylinks',
 			array(
-				'cl_sortkey' => $wgContLang->convertToSortkey( $nt->getCategorySortkey( $prefix ) ),
+				'cl_sortkey' => Collation::singleton()->getSortKey( 
 					$nt->getCategorySortkey( $prefix ) ),
 				'cl_timestamp=cl_timestamp' ),
 			array( 'cl_from' => $pageid ),
 			__METHOD__ );
@ -4139,7 +4138,7 @@ class Title {
 	/**
 	 * Returns the raw sort key to be used for categories, with the specified
-	 * prefix.  This will be fed to Language::convertToSortkey() to get a
+	 * prefix.  This will be fed to Collation::getSortKey() to get a
 	 * binary sortkey that can be used for actual sorting.
 	 *
 	 * @param $prefix string The prefix to be used, specified using
@ -4153,7 +4152,7 @@ class Title {
 			# Separate with a line break, so the unprefixed part is only used as
 			# a tiebreaker when two pages have the exact same prefix -- a line
 			# break sorts before everything else (hopefully).
-			return "$prefix\0$unprefixed";
+			return "$prefix\n$unprefixed";
 		}
 		return $unprefixed;
 	}
--- a/languages/Language.php
+++ b/languages/Language.php
@ -2996,50 +2996,4 @@ class Language {
 	function getConvRuleTitle() {
 		return $this->mConverter->getConvRuleTitle();
 	}
 	/**
 	 * Given a string, convert it to a (hopefully short) key that can be used
 	 * for efficient sorting.  A binary sort according to the sortkeys
 	 * corresponds to a logical sort of the corresponding strings.  Current
 	 * code expects that a null character should sort before all others, but
 	 * has no other particular expectations (and that one can be changed if
 	 * necessary).
 	 *
 	 * @param string $string UTF-8 string
 	 * @return string Binary sortkey
 	 */
 	public function convertToSortkey( $string ) {
 		# Fake function for now
 		return $this->uc( $string );
 	}
 	/**
 	 * Given a string, return the logical "first letter" to be used for
 	 * grouping on category pages and so on.  This has to be coordinated
 	 * carefully with convertToSortkey(), or else the sorted list might jump
 	 * back and forth between the same "initial letters" or other pathological
 	 * behavior.  For instance, if you just return the first character, but "a"
 	 * sorts the same as "A" based on convertToSortkey(), then you might get a
 	 * list like
 	 *
 	 * == A ==
 	 * * [[Aardvark]]
 	 *
 	 * == a ==
 	 * * [[antelope]]
 	 *
 	 * == A ==
 	 * * [[Ape]]
 	 *
 	 * etc., assuming for the sake of argument that $wgCapitalLinks is false.
 	 *
 	 * @param string $string UTF-8 string
 	 * @return string UTF-8 string corresponding to the first letter of input
 	 */
 	public function firstLetterForLists( $string ) {
 		if ( $string[0] == "\0" ) {
 			$string = substr( $string, 1 );
 		}
 		return $this->uc( $this->firstChar( $string ) );
 	}
 }
--- a/maintenance/language/generateCollationData.php
+++ b/maintenance/language/generateCollationData.php
@ -0,0 +1,381 @@
 <?php
 require_once( dirname( __FILE__ ) .'/../Maintenance.php' );
 /**
 * Generate first letter data files for Collation.php
 */
 class GenerateCollationData extends Maintenance {
 	/** The directory with source data files in it */
 	var $dataDir;
 	/** The primary weights, indexed by codepoint */
 	var $weights;
 	/** 
 	 * A hashtable keyed by codepoint, where presence indicates that a character
 	 * has a decomposition mapping. This makes it non-preferred for group header
 	 * selection.
 	 */
 	var $mappedChars;
 	var $debugOutFile;
 	/**
 	 * Important tertiary weights from UTS #10 section 7.2
 	 */
 	const NORMAL_UPPERCASE = 0x08;
 	const NORMAL_HIRAGANA = 0X0E;
 	/**
 	 * Register the command line options of the script: --data-dir and
 	 * --debug-output.
 	 */
 	public function __construct() {
 		parent::__construct();
 		$dataDirHelp = 'A directory on the local filesystem ' .
 			'containing allkeys.txt and ucd.all.grouped.xml from unicode.org';
 		$debugHelp = 'Filename for sending debug output to';
 		$this->addOption( 'data-dir', $dataDirHelp, false, true );
 		$this->addOption( 'debug-output', $debugHelp, false, true );
 	}
 	/**
 	 * Script entry point: check that both source data files exist, open
 	 * the optional debug output file, then load the UCD and generate the
 	 * first-letter file. Exits with status 1 on any of the checked errors.
 	 */
 	public function execute() {
 		$this->dataDir = $this->getOption( 'data-dir', '.' );
 		// Required input file => message shown when it is missing
 		$requiredFiles = array(
 			'allkeys.txt' =>
 				"Unable to find allkeys.txt. Please download it from " .
 				"http://www.unicode.org/Public/UCA/latest/allkeys.txt and specify " .
 				"its location with --data-dir=<DIR>",
 			'ucd.all.grouped.xml' =>
 				"Unable to find ucd.all.grouped.xml. Please download it " .
 				"from http://www.unicode.org/Public/6.0.0/ucdxml/ucd.all.grouped.zip " .
 				"and specify its location with --data-dir=<DIR>",
 		);
 		foreach ( $requiredFiles as $fileName => $missingMessage ) {
 			if ( !file_exists( "{$this->dataDir}/$fileName" ) ) {
 				$this->error( $missingMessage );
 				exit( 1 );
 			}
 		}
 		$debugPath = $this->getOption( 'debug-output' );
 		if ( $debugPath ) {
 			$this->debugOutFile = fopen( $debugPath, 'w' );
 			if ( !$this->debugOutFile ) {
 				$this->error( "Unable to open debug output file for writing" );
 				exit( 1 );
 			}
 		}
 		$this->loadUcd();
 		$this->generateFirstChars();
 	}
 	/**
 	 * Read ucd.all.grouped.xml from the data directory and pass every
 	 * character record to charCallback().
 	 */
 	function loadUcd() {
 		$reader = new UcdXmlReader( "{$this->dataDir}/ucd.all.grouped.xml" );
 		$reader->readChars( array( $this, 'charCallback' ) );
 	}
 	/**
 	 * Callback for UcdXmlReader::readChars(). Records an implicit primary
 	 * weight in $this->weights for each character that is kept, and marks
 	 * characters that have a decomposition mapping in $this->mappedChars.
 	 *
 	 * @param array $data Character attributes; this method reads 'gc',
 	 *     'cp' (hexadecimal), 'block', 'UIdeo' and 'dm'
 	 */
 	function charCallback( $data ) {
 		// Skip non-printable characters: only general categories starting
 		// with L, N, P or S are kept
 		$category = substr( $data['gc'], 0, 1 );
 		if ( strpos( 'LNPS', $category ) === false ) {
 			return;
 		}
 		$cp = hexdec( $data['cp'] );
 		// Skip the CJK ideograph blocks, as an optimisation measure (UCA
 		// doesn't sort them properly anyway, without tailoring), and skip
 		// the composed Hangul syllables, since the bare Jamo are used as
 		// first letters instead.
 		if ( IcuCollation::isCjk( $cp ) || $data['block'] == 'Hangul Syllables' ) {
 			return;
 		}
 		// Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
 		if ( $data['UIdeo'] !== 'Y' ) {
 			$base = 0xFBC0;
 		} elseif ( $data['block'] == 'CJK Unified Ideographs'
 			|| $data['block'] == 'CJK Compatibility Ideographs' )
 		{
 			$base = 0xFB40;
 		} else {
 			$base = 0xFB80;
 		}
 		$high = $base + ( $cp >> 15 );
 		$low = ( $cp & 0x7fff ) | 0x8000;
 		$this->weights[$cp] = sprintf( ".%04X.%04X", $high, $low );
 		// '#' is treated as "no decomposition mapping"
 		if ( $data['dm'] !== '#' ) {
 			$this->mappedChars[$cp] = true;
 		}
 		// Progress output, once every 4096 codepoints
 		if ( $cp % 4096 == 0 ) {
 			echo $data['cp'] . "\n";
 		}
 	}
 	/**
 	 * Build the list of first letters and write it, serialized, to
 	 * $IP/serialized/first-letters-root.ser.
 	 *
 	 * Steps: replace the implicit weights in $this->weights with the
 	 * primary weights from allkeys.txt, group characters by primary
 	 * weight, drop groups whose weight sequence extends another group's
 	 * weight, then pick one header character per group. Exits with
 	 * status 1 if allkeys.txt or the output file cannot be opened.
 	 */
 	function generateFirstChars() {
 		$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
 		if ( !$file ) {
 			$this->error( "Unable to open allkeys.txt" );
 			exit( 1 );
 		}
 		global $IP;
 		$outFile = fopen( "$IP/serialized/first-letters-root.ser", 'w' );
 		if ( !$outFile ) {
 			$this->error( "Unable to open output file first-letters-root.ser" );
 			exit( 1 );
 		}
 		$goodTertiaryChars = array();
 		// For each character with an entry in allkeys.txt, overwrite the implicit
 		// entry in $this->weights that came from the UCD.
 		// Also gather a list of tertiary weights, for use in selecting the group header
 		while ( false !== ( $line = fgets( $file ) ) ) {
 			// We're only interested in single-character weights, pick them out with a regex
 			$line = trim( $line );
 			if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
 				continue;
 			}
 			$cp = hexdec( $m[1] );
 			$allWeights = trim( $m[2] );
 			$primary = '';
 			$tertiary = '';
 			if ( !isset( $this->weights[$cp] ) ) {
 				// Non-printable, ignore
 				continue;
 			}
 			// Each '['-separated element holds one set of weights; index 0
 			// of the matches is used as the primary weight and index 2 as
 			// the tertiary weight. Zero weights are left out.
 			foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
 				preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $m );
 				if ( !empty( $m[1] ) ) {
 					if ( $m[1][0] !== '0000' ) {
 						$primary .= '.' . $m[1][0];
 					}
 					if ( $m[1][2] !== '0000' ) {
 						$tertiary .= '.' . $m[1][2];
 					}
 				}
 			}
 			$this->weights[$cp] = $primary;
 			// .0008 and .000E are the values of the NORMAL_UPPERCASE and
 			// NORMAL_HIRAGANA class constants
 			if ( $tertiary === '.0008'
 				|| $tertiary === '.000E' ) 
 			{
 				$goodTertiaryChars[$cp] = true;
 			}
 		}
 		fclose( $file );
 		// Identify groups of characters with the same primary weight
 		$this->groups = array();
 		asort( $this->weights, SORT_STRING );
 		$prevWeight = reset( $this->weights );
 		$group = array();
 		foreach ( $this->weights as $cp => $weight ) {
 			if ( $weight !== $prevWeight ) {
 				$this->groups[$prevWeight] = $group;
 				$prevWeight = $weight;
 				if ( isset( $this->groups[$weight] ) ) {
 					$group = $this->groups[$weight];
 				} else {
 					$group = array();
 				}
 			}
 			$group[] = $cp;
 		}
 		if ( $group ) {
 			$this->groups[$prevWeight] = $group;
 		}
 		// If one character has a given primary weight sequence, and a second
 		// character has a longer primary weight sequence with an initial 
 		// portion equal to the first character, then remove the second 
 		// character. This avoids having characters like U+A732 (double A)
 		// polluting the basic latin sort area.
 		foreach ( $this->groups as $weight => $group ) {
 			if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
 				if ( isset( $this->groups[$m[1]] ) ) {
 					unset( $this->groups[$weight] );
 				}
 			}
 		}
 		ksort( $this->groups, SORT_STRING );
 		// Identify the header character in each group
 		$headerChars = array();
 		$prevChar = "\000";
 		$tertiaryCollator = new Collator( 'root' );
 		$primaryCollator = new Collator( 'root' );
 		$primaryCollator->setStrength( Collator::PRIMARY );
 		$numOutOfOrder = 0;
 		foreach ( $this->groups as $weight => $group ) {
 			$uncomposedChars = array();
 			$goodChars = array();
 			foreach ( $group as $cp ) {
 				if ( isset( $goodTertiaryChars[$cp] ) ) {
 					$goodChars[] = $cp;
 				}
 				if ( !isset( $this->mappedChars[$cp] ) ) {
 					$uncomposedChars[] = $cp;
 				}
 			}
 			// Preference order: good tertiary weight and no decomposition,
 			// then no decomposition, then anything in the group
 			$x = array_intersect( $goodChars, $uncomposedChars );
 			if ( !$x ) {
 				$x = $uncomposedChars;
 				if ( !$x ) {
 					$x = $group;
 				}
 			}
 			// Use ICU to pick the lowest sorting character in the selection
 			// NOTE(review): $x holds integer codepoints, not UTF-8 strings --
 			// confirm that Collator::sort() orders them as intended.
 			$tertiaryCollator->sort( $x );
 			$cp = $x[0];
 			$char = codepointToUtf8( $cp );
 			$headerChars[] = $char;
 			if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
 				$numOutOfOrder ++;
 				/*
 				printf( "Out of order: U+%05X > U+%05X\n",
 					utf8ToCodepoint( $prevChar ),
 					utf8ToCodepoint( $char ) );
 				 */
 			}
 			$prevChar = $char;
 			if ( $this->debugOutFile ) {
 				fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
 					implode( ' ', array_map( 'codepointToUtf8', $group ) ) ) );
 			}
 		}
 		print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";
 		fwrite( $outFile, serialize( $headerChars ) );
 		// Close the handle so the data is flushed and the resource released
 		fclose( $outFile );
 	}
 }
 class UcdXmlReader {
 	var $fileName;
 	var $callback;
 	var $groupAttrs;
 	var $xml;
 	var $blocks = array();
 	var $currentBlock;
 	/**
 	 * @param string $fileName Path of the XML file; it is only stored
 	 *     here and opened later by open()
 	 */
 	function __construct( $fileName ) {
 		$this->fileName = $fileName;
 	}
 	/**
 	 * Walk the XML file and call $callback once per codepoint, via
 	 * handleChar().
 	 *
 	 * @param callback $callback Called with one array of attributes per
 	 *     codepoint
 	 */
 	public function readChars( $callback ) {
 		// Load the block table first, and start at its first entry
 		$this->getBlocks();
 		$this->currentBlock = reset( $this->blocks );
 		$reader = $this->open();
 		$this->callback = $callback;
 		// Skip ahead to the <repertoire> element
 		while ( $reader->name !== 'repertoire' && $reader->next() );
 		while ( $reader->read() ) {
 			$type = $reader->nodeType;
 			$name = $reader->name;
 			if ( $type == XMLReader::ELEMENT ) {
 				if ( $name === 'group' ) {
 					// Attributes of a <group> apply to the <char>s inside it
 					$this->groupAttrs = $this->readAttributes();
 				} elseif ( $name === 'char' ) {
 					$this->handleChar();
 				}
 			} elseif ( $type === XMLReader::END_ELEMENT && $name === 'group' ) {
 				$this->groupAttrs = array();
 			}
 		}
 		$reader->close();
 	}
 	/**
 	 * Open the XML file and move the reader to the node following the
 	 * <ucd> element's start.
 	 *
 	 * @return XMLReader The reader, also stored in $this->xml
 	 * @throws MWException if the file cannot be opened
 	 */
 	protected function open() {
 		$this->xml = new XMLReader;
 		// Test the return value of open(): the XMLReader object itself is
 		// always truthy, so checking it would never detect a failure.
 		if ( !$this->xml->open( $this->fileName ) ) {
 			throw new MWException( __METHOD__.": unable to open {$this->fileName}" );
 		}
 		while ( $this->xml->name !== 'ucd' && $this->xml->read() );
 		$this->xml->read();
 		return $this->xml;
 	}
 	/**
 	 * Read the attributes of the current element node and return them 
 	 * as an array
 	 */
 	protected function readAttributes() {
 		$reader = $this->xml;
 		$collected = array();
 		// Step through the attributes, keyed by attribute name
 		while ( $reader->moveToNextAttribute() ) {
 			$collected[$reader->name] = $reader->value;
 		}
 		return $collected;
 	}
 	/**
 	 * Handle the current <char> element: expand it to one attribute array
 	 * per codepoint and pass each one to the callback.
 	 *
 	 * An element either has a 'cp' attribute, or 'first-cp' and 'last-cp'
 	 * describing an inclusive range. Each array passed to the callback
 	 * has 'cp' set to the hexadecimal codepoint, '#' in 'na'/'na1'
 	 * replaced by that codepoint, and 'block' set to the block name when
 	 * the codepoint lies inside a known block.
 	 */
 	protected function handleChar() {
 		// Attributes on the element itself win over the group's attributes.
 		// The cast avoids an "array + null" error when no group was seen.
 		$attrs = $this->readAttributes() + (array)$this->groupAttrs;
 		if ( isset( $attrs['cp'] ) ) {
 			$first = $last = hexdec( $attrs['cp'] );
 		} else {
 			$first = hexdec( $attrs['first-cp'] );
 			$last = hexdec( $attrs['last-cp'] );
 			unset( $attrs['first-cp'] );
 			unset( $attrs['last-cp'] );
 		}
 		for ( $cp = $first; $cp <= $last; $cp++ ) {
 			// Work on a copy per codepoint, so that the '#' placeholder and
 			// the block name of one codepoint do not leak into the next one
 			// of the same range.
 			$charAttrs = $attrs;
 			$hexCp = sprintf( "%04X", $cp );
 			foreach ( array( 'na', 'na1' ) as $nameProp ) {
 				if ( isset( $attrs[$nameProp] ) ) {
 					$charAttrs[$nameProp] = str_replace( '#', $hexCp, $attrs[$nameProp] );
 				}
 			}
 			// Advance through the block table until reaching the block that
 			// contains $cp, or one that starts after it. This relies on the
 			// internal array pointer of $this->blocks.
 			while ( $this->currentBlock ) {
 				if ( $cp < $this->currentBlock[0] ) {
 					break;
 				} elseif ( $cp <= $this->currentBlock[1] ) {
 					$charAttrs['block'] = key( $this->blocks );
 					break;
 				} else {
 					$this->currentBlock = next( $this->blocks );
 				}
 			}
 			$charAttrs['cp'] = $hexCp;
 			call_user_func( $this->callback, $charAttrs );
 		}
 	}
 	/**
 	 * Get the table of blocks from the XML file, reading it on the first
 	 * call and reusing the stored table afterwards.
 	 *
 	 * @return array Map of block name => array( first codepoint, last
 	 *     codepoint ), both as integers
 	 */
 	public function getBlocks() {
 		if ( $this->blocks ) {
 			return $this->blocks;
 		}
 		$reader = $this->open();
 		// Skip ahead to the <blocks> element
 		while ( $reader->name !== 'blocks' && $reader->read() );
 		while ( $reader->read() ) {
 			if ( $reader->nodeType != XMLReader::ELEMENT || $reader->name !== 'block' ) {
 				continue;
 			}
 			$attrs = $this->readAttributes();
 			$range = array( hexdec( $attrs['first-cp'] ), hexdec( $attrs['last-cp'] ) );
 			$this->blocks[$attrs['name']] = $range;
 		}
 		$reader->close();
 		return $this->blocks;
 	}
 }
 // Name the maintenance class defined in this file, then include the file
 // that the DO_MAINTENANCE constant points to.
 $maintClass = 'GenerateCollationData';
 require_once( DO_MAINTENANCE );
--- a/maintenance/tables.sql
+++ b/maintenance/tables.sql
@ -493,13 +493,13 @@ CREATE TABLE /*_*/categorylinks (
  cl_to varchar(255) binary NOT NULL default '',
  -- A binary string obtained by applying a sortkey generation algorithm
-  -- (Language::convertToSortkey()) to page_title, or cl_sortkey_prefix . "\0"
+  -- (Collation::getSortKey()) to page_title, or cl_sortkey_prefix . "\n"
  -- . page_title if cl_sortkey_prefix is nonempty.
  cl_sortkey varbinary(230) NOT NULL default '',
  -- A prefix for the raw sortkey manually specified by the user, either via
  -- [[Category:Foo|prefix]] or {{defaultsort:prefix}}.  If nonempty, it's
-  -- concatenated with a null followed by the page title before the sortkey
+  -- concatenated with a line break followed by the page title before the sortkey
  -- conversion algorithm is run.  We store this so that we can update
  -- collations without reparsing all pages.
  -- Note: If you change the length of this field, you also need to change
--- a/maintenance/updateCollation.php
+++ b/maintenance/updateCollation.php
@ -46,7 +46,7 @@ TEXT;
 	}
 	public function execute() {
-		global $wgCategoryCollation, $wgContLang;
+		global $wgCategoryCollation;
 		$dbw = wfGetDB( DB_MASTER );
 		$count = $dbw->selectField(
@ -105,7 +105,7 @@ TEXT;
 				$dbw->update(
 					'categorylinks',
 					array(
-						'cl_sortkey' => $wgContLang->convertToSortkey(
+						'cl_sortkey' => Collation::singleton()->getSortKey(
 							$title->getCategorySortkey( $prefix ) ),
 						'cl_sortkey_prefix' => $prefix,
 						'cl_collation' => $wgCategoryCollation,
--- a/serialized/first-letters-root.ser
+++ b/serialized/first-letters-root.ser