* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables.

* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters". * Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. * Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers. * Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work. * Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.
2011-01-17 14:02:22 +00:00 · 2011-01-17 14:02:22 +00:00 · eaeea84b44
commit eaeea84b44
parent 14d576cd86
11 changed files with 727 additions and 73 deletions
--- a/includes/AutoLoader.php
+++ b/includes/AutoLoader.php
@ -43,6 +43,7 @@ $wgAutoloadLocalClasses = array(
 	'ChangesFeed' => 'includes/ChangesFeed.php',
 	'ChangeTags' => 'includes/ChangeTags.php',
 	'ChannelFeed' => 'includes/Feed.php',
+	'Collation' => 'includes/Collation.php',
 	'Cookie' => 'includes/HttpFunctions.php',
 	'CookieJar' => 'includes/HttpFunctions.php',
 	'ConcatenatedGzipHistoryBlob' => 'includes/HistoryBlob.php',
@ -127,6 +128,7 @@ $wgAutoloadLocalClasses = array(
 	'HTMLInfoField' => 'includes/HTMLForm.php',
 	'Http' => 'includes/HttpFunctions.php',
 	'HttpRequest' => 'includes/HttpFunctions.old.php',
+	'IcuCollation' => 'includes/Collation.php',
 	'ImageGallery' => 'includes/ImageGallery.php',
 	'ImageHistoryList' => 'includes/ImagePage.php',
 	'ImageHistoryPseudoPager' => 'includes/ImagePage.php',
@ -243,6 +245,7 @@ $wgAutoloadLocalClasses = array(
 	'TitleListDependency' => 'includes/CacheDependency.php',
 	'Token' => 'includes/Token.php',
 	'UnlistedSpecialPage' => 'includes/SpecialPage.php',
+	'UppercaseCollation' => 'includes/Collation.php',
 	'User' => 'includes/User.php',
 	'UserArray' => 'includes/UserArray.php',
 	'UserArrayFromResult' => 'includes/UserArray.php',
--- a/includes/CategoryPage.php
+++ b/includes/CategoryPage.php
@ -90,7 +90,7 @@ class CategoryViewer {
 		$children, $children_start_char,
 		$showGallery, $gallery,
 		$imgsNoGalley, $imgsNoGallery_start_char,
-		$skin;
+		$skin, $collation;
 	# Category object for this page
 	private $cat;
 	# The original query array, to be used in generating paging links.
@ -104,6 +104,7 @@ class CategoryViewer {
 		$this->limit = $wgCategoryPagingLimit;
 		$this->cat = Category::newFromTitle( $title );
 		$this->query = $query;
+		$this->collation = Collation::singleton();
 		unset( $this->query['title'] );
 	}

@ -212,7 +213,7 @@ class CategoryViewer {
 			$word = $sortkey;
 		}

-		$firstChar = $wgContLang->firstLetterForLists( $word );
+		$firstChar = $this->collation->getFirstLetter( $word );

 		return $wgContLang->convert( $firstChar );
 	}
@ -241,7 +242,8 @@ class CategoryViewer {
 				) . '</span>'
 			: $this->getSkin()->link( $title );

-			$this->imgsNoGallery_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
+			$this->imgsNoGallery_start_char[] = $wgContLang->convert( 
+				$this->collation->getFirstLetter( $sortkey ) );
 		}
 	}

@ -261,7 +263,8 @@ class CategoryViewer {
 				) . '</span>'
 			: $this->getSkin()->link( $title );

-		$this->articles_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
+		$this->articles_start_char[] = $wgContLang->convert( 
+			$this->collation->getFirstLetter( $sortkey ) );
 	}

 	function finaliseCategoryState() {
@ -280,8 +283,6 @@ class CategoryViewer {
 	}

 	function doCategoryQuery() {
-		global $wgContLang;
-
 		$dbr = wfGetDB( DB_SLAVE, 'category' );

 		$this->nextPage = array(
@ -294,14 +295,14 @@ class CategoryViewer {
 		foreach ( array( 'page', 'subcat', 'file' ) as $type ) {
 			# Get the sortkeys for start/end, if applicable.  Note that if
 			# the collation in the database differs from the one
-			# $wgContLang is using, pagination might go totally haywire.
+			# set in $wgCategoryCollation, pagination might go totally haywire.
 			$extraConds = array( 'cl_type' => $type );
 			if ( $this->from[$type] !== null ) {
 				$extraConds[] = 'cl_sortkey >= '
-					. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->from[$type] ) );
+					. $dbr->addQuotes( $this->collation->getSortKey( $this->from[$type] ) );
 			} elseif ( $this->until[$type] !== null ) {
 				$extraConds[] = 'cl_sortkey < '
-					. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->until[$type] ) );
+					. $dbr->addQuotes( $this->collation->getSortKey( $this->until[$type] ) );
 				$this->flip[$type] = true;
 			}

--- a/includes/Collation.php
+++ b/includes/Collation.php
@ -0,0 +1,304 @@
+<?php
+
+/**
+ * Base class for category sorting schemes (collations).
+ *
+ * singleton() hands out the collation selected by $wgCategoryCollation and
+ * factory() maps a collation name to a concrete subclass.
+ */
+abstract class Collation {
+	static $instance;
+
+	/**
+	 * Get the shared collation object for $wgCategoryCollation, creating it
+	 * on first use.
+	 *
+	 * @return Collation
+	 */
+	static function singleton() {
+		if ( self::$instance ) {
+			return self::$instance;
+		}
+		global $wgCategoryCollation;
+		self::$instance = self::factory( $wgCategoryCollation );
+		return self::$instance;
+	}
+
+	/**
+	 * Create a collation object from its configured name.
+	 *
+	 * @param $collationName string Either 'uppercase' or 'uca-default'
+	 * @return Collation
+	 * @throws MWException if the name is not recognised
+	 */
+	static function factory( $collationName ) {
+		// Loose comparison on purpose: this matches what a switch statement does
+		if ( $collationName == 'uppercase' ) {
+			return new UppercaseCollation;
+		}
+		if ( $collationName == 'uca-default' ) {
+			return new IcuCollation( 'root' );
+		}
+		throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" );
+	}
+
+	/**
+	 * Convert a string to a (hopefully short) binary key for efficient
+	 * sorting: comparing two keys byte by byte gives the same order as a
+	 * logical comparison of the strings they were made from.  Current code
+	 * expects that a null character should sort before all others, but has
+	 * no other particular expectations (and that one can be changed if
+	 * necessary).
+	 *
+	 * @param string $string UTF-8 string
+	 * @return string Binary sortkey
+	 */
+	abstract function getSortKey( $string );
+
+	/**
+	 * Return the logical "first letter" of a string, used as a group heading
+	 * on category pages and so on.  The result has to be coordinated
+	 * carefully with getSortKey(), otherwise the sorted list might jump back
+	 * and forth between the same headings.  For instance, if this simply
+	 * returned the first character while "a" sorts the same as "A" according
+	 * to getSortKey(), you might get a list like
+	 *
+	 * == A ==
+	 * * [[Aardvark]]
+	 *
+	 * == a ==
+	 * * [[antelope]]
+	 *
+	 * == A ==
+	 * * [[Ape]]
+	 *
+	 * etc., assuming for the sake of argument that $wgCapitalLinks is false.
+	 *
+	 * @param string $string UTF-8 string
+	 * @return string UTF-8 string corresponding to the first letter of input
+	 */
+	abstract function getFirstLetter( $string );
+}
+
+/**
+ * Collation which sorts strings by their upper-cased form.
+ */
+class UppercaseCollation extends Collation {
+	/** Language object used for its UTF-8 aware case conversion functions */
+	var $lang;
+
+	function __construct() {
+		// Get a language object so that we can use the generic UTF-8 uppercase
+		// function there
+		$this->lang = Language::factory( 'en' );
+	}
+
+	/**
+	 * @param string $string UTF-8 string
+	 * @return string The upper-cased string, used directly as the sortkey
+	 */
+	function getSortKey( $string ) {
+		return $this->lang->uc( $string );
+	}
+
+	/**
+	 * Return the first character of the string, upper-cased. A single leading
+	 * null byte is skipped first.
+	 *
+	 * @param string $string UTF-8 string
+	 * @return string UTF-8 string corresponding to the first letter of input
+	 */
+	function getFirstLetter( $string ) {
+		// isset() keeps an empty string from raising an "uninitialized string
+		// offset" error when the first byte is inspected.
+		if ( isset( $string[0] ) && $string[0] == "\0" ) {
+			$string = substr( $string, 1 );
+		}
+		return $this->lang->ucfirst( $this->lang->firstChar( $string ) );
+	}
+}
+
+/**
+ * Collation which delegates to an ICU Collator from PHP's intl extension.
+ *
+ * Sortkeys come from a collator at its default strength. "First letters" are
+ * found by looking up the primary-strength sortkey of a string in a sorted
+ * table of candidate letters loaded from a precompiled data file.
+ */
+class IcuCollation extends Collation {
+	var $primaryCollator, $mainCollator, $locale;
+
+	/**
+	 * Lazily loaded array with two parallel lists: 'chars' (the letters) and
+	 * 'keys' (their primary sortkeys), both ordered by key.
+	 */
+	var $firstLetterData;
+
+	/**
+	 * Unified CJK blocks.
+	 *
+	 * The same definition of a CJK block must be used for both Collation and
+	 * generateCollationData.php. These blocks are omitted from the first
+	 * letter data, as an optimisation measure and because the default UCA table
+	 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
+	 * blocks are not included here, because they are smaller and more useful.
+	 */
+	static $cjkBlocks = array(
+		array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement
+		array( 0x2F00, 0x2FDF ), // Kangxi Radicals
+		array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters
+		array( 0x3000, 0x303F ), // CJK Symbols and Punctuation
+		array( 0x31C0, 0x31EF ), // CJK Strokes
+		array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months
+		array( 0x3300, 0x33FF ), // CJK Compatibility
+		array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A
+		array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs
+		array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs
+		array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms
+		array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B
+		array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C
+		array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D
+		array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement
+	);
+
+	const RECORD_LENGTH = 14;
+
+	/**
+	 * @param $locale string ICU locale name, passed to Collator::create()
+	 * @throws MWException if the intl extension is missing or the collator
+	 *     cannot be created for the locale
+	 */
+	function __construct( $locale ) {
+		if ( !extension_loaded( 'intl' ) ) {
+			throw new MWException( 'An ICU collation was requested, ' .
+				'but the intl extension is not available.' );
+		}
+		$this->locale = $locale;
+		$this->mainCollator = Collator::create( $locale );
+		if ( !$this->mainCollator ) {
+			throw new MWException( "Invalid ICU locale specified for collation: $locale" );
+		}
+
+		// Second collator for the same locale, restricted to primary strength
+		$this->primaryCollator = Collator::create( $locale );
+		$this->primaryCollator->setStrength( Collator::PRIMARY );
+	}
+
+	/**
+	 * @param string $string UTF-8 string
+	 * @return string Binary sortkey from the main collator
+	 */
+	function getSortKey( $string ) {
+		wfSuppressWarnings();
+		// Concatenating with '' forces the result to be a string
+		$key = $this->mainCollator->getSortKey( $string ) . '';
+		wfRestoreWarnings();
+		return $key;
+	}
+
+	/**
+	 * @param string $string UTF-8 string
+	 * @return string Binary sortkey from the primary-strength collator
+	 */
+	function getPrimarySortKey( $string ) {
+		wfSuppressWarnings();
+		// Concatenating with '' forces the result to be a string
+		$key = $this->primaryCollator->getSortKey( $string ) . '';
+		wfRestoreWarnings();
+		return $key;
+	}
+
+	/**
+	 * Find the letter under which the given string should be listed.
+	 *
+	 * A string starting with a character from self::$cjkBlocks is listed
+	 * under that character itself. Otherwise the letter is the entry of the
+	 * first-letter table with the largest primary sortkey that is less than
+	 * or equal to the primary sortkey of the string.
+	 *
+	 * @param string $string UTF-8 string
+	 * @return string UTF-8 string; empty if the input is empty or sorts
+	 *     before every known letter
+	 */
+	function getFirstLetter( $string ) {
+		$string = strval( $string );
+		if ( $string === '' ) {
+			return '';
+		}
+
+		// Check for CJK
+		$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
+		if ( ord( $firstChar ) > 0x7f
+			&& self::isCjk( utf8ToCodepoint( $firstChar ) ) )
+		{
+			return $firstChar;
+		}
+
+		$sortKey = $this->getPrimarySortKey( $string );
+
+		// Do a binary search to find the correct letter to sort under
+		$min = $this->findLowerBound(
+			array( $this, 'getSortKeyByLetterIndex' ),
+			$this->getFirstLetterCount(),
+			'strcmp',
+			$sortKey );
+
+		if ( $min === false ) {
+			// Before the first letter
+			return '';
+		}
+		return $this->getLetterByIndex( $min );
+	}
+
+	/**
+	 * Load the first-letter table, from this object, the cache, or the
+	 * precompiled data file, in that order of preference.
+	 *
+	 * @return array With keys 'chars' and 'keys', see $firstLetterData
+	 * @throws MWException if there is no data file for the locale
+	 */
+	function getFirstLetterData() {
+		if ( $this->firstLetterData !== null ) {
+			return $this->firstLetterData;
+		}
+
+		// NOTE(review): the cache key holds only the locale, while the sorted
+		// data depends on the ICU version (see below) -- confirm that a stale
+		// entry after an ICU upgrade is acceptable.
+		$cache = wfGetCache( CACHE_ANYTHING );
+		$cacheKey = wfMemcKey( 'first-letters', $this->locale );
+		$cacheEntry = $cache->get( $cacheKey );
+
+		if ( $cacheEntry ) {
+			$this->firstLetterData = $cacheEntry;
+			return $this->firstLetterData;
+		}
+
+		// Generate data from serialized data file
+
+		$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
+		if ( $letters === false ) {
+			throw new MWException( "MediaWiki does not support ICU locale " .
+				"\"{$this->locale}\"" );
+		}
+
+		// Sort the letters.
+		//
+		// It's impossible to have the precompiled data file properly sorted,
+		// because the sort order changes depending on ICU version. If the
+		// array is not properly sorted, the binary search will return random
+		// results.
+		//
+		// We also take this opportunity to remove primary collisions.
+		$letterMap = array();
+		foreach ( $letters as $letter ) {
+			$key = $this->getPrimarySortKey( $letter );
+			if ( isset( $letterMap[$key] ) ) {
+				// Primary collision
+				// Keep whichever one sorts first in the main collator
+				if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
+					$letterMap[$key] = $letter;
+				}
+			} else {
+				$letterMap[$key] = $letter;
+			}
+		}
+		ksort( $letterMap, SORT_STRING );
+		$data = array(
+			'chars' => array_values( $letterMap ),
+			'keys' => array_keys( $letterMap )
+		);
+
+		// Reduce memory usage before caching
+		unset( $letterMap );
+
+		// Save to cache
+		$this->firstLetterData = $data;
+		$cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ );
+		return $data;
+	}
+
+	/**
+	 * @param $index int Position in the sorted first-letter table
+	 * @return string The letter at that position
+	 */
+	function getLetterByIndex( $index ) {
+		if ( $this->firstLetterData === null ) {
+			$this->getFirstLetterData();
+		}
+		return $this->firstLetterData['chars'][$index];
+	}
+
+	/**
+	 * @param $index int Position in the sorted first-letter table
+	 * @return string The primary sortkey of the letter at that position
+	 */
+	function getSortKeyByLetterIndex( $index ) {
+		if ( $this->firstLetterData === null ) {
+			$this->getFirstLetterData();
+		}
+		return $this->firstLetterData['keys'][$index];
+	}
+
+	/**
+	 * @return int Number of entries in the first-letter table
+	 */
+	function getFirstLetterCount() {
+		if ( $this->firstLetterData === null ) {
+			$this->getFirstLetterData();
+		}
+		return count( $this->firstLetterData['chars'] );
+	}
+
+	/**
+	 * Do a binary search, and return the index of the largest item that sorts
+	 * less than or equal to the target value.
+	 *
+	 * @param $valueCallback A function to call to get the value with
+	 *     a given array index.
+	 * @param $valueCount The number of items accessible via $valueCallback,
+	 *     indexed from 0 to $valueCount - 1
+	 * @param $comparisonCallback A callback to compare two values, returning
+	 *     -1, 0 or 1 in the style of strcmp().
+	 * @param $target The target value to find.
+	 *
+	 * @return The item index of the lower bound, or false if the target value
+	 *     sorts before all items (which includes the case of having no items).
+	 */
+	function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) {
+		if ( $valueCount <= 0 ) {
+			// Nothing to search, so there is no lower bound
+			return false;
+		}
+
+		// Search the half-open interval [$min, $max). $max must start one past
+		// the last index, otherwise the last item could never be returned.
+		$min = 0;
+		$max = $valueCount;
+		do {
+			$mid = $min + ( ( $max - $min ) >> 1 );
+			$item = call_user_func( $valueCallback, $mid );
+			$comparison = call_user_func( $comparisonCallback, $target, $item );
+			if ( $comparison > 0 ) {
+				$min = $mid;
+			} elseif ( $comparison == 0 ) {
+				$min = $mid;
+				break;
+			} else {
+				$max = $mid;
+			}
+		} while ( $min < $max - 1 );
+
+		if ( $min == 0 ) {
+			// Item 0 may never have been compared, or the last comparison may
+			// have been against another item, so test it explicitly.
+			$item = call_user_func( $valueCallback, $min );
+			$comparison = call_user_func( $comparisonCallback, $target, $item );
+			if ( $comparison < 0 ) {
+				// Before the first item
+				return false;
+			}
+		}
+		return $min;
+	}
+
+	/**
+	 * Test whether a codepoint lies in one of the ranges of self::$cjkBlocks.
+	 *
+	 * @param $codepoint int Unicode codepoint
+	 * @return bool
+	 */
+	static function isCjk( $codepoint ) {
+		foreach ( self::$cjkBlocks as $block ) {
+			if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
+				return true;
+			}
+		}
+		return false;
+	}
+}
+
--- a/includes/DefaultSettings.php
+++ b/includes/DefaultSettings.php
@ -4611,15 +4611,26 @@ $wgCategoryMagicGallery = true;
 $wgCategoryPagingLimit = 200;

 /**
- * A version indicator for collations that will be stored in cl_collation for
- * all new rows.  Used when the collation algorithm changes: a script checks
- * for all rows where cl_collation != $wgCategoryCollation and regenerates
- * cl_sortkey based on the page name and cl_sortkey_prefix.
+ * Specify how category names should be sorted, when listed on a category page. 
+ * A sorting scheme is also known as a collation.
 *
- * Currently only supports 'uppercase2', which just uppercases the string.  This
- * is a dummy collation, to be replaced later by real ones.
+ * Available values are:
+ *
+ *   - uppercase: Converts the category name to upper case, and sorts by that.
+ *
+ *   - uca-default: Provides access to the Unicode Collation Algorithm with 
+ *     the default element table. This is a compromise collation which sorts
+ *     all languages in a mediocre way. However, it is better than "uppercase".
+ *
+ * To use the uca-default collation, you must have PHP's intl extension 
+ * installed. See http://php.net/manual/en/intl.setup.php . The details of the 
+ * resulting collation will depend on the version of ICU installed on the 
+ * server.
+ *
+ * After you change this, you must run maintenance/updateCollation.php to fix
+ * the sort keys in the database. 
 */
-$wgCategoryCollation = 'uppercase2';
+$wgCategoryCollation = 'uppercase';

 /** @} */ # End categories }

--- a/includes/LinksUpdate.php
+++ b/includes/LinksUpdate.php
@ -454,14 +454,14 @@ class LinksUpdate {
 			# (Title::moveTo() has had the same issue for a long time).
 			if ( $this->mTitle->getCategorySortkey() == $sortkey ) {
 				$prefix = '';
-				$sortkey = $wgContLang->convertToSortkey( $sortkey );
+				$sortkey = Collation::singleton()->getSortKey( $sortkey );
 			} else {
 				# Treat custom sortkeys as a prefix, so that if multiple
 				# things are forced to sort as '*' or something, they'll
 				# sort properly in the category rather than in page_id
 				# order or such.
 				$prefix = $sortkey;
-				$sortkey = $wgContLang->convertToSortkey(
+				$sortkey = Collation::singleton()->getSortKey(
 					$this->mTitle->getCategorySortkey( $prefix ) );
 			}

--- a/includes/Title.php
+++ b/includes/Title.php
@ -3088,8 +3088,6 @@ class Title {
 	 * @return Mixed true on success, getUserPermissionsErrors()-like array on failure
 	 */
 	public function moveTo( &$nt, $auth = true, $reason = '', $createRedirect = true ) {
-		global $wgContLang;
-
 		$err = $this->isValidMoveOperation( $nt, $auth, $reason );
 		if ( is_array( $err ) ) {
 			return $err;
@ -3129,7 +3127,8 @@ class Title {
 		);
 		$dbw->update( 'categorylinks',
 			array(
-				'cl_sortkey' => $wgContLang->convertToSortkey( $nt->getCategorySortkey( $prefix ) ),
+				'cl_sortkey' => Collation::singleton()->getSortKey( 
+					$nt->getCategorySortkey( $prefix ) ),
 				'cl_timestamp=cl_timestamp' ),
 			array( 'cl_from' => $pageid ),
 			__METHOD__ );
@ -4139,7 +4138,7 @@ class Title {

 	/**
 	 * Returns the raw sort key to be used for categories, with the specified
-	 * prefix.  This will be fed to Language::convertToSortkey() to get a
+	 * prefix.  This will be fed to Collation::getSortKey() to get a
 	 * binary sortkey that can be used for actual sorting.
 	 *
 	 * @param $prefix string The prefix to be used, specified using
@ -4153,7 +4152,7 @@ class Title {
 			# Separate with a null byte, so the unprefixed part is only used as
 			# a tiebreaker when two pages have the exact same prefix -- null
 			# sorts before everything else (hopefully).
-			return "$prefix\0$unprefixed";
+			return "$prefix\n$unprefixed";
 		}
 		return $unprefixed;
 	}
--- a/languages/Language.php
+++ b/languages/Language.php
@ -2996,50 +2996,4 @@ class Language {
 	function getConvRuleTitle() {
 		return $this->mConverter->getConvRuleTitle();
 	}
-
-	/**
-	 * Given a string, convert it to a (hopefully short) key that can be used
-	 * for efficient sorting.  A binary sort according to the sortkeys
-	 * corresponds to a logical sort of the corresponding strings.  Current
-	 * code expects that a null character should sort before all others, but
-	 * has no other particular expectations (and that one can be changed if
-	 * necessary).
-	 *
-	 * @param string $string UTF-8 string
-	 * @return string Binary sortkey
-	 */
-	public function convertToSortkey( $string ) {
-		# Fake function for now
-		return $this->uc( $string );
-	}
-
-	/**
-	 * Given a string, return the logical "first letter" to be used for
-	 * grouping on category pages and so on.  This has to be coordinated
-	 * carefully with convertToSortkey(), or else the sorted list might jump
-	 * back and forth between the same "initial letters" or other pathological
-	 * behavior.  For instance, if you just return the first character, but "a"
-	 * sorts the same as "A" based on convertToSortkey(), then you might get a
-	 * list like
-	 *
-	 * == A ==
-	 * * [[Aardvark]]
-	 *
-	 * == a ==
-	 * * [[antelope]]
-	 *
-	 * == A ==
-	 * * [[Ape]]
-	 *
-	 * etc., assuming for the sake of argument that $wgCapitalLinks is false.
-	 *
-	 * @param string $string UTF-8 string
-	 * @return string UTF-8 string corresponding to the first letter of input
-	 */
-	public function firstLetterForLists( $string ) {
-		if ( $string[0] == "\0" ) {
-			$string = substr( $string, 1 );
-		}
-		return $this->uc( $this->firstChar( $string ) );
-	}
 }
--- a/maintenance/language/generateCollationData.php
+++ b/maintenance/language/generateCollationData.php
@ -0,0 +1,381 @@
+<?php
+
+require_once( dirname( __FILE__ ) .'/../Maintenance.php' );
+
+/**
+ * Generate first letter data files for Collation.php
+ */
+class GenerateCollationData extends Maintenance {
+	/** The directory with source data files in it */
+	var $dataDir;
+
+	/** The primary weights, indexed by codepoint */
+	var $weights = array();
+
+	/**
+	 * A hashtable keyed by codepoint, where presence indicates that a character
+	 * has a decomposition mapping. This makes it non-preferred for group header
+	 * selection.
+	 */
+	var $mappedChars = array();
+
+	/** Lists of codepoints sharing a primary weight, keyed by that weight */
+	var $groups = array();
+
+	/** File handle for debug output, if --debug-output was given */
+	var $debugOutFile;
+
+	/**
+	 * Important tertiary weights from UTS #10 section 7.2
+	 */
+	const NORMAL_UPPERCASE = 0x08;
+	const NORMAL_HIRAGANA = 0X0E;
+
+	public function __construct() {
+		parent::__construct();
+		$this->addOption( 'data-dir', 'A directory on the local filesystem ' .
+			'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
+			false, true );
+		$this->addOption( 'debug-output', 'Filename for sending debug output to',
+			false, true );
+	}
+
+	/**
+	 * Check that the input files exist, open the optional debug output file,
+	 * then load the UCD and write the first-letter data file.
+	 */
+	public function execute() {
+		$this->dataDir = $this->getOption( 'data-dir', '.' );
+		if ( !file_exists( "{$this->dataDir}/allkeys.txt" ) ) {
+			$this->error( "Unable to find allkeys.txt. Please download it from " .
+				"http://www.unicode.org/Public/UCA/latest/allkeys.txt and specify " .
+				"its location with --data-dir=<DIR>" );
+			exit( 1 );
+		}
+		if ( !file_exists( "{$this->dataDir}/ucd.all.grouped.xml" ) ) {
+			$this->error( "Unable to find ucd.all.grouped.xml. Please download it " .
+				"from http://www.unicode.org/Public/6.0.0/ucdxml/ucd.all.grouped.zip " .
+				"and specify its location with --data-dir=<DIR>" );
+			exit( 1 );
+		}
+		$debugOutFileName = $this->getOption( 'debug-output' );
+		if ( $debugOutFileName ) {
+			$this->debugOutFile = fopen( $debugOutFileName, 'w' );
+			if ( !$this->debugOutFile ) {
+				$this->error( "Unable to open debug output file for writing" );
+				exit( 1 );
+			}
+		}
+		$this->loadUcd();
+		$this->generateFirstChars();
+
+		// Flush and release the debug output file
+		if ( $this->debugOutFile ) {
+			fclose( $this->debugOutFile );
+			$this->debugOutFile = null;
+		}
+	}
+
+	/**
+	 * Read ucd.all.grouped.xml, passing each character to charCallback().
+	 */
+	function loadUcd() {
+		$uxr = new UcdXmlReader( "{$this->dataDir}/ucd.all.grouped.xml" );
+		$uxr->readChars( array( $this, 'charCallback' ) );
+	}
+
+	/**
+	 * Record the implicit primary weight of one character in $this->weights,
+	 * and note in $this->mappedChars whether it has a decomposition mapping.
+	 * Characters which are filtered out here get no entry in $this->weights
+	 * and are thereby ignored by generateFirstChars().
+	 *
+	 * @param $data array Character attributes as delivered by UcdXmlReader.
+	 *     The keys 'gc', 'cp', 'block', 'UIdeo' and 'dm' are read.
+	 */
+	function charCallback( $data ) {
+		// Skip non-printable characters
+		$category = substr( $data['gc'], 0, 1 );
+		if ( strpos( 'LNPS', $category ) === false ) {
+			return;
+		}
+		$cp = hexdec( $data['cp'] );
+
+		// Skip the CJK ideograph blocks, as an optimisation measure.
+		// UCA doesn't sort them properly anyway, without tailoring.
+		if ( IcuCollation::isCjk( $cp ) ) {
+			return;
+		}
+
+		// Skip the composed Hangul syllables, we will use the bare Jamo
+		// as first letters
+		if ( $data['block'] == 'Hangul Syllables' ) {
+			return;
+		}
+
+		// Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
+		if ( $data['UIdeo'] === 'Y' ) {
+			if ( $data['block'] == 'CJK Unified Ideographs'
+				|| $data['block'] == 'CJK Compatibility Ideographs' )
+			{
+				$base = 0xFB40;
+			} else {
+				$base = 0xFB80;
+			}
+		} else {
+			$base = 0xFBC0;
+		}
+		$a = $base + ( $cp >> 15 );
+		$b = ( $cp & 0x7fff ) | 0x8000;
+
+		$this->weights[$cp] = sprintf( ".%04X.%04X", $a, $b );
+
+		if ( $data['dm'] !== '#' ) {
+			$this->mappedChars[$cp] = true;
+		}
+
+		// Progress indicator
+		if ( $cp % 4096 == 0 ) {
+			print "{$data['cp']}\n";
+		}
+	}
+
+	/**
+	 * Combine the weights from allkeys.txt with the implicit weights already
+	 * in $this->weights, group the characters by primary weight, pick one
+	 * header character per group, and write the serialized list of header
+	 * characters to serialized/first-letters-root.ser.
+	 */
+	function generateFirstChars() {
+		$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
+		if ( !$file ) {
+			$this->error( "Unable to open allkeys.txt" );
+			exit( 1 );
+		}
+		global $IP;
+		$outFile = fopen( "$IP/serialized/first-letters-root.ser", 'w' );
+		if ( !$outFile ) {
+			$this->error( "Unable to open output file first-letters-root.ser" );
+			exit( 1 );
+		}
+
+		$goodTertiaryChars = array();
+
+		// For each character with an entry in allkeys.txt, overwrite the implicit
+		// entry in $this->weights that came from the UCD.
+		// Also gather a list of tertiary weights, for use in selecting the group header
+		while ( false !== ( $line = fgets( $file ) ) ) {
+			// We're only interested in single-character weights, pick them out with a regex
+			$line = trim( $line );
+			if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
+				continue;
+			}
+
+			$cp = hexdec( $m[1] );
+			$allWeights = trim( $m[2] );
+			$primary = '';
+			$tertiary = '';
+
+			if ( !isset( $this->weights[$cp] ) ) {
+				// Non-printable, ignore
+				continue;
+			}
+			// Each "[...]" element contributes its non-zero first (primary)
+			// and third (tertiary) weight to the respective sequence
+			foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
+				preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $m );
+				if ( !empty( $m[1] ) ) {
+					if ( $m[1][0] !== '0000' ) {
+						$primary .= '.' . $m[1][0];
+					}
+					if ( $m[1][2] !== '0000' ) {
+						$tertiary .= '.' . $m[1][2];
+					}
+				}
+			}
+			$this->weights[$cp] = $primary;
+			// Compare with the NORMAL_UPPERCASE and NORMAL_HIRAGANA constants
+			if ( $tertiary === '.0008'
+				|| $tertiary === '.000E' )
+			{
+				$goodTertiaryChars[$cp] = true;
+			}
+		}
+		fclose( $file );
+
+		// Identify groups of characters with the same primary weight
+		$this->groups = array();
+		asort( $this->weights, SORT_STRING );
+		$prevWeight = reset( $this->weights );
+		$group = array();
+		foreach ( $this->weights as $cp => $weight ) {
+			if ( $weight !== $prevWeight ) {
+				$this->groups[$prevWeight] = $group;
+				$prevWeight = $weight;
+				if ( isset( $this->groups[$weight] ) ) {
+					$group = $this->groups[$weight];
+				} else {
+					$group = array();
+				}
+			}
+			$group[] = $cp;
+		}
+		if ( $group ) {
+			$this->groups[$prevWeight] = $group;
+		}
+
+		// If one character has a given primary weight sequence, and a second
+		// character has a longer primary weight sequence with an initial
+		// portion equal to the first character, then remove the second
+		// character. This avoids having characters like U+A732 (double A)
+		// polluting the basic latin sort area.
+		foreach ( $this->groups as $weight => $group ) {
+			if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
+				if ( isset( $this->groups[$m[1]] ) ) {
+					unset( $this->groups[$weight] );
+				}
+			}
+		}
+
+		ksort( $this->groups, SORT_STRING );
+
+		// Identify the header character in each group
+		$headerChars = array();
+		$prevChar = "\000";
+		$tertiaryCollator = new Collator( 'root' );
+		$primaryCollator = new Collator( 'root' );
+		$primaryCollator->setStrength( Collator::PRIMARY );
+		$numOutOfOrder = 0;
+		foreach ( $this->groups as $weight => $group ) {
+			$uncomposedChars = array();
+			$goodChars = array();
+			foreach ( $group as $cp ) {
+				if ( isset( $goodTertiaryChars[$cp] ) ) {
+					$goodChars[] = $cp;
+				}
+				if ( !isset( $this->mappedChars[$cp] ) ) {
+					$uncomposedChars[] = $cp;
+				}
+			}
+			// Preference order: good tertiary weight and no decomposition,
+			// then no decomposition, then anything in the group
+			$x = array_intersect( $goodChars, $uncomposedChars );
+			if ( !$x ) {
+				$x = $uncomposedChars;
+				if ( !$x ) {
+					$x = $group;
+				}
+			}
+
+			// Use ICU to pick the lowest sorting character in the selection
+			// NOTE(review): $x holds integer codepoints rather than UTF-8
+			// strings at this point -- confirm that the collator orders them
+			// as intended.
+			$tertiaryCollator->sort( $x );
+			$cp = $x[0];
+
+			$char = codepointToUtf8( $cp );
+			$headerChars[] = $char;
+			if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
+				$numOutOfOrder ++;
+				/*
+				printf( "Out of order: U+%05X > U+%05X\n",
+					utf8ToCodepoint( $prevChar ),
+					utf8ToCodepoint( $char ) );
+				 */
+			}
+			$prevChar = $char;
+
+			if ( $this->debugOutFile ) {
+				fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
+					implode( ' ', array_map( 'codepointToUtf8', $group ) ) ) );
+			}
+		}
+
+		print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";
+
+		fwrite( $outFile, serialize( $headerChars ) );
+		// Close explicitly so the data is flushed and the handle released
+		fclose( $outFile );
+	}
+}
+
+/**
+ * Streaming reader for the Unicode Character Database in its grouped XML
+ * form (ucd.all.grouped.xml). Each character found is passed to a callback
+ * as an array of attributes.
+ */
+class UcdXmlReader {
+	var $fileName;
+	var $callback;
+	/** Attributes of the <group> element currently being read */
+	var $groupAttrs;
+	var $xml;
+	/** Map of block name => array( first codepoint, last codepoint ) */
+	var $blocks = array();
+	/** The entry of $blocks which the internal array pointer is at, or false */
+	var $currentBlock;
+
+	function __construct( $fileName ) {
+		$this->fileName = $fileName;
+	}
+
+	/**
+	 * Read all characters below the <repertoire> element and call $callback
+	 * once per codepoint with an array of its attributes. Attributes of the
+	 * character override those of the enclosing group. 'cp' is always set,
+	 * and 'block' is added when the codepoint lies in a known block.
+	 *
+	 * @param $callback callback
+	 */
+	public function readChars( $callback ) {
+		$this->getBlocks();
+		$this->currentBlock = reset( $this->blocks );
+		$xml = $this->open();
+		$this->callback = $callback;
+
+		while ( $xml->name !== 'repertoire' && $xml->next() );
+
+		while ( $xml->read() ) {
+			if ( $xml->nodeType == XMLReader::ELEMENT ) {
+				if ( $xml->name === 'group' ) {
+					$this->groupAttrs = $this->readAttributes();
+				} elseif ( $xml->name === 'char' ) {
+					$this->handleChar();
+				}
+			} elseif ( $xml->nodeType === XMLReader::END_ELEMENT ) {
+				if ( $xml->name === 'group' ) {
+					$this->groupAttrs = array();
+				}
+			}
+		}
+		$xml->close();
+	}
+
+	/**
+	 * Open the file and move to the first node inside the <ucd> element.
+	 *
+	 * @return XMLReader
+	 * @throws MWException if the file cannot be opened
+	 */
+	protected function open() {
+		$this->xml = new XMLReader;
+		// The object itself is always truthy, so it is the return value of
+		// open() which has to be tested to detect a failure.
+		if ( !$this->xml->open( $this->fileName ) ) {
+			throw new MWException( __METHOD__.": unable to open {$this->fileName}" );
+		}
+		while ( $this->xml->name !== 'ucd' && $this->xml->read() );
+		$this->xml->read();
+		return $this->xml;
+	}
+
+	/**
+	 * Read the attributes of the current element node and return them
+	 * as an array
+	 */
+	protected function readAttributes() {
+		$attrs = array();
+		while ( $this->xml->moveToNextAttribute() ) {
+			$attrs[$this->xml->name] = $this->xml->value;
+		}
+		return $attrs;
+	}
+
+	/**
+	 * Handle a <char> element, which describes either a single codepoint
+	 * ('cp') or a range ('first-cp' to 'last-cp'), by invoking the callback
+	 * once for every codepoint it covers.
+	 */
+	protected function handleChar() {
+		$attrs = $this->readAttributes() + $this->groupAttrs;
+		if ( isset( $attrs['cp'] ) ) {
+			$first = $last = hexdec( $attrs['cp'] );
+		} else {
+			$first = hexdec( $attrs['first-cp'] );
+			$last = hexdec( $attrs['last-cp'] );
+			unset( $attrs['first-cp'] );
+			unset( $attrs['last-cp'] );
+		}
+
+		for ( $cp = $first; $cp <= $last; $cp++ ) {
+			// Work on a copy, so that the '#' placeholder in the names and
+			// the block are resolved afresh for every codepoint of a range,
+			// instead of being inherited from the first one.
+			$charAttrs = $attrs;
+			$hexCp = sprintf( "%04X", $cp );
+			foreach ( array( 'na', 'na1' ) as $nameProp ) {
+				if ( isset( $charAttrs[$nameProp] ) ) {
+					$charAttrs[$nameProp] = str_replace( '#', $hexCp, $charAttrs[$nameProp] );
+				}
+			}
+
+			// Advance through the block list until reaching the block which
+			// contains this codepoint, or one which lies beyond it. This
+			// relies on codepoints being delivered in ascending order.
+			while ( $this->currentBlock ) {
+				if ( $cp < $this->currentBlock[0] ) {
+					break;
+				} elseif ( $cp <= $this->currentBlock[1] ) {
+					$charAttrs['block'] = key( $this->blocks );
+					break;
+				} else {
+					$this->currentBlock = next( $this->blocks );
+				}
+			}
+
+			$charAttrs['cp'] = $hexCp;
+			call_user_func( $this->callback, $charAttrs );
+		}
+	}
+
+	/**
+	 * Get the block list from the <blocks> element of the file. The result
+	 * is kept in $this->blocks, so the file is parsed for it only once.
+	 *
+	 * @return array Map of block name => array( first codepoint, last codepoint )
+	 */
+	public function getBlocks() {
+		if ( $this->blocks ) {
+			return $this->blocks;
+		}
+
+		$xml = $this->open();
+		while ( $xml->name !== 'blocks' && $xml->read() );
+
+		while ( $xml->read() ) {
+			if ( $xml->nodeType == XMLReader::ELEMENT ) {
+				if ( $xml->name === 'block' ) {
+					$attrs = $this->readAttributes();
+					$first = hexdec( $attrs['first-cp'] );
+					$last = hexdec( $attrs['last-cp'] );
+					$this->blocks[$attrs['name']] = array( $first, $last );
+				}
+			} elseif ( $xml->nodeType == XMLReader::END_ELEMENT
+				&& $xml->name === 'blocks' )
+			{
+				// All blocks seen, no need to parse the rest of the file
+				break;
+			}
+		}
+		$xml->close();
+		return $this->blocks;
+	}
+
+}
+
+$maintClass = 'GenerateCollationData';
+require_once( DO_MAINTENANCE );
+
--- a/maintenance/tables.sql
+++ b/maintenance/tables.sql
@ -493,13 +493,13 @@ CREATE TABLE /*_*/categorylinks (
  cl_to varchar(255) binary NOT NULL default '',

  -- A binary string obtained by applying a sortkey generation algorithm
-  -- (Language::convertToSortkey()) to page_title, or cl_sortkey_prefix . "\0"
+  -- (Collation::getSortKey()) to page_title, or cl_sortkey_prefix . "\n"
  -- . page_title if cl_sortkey_prefix is nonempty.
  cl_sortkey varbinary(230) NOT NULL default '',

  -- A prefix for the raw sortkey manually specified by the user, either via
  -- [[Category:Foo|prefix]] or {{defaultsort:prefix}}.  If nonempty, it's
-  -- concatenated with a null followed by the page title before the sortkey
+  -- concatenated with a line break followed by the page title before the sortkey
  -- conversion algorithm is run.  We store this so that we can update
  -- collations without reparsing all pages.
  -- Note: If you change the length of this field, you also need to change
--- a/maintenance/updateCollation.php
+++ b/maintenance/updateCollation.php
@ -46,7 +46,7 @@ TEXT;
 	}

 	public function execute() {
-		global $wgCategoryCollation, $wgContLang;
+		global $wgCategoryCollation;

 		$dbw = wfGetDB( DB_MASTER );
 		$count = $dbw->selectField(
@ -105,7 +105,7 @@ TEXT;
 				$dbw->update(
 					'categorylinks',
 					array(
-						'cl_sortkey' => $wgContLang->convertToSortkey(
+						'cl_sortkey' => Collation::singleton()->getSortKey(
 							$title->getCategorySortkey( $prefix ) ),
 						'cl_sortkey_prefix' => $prefix,
 						'cl_collation' => $wgCategoryCollation,
--- a/serialized/first-letters-root.ser
+++ b/serialized/first-letters-root.ser