* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables.

* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. 
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b", so keeping the null separator would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.
This commit is contained in:
Tim Starling 2011-01-17 14:02:22 +00:00
parent 14d576cd86
commit eaeea84b44
11 changed files with 727 additions and 73 deletions

View file

@ -43,6 +43,7 @@ $wgAutoloadLocalClasses = array(
'ChangesFeed' => 'includes/ChangesFeed.php',
'ChangeTags' => 'includes/ChangeTags.php',
'ChannelFeed' => 'includes/Feed.php',
'Collation' => 'includes/Collation.php',
'Cookie' => 'includes/HttpFunctions.php',
'CookieJar' => 'includes/HttpFunctions.php',
'ConcatenatedGzipHistoryBlob' => 'includes/HistoryBlob.php',
@ -127,6 +128,7 @@ $wgAutoloadLocalClasses = array(
'HTMLInfoField' => 'includes/HTMLForm.php',
'Http' => 'includes/HttpFunctions.php',
'HttpRequest' => 'includes/HttpFunctions.old.php',
'IcuCollation' => 'includes/Collation.php',
'ImageGallery' => 'includes/ImageGallery.php',
'ImageHistoryList' => 'includes/ImagePage.php',
'ImageHistoryPseudoPager' => 'includes/ImagePage.php',
@ -243,6 +245,7 @@ $wgAutoloadLocalClasses = array(
'TitleListDependency' => 'includes/CacheDependency.php',
'Token' => 'includes/Token.php',
'UnlistedSpecialPage' => 'includes/SpecialPage.php',
'UppercaseCollation' => 'includes/Collation.php',
'User' => 'includes/User.php',
'UserArray' => 'includes/UserArray.php',
'UserArrayFromResult' => 'includes/UserArray.php',

View file

@ -90,7 +90,7 @@ class CategoryViewer {
$children, $children_start_char,
$showGallery, $gallery,
$imgsNoGalley, $imgsNoGallery_start_char,
$skin;
$skin, $collation;
# Category object for this page
private $cat;
# The original query array, to be used in generating paging links.
@ -104,6 +104,7 @@ class CategoryViewer {
$this->limit = $wgCategoryPagingLimit;
$this->cat = Category::newFromTitle( $title );
$this->query = $query;
$this->collation = Collation::singleton();
unset( $this->query['title'] );
}
@ -212,7 +213,7 @@ class CategoryViewer {
$word = $sortkey;
}
$firstChar = $wgContLang->firstLetterForLists( $word );
$firstChar = $this->collation->getFirstLetter( $word );
return $wgContLang->convert( $firstChar );
}
@ -241,7 +242,8 @@ class CategoryViewer {
) . '</span>'
: $this->getSkin()->link( $title );
$this->imgsNoGallery_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
$this->imgsNoGallery_start_char[] = $wgContLang->convert(
$this->collation->getFirstLetter( $sortkey ) );
}
}
@ -261,7 +263,8 @@ class CategoryViewer {
) . '</span>'
: $this->getSkin()->link( $title );
$this->articles_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
$this->articles_start_char[] = $wgContLang->convert(
$this->collation->getFirstLetter( $sortkey ) );
}
function finaliseCategoryState() {
@ -280,8 +283,6 @@ class CategoryViewer {
}
function doCategoryQuery() {
global $wgContLang;
$dbr = wfGetDB( DB_SLAVE, 'category' );
$this->nextPage = array(
@ -294,14 +295,14 @@ class CategoryViewer {
foreach ( array( 'page', 'subcat', 'file' ) as $type ) {
# Get the sortkeys for start/end, if applicable. Note that if
# the collation in the database differs from the one
# $wgContLang is using, pagination might go totally haywire.
# set in $wgCategoryCollation, pagination might go totally haywire.
$extraConds = array( 'cl_type' => $type );
if ( $this->from[$type] !== null ) {
$extraConds[] = 'cl_sortkey >= '
. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->from[$type] ) );
. $dbr->addQuotes( $this->collation->getSortKey( $this->from[$type] ) );
} elseif ( $this->until[$type] !== null ) {
$extraConds[] = 'cl_sortkey < '
. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->until[$type] ) );
. $dbr->addQuotes( $this->collation->getSortKey( $this->until[$type] ) );
$this->flip[$type] = true;
}

304
includes/Collation.php Normal file
View file

@ -0,0 +1,304 @@
<?php
abstract class Collation {
	/** Lazily created shared instance, see singleton() */
	static $instance;

	/**
	 * Get the shared collation object for the wiki, creating it from
	 * $wgCategoryCollation on first use.
	 *
	 * @return Collation
	 */
	static function singleton() {
		if ( self::$instance ) {
			return self::$instance;
		}
		global $wgCategoryCollation;
		self::$instance = self::factory( $wgCategoryCollation );
		return self::$instance;
	}

	/**
	 * Construct a collation object from its configuration name.
	 *
	 * @param $collationName string Either 'uppercase' or 'uca-default'
	 * @return Collation
	 * @throws MWException if the name is not recognised
	 */
	static function factory( $collationName ) {
		// Loose comparison on purpose, matching the semantics of switch()
		if ( $collationName == 'uppercase' ) {
			return new UppercaseCollation;
		}
		if ( $collationName == 'uca-default' ) {
			return new IcuCollation( 'root' );
		}
		throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" );
	}

	/**
	 * Convert a string to a (hopefully short) key suitable for efficient
	 * sorting: comparing two sortkeys byte by byte must give the same
	 * order as a logical comparison of the original strings. Current code
	 * expects that a null character should sort before all others, but
	 * has no other particular expectations (and that one can be changed if
	 * necessary).
	 *
	 * @param string $string UTF-8 string
	 * @return string Binary sortkey
	 */
	abstract function getSortKey( $string );

	/**
	 * Return the logical "first letter" of a string, used as the group
	 * heading on category pages and so on. The result has to be
	 * coordinated carefully with getSortKey(); otherwise the sorted list
	 * might jump back and forth between the same "initial letters" or show
	 * other pathological behavior. For instance, if this simply returned
	 * the first character while "a" sorts the same as "A" according to
	 * getSortKey(), the list could look like
	 *
	 * == A ==
	 * * [[Aardvark]]
	 *
	 * == a ==
	 * * [[antelope]]
	 *
	 * == A ==
	 * * [[Ape]]
	 *
	 * etc., assuming for the sake of argument that $wgCapitalLinks is false.
	 *
	 * @param string $string UTF-8 string
	 * @return string UTF-8 string corresponding to the first letter of input
	 */
	abstract function getFirstLetter( $string );
}
/**
 * Simple collation which sorts by the uppercased form of the string.
 */
class UppercaseCollation extends Collation {
	/** Language object used only for its UTF-8 aware case functions */
	var $lang;

	function __construct() {
		// Get a language object so that we can use the generic UTF-8 uppercase
		// function there
		$this->lang = Language::factory( 'en' );
	}

	/**
	 * @param string $string UTF-8 string
	 * @return string The uppercased string, used directly as the sortkey
	 */
	function getSortKey( $string ) {
		return $this->lang->uc( $string );
	}

	/**
	 * Return the uppercased first character of the string, skipping one
	 * leading null character if present.
	 *
	 * @param string $string UTF-8 string
	 * @return string UTF-8 string corresponding to the first letter of input
	 */
	function getFirstLetter( $string ) {
		$string = strval( $string );
		// Guard against the empty string: reading offset 0 of '' raises an
		// "uninitialized string offset" error.
		if ( $string !== '' && $string[0] == "\0" ) {
			// substr() can return false rather than '' on older PHP versions
			// when nothing is left, so normalise the result to a string.
			$string = strval( substr( $string, 1 ) );
		}
		return $this->lang->ucfirst( $this->lang->firstChar( $string ) );
	}
}
/**
 * Collation backed by the ICU library via PHP's intl extension.
 */
class IcuCollation extends Collation {
	/**
	 * $mainCollator produces the stored sortkeys; $primaryCollator is the
	 * same locale at primary strength and is used for first-letter lookup.
	 */
	var $primaryCollator, $mainCollator, $locale;

	/** Cached array with 'chars' and 'keys' lists, see getFirstLetterData() */
	var $firstLetterData;

	/**
	 * Unified CJK blocks.
	 *
	 * The same definition of a CJK block must be used for both Collation and
	 * generateCollationData.php. These blocks are omitted from the first
	 * letter data, as an optimisation measure and because the default UCA table
	 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
	 * blocks are not included here, because they are smaller and more useful.
	 */
	static $cjkBlocks = array(
		array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement
		array( 0x2F00, 0x2FDF ), // Kangxi Radicals
		array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters
		array( 0x3000, 0x303F ), // CJK Symbols and Punctuation
		array( 0x31C0, 0x31EF ), // CJK Strokes
		array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months
		array( 0x3300, 0x33FF ), // CJK Compatibility
		array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A
		array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs
		array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs
		array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms
		array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B
		array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C
		array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D
		array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement
	);

	const RECORD_LENGTH = 14;

	/**
	 * @param $locale string ICU locale name, e.g. 'root'
	 * @throws MWException if intl is not loaded or a collator cannot be created
	 */
	function __construct( $locale ) {
		if ( !extension_loaded( 'intl' ) ) {
			throw new MWException( 'An ICU collation was requested, ' .
				'but the intl extension is not available.' );
		}
		$this->locale = $locale;
		$this->mainCollator = Collator::create( $locale );
		if ( !$this->mainCollator ) {
			throw new MWException( "Invalid ICU locale specified for collation: $locale" );
		}

		$this->primaryCollator = Collator::create( $locale );
		if ( !$this->primaryCollator ) {
			// Check the second collator too, instead of failing later with a
			// method call on a non-object.
			throw new MWException( "Invalid ICU locale specified for collation: $locale" );
		}
		$this->primaryCollator->setStrength( Collator::PRIMARY );
	}

	/**
	 * Get the sortkey from the main collator.
	 *
	 * @param string $string UTF-8 string
	 * @return string Binary sortkey; a failed lookup is coerced to ''
	 */
	function getSortKey( $string ) {
		wfSuppressWarnings();
		$key = $this->mainCollator->getSortKey( $string ) . '';
		wfRestoreWarnings();
		return $key;
	}

	/**
	 * Get the sortkey from the primary-strength collator.
	 *
	 * @param string $string UTF-8 string
	 * @return string Binary sortkey; a failed lookup is coerced to ''
	 */
	function getPrimarySortKey( $string ) {
		wfSuppressWarnings();
		$key = $this->primaryCollator->getSortKey( $string ) . '';
		wfRestoreWarnings();
		return $key;
	}

	/**
	 * Find the heading letter for a string: the known first letter with the
	 * greatest primary sortkey which is less than or equal to the primary
	 * sortkey of the string. Characters in the CJK blocks are returned as
	 * their own heading.
	 *
	 * @param string $string UTF-8 string
	 * @return string UTF-8 heading, or '' if the string is empty or sorts
	 *   before every known first letter
	 */
	function getFirstLetter( $string ) {
		$string = strval( $string );
		if ( $string === '' ) {
			return '';
		}

		// Check for CJK
		$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
		if ( ord( $firstChar ) > 0x7f
			&& self::isCjk( utf8ToCodepoint( $firstChar ) ) )
		{
			return $firstChar;
		}

		$sortKey = $this->getPrimarySortKey( $string );

		// Do a binary search to find the correct letter to sort under
		$min = $this->findLowerBound(
			array( $this, 'getSortKeyByLetterIndex' ),
			$this->getFirstLetterCount(),
			'strcmp',
			$sortKey );

		if ( $min === false ) {
			// Before the first letter
			return '';
		}
		return $this->getLetterByIndex( $min );
	}

	/**
	 * Load the first-letter table, from the object, the cache, or the
	 * precompiled data file (in that order).
	 *
	 * @return array with parallel lists 'chars' (letters) and 'keys'
	 *   (their primary sortkeys), ordered by key
	 * @throws MWException if there is no data file for the locale
	 */
	function getFirstLetterData() {
		if ( $this->firstLetterData !== null ) {
			return $this->firstLetterData;
		}

		$cache = wfGetCache( CACHE_ANYTHING );
		$cacheKey = wfMemcKey( 'first-letters', $this->locale );
		$cacheEntry = $cache->get( $cacheKey );

		if ( $cacheEntry ) {
			$this->firstLetterData = $cacheEntry;
			return $this->firstLetterData;
		}

		// Generate data from serialized data file
		$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
		if ( $letters === false ) {
			throw new MWException( "MediaWiki does not support ICU locale " .
				"\"{$this->locale}\"" );
		}

		// Sort the letters.
		//
		// It's impossible to have the precompiled data file properly sorted,
		// because the sort order changes depending on ICU version. If the
		// array is not properly sorted, the binary search will return random
		// results.
		//
		// We also take this opportunity to remove primary collisions.
		$letterMap = array();
		foreach ( $letters as $letter ) {
			$key = $this->getPrimarySortKey( $letter );
			if ( isset( $letterMap[$key] ) ) {
				// Primary collision
				// Keep whichever one sorts first in the main collator
				if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
					$letterMap[$key] = $letter;
				}
			} else {
				$letterMap[$key] = $letter;
			}
		}
		ksort( $letterMap, SORT_STRING );
		$data = array(
			'chars' => array_values( $letterMap ),
			'keys' => array_keys( $letterMap )
		);

		// Reduce memory usage before caching
		unset( $letterMap );

		// Save to cache
		$this->firstLetterData = $data;
		$cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ );
		return $data;
	}

	/**
	 * @param $index int Position in the sorted first-letter table
	 * @return string The letter at that position
	 */
	function getLetterByIndex( $index ) {
		if ( $this->firstLetterData === null ) {
			$this->getFirstLetterData();
		}
		return $this->firstLetterData['chars'][$index];
	}

	/**
	 * @param $index int Position in the sorted first-letter table
	 * @return string The primary sortkey of the letter at that position
	 */
	function getSortKeyByLetterIndex( $index ) {
		if ( $this->firstLetterData === null ) {
			$this->getFirstLetterData();
		}
		return $this->firstLetterData['keys'][$index];
	}

	/**
	 * @return int Number of entries in the first-letter table
	 */
	function getFirstLetterCount() {
		if ( $this->firstLetterData === null ) {
			$this->getFirstLetterData();
		}
		return count( $this->firstLetterData['chars'] );
	}

	/**
	 * Do a binary search, and return the index of the largest item that sorts
	 * less than or equal to the target value.
	 *
	 * @param $valueCallback A function to call to get the value with
	 *     a given array index.
	 * @param $valueCount The number of items accessible via $valueCallback,
	 *     indexed from 0 to $valueCount - 1
	 * @param $comparisonCallback A callback to compare two values, returning
	 *     -1, 0 or 1 in the style of strcmp().
	 * @param $target The target value to find.
	 *
	 * @return The item index of the lower bound, or false if the target value
	 *     sorts before all items (which includes the case of an empty list).
	 */
	function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) {
		// Best candidate so far; stays false while every probed item is
		// greater than the target. Callers must compare with === false,
		// since index 0 is a valid result.
		$found = false;
		$low = 0;
		$high = $valueCount - 1;

		while ( $low <= $high ) {
			$mid = $low + ( ( $high - $low ) >> 1 );
			$item = call_user_func( $valueCallback, $mid );
			$comparison = call_user_func( $comparisonCallback, $target, $item );
			if ( $comparison == 0 ) {
				// Exact match
				return $mid;
			} elseif ( $comparison > 0 ) {
				// Item sorts before the target: it is a candidate, but a
				// later item may be a closer lower bound.
				$found = $mid;
				$low = $mid + 1;
			} else {
				// Item sorts after the target: look to the left only.
				$high = $mid - 1;
			}
		}
		return $found;
	}

	/**
	 * @param $codepoint int Unicode codepoint
	 * @return bool True if the codepoint lies in one of self::$cjkBlocks
	 */
	static function isCjk( $codepoint ) {
		foreach ( self::$cjkBlocks as $block ) {
			if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
				return true;
			}
		}
		return false;
	}
}

View file

@ -4611,15 +4611,26 @@ $wgCategoryMagicGallery = true;
$wgCategoryPagingLimit = 200;
/**
* A version indicator for collations that will be stored in cl_collation for
* all new rows. Used when the collation algorithm changes: a script checks
* for all rows where cl_collation != $wgCategoryCollation and regenerates
* cl_sortkey based on the page name and cl_sortkey_prefix.
* Specify how category names should be sorted, when listed on a category page.
* A sorting scheme is also known as a collation.
*
* Currently only supports 'uppercase2', which just uppercases the string. This
* is a dummy collation, to be replaced later by real ones.
* Available values are:
*
* - uppercase: Converts the category name to upper case, and sorts by that.
*
* - uca-default: Provides access to the Unicode Collation Algorithm with
* the default element table. This is a compromise collation which sorts
* all languages in a mediocre way. However, it is better than "uppercase".
*
* To use the uca-default collation, you must have PHP's intl extension
* installed. See http://php.net/manual/en/intl.setup.php . The details of the
* resulting collation will depend on the version of ICU installed on the
* server.
*
* After you change this, you must run maintenance/updateCollation.php to fix
* the sort keys in the database.
*/
$wgCategoryCollation = 'uppercase2';
$wgCategoryCollation = 'uppercase';
/** @} */ # End categories }

View file

@ -454,14 +454,14 @@ class LinksUpdate {
# (Title::moveTo() has had the same issue for a long time).
if ( $this->mTitle->getCategorySortkey() == $sortkey ) {
$prefix = '';
$sortkey = $wgContLang->convertToSortkey( $sortkey );
$sortkey = Collation::singleton()->getSortKey( $sortkey );
} else {
# Treat custom sortkeys as a prefix, so that if multiple
# things are forced to sort as '*' or something, they'll
# sort properly in the category rather than in page_id
# order or such.
$prefix = $sortkey;
$sortkey = $wgContLang->convertToSortkey(
$sortkey = Collation::singleton()->getSortKey(
$this->mTitle->getCategorySortkey( $prefix ) );
}

View file

@ -3088,8 +3088,6 @@ class Title {
* @return Mixed true on success, getUserPermissionsErrors()-like array on failure
*/
public function moveTo( &$nt, $auth = true, $reason = '', $createRedirect = true ) {
global $wgContLang;
$err = $this->isValidMoveOperation( $nt, $auth, $reason );
if ( is_array( $err ) ) {
return $err;
@ -3129,7 +3127,8 @@ class Title {
);
$dbw->update( 'categorylinks',
array(
'cl_sortkey' => $wgContLang->convertToSortkey( $nt->getCategorySortkey( $prefix ) ),
'cl_sortkey' => Collation::singleton()->getSortKey(
$nt->getCategorySortkey( $prefix ) ),
'cl_timestamp=cl_timestamp' ),
array( 'cl_from' => $pageid ),
__METHOD__ );
@ -4139,7 +4138,7 @@ class Title {
/**
* Returns the raw sort key to be used for categories, with the specified
* prefix. This will be fed to Language::convertToSortkey() to get a
* prefix. This will be fed to Collation::getSortKey() to get a
* binary sortkey that can be used for actual sorting.
*
* @param $prefix string The prefix to be used, specified using
@ -4153,7 +4152,7 @@ class Title {
# Separate with a line break, so the unprefixed part is only used as
# a tiebreaker when two pages have the exact same prefix -- a line break
# sorts before nearly everything else (hopefully).
return "$prefix\0$unprefixed";
return "$prefix\n$unprefixed";
}
return $unprefixed;
}

View file

@ -2996,50 +2996,4 @@ class Language {
function getConvRuleTitle() {
return $this->mConverter->getConvRuleTitle();
}
/**
* Given a string, convert it to a (hopefully short) key that can be used
* for efficient sorting. A binary sort according to the sortkeys
* corresponds to a logical sort of the corresponding strings. Current
* code expects that a null character should sort before all others, but
* has no other particular expectations (and that one can be changed if
* necessary).
*
* @param string $string UTF-8 string
* @return string Binary sortkey
*/
public function convertToSortkey( $string ) {
# Fake function for now
return $this->uc( $string );
}
/**
* Given a string, return the logical "first letter" to be used for
* grouping on category pages and so on. This has to be coordinated
* carefully with convertToSortkey(), or else the sorted list might jump
* back and forth between the same "initial letters" or other pathological
* behavior. For instance, if you just return the first character, but "a"
* sorts the same as "A" based on convertToSortkey(), then you might get a
* list like
*
* == A ==
* * [[Aardvark]]
*
* == a ==
* * [[antelope]]
*
* == A ==
* * [[Ape]]
*
* etc., assuming for the sake of argument that $wgCapitalLinks is false.
*
* @param string $string UTF-8 string
* @return string UTF-8 string corresponding to the first letter of input
*/
public function firstLetterForLists( $string ) {
if ( $string[0] == "\0" ) {
$string = substr( $string, 1 );
}
return $this->uc( $this->firstChar( $string ) );
}
}

View file

@ -0,0 +1,381 @@
<?php
require_once( dirname( __FILE__ ) .'/../Maintenance.php' );
/**
* Generate first letter data files for Collation.php
*/
class GenerateCollationData extends Maintenance {
	/** The directory with source data files in it */
	var $dataDir;

	/** The primary weights, indexed by codepoint */
	var $weights = array();

	/**
	 * A hashtable keyed by codepoint, where presence indicates that a character
	 * has a decomposition mapping. This makes it non-preferred for group header
	 * selection.
	 */
	var $mappedChars = array();

	/** Lists of codepoints sharing a primary weight, keyed by weight string */
	var $groups = array();

	/** File handle for --debug-output, or null if not requested */
	var $debugOutFile;

	/**
	 * Important tertiary weights from UTS #10 section 7.2
	 */
	const NORMAL_UPPERCASE = 0x08;
	const NORMAL_HIRAGANA = 0X0E;

	public function __construct() {
		parent::__construct();
		$this->addOption( 'data-dir', 'A directory on the local filesystem ' .
			'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
			false, true );
		$this->addOption( 'debug-output', 'Filename for sending debug output to',
			false, true );
	}

	/**
	 * Check that the input files exist, open the optional debug file, then
	 * load the UCD and write the first-letter data file.
	 */
	public function execute() {
		$this->dataDir = $this->getOption( 'data-dir', '.' );
		if ( !file_exists( "{$this->dataDir}/allkeys.txt" ) ) {
			$this->error( "Unable to find allkeys.txt. Please download it from " .
				"http://www.unicode.org/Public/UCA/latest/allkeys.txt and specify " .
				"its location with --data-dir=<DIR>" );
			exit( 1 );
		}
		if ( !file_exists( "{$this->dataDir}/ucd.all.grouped.xml" ) ) {
			$this->error( "Unable to find ucd.all.grouped.xml. Please download it " .
				"from http://www.unicode.org/Public/6.0.0/ucdxml/ucd.all.grouped.zip " .
				"and specify its location with --data-dir=<DIR>" );
			exit( 1 );
		}
		$debugOutFileName = $this->getOption( 'debug-output' );
		if ( $debugOutFileName ) {
			$this->debugOutFile = fopen( $debugOutFileName, 'w' );
			if ( !$this->debugOutFile ) {
				$this->error( "Unable to open debug output file for writing" );
				exit( 1 );
			}
		}
		$this->loadUcd();
		$this->generateFirstChars();

		// Flush and release the debug file explicitly rather than relying on
		// process shutdown.
		if ( $this->debugOutFile ) {
			fclose( $this->debugOutFile );
			$this->debugOutFile = null;
		}
	}

	/**
	 * Read every character from the UCD XML file through charCallback().
	 */
	function loadUcd() {
		$uxr = new UcdXmlReader( "{$this->dataDir}/ucd.all.grouped.xml" );
		$uxr->readChars( array( $this, 'charCallback' ) );
	}

	/**
	 * Record the implicit primary weight of one character, and whether it
	 * has a decomposition mapping.
	 *
	 * @param $data array UCD attributes of the character; this method reads
	 *   'gc', 'cp', 'block', 'UIdeo' and 'dm'
	 */
	function charCallback( $data ) {
		// Skip non-printable characters
		$category = substr( $data['gc'], 0, 1 );
		if ( strpos( 'LNPS', $category ) === false ) {
			return;
		}
		$cp = hexdec( $data['cp'] );

		// Skip the CJK ideograph blocks, as an optimisation measure.
		// UCA doesn't sort them properly anyway, without tailoring.
		if ( IcuCollation::isCjk( $cp ) ) {
			return;
		}

		// Skip the composed Hangul syllables, we will use the bare Jamo
		// as first letters
		if ( $data['block'] == 'Hangul Syllables' ) {
			return;
		}

		// Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
		if ( $data['UIdeo'] === 'Y' ) {
			if ( $data['block'] == 'CJK Unified Ideographs'
				|| $data['block'] == 'CJK Compatibility Ideographs' )
			{
				$base = 0xFB40;
			} else {
				$base = 0xFB80;
			}
		} else {
			$base = 0xFBC0;
		}
		$a = $base + ( $cp >> 15 );
		$b = ( $cp & 0x7fff ) | 0x8000;

		$this->weights[$cp] = sprintf( ".%04X.%04X", $a, $b );

		if ( $data['dm'] !== '#' ) {
			$this->mappedChars[$cp] = true;
		}

		// Progress indicator
		if ( $cp % 4096 == 0 ) {
			print "{$data['cp']}\n";
		}
	}

	/**
	 * Combine the UCD weights with allkeys.txt, group the characters by
	 * primary weight, choose one header character per group and write the
	 * serialized list to serialized/first-letters-root.ser.
	 */
	function generateFirstChars() {
		$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
		if ( !$file ) {
			$this->error( "Unable to open allkeys.txt" );
			exit( 1 );
		}
		global $IP;
		$outFile = fopen( "$IP/serialized/first-letters-root.ser", 'w' );
		if ( !$outFile ) {
			$this->error( "Unable to open output file first-letters-root.ser" );
			exit( 1 );
		}

		// Tertiary weight strings which mark a character as a good header,
		// in the same ".XXXX" format that is accumulated below.
		$goodTertiaryWeights = array(
			sprintf( '.%04X', self::NORMAL_UPPERCASE ),
			sprintf( '.%04X', self::NORMAL_HIRAGANA ),
		);
		$goodTertiaryChars = array();

		// For each character with an entry in allkeys.txt, overwrite the implicit
		// entry in $this->weights that came from the UCD.
		// Also gather a list of tertiary weights, for use in selecting the group header
		while ( false !== ( $line = fgets( $file ) ) ) {
			// We're only interested in single-character weights, pick them out with a regex
			$line = trim( $line );
			if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
				continue;
			}

			$cp = hexdec( $m[1] );
			$allWeights = trim( $m[2] );
			$primary = '';
			$tertiary = '';

			if ( !isset( $this->weights[$cp] ) ) {
				// Non-printable, ignore
				continue;
			}
			foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
				// Separate match variable, so the line match above is not clobbered
				preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $weightMatch );
				if ( !empty( $weightMatch[1] ) ) {
					if ( $weightMatch[1][0] !== '0000' ) {
						$primary .= '.' . $weightMatch[1][0];
					}
					if ( $weightMatch[1][2] !== '0000' ) {
						$tertiary .= '.' . $weightMatch[1][2];
					}
				}
			}
			$this->weights[$cp] = $primary;
			if ( in_array( $tertiary, $goodTertiaryWeights, true ) ) {
				$goodTertiaryChars[$cp] = true;
			}
		}
		fclose( $file );

		// Identify groups of characters with the same primary weight
		$this->groups = array();
		asort( $this->weights, SORT_STRING );
		$prevWeight = reset( $this->weights );
		$group = array();
		foreach ( $this->weights as $cp => $weight ) {
			if ( $weight !== $prevWeight ) {
				$this->groups[$prevWeight] = $group;
				$prevWeight = $weight;
				if ( isset( $this->groups[$weight] ) ) {
					$group = $this->groups[$weight];
				} else {
					$group = array();
				}
			}
			$group[] = $cp;
		}
		if ( $group ) {
			$this->groups[$prevWeight] = $group;
		}

		// If one character has a given primary weight sequence, and a second
		// character has a longer primary weight sequence with an initial
		// portion equal to the first character, then remove the second
		// character. This avoids having characters like U+A732 (double A)
		// polluting the basic latin sort area.
		foreach ( $this->groups as $weight => $group ) {
			if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
				if ( isset( $this->groups[$m[1]] ) ) {
					unset( $this->groups[$weight] );
				}
			}
		}

		ksort( $this->groups, SORT_STRING );

		// Identify the header character in each group
		$headerChars = array();
		$prevChar = "\000";
		$tertiaryCollator = new Collator( 'root' );
		$primaryCollator = new Collator( 'root' );
		$primaryCollator->setStrength( Collator::PRIMARY );
		$numOutOfOrder = 0;
		foreach ( $this->groups as $weight => $group ) {
			$uncomposedChars = array();
			$goodChars = array();
			foreach ( $group as $cp ) {
				if ( isset( $goodTertiaryChars[$cp] ) ) {
					$goodChars[] = $cp;
				}
				if ( !isset( $this->mappedChars[$cp] ) ) {
					$uncomposedChars[] = $cp;
				}
			}
			// Preference order: good tertiary weight and no decomposition,
			// then no decomposition, then anything in the group.
			$x = array_intersect( $goodChars, $uncomposedChars );
			if ( !$x ) {
				$x = $uncomposedChars;
				if ( !$x ) {
					$x = $group;
				}
			}

			// Use ICU to pick the lowest sorting character in the selection
			// NOTE(review): $x holds integer codepoints rather than UTF-8
			// strings; confirm that Collator::sort() orders them as intended.
			$tertiaryCollator->sort( $x );
			$cp = $x[0];

			$char = codepointToUtf8( $cp );
			$headerChars[] = $char;
			if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
				$numOutOfOrder ++;
				/*
				printf( "Out of order: U+%05X > U+%05X\n",
					utf8ToCodepoint( $prevChar ),
					utf8ToCodepoint( $char ) );
				 */
			}
			$prevChar = $char;

			if ( $this->debugOutFile ) {
				fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
					implode( ' ', array_map( 'codepointToUtf8', $group ) ) ) );
			}
		}

		print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";

		// Report a failed write instead of silently leaving a truncated file,
		// and close the handle so the data is flushed.
		$written = fwrite( $outFile, serialize( $headerChars ) );
		fclose( $outFile );
		if ( $written === false ) {
			$this->error( "Unable to write to output file first-letters-root.ser" );
			exit( 1 );
		}
	}
}
/**
 * Streaming reader for the grouped UCD XML file (ucd.all.grouped.xml).
 */
class UcdXmlReader {
	/** Path of the XML file */
	var $fileName;
	/** Callback invoked once per codepoint by readChars() */
	var $callback;
	/** Attributes of the enclosing <group> element, inherited by its chars */
	var $groupAttrs;
	/** The XMLReader currently in use */
	var $xml;
	/** Map of block name => array( first codepoint, last codepoint ) */
	var $blocks = array();
	/** The entry of $blocks under the array's internal pointer, or false */
	var $currentBlock;

	function __construct( $fileName ) {
		$this->fileName = $fileName;
	}

	/**
	 * Call $callback once for every codepoint described by a <char> element
	 * in the repertoire, passing an array of its attributes.
	 *
	 * @param $callback callback Receives one attribute array per codepoint
	 */
	public function readChars( $callback ) {
		$this->getBlocks();
		$this->currentBlock = reset( $this->blocks );
		$xml = $this->open();
		$this->callback = $callback;

		while ( $xml->name !== 'repertoire' && $xml->next() );

		while ( $xml->read() ) {
			if ( $xml->nodeType == XMLReader::ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = $this->readAttributes();
				} elseif ( $xml->name === 'char' ) {
					$this->handleChar();
				}
			} elseif ( $xml->nodeType === XMLReader::END_ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = array();
				}
			}
		}
		$xml->close();
	}

	/**
	 * Open the file and position the reader on the node after <ucd>.
	 *
	 * @return XMLReader
	 * @throws MWException if the file cannot be opened
	 */
	protected function open() {
		$this->xml = new XMLReader;
		// The object itself is always truthy, so test the return value of
		// open() to detect a failure.
		if ( !$this->xml->open( $this->fileName ) ) {
			throw new MWException( __METHOD__.": unable to open {$this->fileName}" );
		}
		while ( $this->xml->name !== 'ucd' && $this->xml->read() );
		$this->xml->read();
		return $this->xml;
	}

	/**
	 * Read the attributes of the current element node and return them
	 * as an array
	 */
	protected function readAttributes() {
		$attrs = array();
		while ( $this->xml->moveToNextAttribute() ) {
			$attrs[$this->xml->name] = $this->xml->value;
		}
		return $attrs;
	}

	/**
	 * Expand the current <char> element, which describes either a single
	 * codepoint ('cp') or a range ('first-cp' to 'last-cp'), into one
	 * callback invocation per codepoint.
	 */
	protected function handleChar() {
		$attrs = $this->readAttributes() + $this->groupAttrs;
		if ( isset( $attrs['cp'] ) ) {
			$first = $last = hexdec( $attrs['cp'] );
		} else {
			$first = hexdec( $attrs['first-cp'] );
			$last = hexdec( $attrs['last-cp'] );
			unset( $attrs['first-cp'] );
			unset( $attrs['last-cp'] );
		}

		// Keep the name templates separately. Substituting into $attrs in
		// place would consume the '#' placeholder on the first codepoint, so
		// the rest of a range would all receive the first codepoint's name.
		$nameTemplates = array();
		foreach ( array( 'na', 'na1' ) as $nameProp ) {
			if ( isset( $attrs[$nameProp] ) ) {
				$nameTemplates[$nameProp] = $attrs[$nameProp];
			}
		}

		for ( $cp = $first; $cp <= $last; $cp++ ) {
			$hexCp = sprintf( "%04X", $cp );
			foreach ( $nameTemplates as $nameProp => $template ) {
				$attrs[$nameProp] = str_replace( '#', $hexCp, $template );
			}

			// Advance through the block list until reaching the block that
			// contains $cp, or one that starts after it. This only ever moves
			// forward.
			while ( $this->currentBlock ) {
				if ( $cp < $this->currentBlock[0] ) {
					break;
				} elseif ( $cp <= $this->currentBlock[1] ) {
					$attrs['block'] = key( $this->blocks );
					break;
				} else {
					$this->currentBlock = next( $this->blocks );
				}
			}

			$attrs['cp'] = $hexCp;
			call_user_func( $this->callback, $attrs );
		}
	}

	/**
	 * Load the block list from the file, on first use.
	 *
	 * @return array Map of block name => array( first, last ) codepoints
	 */
	public function getBlocks() {
		if ( $this->blocks ) {
			return $this->blocks;
		}

		$xml = $this->open();
		while ( $xml->name !== 'blocks' && $xml->read() );

		while ( $xml->read() ) {
			if ( $xml->nodeType == XMLReader::ELEMENT ) {
				if ( $xml->name === 'block' ) {
					$attrs = $this->readAttributes();
					$first = hexdec( $attrs['first-cp'] );
					$last = hexdec( $attrs['last-cp'] );
					$this->blocks[$attrs['name']] = array( $first, $last );
				}
			}
		}
		$xml->close();
		return $this->blocks;
	}
}
$maintClass = 'GenerateCollationData';
require_once( DO_MAINTENANCE );

View file

@ -493,13 +493,13 @@ CREATE TABLE /*_*/categorylinks (
cl_to varchar(255) binary NOT NULL default '',
-- A binary string obtained by applying a sortkey generation algorithm
-- (Language::convertToSortkey()) to page_title, or cl_sortkey_prefix . "\0"
-- (Collation::getSortKey()) to page_title, or cl_sortkey_prefix . "\n"
-- . page_title if cl_sortkey_prefix is nonempty.
cl_sortkey varbinary(230) NOT NULL default '',
-- A prefix for the raw sortkey manually specified by the user, either via
-- [[Category:Foo|prefix]] or {{defaultsort:prefix}}. If nonempty, it's
-- concatenated with a null followed by the page title before the sortkey
-- concatenated with a line break followed by the page title before the sortkey
-- conversion algorithm is run. We store this so that we can update
-- collations without reparsing all pages.
-- Note: If you change the length of this field, you also need to change

View file

@ -46,7 +46,7 @@ TEXT;
}
public function execute() {
global $wgCategoryCollation, $wgContLang;
global $wgCategoryCollation;
$dbw = wfGetDB( DB_MASTER );
$count = $dbw->selectField(
@ -105,7 +105,7 @@ TEXT;
$dbw->update(
'categorylinks',
array(
'cl_sortkey' => $wgContLang->convertToSortkey(
'cl_sortkey' => Collation::singleton()->getSortKey(
$title->getCategorySortkey( $prefix ) ),
'cl_sortkey_prefix' => $prefix,
'cl_collation' => $wgCategoryCollation,

File diff suppressed because one or more lines are too long