* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables.
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters". * Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2. * Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers. * Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work. * Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.
This commit is contained in:
parent
14d576cd86
commit
eaeea84b44
11 changed files with 727 additions and 73 deletions
|
|
@ -43,6 +43,7 @@ $wgAutoloadLocalClasses = array(
|
|||
'ChangesFeed' => 'includes/ChangesFeed.php',
|
||||
'ChangeTags' => 'includes/ChangeTags.php',
|
||||
'ChannelFeed' => 'includes/Feed.php',
|
||||
'Collation' => 'includes/Collation.php',
|
||||
'Cookie' => 'includes/HttpFunctions.php',
|
||||
'CookieJar' => 'includes/HttpFunctions.php',
|
||||
'ConcatenatedGzipHistoryBlob' => 'includes/HistoryBlob.php',
|
||||
|
|
@ -127,6 +128,7 @@ $wgAutoloadLocalClasses = array(
|
|||
'HTMLInfoField' => 'includes/HTMLForm.php',
|
||||
'Http' => 'includes/HttpFunctions.php',
|
||||
'HttpRequest' => 'includes/HttpFunctions.old.php',
|
||||
'IcuCollation' => 'includes/Collation.php',
|
||||
'ImageGallery' => 'includes/ImageGallery.php',
|
||||
'ImageHistoryList' => 'includes/ImagePage.php',
|
||||
'ImageHistoryPseudoPager' => 'includes/ImagePage.php',
|
||||
|
|
@ -243,6 +245,7 @@ $wgAutoloadLocalClasses = array(
|
|||
'TitleListDependency' => 'includes/CacheDependency.php',
|
||||
'Token' => 'includes/Token.php',
|
||||
'UnlistedSpecialPage' => 'includes/SpecialPage.php',
|
||||
'UppercaseCollation' => 'includes/Collation.php',
|
||||
'User' => 'includes/User.php',
|
||||
'UserArray' => 'includes/UserArray.php',
|
||||
'UserArrayFromResult' => 'includes/UserArray.php',
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ class CategoryViewer {
|
|||
$children, $children_start_char,
|
||||
$showGallery, $gallery,
|
||||
$imgsNoGalley, $imgsNoGallery_start_char,
|
||||
$skin;
|
||||
$skin, $collation;
|
||||
# Category object for this page
|
||||
private $cat;
|
||||
# The original query array, to be used in generating paging links.
|
||||
|
|
@ -104,6 +104,7 @@ class CategoryViewer {
|
|||
$this->limit = $wgCategoryPagingLimit;
|
||||
$this->cat = Category::newFromTitle( $title );
|
||||
$this->query = $query;
|
||||
$this->collation = Collation::singleton();
|
||||
unset( $this->query['title'] );
|
||||
}
|
||||
|
||||
|
|
@ -212,7 +213,7 @@ class CategoryViewer {
|
|||
$word = $sortkey;
|
||||
}
|
||||
|
||||
$firstChar = $wgContLang->firstLetterForLists( $word );
|
||||
$firstChar = $this->collation->getFirstLetter( $word );
|
||||
|
||||
return $wgContLang->convert( $firstChar );
|
||||
}
|
||||
|
|
@ -241,7 +242,8 @@ class CategoryViewer {
|
|||
) . '</span>'
|
||||
: $this->getSkin()->link( $title );
|
||||
|
||||
$this->imgsNoGallery_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
|
||||
$this->imgsNoGallery_start_char[] = $wgContLang->convert(
|
||||
$this->collation->getFirstLetter( $sortkey ) );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -261,7 +263,8 @@ class CategoryViewer {
|
|||
) . '</span>'
|
||||
: $this->getSkin()->link( $title );
|
||||
|
||||
$this->articles_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
|
||||
$this->articles_start_char[] = $wgContLang->convert(
|
||||
$this->collation->getFirstLetter( $sortkey ) );
|
||||
}
|
||||
|
||||
function finaliseCategoryState() {
|
||||
|
|
@ -280,8 +283,6 @@ class CategoryViewer {
|
|||
}
|
||||
|
||||
function doCategoryQuery() {
|
||||
global $wgContLang;
|
||||
|
||||
$dbr = wfGetDB( DB_SLAVE, 'category' );
|
||||
|
||||
$this->nextPage = array(
|
||||
|
|
@ -294,14 +295,14 @@ class CategoryViewer {
|
|||
foreach ( array( 'page', 'subcat', 'file' ) as $type ) {
|
||||
# Get the sortkeys for start/end, if applicable. Note that if
|
||||
# the collation in the database differs from the one
|
||||
# $wgContLang is using, pagination might go totally haywire.
|
||||
# set in $wgCategoryCollation, pagination might go totally haywire.
|
||||
$extraConds = array( 'cl_type' => $type );
|
||||
if ( $this->from[$type] !== null ) {
|
||||
$extraConds[] = 'cl_sortkey >= '
|
||||
. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->from[$type] ) );
|
||||
. $dbr->addQuotes( $this->collation->getSortKey( $this->from[$type] ) );
|
||||
} elseif ( $this->until[$type] !== null ) {
|
||||
$extraConds[] = 'cl_sortkey < '
|
||||
. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->until[$type] ) );
|
||||
. $dbr->addQuotes( $this->collation->getSortKey( $this->until[$type] ) );
|
||||
$this->flip[$type] = true;
|
||||
}
|
||||
|
||||
|
|
|
|||
304
includes/Collation.php
Normal file
304
includes/Collation.php
Normal file
|
|
@ -0,0 +1,304 @@
|
|||
<?php
|
||||
|
||||
/**
 * Base class for the collations that can be selected for category sorting.
 *
 * A collation turns a string into a binary sort key and decides which
 * "first letter" heading a string is listed under. Concrete collations are
 * obtained through factory(), or through singleton() for the one configured
 * in $wgCategoryCollation.
 */
abstract class Collation {
	/** Instance created by the first call to singleton() and reused afterwards */
	static $instance;

	/**
	 * Get the collation configured in $wgCategoryCollation, creating it on
	 * first use.
	 *
	 * @return Collation
	 */
	static function singleton() {
		if ( self::$instance ) {
			return self::$instance;
		}
		global $wgCategoryCollation;
		self::$instance = self::factory( $wgCategoryCollation );
		return self::$instance;
	}

	/**
	 * Create a collation object from its configuration name.
	 *
	 * @param string $collationName Either 'uppercase' or 'uca-default'
	 * @return Collation
	 * @throws MWException if the name is not one of the known collations
	 */
	static function factory( $collationName ) {
		if ( $collationName == 'uppercase' ) {
			return new UppercaseCollation;
		} elseif ( $collationName == 'uca-default' ) {
			// The default UCA table is the ICU "root" locale
			return new IcuCollation( 'root' );
		}
		throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" );
	}

	/**
	 * Convert a string to a (hopefully short) binary key, such that a plain
	 * byte-wise comparison of two keys orders them the same way as a logical
	 * comparison of the original strings.
	 *
	 * NOTE(review): earlier documentation stated that callers expect a null
	 * character to sort before all others; confirm whether anything still
	 * relies on that.
	 *
	 * @param string $string UTF-8 string
	 * @return string Binary sortkey
	 */
	abstract function getSortKey( $string );

	/**
	 * Return the logical "first letter" under which a string is grouped on
	 * category pages and similar lists.
	 *
	 * The result must be consistent with getSortKey(): if two strings sort
	 * next to each other they should not alternate between headings. For
	 * example, returning the raw first character while getSortKey() treats
	 * "a" and "A" as equal could produce
	 *
	 * == A ==
	 * * [[Aardvark]]
	 *
	 * == a ==
	 * * [[antelope]]
	 *
	 * == A ==
	 * * [[Ape]]
	 *
	 * (assuming for the sake of argument that $wgCapitalLinks is false).
	 *
	 * @param string $string UTF-8 string
	 * @return string UTF-8 string corresponding to the first letter of input
	 */
	abstract function getFirstLetter( $string );
}
|
||||
|
||||
/**
 * Collation that sorts by the upper-cased form of a string.
 */
class UppercaseCollation extends Collation {
	/** Language object, used only for its generic UTF-8 case functions */
	var $lang;

	function __construct() {
		// Get a language object so that we can use the generic UTF-8 uppercase
		// function there
		$this->lang = Language::factory( 'en' );
	}

	/**
	 * @param string $string UTF-8 string
	 * @return string The upper-cased string, used directly as the sortkey
	 */
	function getSortKey( $string ) {
		return $this->lang->uc( $string );
	}

	/**
	 * @param string $string UTF-8 string
	 * @return string The first character of $string with its case raised by
	 *   Language::ucfirst(); a single leading null character is skipped first
	 */
	function getFirstLetter( $string ) {
		// isset() guards the offset access so that an empty string does not
		// raise an "uninitialized string offset" notice/warning.
		if ( isset( $string[0] ) && $string[0] == "\0" ) {
			$string = substr( $string, 1 );
		}
		return $this->lang->ucfirst( $this->lang->firstChar( $string ) );
	}
}
|
||||
|
||||
/**
 * Collation backed by the ICU Collator class from PHP's intl extension.
 *
 * Sort keys are produced by a Collator for the requested locale. First
 * letters are found by a binary search over a per-locale list of header
 * characters loaded from a precompiled data file.
 */
class IcuCollation extends Collation {
	var $primaryCollator, $mainCollator, $locale;
	var $firstLetterData;

	/**
	 * Unified CJK blocks.
	 *
	 * The same definition of a CJK block must be used for both Collation and
	 * generateCollationData.php. These blocks are omitted from the first
	 * letter data, as an optimisation measure and because the default UCA table
	 * is pretty useless for sorting Chinese text anyway. Japanese and Korean
	 * blocks are not included here, because they are smaller and more useful.
	 */
	static $cjkBlocks = array(
		array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement
		array( 0x2F00, 0x2FDF ), // Kangxi Radicals
		array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters
		array( 0x3000, 0x303F ), // CJK Symbols and Punctuation
		array( 0x31C0, 0x31EF ), // CJK Strokes
		array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months
		array( 0x3300, 0x33FF ), // CJK Compatibility
		array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A
		array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs
		array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs
		array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms
		array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B
		array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C
		array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D
		array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement
	);

	const RECORD_LENGTH = 14;

	/**
	 * @param string $locale ICU locale name, passed to Collator::create()
	 * @throws MWException if the intl extension is missing or a collator
	 *   cannot be created for the locale
	 */
	function __construct( $locale ) {
		if ( !extension_loaded( 'intl' ) ) {
			throw new MWException( 'An ICU collation was requested, ' .
				'but the intl extension is not available.' );
		}
		$this->locale = $locale;
		$this->mainCollator = Collator::create( $locale );
		if ( !$this->mainCollator ) {
			throw new MWException( "Invalid ICU locale specified for collation: $locale" );
		}

		// A second collator restricted to primary strength, used for
		// first-letter lookups.
		$this->primaryCollator = Collator::create( $locale );
		if ( !$this->primaryCollator ) {
			throw new MWException( "Invalid ICU locale specified for collation: $locale" );
		}
		$this->primaryCollator->setStrength( Collator::PRIMARY );
	}

	/**
	 * @param string $string UTF-8 string
	 * @return string Binary sortkey from the main collator
	 */
	function getSortKey( $string ) {
		wfSuppressWarnings();
		// Appending '' forces a string result whatever the collator returns
		$key = $this->mainCollator->getSortKey( $string ) . '';
		wfRestoreWarnings();
		return $key;
	}

	/**
	 * @param string $string UTF-8 string
	 * @return string Binary sortkey from the primary-strength collator
	 */
	function getPrimarySortKey( $string ) {
		wfSuppressWarnings();
		// Appending '' forces a string result whatever the collator returns
		$key = $this->primaryCollator->getSortKey( $string ) . '';
		wfRestoreWarnings();
		return $key;
	}

	/**
	 * Find the header character that $string should be listed under.
	 *
	 * @param string $string UTF-8 string
	 * @return string The header character; the first character itself when
	 *   it lies in one of self::$cjkBlocks; or '' for an empty string or a
	 *   string that sorts before every known header character
	 */
	function getFirstLetter( $string ) {
		$string = strval( $string );
		if ( $string === '' ) {
			return '';
		}

		// Check for CJK. The ord() test skips the codepoint decoding for
		// strings that start with an ASCII byte.
		$firstChar = mb_substr( $string, 0, 1, 'UTF-8' );
		if ( ord( $firstChar ) > 0x7f
			&& self::isCjk( utf8ToCodepoint( $firstChar ) ) )
		{
			return $firstChar;
		}

		$sortKey = $this->getPrimarySortKey( $string );

		// Do a binary search to find the correct letter to sort under
		$min = $this->findLowerBound(
			array( $this, 'getSortKeyByLetterIndex' ),
			$this->getFirstLetterCount(),
			'strcmp',
			$sortKey );

		if ( $min === false ) {
			// Before the first letter
			return '';
		}
		return $this->getLetterByIndex( $min );
	}

	/**
	 * Load the first-letter table for this locale.
	 *
	 * The table is taken from $this->firstLetterData if already loaded, then
	 * from the object cache, and otherwise rebuilt from the precompiled file
	 * "first-letters-<locale>.ser" and cached for one week.
	 *
	 * @return array Map with two parallel lists sorted by primary sortkey:
	 *   'chars' (header characters) and 'keys' (their primary sortkeys)
	 * @throws MWException if there is no precompiled data for the locale
	 */
	function getFirstLetterData() {
		if ( $this->firstLetterData !== null ) {
			return $this->firstLetterData;
		}

		$cache = wfGetCache( CACHE_ANYTHING );
		$cacheKey = wfMemcKey( 'first-letters', $this->locale );
		$cacheEntry = $cache->get( $cacheKey );

		if ( $cacheEntry ) {
			$this->firstLetterData = $cacheEntry;
			return $this->firstLetterData;
		}

		// Generate data from serialized data file

		$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
		if ( $letters === false ) {
			throw new MWException( "MediaWiki does not support ICU locale " .
				"\"{$this->locale}\"" );
		}

		// Sort the letters.
		//
		// It's impossible to have the precompiled data file properly sorted,
		// because the sort order changes depending on ICU version. If the
		// array is not properly sorted, the binary search will return random
		// results.
		//
		// We also take this opportunity to remove primary collisions.
		$letterMap = array();
		foreach ( $letters as $letter ) {
			$key = $this->getPrimarySortKey( $letter );
			if ( isset( $letterMap[$key] ) ) {
				// Primary collision
				// Keep whichever one sorts first in the main collator
				if ( $this->mainCollator->compare( $letter, $letterMap[$key] ) < 0 ) {
					$letterMap[$key] = $letter;
				}
			} else {
				$letterMap[$key] = $letter;
			}
		}
		ksort( $letterMap, SORT_STRING );
		$data = array(
			'chars' => array_values( $letterMap ),
			'keys' => array_keys( $letterMap )
		);

		// Reduce memory usage before caching
		unset( $letterMap );

		// Save to cache
		$this->firstLetterData = $data;
		$cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ );
		return $data;
	}

	/**
	 * @param int $index Position in the sorted first-letter table
	 * @return string The header character at that position
	 */
	function getLetterByIndex( $index ) {
		if ( $this->firstLetterData === null ) {
			$this->getFirstLetterData();
		}
		return $this->firstLetterData['chars'][$index];
	}

	/**
	 * @param int $index Position in the sorted first-letter table
	 * @return string The primary sortkey of the header character at that position
	 */
	function getSortKeyByLetterIndex( $index ) {
		if ( $this->firstLetterData === null ) {
			$this->getFirstLetterData();
		}
		return $this->firstLetterData['keys'][$index];
	}

	/**
	 * @return int Number of entries in the first-letter table
	 */
	function getFirstLetterCount() {
		if ( $this->firstLetterData === null ) {
			$this->getFirstLetterData();
		}
		return count( $this->firstLetterData['chars'] );
	}

	/**
	 * Do a binary search, and return the index of the largest item that sorts
	 * less than or equal to the target value.
	 *
	 * @param $valueCallback A function to call to get the value with
	 *     a given array index.
	 * @param $valueCount The number of items accessible via $valueCallback,
	 *     indexed from 0 to $valueCount - 1
	 * @param $comparisonCallback A callback to compare two values, returning
	 *     -1, 0 or 1 in the style of strcmp().
	 * @param $target The target value to find.
	 *
	 * @return The item index of the lower bound, or false if the target value
	 *     sorts before all items (or there are no items).
	 */
	function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) {
		if ( $valueCount <= 0 ) {
			// Nothing to search; avoid calling the callback with index -1
			return false;
		}

		// Search the half-open range [$min, $max). $max starts one past the
		// last index so that the last item can be selected as the lower
		// bound; $mid is always below $max and therefore a valid index.
		$min = 0;
		$max = $valueCount;
		do {
			$mid = $min + ( ( $max - $min ) >> 1 );
			$item = call_user_func( $valueCallback, $mid );
			$comparison = call_user_func( $comparisonCallback, $target, $item );
			if ( $comparison > 0 ) {
				$min = $mid;
			} elseif ( $comparison == 0 ) {
				$min = $mid;
				break;
			} else {
				$max = $mid;
			}
		} while ( $min < $max - 1 );

		if ( $min == 0 ) {
			// Index 0 may never have been compared against the target, so
			// check explicitly whether the target sorts before it.
			$item = call_user_func( $valueCallback, $min );
			$comparison = call_user_func( $comparisonCallback, $target, $item );
			if ( $comparison < 0 ) {
				// Before the first item
				return false;
			}
		}
		return $min;
	}

	/**
	 * @param int $codepoint Unicode codepoint
	 * @return bool Whether the codepoint lies in one of self::$cjkBlocks
	 */
	static function isCjk( $codepoint ) {
		foreach ( self::$cjkBlocks as $block ) {
			if ( $codepoint >= $block[0] && $codepoint <= $block[1] ) {
				return true;
			}
		}
		return false;
	}
}
|
||||
|
||||
|
|
@ -4611,15 +4611,26 @@ $wgCategoryMagicGallery = true;
|
|||
$wgCategoryPagingLimit = 200;
|
||||
|
||||
/**
|
||||
* A version indicator for collations that will be stored in cl_collation for
|
||||
* all new rows. Used when the collation algorithm changes: a script checks
|
||||
* for all rows where cl_collation != $wgCategoryCollation and regenerates
|
||||
* cl_sortkey based on the page name and cl_sortkey_prefix.
|
||||
* Specify how category names should be sorted, when listed on a category page.
|
||||
* A sorting scheme is also known as a collation.
|
||||
*
|
||||
* Currently only supports 'uppercase2', which just uppercases the string. This
|
||||
* is a dummy collation, to be replaced later by real ones.
|
||||
* Available values are:
|
||||
*
|
||||
* - uppercase: Converts the category name to upper case, and sorts by that.
|
||||
*
|
||||
* - uca-default: Provides access to the Unicode Collation Algorithm with
|
||||
* the default element table. This is a compromise collation which sorts
|
||||
* all languages in a mediocre way. However, it is better than "uppercase".
|
||||
*
|
||||
* To use the uca-default collation, you must have PHP's intl extension
|
||||
* installed. See http://php.net/manual/en/intl.setup.php . The details of the
|
||||
* resulting collation will depend on the version of ICU installed on the
|
||||
* server.
|
||||
*
|
||||
* After you change this, you must run maintenance/updateCollation.php to fix
|
||||
* the sort keys in the database.
|
||||
*/
|
||||
$wgCategoryCollation = 'uppercase2';
|
||||
$wgCategoryCollation = 'uppercase';
|
||||
|
||||
/** @} */ # End categories }
|
||||
|
||||
|
|
|
|||
|
|
@ -454,14 +454,14 @@ class LinksUpdate {
|
|||
# (Title::moveTo() has had the same issue for a long time).
|
||||
if ( $this->mTitle->getCategorySortkey() == $sortkey ) {
|
||||
$prefix = '';
|
||||
$sortkey = $wgContLang->convertToSortkey( $sortkey );
|
||||
$sortkey = Collation::singleton()->getSortKey( $sortkey );
|
||||
} else {
|
||||
# Treat custom sortkeys as a prefix, so that if multiple
|
||||
# things are forced to sort as '*' or something, they'll
|
||||
# sort properly in the category rather than in page_id
|
||||
# order or such.
|
||||
$prefix = $sortkey;
|
||||
$sortkey = $wgContLang->convertToSortkey(
|
||||
$sortkey = Collation::singleton()->getSortKey(
|
||||
$this->mTitle->getCategorySortkey( $prefix ) );
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -3088,8 +3088,6 @@ class Title {
|
|||
* @return Mixed true on success, getUserPermissionsErrors()-like array on failure
|
||||
*/
|
||||
public function moveTo( &$nt, $auth = true, $reason = '', $createRedirect = true ) {
|
||||
global $wgContLang;
|
||||
|
||||
$err = $this->isValidMoveOperation( $nt, $auth, $reason );
|
||||
if ( is_array( $err ) ) {
|
||||
return $err;
|
||||
|
|
@ -3129,7 +3127,8 @@ class Title {
|
|||
);
|
||||
$dbw->update( 'categorylinks',
|
||||
array(
|
||||
'cl_sortkey' => $wgContLang->convertToSortkey( $nt->getCategorySortkey( $prefix ) ),
|
||||
'cl_sortkey' => Collation::singleton()->getSortKey(
|
||||
$nt->getCategorySortkey( $prefix ) ),
|
||||
'cl_timestamp=cl_timestamp' ),
|
||||
array( 'cl_from' => $pageid ),
|
||||
__METHOD__ );
|
||||
|
|
@ -4139,7 +4138,7 @@ class Title {
|
|||
|
||||
/**
|
||||
* Returns the raw sort key to be used for categories, with the specified
|
||||
* prefix. This will be fed to Language::convertToSortkey() to get a
|
||||
* prefix. This will be fed to Collation::getSortKey() to get a
|
||||
* binary sortkey that can be used for actual sorting.
|
||||
*
|
||||
* @param $prefix string The prefix to be used, specified using
|
||||
|
|
@ -4153,7 +4152,7 @@ class Title {
|
|||
# Separate with a line break, so the unprefixed part is only used as
|
||||
# a tiebreaker when two pages have the exact same prefix -- a null
|
||||
# character cannot be used because the intl collations ignore it.
|
||||
return "$prefix\0$unprefixed";
|
||||
return "$prefix\n$unprefixed";
|
||||
}
|
||||
return $unprefixed;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2996,50 +2996,4 @@ class Language {
|
|||
function getConvRuleTitle() {
|
||||
return $this->mConverter->getConvRuleTitle();
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a string, convert it to a (hopefully short) key that can be used
|
||||
* for efficient sorting. A binary sort according to the sortkeys
|
||||
* corresponds to a logical sort of the corresponding strings. Current
|
||||
* code expects that a null character should sort before all others, but
|
||||
* has no other particular expectations (and that one can be changed if
|
||||
* necessary).
|
||||
*
|
||||
* @param string $string UTF-8 string
|
||||
* @return string Binary sortkey
|
||||
*/
|
||||
public function convertToSortkey( $string ) {
|
||||
# Fake function for now
|
||||
return $this->uc( $string );
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a string, return the logical "first letter" to be used for
|
||||
* grouping on category pages and so on. This has to be coordinated
|
||||
* carefully with convertToSortkey(), or else the sorted list might jump
|
||||
* back and forth between the same "initial letters" or other pathological
|
||||
* behavior. For instance, if you just return the first character, but "a"
|
||||
* sorts the same as "A" based on convertToSortkey(), then you might get a
|
||||
* list like
|
||||
*
|
||||
* == A ==
|
||||
* * [[Aardvark]]
|
||||
*
|
||||
* == a ==
|
||||
* * [[antelope]]
|
||||
*
|
||||
* == A ==
|
||||
* * [[Ape]]
|
||||
*
|
||||
* etc., assuming for the sake of argument that $wgCapitalLinks is false.
|
||||
*
|
||||
* @param string $string UTF-8 string
|
||||
* @return string UTF-8 string corresponding to the first letter of input
|
||||
*/
|
||||
public function firstLetterForLists( $string ) {
|
||||
if ( $string[0] == "\0" ) {
|
||||
$string = substr( $string, 1 );
|
||||
}
|
||||
return $this->uc( $this->firstChar( $string ) );
|
||||
}
|
||||
}
|
||||
|
|
|
|||
381
maintenance/language/generateCollationData.php
Normal file
381
maintenance/language/generateCollationData.php
Normal file
|
|
@ -0,0 +1,381 @@
|
|||
<?php
|
||||
|
||||
require_once( dirname( __FILE__ ) .'/../Maintenance.php' );
|
||||
|
||||
/**
 * Generate first letter data files for Collation.php
 *
 * Reads allkeys.txt and ucd.all.grouped.xml from the directory given with
 * --data-dir and writes the serialized list of header characters to
 * $IP/serialized/first-letters-root.ser.
 */
class GenerateCollationData extends Maintenance {
	/** The directory with source data files in it */
	var $dataDir;

	/** The primary weights, indexed by codepoint */
	var $weights = array();

	/**
	 * A hashtable keyed by codepoint, where presence indicates that a character
	 * has a decomposition mapping. This makes it non-preferred for group header
	 * selection.
	 */
	var $mappedChars = array();

	/** File handle for the optional --debug-output file; unset when not requested */
	var $debugOutFile;

	/** Lists of codepoints sharing a primary weight, keyed by that weight string */
	var $groups = array();

	/**
	 * Important tertiary weights from UTS #10 section 7.2
	 */
	const NORMAL_UPPERCASE = 0x08;
	const NORMAL_HIRAGANA = 0X0E;

	public function __construct() {
		parent::__construct();
		$this->addOption( 'data-dir', 'A directory on the local filesystem ' .
			'containing allkeys.txt and ucd.all.grouped.xml from unicode.org',
			false, true );
		$this->addOption( 'debug-output', 'Filename for sending debug output to',
			false, true );
	}

	/**
	 * Check that the input files exist, open the optional debug file, then
	 * load the UCD and generate the first-letter data. Exits with status 1
	 * if an input file is missing or the debug file cannot be opened.
	 */
	public function execute() {
		$this->dataDir = $this->getOption( 'data-dir', '.' );
		if ( !file_exists( "{$this->dataDir}/allkeys.txt" ) ) {
			$this->error( "Unable to find allkeys.txt. Please download it from " .
				"http://www.unicode.org/Public/UCA/latest/allkeys.txt and specify " .
				"its location with --data-dir=<DIR>" );
			exit( 1 );
		}
		if ( !file_exists( "{$this->dataDir}/ucd.all.grouped.xml" ) ) {
			$this->error( "Unable to find ucd.all.grouped.xml. Please download it " .
				"from http://www.unicode.org/Public/6.0.0/ucdxml/ucd.all.grouped.zip " .
				"and specify its location with --data-dir=<DIR>" );
			exit( 1 );
		}
		$debugOutFileName = $this->getOption( 'debug-output' );
		if ( $debugOutFileName ) {
			$this->debugOutFile = fopen( $debugOutFileName, 'w' );
			if ( !$this->debugOutFile ) {
				$this->error( "Unable to open debug output file for writing" );
				exit( 1 );
			}
		}
		$this->loadUcd();
		$this->generateFirstChars();
	}

	/**
	 * Feed every character described in ucd.all.grouped.xml to charCallback().
	 */
	function loadUcd() {
		$uxr = new UcdXmlReader( "{$this->dataDir}/ucd.all.grouped.xml" );
		$uxr->readChars( array( $this, 'charCallback' ) );
	}

	/**
	 * Record the implicit primary weight of one character, and whether it has
	 * a decomposition mapping.
	 *
	 * @param array $data Attributes of the character as delivered by
	 *   UcdXmlReader; this method reads 'gc', 'cp', 'block', 'UIdeo' and 'dm'
	 */
	function charCallback( $data ) {
		// Skip non-printable characters
		$category = substr( $data['gc'], 0, 1 );
		if ( strpos( 'LNPS', $category ) === false ) {
			return;
		}
		$cp = hexdec( $data['cp'] );

		// Skip the CJK ideograph blocks, as an optimisation measure.
		// UCA doesn't sort them properly anyway, without tailoring.
		if ( IcuCollation::isCjk( $cp ) ) {
			return;
		}

		// Skip the composed Hangul syllables, we will use the bare Jamo
		// as first letters
		if ( $data['block'] == 'Hangul Syllables' ) {
			return;
		}

		// Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
		if ( $data['UIdeo'] === 'Y' ) {
			if ( $data['block'] == 'CJK Unified Ideographs'
				|| $data['block'] == 'CJK Compatibility Ideographs' )
			{
				$base = 0xFB40;
			} else {
				$base = 0xFB80;
			}
		} else {
			$base = 0xFBC0;
		}
		$a = $base + ( $cp >> 15 );
		$b = ( $cp & 0x7fff ) | 0x8000;

		$this->weights[$cp] = sprintf( ".%04X.%04X", $a, $b );

		if ( $data['dm'] !== '#' ) {
			$this->mappedChars[$cp] = true;
		}

		// Progress indicator
		if ( $cp % 4096 == 0 ) {
			print "{$data['cp']}\n";
		}
	}

	/**
	 * Combine the implicit weights from the UCD with the explicit weights in
	 * allkeys.txt, group characters by primary weight, choose one header
	 * character per group and write the serialized list of header characters
	 * to $IP/serialized/first-letters-root.ser.
	 */
	function generateFirstChars() {
		$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
		if ( !$file ) {
			$this->error( "Unable to open allkeys.txt" );
			exit( 1 );
		}
		global $IP;
		$outFile = fopen( "$IP/serialized/first-letters-root.ser", 'w' );
		if ( !$outFile ) {
			$this->error( "Unable to open output file first-letters-root.ser" );
			exit( 1 );
		}

		$goodTertiaryChars = array();
		// Tertiary weight strings that mark a character as a good header
		$goodTertiaryWeights = array(
			sprintf( '.%04X', self::NORMAL_UPPERCASE ),
			sprintf( '.%04X', self::NORMAL_HIRAGANA ),
		);

		// For each character with an entry in allkeys.txt, overwrite the implicit
		// entry in $this->weights that came from the UCD.
		// Also gather a list of tertiary weights, for use in selecting the group header
		while ( false !== ( $line = fgets( $file ) ) ) {
			// We're only interested in single-character weights, pick them out with a regex
			$line = trim( $line );
			if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
				continue;
			}

			$cp = hexdec( $m[1] );
			$allWeights = trim( $m[2] );
			$primary = '';
			$tertiary = '';

			if ( !isset( $this->weights[$cp] ) ) {
				// Non-printable, ignore
				continue;
			}
			foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
				// A separate match variable, so the line match in $m is not clobbered
				preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $weightMatch );
				if ( !empty( $weightMatch[1] ) ) {
					if ( $weightMatch[1][0] !== '0000' ) {
						$primary .= '.' . $weightMatch[1][0];
					}
					// Guard against a collation element with fewer than three levels
					if ( isset( $weightMatch[1][2] ) && $weightMatch[1][2] !== '0000' ) {
						$tertiary .= '.' . $weightMatch[1][2];
					}
				}
			}
			$this->weights[$cp] = $primary;
			if ( in_array( $tertiary, $goodTertiaryWeights, true ) ) {
				$goodTertiaryChars[$cp] = true;
			}
		}
		fclose( $file );

		// Identify groups of characters with the same primary weight
		$this->groups = array();
		asort( $this->weights, SORT_STRING );
		$prevWeight = reset( $this->weights );
		$group = array();
		foreach ( $this->weights as $cp => $weight ) {
			if ( $weight !== $prevWeight ) {
				$this->groups[$prevWeight] = $group;
				$prevWeight = $weight;
				if ( isset( $this->groups[$weight] ) ) {
					$group = $this->groups[$weight];
				} else {
					$group = array();
				}
			}
			$group[] = $cp;
		}
		if ( $group ) {
			$this->groups[$prevWeight] = $group;
		}

		// If one character has a given primary weight sequence, and a second
		// character has a longer primary weight sequence with an initial
		// portion equal to the first character, then remove the second
		// character. This avoids having characters like U+A732 (double A)
		// polluting the basic latin sort area.
		foreach ( $this->groups as $weight => $group ) {
			if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
				if ( isset( $this->groups[$m[1]] ) ) {
					unset( $this->groups[$weight] );
				}
			}
		}

		ksort( $this->groups, SORT_STRING );

		// Identify the header character in each group
		$headerChars = array();
		$prevChar = "\000";
		$tertiaryCollator = new Collator( 'root' );
		$primaryCollator = new Collator( 'root' );
		$primaryCollator->setStrength( Collator::PRIMARY );
		$numOutOfOrder = 0;
		foreach ( $this->groups as $weight => $group ) {
			$uncomposedChars = array();
			$goodChars = array();
			foreach ( $group as $cp ) {
				if ( isset( $goodTertiaryChars[$cp] ) ) {
					$goodChars[] = $cp;
				}
				if ( !isset( $this->mappedChars[$cp] ) ) {
					$uncomposedChars[] = $cp;
				}
			}
			// Preference order: good tertiary weight and no decomposition,
			// then no decomposition, then anything in the group.
			$x = array_intersect( $goodChars, $uncomposedChars );
			if ( !$x ) {
				$x = $uncomposedChars;
				if ( !$x ) {
					$x = $group;
				}
			}

			// Use ICU to pick the lowest sorting character in the selection
			// NOTE(review): $x holds integer codepoints rather than UTF-8
			// strings, so this presumably orders by codepoint value and not
			// by collation order -- confirm before changing, since the
			// precomputed data file depends on it.
			$tertiaryCollator->sort( $x );
			$cp = $x[0];

			$char = codepointToUtf8( $cp );
			$headerChars[] = $char;
			if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
				$numOutOfOrder ++;
				/*
				printf( "Out of order: U+%05X > U+%05X\n",
					utf8ToCodepoint( $prevChar ),
					utf8ToCodepoint( $char ) );
				 */
			}
			$prevChar = $char;

			if ( $this->debugOutFile ) {
				fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
					implode( ' ', array_map( 'codepointToUtf8', $group ) ) ) );
			}
		}

		print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";

		if ( fwrite( $outFile, serialize( $headerChars ) ) === false ) {
			$this->error( "Unable to write output file first-letters-root.ser" );
			exit( 1 );
		}
		// Close the handles explicitly so that the data is flushed to disk
		fclose( $outFile );
		if ( $this->debugOutFile ) {
			fclose( $this->debugOutFile );
			$this->debugOutFile = null;
		}
	}
}
|
||||
|
||||
/**
 * Streaming reader for the Unicode Character Database XML file
 * (ucd.all.grouped.xml). It reports every character in the <repertoire>
 * section to a callback, and provides the list of blocks from the <blocks>
 * section.
 */
class UcdXmlReader {
	/** Path of the XML file */
	var $fileName;
	/** Callback invoked by readChars() once per character */
	var $callback;
	/** Attributes of the <group> element currently being read */
	var $groupAttrs = array();
	/** The XMLReader currently in use */
	var $xml;
	/** Map of block name => array( first codepoint, last codepoint ) */
	var $blocks = array();
	/** The entry of $blocks that the character scan has reached, or false at the end */
	var $currentBlock;

	function __construct( $fileName ) {
		$this->fileName = $fileName;
	}

	/**
	 * Read all <char> elements and call $callback once per codepoint with an
	 * array of its attributes (see handleChar()).
	 *
	 * @param $callback callback Called with one array argument
	 */
	public function readChars( $callback ) {
		$this->getBlocks();
		$this->currentBlock = reset( $this->blocks );
		$xml = $this->open();
		$this->callback = $callback;

		// Skip whole subtrees until the <repertoire> element is reached
		while ( $xml->name !== 'repertoire' && $xml->next() );

		while ( $xml->read() ) {
			if ( $xml->nodeType == XMLReader::ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = $this->readAttributes();
				} elseif ( $xml->name === 'char' ) {
					$this->handleChar();
				}
			} elseif ( $xml->nodeType === XMLReader::END_ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = array();
				}
			}
		}
		$xml->close();
	}

	/**
	 * Open the file and position the reader on the first node inside <ucd>.
	 *
	 * @return XMLReader
	 * @throws MWException if the file cannot be opened
	 */
	protected function open() {
		$this->xml = new XMLReader;
		// The object itself is always truthy; it is the return value of
		// open() that reports failure.
		if ( !$this->xml->open( $this->fileName ) ) {
			throw new MWException( __METHOD__.": unable to open {$this->fileName}" );
		}
		while ( $this->xml->name !== 'ucd' && $this->xml->read() );
		$this->xml->read();
		return $this->xml;
	}

	/**
	 * Read the attributes of the current element node and return them
	 * as an array
	 */
	protected function readAttributes() {
		$attrs = array();
		while ( $this->xml->moveToNextAttribute() ) {
			$attrs[$this->xml->name] = $this->xml->value;
		}
		return $attrs;
	}

	/**
	 * Handle the current <char> element, which describes either a single
	 * codepoint ("cp") or a range ("first-cp" to "last-cp"), and invoke the
	 * callback once per codepoint.
	 *
	 * Each callback receives the element's attributes merged over those of
	 * the enclosing group, with "#" in the "na"/"na1" names replaced by the
	 * hexadecimal codepoint, "cp" set to the hexadecimal codepoint, and
	 * "block" set to the name of the containing block when there is one.
	 */
	protected function handleChar() {
		$attrs = $this->readAttributes() + $this->groupAttrs;
		if ( isset( $attrs['cp'] ) ) {
			$first = $last = hexdec( $attrs['cp'] );
		} else {
			$first = hexdec( $attrs['first-cp'] );
			$last = hexdec( $attrs['last-cp'] );
			unset( $attrs['first-cp'] );
			unset( $attrs['last-cp'] );
		}

		for ( $cp = $first; $cp <= $last; $cp++ ) {
			// Work on a per-codepoint copy: substituting into the shared
			// array would consume the "#" placeholder on the first codepoint
			// of a range and carry "block" over to later codepoints.
			$charAttrs = $attrs;
			$hexCp = sprintf( "%04X", $cp );
			foreach ( array( 'na', 'na1' ) as $nameProp ) {
				if ( isset( $charAttrs[$nameProp] ) ) {
					$charAttrs[$nameProp] = str_replace( '#', $hexCp, $charAttrs[$nameProp] );
				}
			}

			// Advance through the block list until the block containing
			// $cp is found, or a block starting after $cp is reached.
			while ( $this->currentBlock ) {
				if ( $cp < $this->currentBlock[0] ) {
					break;
				} elseif ( $cp <= $this->currentBlock[1] ) {
					$charAttrs['block'] = key( $this->blocks );
					break;
				} else {
					$this->currentBlock = next( $this->blocks );
				}
			}

			$charAttrs['cp'] = $hexCp;
			call_user_func( $this->callback, $charAttrs );
		}
	}

	/**
	 * Get the blocks listed in the <blocks> section. The list is read from
	 * the file on the first call and kept in $this->blocks.
	 *
	 * @return array Map of block name => array( first codepoint, last codepoint )
	 */
	public function getBlocks() {
		if ( $this->blocks ) {
			return $this->blocks;
		}

		$xml = $this->open();
		while ( $xml->name !== 'blocks' && $xml->read() );

		while ( $xml->read() ) {
			if ( $xml->nodeType == XMLReader::ELEMENT ) {
				if ( $xml->name === 'block' ) {
					$attrs = $this->readAttributes();
					$first = hexdec( $attrs['first-cp'] );
					$last = hexdec( $attrs['last-cp'] );
					$this->blocks[$attrs['name']] = array( $first, $last );
				}
			}
		}
		$xml->close();
		return $this->blocks;
	}

}
|
||||
|
||||
$maintClass = 'GenerateCollationData';
|
||||
require_once( DO_MAINTENANCE );
|
||||
|
||||
|
|
@ -493,13 +493,13 @@ CREATE TABLE /*_*/categorylinks (
|
|||
cl_to varchar(255) binary NOT NULL default '',
|
||||
|
||||
-- A binary string obtained by applying a sortkey generation algorithm
|
||||
-- (Language::convertToSortkey()) to page_title, or cl_sortkey_prefix . "\0"
|
||||
-- (Collation::getSortKey()) to page_title, or cl_sortkey_prefix . "\n"
|
||||
-- . page_title if cl_sortkey_prefix is nonempty.
|
||||
cl_sortkey varbinary(230) NOT NULL default '',
|
||||
|
||||
-- A prefix for the raw sortkey manually specified by the user, either via
|
||||
-- [[Category:Foo|prefix]] or {{defaultsort:prefix}}. If nonempty, it's
|
||||
-- concatenated with a null followed by the page title before the sortkey
|
||||
-- concatenated with a line break followed by the page title before the sortkey
|
||||
-- conversion algorithm is run. We store this so that we can update
|
||||
-- collations without reparsing all pages.
|
||||
-- Note: If you change the length of this field, you also need to change
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ TEXT;
|
|||
}
|
||||
|
||||
public function execute() {
|
||||
global $wgCategoryCollation, $wgContLang;
|
||||
global $wgCategoryCollation;
|
||||
|
||||
$dbw = wfGetDB( DB_MASTER );
|
||||
$count = $dbw->selectField(
|
||||
|
|
@ -105,7 +105,7 @@ TEXT;
|
|||
$dbw->update(
|
||||
'categorylinks',
|
||||
array(
|
||||
'cl_sortkey' => $wgContLang->convertToSortkey(
|
||||
'cl_sortkey' => Collation::singleton()->getSortKey(
|
||||
$title->getCategorySortkey( $prefix ) ),
|
||||
'cl_sortkey_prefix' => $prefix,
|
||||
'cl_collation' => $wgCategoryCollation,
|
||||
|
|
|
|||
1
serialized/first-letters-root.ser
Normal file
1
serialized/first-letters-root.ser
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue