* Introduced a non-dummy collation for $wgCategoryCollation, namely UCA with default tables.
* Added a maintenance script which generates a list of first letters. Unified Han are omitted for performance, and because they shouldn't be used as headings anyway. A future collation specific to Chinese would provide the KangXi radicals as "first letters".
* Provided a precomputed list of first letters. Used Unicode 6.0.0 data and ICU 4.2.
* Moved collation functionality from Language to a Collation class hierarchy with factory function. Removed the recently-added methods from Language and updated all callers.
* Changed Title::getCategorySortkey() to separate its parts with a line break instead of a null character. All collations supported by the intl extension ignore the null character, i.e. "ab" == "a\0b". It would have required a lot of hacking to make it work.
* Fixed the uppercase collation to handle non-ASCII characters, redundantly with r80436. I don't think it's necessary to change the collation name as was done there, so I reverted that in the course of my conflict merge. A --force option to updateCollation.php might be nice though.
This commit is contained in:
parent
14d576cd86
commit
eaeea84b44
11 changed files with 727 additions and 73 deletions
|
|
@ -43,6 +43,7 @@ $wgAutoloadLocalClasses = array(
|
||||||
'ChangesFeed' => 'includes/ChangesFeed.php',
|
'ChangesFeed' => 'includes/ChangesFeed.php',
|
||||||
'ChangeTags' => 'includes/ChangeTags.php',
|
'ChangeTags' => 'includes/ChangeTags.php',
|
||||||
'ChannelFeed' => 'includes/Feed.php',
|
'ChannelFeed' => 'includes/Feed.php',
|
||||||
|
'Collation' => 'includes/Collation.php',
|
||||||
'Cookie' => 'includes/HttpFunctions.php',
|
'Cookie' => 'includes/HttpFunctions.php',
|
||||||
'CookieJar' => 'includes/HttpFunctions.php',
|
'CookieJar' => 'includes/HttpFunctions.php',
|
||||||
'ConcatenatedGzipHistoryBlob' => 'includes/HistoryBlob.php',
|
'ConcatenatedGzipHistoryBlob' => 'includes/HistoryBlob.php',
|
||||||
|
|
@ -127,6 +128,7 @@ $wgAutoloadLocalClasses = array(
|
||||||
'HTMLInfoField' => 'includes/HTMLForm.php',
|
'HTMLInfoField' => 'includes/HTMLForm.php',
|
||||||
'Http' => 'includes/HttpFunctions.php',
|
'Http' => 'includes/HttpFunctions.php',
|
||||||
'HttpRequest' => 'includes/HttpFunctions.old.php',
|
'HttpRequest' => 'includes/HttpFunctions.old.php',
|
||||||
|
'IcuCollation' => 'includes/Collation.php',
|
||||||
'ImageGallery' => 'includes/ImageGallery.php',
|
'ImageGallery' => 'includes/ImageGallery.php',
|
||||||
'ImageHistoryList' => 'includes/ImagePage.php',
|
'ImageHistoryList' => 'includes/ImagePage.php',
|
||||||
'ImageHistoryPseudoPager' => 'includes/ImagePage.php',
|
'ImageHistoryPseudoPager' => 'includes/ImagePage.php',
|
||||||
|
|
@ -243,6 +245,7 @@ $wgAutoloadLocalClasses = array(
|
||||||
'TitleListDependency' => 'includes/CacheDependency.php',
|
'TitleListDependency' => 'includes/CacheDependency.php',
|
||||||
'Token' => 'includes/Token.php',
|
'Token' => 'includes/Token.php',
|
||||||
'UnlistedSpecialPage' => 'includes/SpecialPage.php',
|
'UnlistedSpecialPage' => 'includes/SpecialPage.php',
|
||||||
|
'UppercaseCollation' => 'includes/Collation.php',
|
||||||
'User' => 'includes/User.php',
|
'User' => 'includes/User.php',
|
||||||
'UserArray' => 'includes/UserArray.php',
|
'UserArray' => 'includes/UserArray.php',
|
||||||
'UserArrayFromResult' => 'includes/UserArray.php',
|
'UserArrayFromResult' => 'includes/UserArray.php',
|
||||||
|
|
|
||||||
|
|
@ -90,7 +90,7 @@ class CategoryViewer {
|
||||||
$children, $children_start_char,
|
$children, $children_start_char,
|
||||||
$showGallery, $gallery,
|
$showGallery, $gallery,
|
||||||
$imgsNoGalley, $imgsNoGallery_start_char,
|
$imgsNoGalley, $imgsNoGallery_start_char,
|
||||||
$skin;
|
$skin, $collation;
|
||||||
# Category object for this page
|
# Category object for this page
|
||||||
private $cat;
|
private $cat;
|
||||||
# The original query array, to be used in generating paging links.
|
# The original query array, to be used in generating paging links.
|
||||||
|
|
@ -104,6 +104,7 @@ class CategoryViewer {
|
||||||
$this->limit = $wgCategoryPagingLimit;
|
$this->limit = $wgCategoryPagingLimit;
|
||||||
$this->cat = Category::newFromTitle( $title );
|
$this->cat = Category::newFromTitle( $title );
|
||||||
$this->query = $query;
|
$this->query = $query;
|
||||||
|
$this->collation = Collation::singleton();
|
||||||
unset( $this->query['title'] );
|
unset( $this->query['title'] );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -212,7 +213,7 @@ class CategoryViewer {
|
||||||
$word = $sortkey;
|
$word = $sortkey;
|
||||||
}
|
}
|
||||||
|
|
||||||
$firstChar = $wgContLang->firstLetterForLists( $word );
|
$firstChar = $this->collation->getFirstLetter( $word );
|
||||||
|
|
||||||
return $wgContLang->convert( $firstChar );
|
return $wgContLang->convert( $firstChar );
|
||||||
}
|
}
|
||||||
|
|
@ -241,7 +242,8 @@ class CategoryViewer {
|
||||||
) . '</span>'
|
) . '</span>'
|
||||||
: $this->getSkin()->link( $title );
|
: $this->getSkin()->link( $title );
|
||||||
|
|
||||||
$this->imgsNoGallery_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
|
$this->imgsNoGallery_start_char[] = $wgContLang->convert(
|
||||||
|
$this->collation->getFirstLetter( $sortkey ) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -261,7 +263,8 @@ class CategoryViewer {
|
||||||
) . '</span>'
|
) . '</span>'
|
||||||
: $this->getSkin()->link( $title );
|
: $this->getSkin()->link( $title );
|
||||||
|
|
||||||
$this->articles_start_char[] = $wgContLang->convert( $wgContLang->firstLetterForLists( $sortkey ) );
|
$this->articles_start_char[] = $wgContLang->convert(
|
||||||
|
$this->collation->getFirstLetter( $sortkey ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
function finaliseCategoryState() {
|
function finaliseCategoryState() {
|
||||||
|
|
@ -280,8 +283,6 @@ class CategoryViewer {
|
||||||
}
|
}
|
||||||
|
|
||||||
function doCategoryQuery() {
|
function doCategoryQuery() {
|
||||||
global $wgContLang;
|
|
||||||
|
|
||||||
$dbr = wfGetDB( DB_SLAVE, 'category' );
|
$dbr = wfGetDB( DB_SLAVE, 'category' );
|
||||||
|
|
||||||
$this->nextPage = array(
|
$this->nextPage = array(
|
||||||
|
|
@ -294,14 +295,14 @@ class CategoryViewer {
|
||||||
foreach ( array( 'page', 'subcat', 'file' ) as $type ) {
|
foreach ( array( 'page', 'subcat', 'file' ) as $type ) {
|
||||||
# Get the sortkeys for start/end, if applicable. Note that if
|
# Get the sortkeys for start/end, if applicable. Note that if
|
||||||
# the collation in the database differs from the one
|
# the collation in the database differs from the one
|
||||||
# $wgContLang is using, pagination might go totally haywire.
|
# set in $wgCategoryCollation, pagination might go totally haywire.
|
||||||
$extraConds = array( 'cl_type' => $type );
|
$extraConds = array( 'cl_type' => $type );
|
||||||
if ( $this->from[$type] !== null ) {
|
if ( $this->from[$type] !== null ) {
|
||||||
$extraConds[] = 'cl_sortkey >= '
|
$extraConds[] = 'cl_sortkey >= '
|
||||||
. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->from[$type] ) );
|
. $dbr->addQuotes( $this->collation->getSortKey( $this->from[$type] ) );
|
||||||
} elseif ( $this->until[$type] !== null ) {
|
} elseif ( $this->until[$type] !== null ) {
|
||||||
$extraConds[] = 'cl_sortkey < '
|
$extraConds[] = 'cl_sortkey < '
|
||||||
. $dbr->addQuotes( $wgContLang->convertToSortkey( $this->until[$type] ) );
|
. $dbr->addQuotes( $this->collation->getSortKey( $this->until[$type] ) );
|
||||||
$this->flip[$type] = true;
|
$this->flip[$type] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
304
includes/Collation.php
Normal file
304
includes/Collation.php
Normal file
|
|
@ -0,0 +1,304 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Base class for category collations: objects that turn a string into a
 * binary sort key and pick the heading letter a string is listed under.
 */
abstract class Collation {
	/** Lazily created shared instance, see singleton() */
	static $instance;

	/**
	 * Get the shared collation object for the name configured in
	 * $wgCategoryCollation. It is built on first use and reused afterwards.
	 *
	 * @return Collation
	 */
	static function singleton() {
		if ( self::$instance ) {
			return self::$instance;
		}
		global $wgCategoryCollation;
		self::$instance = self::factory( $wgCategoryCollation );
		return self::$instance;
	}

	/**
	 * Build a collation object from its configuration name.
	 *
	 * @param $collationName string Either 'uppercase' or 'uca-default'
	 * @return Collation
	 * @throws MWException if the name is not recognised
	 */
	static function factory( $collationName ) {
		// Loose comparison is deliberate: it is what a switch statement does
		if ( $collationName == 'uppercase' ) {
			return new UppercaseCollation;
		}
		if ( $collationName == 'uca-default' ) {
			return new IcuCollation( 'root' );
		}
		throw new MWException( __METHOD__.": unknown collation type \"$collationName\"" );
	}

	/**
	 * Convert a string to a (hopefully short) key suitable for efficient
	 * sorting: a plain binary sort of the keys should give the logical
	 * order of the strings they were made from.
	 *
	 * @param string $string UTF-8 string
	 * @return string Binary sortkey
	 */
	abstract function getSortKey( $string );

	/**
	 * Return the logical "first letter" under which a string is grouped on
	 * category pages and similar lists.
	 *
	 * The result must be consistent with getSortKey(). Otherwise the sorted
	 * list can flip back and forth between headings. For example, if "a"
	 * and "A" produce the same sort key but this method simply returned the
	 * first character, a list could come out as
	 *
	 * == A ==
	 * * [[Aardvark]]
	 *
	 * == a ==
	 * * [[antelope]]
	 *
	 * == A ==
	 * * [[Ape]]
	 *
	 * (assuming for the sake of argument that $wgCapitalLinks is false).
	 *
	 * @param string $string UTF-8 string
	 * @return string UTF-8 string corresponding to the first letter of input
	 */
	abstract function getFirstLetter( $string );
}
|
||||||
|
|
||||||
|
/**
 * Collation that sorts strings by their upper-cased form.
 */
class UppercaseCollation extends Collation {
	/** Language object whose uc(), ucfirst() and firstChar() helpers are used */
	var $lang;

	function __construct() {
		// Get a language object so that we can use the generic UTF-8 uppercase
		// function there
		$this->lang = Language::factory( 'en' );
	}

	/**
	 * @param string $string UTF-8 string
	 * @return string The upper-cased string, used directly as the sort key
	 */
	function getSortKey( $string ) {
		return $this->lang->uc( $string );
	}

	/**
	 * @param string $string UTF-8 string
	 * @return string Upper-cased first character of the input, after
	 *   dropping one leading null byte if present
	 */
	function getFirstLetter( $string ) {
		// isset() guards the offset read, so an empty string no longer
		// triggers an "uninitialized string offset" notice
		if ( isset( $string[0] ) && $string[0] == "\0" ) {
			$string = substr( $string, 1 );
		}
		return $this->lang->ucfirst( $this->lang->firstChar( $string ) );
	}
}
|
||||||
|
|
||||||
|
class IcuCollation extends Collation {
|
||||||
|
var $primaryCollator, $mainCollator, $locale;
|
||||||
|
var $firstLetterData;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unified CJK blocks.
|
||||||
|
*
|
||||||
|
* The same definition of a CJK block must be used for both Collation and
|
||||||
|
* generateCollationData.php. These blocks are omitted from the first
|
||||||
|
* letter data, as an optimisation measure and because the default UCA table
|
||||||
|
* is pretty useless for sorting Chinese text anyway. Japanese and Korean
|
||||||
|
* blocks are not included here, because they are smaller and more useful.
|
||||||
|
*/
|
||||||
|
static $cjkBlocks = array(
|
||||||
|
array( 0x2E80, 0x2EFF ), // CJK Radicals Supplement
|
||||||
|
array( 0x2F00, 0x2FDF ), // Kangxi Radicals
|
||||||
|
array( 0x2FF0, 0x2FFF ), // Ideographic Description Characters
|
||||||
|
array( 0x3000, 0x303F ), // CJK Symbols and Punctuation
|
||||||
|
array( 0x31C0, 0x31EF ), // CJK Strokes
|
||||||
|
array( 0x3200, 0x32FF ), // Enclosed CJK Letters and Months
|
||||||
|
array( 0x3300, 0x33FF ), // CJK Compatibility
|
||||||
|
array( 0x3400, 0x4DBF ), // CJK Unified Ideographs Extension A
|
||||||
|
array( 0x4E00, 0x9FFF ), // CJK Unified Ideographs
|
||||||
|
array( 0xF900, 0xFAFF ), // CJK Compatibility Ideographs
|
||||||
|
array( 0xFE30, 0xFE4F ), // CJK Compatibility Forms
|
||||||
|
array( 0x20000, 0x2A6DF ), // CJK Unified Ideographs Extension B
|
||||||
|
array( 0x2A700, 0x2B73F ), // CJK Unified Ideographs Extension C
|
||||||
|
array( 0x2B740, 0x2B81F ), // CJK Unified Ideographs Extension D
|
||||||
|
array( 0x2F800, 0x2FA1F ), // CJK Compatibility Ideographs Supplement
|
||||||
|
);
|
||||||
|
|
||||||
|
const RECORD_LENGTH = 14;
|
||||||
|
|
||||||
|
/**
 * Create the two ICU collators used by this collation: the main one with
 * default settings, and a second one restricted to primary strength.
 *
 * @param $locale string ICU locale name passed to Collator::create()
 * @throws MWException if the intl extension is missing or the main
 *   collator cannot be created for $locale
 */
function __construct( $locale ) {
	$haveIntl = extension_loaded( 'intl' );
	if ( !$haveIntl ) {
		throw new MWException( 'An ICU collation was requested, ' .
			'but the intl extension is not available.' );
	}

	$this->locale = $locale;

	$main = Collator::create( $locale );
	$this->mainCollator = $main;
	if ( !$main ) {
		throw new MWException( "Invalid ICU locale specified for collation: $locale" );
	}

	$primary = Collator::create( $locale );
	$this->primaryCollator = $primary;
	$primary->setStrength( Collator::PRIMARY );
}
|
||||||
|
|
||||||
|
/**
 * Get the binary sort key of a string from the main collator.
 *
 * @param string $string UTF-8 string
 * @return string Binary sortkey
 */
function getSortKey( $string ) {
	wfSuppressWarnings();
	$rawKey = $this->mainCollator->getSortKey( $string );
	wfRestoreWarnings();
	// Appending '' turns whatever the collator returned into a string
	return $rawKey . '';
}
|
||||||
|
|
||||||
|
/**
 * Get the binary sort key of a string from the primary-strength collator.
 *
 * @param string $string UTF-8 string
 * @return string Binary sortkey
 */
function getPrimarySortKey( $string ) {
	wfSuppressWarnings();
	$rawKey = $this->primaryCollator->getSortKey( $string );
	wfRestoreWarnings();
	// Appending '' turns whatever the collator returned into a string
	return $rawKey . '';
}
|
||||||
|
|
||||||
|
/**
 * Find the heading letter a string should be listed under.
 *
 * A string starting with a character from one of the $cjkBlocks ranges is
 * listed under that character itself. Anything else is looked up in the
 * first-letter table by its primary sort key.
 *
 * @param string $string UTF-8 string
 * @return string The heading letter, or '' for an empty input or when the
 *   lookup reports that the string sorts before every known letter
 */
function getFirstLetter( $string ) {
	$string = strval( $string );
	if ( $string === '' ) {
		return '';
	}

	// CJK shortcut; the codepoint is only decoded for non-ASCII lead bytes
	$initial = mb_substr( $string, 0, 1, 'UTF-8' );
	$isMultibyte = ord( $initial ) > 0x7f;
	if ( $isMultibyte && self::isCjk( utf8ToCodepoint( $initial ) ) ) {
		return $initial;
	}

	$primaryKey = $this->getPrimarySortKey( $string );
	$letterCount = $this->getFirstLetterCount();

	// Binary search for the letter whose key is the closest one not after ours
	$index = $this->findLowerBound(
		array( $this, 'getSortKeyByLetterIndex' ),
		$letterCount,
		'strcmp',
		$primaryKey );

	return $index === false ? '' : $this->getLetterByIndex( $index );
}
|
||||||
|
|
||||||
|
/**
 * Get the first-letter table for this locale.
 *
 * Looked up in this order: the copy already held by this object, the
 * shared cache, and finally the precompiled "first-letters-<locale>.ser"
 * file, which is sorted and de-duplicated here before being cached.
 *
 * @return array With two parallel lists: 'chars' (the letters) and 'keys'
 *   (their primary sort keys, in ascending binary order)
 * @throws MWException if there is no precompiled data for the locale
 */
function getFirstLetterData() {
	if ( $this->firstLetterData !== null ) {
		return $this->firstLetterData;
	}

	$cache = wfGetCache( CACHE_ANYTHING );
	$cacheKey = wfMemcKey( 'first-letters', $this->locale );
	$cached = $cache->get( $cacheKey );
	if ( $cached ) {
		$this->firstLetterData = $cached;
		return $cached;
	}

	// Nothing cached: build the table from the serialized data file
	$letters = wfGetPrecompiledData( "first-letters-{$this->locale}.ser" );
	if ( $letters === false ) {
		throw new MWException( "MediaWiki does not support ICU locale " .
			"\"{$this->locale}\"" );
	}

	// The file can't be shipped pre-sorted because the order depends on
	// the ICU version, and the binary search needs a properly sorted
	// list. So key every letter by its primary sort key and sort here.
	// When two letters share a primary key, keep whichever the main
	// collator orders first.
	$byPrimaryKey = array();
	foreach ( $letters as $letter ) {
		$key = $this->getPrimarySortKey( $letter );
		if ( !isset( $byPrimaryKey[$key] )
			|| $this->mainCollator->compare( $letter, $byPrimaryKey[$key] ) < 0 )
		{
			$byPrimaryKey[$key] = $letter;
		}
	}
	ksort( $byPrimaryKey, SORT_STRING );

	$data = array(
		'chars' => array_values( $byPrimaryKey ),
		'keys' => array_keys( $byPrimaryKey )
	);

	// Drop the intermediate map before caching to reduce memory usage
	unset( $byPrimaryKey );

	$this->firstLetterData = $data;
	$cache->set( $cacheKey, $data, 86400 * 7 /* 1 week */ );
	return $data;
}
|
||||||
|
|
||||||
|
/**
 * Get the letter at a given position of the first-letter table.
 *
 * @param $index int Position in the 'chars' list
 * @return string
 */
function getLetterByIndex( $index ) {
	// getFirstLetterData() returns the in-object copy when it is already loaded
	$table = $this->getFirstLetterData();
	return $table['chars'][$index];
}
|
||||||
|
|
||||||
|
/**
 * Get the primary sort key at a given position of the first-letter table.
 *
 * @param $index int Position in the 'keys' list
 * @return string
 */
function getSortKeyByLetterIndex( $index ) {
	// getFirstLetterData() returns the in-object copy when it is already loaded
	$table = $this->getFirstLetterData();
	return $table['keys'][$index];
}
|
||||||
|
|
||||||
|
/**
 * Get the number of entries in the first-letter table.
 *
 * @return int
 */
function getFirstLetterCount() {
	// getFirstLetterData() returns the in-object copy when it is already loaded
	$table = $this->getFirstLetterData();
	return count( $table['chars'] );
}
|
||||||
|
|
||||||
|
/**
 * Do a binary search, and return the index of the largest item that sorts
 * less than or equal to the target value.
 *
 * The items must be in ascending order according to $comparisonCallback.
 *
 * @param $valueCallback A function to call to get the value with
 *     a given array index.
 * @param $valueCount The number of items accessible via $valueCallback,
 *     indexed from 0 to $valueCount - 1
 * @param $comparisonCallback A callback to compare two values, returning
 *     a negative number, 0 or a positive number in the style of strcmp().
 *     It is called as ( $target, $item ).
 * @param $target The target value to find.
 *
 * @return The item index of the lower bound, or false if the target value
 *     sorts before all items (which includes the case of an empty list).
 */
function findLowerBound( $valueCallback, $valueCount, $comparisonCallback, $target ) {
	if ( $valueCount <= 0 ) {
		// Nothing to search; never call the callback with an invalid index
		return false;
	}

	// Half-open search window [$low, $high). Invariant: every item below
	// $low sorts <= target, every item at or above $high sorts > target.
	$low = 0;
	$high = $valueCount;
	while ( $low < $high ) {
		$mid = $low + ( ( $high - $low ) >> 1 );
		$item = call_user_func( $valueCallback, $mid );
		$comparison = call_user_func( $comparisonCallback, $target, $item );
		if ( $comparison >= 0 ) {
			// Item is not after the target, so the answer is $mid or later
			$low = $mid + 1;
		} else {
			$high = $mid;
		}
	}

	// $low is now the number of items that sort <= target
	if ( $low === 0 ) {
		// Before the first item
		return false;
	}
	return $low - 1;
}
|
||||||
|
|
||||||
|
/**
 * Test whether a codepoint lies inside one of the ranges in $cjkBlocks.
 *
 * @param $codepoint int Unicode codepoint
 * @return bool
 */
static function isCjk( $codepoint ) {
	foreach ( self::$cjkBlocks as $range ) {
		list( $first, $last ) = $range;
		if ( $codepoint < $first || $codepoint > $last ) {
			// Outside this block, try the next one
			continue;
		}
		return true;
	}
	return false;
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -4611,15 +4611,26 @@ $wgCategoryMagicGallery = true;
|
||||||
$wgCategoryPagingLimit = 200;
|
$wgCategoryPagingLimit = 200;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A version indicator for collations that will be stored in cl_collation for
|
* Specify how category names should be sorted, when listed on a category page.
|
||||||
* all new rows. Used when the collation algorithm changes: a script checks
|
* A sorting scheme is also known as a collation.
|
||||||
* for all rows where cl_collation != $wgCategoryCollation and regenerates
|
|
||||||
* cl_sortkey based on the page name and cl_sortkey_prefix.
|
|
||||||
*
|
*
|
||||||
* Currently only supports 'uppercase2', which just uppercases the string. This
|
* Available values are:
|
||||||
* is a dummy collation, to be replaced later by real ones.
|
*
|
||||||
|
* - uppercase: Converts the category name to upper case, and sorts by that.
|
||||||
|
*
|
||||||
|
* - uca-default: Provides access to the Unicode Collation Algorithm with
|
||||||
|
* the default element table. This is a compromise collation which sorts
|
||||||
|
* all languages in a mediocre way. However, it is better than "uppercase".
|
||||||
|
*
|
||||||
|
* To use the uca-default collation, you must have PHP's intl extension
|
||||||
|
* installed. See http://php.net/manual/en/intl.setup.php . The details of the
|
||||||
|
* resulting collation will depend on the version of ICU installed on the
|
||||||
|
* server.
|
||||||
|
*
|
||||||
|
* After you change this, you must run maintenance/updateCollation.php to fix
|
||||||
|
* the sort keys in the database.
|
||||||
*/
|
*/
|
||||||
$wgCategoryCollation = 'uppercase2';
|
$wgCategoryCollation = 'uppercase';
|
||||||
|
|
||||||
/** @} */ # End categories }
|
/** @} */ # End categories }
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -454,14 +454,14 @@ class LinksUpdate {
|
||||||
# (Title::moveTo() has had the same issue for a long time).
|
# (Title::moveTo() has had the same issue for a long time).
|
||||||
if ( $this->mTitle->getCategorySortkey() == $sortkey ) {
|
if ( $this->mTitle->getCategorySortkey() == $sortkey ) {
|
||||||
$prefix = '';
|
$prefix = '';
|
||||||
$sortkey = $wgContLang->convertToSortkey( $sortkey );
|
$sortkey = Collation::singleton()->getSortKey( $sortkey );
|
||||||
} else {
|
} else {
|
||||||
# Treat custom sortkeys as a prefix, so that if multiple
|
# Treat custom sortkeys as a prefix, so that if multiple
|
||||||
# things are forced to sort as '*' or something, they'll
|
# things are forced to sort as '*' or something, they'll
|
||||||
# sort properly in the category rather than in page_id
|
# sort properly in the category rather than in page_id
|
||||||
# order or such.
|
# order or such.
|
||||||
$prefix = $sortkey;
|
$prefix = $sortkey;
|
||||||
$sortkey = $wgContLang->convertToSortkey(
|
$sortkey = Collation::singleton()->getSortKey(
|
||||||
$this->mTitle->getCategorySortkey( $prefix ) );
|
$this->mTitle->getCategorySortkey( $prefix ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3088,8 +3088,6 @@ class Title {
|
||||||
* @return Mixed true on success, getUserPermissionsErrors()-like array on failure
|
* @return Mixed true on success, getUserPermissionsErrors()-like array on failure
|
||||||
*/
|
*/
|
||||||
public function moveTo( &$nt, $auth = true, $reason = '', $createRedirect = true ) {
|
public function moveTo( &$nt, $auth = true, $reason = '', $createRedirect = true ) {
|
||||||
global $wgContLang;
|
|
||||||
|
|
||||||
$err = $this->isValidMoveOperation( $nt, $auth, $reason );
|
$err = $this->isValidMoveOperation( $nt, $auth, $reason );
|
||||||
if ( is_array( $err ) ) {
|
if ( is_array( $err ) ) {
|
||||||
return $err;
|
return $err;
|
||||||
|
|
@ -3129,7 +3127,8 @@ class Title {
|
||||||
);
|
);
|
||||||
$dbw->update( 'categorylinks',
|
$dbw->update( 'categorylinks',
|
||||||
array(
|
array(
|
||||||
'cl_sortkey' => $wgContLang->convertToSortkey( $nt->getCategorySortkey( $prefix ) ),
|
'cl_sortkey' => Collation::singleton()->getSortKey(
|
||||||
|
$nt->getCategorySortkey( $prefix ) ),
|
||||||
'cl_timestamp=cl_timestamp' ),
|
'cl_timestamp=cl_timestamp' ),
|
||||||
array( 'cl_from' => $pageid ),
|
array( 'cl_from' => $pageid ),
|
||||||
__METHOD__ );
|
__METHOD__ );
|
||||||
|
|
@ -4139,7 +4138,7 @@ class Title {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the raw sort key to be used for categories, with the specified
|
* Returns the raw sort key to be used for categories, with the specified
|
||||||
* prefix. This will be fed to Language::convertToSortkey() to get a
|
* prefix. This will be fed to Collation::getSortKey() to get a
|
||||||
* binary sortkey that can be used for actual sorting.
|
* binary sortkey that can be used for actual sorting.
|
||||||
*
|
*
|
||||||
* @param $prefix string The prefix to be used, specified using
|
* @param $prefix string The prefix to be used, specified using
|
||||||
|
|
@ -4153,7 +4152,7 @@ class Title {
|
||||||
# Separate with a null byte, so the unprefixed part is only used as
|
# Separate with a null byte, so the unprefixed part is only used as
|
||||||
# a tiebreaker when two pages have the exact same prefix -- null
|
# a tiebreaker when two pages have the exact same prefix -- null
|
||||||
# sorts before everything else (hopefully).
|
# sorts before everything else (hopefully).
|
||||||
return "$prefix\0$unprefixed";
|
return "$prefix\n$unprefixed";
|
||||||
}
|
}
|
||||||
return $unprefixed;
|
return $unprefixed;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -2996,50 +2996,4 @@ class Language {
|
||||||
function getConvRuleTitle() {
|
function getConvRuleTitle() {
|
||||||
return $this->mConverter->getConvRuleTitle();
|
return $this->mConverter->getConvRuleTitle();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Given a string, convert it to a (hopefully short) key that can be used
|
|
||||||
* for efficient sorting. A binary sort according to the sortkeys
|
|
||||||
* corresponds to a logical sort of the corresponding strings. Current
|
|
||||||
* code expects that a null character should sort before all others, but
|
|
||||||
* has no other particular expectations (and that one can be changed if
|
|
||||||
* necessary).
|
|
||||||
*
|
|
||||||
* @param string $string UTF-8 string
|
|
||||||
* @return string Binary sortkey
|
|
||||||
*/
|
|
||||||
public function convertToSortkey( $string ) {
|
|
||||||
# Fake function for now
|
|
||||||
return $this->uc( $string );
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given a string, return the logical "first letter" to be used for
|
|
||||||
* grouping on category pages and so on. This has to be coordinated
|
|
||||||
* carefully with convertToSortkey(), or else the sorted list might jump
|
|
||||||
* back and forth between the same "initial letters" or other pathological
|
|
||||||
* behavior. For instance, if you just return the first character, but "a"
|
|
||||||
* sorts the same as "A" based on convertToSortkey(), then you might get a
|
|
||||||
* list like
|
|
||||||
*
|
|
||||||
* == A ==
|
|
||||||
* * [[Aardvark]]
|
|
||||||
*
|
|
||||||
* == a ==
|
|
||||||
* * [[antelope]]
|
|
||||||
*
|
|
||||||
* == A ==
|
|
||||||
* * [[Ape]]
|
|
||||||
*
|
|
||||||
* etc., assuming for the sake of argument that $wgCapitalLinks is false.
|
|
||||||
*
|
|
||||||
* @param string $string UTF-8 string
|
|
||||||
* @return string UTF-8 string corresponding to the first letter of input
|
|
||||||
*/
|
|
||||||
public function firstLetterForLists( $string ) {
|
|
||||||
if ( $string[0] == "\0" ) {
|
|
||||||
$string = substr( $string, 1 );
|
|
||||||
}
|
|
||||||
return $this->uc( $this->firstChar( $string ) );
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
381
maintenance/language/generateCollationData.php
Normal file
381
maintenance/language/generateCollationData.php
Normal file
|
|
@ -0,0 +1,381 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
require_once( dirname( __FILE__ ) .'/../Maintenance.php' );
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate first letter data files for Collation.php
|
||||||
|
*/
|
||||||
|
class GenerateCollationData extends Maintenance {
|
||||||
|
/** The directory with source data files in it */
|
||||||
|
var $dataDir;
|
||||||
|
|
||||||
|
/** The primary weights, indexed by codepoint */
|
||||||
|
var $weights;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A hashtable keyed by codepoint, where presence indicates that a character
|
||||||
|
* has a decomposition mapping. This makes it non-preferred for group header
|
||||||
|
* selection.
|
||||||
|
*/
|
||||||
|
var $mappedChars;
|
||||||
|
|
||||||
|
var $debugOutFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Important tertiary weights from UTS #10 section 7.2
|
||||||
|
*/
|
||||||
|
const NORMAL_UPPERCASE = 0x08;
|
||||||
|
const NORMAL_HIRAGANA = 0X0E;
|
||||||
|
|
||||||
|
/**
 * Register the command line options of this script: --data-dir and
 * --debug-output.
 */
public function __construct() {
	parent::__construct();

	$dataDirHelp = 'A directory on the local filesystem ' .
		'containing allkeys.txt and ucd.all.grouped.xml from unicode.org';
	$this->addOption( 'data-dir', $dataDirHelp, false, true );

	$debugOutputHelp = 'Filename for sending debug output to';
	$this->addOption( 'debug-output', $debugOutputHelp, false, true );
}
|
||||||
|
|
||||||
|
/**
 * Script entry point. Checks that both Unicode data files exist in the
 * data directory, opens the optional debug output file, then loads the
 * UCD and generates the first-letter data. Exits with status 1 on any
 * failed check.
 */
public function execute() {
	$dir = $this->getOption( 'data-dir', '.' );
	$this->dataDir = $dir;

	// Required input file => message shown when it is missing
	$requiredFiles = array(
		'allkeys.txt' =>
			"Unable to find allkeys.txt. Please download it from " .
			"http://www.unicode.org/Public/UCA/latest/allkeys.txt and specify " .
			"its location with --data-dir=<DIR>",
		'ucd.all.grouped.xml' =>
			"Unable to find ucd.all.grouped.xml. Please download it " .
			"from http://www.unicode.org/Public/6.0.0/ucdxml/ucd.all.grouped.zip " .
			"and specify its location with --data-dir=<DIR>",
	);
	foreach ( $requiredFiles as $fileName => $missingMessage ) {
		if ( !file_exists( "$dir/$fileName" ) ) {
			$this->error( $missingMessage );
			exit( 1 );
		}
	}

	$debugPath = $this->getOption( 'debug-output' );
	if ( $debugPath ) {
		$handle = fopen( $debugPath, 'w' );
		$this->debugOutFile = $handle;
		if ( !$handle ) {
			$this->error( "Unable to open debug output file for writing" );
			exit( 1 );
		}
	}

	$this->loadUcd();
	$this->generateFirstChars();
}
|
||||||
|
|
||||||
|
/**
 * Read ucd.all.grouped.xml from the data directory, passing each
 * character record to charCallback().
 */
function loadUcd() {
	$ucdPath = "{$this->dataDir}/ucd.all.grouped.xml";
	$reader = new UcdXmlReader( $ucdPath );
	$reader->readChars( array( $this, 'charCallback' ) );
}
|
||||||
|
|
||||||
|
/**
 * Handle one character record from the UCD: store an implicit primary
 * weight for it in $this->weights, and note in $this->mappedChars whether
 * it has a decomposition mapping.
 *
 * Records are ignored when their general category does not start with
 * L, N, P or S, when the codepoint is in a CJK block, or when the block
 * is "Hangul Syllables".
 *
 * @param $data array Character attributes; the keys read here are
 *   'gc', 'cp', 'block', 'UIdeo' and 'dm'
 */
function charCallback( $data ) {
	// Skip non-printable characters
	$majorCategory = substr( $data['gc'], 0, 1 );
	if ( strpos( 'LNPS', $majorCategory ) === false ) {
		return;
	}

	$cp = hexdec( $data['cp'] );

	// Skip the CJK ideograph blocks, as an optimisation measure.
	// UCA doesn't sort them properly anyway, without tailoring.
	if ( IcuCollation::isCjk( $cp ) ) {
		return;
	}

	// Skip the composed Hangul syllables, we will use the bare Jamo
	// as first letters
	if ( $data['block'] == 'Hangul Syllables' ) {
		return;
	}

	// Calculate implicit weight per UTS #10 v6.0.0, sec 7.1.3
	if ( $data['UIdeo'] !== 'Y' ) {
		$base = 0xFBC0;
	} elseif ( $data['block'] == 'CJK Unified Ideographs'
		|| $data['block'] == 'CJK Compatibility Ideographs' )
	{
		$base = 0xFB40;
	} else {
		$base = 0xFB80;
	}
	$upper = $base + ( $cp >> 15 );
	$lower = ( $cp & 0x7fff ) | 0x8000;

	$this->weights[$cp] = sprintf( ".%04X.%04X", $upper, $lower );

	// '#' in the dm attribute means there is no decomposition mapping
	if ( $data['dm'] !== '#' ) {
		$this->mappedChars[$cp] = true;
	}

	// Progress output, once every 4096 codepoints
	if ( $cp % 4096 == 0 ) {
		print "{$data['cp']}\n";
	}
}
|
||||||
|
|
||||||
|
/**
 * Build the list of "first letter" header characters and write it, serialized,
 * to $IP/serialized/first-letters-root.ser.
 *
 * Steps:
 *  - Read allkeys.txt from $this->dataDir and, for every code point already
 *    present in $this->weights, replace its weight with the concatenated
 *    non-zero primary weights from that file.
 *  - Group code points that share the same primary weight string.
 *  - Drop groups whose weight string starts with the complete weight of
 *    another group.
 *  - Pick one header character per group and report how many headers are
 *    out of order according to a primary-strength root collator.
 *
 * Exits with status 1 if the input or output file cannot be opened or the
 * output cannot be written.
 */
function generateFirstChars() {
	$file = fopen( "{$this->dataDir}/allkeys.txt", 'r' );
	if ( !$file ) {
		$this->error( "Unable to open allkeys.txt" );
		exit( 1 );
	}
	global $IP;
	$outFile = fopen( "$IP/serialized/first-letters-root.ser", 'w' );
	if ( !$outFile ) {
		$this->error( "Unable to open output file first-letters-root.ser" );
		exit( 1 );
	}

	$goodTertiaryChars = array();

	// For each character with an entry in allkeys.txt, overwrite the implicit
	// entry in $this->weights that came from the UCD.
	// Also gather a list of tertiary weights, for use in selecting the group header
	while ( false !== ( $line = fgets( $file ) ) ) {
		// We're only interested in single-character weights, pick them out with a regex
		$line = trim( $line );
		if ( !preg_match( '/^([0-9A-F]+)\s*;\s*([^#]*)/', $line, $m ) ) {
			continue;
		}

		$cp = hexdec( $m[1] );
		$allWeights = trim( $m[2] );
		$primary = '';
		$tertiary = '';

		if ( !isset( $this->weights[$cp] ) ) {
			// Non-printable, ignore
			continue;
		}
		foreach ( StringUtils::explode( '[', $allWeights ) as $weightStr ) {
			// Each collation element lists its weights separated by "." or "*":
			// index 0 is the primary weight, index 2 the tertiary weight.
			preg_match_all( '/[*.]([0-9A-F]+)/', $weightStr, $elementWeights );
			if ( !empty( $elementWeights[1] ) ) {
				if ( $elementWeights[1][0] !== '0000' ) {
					$primary .= '.' . $elementWeights[1][0];
				}
				// Guard against a truncated element instead of reading an undefined offset
				if ( isset( $elementWeights[1][2] ) && $elementWeights[1][2] !== '0000' ) {
					$tertiary .= '.' . $elementWeights[1][2];
				}
			}
		}
		$this->weights[$cp] = $primary;
		if ( $tertiary === '.0008'
			|| $tertiary === '.000E' )
		{
			$goodTertiaryChars[$cp] = true;
		}
	}
	fclose( $file );

	// Identify groups of characters with the same primary weight
	$this->groups = array();
	asort( $this->weights, SORT_STRING );
	$prevWeight = reset( $this->weights );
	$group = array();
	foreach ( $this->weights as $cp => $weight ) {
		if ( $weight !== $prevWeight ) {
			$this->groups[$prevWeight] = $group;
			$prevWeight = $weight;
			if ( isset( $this->groups[$weight] ) ) {
				$group = $this->groups[$weight];
			} else {
				$group = array();
			}
		}
		$group[] = $cp;
	}
	if ( $group ) {
		$this->groups[$prevWeight] = $group;
	}

	// If one character has a given primary weight sequence, and a second
	// character has a longer primary weight sequence with an initial
	// portion equal to the first character, then remove the second
	// character. This avoids having characters like U+A732 (double A)
	// polluting the basic latin sort area.
	foreach ( $this->groups as $weight => $group ) {
		if ( preg_match( '/(\.[0-9A-F]*)\./', $weight, $m ) ) {
			if ( isset( $this->groups[$m[1]] ) ) {
				unset( $this->groups[$weight] );
			}
		}
	}

	ksort( $this->groups, SORT_STRING );

	// Identify the header character in each group
	$headerChars = array();
	$prevChar = "\000";
	$tertiaryCollator = new Collator( 'root' );
	$primaryCollator = new Collator( 'root' );
	$primaryCollator->setStrength( Collator::PRIMARY );
	$numOutOfOrder = 0;
	foreach ( $this->groups as $weight => $group ) {
		$uncomposedChars = array();
		$goodChars = array();
		foreach ( $group as $cp ) {
			if ( isset( $goodTertiaryChars[$cp] ) ) {
				$goodChars[] = $cp;
			}
			if ( !isset( $this->mappedChars[$cp] ) ) {
				$uncomposedChars[] = $cp;
			}
		}
		// Preference order: uncomposed characters with a "good" tertiary
		// weight, then any uncomposed character, then the whole group.
		$x = array_intersect( $goodChars, $uncomposedChars );
		if ( !$x ) {
			$x = $uncomposedChars;
			if ( !$x ) {
				$x = $group;
			}
		}

		// Use ICU to pick the lowest sorting character in the selection.
		// The candidates are integer code points, so they have to be turned
		// into UTF-8 strings before the collator sees them; handing the
		// integers to Collator::sort() would only order them numerically.
		// Ties are broken on the lower code point to keep the output stable.
		$cp = null;
		$char = null;
		foreach ( $x as $candidateCp ) {
			$candidateChar = codepointToUtf8( $candidateCp );
			if ( $char === null ) {
				$cp = $candidateCp;
				$char = $candidateChar;
				continue;
			}
			$cmp = $tertiaryCollator->compare( $candidateChar, $char );
			if ( $cmp !== false
				&& ( $cmp < 0 || ( $cmp === 0 && $candidateCp < $cp ) ) )
			{
				$cp = $candidateCp;
				$char = $candidateChar;
			}
		}

		$headerChars[] = $char;
		if ( $primaryCollator->compare( $char, $prevChar ) <= 0 ) {
			$numOutOfOrder ++;
			/*
			printf( "Out of order: U+%05X > U+%05X\n",
				utf8ToCodepoint( $prevChar ),
				utf8ToCodepoint( $char ) );
			*/
		}
		$prevChar = $char;

		if ( $this->debugOutFile ) {
			fwrite( $this->debugOutFile, sprintf( "%05X %s %s (%s)\n", $cp, $weight, $char,
				implode( ' ', array_map( 'codepointToUtf8', $group ) ) ) );
		}
	}

	print "Out of order: $numOutOfOrder / " . count( $headerChars ) . "\n";

	// Do not leave a silently truncated data file behind
	if ( fwrite( $outFile, serialize( $headerChars ) ) === false ) {
		$this->error( "Unable to write output file first-letters-root.ser" );
		exit( 1 );
	}
	fclose( $outFile );
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Streaming reader for a UCD XML file.
 *
 * readChars() walks the <repertoire> section and invokes a callback once per
 * code point with that character's attributes (merged with the attributes of
 * the enclosing <group>, if any) plus a 'block' entry naming the block the
 * code point falls in. getBlocks() reads the <blocks> section.
 */
class UcdXmlReader {
	/** Path of the XML file, as given to the constructor */
	public $fileName;
	/** Callback invoked by handleChar() with one attribute array per code point */
	public $callback;
	/** Attributes of the <group> currently being read; empty outside a group */
	public $groupAttrs = array();
	/** XMLReader instance created by open() */
	public $xml;
	/** Map of block name => array( first code point, last code point ) */
	public $blocks = array();
	/** Entry of $blocks the internal array pointer currently rests on, or false */
	public $currentBlock;

	/**
	 * @param $fileName String: path of the UCD XML file to read
	 */
	function __construct( $fileName ) {
		$this->fileName = $fileName;
	}

	/**
	 * Read every <char> element in the repertoire and pass each code point it
	 * describes to $callback.
	 *
	 * @param $callback Callback taking a single array of attributes
	 */
	public function readChars( $callback ) {
		$this->getBlocks();
		$this->currentBlock = reset( $this->blocks );
		$xml = $this->open();
		$this->callback = $callback;
		$this->groupAttrs = array();

		// Skip sibling nodes until the <repertoire> element is reached
		while ( $xml->name !== 'repertoire' && $xml->next() );

		while ( $xml->read() ) {
			if ( $xml->nodeType === XMLReader::ELEMENT ) {
				if ( $xml->name === 'group' ) {
					$this->groupAttrs = $this->readAttributes();
				} elseif ( $xml->name === 'char' ) {
					$this->handleChar();
				}
			} elseif ( $xml->nodeType === XMLReader::END_ELEMENT ) {
				if ( $xml->name === 'group' ) {
					// Group defaults must not leak past the end of the group
					$this->groupAttrs = array();
				}
			}
		}
		$xml->close();
	}

	/**
	 * Open the file and position the reader on the first node inside <ucd>.
	 *
	 * @return XMLReader
	 * @throws MWException if the file cannot be opened
	 */
	protected function open() {
		$this->xml = new XMLReader;
		// The object itself is always truthy; the result of open() is what
		// tells us whether the file could actually be opened.
		if ( !$this->xml->open( $this->fileName ) ) {
			throw new MWException( __METHOD__.": unable to open {$this->fileName}" );
		}
		while ( $this->xml->name !== 'ucd' && $this->xml->read() );
		$this->xml->read();
		return $this->xml;
	}

	/**
	 * Read the attributes of the current element node and return them
	 * as an array
	 *
	 * @return Array: attribute name => attribute value
	 */
	protected function readAttributes() {
		$attrs = array();
		while ( $this->xml->moveToNextAttribute() ) {
			$attrs[$this->xml->name] = $this->xml->value;
		}
		return $attrs;
	}

	/**
	 * Handle the <char> element the reader is positioned on: expand it to one
	 * callback invocation per code point (a single 'cp', or every code point
	 * from 'first-cp' to 'last-cp').
	 */
	protected function handleChar() {
		// Attributes on the <char> itself take precedence over group defaults
		$attrs = $this->readAttributes() + $this->groupAttrs;
		if ( isset( $attrs['cp'] ) ) {
			$first = $last = hexdec( $attrs['cp'] );
		} else {
			$first = hexdec( $attrs['first-cp'] );
			$last = hexdec( $attrs['last-cp'] );
			unset( $attrs['first-cp'] );
			unset( $attrs['last-cp'] );
		}

		for ( $cp = $first; $cp <= $last; $cp++ ) {
			$hexCp = sprintf( "%04X", $cp );

			// Work on a per-code-point copy. Substituting into the shared
			// array would consume the "#" placeholder on the first code point
			// of a range, giving every later code point the first one's name,
			// and would let a stale 'block' carry over.
			$charAttrs = $attrs;
			foreach ( array( 'na', 'na1' ) as $nameProp ) {
				if ( isset( $charAttrs[$nameProp] ) ) {
					$charAttrs[$nameProp] = str_replace( '#', $hexCp, $charAttrs[$nameProp] );
				}
			}

			// Advance through the block list until the block containing $cp
			// is found, or a block starting beyond $cp is reached.
			while ( $this->currentBlock ) {
				if ( $cp < $this->currentBlock[0] ) {
					break;
				} elseif ( $cp <= $this->currentBlock[1] ) {
					$charAttrs['block'] = key( $this->blocks );
					break;
				} else {
					$this->currentBlock = next( $this->blocks );
				}
			}

			$charAttrs['cp'] = $hexCp;
			call_user_func( $this->callback, $charAttrs );
		}
	}

	/**
	 * Read the <blocks> section, caching the result in $this->blocks.
	 *
	 * @return Array: block name => array( first code point, last code point )
	 */
	public function getBlocks() {
		if ( $this->blocks ) {
			return $this->blocks;
		}

		$xml = $this->open();
		while ( $xml->name !== 'blocks' && $xml->read() );

		while ( $xml->read() ) {
			if ( $xml->nodeType === XMLReader::ELEMENT ) {
				if ( $xml->name === 'block' ) {
					$attrs = $this->readAttributes();
					$first = hexdec( $attrs['first-cp'] );
					$last = hexdec( $attrs['last-cp'] );
					$this->blocks[$attrs['name']] = array( $first, $last );
				}
			}
		}
		$xml->close();
		return $this->blocks;
	}

}
|
||||||
|
|
||||||
|
// Name of the maintenance class for this script; presumably read by the
// bootstrap file included via DO_MAINTENANCE - confirm against Maintenance.php.
$maintClass = 'GenerateCollationData';
|
||||||
|
// Include the file named by the DO_MAINTENANCE constant (defined outside this
// script); presumably this is what actually runs the class named above.
require_once( DO_MAINTENANCE );
|
||||||
|
|
||||||
|
|
@ -493,13 +493,13 @@ CREATE TABLE /*_*/categorylinks (
|
||||||
cl_to varchar(255) binary NOT NULL default '',
|
cl_to varchar(255) binary NOT NULL default '',
|
||||||
|
|
||||||
-- A binary string obtained by applying a sortkey generation algorithm
|
-- A binary string obtained by applying a sortkey generation algorithm
|
||||||
-- (Language::convertToSortkey()) to page_title, or cl_sortkey_prefix . "\0"
|
-- (Collation::getSortKey()) to page_title, or cl_sortkey_prefix . "\n"
|
||||||
-- . page_title if cl_sortkey_prefix is nonempty.
|
-- . page_title if cl_sortkey_prefix is nonempty.
|
||||||
cl_sortkey varbinary(230) NOT NULL default '',
|
cl_sortkey varbinary(230) NOT NULL default '',
|
||||||
|
|
||||||
-- A prefix for the raw sortkey manually specified by the user, either via
|
-- A prefix for the raw sortkey manually specified by the user, either via
|
||||||
-- [[Category:Foo|prefix]] or {{defaultsort:prefix}}. If nonempty, it's
|
-- [[Category:Foo|prefix]] or {{defaultsort:prefix}}. If nonempty, it's
|
||||||
-- concatenated with a null followed by the page title before the sortkey
|
-- concatenated with a line break followed by the page title before the sortkey
|
||||||
-- conversion algorithm is run. We store this so that we can update
|
-- conversion algorithm is run. We store this so that we can update
|
||||||
-- collations without reparsing all pages.
|
-- collations without reparsing all pages.
|
||||||
-- Note: If you change the length of this field, you also need to change
|
-- Note: If you change the length of this field, you also need to change
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,7 @@ TEXT;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function execute() {
|
public function execute() {
|
||||||
global $wgCategoryCollation, $wgContLang;
|
global $wgCategoryCollation;
|
||||||
|
|
||||||
$dbw = wfGetDB( DB_MASTER );
|
$dbw = wfGetDB( DB_MASTER );
|
||||||
$count = $dbw->selectField(
|
$count = $dbw->selectField(
|
||||||
|
|
@ -105,7 +105,7 @@ TEXT;
|
||||||
$dbw->update(
|
$dbw->update(
|
||||||
'categorylinks',
|
'categorylinks',
|
||||||
array(
|
array(
|
||||||
'cl_sortkey' => $wgContLang->convertToSortkey(
|
'cl_sortkey' => Collation::singleton()->getSortKey(
|
||||||
$title->getCategorySortkey( $prefix ) ),
|
$title->getCategorySortkey( $prefix ) ),
|
||||||
'cl_sortkey_prefix' => $prefix,
|
'cl_sortkey_prefix' => $prefix,
|
||||||
'cl_collation' => $wgCategoryCollation,
|
'cl_collation' => $wgCategoryCollation,
|
||||||
|
|
|
||||||
1
serialized/first-letters-root.ser
Normal file
1
serialized/first-letters-root.ser
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue