RemoteIcuCollation
Add a collation that gets its data from a remote Shellbox instance. This is meant as a migration helper to use during an ICU upgrade. Add a batch method to Collation so that this can be somewhat efficient when adding multiple categories. Bug: T263437 Change-Id: I76610d251fb55df90c78acb9f59fd81421f876dd
This commit is contained in:
parent
132cba07f2
commit
e85d532aa2
5 changed files with 263 additions and 0 deletions
|
|
@ -1326,6 +1326,7 @@ $wgAutoloadLocalClasses = [
|
|||
'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php',
|
||||
'RefreshSecondaryDataUpdate' => __DIR__ . '/includes/deferred/RefreshSecondaryDataUpdate.php',
|
||||
'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php',
|
||||
'RemoteIcuCollation' => __DIR__ . '/includes/collation/RemoteIcuCollation.php',
|
||||
'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php',
|
||||
'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php',
|
||||
'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',
|
||||
|
|
|
|||
|
|
@ -71,6 +71,20 @@ abstract class Collation {
|
|||
*/
|
||||
abstract public function getSortKey( $string );
|
||||
|
||||
/**
 * Compute the sort keys for a batch of strings.
 *
 * This implementation simply calls getSortKey() once per element. The
 * keys of the input array are carried over to the returned array.
 *
 * @param string[] $strings
 * @return string[]
 */
public function getSortKeys( $strings ) {
	// array_map() keeps the input keys when it is given exactly one array.
	return array_map( [ $this, 'getSortKey' ], $strings );
}
|
||||
|
||||
/**
|
||||
* Given a string, return the logical "first letter" to be used for
|
||||
* grouping on category pages and so on. This has to be coordinated
|
||||
|
|
|
|||
|
|
@ -151,6 +151,16 @@ class CollationFactory {
|
|||
$match[1],
|
||||
]
|
||||
] );
|
||||
} elseif ( preg_match( '/^remote-uca-([A-Za-z@=-]+)$/', $collationName, $match ) ) {
|
||||
return $this->instantiateCollation( [
|
||||
'class' => \RemoteIcuCollation::class,
|
||||
'services' => [
|
||||
'ShellboxClientFactory'
|
||||
],
|
||||
'args' => [
|
||||
$match[1]
|
||||
]
|
||||
] );
|
||||
}
|
||||
|
||||
// Provide a mechanism for extensions to hook in.
|
||||
|
|
|
|||
110
includes/collation/RemoteIcuCollation.php
Normal file
110
includes/collation/RemoteIcuCollation.php
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
<?php
|
||||
|
||||
use MediaWiki\Shell\ShellboxClientFactory;
|
||||
|
||||
/**
 * An ICU collation that uses a remote server to compute sort keys. This can be
 * used in conjunction with $wgTempCategoryCollations to migrate to a different
 * version of ICU.
 *
 * Strings are exchanged with the remote side as one binary blob in which
 * every string is preceded by its byte length written as eight hexadecimal
 * digits (see encode() and decode()).
 */
class RemoteIcuCollation extends Collation {
	/** RPC client obtained from the Shellbox client factory in the constructor */
	private $rpcClient;

	/** @var string ICU locale sent along with every remote call */
	private $locale;

	/**
	 * @param ShellboxClientFactory $shellboxClientFactory
	 * @param string $locale
	 */
	public function __construct( ShellboxClientFactory $shellboxClientFactory, $locale ) {
		$this->rpcClient = $shellboxClientFactory->getRpcClient(
			[ 'service' => 'icu-collation' ] );
		$this->locale = $locale;
	}

	/**
	 * Get the sort key of a single string by issuing a one-element batch
	 * request through getSortKeys().
	 *
	 * @param string $string
	 * @return string
	 */
	public function getSortKey( $string ) {
		return $this->getSortKeys( [ $string ] )[0];
	}

	/**
	 * Encode an array of binary strings as a string
	 *
	 * Each element is written as its byte length in eight zero-padded
	 * lowercase hexadecimal digits, immediately followed by its bytes.
	 *
	 * @param string[] $strings
	 * @return string
	 */
	private static function encode( $strings ) {
		$ret = '';
		foreach ( $strings as $s ) {
			$ret .= sprintf( "%08x", strlen( $s ) ) . $s;
		}
		return $ret;
	}

	/**
	 * Decode the value returned by encode()
	 *
	 * @param string $blob
	 * @return string[]
	 * @throws RuntimeException If a length prefix is not eight hexadecimal
	 *   digits, or a string extends past the end of the blob
	 */
	private static function decode( $blob ) {
		$total = strlen( $blob );
		$p = 0;
		$ret = [];
		while ( $p < $total ) {
			$header = substr( $blob, $p, 8 );
			// Reject short or non-hexadecimal prefixes instead of letting
			// intval() quietly turn them into a length of zero.
			if ( !preg_match( '/^[0-9a-fA-F]{8}$/D', $header ) ) {
				throw new RuntimeException(
					__METHOD__ . ": malformed length prefix at offset $p" );
			}
			$len = intval( $header, 16 );
			$p += 8;
			// A declared length larger than the remaining data means the
			// blob was truncated; substr() would silently return less.
			if ( $len > $total - $p ) {
				throw new RuntimeException(
					__METHOD__ . ": truncated string at offset $p" );
			}
			$ret[] = substr( $blob, $p, $len );
			$p += $len;
		}
		return $ret;
	}

	/**
	 * Get the sort keys of several strings with a single remote call.
	 *
	 * The keys of the input array are preserved in the result.
	 *
	 * @param string[] $strings
	 * @return string[]
	 * @throws RuntimeException If the remote reply does not contain exactly
	 *   one sort key per input string
	 */
	public function getSortKeys( $strings ) {
		if ( !count( $strings ) ) {
			return [];
		}
		$blob = $this->rpcClient->call(
			'icu-collation',
			self::class . '::' . 'doGetSortKeys',
			[
				$this->locale,
				self::encode( array_values( $strings ) )
			],
			[
				'classes' => [ parent::class, self::class ],
				'binary' => true
			]
		);
		$sortKeys = self::decode( $blob );
		// array_combine() fails (false on PHP 7, ValueError on PHP 8) when the
		// two lists differ in size, so report a mismatch explicitly.
		if ( count( $sortKeys ) !== count( $strings ) ) {
			throw new RuntimeException(
				__METHOD__ . ': expected ' . count( $strings ) .
				' sort keys from the remote service, got ' . count( $sortKeys )
			);
		}
		return array_combine(
			array_keys( $strings ),
			$sortKeys
		);
	}

	/**
	 * Not supported by this collation; always throws.
	 *
	 * @param string $string
	 * @throws RuntimeException
	 */
	public function getFirstLetter( $string ) {
		// @phan-suppress-previous-line PhanPluginNeverReturnMethod
		throw new RuntimeException( __METHOD__ . ': not implemented' );
	}

	/**
	 * The remote entry point. Get sort keys for an encoded list of inputs.
	 *
	 * @param string $locale The ICU locale
	 * @param string $blob The input array encoded with encode()
	 * @return string The encoded result
	 * @throws RuntimeException If the locale is rejected by ICU, the blob is
	 *   malformed, or ICU cannot produce a sort key for one of the strings
	 */
	public static function doGetSortKeys( $locale, $blob ) {
		$mainCollator = Collator::create( $locale );
		if ( !$mainCollator ) {
			throw new RuntimeException( "Invalid ICU locale specified for collation: $locale" );
		}

		// If the special suffix for numeric collation is present, turn on numeric collation.
		if ( substr( $locale, -5, 5 ) === '-u-kn' ) {
			$mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
		}
		$ret = [];
		foreach ( self::decode( $blob ) as $string ) {
			$sortKey = $mainCollator->getSortKey( $string );
			// Collator::getSortKey() returns false on failure; encoding that
			// would silently produce an empty sort key.
			if ( $sortKey === false ) {
				throw new RuntimeException(
					'Unable to compute ICU sort key: ' . $mainCollator->getErrorMessage()
				);
			}
			$ret[] = $sortKey;
		}
		return self::encode( $ret );
	}
}
|
||||
128
tests/phpunit/includes/collation/RemoteIcuCollationTest.php
Normal file
128
tests/phpunit/includes/collation/RemoteIcuCollationTest.php
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
<?php
|
||||
|
||||
use Wikimedia\TestingAccessWrapper;
|
||||
|
||||
/**
 * @covers RemoteIcuCollation
 */
class RemoteIcuCollationTest extends MediaWikiLangTestCase {
	/**
	 * Cases for testEncode(): a list of strings and the blob encode() is
	 * expected to produce for it.
	 */
	public static function provideEncode() {
		return [
			[
				[],
				''
			],
			[
				[ 'foo' ],
				'00000003foo'
			],
			[
				[ 'foo', 'a somewhat longer string' ],
				'00000003foo00000018a somewhat longer string'
			],
		];
	}

	/** @dataProvider provideEncode */
	public function testEncode( $input, $expected ) {
		$coll = TestingAccessWrapper::newFromClass( RemoteIcuCollation::class );
		$this->assertSame( $expected, $coll->encode( $input ) );
	}

	/**
	 * Cases for testEncodeDecode(): lists of strings, including ones with NUL
	 * and other control bytes, that must survive an encode/decode round trip.
	 */
	public static function provideEncodeDecode() {
		return [
			[ [ "\000" ] ],
			[ [ "a\000b" ] ],
			[ [ str_repeat( "\001", 100 ) ] ],
			[ [ 'foo' ] ],
			[ [ 'foo', 'bar' ] ],
			[ [ 'foo', 'bar', str_repeat( 'x', 1000 ) ] ]
		];
	}

	/** @dataProvider provideEncodeDecode */
	public function testEncodeDecode( $input ) {
		$coll = TestingAccessWrapper::newFromClass( RemoteIcuCollation::class );
		$this->assertSame( $input, $coll->decode( $coll->encode( $input ) ) );
	}

	/**
	 * Cases for the sort key tests. Every case lists its strings in the
	 * order their sort keys are expected to compare (strictly ascending).
	 */
	public static function provideGetSortKeys() {
		$cases = [
			[],
			[ '' ],
			[ 'test1' => 'bar', 'test2' => 'foo' ],
			[
				'bar',
				'foo'
			],
			[
				'first',
				'Second'
			],
			[
				'',
				'second'
			],
			[
				'Berić',
				'Berisha',
			],
			[
				'2',
				'10',
			]
		];
		foreach ( $cases as $case ) {
			yield [ $case ];
		}
	}

	/**
	 * Build the collation under test.
	 *
	 * @return RemoteIcuCollation
	 */
	private function newCollation() {
		return new RemoteIcuCollation(
			$this->getServiceContainer()->getShellboxClientFactory(),
			'uca-default-u-kn'
		);
	}

	/**
	 * Assert that every key is a string and that each key compares strictly
	 * greater (bytewise) than the one before it.
	 *
	 * @param array $keys Sort keys in their expected order
	 */
	private function assertKeysStrictlyIncreasing( array $keys ) {
		$prevKey = null;
		foreach ( $keys as $key ) {
			$this->assertIsString( $key );
			// Compare against null rather than relying on truthiness, so a
			// previous key of '' or '0' does not skip the comparison.
			if ( $prevKey !== null ) {
				$this->assertLessThan( 0, strcmp( $prevKey, $key ) );
			}
			$prevKey = $key;
		}
	}

	/** @dataProvider provideGetSortKeys */
	public function testGetSortKeys( $inputs ) {
		if ( !extension_loaded( 'intl' ) ) {
			$this->markTestSkipped( 'Need PHP intl' );
		}
		$sortKeys = $this->newCollation()->getSortKeys( $inputs );
		if ( count( $inputs ) ) {
			// The batch method must hand back one key per input, under the
			// same array keys and in the same order.
			$this->assertSame( array_keys( $inputs ), array_keys( $sortKeys ) );
			$this->assertKeysStrictlyIncreasing( $sortKeys );
		} else {
			$this->assertSame( [], $sortKeys );
		}
	}

	/** @dataProvider provideGetSortKeys */
	public function testGetSortKey( $inputs ) {
		if ( !count( $inputs ) ) {
			// Not risky, it's just handy to reuse the provider
			$this->assertTrue( true );
		}
		if ( !extension_loaded( 'intl' ) ) {
			$this->markTestSkipped( 'Need PHP intl' );
		}
		$coll = $this->newCollation();
		$keys = [];
		foreach ( $inputs as $input ) {
			$keys[] = $coll->getSortKey( $input );
		}
		$this->assertKeysStrictlyIncreasing( $keys );
	}
}
|
||||
Loading…
Reference in a new issue