RemoteIcuCollation

Add a collation that gets its data from a remote Shellbox instance. This
is meant as a migration helper to use during an ICU upgrade.

Add a batch method to Collation so that this can be somewhat efficient
when adding multiple categories.

Bug: T263437
Change-Id: I76610d251fb55df90c78acb9f59fd81421f876dd
This commit is contained in:
Tim Starling 2021-12-10 10:35:07 +11:00
parent 132cba07f2
commit e85d532aa2
5 changed files with 263 additions and 0 deletions

View file

@ -1326,6 +1326,7 @@ $wgAutoloadLocalClasses = [
'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php',
'RefreshSecondaryDataUpdate' => __DIR__ . '/includes/deferred/RefreshSecondaryDataUpdate.php',
'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php',
'RemoteIcuCollation' => __DIR__ . '/includes/collation/RemoteIcuCollation.php',
'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php',
'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php',
'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',

View file

@ -71,6 +71,20 @@ abstract class Collation {
*/
abstract public function getSortKey( $string );
/**
 * Compute the sort keys for a batch of strings.
 *
 * Every input is passed to getSortKey() on its own, and each result is
 * stored under the same array key as the input it was derived from.
 *
 * @param string[] $strings Strings to convert
 * @return string[] Sort keys, indexed by the keys of $strings
 */
public function getSortKeys( $strings ) {
	// With a single input array, array_map() keeps the original keys.
	return array_map( [ $this, 'getSortKey' ], $strings );
}
/**
* Given a string, return the logical "first letter" to be used for
* grouping on category pages and so on. This has to be coordinated

View file

@ -151,6 +151,16 @@ class CollationFactory {
$match[1],
]
] );
} elseif ( preg_match( '/^remote-uca-([A-Za-z@=-]+)$/', $collationName, $match ) ) {
return $this->instantiateCollation( [
'class' => \RemoteIcuCollation::class,
'services' => [
'ShellboxClientFactory'
],
'args' => [
$match[1]
]
] );
}
// Provide a mechanism for extensions to hook in.

View file

@ -0,0 +1,110 @@
<?php
use MediaWiki\Shell\ShellboxClientFactory;
/**
 * An ICU collation that uses a remote server to compute sort keys. This can be
 * used in conjunction with $wgTempCategoryCollations to migrate to a different
 * version of ICU.
 *
 * Inputs and results are exchanged with the remote side as a single binary
 * string in which every element is prefixed with its byte length, written as
 * eight hexadecimal digits (see encode() and decode()).
 */
class RemoteIcuCollation extends Collation {
	/** @var mixed RPC client obtained from ShellboxClientFactory::getRpcClient() */
	private $rpcClient;

	/** @var string ICU locale handed to the remote entry point */
	private $locale;

	/**
	 * @param ShellboxClientFactory $shellboxClientFactory
	 * @param string $locale
	 */
	public function __construct( ShellboxClientFactory $shellboxClientFactory, $locale ) {
		$this->rpcClient = $shellboxClientFactory->getRpcClient(
			[ 'service' => 'icu-collation' ] );
		$this->locale = $locale;
	}

	/**
	 * Get the sort key of a single string.
	 *
	 * This is a batch of one: it issues its own remote call, so prefer
	 * getSortKeys() when several strings need to be converted.
	 *
	 * @param string $string
	 * @return string
	 */
	public function getSortKey( $string ) {
		return $this->getSortKeys( [ $string ] )[0];
	}

	/**
	 * Encode an array of binary strings as a string
	 *
	 * Each element is written as its byte length in eight lowercase
	 * hexadecimal digits, followed by the raw bytes. Array keys are not
	 * part of the encoding.
	 *
	 * @param string[] $strings
	 * @return string
	 * @throws RuntimeException If an element is too long for the length prefix
	 */
	private static function encode( $strings ) {
		$ret = '';
		foreach ( $strings as $s ) {
			$len = strlen( $s );
			if ( $len > 0xffffffff ) {
				// A longer prefix would make the result undecodable.
				throw new RuntimeException( __METHOD__ . ": string of $len bytes is too long to encode" );
			}
			$ret .= sprintf( "%08x", $len ) . $s;
		}
		return $ret;
	}

	/**
	 * Decode the value returned by encode()
	 *
	 * @param string $blob
	 * @return string[] The decoded elements, as a list
	 * @throws RuntimeException If $blob is malformed or truncated
	 */
	private static function decode( $blob ) {
		$size = strlen( $blob );
		$p = 0;
		$ret = [];
		while ( $p < $size ) {
			$header = substr( $blob, $p, 8 );
			// Reject short or non-hexadecimal headers instead of letting
			// intval() quietly turn them into a bogus length.
			if ( strspn( $header, '0123456789abcdefABCDEF' ) !== 8 ) {
				throw new RuntimeException( __METHOD__ . ": malformed length header at offset $p" );
			}
			$len = intval( $header, 16 );
			$p += 8;
			if ( $len > $size - $p ) {
				throw new RuntimeException(
					__METHOD__ . ": truncated input, expected $len bytes at offset $p" );
			}
			$ret[] = substr( $blob, $p, $len );
			$p += $len;
		}
		return $ret;
	}

	/**
	 * Get the sort keys of several strings with a single remote call.
	 *
	 * @param string[] $strings
	 * @return string[] Sort keys, indexed by the keys of $strings
	 * @throws RuntimeException If the response cannot be decoded or does not
	 *   contain exactly one sort key per input
	 */
	public function getSortKeys( $strings ) {
		if ( !count( $strings ) ) {
			// Nothing to do, so avoid the remote call.
			return [];
		}
		$blob = $this->rpcClient->call(
			'icu-collation',
			self::class . '::' . 'doGetSortKeys',
			[
				$this->locale,
				// The encoding carries values only; keys are restored below.
				self::encode( array_values( $strings ) )
			],
			[
				'classes' => [ parent::class, self::class ],
				'binary' => true
			]
		);
		$sortKeys = self::decode( $blob );
		// array_combine() fails on a length mismatch, and does so differently
		// depending on the PHP version, so report the problem explicitly.
		if ( count( $sortKeys ) !== count( $strings ) ) {
			throw new RuntimeException(
				__METHOD__ . ': received ' . count( $sortKeys ) .
				' sort keys for ' . count( $strings ) . ' inputs' );
		}
		return array_combine( array_keys( $strings ), $sortKeys );
	}

	/**
	 * First-letter lookup is not available for this collation.
	 *
	 * @param string $string
	 * @throws RuntimeException Always
	 */
	public function getFirstLetter( $string ) {
		// @phan-suppress-previous-line PhanPluginNeverReturnMethod
		throw new RuntimeException( __METHOD__ . ': not implemented' );
	}

	/**
	 * The remote entry point. Get sort keys for an encoded list of inputs.
	 *
	 * @param string $locale The ICU locale
	 * @param string $blob The input array encoded with encode()
	 * @return string The encoded result
	 * @throws RuntimeException If no collator can be created for $locale, or
	 *   if $blob cannot be decoded
	 */
	public static function doGetSortKeys( $locale, $blob ) {
		$mainCollator = Collator::create( $locale );
		if ( !$mainCollator ) {
			throw new RuntimeException( "Invalid ICU locale specified for collation: $locale" );
		}
		// If the special suffix for numeric collation is present, turn on numeric collation.
		if ( substr( $locale, -5, 5 ) === '-u-kn' ) {
			$mainCollator->setAttribute( Collator::NUMERIC_COLLATION, Collator::ON );
		}
		$ret = [];
		foreach ( self::decode( $blob ) as $string ) {
			// NOTE(review): Collator::getSortKey() is documented to return
			// false on failure; encode() would serialize that as an empty
			// key. Confirm whether that is the intended fallback.
			$ret[] = $mainCollator->getSortKey( $string );
		}
		return self::encode( $ret );
	}
}

View file

@ -0,0 +1,128 @@
<?php
use Wikimedia\TestingAccessWrapper;
/**
 * Tests for the blob encoding used by RemoteIcuCollation and for the sort
 * keys it returns.
 *
 * @covers RemoteIcuCollation
 */
class RemoteIcuCollationTest extends MediaWikiLangTestCase {
	/**
	 * Cases for testEncode(): a list of strings and its expected encoding.
	 *
	 * @return array[]
	 */
	public static function provideEncode() {
		return [
			[
				[],
				''
			],
			[
				[ 'foo' ],
				'00000003foo'
			],
			[
				[ 'foo', 'a somewhat longer string' ],
				'00000003foo00000018a somewhat longer string'
			],
		];
	}

	/** @dataProvider provideEncode */
	public function testEncode( $input, $expected ) {
		// encode() is private, so go through the access wrapper.
		$coll = TestingAccessWrapper::newFromClass( RemoteIcuCollation::class );
		$this->assertSame( $expected, $coll->encode( $input ) );
	}

	/**
	 * Cases for testEncodeDecode(): lists that must survive a round trip,
	 * including strings that contain NUL and other control bytes.
	 *
	 * @return array[]
	 */
	public static function provideEncodeDecode() {
		return [
			[ [ "\000" ] ],
			[ [ "a\000b" ] ],
			[ [ str_repeat( "\001", 100 ) ] ],
			[ [ 'foo' ] ],
			[ [ 'foo', 'bar' ] ],
			[ [ 'foo', 'bar', str_repeat( 'x', 1000 ) ] ]
		];
	}

	/** @dataProvider provideEncodeDecode */
	public function testEncodeDecode( $input ) {
		$coll = TestingAccessWrapper::newFromClass( RemoteIcuCollation::class );
		$this->assertSame( $input, $coll->decode( $coll->encode( $input ) ) );
	}

	/**
	 * Cases for testGetSortKeys() and testGetSortKey(). The elements of each
	 * case are listed in the order in which their sort keys are expected to
	 * ascend.
	 *
	 * @return Generator
	 */
	public static function provideGetSortKeys() {
		$cases = [
			[],
			[ '' ],
			[ 'test1' => 'bar', 'test2' => 'foo' ],
			[
				'bar',
				'foo'
			],
			[
				'first',
				'Second'
			],
			[
				'',
				'second'
			],
			[
				'Berić',
				'Berisha',
			],
			[
				'2',
				'10',
			]
		];
		foreach ( $cases as $case ) {
			yield [ $case ];
		}
	}

	/**
	 * Build the collation under test.
	 *
	 * @return RemoteIcuCollation
	 */
	private function newCollation() {
		return new RemoteIcuCollation(
			$this->getServiceContainer()->getShellboxClientFactory(),
			'uca-default-u-kn'
		);
	}

	/** @dataProvider provideGetSortKeys */
	public function testGetSortKeys( $inputs ) {
		if ( !extension_loaded( 'intl' ) ) {
			$this->markTestSkipped( 'Need PHP intl' );
		}
		$coll = $this->newCollation();
		$sortKeys = $coll->getSortKeys( $inputs );
		// The result must be indexed exactly like the input.
		$this->assertSame( array_keys( $inputs ), array_keys( $sortKeys ) );
		$prevKey = null;
		if ( count( $inputs ) ) {
			foreach ( array_keys( $inputs ) as $i ) {
				$key = $sortKeys[$i];
				$this->assertIsString( $key );
				// Compare against null rather than relying on truthiness, so
				// that a previous key of '' or '0' is still checked.
				if ( $prevKey !== null ) {
					$this->assertLessThan( 0, strcmp( $prevKey, $key ) );
				}
				$prevKey = $key;
			}
		} else {
			$this->assertSame( [], $sortKeys );
		}
	}

	/** @dataProvider provideGetSortKeys */
	public function testGetSortKey( $inputs ) {
		if ( !count( $inputs ) ) {
			// Not risky, it's just handy to reuse the provider
			$this->assertTrue( true );
		}
		if ( !extension_loaded( 'intl' ) ) {
			$this->markTestSkipped( 'Need PHP intl' );
		}
		$coll = $this->newCollation();
		$prevKey = null;
		foreach ( $inputs as $input ) {
			$key = $coll->getSortKey( $input );
			$this->assertIsString( $key );
			// Compare against null rather than relying on truthiness, so
			// that a previous key of '' or '0' is still checked.
			if ( $prevKey !== null ) {
				$this->assertLessThan( 0, strcmp( $prevKey, $key ) );
			}
			$prevKey = $key;
		}
	}
}