In cases where we're operating on text data (and not binary data),
use e.g. "\u{00A0}" to refer directly to the Unicode character
'NO-BREAK SPACE' instead of "\xc2\xa0" to specify the bytes C2h A0h
(which correspond to the UTF-8 encoding of that character). This
makes it easier to look up those mysterious sequences, as not all
are as recognizable as the no-break space.
PHP does not enforce this, but I think we should write the code
points in these escapes in uppercase, zero-padded to at least four
hexadecimal digits, as the Unicode standard does.
Note that not all "\xNN" escapes can be automatically replaced:
* We can't use Unicode escapes for binary data that is not UTF-8
(e.g. in code converting from legacy encodings or testing the
handling of invalid UTF-8 byte sequences).
* '\xNN' escapes in regular expressions in single-quoted strings
are actually handled by PCRE and have to be dealt with carefully
(those regexps should probably be changed to use the /u modifier).
* "\xNN" referring to ASCII characters ("\x7F" and lower) should
probably be left as-is.
The replacements in this commit were done semi-manually by piping
the existing "\xNN" escapes through the following terrible Ruby
script I devised:
chars = eval('"' + ARGV[0] + '"').force_encoding('utf-8')
puts chars.split('').map{|char|
'\\u{' + char.ord.to_s(16).upcase.rjust(4, '0') + '}'
}.join('')
Change-Id: Idc3dee3a7fb5ebfaef395754d8859b18f1f8769a
68 lines
1.5 KiB
PHP
68 lines
1.5 KiB
PHP
<?php
|
|
|
|
/**
 * Tests sort-key ordering and first-letter detection of a
 * CustomUppercaseCollation built with the custom alphabet D, C, Cs, B.
 *
 * @covers CustomUppercaseCollation
 */
class CustomUppercaseCollationTest extends MediaWikiTestCase {

	/**
	 * Collation under test; rebuilt by setUp() before every test.
	 *
	 * Declared explicitly so the fixture is not created as a dynamic
	 * property (deprecated as of PHP 8.2).
	 *
	 * @var CustomUppercaseCollation
	 */
	private $collation;

	/**
	 * Build the collation fixture with the alphabet D, C, Cs, B (in that
	 * order) for the 'en' language, then run the parent set-up.
	 */
	public function setUp() {
		$this->collation = new CustomUppercaseCollation( [
			'D',
			'C',
			'Cs',
			'B'
		], Language::factory( 'en' ) );

		parent::setUp();
	}

	/**
	 * The sort key of $first must compare strictly lower (bytewise, via
	 * strcmp) than the sort key of $second.
	 *
	 * @dataProvider providerOrder
	 * @param string $first String expected to sort first
	 * @param string $second String expected to sort after $first
	 * @param string $msg Assertion message identifying the case
	 */
	public function testOrder( $first, $second, $msg ) {
		$sortkey1 = $this->collation->getSortKey( $first );
		$sortkey2 = $this->collation->getSortKey( $second );

		// assertLessThan( $expected, $actual ) asserts $actual < $expected and,
		// unlike assertTrue( ... < 0 ), reports the comparison result on failure.
		$this->assertLessThan( 0, strcmp( $sortkey1, $sortkey2 ), $msg );
	}

	/**
	 * Cases for testOrder().
	 *
	 * @return array[] Each entry is [ first, second, message ], where
	 *  `first` is expected to sort before `second`.
	 */
	public function providerOrder() {
		return [
			[ 'X', 'Z', 'Maintain order of unrearranged' ],
			[ 'D', 'C', 'Actually resorts' ],
			[ 'D', 'B', 'resort test 2' ],
			[ 'Adobe', 'Abode', 'not first letter' ],
			[ '💩 ', 'C', 'Test relocated to end' ],
			[ 'c', 'b', 'lowercase' ],
			[ 'x', 'z', 'lowercase original' ],
			[ 'Cz', 'Cs', 'digraphs' ],
			[ 'C50D', 'C100', 'Numbers' ]
		];
	}

	/**
	 * getFirstLetter() must return exactly the expected heading letter.
	 *
	 * @dataProvider provideGetFirstLetter
	 * @param string $string Input passed to getFirstLetter()
	 * @param string $first Expected first letter
	 */
	public function testGetFirstLetter( $string, $first ) {
		// PHPUnit's signature is assertSame( $expected, $actual ); keeping that
		// order makes failure output label the two values correctly.
		$this->assertSame( $first, $this->collation->getFirstLetter( $string ) );
	}

	/**
	 * Cases for testGetFirstLetter().
	 *
	 * @return array[] Each entry is [ input string, expected first letter ].
	 */
	public function provideGetFirstLetter() {
		return [
			[ 'Do', 'D' ],
			[ 'do', 'D' ],
			[ 'Ao', 'A' ],
			[ 'afdsa', 'A' ],
			// NOTE(review): U+F3000..U+F3003 presumably correspond to the four
			// custom alphabet entries in order, with U+F3004 falling outside
			// that range - confirm against CustomUppercaseCollation.
			[ "\u{F3000}Foo", 'D' ],
			[ "\u{F3001}Foo", 'C' ],
			[ "\u{F3002}Foo", 'Cs' ],
			[ "\u{F3003}Foo", 'B' ],
			[ "\u{F3004}Foo", "\u{F3004}" ],
			[ 'C', 'C' ],
			[ 'Cz', 'C' ],
			[ 'Cs', 'Cs' ],
			[ 'CS', 'Cs' ],
			[ 'cs', 'Cs' ],
		];
	}
}
|