Add ability to override mb_strtoupper in Language::ucfirst
Different PHP versions treat Unicode differently; in particular, some wiki resources become unreachable when the behavior of mb_strtoupper changes between versions. This patch introduces an override table that can be used to smooth the transition period. It also provides maintenance scripts to generate such an override table. Bug: T219279 Change-Id: I0503ff4207fded4648c58c7b50e67c55422a4849
This commit is contained in:
parent
8414e3c4d0
commit
d46835ef4f
6 changed files with 193 additions and 1 deletions
|
|
@ -564,6 +564,8 @@ $wgAutoloadLocalClasses = [
|
|||
'GenerateNormalizerDataAr' => __DIR__ . '/maintenance/language/generateNormalizerDataAr.php',
|
||||
'GenerateNormalizerDataMl' => __DIR__ . '/maintenance/language/generateNormalizerDataMl.php',
|
||||
'GenerateSitemap' => __DIR__ . '/maintenance/generateSitemap.php',
|
||||
'GenerateUcfirstOverrides' => __DIR__ . '/maintenance/language/generateUcfirstOverrides.php',
|
||||
'GenerateUpperCharTable' => __DIR__ . '/maintenance/language/generateUpperCharTable.php',
|
||||
'GenericArrayObject' => __DIR__ . '/includes/libs/GenericArrayObject.php',
|
||||
'GenericParameterJob' => __DIR__ . '/includes/jobqueue/GenericParameterJob.php',
|
||||
'GetConfiguration' => __DIR__ . '/maintenance/getConfiguration.php',
|
||||
|
|
|
|||
|
|
@ -3194,6 +3194,19 @@ $wgLocaltimezone = null;
|
|||
*/
|
||||
$wgLocalTZoffset = null;
|
||||
|
||||
/**
|
||||
* Map of Unicode characters whose capitalization is overridden in
|
||||
* Language::ucfirst. Each entry has the form
|
||||
* char_to_convert => conversion_override. See T219279 for details
|
||||
* on why this is useful during PHP version transitions.
|
||||
*
|
||||
* Language::uc( $str, true ) consults this table only when the string is
|
||||
* multibyte; otherwise it falls back to PHP's ucfirst() and the table is
|
||||
* not used.
|
||||
*
|
||||
* The default is an empty array, i.e. no overrides.
|
||||
*
|
||||
* @warning: EXPERIMENTAL!
|
||||
*
|
||||
* @since 1.34
|
||||
* @var array
|
||||
*/
|
||||
$wgOverrideUcfirstCharacters = [];
|
||||
|
||||
/** @} */ # End of language/charset settings
|
||||
|
||||
/*************************************************************************//**
|
||||
|
|
|
|||
|
|
@ -2713,7 +2713,7 @@ class Language {
|
|||
public function uc( $str, $first = false ) {
|
||||
if ( $first ) {
|
||||
if ( $this->isMultibyte( $str ) ) {
|
||||
return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
|
||||
return $this->mbUpperChar( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
|
||||
} else {
|
||||
return ucfirst( $str );
|
||||
}
|
||||
|
|
@ -2722,6 +2722,28 @@ class Language {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Uppercase a single character, giving $wgOverrideUcfirstCharacters
 * precedence over mb_strtoupper().
 *
 * The result of mb_strtoupper() can differ between PHP versions when their
 * Unicode data changes, which can make some resources unreachable on-wiki
 * (see discussion at T219279). A conversion table lets a site pin the
 * uppercase form of specific characters during such a transition.
 *
 * @since 1.34
 *
 * @param string $char Character to convert
 *
 * @return string The configured override if one exists for $char, otherwise
 *  the result of mb_strtoupper( $char )
 */
protected function mbUpperChar( $char ) {
	global $wgOverrideUcfirstCharacters;

	// Test for the key itself so an entry is honoured whatever its value is.
	$hasOverride = array_key_exists( $char, $wgOverrideUcfirstCharacters );

	return $hasOverride
		? $wgOverrideUcfirstCharacters[$char]
		: mb_strtoupper( $char );
}
|
||||
|
||||
/**
|
||||
* @param string $str
|
||||
* @return mixed|string
|
||||
|
|
|
|||
83
maintenance/language/generateUcfirstOverrides.php
Normal file
83
maintenance/language/generateUcfirstOverrides.php
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
<?php
|
||||
/**
|
||||
* Generate a php file containing an array of
|
||||
* utf8_lowercase => utf8_uppercase
|
||||
* overrides. Takes two json files generated with generateUpperCharTable.php
|
||||
* as input.
|
||||
*
|
||||
* Example run:
|
||||
* # this will prepare a file to use to make hhvm's Language::ucfirst work like php7's
|
||||
*
|
||||
* $ php7.2 maintenance/language/generateUpperCharTable.php --outfile php7.2.json
|
||||
* $ hhvm --php maintenance/language/generateUpperCharTable.php --outfile hhvm.json
|
||||
* $ hhvm maintenance/language/generateUcfirstOverrides.php \
|
||||
* --override hhvm.json --with php7.2.json --outfile test.php
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup MaintenanceLanguage
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/../Maintenance.php';
|
||||
|
||||
/**
 * Maintenance script that compares two character tables produced by
 * generateUpperCharTable.php and writes the entries that differ to a php file,
 * in the char_to_convert => conversion_override form.
 */
class GenerateUcfirstOverrides extends Maintenance {

	/**
	 * Register the script description and its three required options.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Generates a php source file containing a definition for mb_strtoupper overrides' );
		$this->addOption( 'outfile', 'Output file', true, true, 'o' );
		$this->addOption( 'override', 'Char table we want to override', true, true );
		$this->addOption( 'with', 'Char table we want to obtain', true, true );
	}

	/**
	 * Build the override table and write it to the --outfile path.
	 *
	 * For every character of the --override table whose uppercase form differs
	 * from the one in the --with table, the --with value is recorded. Characters
	 * missing from the --with table are ignored.
	 */
	public function execute() {
		$outfile = $this->getOption( 'outfile' );
		$from = $this->loadJson( $this->getOption( 'override' ) );
		$to = $this->loadJson( $this->getOption( 'with' ) );
		$overrides = [];

		foreach ( $from as $lc => $uc ) {
			$ref = $to[$lc] ?? null;
			if ( $ref !== null && $ref !== $uc ) {
				// Record the value we want to obtain. Recording $uc would only
				// restate what the overridden runtime already produces.
				$overrides[$lc] = $ref;
			}
		}

		$writer = new StaticArrayWriter();
		$written = file_put_contents(
			$outfile,
			$writer->create( $overrides, 'File created by generateUcfirstOverrides.php' )
		);
		if ( $written === false ) {
			$this->fatalError( sprintf( "Could not write to file '%s'\n", $outfile ) );
		}
	}

	/**
	 * Read a json character table from disk.
	 *
	 * Reports a fatal error if the file cannot be read, or if its content does
	 * not decode to an array.
	 *
	 * @param string $filename Path of the json file to load
	 * @return array Decoded table, as an associative array
	 */
	private function loadJson( $filename ) {
		$data = file_get_contents( $filename );
		if ( $data === false ) {
			$msg = sprintf( "Could not load data from file '%s'\n", $filename );
			$this->fatalError( $msg );
		}
		// Decode to an associative array: execute() indexes the table with [].
		$json = json_decode( $data, true );
		if ( !is_array( $json ) ) {
			$msg = sprintf( "Invalid json in the data file %s\n", $filename );
			$this->fatalError( $msg, 2 );
		}
		return $json;
	}
}
|
||||
|
||||
$maintClass = GenerateUcfirstOverrides::class;
|
||||
require_once RUN_MAINTENANCE_IF_MAIN;
|
||||
49
maintenance/language/generateUpperCharTable.php
Normal file
49
maintenance/language/generateUpperCharTable.php
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
<?php
|
||||
/**
|
||||
* Generate a json file containing an array of
|
||||
* utf8_lowercase => utf8_uppercase
|
||||
* for all of the utf-8 range. This provides the input for generateUcfirstOverrides.php
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup MaintenanceLanguage
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/../Maintenance.php';
|
||||
|
||||
/**
 * Maintenance script that dumps, as json, the result of mb_strtoupper() for
 * every Unicode code point, as computed by the PHP runtime executing it.
 */
class GenerateUpperCharTable extends Maintenance {

	/**
	 * Register the script description and the output file option.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Generates the lowercase => uppercase json table' );
		$this->addOption( 'outfile', 'Output file', true, true, 'o' );
	}

	/**
	 * Compute the character => uppercase table and write it to the output file.
	 */
	public function execute() {
		$outfile = $this->getOption( 'outfile', 'upperchar.json' );
		$toUpperTable = [];
		for ( $i = 0; $i <= 0x10ffff; $i++ ) {
			// Surrogate code points (U+D800..U+DFFF) have no valid UTF-8
			// encoding; including them would make json_encode() fail for the
			// whole table.
			if ( $i >= 0xd800 && $i <= 0xdfff ) {
				continue;
			}
			$char = UtfNormal\Utils::codepointToUtf8( $i );
			$upper = mb_strtoupper( $char );
			$toUpperTable[$char] = $upper;
		}

		$json = json_encode( $toUpperTable );
		if ( $json === false ) {
			// Do not silently write an empty file when encoding fails.
			$this->fatalError(
				sprintf( "Could not encode the character table: %s\n", json_last_error_msg() )
			);
		}
		file_put_contents( $outfile, $json );
	}
}
|
||||
|
||||
$maintClass = GenerateUpperCharTable::class;
|
||||
require_once RUN_MAINTENANCE_IF_MAIN;
|
||||
|
|
@ -1909,4 +1909,27 @@ class LanguageTest extends LanguageClassesTestCase {
|
|||
$ar2 = new LanguageAr();
|
||||
$this->assertTrue( $ar1->equals( $ar2 ), 'ar equals ar' );
|
||||
}
|
||||
|
||||
/**
 * Check Language::ucfirst, optionally with $wgOverrideUcfirstCharacters set.
 *
 * @dataProvider provideUcfirst
 * @covers Language::ucfirst
 *
 * @param string $orig Input string
 * @param string $expected Expected result of ucfirst
 * @param string $desc Assertion message
 * @param array|bool $overrides Value for $wgOverrideUcfirstCharacters, or
 *  false to leave the global untouched
 */
public function testUcfirst( $orig, $expected, $desc, $overrides = false ) {
	$lang = new Language();
	if ( is_array( $overrides ) ) {
		$this->setMwGlobals( [ 'wgOverrideUcfirstCharacters' => $overrides ] );
	}
	// PHPUnit expects ( expected, actual ); the reverse order mislabels the
	// two values in failure output.
	$this->assertSame( $expected, $lang->ucfirst( $orig ), $desc );
}
|
||||
|
||||
/**
 * Data sets for testUcfirst.
 *
 * @return array[] Each entry is [ input, expected, message, overrides ],
 *  where overrides is false or a value for $wgOverrideUcfirstCharacters
 */
public static function provideUcfirst() {
	$cases = [];

	// No override table configured
	$cases[] = [ 'alice', 'Alice', 'simple ASCII string', false ];
	$cases[] = [ 'århus', 'Århus', 'unicode string', false ];

	// Overrides do not affect ASCII characters...
	$cases[] = [ 'foo', 'Foo', 'ASCII is not overriden', [ 'f' => 'b' ] ];

	// ...but they do affect non-ASCII ones
	$cases[] = [ 'èl', 'Ll', 'Non-ASCII is overridden', [ 'è' => 'L' ] ];

	return $cases;
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue