Add ability to override mb_strtoupper in Language::ucfirst

Different PHP versions treat Unicode differently, and specifically some
wiki resources become unreachable if mb_strtoupper's behavior has changed.
This patch introduces an override table that makes it possible to smooth
the transition period.

It also provides maintenance scripts to generate such an override table.

Bug: T219279
Change-Id: I0503ff4207fded4648c58c7b50e67c55422a4849
This commit is contained in:
Giuseppe Lavagetto 2019-04-09 19:02:03 +02:00 committed by Reedy
parent 8414e3c4d0
commit d46835ef4f
6 changed files with 193 additions and 1 deletions

View file

@ -564,6 +564,8 @@ $wgAutoloadLocalClasses = [
'GenerateNormalizerDataAr' => __DIR__ . '/maintenance/language/generateNormalizerDataAr.php',
'GenerateNormalizerDataMl' => __DIR__ . '/maintenance/language/generateNormalizerDataMl.php',
'GenerateSitemap' => __DIR__ . '/maintenance/generateSitemap.php',
'GenerateUcfirstOverrides' => __DIR__ . '/maintenance/language/generateUcfirstOverrides.php',
'GenerateUpperCharTable' => __DIR__ . '/maintenance/language/generateUpperCharTable.php',
'GenericArrayObject' => __DIR__ . '/includes/libs/GenericArrayObject.php',
'GenericParameterJob' => __DIR__ . '/includes/jobqueue/GenericParameterJob.php',
'GetConfiguration' => __DIR__ . '/maintenance/getConfiguration.php',

View file

@ -3194,6 +3194,19 @@ $wgLocaltimezone = null;
*/
$wgLocalTZoffset = null;
/**
 * Map of Unicode characters whose capitalization is overridden in
 * Language::ucfirst.
 *
 * Each entry has the form char_to_convert => conversion_override. When the
 * first character of a multibyte string is a key of this array, the mapped
 * value is used instead of the result of mb_strtoupper(). The default empty
 * array leaves mb_strtoupper()'s behaviour untouched.
 *
 * A table in this format can be generated with
 * maintenance/language/generateUcfirstOverrides.php. See T219279 for details
 * on why this is useful during PHP version transitions.
 *
 * @warning: EXPERIMENTAL!
 *
 * @since 1.34
 * @var array
 */
$wgOverrideUcfirstCharacters = [];
/** @} */ # End of language/charset settings
/*************************************************************************//**

View file

@ -2713,7 +2713,7 @@ class Language {
public function uc( $str, $first = false ) {
if ( $first ) {
if ( $this->isMultibyte( $str ) ) {
return mb_strtoupper( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
return $this->mbUpperChar( mb_substr( $str, 0, 1 ) ) . mb_substr( $str, 1 );
} else {
return ucfirst( $str );
}
@ -2722,6 +2722,28 @@ class Language {
}
}
/**
 * Uppercase a single character, honouring the site-configured overrides.
 *
 * The character is first looked up in $wgOverrideUcfirstCharacters; when it
 * is a key of that table the mapped value is returned as-is, otherwise the
 * result of mb_strtoupper() is returned.
 *
 * The override table exists to bridge transitions between PHP versions in
 * which Unicode case mappings change, since such changes can make some
 * resources unreachable on-wiki (see the discussion at T219279).
 *
 * @since 1.34
 *
 * @param string $char Character to convert
 *
 * @return string Overridden or mb_strtoupper()-converted character
 */
protected function mbUpperChar( $char ) {
	global $wgOverrideUcfirstCharacters;

	// array_key_exists (not isset/??) so that an entry is honoured whatever its value
	$hasOverride = array_key_exists( $char, $wgOverrideUcfirstCharacters );

	return $hasOverride
		? $wgOverrideUcfirstCharacters[$char]
		: mb_strtoupper( $char );
}
/**
* @param string $str
* @return mixed|string

View file

@ -0,0 +1,83 @@
<?php
/**
* Generate a PHP file containing an array of
* utf8_lowercase => utf8_uppercase
* overrides. Takes as input two JSON files generated with
* generateUpperCharTable.php.
*
* Example run:
* # this will prepare a file to use to make hhvm's Language::ucfirst work like php7's
*
* $ php7.2 maintenance/language/generateUpperCharTable.php --outfile php7.2.json
* $ hhvm --php maintenance/language/generateUpperCharTable.php --outfile hhvm.json
* $ hhvm maintenance/language/generateUcfirstOverrides.php \
* --override hhvm.json --with php7.2.json --outfile test.php
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup MaintenanceLanguage
*/
require_once __DIR__ . '/../Maintenance.php';
/**
 * Maintenance script that compares two lowercase => uppercase tables (as
 * written by generateUpperCharTable.php) and writes a PHP file containing
 * the overrides needed to make the first table behave like the second.
 */
class GenerateUcfirstOverrides extends Maintenance {

	public function __construct() {
		parent::__construct();
		$this->addDescription(
			'Generates a php source file containing a definition for mb_strtoupper overrides' );
		$this->addOption( 'outfile', 'Output file', true, true, 'o' );
		$this->addOption( 'override', 'Char table we want to override', true, true );
		$this->addOption( 'with', 'Char table we want to obtain', true, true );
	}

	/**
	 * Build the override table and write it to the requested output file.
	 */
	public function execute() {
		$outfile = $this->getOption( 'outfile' );
		$from = $this->loadJson( $this->getOption( 'override' ) );
		$to = $this->loadJson( $this->getOption( 'with' ) );
		$overrides = [];

		foreach ( $from as $lc => $uc ) {
			$ref = $to[$lc] ?? null;
			// Only characters known to both tables and converted differently
			// need an override; the override value is the one we want to
			// obtain, i.e. the one from the "with" table.
			if ( $ref !== null && $ref !== $uc ) {
				$overrides[$lc] = $ref;
			}
		}

		$writer = new StaticArrayWriter();
		$written = file_put_contents(
			$outfile,
			$writer->create( $overrides, 'File created by generateUcfirstOverrides.php' )
		);
		if ( $written === false ) {
			$msg = sprintf( "Could not write to file '%s'\n", $outfile );
			$this->fatalError( $msg );
		}
	}

	/**
	 * Read a JSON character table from disk.
	 *
	 * @param string $filename Path of the JSON file to read
	 * @return array Decoded table, as an associative array
	 */
	private function loadJson( $filename ) {
		$data = file_get_contents( $filename );
		if ( $data === false ) {
			$msg = sprintf( "Could not load data from file '%s'\n", $filename );
			$this->fatalError( $msg );
		}

		// Decode to an associative array: execute() indexes the table with [].
		$json = json_decode( $data, true );
		if ( !is_array( $json ) ) {
			$msg = sprintf( "Invalid json in the data file %s\n", $filename );
			$this->fatalError( $msg, 2 );
		}

		return $json;
	}
}
// Name the class implementing this script, then load the runner bootstrap.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file; presumably
// it executes $maintClass only when this file is the entry point — confirm.
$maintClass = GenerateUcfirstOverrides::class;
require_once RUN_MAINTENANCE_IF_MAIN;

View file

@ -0,0 +1,49 @@
<?php
/**
* Generate a json file containing an array of
* utf8_lowercase => utf8_uppercase
* for all of the utf-8 range. This provides the input for generateUcfirstOverrides.php
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup MaintenanceLanguage
*/
require_once __DIR__ . '/../Maintenance.php';
/**
 * Maintenance script that records the result of mb_strtoupper() for every
 * Unicode code point (surrogates excluded) and writes the table as a JSON
 * object of character => uppercased character.
 */
class GenerateUpperCharTable extends Maintenance {

	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Generates the lowercase => uppercase json table' );
		$this->addOption( 'outfile', 'Output file', true, true, 'o' );
	}

	/**
	 * Build the table with the running PHP's mb_strtoupper() and write it out.
	 */
	public function execute() {
		$outfile = $this->getOption( 'outfile', 'upperchar.json' );
		$toUpperTable = [];

		for ( $i = 0; $i <= 0x10ffff; $i++ ) {
			// Surrogate code points cannot be represented as valid UTF-8;
			// including them would make json_encode() fail for the whole table.
			if ( $i >= 0xd800 && $i <= 0xdfff ) {
				continue;
			}
			$char = UtfNormal\Utils::codepointToUtf8( $i );
			$toUpperTable[$char] = mb_strtoupper( $char );
		}

		$json = json_encode( $toUpperTable );
		if ( $json === false ) {
			// Do not silently write an empty file when encoding fails.
			$msg = sprintf( "Could not encode the character table as json: %s\n", json_last_error_msg() );
			$this->fatalError( $msg );
		}

		if ( file_put_contents( $outfile, $json ) === false ) {
			$msg = sprintf( "Could not write to file '%s'\n", $outfile );
			$this->fatalError( $msg );
		}
	}
}
// Name the class implementing this script, then load the runner bootstrap.
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file; presumably
// it executes $maintClass only when this file is the entry point — confirm.
$maintClass = GenerateUpperCharTable::class;
require_once RUN_MAINTENANCE_IF_MAIN;

View file

@ -1909,4 +1909,27 @@ class LanguageTest extends LanguageClassesTestCase {
$ar2 = new LanguageAr();
$this->assertTrue( $ar1->equals( $ar2 ), 'ar equals ar' );
}
/**
 * Check Language::ucfirst(), optionally with an override table installed
 * in $wgOverrideUcfirstCharacters.
 *
 * @dataProvider provideUcfirst
 * @covers Language::ucfirst
 *
 * @param string $orig Input string
 * @param string $expected Expected result of ucfirst()
 * @param string $desc Assertion message
 * @param array|bool $overrides Override table to install, or false for none
 */
public function testUcfirst( $orig, $expected, $desc, $overrides = false ) {
	$lang = new Language();
	if ( is_array( $overrides ) ) {
		$this->setMwGlobals( [ 'wgOverrideUcfirstCharacters' => $overrides ] );
	}
	// assertSame() takes the expected value first, so that failure output
	// labels "expected" and "actual" correctly.
	$this->assertSame( $expected, $lang->ucfirst( $orig ), $desc );
}
/**
 * Data sets for testUcfirst.
 *
 * Each entry is: input string, expected result, assertion message, and
 * either an override table or false when no overrides are installed.
 *
 * @return array[]
 */
public static function provideUcfirst() {
	$noOverrides = false;
	$cases = [];

	$cases[] = [ 'alice', 'Alice', 'simple ASCII string', $noOverrides ];
	$cases[] = [ 'århus', 'Århus', 'unicode string', $noOverrides ];

	// An override table leaves a pure-ASCII string untouched...
	$cases[] = [ 'foo', 'Foo', 'ASCII is not overriden', [ 'f' => 'b' ] ];

	// ...whereas a string starting with a non-ASCII character is affected.
	$cases[] = [ 'èl', 'Ll', 'Non-ASCII is overridden', [ 'è' => 'L' ] ];

	return $cases;
}
}