Kill mbstring fallbacks

In the age when we require PHP 5.5, pretending that mbstring emulation
is not slow and silly is silly.

Bug: T129435
Change-Id: Ic8235c9da9a926df63ec7388900c44eab454eebe
This commit is contained in:
Max Semenik 2016-01-29 11:42:44 -08:00 committed by MaxSem
parent 9f72780bde
commit 943563062f
15 changed files with 38 additions and 386 deletions

View file

@ -7,7 +7,12 @@ production.
=== PHP version requirement ===
As of 1.27, MediaWiki now requires PHP 5.5.9 or higher. This corresponds with
HHVM 3.1.
HHVM 3.1. Additionally, the following PHP extensions are required:
* ctype
* iconv
* json
* mbstring
* xml
=== Configuration changes in 1.27 ===
* $wgAllowMicrodataAttributes and $wgAllowRdfaAttributes were removed,

View file

@ -423,7 +423,6 @@ $wgAutoloadLocalClasses = [
'FakeConverter' => __DIR__ . '/languages/FakeConverter.php',
'FakeMaintenance' => __DIR__ . '/maintenance/Maintenance.php',
'FakeResultWrapper' => __DIR__ . '/includes/db/DatabaseUtility.php',
'Fallback' => __DIR__ . '/includes/Fallback.php',
'FatalError' => __DIR__ . '/includes/exception/FatalError.php',
'FauxRequest' => __DIR__ . '/includes/FauxRequest.php',
'FauxResponse' => __DIR__ . '/includes/WebResponse.php',

View file

@ -19,6 +19,7 @@
"composer/semver": "1.4.0",
"cssjanus/cssjanus": "1.1.2",
"ext-iconv": "*",
"ext-mbstring": "*",
"liuggio/statsd-php-client": "1.0.18",
"mediawiki/at-ease": "1.1.0",
"oojs/oojs-ui": "0.16.4",
@ -52,7 +53,6 @@
"ext-apc": "Local data and opcode cache",
"ext-fileinfo": "Improved mime magic detection",
"ext-intl": "ICU integration",
"ext-mbstring": "Multibyte string support",
"ext-wikidiff2": "Diff accelerator",
"monolog/monolog": "Flexible debug logging system",
"nmred/kafka-php": "Send debug log events to kafka",

View file

@ -585,10 +585,9 @@ $wgLockManagers = [];
* Requires PHP's Exif extension: http://www.php.net/manual/en/ref.exif.php
*
* @note FOR WINDOWS USERS:
* To enable Exif functions, add the following lines to the "Windows
* To enable Exif functions, add the following line to the "Windows
* extensions" section of php.ini:
* @code{.ini}
* extension=extensions/php_mbstring.dll
* extension=extensions/php_exif.dll
* @endcode
*/

View file

@ -1,172 +0,0 @@
<?php
/**
* Fallback functions for PHP installed without mbstring support.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
*/
/**
* Fallback functions for PHP installed without mbstring support
*/
class Fallback {
	/**
	 * Fallback implementation for mb_substr, hardcoded to UTF-8.
	 * Attempts to be at least _moderately_ efficient; best optimized
	 * for relatively small offset and count values -- about 5x slower
	 * than native mb_string in the original author's testing.
	 *
	 * Larger offsets are still fairly efficient for Latin text, but
	 * can be up to 100x slower than native if the text is heavily
	 * multibyte and we have to slog through a few hundred kb.
	 *
	 * @param string $str UTF-8 encoded input
	 * @param int $start Character offset to start at; negative values count
	 *   from the end of the string
	 * @param int|string|null $count Number of characters to keep; negative
	 *   values drop that many characters from the end. The sentinel 'end'
	 *   (the default) or null keeps everything up to the end of the string.
	 *
	 * @return string
	 */
	public static function mb_substr( $str, $start, $count = 'end' ) {
		if ( $start != 0 ) {
			$split = self::mb_substr_split_unicode( $str, intval( $start ) );
			$str = substr( $str, $split );
		}

		// Native mb_substr() treats a null length as "to the end", so do the same
		// instead of letting intval( null ) truncate the result to nothing.
		if ( $count !== 'end' && $count !== null ) {
			$split = self::mb_substr_split_unicode( $str, intval( $count ) );
			$str = substr( $str, 0, $split );
		}

		// substr() yields false for out-of-range offsets on older PHP versions;
		// always hand back a string as documented.
		return (string)$str;
	}

	/**
	 * Translate a character offset into a byte offset within a UTF-8 string.
	 *
	 * Positive offsets are counted from the start of the string and may yield
	 * a value larger than strlen( $str ) when the string is too short.
	 * Negative offsets are counted from the end and are clamped at 0.
	 *
	 * @param string $str UTF-8 encoded input
	 * @param int $splitPos Character offset
	 * @return int Byte offset
	 */
	public static function mb_substr_split_unicode( $str, $splitPos ) {
		if ( $splitPos == 0 ) {
			return 0;
		}

		$byteLen = strlen( $str );

		if ( $splitPos > 0 ) {
			if ( $splitPos > 256 ) {
				// Optimize large string offsets by skipping ahead N bytes.
				// This will cut out most of our slow time on Latin-based text,
				// and 1/2 to 1/3 on East European and Asian scripts.
				$bytePos = $splitPos;
				while ( $bytePos < $byteLen && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
					++$bytePos;
				}
				// Use our own counter rather than the global mb_strlen() so that
				// this class does not depend on the function it is standing in for.
				$charPos = self::mb_strlen( substr( $str, 0, $bytePos ) );
			} else {
				$charPos = 0;
				$bytePos = 0;
			}

			while ( $charPos++ < $splitPos ) {
				++$bytePos;
				// Move past any tail bytes
				while ( $bytePos < $byteLen && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
					++$bytePos;
				}
			}
		} else {
			$splitPosX = $splitPos + 1;
			$charPos = 0; // relative to end of string; we don't care about the actual char position here
			$bytePos = $byteLen;
			while ( $bytePos > 0 && $charPos-- >= $splitPosX ) {
				--$bytePos;
				// Move past any tail bytes
				while ( $bytePos > 0 && $str[$bytePos] >= "\x80" && $str[$bytePos] < "\xc0" ) {
					--$bytePos;
				}
			}
		}

		return $bytePos;
	}

	/**
	 * Fallback implementation of mb_strlen, hardcoded to UTF-8.
	 *
	 * Counts every ASCII byte and every byte in the range 0xc0-0xfe (sequence
	 * heads); continuation bytes (0x80-0xbf) are not counted.
	 *
	 * @param string $str UTF-8 encoded input
	 * @param string $enc Optional encoding; ignored
	 * @return int Number of characters
	 */
	public static function mb_strlen( $str, $enc = '' ) {
		$counts = count_chars( $str );
		$total = 0;

		// Count ASCII bytes
		for ( $i = 0; $i < 0x80; $i++ ) {
			$total += $counts[$i];
		}

		// Count multibyte sequence heads
		for ( $i = 0xc0; $i < 0xff; $i++ ) {
			$total += $counts[$i];
		}

		return $total;
	}

	/**
	 * Fallback implementation of mb_strpos, hardcoded to UTF-8.
	 *
	 * Both $offset and the return value are measured in characters, not
	 * bytes. The search itself is done byte-wise, which is safe because a
	 * valid UTF-8 needle can only match on a character boundary.
	 *
	 * @param string $haystack UTF-8 encoded string to search in
	 * @param string $needle UTF-8 encoded string to search for; an empty
	 *   needle is reported as found at the starting offset
	 * @param int $offset Optional start position in characters; negative
	 *   values count from the end of the string
	 * @param string $encoding Optional encoding; ignored
	 * @return int|bool Character position of the first match, or false if
	 *   there is none or $offset lies beyond the end of the string
	 */
	public static function mb_strpos( $haystack, $needle, $offset = 0, $encoding = '' ) {
		$haystack = (string)$haystack;
		$needle = (string)$needle;

		$byteOffset = self::mb_substr_split_unicode( $haystack, intval( $offset ) );
		if ( $byteOffset > strlen( $haystack ) ) {
			return false;
		}

		if ( $needle === '' ) {
			return self::byteToCharOffset( $haystack, $byteOffset );
		}

		$bytePos = strpos( $haystack, $needle, $byteOffset );
		if ( $bytePos === false ) {
			return false;
		}

		return self::byteToCharOffset( $haystack, $bytePos );
	}

	/**
	 * Fallback implementation of mb_strrpos, hardcoded to UTF-8.
	 *
	 * Both $offset and the return value are measured in characters, not
	 * bytes. Offsets follow the strrpos() convention: a non-negative offset
	 * skips that many leading characters, a negative offset only accepts
	 * matches that begin at or before that many characters from the end.
	 *
	 * @param string $haystack UTF-8 encoded string to search in
	 * @param string $needle UTF-8 encoded string to search for; an empty
	 *   needle is reported as found at the far end of the searched range
	 * @param int $offset Optional search boundary in characters
	 * @param string $encoding Optional encoding; ignored
	 * @return int|bool Character position of the last match, or false if
	 *   there is none or $offset lies beyond the end of the string
	 */
	public static function mb_strrpos( $haystack, $needle, $offset = 0, $encoding = '' ) {
		$haystack = (string)$haystack;
		$needle = (string)$needle;
		$offset = intval( $offset );

		$byteLen = strlen( $haystack );
		$byteSplit = self::mb_substr_split_unicode( $haystack, $offset );
		if ( $byteSplit > $byteLen ) {
			return false;
		}

		if ( $needle === '' ) {
			return self::byteToCharOffset( $haystack, $offset < 0 ? $byteSplit : $byteLen );
		}

		// strrpos() wants negative offsets relative to the end of the string,
		// so express the character boundary as a (non-positive) byte distance.
		$byteOffset = $offset < 0 ? $byteSplit - $byteLen : $byteSplit;

		$bytePos = strrpos( $haystack, $needle, $byteOffset );
		if ( $bytePos === false ) {
			return false;
		}

		return self::byteToCharOffset( $haystack, $bytePos );
	}

	/**
	 * Convert a byte offset that sits on a character boundary into the
	 * corresponding character offset.
	 *
	 * @param string $str UTF-8 encoded input
	 * @param int $bytePos Byte offset into $str
	 * @return int Character offset
	 */
	private static function byteToCharOffset( $str, $bytePos ) {
		if ( $bytePos <= 0 ) {
			return 0;
		}

		return self::mb_strlen( substr( $str, 0, $bytePos ) );
	}
}

View file

@ -39,59 +39,6 @@ use MediaWiki\Session\SessionManager;
* PHP extensions may be included here.
*/
// Only declare the global mb_substr() family when no function of that name exists yet.
if ( !function_exists( 'mb_substr' ) ) {
/**
* Global stand-in for mb_substr() that forwards to Fallback::mb_substr().
*
* @codeCoverageIgnore
* @see Fallback::mb_substr
* @param string $str
* @param int $start
* @param string $count Defaults to the sentinel 'end'
* @return string
*/
function mb_substr( $str, $start, $count = 'end' ) {
return Fallback::mb_substr( $str, $start, $count );
}
/**
* Global helper that forwards to Fallback::mb_substr_split_unicode().
* Declared together with mb_substr(), under the same guard.
*
* @codeCoverageIgnore
* @see Fallback::mb_substr_split_unicode
* @param string $str
* @param int $splitPos
* @return int
*/
function mb_substr_split_unicode( $str, $splitPos ) {
return Fallback::mb_substr_split_unicode( $str, $splitPos );
}
}
// Same pattern for mb_strlen(): declare it only when it is missing.
if ( !function_exists( 'mb_strlen' ) ) {
/**
* Global stand-in for mb_strlen() that forwards to Fallback::mb_strlen().
*
* @codeCoverageIgnore
* @see Fallback::mb_strlen
* @param string $str
* @param string $enc Passed through to the fallback, which documents it as ignored
* @return int
*/
function mb_strlen( $str, $enc = '' ) {
return Fallback::mb_strlen( $str, $enc );
}
}
// Same pattern for mb_strpos().
if ( !function_exists( 'mb_strpos' ) ) {
/**
* Global stand-in for mb_strpos() that forwards to Fallback::mb_strpos().
*
* @codeCoverageIgnore
* @see Fallback::mb_strpos
* @param string $haystack
* @param string $needle
* @param int $offset Optional start position
* @param string $encoding Passed through to the fallback, which documents it as ignored
* @return int
*/
function mb_strpos( $haystack, $needle, $offset = 0, $encoding = '' ) {
return Fallback::mb_strpos( $haystack, $needle, $offset, $encoding );
}
}
// Same pattern for mb_strrpos().
if ( !function_exists( 'mb_strrpos' ) ) {
/**
* Global stand-in for mb_strrpos() that forwards to Fallback::mb_strrpos().
*
* @codeCoverageIgnore
* @see Fallback::mb_strrpos
* @param string $haystack
* @param string $needle
* @param int $offset Optional start position
* @param string $encoding Passed through to the fallback, which documents it as ignored
* @return int
*/
function mb_strrpos( $haystack, $needle, $offset = 0, $encoding = '' ) {
return Fallback::mb_strrpos( $haystack, $needle, $offset, $encoding );
}
}
// hash_equals function only exists in PHP >= 5.6.0
// http://php.net/hash_equals
if ( !function_exists( 'hash_equals' ) ) {

View file

@ -63,15 +63,9 @@ class HtmlFormatter {
*/
public function getDoc() {
if ( !$this->doc ) {
// DOMDocument::loadHTML apparently isn't very good with encodings, so
// DOMDocument::loadHTML isn't very good with encodings, so
// convert input to ASCII by encoding everything above 128 as entities.
if ( function_exists( 'mb_convert_encoding' ) ) {
$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
} else {
$html = preg_replace_callback( '/[\x{80}-\x{10ffff}]/u', function ( $m ) {
return '&#' . UtfNormal\Utils::utf8ToCodepoint( $m[0] ) . ';';
}, $this->html );
}
$html = mb_convert_encoding( $this->html, 'HTML-ENTITIES', 'UTF-8' );
// Workaround for bug that caused spaces before references
// to disappear during processing: https://phabricator.wikimedia.org/T55086
@ -251,13 +245,10 @@ class HtmlFormatter {
}
$html = $replacements->replace( $html );
if ( function_exists( 'mb_convert_encoding' ) ) {
// Just in case the conversion in getDoc() above used named
// entities that aren't known to html_entity_decode().
$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
} else {
$html = html_entity_decode( $html, ENT_COMPAT, 'utf-8' );
}
// Just in case the conversion in getDoc() above used named
// entities that aren't known to html_entity_decode().
$html = mb_convert_encoding( $html, 'UTF-8', 'HTML-ENTITIES' );
return $html;
}

View file

@ -392,9 +392,7 @@ class DatabasePostgres extends Database {
}
public function doQuery( $sql ) {
if ( function_exists( 'mb_convert_encoding' ) ) {
$sql = mb_convert_encoding( $sql, 'UTF-8' );
}
$sql = mb_convert_encoding( $sql, 'UTF-8' );
// Clear previously left over PQresult
while ( $res = pg_get_result( $this->mConn ) ) {
pg_free_result( $res );

View file

@ -153,7 +153,7 @@ class SwiftFileBackend extends FileBackendStore {
}
protected function resolveContainerPath( $container, $relStoragePath ) {
if ( !mb_check_encoding( $relStoragePath, 'UTF-8' ) ) { // mb_string required by CF
if ( !mb_check_encoding( $relStoragePath, 'UTF-8' ) ) {
return null; // not UTF-8, makes it hard to use CF and the swift HTTP API
} elseif ( strlen( urlencode( $relStoragePath ) ) > 1024 ) {
return null; // too long for Swift

View file

@ -757,6 +757,12 @@ abstract class Installer {
return false;
}
if ( !function_exists( 'mb_substr' ) ) {
$this->showError( 'config-mbstring-absent' );
return false;
}
return true;
}

View file

@ -60,6 +60,7 @@
"config-ctype": "<strong>Fatal:</strong> PHP must be compiled with support for the [http://www.php.net/manual/en/ctype.installation.php Ctype extension].",
"config-iconv": "<strong>Fatal:</strong> PHP must be compiled with support for the [http://www.php.net/manual/en/iconv.installation.php iconv extension].",
"config-json": "<strong>Fatal:</strong> PHP was compiled without JSON support.\nYou must install either the PHP JSON extension or the [http://pecl.php.net/package/jsonc PECL jsonc] extension before installing MediaWiki.\n* The PHP extension is included in Red Hat Enterprise Linux (CentOS) 5 and 6, though must be enabled in <code>/etc/php.ini</code> or <code>/etc/php.d/json.ini</code>.\n* Some Linux distributions released after May 2013 omit the PHP extension, instead packaging the PECL extension as <code>php5-json</code> or <code>php-pecl-jsonc</code>.",
"config-mbstring-absent": "<strong>Fatal:</strong> PHP must be compiled with support for the [http://www.php.net/manual/en/mbstring.setup.php mbstring extension].",
"config-xcache": "[http://xcache.lighttpd.net/ XCache] is installed",
"config-apc": "[http://www.php.net/apc APC] is installed",
"config-wincache": "[http://www.iis.net/download/WinCacheForPhp WinCache] is installed",

View file

@ -78,6 +78,7 @@
"config-ctype": "Message if support for [http://www.php.net/manual/en/ctype.installation.php Ctype] is missing from PHP.\n{{Related|Config-fatal}}",
"config-iconv": "Message if support for [http://www.php.net/manual/en/iconv.installation.php iconv] is missing from PHP.\n{{Related|Config-fatal}}",
"config-json": "Message if support for [[wikipedia:JSON|JSON]] is missing from PHP.\n* \"[[wikipedia:Red Hat Enterprise Linux|Red Hat Enterprise Linux]]\" (RHEL) and \"[[wikipedia:CentOS|CentOS]]\" refer to two almost-identical Linux distributions. \"5 and 6\" refers to version 5 or 6 of either distribution. Because RHEL 7 likely will not include the PHP extension, do not translate as \"5 or newer\".\n* \"The [http://www.php.net/json PHP extension]\" is the JSON extension included with PHP 5.2 and newer.\n* \"The [http://pecl.php.net/package/jsonc PECL extension]\" is based on the PHP extension, though excludes code some distributions have found unacceptable (see [[phab:T49431]]).\n{{Related|Config-fatal}}",
"config-mbstring-absent": "Message if support for [http://www.php.net/manual/en/mbstring.setup.php mbstring] is missing from PHP.\n{{Related|Config-fatal}}",
"config-xcache": "Message indicates if this program is available",
"config-apc": "Message indicates if this program is available",
"config-wincache": "Message indicates if this program is available",

View file

@ -30,83 +30,26 @@ class StringUtils {
* The function check for invalid byte sequences, overlong encoding but
* not for different normalisations.
*
* This relies internally on the mbstring function mb_check_encoding()
* hardcoded to check against UTF-8. Whenever the function is not available
* we fallback to a pure PHP implementation. Setting $disableMbstring to
* true will skip the use of mb_check_encoding, this is mostly intended for
* unit testing our internal implementation.
*
* @note In MediaWiki 1.21, this function did not provide proper UTF-8 validation.
* In particular, the pure PHP code path did not in fact check for overlong forms.
* Beware of this when backporting code to that version of MediaWiki.
*
* @since 1.21
* @param string $value String to check
* @param bool $disableMbstring Whether to use the pure PHP
* implementation instead of trying mb_check_encoding. Intended for unit
* testing. Default: false
* @return bool Whether the given $value is a valid UTF-8 encoded string
*/
static function isUtf8( $value, $disableMbstring = false ) {
static function isUtf8( $value ) {
$value = (string)$value;
// If the mbstring extension is loaded, use it. However, before PHP 5.4, values above
// U+10FFFF are incorrectly allowed, so we have to check for them separately.
if ( !$disableMbstring && function_exists( 'mb_check_encoding' ) ) {
static $newPHP;
if ( $newPHP === null ) {
$newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
}
return mb_check_encoding( $value, 'UTF-8' ) &&
( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
// Before PHP 5.4, values above U+10FFFF are incorrectly allowed, so we have to
// check for them separately.
static $newPHP;
if ( $newPHP === null ) {
$newPHP = !mb_check_encoding( "\xf4\x90\x80\x80", 'UTF-8' );
}
if ( preg_match( "/[\x80-\xff]/S", $value ) === 0 ) {
// String contains only ASCII characters, has to be valid
return true;
}
// PCRE implements repetition using recursion; to avoid a stack overflow (and segfault)
// for large input, we check for invalid sequences (<= 5 bytes) rather than valid
// sequences, which can be as long as the input string is. Multiple short regexes are
// used rather than a single long regex for performance.
static $regexes;
if ( $regexes === null ) {
$cont = "[\x80-\xbf]";
$after = "(?!$cont)"; // "(?:[^\x80-\xbf]|$)" would work here
$regexes = [
// Continuation byte at the start
"/^$cont/",
// ASCII byte followed by a continuation byte
"/[\\x00-\x7f]$cont/S",
// Illegal byte
"/[\xc0\xc1\xf5-\xff]/S",
// Invalid 2-byte sequence, or valid one then an extra continuation byte
"/[\xc2-\xdf](?!$cont$after)/S",
// Invalid 3-byte sequence, or valid one then an extra continuation byte
"/\xe0(?![\xa0-\xbf]$cont$after)/",
"/[\xe1-\xec\xee\xef](?!$cont{2}$after)/S",
"/\xed(?![\x80-\x9f]$cont$after)/",
// Invalid 4-byte sequence, or valid one then an extra continuation byte
"/\xf0(?![\x90-\xbf]$cont{2}$after)/",
"/[\xf1-\xf3](?!$cont{3}$after)/S",
"/\xf4(?![\x80-\x8f]$cont{2}$after)/",
];
}
foreach ( $regexes as $regex ) {
if ( preg_match( $regex, $value ) !== 0 ) {
return false;
}
}
return true;
return mb_check_encoding( $value, 'UTF-8' ) &&
( $newPHP || preg_match( "/\xf4[\x90-\xbf]|[\xf5-\xff]/S", $value ) === 0 );
}
/**

View file

@ -83,6 +83,12 @@ class UpdateMediaWiki extends Maintenance {
"ABORTING (see https://bugs.php.net/bug.php?id=45996).\n",
true );
}
if ( !function_exists( 'mb_strlen' ) ) {
	// The updater cannot work without mbstring. Pass true as the second
	// argument, exactly like the preceding compatibility check does, so that
	// the script really stops here instead of only printing "ABORTING."
	// and then carrying on.
	$this->error(
		"MediaWiki now requires the mbstring PHP extension, your system doesn't have it.\n"
		. "ABORTING.\n",
		true );
}
}
function execute() {

View file

@ -1,72 +0,0 @@
<?php
/**
* @covers Fallback
*/
class FallbackTest extends MediaWikiTestCase {

	/**
	 * Checks that the Fallback implementations agree with the native
	 * mbstring functions for a sample UTF-8 file name. Skipped when the
	 * mbstring extension is not loaded, since there is then nothing to
	 * compare against.
	 */
	public function testFallbackMbstringFunctions() {
		if ( !extension_loaded( 'mbstring' ) ) {
			$this->markTestSkipped(
				"The mb_string functions must be installed to test the fallback functions"
			);
		}

		$subject = "Östergötland_coat_of_arms.png";

		// mb_substr: each entry is the argument list that follows the string
		$substrArgLists = [
			[ 0, 0 ],
			[ 5, -4 ],
			[ 33 ],
			[ 100, -5 ],
			[ -8, 10 ],
			[ 1, 1 ],
			[ 2, -1 ]
		];

		foreach ( $substrArgLists as $extraArgs ) {
			$callArgs = array_merge( [ $subject ], $extraArgs );
			$expected = call_user_func_array( 'mb_substr', $callArgs );
			$actual = call_user_func_array( 'Fallback::mb_substr', $callArgs );

			$this->assertEquals(
				$expected,
				$actual,
				'Fallback mb_substr with params ' . implode( ', ', $extraArgs )
			);
		}

		// mb_strlen
		$nativeLength = mb_strlen( $subject );
		$fallbackLength = Fallback::mb_strlen( $subject );
		$this->assertEquals( $nativeLength, $fallbackLength, 'Fallback mb_strlen' );

		// mb_str(r?)pos: every case is currently disabled, so the loop below
		// does not execute any assertion.
		$strposArgLists = [
			// array( 'ter' ),
			// array( 'Ö' ),
			// array( 'Ö', 3 ),
			// array( 'oat_', 100 ),
			// array( 'c', -10 ),
			// Broken for now
		];

		foreach ( $strposArgLists as $extraArgs ) {
			$callArgs = array_merge( [ $subject ], $extraArgs );
			$paramList = implode( ', ', $extraArgs );

			$this->assertEquals(
				call_user_func_array( 'mb_strpos', $callArgs ),
				call_user_func_array( 'Fallback::mb_strpos', $callArgs ),
				'Fallback mb_strpos with params ' . $paramList
			);

			$this->assertEquals(
				call_user_func_array( 'mb_strrpos', $callArgs ),
				call_user_func_array( 'Fallback::mb_strrpos', $callArgs ),
				'Fallback mb_strrpos with params ' . $paramList
			);
		}
	}
}