Revert r40837, r40839, r40840 (bug 332 - broken UTF-8)
Char-by-char scan of all output will perform very poorly and fails to address the root problem of bad internal treatment of strings.
This commit is contained in:
parent
d153006396
commit
5b5f7b30b3
3 changed files with 0 additions and 83 deletions
|
|
@ -132,7 +132,6 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
|
|||
|
||||
=== Bug fixes in 1.14 ===
|
||||
|
||||
* (bug 332) Clean invalid UTF-8 to ensure output is RFC 3629 compliant
|
||||
* (bug 14907) DatabasePostgres::fieldType now defined.
|
||||
* (bug 14659) Passing the default limit param to Special:Recentchanges no more
|
||||
falls back to the user option
|
||||
|
|
|
|||
|
|
@ -901,8 +901,6 @@ class OutputPage {
|
|||
$this->addScriptFile( 'rightclickedit.js' );
|
||||
}
|
||||
|
||||
$this->mBodytext = StringUtils::cleanForCharset( $this->mBodytext, $wgOutputEncoding );
|
||||
|
||||
# Buffer output; final headers may depend on later processing
|
||||
ob_start();
|
||||
|
||||
|
|
|
|||
|
|
@ -179,86 +179,6 @@ class StringUtils {
|
|||
return new ArrayIterator( explode( $separator, $subject ) );
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Clean characters that are invalid in the given character set
 * from a given string.
 *
 * Only UTF-8 is actually cleaned; for any other character set the
 * string is returned untouched.
 *
 * @param $string \type{\string} String to clean
 * @param $charset \type{\string} Character set (if empty, $wgOutputEncoding is used)
 * @return \type{\string} Cleaned string
 */
public static function cleanForCharset( $string, $charset='' ) {
	global $wgOutputEncoding;

	$effective = $charset ? $charset : $wgOutputEncoding;

	# Charset labels are case-insensitive, so 'utf-8' must be treated
	# the same as 'UTF-8' instead of silently skipping the cleanup.
	if ( strcasecmp( $effective, 'UTF-8' ) === 0 ) {
		# UTF-8 should be all we need to worry about. :)
		return self::cleanUtf8( $string );
	}

	return $string;
}
|
||||
|
||||
/**
 * Clean invalid UTF-8 characters and sequences from a given string,
 * replacing them with U+FFFD.
 *
 * Each of the following is replaced by a single U+FFFD:
 *  - a stray continuation byte, or a 0xFE / 0xFF byte;
 *  - a lead byte together with the continuation bytes that follow it,
 *    when the sequence is cut short by a non-continuation byte or by
 *    the end of the string;
 *  - a complete sequence that is overlong, is 5 or 6 bytes long,
 *    decodes above U+10FFFF, decodes to a surrogate (U+D800..U+DFFF),
 *    or decodes to U+FFFE / U+FFFF.
 *
 * @param $str \type{\string} String to clean
 * @return \type{\string} Cleaned string
 */
private static function cleanUtf8( $str ) {
	# Fast path: pure 7-bit input cannot contain an invalid sequence.
	if ( !preg_match( '/[\x80-\xFF]/', $str ) ) {
		return $str;
	}

	$replacement = "\xEF\xBF\xBD"; # U+FFFD in UTF-8

	# Smallest code point that legitimately needs N continuation bytes;
	# anything below that is an overlong encoding.
	$minimum = array( 1 => 0x80, 2 => 0x800, 3 => 0x10000 );

	$len = strlen( $str );
	$out = '';
	$runStart = 0; # start of the valid run not yet copied to $out
	$i = 0;

	while ( $i < $len ) {
		$ch = ord( $str[$i] );
		if ( $ch < 0x80 ) {
			$i++;
			continue;
		}

		# Number of continuation bytes announced by the lead byte.
		# 0 means this byte cannot start a sequence at all.
		if ( ( $ch & 0xE0 ) == 0xC0 ) {
			$need = 1;
		} elseif ( ( $ch & 0xF0 ) == 0xE0 ) {
			$need = 2;
		} elseif ( ( $ch & 0xF8 ) == 0xF0 ) {
			$need = 3;
		} elseif ( ( $ch & 0xFC ) == 0xF8 ) {
			$need = 4;
		} elseif ( ( $ch & 0xFE ) == 0xFC ) {
			$need = 5;
		} else {
			$need = 0;
		}

		$seqStart = $i;
		# Payload bits of the lead byte (the length marker is masked off).
		$sum = $need ? ( $ch & ( 0xFF >> ( $need + 2 ) ) ) : 0;
		$i++;

		# Consume continuation bytes, stopping early at the first byte
		# that is not one so that it gets examined on its own next round.
		$got = 0;
		while ( $got < $need && $i < $len && ( ord( $str[$i] ) & 0xC0 ) == 0x80 ) {
			$sum = ( $sum << 6 ) | ( ord( $str[$i] ) & 0x3F );
			$i++;
			$got++;
		}

		$valid = $need >= 1 && $need <= 3      # RFC 3629 allows at most 4 bytes
			&& $got == $need                   # not interrupted or truncated
			&& $sum >= $minimum[$need]         # not overlong
			&& $sum <= 0x10FFFF
			&& ( $sum < 0xD800 || $sum > 0xDFFF ) # whole surrogate range
			# Noncharacters; rejecting them is a policy kept from the
			# previous implementation.
			&& $sum != 0xFFFE && $sum != 0xFFFF;

		if ( !$valid ) {
			# Flush the valid bytes seen so far, then substitute the bad span.
			$out .= substr( $str, $runStart, $seqStart - $runStart ) . $replacement;
			$runStart = $i;
		}
	}

	return $out . substr( $str, $runStart );
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Reference in a new issue