Revert r40837, r40839, r40840 (bug 332 - broken UTF-8)

A char-by-char scan of all output performs very poorly and fails to address the root problem: bad internal treatment of strings.
This commit is contained in:
Brion Vibber 2008-09-15 17:51:53 +00:00
parent d153006396
commit 5b5f7b30b3
3 changed files with 0 additions and 83 deletions

View file

@ -132,7 +132,6 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
=== Bug fixes in 1.14 ===
* (bug 332) Clean invalid UTF-8 to ensure output is RFC 3629 compliant
* (bug 14907) DatabasePostgres::fieldType now defined.
* (bug 14659) Passing the default limit param to Special:Recentchanges no longer
falls back to the user option

View file

@ -901,8 +901,6 @@ class OutputPage {
$this->addScriptFile( 'rightclickedit.js' );
}
$this->mBodytext = StringUtils::cleanForCharset( $this->mBodytext, $wgOutputEncoding );
# Buffer output; final headers may depend on later processing
ob_start();

View file

@ -179,86 +179,6 @@ class StringUtils {
return new ArrayIterator( explode( $separator, $subject ) );
}
}
/**
 * Strip byte sequences that are not valid in a character set from a string.
 *
 * Only UTF-8 has a cleaner; for any other character set the string is
 * returned untouched.
 *
 * @param $string \type{string} String to clean
 * @param $charset \type{string} Character set; when empty (the default),
 *   the global $wgOutputEncoding is used instead
 * @return \type{string} Cleaned string
 */
public static function cleanForCharset( $string, $charset='' ) {
	global $wgOutputEncoding;

	# An empty/falsy $charset means "use the configured output encoding".
	$encoding = $charset;
	if ( !$encoding ) {
		$encoding = $wgOutputEncoding;
	}

	# UTF-8 should be all we need to worry about. :)
	# Loose comparison on purpose: it matches what a switch statement does.
	if ( $encoding == 'UTF-8' ) {
		return self::cleanUtf8( $string );
	}

	return $string;
}
/**
 * Clean invalid UTF-8 characters and sequences from a given string,
 * replacing each of them with U+FFFD (REPLACEMENT CHARACTER).
 *
 * One U+FFFD is emitted for each of the following:
 *  - a stray continuation byte, or a 0xFE/0xFF byte, outside a sequence;
 *  - a multi-byte sequence cut short by a non-continuation byte or by the
 *    end of the string;
 *  - an overlong encoding, a 5- or 6-byte form, or a value above U+10FFFF;
 *  - any surrogate code point (U+D800..U+DFFF), as RFC 3629 requires;
 *  - U+FFFE and U+FFFF.
 *
 * The input is scanned once and the result is assembled from the untouched
 * runs of valid bytes, so the string is never rebuilt per replacement.
 *
 * @param $str \type{string} String to clean
 * @return \type{string} Cleaned string
 */
private static function cleanUtf8( $str ) {
	$replacement = "\xEF\xBF\xBD"; # U+FFFD encoded as UTF-8
	$len = strlen( $str );
	$out = '';
	$copyFrom = 0; # offset of the first input byte not yet copied to $out
	$seqStart = 0; # offset of the lead byte of the sequence being decoded
	$left = 0;     # continuation bytes still expected
	$bytes = 0;    # total length announced by the current lead byte
	$sum = 0;      # code point accumulated so far

	for ( $i = 0; $i < $len; $i++ ) {
		$ch = ord( $str[$i] );

		if ( !$left ) {
			if ( !( $ch & 0x80 ) ) {
				continue; # plain ASCII
			}

			# Number of continuation bytes announced by a lead byte.
			if ( ( $ch & 0xFE ) == 0xFC ) {
				$left = 5;
			} elseif ( ( $ch & 0xFC ) == 0xF8 ) {
				$left = 4;
			} elseif ( ( $ch & 0xF8 ) == 0xF0 ) {
				$left = 3;
			} elseif ( ( $ch & 0xF0 ) == 0xE0 ) {
				$left = 2;
			} elseif ( ( $ch & 0xE0 ) == 0xC0 ) {
				$left = 1;
			}

			if ( $left ) {
				$seqStart = $i;
				$bytes = $left + 1;
				# Keep only the payload bits of the lead byte.
				$sum = $ch & ( 0xFF >> ( $bytes + 1 ) );
				continue;
			}

			# Stray continuation byte, or 0xFE/0xFF: replace this one byte.
			$badStart = $i;
			$badEnd = $i + 1;
		} elseif ( ( $ch & 0xC0 ) == 0x80 ) {
			$sum = ( $sum << 6 ) + ( $ch & 0x3F );
			if ( --$left ) {
				continue;
			}

			$invalid =
				( $bytes == 2 && $sum < 0x80 ) ||      # overlong
				( $bytes == 3 && $sum < 0x800 ) ||     # overlong
				( $bytes == 4 && $sum < 0x10000 ) ||   # overlong
				$bytes > 4 || $sum > 0x10FFFF ||       # outside Unicode
				( $sum >= 0xD800 && $sum <= 0xDFFF ) || # surrogates, whole range
				$sum == 0xFFFE || $sum == 0xFFFF;
			if ( !$invalid ) {
				continue;
			}

			# Complete but illegal sequence: replace all of its bytes.
			$badStart = $seqStart;
			$badEnd = $i + 1;
		} else {
			# The sequence was cut short by a byte that is not a continuation.
			# Replace the partial sequence and re-examine the current byte as
			# the possible start of something new.
			$badStart = $seqStart;
			$badEnd = $i;
			$left = 0;
			$i--;
		}

		$out .= substr( $str, $copyFrom, $badStart - $copyFrom ) . $replacement;
		$copyFrom = $badEnd;
	}

	if ( $left ) {
		# The string ended in the middle of a sequence: replace the partial tail.
		return $out . substr( $str, $copyFrom, $seqStart - $copyFrom ) . $replacement;
	}

	if ( $out === '' ) {
		# Nothing was replaced; hand the input back untouched.
		return $str;
	}

	return $out . substr( $str, $copyFrom );
}
}
/**