wiki.techinc.nl/languages/LanguageUtf8.php

157 lines
4.8 KiB
PHP

<?php
#$Id$
if( defined( "MEDIAWIKI" ) ) {
# LanguageLatin1.php and this file can both be pulled in from inside a
# function body, so every variable touched below is declared global here.
global $wgInputEncoding, $wgOutputEncoding, $wikiUpperChars, $wikiLowerChars;
global $wgDBname, $wgMemc;

$wgInputEncoding = "UTF-8";
$wgOutputEncoding = "UTF-8";

if ( !function_exists( 'mb_internal_encoding' ) ) {
	# No mbstring available: fall back on our own case conversion tables.
	# They are fetched from the $wgMemc cache first, since loading
	# serialized arrays is faster than parsing the code that defines them.
	$key1 = "$wgDBname:utf8:upper";
	$key2 = "$wgDBname:utf8:lower";
	$wikiUpperChars = $wgMemc->get( $key1 );
	$wikiLowerChars = $wgMemc->get( $key2 );

	if ( empty( $wikiUpperChars ) || empty( $wikiLowerChars ) ) {
		# Either table is missing from the cache: load the definitions
		# from the include file and store both tables for the next request.
		require_once( "includes/Utf8Case.php" );
		$wgMemc->set( $key1, $wikiUpperChars );
		$wgMemc->set( $key2, $wikiLowerChars );
	}
} else {
	# mbstring is available: make it work in UTF-8.
	mb_internal_encoding( 'UTF-8' );
}
# Base stuff useful to all UTF-8 based language files.
#
# Case conversion uses the mbstring library when it is loaded, and the
# global $wikiUpperChars / $wikiLowerChars mapping arrays otherwise.
# Language-specific character mismatches should be dealt with in the
# individual Language classes.
class LanguageUtf8 extends Language {

	# Return $string with its first character converted to upper case.
	function ucfirst( $string ) {
		if ( function_exists( 'mb_strtoupper' ) ) {
			return mb_strtoupper( mb_substr( $string, 0, 1 ) ) . mb_substr( $string, 1 );
		}
		# A callback is used instead of the /e (eval) pattern modifier,
		# which was deprecated in PHP 5.5 and removed in PHP 7.0.
		return preg_replace_callback(
			"/^([a-z]|[\\xc0-\\xff][\\x80-\\xbf]*)/",
			array( $this, 'ucCallback' ),
			$string );
	}

	# Return $string with its first character converted to lower case.
	function lcfirst( $string ) {
		if ( function_exists( 'mb_strtolower' ) ) {
			return mb_strtolower( mb_substr( $string, 0, 1 ) ) . mb_substr( $string, 1 );
		}
		# See ucfirst() for why a callback is used here.
		return preg_replace_callback(
			"/^([A-Z]|[\\xc0-\\xff][\\x80-\\xbf]*)/",
			array( $this, 'lcCallback' ),
			$string );
	}

	# preg_replace_callback() helper for ucfirst(): maps the captured
	# character through the $wikiUpperChars table.
	# $matches is the match array; element 1 holds the captured character.
	function ucCallback( $matches ) {
		global $wikiUpperChars;
		return strtr( $matches[1], $wikiUpperChars );
	}

	# preg_replace_callback() helper for lcfirst() and stripForSearch():
	# maps the captured character through the $wikiLowerChars table.
	# $matches is the match array; element 1 holds the captured character.
	function lcCallback( $matches ) {
		global $wikiLowerChars;
		return strtr( $matches[1], $wikiLowerChars );
	}

	# preg_replace_callback() helper for stripForSearch(): replaces the
	# captured multibyte sequence with 'U8' followed by its hex dump.
	function stripForSearchCallback( $matches ) {
		return 'U8' . bin2hex( $matches[1] );
	}

	# Same as stripForSearchCallback(), but folds the captured sequence to
	# lower case through $wikiLowerChars first (used without mbstring).
	function stripForSearchLowerCallback( $matches ) {
		return 'U8' . bin2hex( $this->lcCallback( $matches ) );
	}

	# Prepare $string for the search index.
	#
	# MySQL fulltext index doesn't grok utf-8, so we need to fold cases
	# and convert every multibyte sequence to 'U8' + hex.
	# In Language:: it just returns lowercase, maybe all strtolower on
	# stripped output or argument should be removed and all
	# stripForSearch methods adjusted to that.
	function stripForSearch( $string ) {
		wfProfileIn( "LanguageUtf8::stripForSearch" );
		if ( function_exists( 'mb_strtolower' ) ) {
			# The whole string is lower-cased up front, so the callback
			# only has to hex-encode.
			$out = preg_replace_callback(
				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
				array( $this, 'stripForSearchCallback' ),
				mb_strtolower( $string ) );
		} else {
			# Without mbstring only the multibyte sequences are folded,
			# one at a time, inside the callback.
			$out = preg_replace_callback(
				"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
				array( $this, 'stripForSearchLowerCallback' ),
				$string );
		}
		wfProfileOut( "LanguageUtf8::stripForSearch" );
		return $out;
	}

	# Name of the 8-bit encoding assumed for incoming non-UTF-8 URLs.
	# Windows codepage 1252 is a superset of iso 8859-1; override this to
	# use a different source encoding to translate incoming 8-bit URLs.
	function fallback8bitEncoding() {
		return "windows-1252";
	}

	# Check for non-UTF-8 URLs.
	# Returns $s unchanged when it is pure ASCII or matches the UTF-8
	# pattern below; otherwise converts it from fallback8bitEncoding()
	# to UTF-8 through $this->iconv().
	function checkTitleEncoding( $s ) {
		$ishigh = preg_match( '/[\x80-\xff]/', $s );
		if ( !$ishigh ) {
			return $s;
		}
		$isutf8 = preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
			'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})+$/', $s );
		if ( $isutf8 ) {
			return $s;
		}
		return $this->iconv( $this->fallback8bitEncoding(), "utf-8", $s );
	}

	# Return the first complete UTF-8 character (1 to 4 bytes) of $s, or
	# the empty string when $s does not start with one.
	function firstChar( $s ) {
		preg_match( '/^([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|' .
			'[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3})/', $s, $matches );
		return isset( $matches[1] ) ? $matches[1] : "";
	}

	# Crop a string from the beginning or end to a certain number of bytes.
	# (Bytes are used because our storage has limited byte lengths for some
	# columns in the database.) Multibyte charsets will need to make sure that
	# only whole characters are included!
	#
	# $length does not include the optional ellipsis.
	# If $length is negative, snip from the beginning.
	# A string that already fits is returned as is, without the ellipsis.
	function truncate( $string, $length, $ellipsis = "" ) {
		if ( $length == 0 ) {
			return $ellipsis;
		}
		if ( strlen( $string ) <= abs( $length ) ) {
			return $string;
		}
		if ( $length > 0 ) {
			$string = substr( $string, 0, $length );
			$char = ord( $string[strlen( $string ) - 1] );
			$m = array();
			if ( $char >= 0xc0 ) {
				# We got the first byte only of a multibyte char; remove it.
				$string = substr( $string, 0, -1 );
			} elseif ( $char >= 0x80 &&
				# The 's' modifier lets '.' match newlines too; without it
				# a multi-line string never matched and kept the partial
				# character at its end.
				preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
					'[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $m ) ) {
				# We chopped in the middle of a character; remove it
				$string = $m[1];
			}
			return $string . $ellipsis;
		} else {
			$string = substr( $string, $length );
			$char = ord( $string[0] );
			if ( $char >= 0x80 && $char < 0xc0 ) {
				# We chopped in the middle of a character; remove the whole thing
				$string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
			}
			return $ellipsis . $string;
		}
	}
}
} # ifdef MEDIAWIKI
?>