Fixed r61214: moved MySQL munging to SearchEngine, updated calls. Can we kill $doStrip now?

This commit is contained in:
Max Semenik 2010-01-22 20:36:26 +00:00
parent 42df7b3112
commit eff719b75d
5 changed files with 95 additions and 101 deletions

View file

@ -2366,18 +2366,6 @@ abstract class DatabaseBase {
return "SearchMySQL";
}
/**
 * Hook for database-specific preparation of search text.
 *
 * Subclasses may override this to convert text before it is searched
 * for or written to the search index. This base version performs no
 * conversion at all.
 *
 * @param $string string: Text to prepare
 * @return string The text exactly as it was passed in
 */
public function stripForSearch( $string ) {
	return $string;
}
/**
* Allow or deny "big selects" for this session only. This is done by setting
* the sql_big_selects session variable.

View file

@ -7,8 +7,6 @@
* @see Database
*/
class DatabaseMysql extends DatabaseBase {
static $mMinSearchLength;
/**
 * Name the database type this class handles.
 *
 * @return string Always 'mysql'
 */
function getType() {
	return 'mysql';
}
@ -368,84 +366,6 @@ class DatabaseMysql extends DatabaseBase {
/**
 * Send an "UNLOCK TABLES" statement through this connection.
 *
 * @param $method string: Passed on as the second argument to query()
 */
public function unlockTables( $method ) {
	$sql = "UNLOCK TABLES";
	$this->query( $sql, $method );
}
/**
 * Converts some characters for MySQL's indexing to grok it correctly,
 * and pads short words to overcome limitations.
 *
 * The text goes through three passes, in this order:
 *  - it is lower-cased with $wgContLang->lc() and every run of bytes
 *    starting at 0xC0-0xFF is replaced via stripForSearchCallback();
 *  - if minSearchLength() reports a value above 1, every word shorter
 *    than that value gets "u800" appended;
 *  - a period sitting between two word characters (or between a word
 *    character and "*") is replaced by "u82e".
 *
 * @param $string string: Text to convert
 * @return string
 */
function stripForSearch( $string ) {
	global $wgContLang;

	wfProfileIn( __METHOD__ );

	// MySQL fulltext index doesn't grok utf-8, so we
	// need to fold cases and convert to hex
	$out = preg_replace_callback(
		"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
		array( $this, 'stripForSearchCallback' ),
		$wgContLang->lc( $string ) );

	// And to add insult to injury, the default indexing
	// ignores short words... Pad them so we can pass them
	// through without reconfiguring the server...
	$minLength = $this->minSearchLength();
	if( $minLength > 1 ) {
		$n = $minLength - 1;
		$out = preg_replace(
			"/\b(\w{1,$n})\b/",
			"$1u800",
			$out );
	}

	// Periods within things like hostnames and IP addresses
	// are also important -- we want a search for "example.com"
	// or "192.168.1.1" to work sanely.
	//
	// MySQL's search seems to ignore them, so you'd match on
	// "example.wikipedia.com" and "192.168.83.1" as well.
	$dotted = preg_replace(
		"/(\w)\.(\w|\*)/u",
		"$1u82e$2",
		$out );

	// preg_replace() returns null on failure, and with the /u modifier a
	// subject that is not valid UTF-8 counts as a failure. Keep the text
	// we already have instead of handing null back to the caller.
	if( !is_null( $dotted ) ) {
		$out = $dotted;
	}

	wfProfileOut( __METHOD__ );
	return $out;
}
/**
 * Replacement callback used by stripForSearch().
 *
 * Turns the first captured group of a match into plain ASCII: the
 * marker "u8" followed by the hexadecimal form of the captured bytes,
 * so the text can pass through MySQL's fulltext search regardless of
 * charset settings.
 *
 * @param $matches array: Match array; element 1 holds the captured bytes
 * @return string
 */
protected function stripForSearchCallback( $matches ) {
	$hex = bin2hex( $matches[1] );
	return 'u8' . $hex;
}
/**
 * Report the MySQL server's ft_min_word_len setting, which decides
 * whether short words have to be padded.
 *
 * The server is asked only while self::$mMinSearchLength is still null;
 * after that the stored value is returned directly. If the server does
 * not report the variable, 0 is stored.
 *
 * @return int
 */
protected function minSearchLength() {
	if( !is_null( self::$mMinSearchLength ) ) {
		return self::$mMinSearchLength;
	}

	// The query is cheap, but it still goes to a slave so that the
	// master is not bothered with it.
	$dbr = wfGetDB( DB_SLAVE );
	$result = $dbr->query( "show global variables like 'ft\\_min\\_word\\_len'" );
	$row = $result->fetchObject();
	$result->free();

	$found = $row && $row->Variable_name == 'ft_min_word_len';
	self::$mMinSearchLength = $found ? intval( $row->Value ) : 0;

	return self::$mMinSearchLength;
}
public function setBigSelects( $value = true ) {
if ( $value === 'default' ) {

View file

@ -47,6 +47,18 @@ class SearchEngine {
return true;
}
/**
 * Hook for engine-specific preparation of search text.
 *
 * Subclasses may override this to convert text before it is searched
 * for or written to the search index. This base version performs no
 * conversion at all.
 *
 * @param $string string: Text to prepare
 * @return string The text exactly as it was passed in
 */
public function normalizeText( $string ) {
	return $string;
}
/**
* Transform search term in cases when parts of the query came as different GET params (when supported)
* e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive

View file

@ -28,6 +28,7 @@
*/
class SearchMySQL extends SearchEngine {
var $strictMatching = true;
static $mMinSearchLength;
/** @todo document */
function __construct( $db ) {
@ -91,6 +92,7 @@ class SearchMySQL extends SearchEngine {
if( count( $strippedVariants) > 1 )
$searchon .= '(';
foreach( $strippedVariants as $stripped ) {
$stripped = $this->normalizeText( $stripped );
if( $nonQuoted && strpos( $stripped, ' ' ) !== false ) {
// Hack for Chinese: we need to toss in quotes for
// multiple-character phrases since stripForSearch()
@ -292,8 +294,8 @@ class SearchMySQL extends SearchEngine {
array( 'si_page' ),
array(
'si_page' => $id,
'si_title' => $title,
'si_text' => $text
'si_title' => $this->normalizeText( $title ),
'si_text' => $this->normalizeText( $text )
), __METHOD__ );
}
@ -308,11 +310,88 @@ class SearchMySQL extends SearchEngine {
$dbw = wfGetDB( DB_MASTER );
$dbw->update( 'searchindex',
array( 'si_title' => $title ),
array( 'si_title' => $this->normalizeText( $title ) ),
array( 'si_page' => $id ),
__METHOD__,
array( $dbw->lowPriorityOption() ) );
}
/**
 * Converts some characters for MySQL's indexing to grok it correctly,
 * and pads short words to overcome limitations.
 *
 * The text goes through three passes, in this order:
 *  - it is lower-cased with $wgContLang->lc() and every run of bytes
 *    starting at 0xC0-0xFF is replaced via stripForSearchCallback();
 *  - if minSearchLength() reports a value above 1, every word shorter
 *    than that value gets "u800" appended;
 *  - a period sitting between two word characters (or between a word
 *    character and "*") is replaced by "u82e".
 *
 * @param $string string: Text to normalize
 * @return string
 */
function normalizeText( $string ) {
	global $wgContLang;

	wfProfileIn( __METHOD__ );

	// MySQL fulltext index doesn't grok utf-8, so we
	// need to fold cases and convert to hex
	$out = preg_replace_callback(
		"/([\\xc0-\\xff][\\x80-\\xbf]*)/",
		array( $this, 'stripForSearchCallback' ),
		$wgContLang->lc( $string ) );

	// And to add insult to injury, the default indexing
	// ignores short words... Pad them so we can pass them
	// through without reconfiguring the server...
	$minLength = $this->minSearchLength();
	if( $minLength > 1 ) {
		$n = $minLength - 1;
		$out = preg_replace(
			"/\b(\w{1,$n})\b/",
			"$1u800",
			$out );
	}

	// Periods within things like hostnames and IP addresses
	// are also important -- we want a search for "example.com"
	// or "192.168.1.1" to work sanely.
	//
	// MySQL's search seems to ignore them, so you'd match on
	// "example.wikipedia.com" and "192.168.83.1" as well.
	$dotted = preg_replace(
		"/(\w)\.(\w|\*)/u",
		"$1u82e$2",
		$out );

	// preg_replace() returns null on failure, and with the /u modifier a
	// subject that is not valid UTF-8 counts as a failure. Keep the text
	// we already have instead of handing null back to the caller.
	if( !is_null( $dotted ) ) {
		$out = $dotted;
	}

	wfProfileOut( __METHOD__ );
	return $out;
}
/**
 * Replacement callback used by normalizeText().
 *
 * Turns the first captured group of a match into plain ASCII: the
 * marker "u8" followed by the hexadecimal form of the captured bytes,
 * so the text can pass through MySQL's fulltext search regardless of
 * charset settings.
 *
 * @param $matches array: Match array; element 1 holds the captured bytes
 * @return string
 */
protected function stripForSearchCallback( $matches ) {
	$hex = bin2hex( $matches[1] );
	return 'u8' . $hex;
}
/**
 * Report the MySQL server's ft_min_word_len setting, which decides
 * whether short words have to be padded.
 *
 * The server is asked only while self::$mMinSearchLength is still null;
 * after that the stored value is returned directly. If the server does
 * not report the variable, 0 is stored.
 *
 * @return int
 */
protected function minSearchLength() {
	if( !is_null( self::$mMinSearchLength ) ) {
		return self::$mMinSearchLength;
	}

	// The lookup runs on the connection returned for DB_SLAVE.
	$dbr = wfGetDB( DB_SLAVE );
	$result = $dbr->query( "SHOW GLOBAL VARIABLES LIKE 'ft\\_min\\_word\\_len'" );
	$row = $result->fetchObject();
	$result->free();

	$found = $row && $row->Variable_name == 'ft_min_word_len';
	self::$mMinSearchLength = $found ? intval( $row->Value ) : 0;

	return self::$mMinSearchLength;
}
}
/**

View file

@ -1695,12 +1695,7 @@ class Language {
* @return String
*/
function stripForSearch( $string, $doStrip = true ) {
// Passing $doStrip = false hands $string back untouched.
if ( !$doStrip ) {
return $string;
}
// Otherwise the string is passed to stripForSearch() on the connection
// returned by wfGetDB( DB_SLAVE ), and that result is returned.
$dbr = wfGetDB( DB_SLAVE );
return $dbr->stripForSearch( $string );
// NOTE(review): the statement below can never run as written, because it
// follows an unconditional return. This section looks like a rendered diff
// whose +/- markers were lost, so old and new lines may be interleaved
// here -- TODO confirm against the real file before relying on this body.
return $string;
}
/**