Follow-up r61856
* Rename wordSegmentation() to segmentByWord(). * Consolidate search index locking and iteration to Maintenance.php * Add maintenance/updateDoubleWidthSearch.php to take care of new format for normalized double-width roman characters. * Add error checking to updateSearchIndex.php for creating $posFile. * Add note to UPGRADE about running updateDoubleWidthSearch.php.
This commit is contained in:
parent
b4c7ddf1ea
commit
54badce2d8
8 changed files with 198 additions and 82 deletions
11
UPGRADE
11
UPGRADE
|
|
@ -53,11 +53,19 @@ deleted file archives, and any custom skins.
|
|||
You will need to have $wgDBadminuser and $wgDBadminpass set in your
|
||||
LocalSettings.php, see there for more info.
|
||||
|
||||
From the command line, browse to the "maintenance" directory and run the
|
||||
From the command line, browse to the "maintenance" directory and run the
|
||||
update.php script to check and update the schema. This will insert missing
|
||||
tables, update existing tables, and move data around as needed. In most cases,
|
||||
this is successful and nothing further needs to be done.
|
||||
|
||||
If you have a Chinese or Japanese wiki ($wgLanguageCode is set to one
|
||||
of "zh", "ja", or "yue") and you are using MySQL fulltext search, you
|
||||
will probably want to update the search index.
|
||||
|
||||
In the "maintenance" directory, run the updateDoubleWidthSearch.php
|
||||
script. This will update the searchindex table for those pages that
|
||||
contain double-width (full-width) Latin characters.
|
||||
|
||||
=== Check configuration settings ===
|
||||
|
||||
The names of configuration variables, and their default values and purposes,
|
||||
|
|
@ -67,6 +75,7 @@ notes to check for configuration changes which would alter the expected
|
|||
behaviour of MediaWiki.
|
||||
|
||||
=== Check installed extensions ===
|
||||
|
||||
In MediaWiki 1.14 some extensions are migrated into the core. Please see the
|
||||
HISTORY section "Migrated extensions" and disable these extensions in your
|
||||
LocalSettings.php
|
||||
|
|
|
|||
|
|
@ -1695,7 +1695,7 @@ class Language {
|
|||
* @param $string String
|
||||
* @return String
|
||||
*/
|
||||
function wordSegmentation( $string ) {
|
||||
function segmentByWord( $string ) {
|
||||
return $string;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@
|
|||
* @ingroup Language
|
||||
*/
|
||||
class LanguageJa extends Language {
|
||||
function wordSegmentation( $string ) {
|
||||
function segmentByWord( $string ) {
|
||||
// Strip known punctuation ?
|
||||
// $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ class LanguageYue extends Language {
|
|||
* for now just treat each character as a word.
|
||||
* @todo Fixme: only do this for Han characters...
|
||||
*/
|
||||
function wordSegmentation( $string ) {
|
||||
function segmentByWord( $string ) {
|
||||
$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
|
||||
$s = self::insertSpace( $string, $reg );
|
||||
return $s;
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ class LanguageZh_hans extends Language {
|
|||
* for now just treat each character as a word.
|
||||
* @todo Fixme: only do this for Han characters...
|
||||
*/
|
||||
function wordSegmentation( $string ) {
|
||||
function segmentByWord( $string ) {
|
||||
$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
|
||||
$s = self::insertSpace( $string, $reg );
|
||||
return $s;
|
||||
|
|
@ -25,7 +25,7 @@ class LanguageZh_hans extends Language {
|
|||
// Double-width roman characters
|
||||
$s = self::convertDoubleWidth( $string );
|
||||
$s = trim( $s );
|
||||
$s = self::wordSegmentation( $s );
|
||||
$s = self::segmentByWord( $s );
|
||||
$s = parent::normalizeForSearch( $s );
|
||||
|
||||
wfProfileOut( __METHOD__ );
|
||||
|
|
|
|||
|
|
@ -844,4 +844,91 @@ abstract class Maintenance {
|
|||
}
|
||||
return self::$mCoreScripts;
|
||||
}
|
||||
|
||||
/**
 * Lock the tables needed for a search index update.
 *
 * 'searchindex' is requested as a write lock; 'page', 'revision', 'text',
 * 'interwiki' and 'l10n_cache' are requested as read locks.
 *
 * @param &$db Database object on which lockTables() is called
 */
private function lockSearchindex( &$db ) {
	$write = array( 'searchindex' );
	$read = array( 'page', 'revision', 'text', 'interwiki', 'l10n_cache' );
	// __METHOD__ already expands to "Class::method"; prefixing __CLASS__ as
	// well would report a doubled "Maintenance::Maintenance::..." caller name.
	$db->lockTables( $read, $write, __METHOD__ );
}
|
||||
|
||||
/**
 * Unlock the tables by calling unlockTables() on the given handle.
 *
 * @param &$db Database object on which unlockTables() is called
 */
private function unlockSearchindex( &$db ) {
	// __METHOD__ already expands to "Class::method"; prefixing __CLASS__ as
	// well would report a doubled "Maintenance::Maintenance::..." caller name.
	$db->unlockTables( __METHOD__ );
}
|
||||
|
||||
/**
 * Release the search index locks and immediately request them again.
 *
 * The gap between the two calls is what lets other work through: per the
 * original note, the lock is low-priority, so reads that queued up while it
 * was held are able to complete before it is re-acquired.
 *
 * @param &$db Database object passed on to the unlock and lock helpers
 */
private function relockSearchindex( &$db ) {
	$this->unlockSearchindex( $db );
	$this->lockSearchindex( $db );
}
|
||||
|
||||
/**
 * Run a callback over every row of a result set, optionally holding the
 * search index table locks while doing so.
 *
 * When $maxLockTime is truthy the locks are taken before the loop, released
 * and re-taken whenever more than $maxLockTime seconds have passed since they
 * were last acquired, and released again after the loop. When it is falsy no
 * locking is performed and the callback is simply applied to each row.
 *
 * @param $maxLockTime integer the maximum time (in seconds) to keep the search index locked
 * @param $callback callback invoked as callback( $dbw, $row ) for each row
 * @param $dbw Database the handle used for locking and handed to the callback
 * @param $results the rows to iterate over
 */
public function updateSearchIndex( $maxLockTime, $callback, $dbw, $results ) {
	if ( !$maxLockTime ) {
		// No locking requested: just process every row.
		foreach ( $results as $row ) {
			call_user_func( $callback, $dbw, $row );
		}
		return;
	}

	# Lock searchindex
	$this->output( " --- Waiting for lock ---" );
	$this->lockSearchindex( $dbw );
	$lockedSince = time();
	$this->output( "\n" );

	# Loop through the results and do a search update
	foreach ( $results as $row ) {
		# Allow reads to be processed
		$lockExpired = time() > $lockedSince + $maxLockTime;
		if ( $lockExpired ) {
			$this->output( " --- Relocking ---" );
			$this->relockSearchindex( $dbw );
			$lockedSince = time();
			$this->output( "\n" );
		}
		call_user_func( $callback, $dbw, $row );
	}

	# Unlock searchindex
	$this->output( " --- Unlocking --" );
	$this->unlockSearchindex( $dbw );
	$this->output( "\n" );
}
|
||||
|
||||
/**
 * Update the searchindex table for a given page ID, using the revision that
 * Revision::loadFromPageId() returns for it.
 *
 * @param $dbw Database a database write handle
 * @param $pageId the page ID to update.
 * @return the prefixed DB key of the page's title, or null when no revision
 *         could be loaded for $pageId (nothing is updated in that case)
 */
public function updateSearchIndexForPage( $dbw, $pageId ) {
	// Get current revision
	$revision = Revision::loadFromPageId( $dbw, $pageId );
	if ( !$revision ) {
		return null;
	}

	$pageTitle = $revision->getTitle();
	$prefixedKey = $pageTitle->getPrefixedDBkey();
	$this->output( "$prefixedKey..." );

	# Update searchindex
	$update = new SearchUpdate( $pageId, $pageTitle->getText(), $revision->getText() );
	$update->doUpdate();
	$this->output( "\n" );

	return $prefixedKey;
}
|
||||
|
||||
}
|
||||
|
|
|
|||
72
maintenance/updateDoubleWidthSearch.php
Normal file
72
maintenance/updateDoubleWidthSearch.php
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
<?php
|
||||
/**
|
||||
* Script to re-index pages whose search index entries contain double-width (full-width) Latin characters
|
||||
*
|
||||
* Usage: php updateDoubleWidthSearch.php
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @ingroup Maintenance
|
||||
*/
|
||||
|
||||
require_once( dirname(__FILE__) . '/Maintenance.php' );
|
||||
|
||||
/**
 * Maintenance script that finds searchindex rows matching the double-width
 * character pattern and rebuilds the search index entry of each such page.
 */
class UpdateDoubleWidthSearch extends Maintenance {

	/**
	 * Set the script description and register its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Script to normalize double-byte latin UTF-8 characters";
		// NOTE(review): 'q' is registered with the same trailing arguments as
		// the value-taking 'l' option, yet nothing in this class reads it.
		// Confirm whether it should be a plain flag or be dropped.
		$this->addOption( 'q', 'quiet', false, true );
		$this->addOption( 'l', 'How long the searchindex and revision tables will be locked for', false, true );
	}

	/**
	 * @return the database access level this script asks for (DB_ADMIN)
	 */
	public function getDbType() {
		return Maintenance::DB_ADMIN;
	}

	/**
	 * Entry point: bail out unless the database type is 'mysql', then
	 * re-index every page returned by findRows() under the shared
	 * lock-and-iterate helper.
	 */
	public function execute() {
		// Maximum time to hold the search index lock; defaults to 20.
		$maxLockTime = $this->getOption( 'l', 20 );

		$dbw = wfGetDB( DB_MASTER );
		if( $dbw->getType() !== 'mysql' ) {
			// Terminate the line so the shell prompt does not end up glued
			// to the message when the script exits.
			$this->output( "This change is only needed on MySQL, quitting...\n" );
			exit(1);
		}

		$res = $this->findRows($dbw);
		$this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);

		$this->output( "Done\n" );
	}

	/**
	 * Per-row callback for updateSearchIndex(): rebuild the index entry of
	 * the page named by the row's si_page column.
	 *
	 * @param $dbw Database a database write handle
	 * @param $row a result row carrying si_page
	 * @return whatever updateSearchIndexForPage() returns for that page
	 */
	public function searchIndexUpdateCallback($dbw, $row) {
		return $this->updateSearchIndexForPage( $dbw, $row->si_page );
	}

	/**
	 * Select the si_page of every searchindex row whose si_text or si_title
	 * matches the double-width pattern.
	 *
	 * @param $dbw Database handle to run the query on
	 * @return the query result
	 */
	private function findRows($dbw) {
		$searchindex = $dbw->tableName( 'searchindex' );
		// Whole-word match ([[:<:]] / [[:>:]] are MySQL word-boundary markers)
		// on "u8efbd" followed by a two-character suffix in the listed ranges.
		// NOTE(review): presumably this is how the index stores the UTF-8
		// bytes EF BD xx of full-width Latin letters — confirm.
		$regexp = '[[:<:]]u8efbd([89][1-9a]|8[b-f]|90)[[:>:]]';
		$sql = "SELECT si_page FROM $searchindex
                 WHERE ( si_text RLIKE '$regexp' )
                    OR ( si_title RLIKE '$regexp' )";
		return $dbw->query( $sql, __METHOD__ );
	}
}
|
||||
|
||||
$maintClass = "UpdateDoubleWidthSearch";
|
||||
require_once( DO_MAINTENANCE );
|
||||
|
|
@ -63,9 +63,18 @@ class UpdateSearchIndex extends Maintenance {
|
|||
$lockTime = $this->getOption( 'l', 20 );
|
||||
|
||||
$this->doUpdateSearchIndex( $start, $end, $lockTime );
|
||||
$file = fopen( $posFile, 'w' );
|
||||
fwrite( $file, $end );
|
||||
fclose( $file );
|
||||
if( is_writable( dirname( realpath( $posFile ) ) ) ) {
|
||||
$file = fopen( $posFile, 'w' );
|
||||
if( $file !== false ) {
|
||||
fwrite( $file, $end );
|
||||
fclose( $file );
|
||||
} else {
|
||||
echo posix_get_last_error();
|
||||
$this->output( "*** Couldn't write to the $posFile!" );
|
||||
}
|
||||
} else {
|
||||
$this->output( "*** Couldn't write to the $posFile!" );
|
||||
}
|
||||
}
|
||||
|
||||
private function doUpdateSearchIndex( $start, $end, $maxLockTime ) {
|
||||
|
|
@ -89,83 +98,22 @@ class UpdateSearchIndex extends Maintenance {
|
|||
";
|
||||
$res = $dbw->query( $sql, __METHOD__ );
|
||||
|
||||
$this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);
|
||||
|
||||
# Lock searchindex
|
||||
if ( $maxLockTime ) {
|
||||
$this->output( " --- Waiting for lock ---" );
|
||||
$this->lockSearchindex( $dbw );
|
||||
$lockTime = time();
|
||||
$this->output( "\n" );
|
||||
}
|
||||
|
||||
# Loop through the results and do a search update
|
||||
foreach ( $res as $row ) {
|
||||
# Allow reads to be processed
|
||||
if ( $maxLockTime && time() > $lockTime + $maxLockTime ) {
|
||||
$this->output( " --- Relocking ---" );
|
||||
$this->relockSearchindex( $dbw );
|
||||
$lockTime = time();
|
||||
$this->output( "\n" );
|
||||
}
|
||||
if ( $row->rc_type == RC_LOG ) {
|
||||
continue;
|
||||
} elseif ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
|
||||
# Rename searchindex entry
|
||||
$titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
|
||||
$title = $titleObj->getPrefixedDBkey();
|
||||
$this->output( "$title..." );
|
||||
$u = new SearchUpdate( $row->rc_cur_id, $title, false );
|
||||
$this->output( "\n" );
|
||||
} else {
|
||||
// Get current revision
|
||||
$rev = Revision::loadFromPageId( $dbw, $row->rc_cur_id );
|
||||
if( $rev ) {
|
||||
$titleObj = $rev->getTitle();
|
||||
$title = $titleObj->getPrefixedDBkey();
|
||||
$this->output( $title );
|
||||
# Update searchindex
|
||||
$u = new SearchUpdate( $row->rc_cur_id, $titleObj->getText(), $rev->getText() );
|
||||
$u->doUpdate();
|
||||
$this->output( "\n" );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Unlock searchindex
|
||||
if ( $maxLockTime ) {
|
||||
$this->output( " --- Unlocking --" );
|
||||
$this->unlockSearchindex( $dbw );
|
||||
$this->output( "\n" );
|
||||
}
|
||||
$this->output( "Done\n" );
|
||||
}
|
||||
|
||||
/**
|
||||
* Lock the search index
|
||||
* @param &$db Database object
|
||||
*/
|
||||
private function lockSearchindex( &$db ) {
|
||||
$write = array( 'searchindex' );
|
||||
$read = array( 'page', 'revision', 'text', 'interwiki' );
|
||||
$db->lockTables( $read, $write, 'updateSearchIndex.php ' . __METHOD__ );
|
||||
}
|
||||
|
||||
/**
|
||||
* Unlock the tables
|
||||
* @param &$db Database object
|
||||
*/
|
||||
private function unlockSearchindex( &$db ) {
|
||||
$db->unlockTables( 'updateSearchIndex.php ' . __METHOD__ );
|
||||
}
|
||||
|
||||
/**
|
||||
* Unlock and lock again
|
||||
* Since the lock is low-priority, queued reads will be able to complete
|
||||
* @param &$db Database object
|
||||
*/
|
||||
private function relockSearchindex( &$db ) {
|
||||
$this->unlockSearchindex( $db );
|
||||
$this->lockSearchindex( $db );
|
||||
/**
 * Per-row callback for updateSearchIndex(): apply one recent change to the
 * search index.
 *
 * Move entries rename the existing searchindex entry, log entries are
 * skipped, and every other change re-indexes the page named by rc_cur_id.
 *
 * @param $dbw Database a database write handle
 * @param $row a result row carrying rc_type, rc_cur_id and, for moves,
 *             rc_moved_to_ns / rc_moved_to_title
 */
public function searchIndexUpdateCallback($dbw, $row) {
	if ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
		# Rename searchindex entry
		$titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
		$title = $titleObj->getPrefixedDBkey();
		$this->output( "$title..." );
		$u = new SearchUpdate( $row->rc_cur_id, $title, false );
		// The update object has to be run explicitly, exactly as
		// updateSearchIndexForPage() does; constructing it and discarding
		// it would presumably leave the index entry untouched.
		$u->doUpdate();
		$this->output( "\n" );
	} elseif ( $row->rc_type != RC_LOG ) {
		// Loose comparison on purpose, matching the RC_MOVE checks above:
		// rc_type comes from a database row and may arrive as a numeric
		// string, in which case a strict !== against the constant would
		// always be true and log entries would never be skipped.
		$this->updateSearchIndexForPage( $dbw, $row->rc_cur_id );
	}
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue