Follow-up r61856

* Rename wordSegmentation() to segmentByWord().
* Consolidate search index locking and iteration to Maintenance.php
* Add maintenance/updateDoubleWidthSearch.php to take care of new
  format for normalized double-width roman characters.
* Add error checking to updateSearchIndex.php for creating $posFile.
* Add note to UPGRADE about running updateDoubleWidthSearch.php.
This commit is contained in:
Mark A. Hershberger 2010-03-10 21:54:23 +00:00
parent b4c7ddf1ea
commit 54badce2d8
8 changed files with 198 additions and 82 deletions

11
UPGRADE
View file

@ -53,11 +53,19 @@ deleted file archives, and any custom skins.
You will need to have $wgDBadminuser and $wgDBadminpass set in your
LocalSettings.php, see there for more info.
From the command line, browse to the "maintenance" directory and run the
From the command line, browse to the "maintenance" directory and run the
update.php script to check and update the schema. This will insert missing
tables, update existing tables, and move data around as needed. In most cases,
this is successful and nothing further needs to be done.
If you have a Chinese or Japanese wiki ($wgLanguageCode is set to one
of "zh", "ja", or "yue") and you are using MySQL fulltext search, you
will probably want to update the search index.
In the "maintenance" directory, run the updateDoubleWidthSearch.php
script. This will update the searchindex table for those pages that
contain double-width (full-width) Latin characters.
=== Check configuration settings ===
The names of configuration variables, and their default values and purposes,
@ -67,6 +75,7 @@ notes to check for configuration changes which would alter the expected
behaviour of MediaWiki.
=== Check installed extensions ===
In MediaWiki 1.14 some extensions are migrated into the core. Please see the
HISTORY section "Migrated extensions" and disable these extensions in your
LocalSettings.php

View file

@ -1695,7 +1695,7 @@ class Language {
* @param $string String
* @return String
*/
function wordSegmentation( $string ) {
function segmentByWord( $string ) {
return $string;
}

View file

@ -6,7 +6,7 @@
* @ingroup Language
*/
class LanguageJa extends Language {
function wordSegmentation( $string ) {
function segmentByWord( $string ) {
// Strip known punctuation ?
// $s = preg_replace( '/\xe3\x80[\x80-\xbf]/', '', $s ); # U3000-303f

View file

@ -12,7 +12,7 @@ class LanguageYue extends Language {
* for now just treat each character as a word.
* @todo Fixme: only do this for Han characters...
*/
function wordSegmentation( $string ) {
function segmentByWord( $string ) {
$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
$s = self::insertSpace( $string, $reg );
return $s;

View file

@ -13,7 +13,7 @@ class LanguageZh_hans extends Language {
* for now just treat each character as a word.
* @todo Fixme: only do this for Han characters...
*/
function wordSegmentation( $string ) {
function segmentByWord( $string ) {
$reg = "/([\\xc0-\\xff][\\x80-\\xbf]*)/";
$s = self::insertSpace( $string, $reg );
return $s;
@ -25,7 +25,7 @@ class LanguageZh_hans extends Language {
// Double-width roman characters
$s = self::convertDoubleWidth( $string );
$s = trim( $s );
$s = self::wordSegmentation( $s );
$s = self::segmentByWord( $s );
$s = parent::normalizeForSearch( $s );
wfProfileOut( __METHOD__ );

View file

@ -844,4 +844,91 @@ abstract class Maintenance {
}
return self::$mCoreScripts;
}
/**
 * Lock the search index.
 *
 * Takes a write lock on 'searchindex' and read locks on 'page', 'revision',
 * 'text', 'interwiki' and 'l10n_cache' through $db->lockTables().
 *
 * @param &$db Database object
 */
private function lockSearchindex( &$db ) {
	$write = array( 'searchindex' );
	$read = array( 'page', 'revision', 'text', 'interwiki', 'l10n_cache' );
	// __METHOD__ already expands to "Class::method"; prefixing __CLASS__ as well
	// produced the label "Maintenance::Maintenance::lockSearchindex".
	$db->lockTables( $read, $write, __METHOD__ );
}
/**
 * Unlock the tables by calling $db->unlockTables().
 *
 * @param &$db Database object
 */
private function unlockSearchindex( &$db ) {
	// __METHOD__ already expands to "Class::method"; prefixing __CLASS__ as well
	// produced the label "Maintenance::Maintenance::unlockSearchindex".
	$db->unlockTables( __METHOD__ );
}
/**
* Unlock and lock again
* Since the lock is low-priority, queued reads will be able to complete
* in the gap between the two calls.
* NOTE(review): the low-priority behaviour comes from $db->lockTables(),
* which is not visible here -- confirm against the Database class.
* @param &$db Database object
*/
private function relockSearchindex( &$db ) {
// Order matters: release first, then re-acquire.
$this->unlockSearchindex( $db );
$this->lockSearchindex( $db );
}
/**
 * Run a callback over every row of a result set, optionally holding the
 * search index lock while doing so.
 *
 * When $maxLockTime is truthy the tables are locked before the first row,
 * released and re-acquired whenever the lock has been held for more than
 * $maxLockTime seconds, and released after the last row. When it is falsy
 * no locking is done at all.
 *
 * @param $maxLockTime integer the maximum time (in seconds) to keep the
 *   search index locked before relocking; a falsy value disables locking.
 * @param $callback callback invoked as callback( $dbw, $row ) for each row.
 * @param $dbw Database the handle that is locked and handed to the callback.
 * @param $results the rows to iterate over (anything foreach can walk).
 */
public function updateSearchIndex( $maxLockTime, $callback, $dbw, $results ) {
	$useLock = (bool)$maxLockTime;
	$lockedAt = time();

	# Acquire the initial lock
	if ( $useLock ) {
		$this->output( " --- Waiting for lock ---" );
		$this->lockSearchindex( $dbw );
		$lockedAt = time();
		$this->output( "\n" );
	}

	# Process each row, cycling the lock once it has been held too long
	foreach ( $results as $row ) {
		$lockExpired = $useLock && time() > $lockedAt + $maxLockTime;
		if ( $lockExpired ) {
			# Give queued reads a chance to run
			$this->output( " --- Relocking ---" );
			$this->relockSearchindex( $dbw );
			$lockedAt = time();
			$this->output( "\n" );
		}
		call_user_func( $callback, $dbw, $row );
	}

	# Release the lock
	if ( $useLock ) {
		$this->output( " --- Unlocking --" );
		$this->unlockSearchindex( $dbw );
		$this->output( "\n" );
	}
}
/**
 * Update the searchindex table for a given pageid
 *
 * Loads the revision via Revision::loadFromPageId(), prints the page's
 * prefixed DB key, and runs a SearchUpdate built from the page ID, the
 * title text and the revision text.
 *
 * @param $dbw Database a database write handle
 * @param $pageId the page ID to update.
 * @return the prefixed DB key of the updated page, or null when no
 *   revision could be loaded for $pageId (nothing is updated in that case).
 */
public function updateSearchIndexForPage( $dbw, $pageId ) {
	// Get current revision
	$rev = Revision::loadFromPageId( $dbw, $pageId );
	if( !$rev ) {
		return null;
	}

	$titleObj = $rev->getTitle();
	$title = $titleObj->getPrefixedDBkey();
	$this->output( "$title..." );

	# Update searchindex
	$update = new SearchUpdate( $pageId, $titleObj->getText(), $rev->getText() );
	$update->doUpdate();
	$this->output( "\n" );

	return $title;
}
}

View file

@ -0,0 +1,72 @@
<?php
/**
* Script to normalize double-width (full-width) Latin UTF-8 characters
*
* Usage: php updateDoubleWidthSearch.php
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @ingroup Maintenance
*/
require_once( dirname(__FILE__) . '/Maintenance.php' );
/**
 * Maintenance script that re-runs the search index update for every page
 * whose searchindex row matches the pattern built in findRows().
 */
class UpdateDoubleWidthSearch extends Maintenance {

	/**
	 * Register the script description and its command line options.
	 */
	public function __construct() {
		parent::__construct();
		$this->mDescription = "Script to normalize double-byte latin UTF-8 characters";
		// NOTE(review): 'q' is registered with the same trailing arguments as the
		// value-taking 'l' option, and nothing in this class reads it -- confirm
		// whether it should be a plain flag or be dropped.
		$this->addOption( 'q', 'quiet', false, true );
		$this->addOption( 'l', 'How long the searchindex and revision tables will be locked for', false, true );
	}

	/**
	 * @return the database access level this script asks for (DB_ADMIN).
	 */
	public function getDbType() {
		return Maintenance::DB_ADMIN;
	}

	/**
	 * Entry point: find the affected searchindex rows and update each page.
	 *
	 * Exits with status 1 when the master database is not MySQL.
	 */
	public function execute() {
		// The lock duration defaults to 20 when -l is not given.
		$maxLockTime = $this->getOption( 'l', 20 );

		$dbw = wfGetDB( DB_MASTER );
		if( $dbw->getType() !== 'mysql' ) {
			// End the message with a newline so the shell prompt is not glued to it.
			$this->output( "This change is only needed on MySQL, quitting...\n" );
			exit(1);
		}

		$res = $this->findRows($dbw);
		$this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);

		$this->output( "Done\n" );
	}

	/**
	 * Per-row callback handed to updateSearchIndex().
	 *
	 * @param $dbw Database a database write handle
	 * @param $row a result row carrying si_page
	 * @return whatever updateSearchIndexForPage() returns for that page
	 */
	public function searchIndexUpdateCallback($dbw, $row) {
		return $this->updateSearchIndexForPage( $dbw, $row->si_page );
	}

	/**
	 * Select the si_page of every searchindex row whose si_text or si_title
	 * matches the pattern below.
	 *
	 * The pattern matches a whole word made of "u8efbd" followed by a hex pair
	 * in the range 81-9a. NOTE(review): presumably this is the old index
	 * encoding of full-width a-z (UTF-8 bytes EF BD 81..9A) -- confirm.
	 *
	 * @param $dbw Database a database write handle
	 * @return the result of $dbw->query()
	 */
	private function findRows($dbw) {
		$searchindex = $dbw->tableName( 'searchindex' );
		// $regexp is a fixed literal, so interpolating it into the SQL is safe.
		$regexp = '[[:<:]]u8efbd([89][1-9a]|8[b-f]|90)[[:>:]]';
		$sql = "SELECT si_page FROM $searchindex
WHERE ( si_text RLIKE '$regexp' )
OR ( si_title RLIKE '$regexp' )";
		return $dbw->query( $sql, __METHOD__ );
	}
}
// Name the class defined in this file, then include the file named by the
// DO_MAINTENANCE constant.
// NOTE(review): presumably that file instantiates $maintClass and calls
// execute(); it is defined outside this script -- confirm.
$maintClass = "UpdateDoubleWidthSearch";
require_once( DO_MAINTENANCE );

View file

@ -63,9 +63,18 @@ class UpdateSearchIndex extends Maintenance {
$lockTime = $this->getOption( 'l', 20 );
$this->doUpdateSearchIndex( $start, $end, $lockTime );
$file = fopen( $posFile, 'w' );
fwrite( $file, $end );
fclose( $file );
if( is_writable( dirname( realpath( $posFile ) ) ) ) {
$file = fopen( $posFile, 'w' );
if( $file !== false ) {
fwrite( $file, $end );
fclose( $file );
} else {
echo posix_get_last_error();
$this->output( "*** Couldn't write to the $posFile!" );
}
} else {
$this->output( "*** Couldn't write to the $posFile!" );
}
}
private function doUpdateSearchIndex( $start, $end, $maxLockTime ) {
@ -89,83 +98,22 @@ class UpdateSearchIndex extends Maintenance {
";
$res = $dbw->query( $sql, __METHOD__ );
$this->updateSearchIndex($maxLockTime, array($this, 'searchIndexUpdateCallback'), $dbw, $res);
# Lock searchindex
if ( $maxLockTime ) {
$this->output( " --- Waiting for lock ---" );
$this->lockSearchindex( $dbw );
$lockTime = time();
$this->output( "\n" );
}
# Loop through the results and do a search update
foreach ( $res as $row ) {
# Allow reads to be processed
if ( $maxLockTime && time() > $lockTime + $maxLockTime ) {
$this->output( " --- Relocking ---" );
$this->relockSearchindex( $dbw );
$lockTime = time();
$this->output( "\n" );
}
if ( $row->rc_type == RC_LOG ) {
continue;
} elseif ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
# Rename searchindex entry
$titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
$title = $titleObj->getPrefixedDBkey();
$this->output( "$title..." );
$u = new SearchUpdate( $row->rc_cur_id, $title, false );
$this->output( "\n" );
} else {
// Get current revision
$rev = Revision::loadFromPageId( $dbw, $row->rc_cur_id );
if( $rev ) {
$titleObj = $rev->getTitle();
$title = $titleObj->getPrefixedDBkey();
$this->output( $title );
# Update searchindex
$u = new SearchUpdate( $row->rc_cur_id, $titleObj->getText(), $rev->getText() );
$u->doUpdate();
$this->output( "\n" );
}
}
}
# Unlock searchindex
if ( $maxLockTime ) {
$this->output( " --- Unlocking --" );
$this->unlockSearchindex( $dbw );
$this->output( "\n" );
}
$this->output( "Done\n" );
}
/**
* Lock the search index
* @param &$db Database object
*/
private function lockSearchindex( &$db ) {
$write = array( 'searchindex' );
$read = array( 'page', 'revision', 'text', 'interwiki' );
$db->lockTables( $read, $write, 'updateSearchIndex.php ' . __METHOD__ );
}
/**
* Unlock the tables
* @param &$db Database object
*/
private function unlockSearchindex( &$db ) {
$db->unlockTables( 'updateSearchIndex.php ' . __METHOD__ );
}
/**
* Unlock and lock again
* Since the lock is low-priority, queued reads will be able to complete
* @param &$db Database object
*/
private function relockSearchindex( &$db ) {
$this->unlockSearchindex( $db );
$this->lockSearchindex( $db );
/**
 * Per-row callback handed to updateSearchIndex().
 *
 * Log entries are skipped, page moves rename the searchindex entry, and
 * every other row re-indexes the page named by rc_cur_id.
 *
 * @param $dbw Database a database write handle
 * @param $row a result row carrying rc_type, rc_cur_id and, for moves,
 *   rc_moved_to_ns and rc_moved_to_title
 */
public function searchIndexUpdateCallback($dbw, $row) {
	// Loose comparison on purpose: a result row may carry rc_type as a string,
	// in which case "!== RC_LOG" is always true and log rows are never skipped.
	if ( $row->rc_type == RC_LOG ) {
		return;
	}

	if ( $row->rc_type == RC_MOVE || $row->rc_type == RC_MOVE_OVER_REDIRECT ) {
		# Rename searchindex entry
		$titleObj = Title::makeTitle( $row->rc_moved_to_ns, $row->rc_moved_to_title );
		$title = $titleObj->getPrefixedDBkey();
		$this->output( "$title..." );
		$u = new SearchUpdate( $row->rc_cur_id, $title, false );
		// Without this call the update object was built and discarded, so the
		// rename never happened.
		$u->doUpdate();
		$this->output( "\n" );
	} else {
		$this->updateSearchIndexForPage( $dbw, $row->rc_cur_id );
	}
}
}