2004-02-18 02:15:00 +00:00
|
|
|
<?php
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
/**
|
|
|
|
|
* @defgroup Search Search
|
|
|
|
|
*
|
|
|
|
|
* @file
|
|
|
|
|
* @ingroup Search
|
|
|
|
|
*/
|
|
|
|
|
|
2004-10-24 19:14:48 +00:00
|
|
|
/**
|
|
|
|
|
 * Contain the generic site search engine class; SearchEngine::create() picks the backend-specific class to instantiate
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
* @ingroup Search
|
2004-10-24 19:14:48 +00:00
|
|
|
*/
|
2003-04-14 23:10:40 +00:00
|
|
|
class SearchEngine {
	/** Maximum number of results to return (see setLimitOffset()). */
	public $limit = 10;
	/** Number of results to skip before the first returned one. */
	public $offset = 0;
	/** Parsed search terms; not written by any method of this class. */
	public $searchTerms = array();
	/** Namespace index numbers to search; replacePrefixes() sets this to null for "search everything". */
	public $namespaces = array( NS_MAIN );
	/** Not read or written by any method of this class. */
	public $showRedirects = false;

	/**
	 * Perform a full text search query and return a result set.
	 * If full text searches are not supported or disabled, return null.
	 * This implementation always returns null; it is marked as abstract.
	 *
	 * @param string $term - Raw search term
	 * @return SearchResultSet
	 * @access public
	 * @abstract
	 */
	public function searchText( $term ) {
		return null;
	}

	/**
	 * Perform a title-only search query and return a result set.
	 * If title searches are not supported or disabled, return null.
	 * This implementation always returns null; it is marked as abstract.
	 *
	 * @param string $term - Raw search term
	 * @return SearchResultSet
	 * @access public
	 * @abstract
	 */
	public function searchTitle( $term ) {
		return null;
	}

	/**
	 * Build a title from the given text and return it only if the page exists.
	 *
	 * Title::newFromText() is treated as nullable here, so the result is
	 * checked before exists() is called on it.
	 *
	 * @param string $text
	 * @return Title the existing title, or null if invalid or nonexistent
	 */
	private static function existingTitleFromText( $text ) {
		$title = Title::newFromText( $text );
		if ( $title && $title->exists() ) {
			return $title;
		}
		return null;
	}

	/**
	 * If an exact title match can be found, or a very slightly close match,
	 * return the title. If no match, returns NULL.
	 *
	 * For the term and each of its language variants, the following are tried
	 * in order: the exact text, all lower case, capitalized words, all upper
	 * case, word caps breaking at word breaks and, when $wgCapitalLinks is
	 * off, first letter upper/lower case. Then the 'SearchGetNearMatch' hook
	 * is run. After that, special cases are handled for the raw term: IP
	 * addresses, user pages, existing files, MediaWiki: messages and quoted
	 * terms.
	 *
	 * @param string $searchterm
	 * @return Title
	 */
	public static function getNearMatch( $searchterm ) {
		global $wgContLang, $wgCapitalLinks;

		$allSearchTerms = array( $searchterm );
		if ( $wgContLang->hasVariants() ) {
			$allSearchTerms = array_merge(
				$allSearchTerms,
				$wgContLang->convertLinkToAllVariants( $searchterm )
			);
		}

		# Case transformations (methods of $wgContLang) tried in this order:
		# all lower case (i.e. first letter capitalized), capitalized string,
		# all upper case, Word-Caps-Breaking-At-Word-Breaks for hyphenated names.
		$caseMethods = array( 'lc', 'ucwords', 'uc', 'ucwordbreaks' );
		if ( !$wgCapitalLinks ) {
			// Catch differs-by-first-letter-case-only
			$caseMethods[] = 'ucfirst';
			$caseMethods[] = 'lcfirst';
		}

		foreach ( $allSearchTerms as $term ) {
			# Exact match? No need to look further.
			$title = Title::newFromText( $term );
			if ( is_null( $title ) ) {
				return null;
			}
			if ( $title->getNamespace() == NS_SPECIAL || $title->exists() ) {
				return $title;
			}

			foreach ( $caseMethods as $method ) {
				$title = self::existingTitleFromText( $wgContLang->$method( $term ) );
				if ( $title ) {
					return $title;
				}
			}

			// Give hooks a chance at better match variants
			$title = null;
			if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
				return $title;
			}
		}

		// Cannot be null here: the loop above returns early when the raw
		// term (always the first entry of $allSearchTerms) is not a valid title.
		$title = Title::newFromText( $searchterm );

		# Entering an IP address goes to the contributions page
		if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
			|| User::isIP( trim( $searchterm ) ) ) {
			return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
		}

		# Entering a user goes to the user page whether it's there or not
		if ( $title->getNamespace() == NS_USER ) {
			return $title;
		}

		# Go to images that exist even if there's no local page.
		# There may have been a funny upload, or it may be on a shared
		# file repository such as Wikimedia Commons.
		if ( $title->getNamespace() == NS_IMAGE ) {
			$image = wfFindFile( $title );
			if ( $image ) {
				return $title;
			}
		}

		# MediaWiki namespace? Page may be "implied" if not customized.
		# Just return it, with caps forced as the message system likes it.
		if ( $title->getNamespace() == NS_MEDIAWIKI ) {
			return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
		}

		# Quoted term? Try without the quotes...
		$matches = array();
		if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
			return SearchEngine::getNearMatch( $matches[1] );
		}

		return null;
	}

	/**
	 * Characters considered legal in a search term, as the inside of a
	 * regular expression character class (see filter()).
	 *
	 * @return string
	 */
	public static function legalSearchChars() {
		return "A-Za-z_'0-9\\x80-\\xFF\\-";
	}

	/**
	 * Set the maximum number of results to return
	 * and how many to skip before returning the first.
	 * Both values are converted with intval().
	 *
	 * @param int $limit
	 * @param int $offset
	 * @access public
	 */
	public function setLimitOffset( $limit, $offset = 0 ) {
		$this->limit = intval( $limit );
		$this->offset = intval( $offset );
	}

	/**
	 * Set which namespaces the search should include.
	 * Give an array of namespace index numbers.
	 *
	 * @param array $namespaces
	 * @access public
	 */
	public function setNamespaces( $namespaces ) {
		$this->namespaces = $namespaces;
	}

	/**
	 * Parse some common prefixes: all (search everything)
	 * or namespace names.
	 *
	 * Side effect: a recognized prefix changes $this->namespaces (null for
	 * the "all" keyword, a single index for a namespace name), even when the
	 * prefix turns out to be the whole query and the query is returned as is.
	 *
	 * @param string $query
	 * @return string the query with a recognized prefix stripped, or the
	 *   unchanged query if there is no recognized prefix or nothing follows it
	 */
	public function replacePrefixes( $query ) {
		global $wgContLang;

		$colonPos = strpos( $query, ':' );
		if ( $colonPos === false ) {
			return $query; // nothing to do
		}

		$parsed = $query;
		$allkeyword = wfMsgForContent( 'searchall' ) . ":";
		if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) === 0 ) {
			$this->namespaces = null;
			$parsed = substr( $query, strlen( $allkeyword ) );
		} else {
			// A colon is known to be present at this point
			$prefix = substr( $query, 0, $colonPos );
			$index = $wgContLang->getNsIndex( $prefix );
			if ( $index !== false ) {
				$this->namespaces = array( $index );
				$parsed = substr( $query, $colonPos + 1 );
			}
		}

		if ( trim( $parsed ) == '' ) {
			return $query; // prefix was the whole query
		}

		return $parsed;
	}

	/**
	 * Make a list of searchable namespaces and their canonical names.
	 * Only namespaces with an index of NS_MAIN or above are included.
	 *
	 * @return array namespace index => name
	 */
	public static function searchableNamespaces() {
		global $wgContLang;

		$arr = array();
		foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
			if ( $ns >= NS_MAIN ) {
				$arr[$ns] = $name;
			}
		}
		return $arr;
	}

	/**
	 * Extract default namespaces to search from the given user's
	 * settings, returning a list of index numbers.
	 * A namespace is included when the user's 'searchNs<index>' option is set.
	 *
	 * @param User $user
	 * @return array
	 * @static
	 */
	public static function userNamespaces( &$user ) {
		$arr = array();
		foreach ( SearchEngine::searchableNamespaces() as $ns => $name ) {
			if ( $user->getOption( 'searchNs' . $ns ) ) {
				$arr[] = $ns;
			}
		}
		return $arr;
	}

	/**
	 * Find snippet highlight settings for a given user.
	 *
	 * The values are currently hardcoded; the user's 'contextlines' and
	 * 'contextchars' options are deliberately not consulted, so $user is
	 * unused.
	 *
	 * @param User $user
	 * @return array contextlines, contextchars
	 * @static
	 */
	public static function userHighlightPrefs( &$user ) {
		$contextlines = 2; // Hardcode this. Old defaults sucked. :)
		$contextchars = 75; // same as above.... :P
		return array( $contextlines, $contextchars );
	}

	/**
	 * An array of namespaces indexes to be searched by default:
	 * the keys of $wgNamespacesToBeSearchedDefault whose value
	 * loosely equals true.
	 *
	 * @return array
	 * @static
	 */
	public static function defaultNamespaces() {
		global $wgNamespacesToBeSearchedDefault;

		return array_keys( $wgNamespacesToBeSearchedDefault, true );
	}

	/**
	 * Return a 'cleaned up' search string: every character that is not in
	 * legalSearchChars() is replaced by a space and the result is trimmed.
	 *
	 * @param string $text
	 * @return string
	 * @access public
	 */
	public function filter( $text ) {
		$lc = $this->legalSearchChars();
		return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
	}

	/**
	 * Load up the appropriate search engine class for the currently
	 * active database backend, and return a configured instance.
	 *
	 * $wgSearchType, when set, overrides the choice based on $wgDBtype.
	 * The instance is built on a DB_SLAVE connection and has its limit
	 * and offset both reset to 0.
	 *
	 * @return SearchEngine
	 */
	public static function create() {
		global $wgDBtype, $wgSearchType;

		if ( $wgSearchType ) {
			$class = $wgSearchType;
		} elseif ( $wgDBtype == 'mysql' ) {
			$class = 'SearchMySQL';
		} elseif ( $wgDBtype == 'postgres' ) {
			$class = 'SearchPostgres';
		} elseif ( $wgDBtype == 'oracle' ) {
			$class = 'SearchOracle';
		} else {
			$class = 'SearchEngineDummy';
		}

		$search = new $class( wfGetDB( DB_SLAVE ) );
		$search->setLimitOffset( 0, 0 );
		return $search;
	}

	/**
	 * Create or update the search index record for the given page.
	 * Title and text should be pre-processed.
	 * This implementation does nothing.
	 *
	 * @param int $id
	 * @param string $title
	 * @param string $text
	 * @abstract
	 */
	public function update( $id, $title, $text ) {
		// no-op
	}

	/**
	 * Update a search index record's title only.
	 * Title should be pre-processed.
	 * This implementation does nothing.
	 *
	 * @param int $id
	 * @param string $title
	 * @abstract
	 */
	public function updateTitle( $id, $title ) {
		// no-op
	}

	/**
	 * Get OpenSearch suggestion template.
	 *
	 * Returns $wgOpenSearchTemplate when set; otherwise builds an api.php
	 * URL limited to the default namespaces (namespace 0 if there are none).
	 *
	 * @return string
	 * @static
	 */
	public static function getOpenSearchTemplate() {
		global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;

		if ( $wgOpenSearchTemplate ) {
			return $wgOpenSearchTemplate;
		}

		$ns = implode( ',', SearchEngine::defaultNamespaces() );
		if ( !$ns ) {
			$ns = "0";
		}
		return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $ns;
	}

	/**
	 * Get internal MediaWiki Suggest template.
	 *
	 * Returns $wgMWSuggestTemplate when set; otherwise builds an api.php
	 * URL with a {namespaces} placeholder.
	 *
	 * @return string
	 * @static
	 */
	public static function getMWSuggestTemplate() {
		global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;

		if ( $wgMWSuggestTemplate ) {
			return $wgMWSuggestTemplate;
		}
		return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}';
	}
}
|
|
|
|
|
|
2007-04-20 08:55:14 +00:00
|
|
|
/**
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
* @ingroup Search
|
2007-04-20 08:55:14 +00:00
|
|
|
*/
|
2005-05-23 08:42:20 +00:00
|
|
|
class SearchResultSet {
	/**
	 * Fetch an array of regular expression fragments for matching
	 * the search terms as parsed by this engine in a text extract.
	 * This implementation returns an empty array.
	 *
	 * @return array
	 * @access public
	 * @abstract
	 */
	public function termMatches() {
		return array();
	}

	/**
	 * Number of rows in this result set. This implementation returns 0.
	 *
	 * @return int
	 */
	public function numRows() {
		return 0;
	}

	/**
	 * Return true if results are included in this result set.
	 * This implementation returns false.
	 *
	 * @return bool
	 * @abstract
	 */
	public function hasResults() {
		return false;
	}

	/**
	 * Some search modes return a total hit count for the query
	 * in the entire article database. This may include pages
	 * in namespaces that would not be matched on the given
	 * settings.
	 *
	 * Return null if no total hits number is supported.
	 *
	 * @return int
	 * @access public
	 */
	public function getTotalHits() {
		return null;
	}

	/**
	 * Some search modes return a suggested alternate term if there are
	 * no exact hits. Returns true if there is one on this set.
	 *
	 * @return bool
	 * @access public
	 */
	public function hasSuggestion() {
		return false;
	}

	/**
	 * @return string suggested query, null if none
	 */
	public function getSuggestionQuery() {
		return null;
	}

	/**
	 * @return string highlighted suggested query, '' if none
	 */
	public function getSuggestionSnippet() {
		return '';
	}

	/**
	 * Return information about how and from where the results were fetched,
	 * should be useful for diagnostics and debugging.
	 * This implementation returns null.
	 *
	 * @return string
	 */
	public function getInfo() {
		return null;
	}

	/**
	 * Return a result set of hits on other (multiple) wikis associated with this one.
	 * This implementation returns null.
	 *
	 * @return SearchResultSet
	 */
	public function getInterwikiResults() {
		return null;
	}

	/**
	 * Check if there are results on other wikis.
	 *
	 * Uses a loose comparison against null, so any value of
	 * getInterwikiResults() that loosely equals null counts as "no results".
	 *
	 * @return boolean
	 */
	public function hasInterwikiResults() {
		return $this->getInterwikiResults() != null;
	}

	/**
	 * Fetches next search result, or false.
	 * This implementation always returns false.
	 *
	 * @return SearchResult
	 * @access public
	 * @abstract
	 */
	public function next() {
		return false;
	}

	/**
	 * Frees the result set, if applicable.
	 * This implementation does nothing.
	 *
	 * @access public
	 */
	public function free() {
		// ...
	}
}
|
|
|
|
|
|
2007-04-20 08:55:14 +00:00
|
|
|
|
2008-02-17 14:11:55 +00:00
|
|
|
/**
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
* @ingroup Search
|
2008-02-17 14:11:55 +00:00
|
|
|
*/
|
|
|
|
|
class SearchResultTooMany {
	/**
	 * Some search engines may bail out if too many matches are found.
	 * This class has no members of its own.
	 */
}
|
|
|
|
|
|
|
|
|
|
|
2007-04-20 08:55:14 +00:00
|
|
|
/**
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
* @ingroup Search
|
2007-04-20 08:55:14 +00:00
|
|
|
*/
|
2005-05-23 08:42:20 +00:00
|
|
|
class SearchResult {
	/** Revision loaded for the title by the constructor; null when none was loaded. */
	var $mRevision = null;
	/** Title built from the result row by the constructor. */
	var $mTitle = null;
	/** Revision text, loaded lazily by initText(); null until then. */
	var $mText = null;

	/**
	 * Build the result from a database row.
	 *
	 * @param object $row row with page_namespace and page_title fields
	 */
	function __construct( $row ) {
		$this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
		if ( !is_null( $this->mTitle ) ) {
			$this->mRevision = Revision::newFromTitle( $this->mTitle );
		}
	}

	/**
	 * Old-style constructor name, kept as a plain method so that existing
	 * explicit calls to it keep working. It is declared after __construct()
	 * so that it is never itself treated as the constructor.
	 *
	 * @param object $row row with page_namespace and page_title fields
	 */
	function SearchResult( $row ) {
		$this->__construct( $row );
	}

	/**
	 * Check if this result points to an invalid title
	 *
	 * @return boolean
	 * @access public
	 */
	function isBrokenTitle() {
		return is_null( $this->mTitle );
	}

	/**
	 * Check if target page is missing, happens when index is out of date
	 *
	 * @return boolean
	 * @access public
	 */
	function isMissingRevision() {
		return !$this->mRevision;
	}

	/**
	 * @return Title
	 * @access public
	 */
	function getTitle() {
		return $this->mTitle;
	}

	/**
	 * @return double or null if not supported
	 */
	function getScore() {
		return null;
	}

	/**
	 * Lazy initialization of article text from DB.
	 * When there is no revision (see isMissingRevision()) the text
	 * is set to the empty string instead of dereferencing null.
	 */
	protected function initText() {
		if ( !isset( $this->mText ) ) {
			if ( $this->mRevision ) {
				$this->mText = $this->mRevision->getText();
			} else {
				$this->mText = '';
			}
		}
	}

	/**
	 * Highlights with SearchHighlighter::highlightText() when
	 * $wgAdvancedSearchHighlighting is set, otherwise with highlightSimple().
	 *
	 * @param array $terms terms to highlight
	 * @return string highlighted text snippet, null (and not '') if not supported
	 */
	function getTextSnippet( $terms ) {
		global $wgUser, $wgAdvancedSearchHighlighting;

		$this->initText();
		list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
		$h = new SearchHighlighter();
		if ( $wgAdvancedSearchHighlighting ) {
			return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
		}
		return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
	}

	/**
	 * @param array $terms terms to highlight
	 * @return string highlighted title, '' if not supported
	 */
	function getTitleSnippet( $terms ) {
		return '';
	}

	/**
	 * @param array $terms terms to highlight
	 * @return string highlighted redirect name (redirect to this page), '' if none or not supported
	 */
	function getRedirectSnippet( $terms ) {
		return '';
	}

	/**
	 * @return Title object for the redirect to this page, null if none or not supported
	 */
	function getRedirectTitle() {
		return null;
	}

	/**
	 * @return string highlighted relevant section name, '' if none or not supported
	 */
	function getSectionSnippet() {
		return '';
	}

	/**
	 * @return Title object (pagename+fragment) for the section, null if none or not supported
	 */
	function getSectionTitle() {
		return null;
	}

	/**
	 * @return string timestamp of the revision, '' if there is no revision
	 */
	function getTimestamp() {
		if ( $this->mRevision ) {
			return $this->mRevision->getTimestamp();
		}
		return '';
	}

	/**
	 * @return int number of words, as counted by str_word_count()
	 */
	function getWordCount() {
		$this->initText();
		return str_word_count( $this->mText );
	}

	/**
	 * @return int size in bytes
	 */
	function getByteSize() {
		$this->initText();
		return strlen( $this->mText );
	}

	/**
	 * @return boolean if hit has related articles
	 */
	function hasRelated() {
		return false;
	}

	/**
	 * @return interwiki prefix of the title (return iw even if title is broken)
	 */
	function getInterwikiPrefix() {
		return '';
	}
}
|
2004-02-11 01:48:42 +00:00
|
|
|
|
2008-05-04 15:31:03 +00:00
|
|
|
/**
|
|
|
|
|
* Highlight bits of wikitext
|
|
|
|
|
*
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed alls @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file description, it seems that it should be explicitely decalred for file descriptions, otherwise doxygen will think that the comment document the first class, variabled, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
* @ingroup Search
|
2008-05-04 15:31:03 +00:00
|
|
|
*/
|
|
|
|
|
class SearchHighlighter {
|
|
|
|
|
var $mCleanWikitext = true;
|
|
|
|
|
|
|
|
|
|
function SearchHighlighter($cleanupWikitext = true){
|
|
|
|
|
$this->mCleanWikitext = $cleanupWikitext;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Default implementation of wikitext highlighting
 *
 * Works in three stages:
 *  1. split $text into plain-text lines and "other" lines (templates,
 *     image links, tables and, when wfCite() exists, <ref> blocks),
 *     numbering all of them with one shared sequence counter;
 *  2. pick up to $contextlines snippets that match the terms, preferring
 *     the beginning of the article when it contains every term;
 *  3. pad the snippets towards a common size, join them and wrap each
 *     matched term in <span class='searchmatch'>.
 *
 * NOTE(review): the snippets are not HTML-escaped in this method (the
 * htmlspecialchars() call has always been commented out here). Confirm
 * that callers escape the result or that the input is trusted.
 *
 * @param string $text
 * @param array $terms Terms to highlight (unescaped; preg_quote()d here)
 * @param int $contextlines
 * @param int $contextchars
 * @return string
 */
public function highlightText( $text, $terms, $contextlines, $contextchars ) {
	global $wgContLang;
	global $wgSearchHighlightBoundaries;
	$fname = __METHOD__;

	if($text == '')
		return '';

	// split text into text + templates/links/tables
	$spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
	// first capture group is for detecting nested templates/links/tables/references
	$endPatterns = array(
		1 => '/(\{\{)|(\}\})/', // template
		2 => '/(\[\[)|(\]\])/', // image
		3 => "/(\n\\{\\|)|(\n\\|\\})/"); // table

	// FIXME: this should prolly be a hook or something
	if(function_exists('wfCite')){
		$spat .= '|(<ref>)'; // references via cite extension
		$endPatterns[4] = '/(<ref>)|(<\/ref>)/';
	}
	$spat .= '/';
	$textExt = array(); // text extracts
	$otherExt = array(); // other extracts
	wfProfileIn( "$fname-split" );
	$start = 0;
	$textLen = strlen($text);
	$count = 0; // sequence number to maintain ordering
	while( $start < $textLen ){
		// find start of template/image/table
		if( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ){
			$epat = '';
			foreach($matches as $key => $val){
				if($key > 0 && $val[1] != -1){
					if($key == 2){
						// see if this is an image link
						$ns = substr($val[0],2,-1);
						if( $wgContLang->getNsIndex($ns) != NS_IMAGE )
							break; // leaves $epat empty: the rest is added as plain text below
					}
					$epat = $endPatterns[$key];
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
					$start = $val[1];
					break;
				}
			}
			if( $epat ){
				// find end (and detect any nested elements)
				$level = 0;
				$offset = $start + 1;
				$found = false;
				while( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ){
					if( array_key_exists(2,$endMatches) ){
						// found end
						if($level == 0){
							$len = strlen($endMatches[2][0]);
							$off = $endMatches[2][1];
							$this->splitAndAdd( $otherExt, $count,
								substr( $text, $start, $off + $len - $start ) );
							$start = $off + $len;
							$found = true;
							break;
						} else{
							// end of nested element
							$level -= 1;
						}
					} else{
						// nested
						$level += 1;
					}
					$offset = $endMatches[0][1] + strlen($endMatches[0][0]);
				}
				if( ! $found ){
					// couldn't find appropriate closing tag, skip
					$this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen($matches[0][0]) ) );
					$start += strlen($matches[0][0]);
				}
				continue;
			}
		}
		// else: add as text extract
		$this->splitAndAdd( $textExt, $count, substr($text,$start) );
		break;
	}

	$all = $textExt + $otherExt; // these have disjunct key sets

	wfProfileOut( "$fname-split" );

	// Nothing survived splitting (e.g. whitespace-only text): there is
	// nothing to show, and the code below would index into an empty $all.
	if( !$all )
		return '';

	// prepare regexps
	foreach( $terms as $index => $term ) {
		$terms[$index] = preg_quote( $term, '/' );
		// manually do upper/lowercase stuff for utf-8 since PHP won't do it
		if(preg_match('/[\x80-\xff]/', $term) ){
			$terms[$index] = preg_replace_callback('/./us',array($this,'caseCallback'),$terms[$index]);
		}
	}
	$anyterm = implode( '|', $terms );
	$phrase = implode("$wgSearchHighlightBoundaries+", $terms );

	// FIXME: a hack to scale contextchars, a correct solution
	// would be to have contextchars actually be char and not byte
	// length, and do proper utf-8 substrings and lengths everywhere,
	// but PHP is making that very hard and unclean to implement :(
	// An empty term list gives an empty $anyterm; leave the size unscaled
	// in that case instead of dividing by zero.
	$anytermChars = mb_strlen($anyterm);
	$scale = $anytermChars > 0 ? strlen($anyterm) / $anytermChars : 1;
	$contextchars = intval( $contextchars * $scale );

	$patPre = "(^|$wgSearchHighlightBoundaries)";
	$patPost = "($wgSearchHighlightBoundaries|$)";

	$pat1 = "/(".$phrase.")/ui";
	$pat2 = "/$patPre(".$anyterm.")$patPost/ui";

	wfProfileIn( "$fname-extract" );

	$left = $contextlines;

	$snippets = array();
	$offsets = array();

	// show beginning only if it contains all words
	$first = 0;
	$firstText = '';
	foreach($textExt as $index => $line){
		// skip lines starting with ';' or ':'
		if(strlen($line)>0 && $line[0] != ';' && $line[0] != ':'){
			$firstText = $this->extract( $line, 0, $contextchars * $contextlines );
			$first = $index;
			break;
		}
	}
	if( $firstText ){
		$succ = true;
		// check if first text contains all terms
		foreach($terms as $term){
			if( ! preg_match("/$patPre".$term."$patPost/ui", $firstText) ){
				$succ = false;
				break;
			}
		}
		if( $succ ){
			$snippets[$first] = $firstText;
			$offsets[$first] = 0;
		}
	}
	if( ! $snippets ) {
		// match whole query on text
		$this->process($pat1, $textExt, $left, $contextchars, $snippets, $offsets);
		// match whole query on templates/tables/images
		$this->process($pat1, $otherExt, $left, $contextchars, $snippets, $offsets);
		// match any words on text
		$this->process($pat2, $textExt, $left, $contextchars, $snippets, $offsets);
		// match any words on templates/tables/images
		$this->process($pat2, $otherExt, $left, $contextchars, $snippets, $offsets);

		ksort($snippets);
	}

	// add extra chars to each snippet to make snippets constant size
	$extended = array();
	if( count( $snippets ) == 0){
		// couldn't find the target words, just show beginning of article
		$targetchars = $contextchars * $contextlines;
		$snippets[$first] = '';
		$offsets[$first] = 0;
	} else{
		// if begin of the article contains the whole phrase, show only that !!
		if( array_key_exists($first,$snippets) && preg_match($pat1,$snippets[$first])
		    && $offsets[$first] < $contextchars * 2 ){
			$snippets = array ($first => $snippets[$first]);
		}

		// calc by how much to extend existing snippets
		$targetchars = intval( ($contextchars * $contextlines) / count ( $snippets ) );
	}

	foreach($snippets as $index => $line){
		$extended[$index] = $line;
		$len = strlen($line);
		if( $len < $targetchars - 20 ){
			// complete this line
			if($len < strlen( $all[$index] )){
				$extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index]+$targetchars, $offsets[$index]);
				$len = strlen( $extended[$index] );
			}

			// add more lines
			$add = $index + 1;
			while( $len < $targetchars - 20
			       && array_key_exists($add,$all)
			       && !array_key_exists($add,$snippets) ){
				$offsets[$add] = 0;
				$tt = "\n".$this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
				$extended[$add] = $tt;
				$len += strlen( $tt );
				$add++;
			}
		}
	}

	//$snippets = array_map('htmlspecialchars', $extended);
	$snippets = $extended;
	$last = -1;
	$extract = '';
	foreach($snippets as $index => $line){
		if($last == -1)
			$extract .= $line; // first line
		elseif($last+1 == $index && $offsets[$last]+strlen($snippets[$last]) >= strlen($all[$last]))
			$extract .= " ".$line; // continuous lines
		else
			$extract .= '<b> ... </b>' . $line;

		$last = $index;
	}
	if( $extract )
		$extract .= '<b> ... </b>';

	// highlight each distinct term once
	$processed = array();
	foreach($terms as $term){
		if( ! isset($processed[$term]) ){
			$pat3 = "/$patPre(".$term.")$patPost/ui"; // highlight word
			$extract = preg_replace( $pat3,
				"\\1<span class='searchmatch'>\\2</span>\\3", $extract );
			$processed[$term] = true;
		}
	}

	wfProfileOut( "$fname-extract" );

	return $extract;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Split text into lines and add it to extracts array
 *
 * The text is first passed through removeWiki() when $mCleanWikitext is
 * set. Each line is trim()med and empty lines are skipped; every kept
 * line is stored under the current value of $count, which is then
 * incremented.
 *
 * @param array $extracts index -> $line (appended to, by reference)
 * @param int $count running sequence number (advanced, by reference)
 * @param string $text
 */
function splitAndAdd(&$extracts, &$count, $text){
	$split = explode( "\n", $this->mCleanWikitext? $this->removeWiki($text) : $text );
	foreach($split as $line){
		$tt = trim($line);
		// Compare with '' explicitly: a plain truthiness test would also
		// throw away a line whose whole content is "0".
		if( $tt !== '' )
			$extracts[$count++] = $tt;
	}
}
|
|
|
|
|
|
|
|
|
|
/**
 * Do manual case conversion for non-ascii chars
 *
 * preg_replace_callback() handler. A matched character longer than one
 * byte is replaced by a bracketed character class holding the results of
 * $wgContLang->lc() and $wgContLang->uc() for it; a single-byte match is
 * handed back unchanged.
 *
 * @param array $matches regex match array; only element 0 is used
 * @return string
 */
function caseCallback($matches){
	global $wgContLang;
	$char = $matches[0];
	if( strlen($char) <= 1 )
		return $char;
	return '[' . $wgContLang->lc($char) . $wgContLang->uc($char) . ']';
}
|
|
|
|
|
|
|
|
|
|
/**
 * Cut the piece of $text between $start and $end without chopping up
 * words: a non-zero $start and an $end that lies inside the text are
 * both moved by position() first.
 *
 * @param string $text
 * @param int $start requested start; 0 is used as-is
 * @param int $end requested end; values at or past the text length
 *   become the text length
 * @param int $posStart (out) actual start position; only written when
 *   the variable passed in is not null
 * @param int $posEnd (out) actual end position; only written when the
 *   variable passed in is not null
 * @return string the piece, or '' when the adjusted end is not after
 *   the adjusted start
 */
function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
	global $wgContLang;

	$textLen = strlen($text);

	if( $start != 0 )
		$start = $this->position( $text, $start, 1 );
	$end = ( $end >= $textLen ) ? $textLen : $this->position( $text, $end );

	if( !is_null($posStart) )
		$posStart = $start;
	if( !is_null($posEnd) )
		$posEnd = $end;

	return ( $end > $start ) ? substr( $text, $start, $end - $start ) : '';
}
|
|
|
|
|
|
|
|
|
|
/**
 * Find a nonletter near a point (index) in the text
 *
 * Looks at the window of up to 10 bytes on either side of $point and
 * returns the index of the FIRST space/punctuation byte found in that
 * window (scanning left to right), plus $offset. When the window holds
 * no such byte, $point itself is returned after being advanced past any
 * UTF-8 continuation bytes (0x80-0xBF), so the result never lands in the
 * middle of a multi-byte character.
 *
 * @param string $text
 * @param int $point
 * @param int $offset added to the index of a found nonletter
 * @return int index as described above; never greater than strlen($text)
 *   on the fallback path
 */
function position($text, $point, $offset=0 ){
	$tolerance = 10;
	$textLen = strlen($text);
	$s = max( 0, $point - $tolerance );
	$l = min( $textLen, $point + $tolerance ) - $s;
	$m = array();
	if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
		return $m[0][1] + $s + $offset;
	} else{
		// A point at or past the end has no byte to inspect; reading
		// $text[$point] there would be an uninitialized string offset.
		if( $point >= $textLen )
			return $textLen;
		// check if point is on a valid first UTF8 char
		$char = ord( $text[$point] );
		while( $char >= 0x80 && $char < 0xc0 ) {
			// skip trailing bytes
			$point++;
			if($point >= $textLen)
				return $textLen;
			$char = ord( $text[$point] );
		}
		return $point;
	}
}
|
|
|
|
|
|
|
|
|
|
/**
 * Search extracts for a pattern, and store one snippet per matching line
 *
 * Lines whose index is already present in $out are skipped. For every
 * other line that matches, a window of $contextchars bytes is chosen
 * around the first match (from the line start when the match ends before
 * $contextchars, from the match start when the match is longer than
 * $contextchars, otherwise centred on the match) and cut out with
 * extract(). Stops as soon as $linesleft reaches 0.
 *
 * @param string $pattern regexp for matching lines
 * @param array $extracts extracts to search
 * @param int $linesleft number of extracts to make (decremented, by reference)
 * @param int $contextchars length of snippet
 * @param array $out map for highlighted snippets (by reference)
 * @param array $offsets map of starting points of snippets (by reference)
 * @protected
 */
function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
	if( $linesleft == 0 )
		return; // nothing to do

	foreach( $extracts as $index => $line ){
		if( array_key_exists( $index, $out ) )
			continue; // this line already highlighted

		$hit = array();
		if( !preg_match( $pattern, $line, $hit, PREG_OFFSET_CAPTURE ) )
			continue;

		$matchPos = $hit[0][1];
		$matchLen = strlen( $hit[0][0] );

		if( $matchPos + $matchLen < $contextchars ){
			$from = 0;
		} elseif( $matchLen > $contextchars ){
			$from = $matchPos;
		} else{
			$from = $matchPos + intval( ( $matchLen - $contextchars ) / 2 );
		}

		// basic snippet from this line; extract() reports where it really started
		$actualFrom = $from;
		$out[$index] = $this->extract( $line, $from, $from + $contextchars, $actualFrom );
		$offsets[$index] = $actualFrom;

		if( --$linesleft == 0 )
			return;
	}
}
|
|
|
|
|
|
|
|
|
|
/**
 * Basic wikitext removal
 *
 * Applies a fixed sequence of regex substitutions, in this order:
 * pipe-less {{...}} removed, piped {{...|...}} reduced to the part after
 * the first pipe, pipe-less [[...]] unwrapped, piped [[...|...]] handed
 * to linkReplace(), anything tag-shaped removed, then the quote-based
 * markup of five, three and two apostrophes removed.
 *
 * @param string $text
 * @return string
 * @protected
 */
function removeWiki($text) {
	wfProfileIn( __METHOD__ );

	// substitutions that run before piped links are handled
	$beforeLinks = array(
		"/\\{\\{([^|]+?)\\}\\}/" => "",
		"/\\{\\{([^|]+\\|)(.*?)\\}\\}/" => "\\2",
		"/\\[\\[([^|]+?)\\]\\]/" => "\\1",
	);
	foreach( $beforeLinks as $regex => $replacement ){
		$text = preg_replace( $regex, $replacement, $text );
	}

	$text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);

	// substitutions that run after piped links are handled
	$afterLinks = array(
		"/<\/?[^>]+>/" => "",
		"/'''''/" => "",
		"/('''|<\/?[iIuUbB]>)/" => "",
		"/''/" => "",
	);
	foreach( $afterLinks as $regex => $replacement ){
		$text = preg_replace( $regex, $replacement, $text );
	}

	wfProfileOut( __METHOD__ );
	return $text;
}
|
|
|
|
|
|
|
|
|
|
/**
 * preg_replace_callback() handler for [[target|caption]] links.
 *
 * The whole link is kept when the text before the first colon of the
 * target resolves, via $wgContLang->getNsIndex(), to NS_IMAGE or
 * NS_CATEGORY. In every other case only the caption is returned.
 *
 * @param array $matches 0: whole link, 1: target including the pipe,
 *   2: caption
 * @return string
 */
function linkReplace($matches){
	$colonPos = strpos( $matches[1], ':' );
	if( $colonPos === false )
		return $matches[2]; // no namespace prefix: caption only

	global $wgContLang;
	$nsIndex = $wgContLang->getNsIndex( substr( $matches[1], 0, $colonPos ) );
	$keepWhole = $nsIndex !== false && ( $nsIndex == NS_IMAGE || $nsIndex == NS_CATEGORY );

	return $keepWhole ? $matches[0] : $matches[2];
}
|
|
|
|
|
|
|
|
|
|
/**
 * Simple & fast snippet extraction, but gives completely irrelevant
 * snippets
 *
 * Goes through $text line by line and, for at most $contextlines
 * matching lines, emits the matched term with the text before and after
 * it shortened by $wgContLang->truncate(). Each emitted line is passed
 * through htmlspecialchars() and then has the terms wrapped in
 * <span class='searchmatch'>.
 *
 * The terms are joined with '|' and only '/' is escaped, so they are
 * used as regex fragments as given.
 *
 * @param string $text
 * @param array $terms
 * @param int $contextlines
 * @param int $contextchars
 * @return string one "\n"-terminated line per snippet
 */
public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
	global $wgLang, $wgContLang;
	$profileSection = __METHOD__ . '-extract';

	$alternation = str_replace( '/', "\\/", implode( '|', $terms ) );
	$maxTrail = intval( $contextchars ) + 1;
	$linePattern = "/(.*)($alternation)(.{0,$maxTrail})/i";
	$markPattern = '/(' . $alternation . ")/i";

	$result = "";
	wfProfileIn( $profileSection );
	foreach ( explode( "\n", $text ) as $line ) {
		if ( $contextlines == 0 ) {
			break;
		}
		$parts = array();
		if ( !preg_match( $linePattern, $line, $parts ) ) {
			continue;
		}
		--$contextlines;

		$before = $wgContLang->truncate( $parts[1], -$contextchars, ' ... ' );
		$after = ( count( $parts ) < 3 )
			? ''
			: $wgContLang->truncate( $parts[3], $contextchars, ' ... ' );

		$escaped = htmlspecialchars( $before . $parts[2] . $after );
		$result .= preg_replace( $markPattern,
			"<span class='searchmatch'>\\1</span>", $escaped ) . "\n";
	}
	wfProfileOut( $profileSection );

	return $result;
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
2005-01-27 19:51:47 +00:00
|
|
|
/**
 * Search engine stand-in that does nothing: search() always returns
 * null and every other method has an empty body.
 *
 * @ingroup Search
 */
class SearchEngineDummy {
	/**
	 * @param string $term ignored
	 * @return null
	 */
	public function search( $term ) {
		return null;
	}

	/** No-op. */
	public function setLimitOffset( $l, $o ) {
	}

	/** No-op. */
	public function legalSearchChars() {
	}

	/** No-op. */
	public function update() {
	}

	/** No-op. */
	public function setnamespaces() {
	}

	/** No-op. */
	public function searchtitle() {
	}

	/** No-op. */
	public function searchtext() {
	}
}
|