wiki.techinc.nl/includes/api/ApiQuerySearch.php

443 lines
13 KiB
PHP
Raw Normal View History

<?php
/**
*
*
* Created on July 30, 2007
*
* Copyright © 2007 Yuri Astrakhan "<Firstname><Lastname>@gmail.com"
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
*/
/**
* Query module to perform full text search within wiki titles and content
*
* @ingroup API
*/
class ApiQuerySearch extends ApiQueryGeneratorBase {
/**
 * Placeholder backend name exposed to API clients in place of a null
 * search type.
 *
 * When $wgSearchType is null, $wgSearchAlternatives[0] is null. Null isn't
 * a valid option for an array for PARAM_TYPE, so we'll use a fake name
 * that can't possibly be a class name and describes what the null behavior
 * does.
 */
const BACKEND_NULL_PARAM = 'database-backed';
/**
 * Set up the module, handing 'sr' to the parent constructor as its third
 * argument (the example URLs of this module use the matching "sr"
 * parameter prefix, e.g. srsearch).
 *
 * @param ApiBase $query Parent module, forwarded unchanged
 *  (presumably the ApiQuery instance -- confirm against caller)
 * @param string $moduleName Name this module is registered under
 */
public function __construct( $query, $moduleName ) {
	$paramPrefix = 'sr';
	parent::__construct( $query, $moduleName, $paramPrefix );
}
/**
 * Run the search as a plain list module: no page set is supplied, so the
 * hits are written into the API result.
 */
public function execute() {
	$this->run( null );
}
/**
 * Run the search as a generator: the found titles are fed into the
 * given page set instead of being written to the API result.
 *
 * @param ApiPageSet $resultPageSet Page set that receives the titles
 */
public function executeGenerator( $resultPageSet ) {
	$pageSet = $resultPageSet;
	$this->run( $pageSet );
}
/**
 * Execute the search and deliver the outcome.
 *
 * In list mode ($resultPageSet is null) every hit is added to the API
 * result together with the requested properties, followed by optional
 * interwiki hits. In generator mode the titles of the hits are collected
 * and handed to the page set.
 *
 * @param ApiPageSet|null $resultPageSet Page set to populate when running
 *  as a generator; null when running as a plain list module
 * @return void
 */
private function run( $resultPageSet = null ) {
	global $wgContLang;

	$params = $this->extractRequestParams();

	// Extract parameters
	$limit = $params['limit'];
	$query = $params['search'];
	$what = $params['what'];
	$interwiki = $params['interwiki'];
	// Flip the multi-value lists so membership can be tested with isset()
	$searchInfo = array_flip( $params['info'] );
	$prop = array_flip( $params['prop'] );

	// Create search engine instance and set options. The placeholder
	// backend name stands for "no explicit backend", i.e. the default engine.
	$useNamedBackend = isset( $params['backend'] )
		&& $params['backend'] !== self::BACKEND_NULL_PARAM;
	$search = $useNamedBackend
		? SearchEngine::create( $params['backend'] )
		: SearchEngine::create();
	// Request one row more than the limit so we can tell whether more exist
	$search->setLimitOffset( $limit + 1, $params['offset'] );
	$search->setNamespaces( $params['namespace'] );

	$query = $search->transformSearchTerm( $query );
	$query = $search->replacePrefixes( $query );

	// Perform the actual search
	if ( $what === 'text' ) {
		$matches = $search->searchText( $query );
	} elseif ( $what === 'title' ) {
		$matches = $search->searchTitle( $query );
	} elseif ( $what === 'nearmatch' ) {
		$matches = SearchEngine::getNearMatchResultSet( $query );
	} else {
		// We default to title searches; this is a terrible legacy
		// of the way we initially set up the MySQL fulltext-based
		// search engine with separate title and text fields.
		// In the future, the default should be for a combined index.
		$what = 'title';
		$matches = $search->searchTitle( $query );

		// Not all search engines support a separate title search,
		// for instance the Lucene-based engine we use on Wikipedia.
		// In this case, fall back to full-text search (which will
		// include titles in it!)
		if ( is_null( $matches ) ) {
			$what = 'text';
			$matches = $search->searchText( $query );
		}
	}

	if ( is_null( $matches ) ) {
		$this->dieUsage( "{$what} search is disabled", "search-{$what}-disabled" );
	} elseif ( $matches instanceof Status && !$matches->isGood() ) {
		$this->dieUsage( $matches->getWikiText(), 'search-error' );
	}

	$apiResult = $this->getResult();

	// Add search meta data to result
	if ( isset( $searchInfo['totalhits'] ) ) {
		$totalhits = $matches->getTotalHits();
		if ( $totalhits !== null ) {
			$apiResult->addValue( array( 'query', 'searchinfo' ),
				'totalhits', $totalhits );
		}
	}
	if ( isset( $searchInfo['suggestion'] ) && $matches->hasSuggestion() ) {
		$apiResult->addValue( array( 'query', 'searchinfo' ),
			'suggestion', $matches->getSuggestionQuery() );
	}

	// Add the search results to the result
	$terms = $wgContLang->convertForSearchResult( $matches->termMatches() );
	$titles = array();
	$count = 0;

	for ( $result = $matches->next(); $result; $result = $matches->next() ) {
		if ( ++$count > $limit ) {
			// We've reached the one extra which shows that there are
			// additional items to be had. Stop here...
			$this->setContinueEnumParameter( 'offset', $params['offset'] + $params['limit'] );
			break;
		}

		// Silently skip broken and missing titles
		if ( $result->isBrokenTitle() || $result->isMissingRevision() ) {
			continue;
		}

		$title = $result->getTitle();

		if ( !is_null( $resultPageSet ) ) {
			// Generator mode: only the title is needed
			$titles[] = $title;
			continue;
		}

		$vals = array();
		ApiQueryBase::addTitleInfo( $vals, $title );

		if ( isset( $prop['snippet'] ) ) {
			$vals['snippet'] = $result->getTextSnippet( $terms );
		}
		if ( isset( $prop['size'] ) ) {
			$vals['size'] = $result->getByteSize();
		}
		if ( isset( $prop['wordcount'] ) ) {
			$vals['wordcount'] = $result->getWordCount();
		}
		if ( isset( $prop['timestamp'] ) ) {
			$vals['timestamp'] = wfTimestamp( TS_ISO_8601, $result->getTimestamp() );
		}
		if ( isset( $prop['score'] ) ) {
			// The score is only emitted when the engine provides one
			$score = $result->getScore();
			if ( !is_null( $score ) ) {
				$vals['score'] = $score;
			}
		}
		if ( isset( $prop['titlesnippet'] ) ) {
			$vals['titlesnippet'] = $result->getTitleSnippet( $terms );
		}

		$redirectTitle = $result->getRedirectTitle();
		if ( !is_null( $redirectTitle ) ) {
			if ( isset( $prop['redirecttitle'] ) ) {
				// Emit the title as text; putting the Title object itself
				// into the result contradicts the declared 'string' type.
				$vals['redirecttitle'] = $redirectTitle->getPrefixedText();
			}
			if ( isset( $prop['redirectsnippet'] ) ) {
				$vals['redirectsnippet'] = $result->getRedirectSnippet( $terms );
			}
		}

		$sectionTitle = $result->getSectionTitle();
		if ( !is_null( $sectionTitle ) ) {
			if ( isset( $prop['sectiontitle'] ) ) {
				$vals['sectiontitle'] = $sectionTitle->getFragment();
			}
			if ( isset( $prop['sectionsnippet'] ) ) {
				$vals['sectionsnippet'] = $result->getSectionSnippet();
			}
		}

		if ( isset( $prop['hasrelated'] ) && $result->hasRelated() ) {
			$vals['hasrelated'] = '';
		}

		// Add item to results and see whether it fits
		$fit = $apiResult->addValue( array( 'query', $this->getModuleName() ),
			null, $vals );
		if ( !$fit ) {
			// The item that did not fit must be returned by the next request,
			// hence the "- 1" on the already incremented counter.
			$this->setContinueEnumParameter( 'offset', $params['offset'] + $count - 1 );
			break;
		}
	}

	// Interwiki hits are only reported in list mode
	$hasInterwikiResults = false;
	if ( $interwiki && $resultPageSet === null && $matches->hasInterwikiResults() ) {
		$interwikiMatches = $matches->getInterwikiResults();
		$hasInterwikiResults = true;

		// Include number of results if requested
		if ( isset( $searchInfo['totalhits'] ) ) {
			$totalhits = $interwikiMatches->getTotalHits();
			if ( $totalhits !== null ) {
				$apiResult->addValue( array( 'query', 'interwikisearchinfo' ),
					'totalhits', $totalhits );
			}
		}

		for (
			$result = $interwikiMatches->next();
			$result;
			$result = $interwikiMatches->next()
		) {
			$title = $result->getTitle();
			$vals = array(
				'namespace' => $result->getInterwikiNamespaceText(),
				'title' => $title->getText(),
				'url' => $title->getFullUrl(),
			);

			// Add item to results and see whether it fits
			$fit = $apiResult->addValue(
				array( 'query', 'interwiki' . $this->getModuleName(), $result->getInterwikiPrefix() ),
				null,
				$vals
			);
			if ( !$fit ) {
				// We hit the limit. We can't really provide any meaningful
				// pagination info so just bail out
				break;
			}
		}
	}

	if ( is_null( $resultPageSet ) ) {
		$apiResult->setIndexedTagName_internal( array(
			'query', $this->getModuleName()
		), 'p' );
		if ( $hasInterwikiResults ) {
			$apiResult->setIndexedTagName_internal( array(
				'query', 'interwiki' . $this->getModuleName()
			), 'p' );
		}
	} else {
		$resultPageSet->populateFromTitles( $titles );
	}
}
/**
 * Report the cache mode of this module.
 *
 * @param array $params Request parameters (not consulted)
 * @return string Always 'public'
 */
public function getCacheMode( $params ) {
	$mode = 'public';

	return $mode;
}
/**
 * Describe the parameters accepted by this module.
 *
 * The 'backend' parameter is only offered when more than one search type
 * is available; a null first entry is replaced by BACKEND_NULL_PARAM.
 *
 * @return array Parameter definitions keyed by parameter name
 */
public function getAllowedParams() {
	global $wgSearchType;

	$allowed = array(
		'search' => array(
			ApiBase::PARAM_TYPE => 'string',
			ApiBase::PARAM_REQUIRED => true
		),
		'namespace' => array(
			ApiBase::PARAM_DFLT => NS_MAIN,
			ApiBase::PARAM_TYPE => 'namespace',
			ApiBase::PARAM_ISMULTI => true,
		),
		'what' => array(
			ApiBase::PARAM_DFLT => null,
			ApiBase::PARAM_TYPE => array(
				'title',
				'text',
				'nearmatch',
			)
		),
		'info' => array(
			ApiBase::PARAM_DFLT => 'totalhits|suggestion',
			ApiBase::PARAM_TYPE => array(
				'totalhits',
				'suggestion',
			),
			ApiBase::PARAM_ISMULTI => true,
		),
		'prop' => array(
			ApiBase::PARAM_DFLT => 'size|wordcount|timestamp|snippet',
			ApiBase::PARAM_TYPE => array(
				'size',
				'wordcount',
				'timestamp',
				'score',
				'snippet',
				'titlesnippet',
				'redirecttitle',
				'redirectsnippet',
				'sectiontitle',
				'sectionsnippet',
				'hasrelated',
			),
			ApiBase::PARAM_ISMULTI => true,
		),
		'offset' => 0,
		'limit' => array(
			ApiBase::PARAM_DFLT => 10,
			ApiBase::PARAM_TYPE => 'limit',
			ApiBase::PARAM_MIN => 1,
			ApiBase::PARAM_MAX => ApiBase::LIMIT_SML1,
			ApiBase::PARAM_MAX2 => ApiBase::LIMIT_SML2
		),
		'interwiki' => false,
	);

	$backends = SearchEngine::getSearchTypes();
	if ( count( $backends ) <= 1 ) {
		// Nothing to choose from, so no 'backend' parameter is offered
		return $allowed;
	}

	// Null cannot be listed as a PARAM_TYPE option; use the placeholder name
	if ( $backends[0] === null ) {
		$backends[0] = self::BACKEND_NULL_PARAM;
	}

	$allowed['backend'] = array(
		ApiBase::PARAM_DFLT => $wgSearchType,
		ApiBase::PARAM_TYPE => $backends,
	);

	return $allowed;
}
/**
 * Provide the help text for each parameter.
 *
 * The 'backend' entry is only included when more than one search type is
 * available.
 *
 * @return array Help texts keyed by parameter name
 */
public function getParamDescription() {
	$propHelp = array(
		'What properties to return',
		' size - Adds the size of the page in bytes',
		' wordcount - Adds the word count of the page',
		' timestamp - Adds the timestamp of when the page was last edited',
		' score - Adds the score (if any) from the search engine',
		' snippet - Adds a parsed snippet of the page',
		' titlesnippet - Adds a parsed snippet of the page title',
		' redirectsnippet - Adds a parsed snippet of the redirect title',
		' redirecttitle - Adds the title of the matching redirect',
		' sectionsnippet - Adds a parsed snippet of the matching section title',
		' sectiontitle - Adds the title of the matching section',
		' hasrelated - Indicates whether a related search is available',
	);

	$help = array(
		'search' => 'Search for all page titles (or content) that has this value',
		'namespace' => 'The namespace(s) to enumerate',
		'what' => 'Search inside the text or titles',
		'info' => 'What metadata to return',
		'prop' => $propHelp,
		'offset' => 'Use this value to continue paging (return by query)',
		'limit' => 'How many total pages to return',
		'interwiki' => 'Include interwiki results in the search, if available'
	);

	if ( count( SearchEngine::getSearchTypes() ) <= 1 ) {
		return $help;
	}

	$help['backend'] = 'Which search backend to use, if not the default';

	return $help;
}
/**
 * Describe the fields a single search hit can carry, grouped by the
 * 'prop' value that enables them ('' holds the always-present fields).
 *
 * @return array Field type descriptions keyed by prop name
 */
public function getResultProperties() {
	// Several fields share the same "string that may be absent" description
	$nullableString = array(
		ApiBase::PROP_TYPE => 'string',
		ApiBase::PROP_NULLABLE => true
	);

	$properties = array();
	$properties[''] = array(
		'ns' => 'namespace',
		'title' => 'string'
	);
	$properties['snippet'] = array( 'snippet' => 'string' );
	$properties['size'] = array( 'size' => 'integer' );
	$properties['wordcount'] = array( 'wordcount' => 'integer' );
	$properties['timestamp'] = array( 'timestamp' => 'timestamp' );
	$properties['score'] = array( 'score' => $nullableString );
	$properties['titlesnippet'] = array( 'titlesnippet' => 'string' );
	$properties['redirecttitle'] = array( 'redirecttitle' => $nullableString );
	$properties['redirectsnippet'] = array( 'redirectsnippet' => $nullableString );
	$properties['sectiontitle'] = array( 'sectiontitle' => $nullableString );
	$properties['sectionsnippet'] = array( 'sectionsnippet' => $nullableString );
	$properties['hasrelated'] = array( 'hasrelated' => 'boolean' );

	return $properties;
}
/**
 * Give the one-line summary of this module.
 *
 * @return string
 */
public function getDescription() {
	$summary = 'Perform a full text search.';

	return $summary;
}
/**
 * List the errors this module can raise: those of the parent class
 * followed by the search-specific ones.
 *
 * @return array
 */
public function getPossibleErrors() {
	$searchErrors = array(
		array( 'code' => 'search-text-disabled', 'info' => 'text search is disabled' ),
		array( 'code' => 'search-title-disabled', 'info' => 'title search is disabled' ),
		array( 'code' => 'search-error', 'info' => 'search error has occurred' ),
	);

	return array_merge( parent::getPossibleErrors(), $searchErrors );
}
/**
 * Provide sample request URLs: a default search, an explicit text search
 * and a use of the module as a generator.
 *
 * @return array
 */
public function getExamples() {
	$examples = array();
	$examples[] = 'api.php?action=query&list=search&srsearch=meaning';
	$examples[] = 'api.php?action=query&list=search&srwhat=text&srsearch=meaning';
	$examples[] = 'api.php?action=query&generator=search&gsrsearch=meaning&prop=info';

	return $examples;
}
/**
 * Point to the online documentation of this module.
 *
 * @return string
 */
public function getHelpUrls() {
	$helpUrl = 'https://www.mediawiki.org/wiki/API:Search';

	return $helpUrl;
}
}