wiki.techinc.nl/includes/search/SearchPostgres.php

193 lines
6.2 KiB
PHP
Raw Normal View History

2006-07-05 03:54:01 +00:00
<?php
/**
* PostgreSQL search engine
*
* Copyright © 2006-2007 Greg Sabino Mullane <greg@turnstep.com>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Search
*/
2006-07-05 03:54:01 +00:00
/**
 * Search engine hook base class for Postgres.
 *
 * Builds tsearch2 (`to_tsquery`) queries against the `titlevector` and
 * `textvector` columns and wraps the results in a SqlSearchResultSet.
 *
 * @ingroup Search
 */
class SearchPostgres extends SearchDatabase {
	/**
	 * Perform a full text search query via tsearch2 against page titles
	 * (the `titlevector` column) and return a result set.
	 *
	 * @param string $term Raw search term
	 * @return SqlSearchResultSet
	 */
	public function searchTitle( $term ) {
		return $this->doPostgresSearch( $term, 'titlevector', 'page_title' );
	}

	/**
	 * Perform a full text search query via tsearch2 against the latest
	 * revision text (the `textvector` column) and return a result set.
	 *
	 * @param string $term Raw search term
	 * @return SqlSearchResultSet
	 */
	public function searchText( $term ) {
		return $this->doPostgresSearch( $term, 'textvector', 'old_text' );
	}

	/**
	 * Shared implementation of searchTitle() and searchText(): build the SQL,
	 * run it with PHP error reporting lowered to E_ERROR, and wrap the result.
	 *
	 * @param string $term Raw search term
	 * @param string $fulltext Name of the tsvector column to match against
	 * @param string $colname Column name forwarded to searchQuery()
	 * @return SqlSearchResultSet
	 */
	private function doPostgresSearch( $term, $fulltext, $colname ) {
		$q = $this->searchQuery( $term, $fulltext, $colname );

		$olderror = error_reporting( E_ERROR );
		try {
			$resultSet = $this->db->query( $q, 'SearchPostgres', true );
		} finally {
			// Always restore the previous level, even if query() throws,
			// so the lowered reporting level cannot leak into later code.
			error_reporting( $olderror );
		}

		return new SqlSearchResultSet( $resultSet, $this->searchTerms );
	}

	/**
	 * Transform the user's search string into a better form for tsearch2
	 * Returns an SQL fragment consisting of quoted text to search for.
	 *
	 * Words are joined with `&` by default; the words "and", "or" (or a bare
	 * `|`) and "not" are turned into the `&`, `|` and `& !` operators, and a
	 * word prefixed with `-` or `!` is negated. The operator clean-up passes
	 * below run in a fixed order and depend on each other.
	 *
	 * @param string $term
	 *
	 * @return string
	 */
	public function parseQuery( $term ) {
		wfDebug( "parseQuery received: $term \n" );

		# # No backslashes allowed
		$term = preg_replace( '/\\\/', '', $term );

		# # Collapse parens into nearby words:
		$term = preg_replace( '/\s*\(\s*/', ' (', $term );
		$term = preg_replace( '/\s*\)\s*/', ') ', $term );

		# # Treat colons as word separators:
		$term = preg_replace( '/:/', ' ', $term );

		$searchstring = '';
		$m = [];
		if ( preg_match_all( '/([-!]?)(\S+)\s*/', $term, $m, PREG_SET_ORDER ) ) {
			foreach ( $m as $terms ) {
				# # A leading "-" or "!" negates whatever follows
				if ( strlen( $terms[1] ) ) {
					$searchstring .= ' & !';
				}

				$word = strtolower( $terms[2] );
				if ( $word === 'and' ) {
					$searchstring .= ' & ';
				} elseif ( $word === 'or' || $terms[2] === '|' ) {
					$searchstring .= ' | ';
				} elseif ( $word === 'not' ) {
					$searchstring .= ' & !';
				} else {
					$searchstring .= " & {$terms[2]}";
				}
			}
		}

		# # Strip out leading junk
		$searchstring = preg_replace( '/^[\s\&\|]+/', '', $searchstring );

		# # Remove any doubled-up operators
		$searchstring = preg_replace( '/([\!\&\|]) +(?:[\&\|] +)+/', "$1 ", $searchstring );

		# # Remove any non-spaced operators (e.g. "Zounds!")
		$searchstring = preg_replace( '/([^ ])[\!\&\|]/', "$1", $searchstring );

		# # Remove any trailing whitespace or operators
		$searchstring = preg_replace( '/[\s\!\&\|]+$/', '', $searchstring );

		# # Remove unnecessary quotes around everything
		$searchstring = preg_replace( '/^[\'"](.*)[\'"]$/', "$1", $searchstring );

		# # Quote the whole thing
		$searchstring = $this->db->addQuotes( $searchstring );

		wfDebug( "parseQuery returned: $searchstring \n" );

		return $searchstring;
	}

	/**
	 * Construct the full SQL query to do the search.
	 *
	 * Side effect: resets $this->searchTerms and fills it with every
	 * single-quoted lexeme found in the normalized tsquery.
	 *
	 * @param string $term
	 * @param string $fulltext Name of the tsvector column to rank and match on
	 * @param string $colname Currently unused by this method; kept so the
	 *  signature stays compatible with existing callers
	 * @return string
	 */
	public function searchQuery( $term, $fulltext, $colname ) {
		# Get the SQL fragment for the given term
		$searchstring = $this->parseQuery( $term );

		# # We need a separate query here so gin does not complain about empty searches
		$sql = "SELECT to_tsquery($searchstring)";
		$res = $this->db->query( $sql, __METHOD__ );
		if ( !$res ) {
			# # TODO: Better output (example to catch: one 'two)
			die( "Sorry, that was not a valid search string. Please go back and try again" );
		}

		$top = $res->fetchRow()[0];

		$this->searchTerms = [];
		if ( $top === "" ) { # # e.g. if only stopwords are used XXX return something better
			# # "AND 1=0" makes this an always-empty result with the same column layout
			$query = "SELECT page_id, page_namespace, page_title, 0 AS score " .
				"FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " .
				"AND r.rev_text_id = c.old_id AND 1=0";
		} else {
			$m = [];
			if ( preg_match_all( "/'([^']+)'/", $top, $m, PREG_SET_ORDER ) ) {
				foreach ( $m as $terms ) {
					# # Keyed by the term itself, so repeated terms collapse
					$this->searchTerms[$terms[1]] = $terms[1];
				}
			}

			$query = "SELECT page_id, page_namespace, page_title, " .
				"ts_rank($fulltext, to_tsquery($searchstring), 5) AS score " .
				"FROM page p, revision r, pagecontent c WHERE p.page_latest = r.rev_id " .
				"AND r.rev_text_id = c.old_id AND $fulltext @@ to_tsquery($searchstring)";
		}

		# # Namespaces - defaults to 0
		if ( $this->namespaces !== null ) { // null -> search all
			if ( count( $this->namespaces ) < 1 ) {
				$query .= ' AND page_namespace = 0';
			} else {
				$namespaces = $this->db->makeList( $this->namespaces );
				$query .= " AND page_namespace IN ($namespaces)";
			}
		}

		$query .= " ORDER BY score DESC, page_id DESC";

		$query .= $this->db->limitResult( '', $this->limit, $this->offset );

		wfDebug( "searchQuery returned: $query \n" );

		return $query;
	}

	/**
	 * Clear the search vector of every text row of the page except the one
	 * with the highest rev_text_id.
	 *
	 * Most of the work of update() and updateTitle() is done automatically
	 * via triggers.
	 *
	 * @param int $pageid Page ID; coerced with intval() before use in SQL
	 * @param string $title Unused here
	 * @param string $text Unused here
	 * @return bool Always true
	 */
	public function update( $pageid, $title, $text ) {
		# # We don't want to index older revisions.
		# # "textvector IS NOT NULL" skips rows that were already cleared, which
		# # avoids rewriting them again on pages with long revision histories.
		$sql = "UPDATE pagecontent SET textvector = NULL WHERE textvector IS NOT NULL and old_id IN " .
			"(SELECT DISTINCT rev_text_id FROM revision WHERE rev_page = " . intval( $pageid ) .
			" ORDER BY rev_text_id DESC OFFSET 1)";
		$this->db->query( $sql, __METHOD__ );

		return true;
	}

	/**
	 * No-op: this method performs no work itself and only reports success.
	 *
	 * @param int $id Unused here
	 * @param string $title Unused here
	 * @return bool Always true
	 */
	public function updateTitle( $id, $title ) {
		return true;
	}
}