Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4
This commit is contained in:
Stanislav Malyshev 2016-05-16 13:24:10 -07:00
parent b7c4c8717f
commit add1ebe2ab
14 changed files with 635 additions and 15 deletions

View file

@ -1539,6 +1539,7 @@ $wgAutoloadLocalClasses = [
'WikiReference' => __DIR__ . '/includes/WikiMap.php',
'WikiRevision' => __DIR__ . '/includes/import/WikiRevision.php',
'WikiStatsOutput' => __DIR__ . '/maintenance/language/StatOutputs.php',
'WikiTextStructure' => __DIR__ . '/includes/content/WikiTextStructure.php',
'WikitextContent' => __DIR__ . '/includes/content/WikitextContent.php',
'WikitextContentHandler' => __DIR__ . '/includes/content/WikitextContentHandler.php',
'WinCacheBagOStuff' => __DIR__ . '/includes/libs/objectcache/WinCacheBagOStuff.php',

View file

@ -2620,6 +2620,18 @@ search results.
$title: Current Title object being displayed in search results.
&$id: Revision ID (default is false, for latest)
'SearchIndexFields': Add fields to search index mapping.
&$fields: Array of fields, all implement SearchIndexField
$engine: SearchEngine instance for which mapping is being built.
'SearchDataForIndex': Add data to the search document. Allows adding any data to
the field map used to index the document.
&$fields: Array of name => value pairs for fields
$handler: ContentHandler for the content being indexed
$page: WikiPage that is being indexed
$output: ParserOutput that is produced from the page
$engine: SearchEngine for which the indexing is intended
'SecondaryDataUpdates': Allows modification of the list of DataUpdates to
perform when page content is modified. Currently called by
AbstractContent::getSecondaryDataUpdates.

View file

@ -1270,4 +1270,69 @@ abstract class ContentHandler {
*/
return [];
}
/**
 * Register one search field mapping in the given field set.
 *
 * The mapping itself is produced by the engine, so engine-specific
 * subclasses of SearchIndexField end up in the array.
 *
 * @param SearchIndexField[] $fields Field definitions, updated in place
 * @param SearchEngine $engine Engine that builds the field mapping
 * @param string $name Name of the field to add
 * @param int $type One of the SearchIndexField::INDEX_TYPE_* constants
 * @return SearchIndexField[] The updated field definitions
 * @since 1.28
 */
protected function addSearchField( &$fields, SearchEngine $engine, $name, $type ) {
	$mapping = $engine->makeSearchFieldMapping( $name, $type );
	$fields[$name] = $mapping;
	return $fields;
}
/**
 * Return fields to be indexed by search engine
 * as representation of this document.
 *
 * Fills in 'text', 'source_text' and 'text_bytes' from the page content
 * (when the page has content), then lets extensions add further fields
 * via the SearchDataForIndex hook. Overriding classes should call this
 * parent implementation or take care of running the hook themselves.
 *
 * @param WikiPage $page Page to index
 * @param ParserOutput $output Parser output produced from the page
 * @param SearchEngine $engine Search engine for which we are indexing
 * @return array Map of name => value for fields
 * @since 1.28
 */
public function getDataForSearchIndex( WikiPage $page, ParserOutput $output,
	SearchEngine $engine ) {
	$fields = [];
	$content = $page->getContent();
	if ( $content ) {
		$searchText = $content->getTextForSearchIndex();
		$fields['text'] = $searchText;
		// source_text mirrors the plain search text of the content
		$fields['source_text'] = $searchText;
		$fields['text_bytes'] = $content->getSize();
	}
	// Let extensions contribute or override fields.
	Hooks::run( 'SearchDataForIndex', [ &$fields, $this, $page, $output, $engine ] );
	return $fields;
}
/**
 * Produce page output suitable for indexing.
 *
 * Tries the parser cache first (when one is supplied); on a cache miss
 * the content is parsed for the page's current revision and the result
 * is stored back into the cache.
 *
 * Specific content handlers may override it if they need different content handling.
 *
 * @param WikiPage $page Page to produce output for
 * @param ParserCache $cache Optional cache to consult and fill
 * @return ParserOutput
 */
public function getParserOutputForIndexing( WikiPage $page, ParserCache $cache = null ) {
	$parserOptions = $page->makeParserOptions( 'canonical' );
	// Initialize explicitly: the original relied on the variable being
	// undefined when no cache was passed, masked by empty().
	$parserOutput = null;
	if ( $cache ) {
		// NOTE(review): a cache miss yields a falsy value here, matching
		// the falsy check below.
		$parserOutput = $cache->get( $page, $parserOptions );
	}
	if ( !$parserOutput ) {
		// Only look up the revision ID when we actually need to parse,
		// avoiding the extra work on a cache hit.
		$revId = $page->getRevision()->getId();
		$parserOutput =
			$page->getContent()->getParserOutput( $page->getTitle(), $revId, $parserOptions );
		if ( $cache ) {
			$cache->save( $parserOutput, $page, $parserOptions );
		}
	}
	return $parserOutput;
}
}

View file

@ -148,4 +148,13 @@ class TextContentHandler extends ContentHandler {
$engine->makeSearchFieldMapping( 'language', SearchIndexField::INDEX_TYPE_KEYWORD );
return $fields;
}
/**
 * Return fields to be indexed by the search engine for this document.
 *
 * Extends the parent data with a 'language' field holding the code of
 * the page's language.
 *
 * @param WikiPage $page Page to index
 * @param ParserOutput $output Parser output produced from the page
 * @param SearchEngine $engine Search engine for which we are indexing
 * @return array Map of name => value for fields
 */
public function getDataForSearchIndex( WikiPage $page, ParserOutput $output,
	SearchEngine $engine ) {
	$fields = parent::getDataForSearchIndex( $page, $output, $engine );
	$pageLanguage = $this->getPageLanguage( $page->getTitle(), $page->getContent() );
	$fields['language'] = $pageLanguage->getCode();
	return $fields;
}
}

View file

@ -0,0 +1,277 @@
<?php
use HtmlFormatter\HtmlFormatter;
use MediaWiki\Logger\LoggerFactory;
/**
 * Class allowing to explore structure of parsed wikitext.
 *
 * Splits the rendered HTML of a page into parts useful for search
 * indexing (opening text, main text, auxiliary text) and exposes page
 * metadata such as categories, outgoing links, templates and headings.
 */
class WikiTextStructure {
	/**
	 * @var string|null Text before the first heading; filled lazily, null if none
	 */
	private $openingText;
	/**
	 * @var string|null Whole page text with markup stripped; filled lazily
	 */
	private $allText;
	/**
	 * @var string[] Auxiliary text chunks (captions, tables, etc.)
	 */
	private $auxText = [];
	/**
	 * @var ParserOutput
	 */
	private $parserOutput;

	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		'audio', 'video', // "it looks like you don't have javascript enabled..."
		// do not need to index
		'sup.reference', // The [1] for references
		'.mw-cite-backlink', // The ↑ next to references in the references section
		'h1', 'h2', 'h3', // Headings are already indexed in their own field.
		'h5', 'h6', 'h4',
		'.autocollapse', // Collapsed fields are hidden by default so we don't want them
		// showing up.
	];

	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		'.thumbcaption', // Thumbnail captions aren't really part of the text proper
		'table', // Neither are tables
		'.rellink', // Common style for "See also:".
		'.dablink', // Common style for calling out helpful links at the top
		// of the article.
		'.searchaux', // New class users can use to mark stuff as auxiliary to searches.
	];

	/**
	 * WikiTextStructure constructor.
	 * @param ParserOutput $parserOutput Parsed page to analyze
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get categories in the text.
	 * @return string[] Category names, without namespace prefix
	 */
	public function categories() {
		$categories = [];
		foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) {
			$categories[] = Category::newFromName( $key )->getTitle()->getText();
		}
		return $categories;
	}

	/**
	 * Get outgoing links.
	 * @return string[] Prefixed DB keys of linked titles
	 */
	public function outgoingLinks() {
		$outgoingLinks = [];
		foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
			foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
				$outgoingLinks[] =
					Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
			}
		}
		return $outgoingLinks;
	}

	/**
	 * Get templates in the text.
	 * Only templates whose title exists are returned.
	 * @return string[] Prefixed texts of used templates
	 */
	public function templates() {
		$templates = [];
		foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) {
			foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
				$templateTitle = Title::makeTitleSafe( $tNS, $tDbKey );
				if ( $templateTitle && $templateTitle->exists() ) {
					$templates[] = $templateTitle->getPrefixedText();
				}
			}
		}
		return $templates;
	}

	/**
	 * Get headings on the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 * ==Applicability of the strict massenergy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
	 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
	 * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
	 * reference. And since the data looks like:
	 * Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[]
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $heading ) {
			$heading = $heading[ 'line' ];
			// Some wikis wrap the brackets in a span:
			// http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );
			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );
			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore.
			if ( !in_array( $heading, $ignoredHeadings ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Comments (from '#' to end of line), surrounding whitespace and empty
	 * lines are discarded. Note that array keys are not reindexed.
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
		$lines = array_map( 'trim', $lines ); // Remove extra spaces
		$lines = array_filter( $lines ); // Remove empty lines
		return $lines;
	}

	/**
	 * Get list of heading to ignore.
	 * The list is cached for the lifetime of the request.
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			// FIXME: will be renamed in next patches to search-ignored-headings
			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
			$ignoredHeadings = [];
			if ( !$source->isDisabled() ) {
				$lines = self::parseSettingsInMessage( $source->plain() );
				$ignoredHeadings = $lines; // Now we just have headings!
			}
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 * Results are memoized in $this->openingText, $this->allText and
	 * $this->auxText; subsequent calls are no-ops.
	 */
	private function extractWikitextParts() {
		// Already extracted?
		if ( $this->allText !== null ) {
			return;
		}
		$this->parserOutput->setEditSectionTokens( false );
		$this->parserOutput->setTOCEnabled( false );
		$text = $this->parserOutput->getText();
		if ( $text === '' ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}
		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );
		$formatter = new HtmlFormatter( $text );
		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();
		// Strip elements from the page that are auxiliary text. These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 * @param string $text Rendered HTML of the page
	 * @return string|null Stripped text, or null if there is no first heading
	 *   or no usable text before it
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		if ( !$text ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}
		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		if ( !$text ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}
		return $text;
	}

	/**
	 * Get opening text
	 * @return string
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}
}

View file

@ -145,4 +145,44 @@ class WikitextContentHandler extends TextContentHandler {
return $fields;
}
/**
 * Extract text of the file
 * TODO: probably should go to file handler?
 * @param Title $title Title in the File namespace
 * @return string|null Text of the local file, or null if there is none
 */
protected function getFileText( Title $title ) {
	$file = wfLocalFile( $title );
	if ( !$file || !$file->exists() ) {
		return null;
	}
	return $file->getHandler()->getEntireText( $file );
}
/**
 * Return fields to be indexed by the search engine for this document.
 *
 * Adds wikitext-structure derived fields (links, categories, headings,
 * templates, opening/main/auxiliary text) on top of the parent data,
 * plus extracted file text for pages in the File namespace.
 *
 * @param WikiPage $page Page to index
 * @param ParserOutput $parserOutput Parser output produced from the page
 * @param SearchEngine $engine Search engine for which we are indexing
 * @return array Map of name => value for fields
 */
public function getDataForSearchIndex( WikiPage $page, ParserOutput $parserOutput,
	SearchEngine $engine ) {
	$fields = parent::getDataForSearchIndex( $page, $parserOutput, $engine );
	$structure = new WikiTextStructure( $parserOutput );

	// link/metadata fields
	$fields['external_link'] = array_keys( $parserOutput->getExternalLinks() );
	$fields['category'] = $structure->categories();
	$fields['heading'] = $structure->headings();
	$fields['outgoing_link'] = $structure->outgoingLinks();
	$fields['template'] = $structure->templates();

	// text fields
	$fields['opening_text'] = $structure->getOpeningText();
	$fields['text'] = $structure->getMainText(); // overwrites one from ContentHandler
	$fields['auxiliary_text'] = $structure->getAuxiliaryText();

	$title = $page->getTitle();
	if ( $title->getNamespace() === NS_FILE ) {
		$fileText = $this->getFileText( $title );
		if ( $fileText ) {
			$fields['file_text'] = $fileText;
		}
	}
	return $fields;
}
}

View file

@ -1043,14 +1043,16 @@ class WikiPage implements Page, IDBAccessObject {
*
* @since 1.19
* @param ParserOptions $parserOptions ParserOptions to use for the parse operation
* @param null|int $oldid Revision ID to get the text from, passing null or 0 will
* get the current revision (default value)
*
* @return ParserOutput|bool ParserOutput or false if the revision was not found
* @param null|int $oldid Revision ID to get the text from, passing null or 0 will
* get the current revision (default value)
* @param bool $forceParse Force a full parse, regardless of cache settings
* @return bool|ParserOutput ParserOutput or false if the revision was not found
*/
public function getParserOutput( ParserOptions $parserOptions, $oldid = null ) {
public function getParserOutput( ParserOptions $parserOptions, $oldid = null,
$forceParse = false ) {
$useParserCache = $this->shouldCheckParserCache( $parserOptions, $oldid );
$useParserCache =
( !$forceParse ) && $this->shouldCheckParserCache( $parserOptions, $oldid );
wfDebug( __METHOD__ .
': using parser cache: ' . ( $useParserCache ? 'yes' : 'no' ) . "\n" );
if ( $parserOptions->getStubThreshold() ) {

View file

@ -659,7 +659,7 @@ abstract class SearchEngine {
* Create a search field definition.
* Specific search engines should override this method to create search fields.
* @param string $name
* @param int $type
* @param int $type One of the types in SearchIndexField::INDEX_TYPE_*
* @return SearchIndexField
* @since 1.28
*/

View file

@ -1775,4 +1775,15 @@ abstract class MediaWikiTestCase extends PHPUnit_Framework_TestCase {
return $buffer;
}
/**
 * Create a temporary hook handler which will be reset by tearDown.
 * This replaces other handlers for the same hook.
 * @param string $hookName Name of the hook to handle
 * @param mixed $handler Value suitable for a hook handler
 * @since 1.28
 */
protected function setTemporaryHook( $hookName, $handler ) {
	$hooks = [ $hookName => [ $handler ] ];
	$this->mergeMwGlobalArrayValue( 'wgHooks', $hooks );
}
}

View file

@ -3,6 +3,7 @@ use MediaWiki\MediaWikiServices;
/**
* @group ContentHandler
* @group Database
*/
class ContentHandlerTest extends MediaWikiTestCase {
@ -52,6 +53,11 @@ class ContentHandlerTest extends MediaWikiTestCase {
parent::tearDown();
}
public function addDBDataOnce() {
	// Fixture pages used by the search-index tests below.
	$this->insertPage( 'Not_Main_Page', 'This is not a main page' );
	$this->insertPage( 'Smithee', 'A smithee is one who smiths. See also [[Alan Smithee]]' );
}
public static function dataGetDefaultModelFor() {
return [
[ 'Help:Foo', CONTENT_MODEL_WIKITEXT ],
@ -409,4 +415,39 @@ class ContentHandlerTest extends MediaWikiTestCase {
$this->assertInstanceOf( $handlerClass, $handler );
}
/**
 * Checks that getDataForSearchIndex() produces the base fields and that
 * the SearchDataForIndex hook can contribute additional ones.
 * @covers ContentHandler::getDataForSearchIndex
 */
public function testDataIndexFields() {
	$engineMock = $this->getMock( 'SearchEngine' );
	$title = Title::newFromText( 'Not_Main_Page', NS_MAIN );
	$wikiPage = new WikiPage( $title );
	// Register a hook handler that injects an extra field, so we can
	// verify the hook is invoked with the field map.
	$this->setTemporaryHook( 'SearchDataForIndex',
		function ( &$fields, ContentHandler $handler, WikiPage $page, ParserOutput $output,
			SearchEngine $engine ) {
			$fields['testDataField'] = 'test content';
		} );
	$parserOutput = $wikiPage->getContent()->getParserOutput( $title );
	$data = $wikiPage->getContentHandler()
		->getDataForSearchIndex( $wikiPage, $parserOutput, $engineMock );
	$this->assertArrayHasKey( 'text', $data );
	$this->assertArrayHasKey( 'text_bytes', $data );
	$this->assertArrayHasKey( 'language', $data );
	$this->assertArrayHasKey( 'testDataField', $data );
	$this->assertEquals( 'test content', $data['testDataField'] );
}
/**
 * Checks that getParserOutputForIndexing() parses the page content.
 * @covers ContentHandler::getParserOutputForIndexing
 */
public function testParserOutputForIndexing() {
	$page = new WikiPage( Title::newFromText( 'Smithee', NS_MAIN ) );
	$parserOutput = $page->getContentHandler()->getParserOutputForIndexing( $page );
	$this->assertInstanceOf( ParserOutput::class, $parserOutput );
	// The fixture text from addDBDataOnce() must appear in the output.
	$this->assertContains( 'one who smiths', $parserOutput->getRawText() );
}
}

View file

@ -49,5 +49,4 @@ class TextContentHandlerTest extends MediaWikiLangTestCase {
$this->assertEquals( 'test', $mappedFields['language']['testData'] );
$this->assertEquals( 'language', $mappedFields['language']['name'] );
}
}

View file

@ -243,4 +243,20 @@ class WikitextContentHandlerTest extends MediaWikiLangTestCase {
) {
}
*/
/**
 * Checks that file text is exposed as 'file_text' for File-namespace pages.
 * @covers WikitextContentHandler::getDataForSearchIndex
 */
public function testDataIndexFieldsFile() {
	$engineMock = $this->getMock( 'SearchEngine' );
	$page = new WikiPage( Title::newFromText( 'Somefile.jpg', NS_FILE ) );
	// Stub out file-text extraction so the test needs no actual file.
	$handler = $this->getMockBuilder( WikitextContentHandler::class )
		->disableOriginalConstructor()
		->setMethods( [ 'getFileText' ] )
		->getMock();
	$handler->method( 'getFileText' )->will( $this->returnValue( 'This is file content' ) );
	$data = $handler->getDataForSearchIndex( $page, new ParserOutput(), $engineMock );
	$this->assertArrayHasKey( 'file_text', $data );
	$this->assertEquals( 'This is file content', $data['file_text'] );
}
}

View file

@ -0,0 +1,148 @@
<?php
// Tests for WikiTextStructure: categories, links, templates, headings and
// text-part extraction from parsed wikitext.
class WikitextStructureTest extends MediaWikiLangTestCase {
	/**
	 * Title used to parse the fixture wikitext.
	 * @return Title
	 */
	private function getMockTitle() {
		return Title::newFromText( "TestTitle" );
	}
	/**
	 * Get parser output for Wiki text
	 * @param string $text Wikitext to parse
	 * @return ParserOutput
	 */
	private function getParserOutput( $text ) {
		$content = new WikitextContent( $text );
		return $content->getParserOutput( $this->getMockTitle() );
	}
	/**
	 * Get WikitextStructure for given text
	 * @param string $text Wikitext to parse
	 * @return WikiTextStructure
	 */
	private function getStructure( $text ) {
		return new WikiTextStructure( $this->getParserOutput( $text ) );
	}
	// Duplicate category links should be reported only once.
	public function testCategories() {
		$text = <<<END
We also have a {{Template}} and an {{Another template}} in addition.
This text also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
And [[Category:Some Category| this category]] is repeated.
END;
		$struct = $this->getStructure( $text );
		$cats = $struct->categories();
		$this->assertCount( 2, $cats );
		$this->assertContains( "Some Category", $cats );
		$this->assertContains( "Yet another category", $cats );
	}
	// Outgoing links cover page links and template usages, deduplicated;
	// category and file links are not included.
	public function testOutgoingLinks() {
		$text = <<<END
Here I add link to [[Some Page]]. And [[Some Page|This same page]] gets linked twice.
We also have [[File:Image.jpg|image]].
We also have a {{Template}} and an {{Another template}} in addition.
Some templates are {{lowercase}}.
And [[Some_Page]] is linked again.
It also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
Also link to a [[Talk:TestTitle|talk page]] is here.
END;
		$struct = $this->getStructure( $text );
		$links = $struct->outgoingLinks();
		$this->assertContains( "Some_Page", $links );
		$this->assertContains( "Template:Template", $links );
		$this->assertContains( "Template:Another_template", $links );
		$this->assertContains( "Template:Lowercase", $links );
		$this->assertContains( "Talk:TestTitle", $links );
		$this->assertCount( 5, $links );
	}
	// templates() lists only templates whose title exists; the TitleExists
	// hook below makes every title except those starting with 'X' exist.
	public function testTemplates() {
		$text = <<<END
We have a {{Template}} and an {{Another template}} in addition.
Some templates are {{lowercase}}. And this {{Template}} is repeated.
Here is {{another_template|with=argument}}.
This is a template that {{Xdoes not exist}}.
END;
		$this->setTemporaryHook( 'TitleExists', function ( Title $title, &$exists ) {
			$txt = $title->getBaseText();
			if ( $txt[0] != 'X' ) {
				$exists = true;
			}
			return true;
		} );
		$struct = $this->getStructure( $text );
		$templates = $struct->templates();
		$this->assertCount( 3, $templates );
		$this->assertContains( "Template:Template", $templates );
		$this->assertContains( "Template:Another template", $templates );
		$this->assertContains( "Template:Lowercase", $templates );
	}
	// Headings are returned at every level with wiki/HTML markup and
	// reference-style <sup> tags stripped.
	public function testHeadings() {
		$text = <<<END
Some text here
== Heading one ==
Some text
==== heading two ====
More text
=== Applicability of the strict mass-energy equivalence formula, ''E'' = ''mc''<sup>2</sup> ===
and more text
== Wikitext '''in''' [[Heading]] and also <b>html</b> ==
more text
END;
		// FIXME: add test for ==== See also ==== after cirrussearch-ignored-headings is renamed
		$struct = $this->getStructure( $text );
		$headings = $struct->headings();
		$this->assertCount( 4, $headings );
		$this->assertContains( "Heading one", $headings );
		$this->assertContains( "heading two", $headings );
		$this->assertContains( "Applicability of the strict mass-energy equivalence formula, E = mc2",
			$headings );
		$this->assertContains( "Wikitext in Heading and also html", $headings );
	}
	// A page that starts directly with a heading still gets its headings.
	public function testHeadingsFirst() {
		$text = <<<END
== Heading one ==
Some text
==== heading two ====
END;
		$struct = $this->getStructure( $text );
		$headings = $struct->headings();
		$this->assertCount( 2, $headings );
		$this->assertContains( "Heading one", $headings );
		$this->assertContains( "heading two", $headings );
	}
	// No headings in the text means an empty headings list.
	public function testHeadingsNone() {
		$text = "This text is completely devoid of headings.";
		$struct = $this->getStructure( $text );
		$headings = $struct->headings();
		$this->assertArrayEquals( [], $headings );
	}
	// Opening text is the part before the first heading; main text joins
	// opening and body text; tables end up in the auxiliary text.
	public function testTexts() {
		$text = <<<END
Opening text is opening.
== Then comes header ==
Then we got more<br>text
=== And more headers ===
{| class="wikitable"
|-
! Header table
|-
| row in table
|-
| another row in table
|}
END;
		$struct = $this->getStructure( $text );
		$this->assertEquals( "Opening text is opening.", $struct->getOpeningText() );
		$this->assertEquals( "Opening text is opening. Then we got more text",
			$struct->getMainText() );
		$this->assertEquals( [ "Header table row in table another row in table" ],
			$struct->getAuxiliaryText() );
	}
}

View file

@ -185,8 +185,12 @@ class SearchEngineTest extends MediaWikiLangTestCase {
->willReturnCallback( $mockFieldBuilder );
// Not using mock since PHPUnit mocks do not work properly with references in params
$this->mergeMwGlobalArrayValue( 'wgHooks',
[ 'SearchIndexFields' => [ [ $this, 'hookSearchIndexFields', $mockFieldBuilder ] ] ] );
$this->setTemporaryHook( 'SearchIndexFields',
function ( &$fields, SearchEngine $engine ) use ( $mockFieldBuilder ) {
$fields['testField'] =
$mockFieldBuilder( "testField", SearchIndexField::INDEX_TYPE_TEXT );
return true;
} );
$fields = $mockEngine->getSearchIndexFields();
$this->assertArrayHasKey( 'language', $fields );
@ -197,9 +201,4 @@ class SearchEngineTest extends MediaWikiLangTestCase {
$this->assertArrayHasKey( 'testData', $mapping );
$this->assertEquals( 'test', $mapping['testData'] );
}
public function hookSearchIndexFields( $mockFieldBuilder, &$fields, SearchEngine $engine ) {
$fields['testField'] = $mockFieldBuilder( "testField", SearchIndexField::INDEX_TYPE_TEXT );
return true;
}
}