Make content handlers assemble content for search
Bug: T89733 Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4
This commit is contained in:
parent
b7c4c8717f
commit
add1ebe2ab
14 changed files with 635 additions and 15 deletions
|
|
@ -1539,6 +1539,7 @@ $wgAutoloadLocalClasses = [
|
|||
'WikiReference' => __DIR__ . '/includes/WikiMap.php',
|
||||
'WikiRevision' => __DIR__ . '/includes/import/WikiRevision.php',
|
||||
'WikiStatsOutput' => __DIR__ . '/maintenance/language/StatOutputs.php',
|
||||
'WikiTextStructure' => __DIR__ . '/includes/content/WikiTextStructure.php',
|
||||
'WikitextContent' => __DIR__ . '/includes/content/WikitextContent.php',
|
||||
'WikitextContentHandler' => __DIR__ . '/includes/content/WikitextContentHandler.php',
|
||||
'WinCacheBagOStuff' => __DIR__ . '/includes/libs/objectcache/WinCacheBagOStuff.php',
|
||||
|
|
|
|||
|
|
@ -2620,6 +2620,18 @@ search results.
|
|||
$title: Current Title object being displayed in search results.
|
||||
&$id: Revision ID (default is false, for latest)
|
||||
|
||||
'SearchIndexFields': Add fields to search index mapping.
|
||||
&$fields: Array of fields, all implement SearchIndexField
|
||||
$engine: SearchEngine instance for which mapping is being built.
|
||||
|
||||
'SearchDataForIndex': Add data to search document. Allows adding any data to
|
||||
the field map used to index the document.
|
||||
&$fields: Array of name => value pairs for fields
|
||||
$handler: ContentHandler for the content being indexed
|
||||
$page: WikiPage that is being indexed
|
||||
$output: ParserOutput that is produced from the page
|
||||
$engine: SearchEngine for which the indexing is intended
|
||||
|
||||
'SecondaryDataUpdates': Allows modification of the list of DataUpdates to
|
||||
perform when page content is modified. Currently called by
|
||||
AbstractContent::getSecondaryDataUpdates.
|
||||
|
|
|
|||
|
|
@ -1270,4 +1270,69 @@ abstract class ContentHandler {
|
|||
*/
|
||||
return [];
|
||||
}
|
||||
|
||||
/**
 * Register a new field definition in the given field array.
 *
 * The mapping object is built by the search engine and stored in $fields
 * under the field name, overwriting any entry already stored under that name.
 *
 * @param SearchIndexField[] &$fields Field definitions, modified in place
 * @param SearchEngine $engine Engine asked to build the field mapping
 * @param string $name Field name, also used as the array key
 * @param int $type Field type, passed through to the engine unchanged
 * @return SearchIndexField[] new field defs
 * @since 1.28
 */
protected function addSearchField( &$fields, SearchEngine $engine, $name, $type ) {
	$mapping = $engine->makeSearchFieldMapping( $name, $type );
	$fields[$name] = $mapping;

	return $fields;
}
|
||||
|
||||
/**
 * Build the name => value map that the search engine indexes for a page.
 *
 * When the page has content, the map starts with the searchable text (stored
 * under both 'text' and 'source_text') and the content size ('text_bytes').
 * The SearchDataForIndex hook then runs and may add or change entries.
 * Overriding classes should call this parent implementation or take care of
 * running the SearchDataForIndex hook themselves.
 *
 * @param WikiPage $page Page to index
 * @param ParserOutput $output
 * @param SearchEngine $engine Search engine for which we are indexing
 * @return array Map of name=>value for fields
 * @since 1.28
 */
public function getDataForSearchIndex( WikiPage $page, ParserOutput $output,
	SearchEngine $engine ) {
	$searchData = [];

	$pageContent = $page->getContent();
	if ( $pageContent ) {
		$indexText = $pageContent->getTextForSearchIndex();
		$searchData = [
			'text' => $indexText,
			'source_text' => $indexText,
			'text_bytes' => $pageContent->getSize(),
		];
	}

	// The map is handed over by reference so hook handlers can extend it.
	Hooks::run( 'SearchDataForIndex', [ &$searchData, $this, $page, $output, $engine ] );

	return $searchData;
}
|
||||
|
||||
/**
 * Obtain the parser output that should be used when indexing a page.
 *
 * If a parser cache is supplied it is consulted first; on a miss the page
 * content is parsed with canonical parser options and the result is saved
 * back into that cache. Without a cache the content is always parsed.
 *
 * Specific content handlers may override it if they need different content handling.
 *
 * @param WikiPage $page
 * @param ParserCache $cache Optional cache to read from and write to
 * @return ParserOutput
 */
public function getParserOutputForIndexing( WikiPage $page, ParserCache $cache = null ) {
	$options = $page->makeParserOptions( 'canonical' );
	$revisionId = $page->getRevision()->getId();

	$cached = $cache ? $cache->get( $page, $options ) : null;
	if ( !empty( $cached ) ) {
		return $cached;
	}

	// Nothing usable in the cache (or no cache at all): parse now.
	$fresh = $page->getContent()->getParserOutput( $page->getTitle(), $revisionId, $options );
	if ( $cache ) {
		$cache->save( $fresh, $page, $options );
	}

	return $fresh;
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -148,4 +148,13 @@ class TextContentHandler extends ContentHandler {
|
|||
$engine->makeSearchFieldMapping( 'language', SearchIndexField::INDEX_TYPE_KEYWORD );
|
||||
return $fields;
|
||||
}
|
||||
|
||||
/**
 * Extend the parent's index data with the code of the page language.
 *
 * @param WikiPage $page Page to index
 * @param ParserOutput $output
 * @param SearchEngine $engine Search engine for which we are indexing
 * @return array Map of name=>value for fields, including 'language'
 */
public function getDataForSearchIndex( WikiPage $page, ParserOutput $output,
	SearchEngine $engine ) {
	$data = parent::getDataForSearchIndex( $page, $output, $engine );

	$pageLanguage = $this->getPageLanguage( $page->getTitle(), $page->getContent() );
	$data['language'] = $pageLanguage->getCode();

	return $data;
}
|
||||
|
||||
}
|
||||
|
|
|
|||
277
includes/content/WikiTextStructure.php
Normal file
277
includes/content/WikiTextStructure.php
Normal file
|
|
@ -0,0 +1,277 @@
|
|||
<?php
|
||||
|
||||
use HtmlFormatter\HtmlFormatter;
|
||||
use MediaWiki\Logger\LoggerFactory;
|
||||
|
||||
/**
 * Class allowing to explore structure of parsed wikitext.
 *
 * Wraps a ParserOutput and exposes the pieces used for search indexing:
 * categories, outgoing links, templates, headings and the opening, main and
 * auxiliary text. The text parts are extracted lazily on first access.
 */
class WikiTextStructure {
	/**
	 * @var string|null Text before the first heading; null when none was found
	 */
	private $openingText;
	/**
	 * @var string|null Main text; null until extractWikitextParts() has run
	 */
	private $allText;
	/**
	 * @var string[] Text of the elements matched by $auxiliaryElementSelectors
	 */
	private $auxText = [];
	/**
	 * @var ParserOutput
	 */
	private $parserOutput;

	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		'audio', 'video', // "it looks like you don't have javascript enabled..."
		// do not need to index
		'sup.reference', // The [1] for references
		'.mw-cite-backlink', // The ↑ next to references in the references section
		'h1', 'h2', 'h3', // Headings are already indexed in their own field.
		'h5', 'h6', 'h4',
		'.autocollapse', // Collapsed fields are hidden by default so we don't want them
		// showing up.
	];

	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		'.thumbcaption', // Thumbnail captions aren't really part of the text proper
		'table', // Neither are tables
		'.rellink', // Common style for "See also:".
		'.dablink', // Common style for calling out helpful links at the top
		// of the article.
		'.searchaux', // New class users can use to mark stuff as auxiliary to searches.
	];

	/**
	 * WikiTextStructure constructor.
	 * @param ParserOutput $parserOutput
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get categories in the text.
	 *
	 * Category keys for which no Category object can be created are skipped.
	 *
	 * @return string[] Category titles, without namespace prefix
	 */
	public function categories() {
		$categories = [];
		foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) {
			$category = Category::newFromName( $key );
			if ( $category ) {
				$categories[] = $category->getTitle()->getText();
			}
		}
		return $categories;
	}

	/**
	 * Get outgoing links.
	 * @return string[] Prefixed DB keys of the linked pages
	 */
	public function outgoingLinks() {
		$outgoingLinks = [];
		foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) {
			foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) {
				$outgoingLinks[] =
					Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey();
			}
		}
		return $outgoingLinks;
	}

	/**
	 * Get templates in the text.
	 *
	 * Only templates whose title is valid and reported as existing are returned.
	 *
	 * @return string[] Prefixed text of the template titles
	 */
	public function templates() {
		$templates = [];
		foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) {
			foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
				$templateTitle = Title::makeTitleSafe( $tNS, $tDbKey );
				if ( $templateTitle && $templateTitle->exists() ) {
					$templates[] = $templateTitle->getPrefixedText();
				}
			}
		}
		return $templates;
	}

	/**
	 * Get headings on the page.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
	 * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
	 * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
	 * reference. And since the data looks like:
	 *      Reference in heading <sup>[1]</sup><sup>[2]</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings, excluding the ignored ones
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $heading ) {
			$heading = $heading[ 'line' ];

			// Some wikis wrap the brackets in a span:
			// http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work: brackets may arrive
			// as numeric character references. A plain string replacement is used
			// because a bare "[" is not a valid regular expression.
			$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. Strict comparison avoids numeric-string
			// coercion (e.g. "1.0" matching "1").
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Everything from a "#" to the end of a line is treated as a comment and
	 * removed, lines are trimmed, and lines that end up empty are dropped.
	 * The keys of the result are the original line indexes (not renumbered).
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
		$lines = array_map( 'trim', $lines ); // Remove extra spaces
		// Remove empty lines. Compare against '' explicitly so that a setting
		// consisting of the single character "0" is not thrown away.
		$lines = array_filter( $lines, function ( $line ) {
			return $line !== '';
		} );
		return $lines;
	}

	/**
	 * Get list of headings to ignore.
	 *
	 * The list is read from an i18n message once and then kept in a static
	 * variable for subsequent calls.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			// FIXME: will be renamed in next patches to search-ignored-headings
			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
			$ignoredHeadings = [];
			if ( !$source->isDisabled() ) {
				$lines = self::parseSettingsInMessage( $source->plain() );
				$ignoredHeadings = $lines; // Now we just have headings!
			}
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing if the main text has already been extracted. Note that this
	 * switches off edit section tokens and the table of contents on the wrapped
	 * ParserOutput before reading its text.
	 */
	private function extractWikitextParts() {
		if ( $this->allText !== null ) {
			return;
		}
		$this->parserOutput->setEditSectionTokens( false );
		$this->parserOutput->setTOCEnabled( false );
		$text = $this->parserOutput->getText();
		if ( strlen( $text ) === 0 ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}

		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text Rendered HTML of the page
	 * @return string|null Filtered plain text, or null when there is no heading
	 *  or no text remains in front of it
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		// Keep only what precedes the byte offset of the first heading tag.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		if ( !$text ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( !$text ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get opening text
	 * @return string|null Null when no opening text was found
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}
}
|
||||
|
|
@ -145,4 +145,44 @@ class WikitextContentHandler extends TextContentHandler {
|
|||
return $fields;
|
||||
}
|
||||
|
||||
/**
 * Extract text of the file
 * TODO: probably should go to file handler?
 *
 * Returns null when the local file does not exist, when no media handler is
 * available for it, or when the handler yields no text.
 *
 * @param Title $title Title of the file page
 * @return string|null
 */
protected function getFileText( Title $title ) {
	$file = wfLocalFile( $title );
	if ( !$file || !$file->exists() ) {
		return null;
	}

	// Not every file has a media handler; calling getEntireText() on a
	// missing handler would be a fatal error, so bail out first.
	$handler = $file->getHandler();
	if ( !$handler ) {
		return null;
	}

	// Normalise a non-string "no text" result to the documented null.
	$text = $handler->getEntireText( $file );
	return is_string( $text ) ? $text : null;
}
|
||||
|
||||
/**
 * Extend the parent's index data with fields derived from parsed wikitext.
 *
 * Adds links, categories, headings, templates and the opening/auxiliary
 * text, and replaces the parent's 'text' field with the main text taken
 * from the parser output. For pages in the File namespace the text
 * extracted from the file is added as 'file_text' when there is any.
 *
 * @param WikiPage $page Page to index
 * @param ParserOutput $parserOutput
 * @param SearchEngine $engine Search engine for which we are indexing
 * @return array Map of name=>value for fields
 */
public function getDataForSearchIndex( WikiPage $page, ParserOutput $parserOutput,
	SearchEngine $engine ) {
	$data = parent::getDataForSearchIndex( $page, $parserOutput, $engine );
	$wikitext = new WikiTextStructure( $parserOutput );

	$derived = [
		'external_link' => array_keys( $parserOutput->getExternalLinks() ),
		'category' => $wikitext->categories(),
		'heading' => $wikitext->headings(),
		'outgoing_link' => $wikitext->outgoingLinks(),
		'template' => $wikitext->templates(),
		// text fields
		'opening_text' => $wikitext->getOpeningText(),
		'text' => $wikitext->getMainText(), // overwrites one from ContentHandler
		'auxiliary_text' => $wikitext->getAuxiliaryText(),
	];
	foreach ( $derived as $fieldName => $fieldValue ) {
		$data[$fieldName] = $fieldValue;
	}

	$pageTitle = $page->getTitle();
	if ( $pageTitle->getNamespace() == NS_FILE ) {
		$extracted = $this->getFileText( $pageTitle );
		if ( $extracted ) {
			$data['file_text'] = $extracted;
		}
	}

	return $data;
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1043,14 +1043,16 @@ class WikiPage implements Page, IDBAccessObject {
|
|||
*
|
||||
* @since 1.19
|
||||
* @param ParserOptions $parserOptions ParserOptions to use for the parse operation
|
||||
* @param null|int $oldid Revision ID to get the text from, passing null or 0 will
|
||||
* get the current revision (default value)
|
||||
*
|
||||
* @return ParserOutput|bool ParserOutput or false if the revision was not found
|
||||
* @param null|int $oldid Revision ID to get the text from, passing null or 0 will
|
||||
* get the current revision (default value)
|
||||
* @param bool $forceParse Force a fresh parse, skipping the parser cache check
|
||||
* @return bool|ParserOutput ParserOutput or false if the revision was not found
|
||||
*/
|
||||
public function getParserOutput( ParserOptions $parserOptions, $oldid = null ) {
|
||||
public function getParserOutput( ParserOptions $parserOptions, $oldid = null,
|
||||
$forceParse = false ) {
|
||||
|
||||
$useParserCache = $this->shouldCheckParserCache( $parserOptions, $oldid );
|
||||
$useParserCache =
|
||||
( !$forceParse ) && $this->shouldCheckParserCache( $parserOptions, $oldid );
|
||||
wfDebug( __METHOD__ .
|
||||
': using parser cache: ' . ( $useParserCache ? 'yes' : 'no' ) . "\n" );
|
||||
if ( $parserOptions->getStubThreshold() ) {
|
||||
|
|
|
|||
|
|
@ -659,7 +659,7 @@ abstract class SearchEngine {
|
|||
* Create a search field definition.
|
||||
* Specific search engines should override this method to create search fields.
|
||||
* @param string $name
|
||||
* @param int $type
|
||||
* @param int $type One of the types in SearchIndexField::INDEX_TYPE_*
|
||||
* @return SearchIndexField
|
||||
* @since 1.28
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1775,4 +1775,15 @@ abstract class MediaWikiTestCase extends PHPUnit_Framework_TestCase {
|
|||
return $buffer;
|
||||
}
|
||||
|
||||
/**
 * Install a hook handler for the duration of a test; it is reset by tearDown.
 * Any other handlers registered for the same hook are replaced.
 *
 * @param string $hookName Hook name
 * @param mixed $handler Value suitable for a hook handler
 * @since 1.28
 */
protected function setTemporaryHook( $hookName, $handler ) {
	$hookOverride = [ $hookName => [ $handler ] ];
	$this->mergeMwGlobalArrayValue( 'wgHooks', $hookOverride );
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ use MediaWiki\MediaWikiServices;
|
|||
|
||||
/**
|
||||
* @group ContentHandler
|
||||
* @group Database
|
||||
*/
|
||||
class ContentHandlerTest extends MediaWikiTestCase {
|
||||
|
||||
|
|
@ -52,6 +53,11 @@ class ContentHandlerTest extends MediaWikiTestCase {
|
|||
parent::tearDown();
|
||||
}
|
||||
|
||||
/**
 * Insert the fixture pages used by the search indexing tests.
 */
public function addDBDataOnce() {
	$fixtures = [
		'Not_Main_Page' => 'This is not a main page',
		'Smithee' => 'A smithee is one who smiths. See also [[Alan Smithee]]',
	];
	foreach ( $fixtures as $pageName => $wikitext ) {
		$this->insertPage( $pageName, $wikitext );
	}
}
|
||||
|
||||
public static function dataGetDefaultModelFor() {
|
||||
return [
|
||||
[ 'Help:Foo', CONTENT_MODEL_WIKITEXT ],
|
||||
|
|
@ -409,4 +415,39 @@ class ContentHandlerTest extends MediaWikiTestCase {
|
|||
$this->assertInstanceOf( $handlerClass, $handler );
|
||||
}
|
||||
|
||||
/**
 * The index data must contain the default fields plus whatever the
 * SearchDataForIndex hook adds.
 *
 * @covers ContentHandler::getDataForSearchIndex
 */
public function testDataIndexFields() {
	// Hook handler that injects one extra field into the index data.
	$this->setTemporaryHook( 'SearchDataForIndex',
		function ( &$fields, ContentHandler $handler, WikiPage $page, ParserOutput $output,
			SearchEngine $engine ) {
			$fields['testDataField'] = 'test content';
		} );

	$engine = $this->getMock( 'SearchEngine' );
	$pageTitle = Title::newFromText( 'Not_Main_Page', NS_MAIN );
	$wikiPage = new WikiPage( $pageTitle );

	$parserOutput = $wikiPage->getContent()->getParserOutput( $pageTitle );
	$indexData = $wikiPage->getContentHandler()
		->getDataForSearchIndex( $wikiPage, $parserOutput, $engine );

	foreach ( [ 'text', 'text_bytes', 'language', 'testDataField' ] as $expectedKey ) {
		$this->assertArrayHasKey( $expectedKey, $indexData );
	}
	$this->assertEquals( 'test content', $indexData['testDataField'] );
}
|
||||
|
||||
/**
 * Without a parser cache the handler must still return a ParserOutput
 * whose raw text contains the page text.
 *
 * @covers ContentHandler::getParserOutputForIndexing
 */
public function testParserOutputForIndexing() {
	$wikiPage = new WikiPage( Title::newFromText( 'Smithee', NS_MAIN ) );
	$contentHandler = $wikiPage->getContentHandler();

	$parserOutput = $contentHandler->getParserOutputForIndexing( $wikiPage );

	$this->assertInstanceOf( ParserOutput::class, $parserOutput );
	$this->assertContains( 'one who smiths', $parserOutput->getRawText() );
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,5 +49,4 @@ class TextContentHandlerTest extends MediaWikiLangTestCase {
|
|||
$this->assertEquals( 'test', $mappedFields['language']['testData'] );
|
||||
$this->assertEquals( 'language', $mappedFields['language']['name'] );
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -243,4 +243,20 @@ class WikitextContentHandlerTest extends MediaWikiLangTestCase {
|
|||
) {
|
||||
}
|
||||
*/
|
||||
|
||||
/**
 * For a page in the File namespace the text returned by getFileText()
 * must show up as the 'file_text' field.
 */
public function testDataIndexFieldsFile() {
	// Partial mock: only getFileText() is stubbed, everything else is real.
	$contentHandler = $this->getMockBuilder( WikitextContentHandler::class )
		->disableOriginalConstructor()
		->setMethods( [ 'getFileText' ] )
		->getMock();
	$contentHandler->method( 'getFileText' )
		->will( $this->returnValue( 'This is file content' ) );

	$engine = $this->getMock( 'SearchEngine' );
	$filePage = new WikiPage( Title::newFromText( 'Somefile.jpg', NS_FILE ) );

	$indexData = $contentHandler->getDataForSearchIndex( $filePage, new ParserOutput(), $engine );

	$this->assertArrayHasKey( 'file_text', $indexData );
	$this->assertEquals( 'This is file content', $indexData['file_text'] );
}
|
||||
}
|
||||
|
|
|
|||
148
tests/phpunit/includes/content/WikitextStructureTest.php
Normal file
148
tests/phpunit/includes/content/WikitextStructureTest.php
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
<?php
|
||||
|
||||
class WikitextStructureTest extends MediaWikiLangTestCase {

	/**
	 * Title against which the test wikitext is parsed.
	 * @return Title
	 */
	private function getMockTitle() {
		return Title::newFromText( "TestTitle" );
	}

	/**
	 * Parse wikitext into a ParserOutput
	 * @param string $text Wikitext to parse
	 * @return ParserOutput
	 */
	private function getParserOutput( $text ) {
		$wikitextContent = new WikitextContent( $text );
		$parsed = $wikitextContent->getParserOutput( $this->getMockTitle() );
		return $parsed;
	}

	/**
	 * Build the WikiTextStructure under test for the given wikitext
	 * @param string $text Wikitext to parse
	 * @return WikiTextStructure
	 */
	private function getStructure( $text ) {
		$parserOutput = $this->getParserOutput( $text );
		return new WikiTextStructure( $parserOutput );
	}

	public function testCategories() {
		$wikitext = <<<END
We also have a {{Template}} and an {{Another template}} in addition.
This text also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
And [[Category:Some Category| this category]] is repeated.
END;
		$found = $this->getStructure( $wikitext )->categories();
		// The repeated category is reported only once.
		$this->assertCount( 2, $found );
		foreach ( [ "Some Category", "Yet another category" ] as $expected ) {
			$this->assertContains( $expected, $found );
		}
	}

	public function testOutgoingLinks() {
		$wikitext = <<<END
Here I add link to [[Some Page]]. And [[Some Page|This same page]] gets linked twice.
We also have [[File:Image.jpg|image]].
We also have a {{Template}} and an {{Another template}} in addition.
Some templates are {{lowercase}}.
And [[Some_Page]] is linked again.
It also has [[Category:Some Category| ]] and then [[Category:Yet another category]].
Also link to a [[Talk:TestTitle|talk page]] is here.
END;
		$found = $this->getStructure( $wikitext )->outgoingLinks();
		$expectedLinks = [
			"Some_Page",
			"Template:Template",
			"Template:Another_template",
			"Template:Lowercase",
			"Talk:TestTitle",
		];
		foreach ( $expectedLinks as $expected ) {
			$this->assertContains( $expected, $found );
		}
		$this->assertCount( 5, $found );
	}

	public function testTemplates() {
		$wikitext = <<<END
We have a {{Template}} and an {{Another template}} in addition.
Some templates are {{lowercase}}. And this {{Template}} is repeated.
Here is {{another_template|with=argument}}.
This is a template that {{Xdoes not exist}}.
END;
		// Pretend that every page exists unless its name starts with "X".
		$this->setTemporaryHook( 'TitleExists', function ( Title $title, &$exists ) {
			$baseText = $title->getBaseText();
			if ( $baseText[0] != 'X' ) {
				$exists = true;
			}
			return true;
		} );
		$found = $this->getStructure( $wikitext )->templates();
		$this->assertCount( 3, $found );
		$expectedTemplates = [
			"Template:Template",
			"Template:Another template",
			"Template:Lowercase",
		];
		foreach ( $expectedTemplates as $expected ) {
			$this->assertContains( $expected, $found );
		}
	}

	public function testHeadings() {
		$wikitext = <<<END
Some text here
== Heading one ==
Some text
==== heading two ====
More text
=== Applicability of the strict mass-energy equivalence formula, ''E'' = ''mc''<sup>2</sup> ===
and more text
== Wikitext '''in''' [[Heading]] and also <b>html</b> ==
more text
END;
		// FIXME: add test for ==== See also ==== after cirrussearch-ignored-headings is renamed
		$found = $this->getStructure( $wikitext )->headings();
		$this->assertCount( 4, $found );
		$expectedHeadings = [
			"Heading one",
			"heading two",
			"Applicability of the strict mass-energy equivalence formula, E = mc2",
			"Wikitext in Heading and also html",
		];
		foreach ( $expectedHeadings as $expected ) {
			$this->assertContains( $expected, $found );
		}
	}

	public function testHeadingsFirst() {
		$wikitext = <<<END
== Heading one ==
Some text
==== heading two ====
END;
		$found = $this->getStructure( $wikitext )->headings();
		$this->assertCount( 2, $found );
		foreach ( [ "Heading one", "heading two" ] as $expected ) {
			$this->assertContains( $expected, $found );
		}
	}

	public function testHeadingsNone() {
		$structure = $this->getStructure( "This text is completely devoid of headings." );
		$this->assertArrayEquals( [], $structure->headings() );
	}

	public function testTexts() {
		$wikitext = <<<END
Opening text is opening.
== Then comes header ==
Then we got more<br>text
=== And more headers ===
{| class="wikitable"
|-
! Header table
|-
| row in table
|-
| another row in table
|}
END;
		$structure = $this->getStructure( $wikitext );
		$this->assertEquals( "Opening text is opening.", $structure->getOpeningText() );
		$this->assertEquals( "Opening text is opening.   Then we got more text",
			$structure->getMainText() );
		$this->assertEquals( [ "Header table  row in table  another row in table" ],
			$structure->getAuxiliaryText() );
	}
}
|
||||
|
|
@ -185,8 +185,12 @@ class SearchEngineTest extends MediaWikiLangTestCase {
|
|||
->willReturnCallback( $mockFieldBuilder );
|
||||
|
||||
// Not using mock since PHPUnit mocks do not work properly with references in params
|
||||
$this->mergeMwGlobalArrayValue( 'wgHooks',
|
||||
[ 'SearchIndexFields' => [ [ $this, 'hookSearchIndexFields', $mockFieldBuilder ] ] ] );
|
||||
$this->setTemporaryHook( 'SearchIndexFields',
|
||||
function ( &$fields, SearchEngine $engine ) use ( $mockFieldBuilder ) {
|
||||
$fields['testField'] =
|
||||
$mockFieldBuilder( "testField", SearchIndexField::INDEX_TYPE_TEXT );
|
||||
return true;
|
||||
} );
|
||||
|
||||
$fields = $mockEngine->getSearchIndexFields();
|
||||
$this->assertArrayHasKey( 'language', $fields );
|
||||
|
|
@ -197,9 +201,4 @@ class SearchEngineTest extends MediaWikiLangTestCase {
|
|||
$this->assertArrayHasKey( 'testData', $mapping );
|
||||
$this->assertEquals( 'test', $mapping['testData'] );
|
||||
}
|
||||
|
||||
public function hookSearchIndexFields( $mockFieldBuilder, &$fields, SearchEngine $engine ) {
|
||||
$fields['testField'] = $mockFieldBuilder( "testField", SearchIndexField::INDEX_TYPE_TEXT );
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue