Make the doc building for search aware of the revision

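Added an optional RevisionRecord param to:
- ContentHandler::getParserOutputForIndexing
- ContentHandler::getDataForSearchIndex

Added the SearchDataForIndex2 hook, which also passes the RevisionRecord,
and deprecated the SearchDataForIndex hook in its favor.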

So that they have a chance to build the content related to a specific
revision.

Ultimately we'd like to make this parameter mandatory.

Bug: T317309
Depends-On: I8b220cd6c4aeeca1d924bdd527409b8602318944
Depends-On: I8616b611caab3f5fa97ff0e655b19c3034304597
Change-Id: I3298ce7591069eb32f624b2c9fbb6de58ae04a29
This commit is contained in:
David Causse 2022-09-19 08:54:15 +02:00
parent f3566aacb8
commit 9fbd8f500f
10 changed files with 105 additions and 18 deletions

View file

@ -66,6 +66,10 @@ For notes on 1.39.x and older releases, see HISTORY.
passed to var_dump(), to make its use for debugging more feasible.
* Added 'GetBlockErrorMessageKey' hook, allow extensions'
block error messages to be received and displayed by BlockErrorFormatter.
* Added an optional RevisionRecord param to:
- ContentHandler::getParserOutputForIndexing
- ContentHandler::getDataForSearchIndex
* The SearchDataForIndex hook is deprecated in favor of the new
SearchDataForIndex2 hook, which also receives the RevisionRecord being indexed.
* …
=== External library changes in 1.40 ===

View file

@ -61,6 +61,7 @@ class DeprecatedHooks {
'SkinTemplateNavigation' => [ 'deprecatedVersion' => '1.39' ],
'SkinTemplateNavigation::SpecialPage' => [ 'deprecatedVersion' => '1.39' ],
'PersonalUrls' => [ 'deprecatedVersion' => '1.39' ],
'SearchDataForIndex' => [ 'deprecatedVersion' => '1.40', 'silent' => true ],
];
/**

View file

@ -77,6 +77,7 @@ class HookRunner implements
\MediaWiki\Content\Hook\PageContentLanguageHook,
\MediaWiki\Content\Hook\PlaceNewSectionHook,
\MediaWiki\Content\Hook\SearchDataForIndexHook,
\MediaWiki\Content\Hook\SearchDataForIndex2Hook,
\MediaWiki\Specials\Contribute\Hook\ContributeCardsHook,
\MediaWiki\Diff\Hook\AbortDiffCacheHook,
\MediaWiki\Diff\Hook\ArticleContentOnDiffHook,
@ -3294,15 +3295,22 @@ class HookRunner implements
);
}
public function onSearchDataForIndex( &$fields, $handler, $page, $output,
$engine
) {
public function onSearchDataForIndex( &$fields, $handler, $page, $output, $engine ) {
return $this->container->run(
'SearchDataForIndex',
[ &$fields, $handler, $page, $output, $engine ]
);
}
public function onSearchDataForIndex2( array &$fields, \ContentHandler $handler,
\WikiPage $page, \ParserOutput $output, \SearchEngine $engine, RevisionRecord $revision
) {
return $this->container->run(
'SearchDataForIndex2',
[ &$fields, $handler, $page, $output, $engine, $revision ]
);
}
public function onSearchGetNearMatch( $term, &$title ) {
return $this->container->run(
'SearchGetNearMatch',

View file

@ -39,6 +39,7 @@ use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Revision\SlotRenderingProvider;
use MediaWiki\Search\ParserOutputSearchDataExtractor;
use Wikimedia\Assert\Assert;
use Wikimedia\ScopedCallback;
/**
@ -1380,16 +1381,32 @@ abstract class ContentHandler {
* @param WikiPage $page Page to index
* @param ParserOutput $output
* @param SearchEngine $engine Search engine for which we are indexing
* @return array Map of name=>value for fields
* @param RevisionRecord|null $revision Revision whose content should be indexed; if not
* provided, the latest revision from WikiPage::getRevisionRecord() is used
* @return array Map of name=>value for fields. An empty array is returned if the latest
* revision cannot be retrieved.
* @since 1.28
*/
public function getDataForSearchIndex(
WikiPage $page,
ParserOutput $output,
SearchEngine $engine
SearchEngine $engine,
RevisionRecord $revision = null
) {
$fieldData = [];
$content = $page->getContent();
$revision = $revision ?? $page->getRevisionRecord();
if ( $revision === null ) {
LoggerFactory::getInstance( 'search' )->warning(
"Called getDataForSearchIndex on the page {page_id} for which the " .
"latest revision cannot be loaded.",
[ "page_id" => $page->getId() ]
);
return [];
}
Assert::invariant( $revision->getPageId() === $page->getId(),
'$revision and $page must target the same page_id' );
$content = $revision->getContent( SlotRecord::MAIN );
if ( $content ) {
$searchDataExtractor = new ParserOutputSearchDataExtractor();
@ -1408,6 +1425,8 @@ abstract class ContentHandler {
}
$this->getHookRunner()->onSearchDataForIndex( $fieldData, $this, $page, $output, $engine );
$this->getHookRunner()->onSearchDataForIndex2( $fieldData, $this, $page, $output, $engine, $revision );
return $fieldData;
}
@ -1426,10 +1445,15 @@ abstract class ContentHandler {
*
* @param WikiPage $page
* @param ParserCache|null $cache deprecated since 1.38 and won't have any effect
* @param RevisionRecord|null $revision Revision to get the ParserOutput for; passed
* through to ParserOutputAccess::getParserOutput()
* @return ParserOutput|null null when the ParserOutput cannot be obtained
* @see ParserOutputAccess::getParserOutput() for failure modes
*/
public function getParserOutputForIndexing( WikiPage $page, ParserCache $cache = null ) {
public function getParserOutputForIndexing(
WikiPage $page,
ParserCache $cache = null,
RevisionRecord $revision = null
) {
// TODO: MCR: ContentHandler should be called per slot, not for the whole page.
// See T190066.
$parserOptions = $page->makeParserOptions( 'canonical' );
@ -1437,7 +1461,7 @@ abstract class ContentHandler {
return $parserOutputAccess->getParserOutput(
$page,
$parserOptions,
null,
$revision,
ParserOutputAccess::OPT_NO_UPDATE_CACHE
)->getValue();
}

View file

@ -1,6 +1,7 @@
<?php
use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\RevisionRecord;
/**
* Content handler for File: files
@ -36,7 +37,8 @@ class FileContentHandler extends WikitextContentHandler {
public function getDataForSearchIndex(
WikiPage $page,
ParserOutput $parserOutput,
SearchEngine $engine
SearchEngine $engine,
?RevisionRecord $revision = null
) {
$fields = [];

View file

@ -0,0 +1,42 @@
<?php
namespace MediaWiki\Content\Hook;
use ContentHandler;
use MediaWiki\Revision\RevisionRecord;
use ParserOutput;
use SearchEngine;
use WikiPage;
/**
* This is a hook handler interface, see docs/Hooks.md.
* Use the hook name "SearchDataForIndex2" to register handlers implementing this interface.
*
* Supersedes the SearchDataForIndex hook (deprecated since 1.40): handlers of this
* hook additionally receive the RevisionRecord being indexed.
*
* @stable to implement
* @ingroup Hooks
*/
interface SearchDataForIndex2Hook {
/**
* Use this hook to add data to search document. Allows you to add any data to
* the field map used to index the document.
*
* @since 1.40
*
* @param array &$fields Array of name => value pairs for fields
* @param ContentHandler $handler ContentHandler for the content being indexed
* @param WikiPage $page WikiPage that is being indexed
* @param ParserOutput $output ParserOutput that is produced from the page
* @param SearchEngine $engine SearchEngine for which the indexing is intended
* @param RevisionRecord $revision RevisionRecord being indexed
* @return bool|void True or no return value to continue or false to abort
*/
public function onSearchDataForIndex2(
array &$fields,
ContentHandler $handler,
WikiPage $page,
ParserOutput $output,
SearchEngine $engine,
RevisionRecord $revision
);
}

View file

@ -12,6 +12,7 @@ use WikiPage;
* Use the hook name "SearchDataForIndex" to register handlers implementing this interface.
*
* @stable to implement
* @deprecated since 1.40, use SearchDataForIndex2Hook instead.
* @ingroup Hooks
*/
interface SearchDataForIndexHook {

View file

@ -27,6 +27,7 @@ use MediaWiki\Content\Renderer\ContentParseParams;
use MediaWiki\Content\Transform\PreSaveTransformParams;
use MediaWiki\MainConfigNames;
use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\RevisionRecord;
/**
* Base content handler implementation for flat text contents.
@ -157,9 +158,10 @@ class TextContentHandler extends ContentHandler {
public function getDataForSearchIndex(
WikiPage $page,
ParserOutput $output,
SearchEngine $engine
SearchEngine $engine,
?RevisionRecord $revision = null
) {
$fields = parent::getDataForSearchIndex( $page, $output, $engine );
$fields = parent::getDataForSearchIndex( $page, $output, $engine, $revision );
$fields['language'] =
$this->getPageLanguage( $page->getTitle(), $page->getContent() )->getCode();
return $fields;

View file

@ -29,6 +29,7 @@ use MediaWiki\Content\Transform\PreSaveTransformParams;
use MediaWiki\Languages\LanguageNameUtils;
use MediaWiki\MediaWikiServices;
use MediaWiki\Parser\ParserOutputFlags;
use MediaWiki\Revision\RevisionRecord;
/**
* Content handler for wiki text pages.
@ -157,9 +158,10 @@ class WikitextContentHandler extends TextContentHandler {
public function getDataForSearchIndex(
WikiPage $page,
ParserOutput $parserOutput,
SearchEngine $engine
SearchEngine $engine,
?RevisionRecord $revision = null
) {
$fields = parent::getDataForSearchIndex( $page, $parserOutput, $engine );
$fields = parent::getDataForSearchIndex( $page, $parserOutput, $engine, $revision );
$structure = new WikiTextStructure( $parserOutput );
$fields['heading'] = $structure->headings();
@ -172,7 +174,7 @@ class WikitextContentHandler extends TextContentHandler {
// Until we have full first-class content handler for files, we invoke it explicitly here
if ( $page->getTitle()->getNamespace() === NS_FILE ) {
$fields = array_merge( $fields,
$this->getFileHandler()->getDataForSearchIndex( $page, $parserOutput, $engine ) );
$this->getFileHandler()->getDataForSearchIndex( $page, $parserOutput, $engine, $revision ) );
}
return $fields;
}

View file

@ -444,9 +444,9 @@ class ContentHandlerTest extends MediaWikiIntegrationTestCase {
$fields['testDataField'] = 'test content';
} );
$contentRenderer = $this->getServiceContainer()->getContentRenderer();
$output = $contentRenderer->getParserOutput( $page->getContent(), $title );
$data = $page->getContentHandler()->getDataForSearchIndex( $page, $output, $mockEngine );
$revision = $page->getRevisionRecord();
$output = $page->getContentHandler()->getParserOutputForIndexing( $page, null, $revision );
$data = $page->getContentHandler()->getDataForSearchIndex( $page, $output, $mockEngine, $revision );
$this->assertArrayHasKey( 'text', $data );
$this->assertArrayHasKey( 'text_bytes', $data );
$this->assertArrayHasKey( 'language', $data );
@ -461,8 +461,9 @@ class ContentHandlerTest extends MediaWikiIntegrationTestCase {
public function testParserOutputForIndexing() {
$title = Title::newFromText( 'Smithee', NS_MAIN );
$page = $this->getServiceContainer()->getWikiPageFactory()->newFromTitle( $title );
$revision = $page->getRevisionRecord();
$out = $page->getContentHandler()->getParserOutputForIndexing( $page );
$out = $page->getContentHandler()->getParserOutputForIndexing( $page, null, $revision );
$this->assertInstanceOf( ParserOutput::class, $out );
$this->assertStringContainsString( 'one who smiths', $out->getRawText() );
}