2016-05-16 20:24:10 +00:00
|
|
|
|
<?php
|
|
|
|
|
|
|
2024-08-06 13:40:20 +00:00
|
|
|
|
namespace MediaWiki\Content;
|
|
|
|
|
|
|
2016-05-16 20:24:10 +00:00
|
|
|
|
use HtmlFormatter\HtmlFormatter;
|
2023-12-14 19:20:33 +00:00
|
|
|
|
use MediaWiki\Parser\ParserOutput;
|
2023-09-19 16:59:47 +00:00
|
|
|
|
use MediaWiki\Parser\Sanitizer;
|
2016-05-16 20:24:10 +00:00
|
|
|
|
|
|
|
|
|
|
/**
 * Class allowing to explore the structure of parsed wikitext.
 *
 * Wraps a ParserOutput and derives from it the pieces used for search
 * indexing: section headings, the text before the first heading, the main
 * text, the auxiliary text and the "defaultsort" page property. The text
 * parts are extracted on first access and then kept on the instance.
 */
class WikiTextStructure {

	/** Text before the first heading; null when there is none (or before extraction). */
	private ?string $openingText = null;

	/** Main text; null means extractWikitextParts() has not run yet. */
	private ?string $allText = null;

	/** @var string[] Text of the elements matched by AUXILIARY_ELEMENT_SELECTORS */
	private array $auxText = [];

	private ParserOutput $parserOutput;

	/**
	 * Selectors to elements that are excluded entirely from search
	 */
	private const EXCLUDED_ELEMENT_SELECTORS = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references from Cite
		'sup.reference',
		// The ↑ next to references in the references section from Cite
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default, so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable',
		// User-facing interface code prompting the user to act from WikibaseMediaInfo
		'.wbmi-entityview-emptyCaption',
	];

	/**
	 * Selectors to elements that are considered auxiliary to the article text for search
	 */
	private const AUXILIARY_ELEMENT_SELECTORS = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		'figcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Gets headings from the page.
	 *
	 * Each heading is taken from the table-of-contents data, has reference
	 * markers and all tags stripped, and is dropped if it is listed in the
	 * 'search-ignored-headings' message.
	 *
	 * First strip out things that look like references. We can't use HTML filtering because
	 * the references come back as <sup> tags without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.
	 *
	 * We also don't want to strip the <sup> tag and remove everything that looks like [2] because,
	 * I don't know, maybe there is a band named Word [2] Foo or something. Whatever.
	 *
	 * So we only strip things that look like <sup> tags wrapping a reference. And since the data
	 * looks like:
	 *      Reference in heading <sup>[1]</sup><sup>[2]</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[]
	 */
	public function headings() {
		$tocData = $this->parserOutput->getTOCData();
		if ( $tocData === null ) {
			return [];
		}

		$ignoredHeadings = $this->getIgnoredHeadings();
		$headings = [];
		foreach ( $tocData->getSections() as $section ) {
			$heading = $section->line;

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize the numeric character references for [ and ] so the
			// following regexp would work. These are fixed strings, so a plain
			// string replacement is enough.
			$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. Compare strictly so that numeric-looking
			// headings ("1.0" vs "1") are not treated as the same heading.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}

		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * The message is split on newlines; everything from a '#' to the end of a
	 * line is treated as a comment and removed; each line is trimmed; lines
	 * that are empty afterwards are dropped. The keys of the returned array are
	 * the original line indexes, so they are not necessarily consecutive.
	 *
	 * @param string $message
	 *
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );

		// Remove empty lines. Compare against '' explicitly: the default
		// array_filter() callback would also throw away a line that is "0".
		return array_filter(
			$lines,
			static fn ( $line ) => $line !== ''
		);
	}

	/**
	 * Gets a list of heading to ignore.
	 *
	 * The list is read from the 'search-ignored-headings' message in the
	 * content language and kept in a function-static variable, so the message
	 * is only parsed once and the result is shared by all instances.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings(): array {
		static $ignoredHeadings = null;

		if ( $ignoredHeadings === null ) {
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			// A disabled message means nothing is ignored; otherwise every
			// remaining line of the message is one heading.
			$ignoredHeadings = $source->isDisabled()
				? []
				: self::parseSettingsInMessage( $source->plain() );
		}

		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing when the parts have already been extracted ($allText set).
	 */
	private function extractWikitextParts(): void {
		if ( $this->allText !== null ) {
			return;
		}

		$text = $this->parserOutput->getRawText();
		if ( $text === '' ) {
			// empty text - nothing to seek here
			$this->allText = "";
			return;
		}

		$this->openingText = $this->extractTextBeforeFirstHeading( $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text. These will still be
		// searched, but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$auxiliaryElements = $formatter->filterContent();

		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * Returns null when the text contains no <h1>..<h6> tag, when nothing
	 * precedes the first such tag, or when nothing is left of the preceding
	 * markup after removing excluded/auxiliary elements and stripping tags.
	 *
	 * @param string $text
	 *
	 * @return string|null
	 */
	private function extractTextBeforeFirstHeading( $text ): ?string {
		$matches = [];
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading, so we interpret this as the article
			// being entirely without heading.
			return null;
		}

		// Keep only what precedes the byte offset of the first heading tag.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		if ( $text === '' ) {
			// There isn't any text before the first heading, so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		// Compare against '' rather than relying on truthiness, so that an
		// opening text of "0" is still reported.
		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading, so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get the text before the first heading, extracting the text parts if needed.
	 *
	 * @return string|null
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();

		return $this->openingText;
	}

	/**
	 * Get the main text (excluded and auxiliary elements removed, tags
	 * stripped), extracting the text parts if needed.
	 *
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();

		return $this->allText;
	}

	/**
	 * Get the text of each auxiliary element, extracting the text parts if needed.
	 *
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();

		return $this->auxText;
	}

	/**
	 * Get the "defaultsort" property
	 *
	 * @return string|null
	 */
	public function getDefaultSort() {
		$sort = $this->parserOutput->getPageProperty( 'defaultsort' );

		// A false result is reported to callers as null; anything else is
		// passed through unchanged.
		return $sort === false ? null : $sort;
	}
}
|
2024-08-06 13:40:20 +00:00
|
|
|
|
|
|
|
|
|
|
/**
 * Keeps the un-namespaced name 'WikiTextStructure' usable as an alias of
 * this class.
 *
 * @deprecated class alias since 1.43
 */
class_alias( WikiTextStructure::class, 'WikiTextStructure' );
|