wiki.techinc.nl/includes/content/WikiTextStructure.php

<?php

namespace MediaWiki\Content;

use HtmlFormatter\HtmlFormatter;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Parser\Sanitizer;

/**
 * Class allowing to explore the structure of parsed wikitext.
 */
class WikiTextStructure {

	/** Text found before the first heading; null if there is none or nothing was extracted */
	private ?string $openingText = null;
	/** Main text with excluded and auxiliary elements removed; null until extraction has run */
	private ?string $allText = null;
	/** @var string[] Text of each element that was split off as auxiliary */
	private array $auxText = [];
	private ParserOutput $parserOutput;

	/**
	 * Selectors to elements that are excluded entirely from search
	 */
	private const EXCLUDED_ELEMENT_SELECTORS = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references from Cite
		'sup.reference',
		// The ↑ next to references in the references section from Cite
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default, so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable',
		// User-facing interface code prompting the user to act from WikibaseMediaInfo
		'.wbmi-entityview-emptyCaption',
	];

	/**
	 * Selectors to elements that are considered auxiliary to the article text for search
	 */
	private const AUXILIARY_ELEMENT_SELECTORS = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		'figcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput Parsed page whose structure is explored
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Gets headings from the page.
	 *
	 * First strip out things that look like references.  We can't use HTML filtering because
	 * the references come back as <sup> tags without a class.  To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.
	 *
	 * We also don't want to strip the <sup> tag and remove everything that looks like [2] because,
	 * I don't know, maybe there is a band named Word [2] Foo or something. Whatever.
	 *
	 * So we only strip things that look like <sup> tags wrapping a reference. And since the data
	 * looks like:
	 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings, minus those listed as ignored; empty when the
	 *  parser output carries no TOC data
	 */
	public function headings() {
		$headings = [];
		$tocData = $this->parserOutput->getTOCData();
		if ( $tocData === null ) {
			return $headings;
		}
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $tocData->getSections() as $section ) {
			$heading = $section->line;

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. The comparison is strict so that numeric-looking
			// headings (e.g. "1e3" vs "1000") are only ignored on an exact textual match.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}

		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Each line is one setting. Everything from a "#" to the end of the line is a
	 * comment, surrounding whitespace is trimmed, and lines that end up empty are dropped.
	 *
	 * @param string $message
	 *
	 * @return string[] Remaining lines, keyed by their original (0-based) line index
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );

		// Remove empty lines. Compare against '' explicitly: a callback-less array_filter()
		// would also throw away the perfectly valid setting "0".
		return array_filter( $lines, static fn ( $line ) => $line !== '' );
	}

	/**
	 * Gets a list of heading to ignore.
	 *
	 * The list is read from the 'search-ignored-headings' message in the content language
	 * and kept in a function-static variable, so the message is parsed at most once.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings(): array {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( !$source->isDisabled() ) {
				// After parsing, every remaining line is a heading to ignore.
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}

		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Results are stored in $openingText, $allText and $auxText. A non-null $allText marks
	 * the extraction as done, so repeated calls are no-ops.
	 */
	private function extractWikitextParts(): void {
		if ( $this->allText !== null ) {
			return;
		}
		$text = $this->parserOutput->getRawText();
		if ( $text === '' ) {
			$this->allText = "";

			// empty text - nothing to seek here
			return;
		}

		$this->openingText = $this->extractTextBeforeFirstHeading( $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched, but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text HTML to search for the first <h1>..<h6> tag
	 *
	 * @return string|null Tag-free text preceding the first heading, with excluded and
	 *  auxiliary elements removed; null if there is no heading or no text remains
	 */
	private function extractTextBeforeFirstHeading( $text ): ?string {
		$matches = [];
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading, so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		// $matches[0][1] is the byte offset at which the heading tag starts.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		// Explicit comparison: a plain truthiness check would also reject the text "0".
		if ( $text === '' ) {
			// There isn't any text before the first heading, so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading, so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get the text before the first heading, extracting the text parts on first use.
	 *
	 * @return string|null
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();

		return $this->openingText;
	}

	/**
	 * Get the main text (excluded and auxiliary elements removed), extracting the text
	 * parts on first use.
	 *
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();

		return $this->allText;
	}

	/**
	 * Get the text of the auxiliary elements, extracting the text parts on first use.
	 *
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();

		return $this->auxText;
	}

	/**
	 * Get the "defaultsort" property
	 *
	 * @return string|null
	 */
	public function getDefaultSort() {
		$sort = $this->parserOutput->getPageProperty( 'defaultsort' );

		// A literal false from the parser output is reported to callers as null.
		return $sort === false ? null : $sort;
	}
}

/**
 * Keep the global name 'WikiTextStructure' usable as an alias of the class above.
 *
 * @deprecated class alias since 1.43
 */
class_alias( WikiTextStructure::class, 'WikiTextStructure' );
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+								<?php
-												Add namespace to WikitextContent

It adds MediaWiki\Content namespace to WikitextContent
and two classes related.

Change-Id: Ib74e4c5b3edac6aa0e35d3b2093ce1d0b794cb6d

											
										
										
											2024-08-06 13:40:20 +00:00
+								namespace MediaWiki\Content;
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+								use HtmlFormatter\HtmlFormatter;
-												Namespace ParserOutput

Most used non-namespaced class!

Bug: T353458
Change-Id: I4c2cbb0a808b3881a4d6ca489eee5d8c8ebf26cf

											
										
										
											2023-12-14 19:20:33 +00:00
+								use MediaWiki\Parser\ParserOutput;
-												Namespace Sanitizer under \MediaWiki\Parser

Bug: T166010
Change-Id: Id13dcbf7a0372017495958dbc4f601f40c122508

											
										
										
											2023-09-19 16:59:47 +00:00
+								use MediaWiki\Parser\Sanitizer;
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
 								/**
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+								 * Class allowing to explore the structure of parsed wikitext.
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+								 */
 								class WikiTextStructure {
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
 									private ?string $openingText = null;
 									private ?string $allText = null;
 									/** @var string[] */
 									private array $auxText = [];
 									private ParserOutput $parserOutput;
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
 									/**
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+									 * Selectors to elements that are excluded entirely from search
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									 */
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+									private const EXCLUDED_ELEMENT_SELECTORS = [
-												Miscellaneous indentation tweaks

I was bored. What? Don't look at me that way.

I mostly targetted mixed tabs and spaces, but others were not spared.
Note that some of the whitespace changes are inside HTML output,
extended regexps or SQL snippets.

Change-Id: Ie206cc946459f6befcfc2d520e35ad3ea3c0f1e0

											
										
										
											2017-02-25 21:53:36 +00:00
+										// "it looks like you don't have javascript enabled..." – do not need to index
 										'audio', 'video',
-												WikiTextStructure: Exclude <style> tags

They aren't part of the article content, so they shouldn't be indexed
for search.

Bug: T189528
Change-Id: I3203f1f415eb821975098057d75c0e535b1fc76c

											
										
										
											2018-03-13 18:57:45 +00:00
+										// CSS stylesheets aren't content
 										'style',
-												WikiTextStructure: Explain the source of two non-Core exclusions

Change-Id: I2673afb25c6f21789a4c89f390ca13dae2cc2fa9

											
										
										
											2019-01-17 18:31:28 +00:00
+										// The [1] for references from Cite
-												Miscellaneous indentation tweaks

I was bored. What? Don't look at me that way.

I mostly targetted mixed tabs and spaces, but others were not spared.
Note that some of the whitespace changes are inside HTML output,
extended regexps or SQL snippets.

Change-Id: Ie206cc946459f6befcfc2d520e35ad3ea3c0f1e0

											
										
										
											2017-02-25 21:53:36 +00:00
+										'sup.reference',
-												WikiTextStructure: Explain the source of two non-Core exclusions

Change-Id: I2673afb25c6f21789a4c89f390ca13dae2cc2fa9

											
										
										
											2019-01-17 18:31:28 +00:00
+										// The ↑ next to references in the references section from Cite
-												Miscellaneous indentation tweaks

I was bored. What? Don't look at me that way.

I mostly targetted mixed tabs and spaces, but others were not spared.
Note that some of the whitespace changes are inside HTML output,
extended regexps or SQL snippets.

Change-Id: Ie206cc946459f6befcfc2d520e35ad3ea3c0f1e0

											
										
										
											2017-02-25 21:53:36 +00:00
+										'.mw-cite-backlink',
 										// Headings are already indexed in their own field.
 										'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+										// Collapsed fields are hidden by default, so we don't want them showing up.
-												Miscellaneous indentation tweaks

I was bored. What? Don't look at me that way.

I mostly targetted mixed tabs and spaces, but others were not spared.
Note that some of the whitespace changes are inside HTML output,
extended regexps or SQL snippets.

Change-Id: Ie206cc946459f6befcfc2d520e35ad3ea3c0f1e0

											
										
										
											2017-02-25 21:53:36 +00:00
+										'.autocollapse',
-												Allow editors to exclude navigation items from search indices

When you perform a particular search, the results can be polluted by
navigation elements that are not supposed to be displayed. This gives
editors the ability to mark sections of the document that should not
be indexed.

Bug: T162905
Change-Id: Iab2b83c3778cd5f7f44736c0da569fd938ae2968

											
										
										
											2017-04-18 21:03:23 +00:00
+										// Content explicitly decided to be not searchable by editors such
 										// as custom navigation templates.
-												WikiTextStructure: Add an exclusion from WikibaseMediaInfo

This is not lovely, and probably should be a hook or similar.

Bug: T213638
Change-Id: I042ac81b630dede55887e644692ea1a3b1fd6fe1

											
										
										
											2019-01-17 18:31:52 +00:00
+										'.navigation-not-searchable',
 										// User-facing interface code prompting the user to act from WikibaseMediaInfo
 										'.wbmi-entityview-emptyCaption',
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									];
 									/**
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+									 * Selectors to elements that are considered auxiliary to the article text for search
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									 */
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+									private const AUXILIARY_ELEMENT_SELECTORS = [
-												Miscellaneous indentation tweaks

I was bored. What? Don't look at me that way.

I mostly targetted mixed tabs and spaces, but others were not spared.
Note that some of the whitespace changes are inside HTML output,
extended regexps or SQL snippets.

Change-Id: Ie206cc946459f6befcfc2d520e35ad3ea3c0f1e0

											
										
										
											2017-02-25 21:53:36 +00:00
+										// Thumbnail captions aren't really part of the text proper
 										'.thumbcaption',
-												WikiTextStructure: Also extract figcaption elements as captions

The figcaption element looks to be many years old, but was not
previously considered a caption by the search engine. Some integration
tests have recently started failing as an element that previously
contained the thumbcaption css element is now inside a figcaption
class.

Update the code that extracts captions from the text into a separate
field to also extract the figcaption, as it has the same purpose as
thumbcaption.

Change-Id: I2a4a309e58602281d6cca65744036efb4a5ce5b5

											
										
										
											2023-03-23 15:54:29 +00:00
+										'figcaption',
-												Miscellaneous indentation tweaks

I was bored. What? Don't look at me that way.

I mostly targetted mixed tabs and spaces, but others were not spared.
Note that some of the whitespace changes are inside HTML output,
extended regexps or SQL snippets.

Change-Id: Ie206cc946459f6befcfc2d520e35ad3ea3c0f1e0

											
										
										
											2017-02-25 21:53:36 +00:00
+										// Neither are tables
 										'table',
 										// Common style for "See also:".
 										'.rellink',
 										// Common style for calling out helpful links at the top of the article.
 										'.dablink',
 										// New class users can use to mark stuff as auxiliary to searches.
 										'.searchaux',
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									];
 									/**
 									 * @param ParserOutput $parserOutput
 									 */
 									public function __construct( ParserOutput $parserOutput ) {
 										$this->parserOutput = $parserOutput;
 									}
 									/**
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+									 * Gets headings from the page.
-												Add namespace to WikitextContent

It adds MediaWiki\Content namespace to WikitextContent
and two classes related.

Change-Id: Ib74e4c5b3edac6aa0e35d3b2093ce1d0b794cb6d

											
										
										
											2024-08-06 13:40:20 +00:00
+									 *
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									 * @return string[]
 									 * First strip out things that look like references.  We can't use HTML filtering because
 									 * the references come back as <sup> tags without a class.  To keep from breaking stuff like
 									 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+									 * we don't remove the whole <sup> tag.
 									 *
 									 * We also don't want to strip the <sup> tag and remove everything that looks like [2] because,
 									 * I don't know, maybe there is a band named Word [2] Foo r something. Whatever.
 									 *
 									 * So we only strip things that look like <sup> tags wrapping a reference. And since the data
 									 * looks like:
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
 									 * we can not really use HtmlFormatter as we have no suitable selector.
 									 */
 									public function headings() {
 										$headings = [];
-												Generate/set/get TOCData/SectionMetadata objects instead of arrays

* ParserOutput::setSections()/::getSections() are expected
  to be deprecated. Uses in extensions and skins will need to be
  migrated in follow up patches once the new interface has stabilized.

* In the skins code, the metadata is converted back to an array.
  Downstream skin TOC consumers will need to be migrated as well
  before we can remove the toLegacy() conversion.

* Fixed SerializationTestTrait's validation method
  - Not sure if this is overkill but should handle all future
    complex objects we might stuff into the ParserCache.

* This patch emits a backward-compatible Sections property in order to
  avoid changing the parser cache serialization format. T327439 has
  been filed to eventually use the JsonCodec support for object
  serialization, but for this initial patch it makes sense to avoid
  the need for a concurrent ParserCache format migration by using a
  backward-compatible serialization.

* TOCData is nullable because the intent is that
  ParserOutput::setTOCData() is MW_MERGE_STRATEGY_WRITE_ONCE; that is,
  only the top-level fragment composing a page will set the TOCData.
  This will be enforced in the future via wfDeprecated() (T327429),
  but again our first patch is as backward-compatible as possible.

Bug: T296025
Depends-On: I1b267d23cf49d147c5379b914531303744481b68
Co-Authored-By: C. Scott Ananian <cananian@wikimedia.org>
Co-Authored-By: Subramanya Sastry <ssastry@wikimedia.org>
Change-Id: I8329864535f0b1dd5f9163868a08d6cb1ffcb78f

											
										
										
											2022-09-01 23:07:29 +00:00
+										$tocData = $this->parserOutput->getTOCData();
 										if ( $tocData === null ) {
 											return $headings;
 										}
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+										$ignoredHeadings = $this->getIgnoredHeadings();
-												Generate/set/get TOCData/SectionMetadata objects instead of arrays

* ParserOutput::setSections()/::getSections() are expected
  to be deprecated. Uses in extensions and skins will need to be
  migrated in follow up patches once the new interface has stabilized.

* In the skins code, the metadata is converted back to an array.
  Downstream skin TOC consumers will need to be migrated as well
  before we can remove the toLegacy() conversion.

* Fixed SerializationTestTrait's validation method
  - Not sure if this is overkill but should handle all future
    complex objects we might stuff into the ParserCache.

* This patch emits a backward-compatible Sections property in order to
  avoid changing the parser cache serialization format. T327439 has
  been filed to eventually use the JsonCodec support for object
  serialization, but for this initial patch it makes sense to avoid
  the need for a concurrent ParserCache format migration by using a
  backward-compatible serialization.

* TOCData is nullable because the intent is that
  ParserOutput::setTOCData() is MW_MERGE_STRATEGY_WRITE_ONCE; that is,
  only the top-level fragment composing a page will set the TOCData.
  This will be enforced in the future via wfDeprecated() (T327429),
  but again our first patch is as backward-compatible as possible.

Bug: T296025
Depends-On: I1b267d23cf49d147c5379b914531303744481b68
Co-Authored-By: C. Scott Ananian <cananian@wikimedia.org>
Co-Authored-By: Subramanya Sastry <ssastry@wikimedia.org>
Change-Id: I8329864535f0b1dd5f9163868a08d6cb1ffcb78f

											
										
										
											2022-09-01 23:07:29 +00:00
+										foreach ( $tocData->getSections() as $heading ) {
 											$heading = $heading->line;
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
 											// Some wikis wrap the brackets in a span:
-												Update weblinks in comments from HTTP to HTTPS

Use HTTPS instead of HTTP where the HTTP link is a redirect to the HTTPS link.

Change-Id: I06d9e043730accc4ae71b927e0f8229f0fc3b340

											
										
										
											2016-10-09 17:48:14 +00:00
+											// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+											$heading = preg_replace( '/<\/?span>/', '', $heading );
 											// Normalize [] so the following regexp would work.
 											$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
-												Just another 80 or so PHPStorm inspection fixes (#4)

* Unnecessary regex modifier. I agree with this inspection which flags
  /s modifiers on regexes that don't use a dot.
* Property declared dynamically.
* Unused local variable. But it's acceptable for an unused local
  variable to take the return value of a method under test, when it is
  being tested for its side-effects. And it's acceptable for an unused
  local variable to document unused list expansion elements, or the
  nature of array keys in a foreach.

Change-Id: I067b5b45dd1138c00e7269b66d3d1385f202fe7f

											
										
										
											2023-03-24 03:21:20 +00:00
+											$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
 											// Strip tags from the heading or else we'll display them (escaped) in search results
 											$heading = trim( Sanitizer::stripAllTags( $heading ) );
 											// Note that we don't take the level of the heading into account - all headings are equal.
 											// Except the ones we ignore.
 											if ( !in_array( $heading, $ignoredHeadings ) ) {
 												$headings[] = $heading;
 											}
 										}
-												Add namespace to WikitextContent

It adds MediaWiki\Content namespace to WikitextContent
and two classes related.

Change-Id: Ib74e4c5b3edac6aa0e35d3b2093ce1d0b794cb6d

											
										
										
											2024-08-06 13:40:20 +00:00
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+										return $headings;
 									}
 									/**
 									 * Parse a message content into an array. This function is generally used to
 									 * parse settings stored as i18n messages (see search-ignored-headings).
 									 *
 									 * @param string $message
-												Add namespace to WikitextContent

It adds MediaWiki\Content namespace to WikitextContent
and two classes related.

Change-Id: Ib74e4c5b3edac6aa0e35d3b2093ce1d0b794cb6d

											
										
										
											2024-08-06 13:40:20 +00:00
+									 *
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									 * @return string[]
 									 */
 									public static function parseSettingsInMessage( $message ) {
 										$lines = explode( "\n", $message );
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+										// Remove comments
 										$lines = preg_replace( '/#.*$/', '', $lines );
 										// Remove extra spaces
 										$lines = array_map( 'trim', $lines );
-												Add namespace to WikitextContent

It adds MediaWiki\Content namespace to WikitextContent
and two classes related.

Change-Id: Ib74e4c5b3edac6aa0e35d3b2093ce1d0b794cb6d

											
										
										
											2024-08-06 13:40:20 +00:00
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+										// Remove empty lines
 										return array_filter( $lines );
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									}
 									/**
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+									 * Gets a list of heading to ignore.
-												Add namespace to WikitextContent

It adds MediaWiki\Content namespace to WikitextContent
and two classes related.

Change-Id: Ib74e4c5b3edac6aa0e35d3b2093ce1d0b794cb6d

											
										
										
											2024-08-06 13:40:20 +00:00
+									 *
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+									 * @return string[]
 									 */
 									private function getIgnoredHeadings() {
 										static $ignoredHeadings = null;
 										if ( $ignoredHeadings === null ) {
 											$ignoredHeadings = [];
-												Add search-ignored-headings string, copied from cirrus-search-ignored-headings.

Change-Id: I4178f872996800379843301f7119840f4a4551df

											
										
										
											2016-06-14 21:01:13 +00:00
+											$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+											if ( !$source->isDisabled() ) {
 												$lines = self::parseSettingsInMessage( $source->plain() );
-												WikiTextStructure/WikitextContentHandler: Minor cleanup

Change-Id: If2f8243867994609d82618e61ddaaacca3516990

											
										
										
											2023-08-23 11:13:16 +00:00
+												// Now we just have headings!
 												$ignoredHeadings = $lines;
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+											}
 										}
-												Add namespace to WikitextContent

It adds MediaWiki\Content namespace to WikitextContent
and two classes related.

Change-Id: Ib74e4c5b3edac6aa0e35d3b2093ce1d0b794cb6d

											
										
										
											2024-08-06 13:40:20 +00:00
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+										return $ignoredHeadings;
 									}
 									/**
 									 * Split the parser output into opening, main and auxiliary text.
 									 *
 									 * The work is done at most once per instance: as soon as the main text has
 									 * been populated, later calls return immediately.
 									 */
 									private function extractWikitextParts() {
 										if ( $this->allText !== null ) {
 											// A previous call already populated the fields.
 											return;
 										}
 										$html = $this->parserOutput->getRawText();
 										if ( $html === '' ) {
 											// Empty page: record an empty main text so that later calls return early.
 											$this->allText = "";
 											return;
 										}
 										$this->openingText = $this->extractTextBeforeFirstHeading( $html );
 										$htmlFormatter = new HtmlFormatter( $html );
 										// First pass: drop the elements that must never end up in the search text.
 										$htmlFormatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
 										$htmlFormatter->filterContent();
 										// Second pass: take the auxiliary elements out of the main text. They are
 										// still searched, but matches will be ranked lower and non-auxiliary matches
 										// will be preferred in highlighting.
 										$htmlFormatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
 										$removedAuxiliary = $htmlFormatter->filterContent();
 										// Whatever is left in the formatter is the main text.
 										$this->allText = trim( Sanitizer::stripAllTags( $htmlFormatter->getText() ) );
 										// Each element removed by the second pass becomes one auxiliary text entry.
 										foreach ( $removedAuxiliary as $auxiliaryNode ) {
 											$auxiliaryHtml = $htmlFormatter->getText( $auxiliaryNode );
 											$this->auxText[] = trim( Sanitizer::stripAllTags( $auxiliaryHtml ) );
 										}
 									}
 									/**
 									 * Get text before first heading.
 									 *
 									 * The HTML is cut at the first opening h1-h6 tag; excluded and auxiliary
 									 * elements are then removed from that leading fragment and the remaining
 									 * markup is stripped.
 									 *
 									 * @param string $text HTML to look for the first heading in
 									 *
 									 * @return string|null Plain text preceding the first heading, or null when there
 									 *  is no heading or no text is left in front of it
 									 */
 									private function extractTextBeforeFirstHeading( $text ) {
 										$matches = [];
 										if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
 											// There isn't a first heading, so we interpret this as the article
 											// being entirely without heading.
 											return null;
 										}
 										// $matches[0][1] is the byte offset of the heading tag: keep what precedes it.
 										$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
 										// Compare against '' explicitly: a truthiness check would also reject
 										// the string "0".
 										if ( $text === '' ) {
 											// There isn't any text before the first heading, so there is no
 											// opening text.
 											return null;
 										}
 										$formatter = new HtmlFormatter( $text );
 										$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
 										$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
 										$formatter->filterContent();
 										$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
 										// Same strict comparison here, so an opening text of "0" is kept.
 										if ( $text === '' ) {
 											// There isn't any text left before the first heading after filtering,
 											// so there is no opening text.
 											return null;
 										}
 										return $text;
 									}
 									/**
 									 * Get the opening text of the page, extracting the text parts first if
 									 * that has not happened yet.
 									 *
 									 * @return string|null
 									 */
 									public function getOpeningText() {
 										// Populates the text fields on first use.
 										$this->extractWikitextParts();
 										return $this->openingText;
 									}
 									/**
 									 * Get the main text of the page, extracting the text parts first if that
 									 * has not happened yet.
 									 *
 									 * @return string
 									 */
 									public function getMainText() {
 										// Populates the text fields on first use.
 										$this->extractWikitextParts();
 										return $this->allText;
 									}
 									/**
 									 * Get the auxiliary text snippets of the page, extracting the text parts
 									 * first if that has not happened yet.
 									 *
 									 * @return string[]
 									 */
 									public function getAuxiliaryText() {
 										// Populates the text fields on first use.
 										$this->extractWikitextParts();
 										return $this->auxText;
 									}
-												Add DEFAULTSORT to search index field data

Added FLAG_SOURCE_DATA to support additional data that is not supposed to be
part of the default mapping.

Should be merged with I1484c2e62788bedb57a42869a5fb25cd8f64482f, otherwise rebuilding
an index may add an extra field to CirrusSearch mapping.

Bug: T134978
Change-Id: Ia41f8eeb9dd4f764543bdd4d71b7a50de8101101

											
										
										
											2016-08-29 14:30:43 +00:00
 									/**
 									 * Get the "defaultsort" page property from the parser output.
 									 *
 									 * @return string|null
 									 */
 									public function getDefaultSort() {
 										$defaultSort = $this->parserOutput->getPageProperty( 'defaultsort' );
 										// A false result is mapped to null to match the documented return type.
 										return $defaultSort === false ? null : $defaultSort;
 									}
-												Make content handlers assemble content for search

Bug: T89733
Change-Id: Ie45de496ecc826211d98eea3a410c7639b4be0a4

											
										
										
											2016-05-16 20:24:10 +00:00
+								}
-												Add namespace to WikitextContent

It adds MediaWiki\Content namespace to WikitextContent
and two classes related.

Change-Id: Ib74e4c5b3edac6aa0e35d3b2093ce1d0b794cb6d

											
										
										
											2024-08-06 13:40:20 +00:00
 								// Register the global (un-namespaced) name 'WikiTextStructure' as an alias of this class.
 								/** @deprecated class alias since 1.43 */
 								class_alias( WikiTextStructure::class, 'WikiTextStructure' );