wiki.techinc.nl/includes/parser/MagicWordArray.php

<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

namespace MediaWiki\Parser;

use LogicException;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
use UnexpectedValueException;

/**
 * Class for handling an array of magic words
 *
 * See docs/magicword.md.
 *
 * @since 1.11
 * @ingroup Parser
 */
class MagicWordArray {

	/**
	 * Internal names (IDs) of the magic words in this array.
	 *
	 * Writing to this property directly does not reset the cached hash and
	 * regexes; use add() to append a name.
	 *
	 * @var string[]
	 */
	public $names = [];

	/** Factory used to look up magic words by name and to get the content language */
	private MagicWordFactory $factory;

	/** @var array<int,array<string,string>>|null Cache for getHash() */
	private $hash;

	/** @var string[]|null Cache for getBaseRegex() when called with its default arguments */
	private $baseRegex;

	/** @var string[]|null Cache for getRegex() */
	private $regex;

	/**
	 * @param string[] $names Internal names of the magic words to start with
	 * @param MagicWordFactory|null $factory Factory to use; when omitted, the
	 *  one provided by MediaWikiServices is used.
	 */
	public function __construct( $names = [], ?MagicWordFactory $factory = null ) {
		$this->names = $names;
		$this->factory = $factory ?? MediaWikiServices::getInstance()->getMagicWordFactory();
	}

	/**
	 * Add a magic word by name
	 *
	 * Resets all cached lookup structures so that they are rebuilt on next use.
	 *
	 * @param string $name
	 */
	public function add( $name ): void {
		$this->names[] = $name;
		$this->hash = $this->baseRegex = $this->regex = null;
	}

	/**
	 * Get a 2-d hashtable for this array
	 *
	 * The first level is keyed by case sensitivity (0 = case-insensitive,
	 * 1 = case-sensitive), the second level maps a synonym to the magic word
	 * name. Case-insensitive synonyms are stored lower-cased using the content
	 * language.
	 *
	 * @return array<int,array<string,string>>
	 */
	public function getHash(): array {
		if ( $this->hash === null ) {
			// Build into a local variable and publish it only once complete, so
			// that an exception thrown half-way does not leave a partial hash cached.
			$hash = [ 0 => [], 1 => [] ];
			foreach ( $this->names as $name ) {
				$magic = $this->factory->get( $name );
				$case = $magic->isCaseSensitive() ? 1 : 0;
				foreach ( $magic->getSynonyms() as $syn ) {
					if ( !$case ) {
						$syn = $this->factory->getContentLanguage()->lc( $syn );
					}
					$hash[$case][$syn] = $name;
				}
			}
			$this->hash = $hash;
		}
		return $this->hash;
	}

	/**
	 * Get the base regex
	 *
	 * The result has one pattern fragment (without delimiters or modifiers)
	 * per case sensitivity: index 0 is the case-insensitive alternation,
	 * wrapped in `(?i:…)`, index 1 the case-sensitive one. A fragment with no
	 * synonyms is `(?!)`, which never matches.
	 *
	 * @internal For use in {@see Parser} only
	 * @param bool $capture Set to false to suppress the capture groups,
	 *  which can cause unexpected conflicts when this regexp is embedded in
	 *  other regexps with similar constructs.
	 * @param string $delimiter The delimiter which will be used for the
	 *  eventual regexp.
	 * @return array<int,string>
	 * @throws UnexpectedValueException If two synonyms would get the same group name
	 */
	public function getBaseRegex( bool $capture = true, string $delimiter = '/' ): array {
		// Only the result for the default arguments is cached
		$cacheable = $capture && $delimiter === '/';
		if ( $cacheable && $this->baseRegex !== null ) {
			return $this->baseRegex;
		}

		$alternatives = [ 0 => [], 1 => [] ];
		$allGroups = [];
		foreach ( $this->names as $name ) {
			$magic = $this->factory->get( $name );
			$case = $magic->isCaseSensitive() ? 1 : 0;
			foreach ( $magic->getSynonyms() as $i => $syn ) {
				$quoted = preg_quote( $syn, $delimiter );
				if ( !$capture ) {
					$alternatives[$case][] = $quoted;
					continue;
				}
				// Group name must start with a non-digit in PCRE 8.34+
				$it = strtr( (string)$i, '0123456789', 'abcdefghij' );
				$groupName = $it . '_' . $name;
				// look for same group names to avoid same named subpatterns in the regex
				if ( isset( $allGroups[$groupName] ) ) {
					throw new UnexpectedValueException(
						__METHOD__ . ': duplicate internal name in magic word array: ' . $name
					);
				}
				$allGroups[$groupName] = true;
				$alternatives[$case][] = '(?P<' . $groupName . '>' . $quoted . ')';
			}
		}

		// Collapse each list of alternatives into a single fragment. A new array
		// is built instead of iterating by reference, so no reference outlives
		// the loop.
		$regex = [];
		foreach ( $alternatives as $case => $parts ) {
			$re = $parts ? implode( '|', $parts ) : '(?!)';
			if ( !$case ) {
				$re = "(?i:{$re})";
			}
			$regex[$case] = $re;
		}

		if ( $cacheable ) {
			$this->baseRegex = $regex;
		}
		return $regex;
	}

	/**
	 * Turn the capturing base regexes into complete patterns
	 *
	 * Each base fragment is surrounded by the given prefix and suffix, put
	 * between '/' delimiters and given the S modifier.
	 *
	 * @param string $prefix Pattern text to put before each base fragment
	 * @param string $suffix Pattern text to put after each base fragment
	 * @return array<int,string>
	 */
	private function buildRegexes( string $prefix, string $suffix ): array {
		$regexes = [];
		foreach ( $this->getBaseRegex( true, '/' ) as $case => $re ) {
			$regexes[$case] = "/{$prefix}{$re}{$suffix}/S";
		}
		// As a performance optimization, turn on unicode mode only for
		// case-insensitive matching.
		$regexes[0] .= 'u';
		return $regexes;
	}

	/**
	 * Get an unanchored regex that does not match parameters
	 *
	 * @return array<int,string>
	 */
	private function getRegex(): array {
		if ( $this->regex === null ) {
			// The cache is written only after a successful build, so a failure
			// while building does not leave an empty pattern list behind.
			$this->regex = $this->buildRegexes( '', '' );
		}
		return $this->regex;
	}

	/**
	 * Get a regex anchored to the start of the string that does not match parameters
	 *
	 * @return array<int,string>
	 */
	private function getRegexStart(): array {
		return $this->buildRegexes( '^(?:', ')' );
	}

	/**
	 * Get an anchored regex for matching variables with parameters
	 *
	 * Every quoted `$1` placeholder inside a synonym is turned into a lazy
	 * capture group, so that the parameter value can be extracted.
	 *
	 * @return array<int,string>
	 */
	private function getVariableStartToEndRegex(): array {
		$newRegex = [];
		foreach ( $this->buildRegexes( '^(?:', ')$' ) as $case => $re ) {
			$newRegex[$case] = str_replace( "\\$1", "(.*?)", $re );
		}
		return $newRegex;
	}

	/**
	 * @since 1.20
	 * @return string[]
	 */
	public function getNames() {
		return $this->names;
	}

	/**
	 * Parse a match array from preg_match
	 *
	 * @param array<string|int,string> $matches
	 * @return array{0:string,1:string|false} Pair of (magic word ID, parameter value),
	 *  where the latter is instead false if there is no parameter value.
	 * @throws LogicException If no named group with a non-empty match is found
	 */
	private function parseMatch( array $matches ): array {
		$magicName = null;
		foreach ( $matches as $key => $match ) {
			if ( $magicName !== null ) {
				// The structure we found at this point is [ …,
				//     'a_magicWordName' => 'matchedSynonym',
				//     n                 => 'matchedSynonym (again)',
				//     n + 1             => 'parameterValue',
				// … ]
				return [ $magicName, $matches[$key + 1] ?? false ];
			}
			// Skip the initial full match and any non-matching group
			if ( $match !== '' && $key !== 0 ) {
				// Group names look like "<letters>_<magic word name>"
				$parts = explode( '_', (string)$key, 2 );
				if ( !isset( $parts[1] ) ) {
					throw new LogicException( 'Unexpected group name' );
				}
				$magicName = $parts[1];
			}
		}
		throw new LogicException( 'Unexpected $m array with no match' );
	}

	/**
	 * Log the error of the PCRE call that just failed
	 *
	 * @param string $function Name of the PCRE function that failed
	 * @param string $regex The pattern that was used
	 * @param string $text The subject that was used
	 * @return array{0:int,1:string} Pair of (error code, error message)
	 */
	private function logPregError( string $function, string $regex, $text ): array {
		$error = preg_last_error();
		$errorText = preg_last_error_msg();
		LoggerFactory::getInstance( 'parser' )->warning( "{$function} error: {code} {errorText}", [
			'code' => $error,
			'regex' => $regex,
			'text' => $text,
			'errorText' => $errorText
		] );
		return [ $error, $errorText ];
	}

	/**
	 * Match some text, with parameter capture
	 *
	 * @param string $text
	 * @return (string|false)[] Magic word name in the first element and the parameter in the second
	 *  element. Both elements are false if there was no match.
	 */
	public function matchVariableStartToEnd( $text ): array {
		foreach ( $this->getVariableStartToEndRegex() as $regex ) {
			$m = [];
			if ( preg_match( $regex, $text, $m ) ) {
				return $this->parseMatch( $m );
			}
		}
		return [ false, false ];
	}

	/**
	 * Match some text, without parameter capture
	 *
	 * The text is first looked up as-is among the case-sensitive synonyms,
	 * then lower-cased among the case-insensitive ones.
	 *
	 * @see MagicWord::matchStartToEnd
	 * @param string $text
	 * @return string|false The magic word name, or false if there was no capture
	 */
	public function matchStartToEnd( $text ) {
		$hash = $this->getHash();
		if ( isset( $hash[1][$text] ) ) {
			return $hash[1][$text];
		}
		$lc = $this->factory->getContentLanguage()->lc( $text );
		return $hash[0][$lc] ?? false;
	}

	/**
	 * Return an associative array for all items that match.
	 *
	 * Cannot be used for magic words with parameters.
	 * Removes the matched items from the input string (passed by reference)
	 *
	 * @see MagicWord::matchAndRemove
	 * @param string &$text
	 * @return array<string,false> Keyed by magic word ID
	 * @throws LogicException If a PCRE call fails for a reason other than
	 *  invalid UTF-8 in preg_match_all
	 */
	public function matchAndRemove( &$text ): array {
		$found = [];
		foreach ( $this->getRegex() as $regex ) {
			$matches = [];
			$res = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
			if ( $res === false ) {
				[ $error, $errorText ] = $this->logPregError( 'preg_match_all', $regex, $text );
				// T321234: Don't try to fix old revisions with broken UTF-8, just return as is
				if ( $error === PREG_BAD_UTF8_ERROR ) {
					continue;
				}
				throw new LogicException( "preg_match_all error $error: $errorText" );
			}
			// $matches is empty when nothing matched
			foreach ( $matches as $m ) {
				[ $name, $param ] = $this->parseMatch( $m );
				$found[$name] = $param;
			}
			$res = preg_replace( $regex, '', $text );
			if ( $res === null ) {
				[ $error, $errorText ] = $this->logPregError( 'preg_replace', $regex, $text );
				throw new LogicException( "preg_replace error $error: $errorText" );
			}
			$text = $res;
		}
		return $found;
	}

	/**
	 * Return the ID of the magic word at the start of $text, and remove
	 * the prefix from $text.
	 *
	 * Does not match parameters.
	 *
	 * @see MagicWord::matchStartAndRemove
	 * @param string &$text Unmodified if no match is found.
	 * @return string|false False if no match is found.
	 */
	public function matchStartAndRemove( &$text ) {
		foreach ( $this->getRegexStart() as $regex ) {
			$m = [];
			if ( preg_match( $regex, $text, $m ) ) {
				[ $id, ] = $this->parseMatch( $m );
				$matchLength = strlen( $m[0] );
				// Never call substr() with an offset at the end of the string:
				// before PHP 8.0 that returned false instead of ''.
				$text = $matchLength >= strlen( $text ) ? '' : substr( $text, $matchLength );
				return $id;
			}
		}
		return false;
	}
}

/**
 * Backward-compatibility alias that makes the namespaced class available
 * under the un-namespaced name `MagicWordArray`.
 *
 * @deprecated since 1.40
 */
class_alias( MagicWordArray::class, 'MagicWordArray' );