MagicWordArray: mark ::get*Regex*() as @internal, add some options
Refactor MagicWordArray::getBaseRegex() to allow code sharing between Parsoid and core by parameterizing the delimiter and the presence/absence of capture groups. Mark a number of methods @internal because they are purely internal helpers; no one outside core is using: ::getRegex() ::getBaseRegex() ::getVariableRegex() ::getVariableStartToEndRegex() ::getRegexStart() Code search: https://codesearch.wmcloud.org/search/?q=-%3E(get(Base%7CVariable%7CVariableStartToEnd%7C)Regex%5C(%7CgetRegexStart%5C()&i=nope&files=&repos= Moved the regexp 'i' option inside the regexp to allow better code sharing between the case-sensitive and case-insensitive regexps. (Foiled to a minor degree by a performance optimization which only applies the "unicode" flag to the case-insensitive regexp.) Both regexps are always defined and valid, which saves some additional tests at use sites. Change-Id: I17f1b7207db8d2203c904508f3ab8a64b68736a8
This commit is contained in:
parent
d9ec591d7a
commit
5fbfda654a
1 changed files with 63 additions and 48 deletions
|
|
@ -96,20 +96,29 @@ class MagicWordArray {
|
|||
|
||||
/**
|
||||
* Get the base regex
|
||||
* @param bool $capture Set to false to suppress the capture groups,
|
||||
* which can cause unexpected conflicts when this regexp is embedded in
|
||||
* other regexps with similar constructs.
|
||||
* @param string $delimiter The delimiter which will be used for the
|
||||
* eventual regexp.
|
||||
* @return string[]
|
||||
* @internal
|
||||
*/
|
||||
public function getBaseRegex() : array {
|
||||
if ( $this->baseRegex === null ) {
|
||||
$this->baseRegex = [ 0 => '', 1 => '' ];
|
||||
$allGroups = [];
|
||||
foreach ( $this->names as $name ) {
|
||||
$magic = $this->factory->get( $name );
|
||||
$case = intval( $magic->isCaseSensitive() );
|
||||
foreach ( $magic->getSynonyms() as $i => $syn ) {
|
||||
public function getBaseRegex( bool $capture = true, string $delimiter = '/' ) : array {
|
||||
if ( $capture && $delimiter === '/' && $this->baseRegex !== null ) {
|
||||
return $this->baseRegex;
|
||||
}
|
||||
$regex = [ 0 => [], 1 => [] ];
|
||||
$allGroups = [];
|
||||
foreach ( $this->names as $name ) {
|
||||
$magic = $this->factory->get( $name );
|
||||
$case = $magic->isCaseSensitive() ? 1 : 0;
|
||||
foreach ( $magic->getSynonyms() as $i => $syn ) {
|
||||
if ( $capture ) {
|
||||
// Group name must start with a non-digit in PCRE 8.34+
|
||||
$it = strtr( $i, '0123456789', 'abcdefghij' );
|
||||
$groupName = $it . '_' . $name;
|
||||
$group = '(?P<' . $groupName . '>' . preg_quote( $syn, '/' ) . ')';
|
||||
$group = '(?P<' . $groupName . '>' . preg_quote( $syn, $delimiter ) . ')';
|
||||
// look for same group names to avoid same named subpatterns in the regex
|
||||
if ( isset( $allGroups[$groupName] ) ) {
|
||||
throw new MWException(
|
||||
|
|
@ -117,32 +126,42 @@ class MagicWordArray {
|
|||
);
|
||||
}
|
||||
$allGroups[$groupName] = true;
|
||||
if ( $this->baseRegex[$case] === '' ) {
|
||||
$this->baseRegex[$case] = $group;
|
||||
} else {
|
||||
$this->baseRegex[$case] .= '|' . $group;
|
||||
}
|
||||
$regex[$case][] = $group;
|
||||
} else {
|
||||
$regex[$case][] = preg_quote( $syn, $delimiter );
|
||||
}
|
||||
}
|
||||
}
|
||||
return $this->baseRegex;
|
||||
'@phan-var array<int,string[]> $regex';
|
||||
foreach ( $regex as $case => &$re ) {
|
||||
$re = count( $re ) ? implode( '|', $re ) : '(?!)';
|
||||
if ( !$case ) {
|
||||
$re = "(?i:{$re})";
|
||||
}
|
||||
}
|
||||
'@phan-var array<int,string> $regex';
|
||||
|
||||
if ( $capture && $delimiter === '/' ) {
|
||||
$this->baseRegex = $regex;
|
||||
}
|
||||
return $regex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an unanchored regex that does not match parameters
|
||||
* @return string[]
|
||||
* @suppress PhanTypeArraySuspiciousNullable False positive
|
||||
* @internal
|
||||
*/
|
||||
public function getRegex() {
|
||||
if ( $this->regex === null ) {
|
||||
$base = $this->getBaseRegex();
|
||||
$this->regex = [ '', '' ];
|
||||
if ( $this->baseRegex[0] !== '' ) {
|
||||
$this->regex[0] = "/{$base[0]}/iuS";
|
||||
}
|
||||
if ( $this->baseRegex[1] !== '' ) {
|
||||
$this->regex[1] = "/{$base[1]}/S";
|
||||
$this->regex = [];
|
||||
$base = $this->getBaseRegex( true, '/' );
|
||||
foreach ( $base as $case => $re ) {
|
||||
$this->regex[$case] = "/{$re}/S";
|
||||
}
|
||||
// As a performance optimization, turn on unicode mode only for
|
||||
// case-insensitive matching.
|
||||
$this->regex[0] .= 'u';
|
||||
}
|
||||
return $this->regex;
|
||||
}
|
||||
|
|
@ -151,6 +170,8 @@ class MagicWordArray {
|
|||
* Get a regex for matching variables with parameters
|
||||
*
|
||||
* @return string[]
|
||||
* @internal
|
||||
* @deprecated since 1.36 Appears to have no uses.
|
||||
*/
|
||||
public function getVariableRegex() {
|
||||
return str_replace( "\\$1", "(.*?)", $this->getRegex() );
|
||||
|
|
@ -160,16 +181,17 @@ class MagicWordArray {
|
|||
* Get a regex anchored to the start of the string that does not match parameters
|
||||
*
|
||||
* @return string[]
|
||||
* @internal
|
||||
*/
|
||||
public function getRegexStart() {
|
||||
$base = $this->getBaseRegex();
|
||||
$newRegex = [ '', '' ];
|
||||
if ( $base[0] !== '' ) {
|
||||
$newRegex[0] = "/^(?:{$base[0]})/iuS";
|
||||
}
|
||||
if ( $base[1] !== '' ) {
|
||||
$newRegex[1] = "/^(?:{$base[1]})/S";
|
||||
$newRegex = [];
|
||||
$base = $this->getBaseRegex( true, '/' );
|
||||
foreach ( $base as $case => $re ) {
|
||||
$newRegex[$case] = "/^(?:{$re})/S";
|
||||
}
|
||||
// As a performance optimization, turn on unicode mode only for
|
||||
// case-insensitive matching.
|
||||
$newRegex[0] .= 'u';
|
||||
return $newRegex;
|
||||
}
|
||||
|
||||
|
|
@ -177,16 +199,17 @@ class MagicWordArray {
|
|||
* Get an anchored regex for matching variables with parameters
|
||||
*
|
||||
* @return string[]
|
||||
* @internal
|
||||
*/
|
||||
public function getVariableStartToEndRegex() {
|
||||
$base = $this->getBaseRegex();
|
||||
$newRegex = [ '', '' ];
|
||||
if ( $base[0] !== '' ) {
|
||||
$newRegex[0] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[0]})$/iuS" );
|
||||
}
|
||||
if ( $base[1] !== '' ) {
|
||||
$newRegex[1] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[1]})$/S" );
|
||||
$newRegex = [];
|
||||
$base = $this->getBaseRegex( true, '/' );
|
||||
foreach ( $base as $case => $re ) {
|
||||
$newRegex[$case] = str_replace( "\\$1", "(.*?)", "/^(?:{$re})$/S" );
|
||||
}
|
||||
// As a performance optimization, turn on unicode mode only for
|
||||
// case-insensitive matching.
|
||||
$newRegex[0] .= 'u';
|
||||
return $newRegex;
|
||||
}
|
||||
|
||||
|
|
@ -243,11 +266,9 @@ class MagicWordArray {
|
|||
public function matchVariableStartToEnd( $text ) {
|
||||
$regexes = $this->getVariableStartToEndRegex();
|
||||
foreach ( $regexes as $regex ) {
|
||||
if ( $regex !== '' ) {
|
||||
$m = [];
|
||||
if ( preg_match( $regex, $text, $m ) ) {
|
||||
return $this->parseMatch( $m );
|
||||
}
|
||||
$m = [];
|
||||
if ( preg_match( $regex, $text, $m ) ) {
|
||||
return $this->parseMatch( $m );
|
||||
}
|
||||
}
|
||||
return [ false, false ];
|
||||
|
|
@ -282,9 +303,6 @@ class MagicWordArray {
|
|||
$found = [];
|
||||
$regexes = $this->getRegex();
|
||||
foreach ( $regexes as $regex ) {
|
||||
if ( $regex === '' ) {
|
||||
continue;
|
||||
}
|
||||
$matches = [];
|
||||
$res = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
|
||||
if ( $res === false ) {
|
||||
|
|
@ -325,9 +343,6 @@ class MagicWordArray {
|
|||
public function matchStartAndRemove( &$text ) {
|
||||
$regexes = $this->getRegexStart();
|
||||
foreach ( $regexes as $regex ) {
|
||||
if ( $regex === '' ) {
|
||||
continue;
|
||||
}
|
||||
if ( preg_match( $regex, $text, $m ) ) {
|
||||
list( $id, ) = $this->parseMatch( $m );
|
||||
if ( strlen( $m[0] ) >= strlen( $text ) ) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue