2008-01-21 16:36:08 +00:00
|
|
|
<?php
|
2010-08-22 14:31:05 +00:00
|
|
|
/**
|
|
|
|
|
* Preprocessor using PHP's dom extension
|
|
|
|
|
*
|
2012-04-30 09:22:16 +00:00
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
|
|
|
*
|
2010-08-22 14:31:05 +00:00
|
|
|
* @file
|
|
|
|
|
* @ingroup Parser
|
2019-04-09 18:42:42 +00:00
|
|
|
* @deprecated since 1.34, use Preprocessor_Hash
|
2010-08-22 14:31:05 +00:00
|
|
|
*/
|
2011-02-12 04:06:22 +00:00
|
|
|
|
WARNING: HUGE COMMIT
Doxygen documentation update:
* Changed all @addtogroup to @ingroup. @addtogroup adds the comment to the group description, but doesn't add the file, class, function, ... to the group like @ingroup does. See for example http://svn.wikimedia.org/doc/group__SpecialPage.html where it's impossible to see related files, classes, ... that should belong to that group.
* Added @file to file descriptions; it seems that it must be explicitly declared for file descriptions, otherwise doxygen will think that the comment documents the first class, variable, function, ... that is in that file.
* Removed some empty comments
* Removed some ?>
Added following groups:
* ExternalStorage
* JobQueue
* MaintenanceLanguage
One more thing: there are still a lot of warnings when generating the doc.
2008-05-20 17:13:28 +00:00
|
|
|
/**
|
|
|
|
|
* @ingroup Parser
|
|
|
|
|
*/
|
2018-01-01 13:10:16 +00:00
|
|
|
// phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps
|
2015-10-08 20:54:15 +00:00
|
|
|
class Preprocessor_DOM extends Preprocessor {
|
2011-05-28 14:54:13 +00:00
|
|
|
|
2014-05-16 00:48:01 +00:00
|
|
|
/**
|
|
|
|
|
* @var Parser
|
|
|
|
|
*/
|
2014-08-11 20:24:54 +00:00
|
|
|
public $parser;
|
2014-05-16 00:48:01 +00:00
|
|
|
|
2014-08-11 20:24:54 +00:00
|
|
|
/**
 * @var int|string|bool Byte limit derived by the constructor from the
 *  memory_limit ini setting, or false when no limit was determined.
 */
public $memoryLimit;
|
2008-01-21 16:36:08 +00:00
|
|
|
|
2015-10-08 20:54:15 +00:00
|
|
|
const CACHE_PREFIX = 'preprocess-xml';
|
2009-02-09 23:18:37 +00:00
|
|
|
|
2019-06-03 16:08:04 +00:00
|
|
|
/**
|
|
|
|
|
* @param Parser $parser
|
|
|
|
|
*/
|
2014-08-11 20:24:54 +00:00
|
|
|
public function __construct( $parser ) {
	wfDeprecated( __METHOD__, '1.34' ); // T204945
	$this->parser = $parser;

	// Translate the memory_limit ini setting into a byte count for memCheck().
	// The limit stays false (no enforcement) when the setting is empty, is -1,
	// or is in a form that is not recognised below.
	$this->memoryLimit = false;
	$mem = ini_get( 'memory_limit' );
	if ( strval( $mem ) !== '' && $mem != -1 ) {
		if ( preg_match( '/^\d+$/', $mem ) ) {
			// A bare number is already a byte count; keep it exactly as configured.
			$this->memoryLimit = $mem;
		} elseif ( preg_match( '/^(\d+)([KMG])$/i', $mem, $m ) ) {
			// PHP's shorthand byte notation accepts K, M and G in either case.
			// Previously only M was understood, so a limit such as "1G" was
			// ignored and the memory guard was silently switched off.
			$multipliers = [ 'K' => 1024, 'M' => 1048576, 'G' => 1073741824 ];
			$this->memoryLimit = $m[1] * $multipliers[strtoupper( $m[2] )];
		}
	}
}
|
|
|
|
|
|
2011-05-26 20:26:51 +00:00
|
|
|
/**
|
|
|
|
|
* @return PPFrame_DOM
|
|
|
|
|
*/
|
2014-08-11 20:24:54 +00:00
|
|
|
public function newFrame() {
	// The new frame is handed this preprocessor instance.
	$frame = new PPFrame_DOM( $this );

	return $frame;
}
|
|
|
|
|
|
2011-05-26 20:26:51 +00:00
|
|
|
/**
|
2014-04-21 23:38:39 +00:00
|
|
|
* @param array $args
|
2011-05-26 20:26:51 +00:00
|
|
|
* @return PPCustomFrame_DOM
|
|
|
|
|
*/
|
2014-08-11 20:24:54 +00:00
|
|
|
public function newCustomFrame( $args ) {
	// The custom frame receives this preprocessor together with the caller's arguments.
	$frame = new PPCustomFrame_DOM( $this, $args );

	return $frame;
}
|
|
|
|
|
|
2011-05-28 14:54:13 +00:00
|
|
|
/**
|
2014-04-21 23:38:39 +00:00
|
|
|
* @param array $values
|
2011-05-28 14:54:13 +00:00
|
|
|
* @return PPNode_DOM
|
2014-12-24 13:49:20 +00:00
|
|
|
* @throws MWException
|
2011-05-28 14:54:13 +00:00
|
|
|
*/
|
2014-08-11 20:24:54 +00:00
|
|
|
public function newPartNodeArray( $values ) {
	// The list is serialised to XML and parsed back because, per the original
	// author's note, that is faster than manipulating the DOM directly.
	$xml = '<list>';
	foreach ( $values as $key => $value ) {
		if ( is_int( $key ) ) {
			// Integer keys become an index attribute on an empty name element.
			$name = "<name index=\"$key\"/>";
		} else {
			// Any other key is emitted as escaped name text followed by an equals sign.
			$name = '<name>' . htmlspecialchars( $key ) . '</name>=';
		}
		$xml .= '<part>' . $name . '<value>' . htmlspecialchars( $value ) . '</value></part>';
	}
	$xml .= '</list>';

	$document = new DOMDocument();
	Wikimedia\suppressWarnings();
	$loaded = $document->loadXML( $xml );
	Wikimedia\restoreWarnings();

	if ( !$loaded ) {
		// Try running the XML through UtfNormal to get rid of invalid characters
		$xml = UtfNormal\Validator::cleanUp( $xml );
		// 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2
		// don't barf when the XML is >256 levels deep
		$loaded = $document->loadXML( $xml, 1 << 19 );

		if ( !$loaded ) {
			// Both the plain and the cleaned-up parse failed.
			throw new MWException( 'Parameters passed to ' . __METHOD__ . ' result in invalid XML' );
		}
	}

	// Wrap the children of the <list> root element.
	return new PPNode_DOM( $document->documentElement->childNodes );
}
|
|
|
|
|
|
2011-05-28 14:54:13 +00:00
|
|
|
/**
|
|
|
|
|
* @throws MWException
|
|
|
|
|
* @return bool
|
|
|
|
|
*/
|
2014-08-11 20:24:54 +00:00
|
|
|
public function memCheck() {
	$byteLimit = $this->memoryLimit;
	// With no limit recorded there is nothing to enforce.
	if ( $byteLimit === false ) {
		return true;
	}

	$used = memory_get_usage();
	$ceiling = $byteLimit * 0.9;
	if ( $used > $ceiling ) {
		// Report the 90% threshold in megabytes, rounded to the nearest whole number.
		$limit = intval( $ceiling / 1048576 + 0.5 );
		throw new MWException( "Preprocessor hit 90% memory limit ($limit MB)" );
	}

	// True while usage is at or below 80% of the limit, false between 80% and 90%.
	return !( $used > $byteLimit * 0.8 );
}
|
|
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
/**
|
|
|
|
|
* Preprocess some wikitext and return the document tree.
|
2008-04-14 07:45:50 +00:00
|
|
|
* This is the ghost of Parser::replace_variables().
|
2008-01-21 16:36:08 +00:00
|
|
|
*
|
2014-04-21 23:38:39 +00:00
|
|
|
* @param string $text The text to parse
|
|
|
|
|
* @param int $flags Bitwise combination of:
|
2014-05-10 23:03:45 +00:00
|
|
|
* Parser::PTD_FOR_INCLUSION Handle "<noinclude>" and "<includeonly>"
|
|
|
|
|
* as if the text is being included. Default
|
|
|
|
|
* is to assume a direct page view.
|
2008-01-21 16:36:08 +00:00
|
|
|
*
|
|
|
|
|
* The generated DOM tree must depend only on the input text and the flags.
|
2017-02-20 22:44:19 +00:00
|
|
|
* The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a regression of T6899.
|
2008-01-21 16:36:08 +00:00
|
|
|
*
|
2008-04-14 07:45:50 +00:00
|
|
|
* Any flag added to the $flags parameter here, or any other parameter liable to cause a
|
|
|
|
|
* change in the DOM tree for a given text, must be passed through the section identifier
|
|
|
|
|
* in the section edit link and thus back to extractSections().
|
2008-01-21 16:36:08 +00:00
|
|
|
*
|
2008-04-14 07:45:50 +00:00
|
|
|
* The output of this function is currently only cached in process memory, but a persistent
|
|
|
|
|
* cache may be implemented at a later date which takes further advantage of these strict
|
2008-01-21 16:36:08 +00:00
|
|
|
* dependency requirements.
|
|
|
|
|
*
|
2012-10-07 23:35:26 +00:00
|
|
|
* @throws MWException
|
2011-05-28 14:54:13 +00:00
|
|
|
* @return PPNode_DOM
|
2008-01-21 16:36:08 +00:00
|
|
|
*/
|
2014-08-11 20:24:54 +00:00
|
|
|
public function preprocessToObj( $text, $flags = 0 ) {
	// Use the cached XML for this text/flags pair if there is one; otherwise
	// generate it and store it for next time.
	$xml = $this->cacheGetTree( $text, $flags );
	if ( $xml === false ) {
		$xml = $this->preprocessToXml( $text, $flags );
		$this->cacheSetTree( $text, $flags, $xml );
	}

	// Enforce the generated-node limit before building the DOM. Every '<' in
	// the XML is added to the parser's running node count.
	$parser = $this->parser;
	$parser->mGeneratedPPNodeCount += substr_count( $xml, '<' );
	$nodeLimit = $parser->mOptions->getMaxGeneratedPPNodeCount();
	if ( $parser->mGeneratedPPNodeCount > $nodeLimit ) {
		throw new MWException( __METHOD__ . ': generated node count limit exceeded' );
	}

	$document = new DOMDocument;
	Wikimedia\suppressWarnings();
	$loaded = $document->loadXML( $xml );
	Wikimedia\restoreWarnings();

	if ( !$loaded ) {
		// Try running the XML through UtfNormal to get rid of invalid characters
		$xml = UtfNormal\Validator::cleanUp( $xml );
		// 1 << 19 == XML_PARSE_HUGE, needed so newer versions of libxml2
		// don't barf when the XML is >256 levels deep.
		$loaded = $document->loadXML( $xml, 1 << 19 );
	}

	// Give up if the retry failed as well.
	if ( !$loaded ) {
		throw new MWException( __METHOD__ . ' generated invalid XML' );
	}

	return new PPNode_DOM( $document->documentElement );
}
|
2011-02-12 04:06:22 +00:00
|
|
|
|
2011-05-28 14:54:13 +00:00
|
|
|
/**
|
2014-04-21 23:38:39 +00:00
|
|
|
* @param string $text
|
|
|
|
|
* @param int $flags
|
2011-05-28 14:54:13 +00:00
|
|
|
* @return string
|
|
|
|
|
*/
|
2014-08-11 20:24:54 +00:00
|
|
|
public function preprocessToXml( $text, $flags = 0 ) {
|
2016-09-20 22:26:32 +00:00
|
|
|
global $wgDisableLangConversion;
|
|
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
$forInclusion = $flags & Parser::PTD_FOR_INCLUSION;
|
|
|
|
|
|
|
|
|
|
$xmlishElements = $this->parser->getStripList();
|
Preprocessor: Don't allow unclosed extension tags (matching until end of input)
(Previously done in f51d0d9a819f8f1c181350ced2f015ce97985fcc and
reverted in 543f46e9c08e0ff8c5e8b4e917fcc045730ef1bc.)
I think it's saner to treat this as invalid syntax, and output the
mismatched tag code verbatim. The current behavior is particularly
annoying for <ref> tags, which often swallow everything afterwards.
This does not affect HTML tags, though. Assuming Tidy is enabled, they
are still auto-closed at the end of the page content. (For tags that
"shadow" a HTML tag name, this results in the tag being treated as a
HTML tag. This currently only affects <pre> tags: if unclosed, they
are still displayed as preformatted text, but without suppressing
wikitext formatting.)
It also does not affect <includeonly>, <noinclude> and <onlyinclude>
tags. Changing this behavior now would be too disruptive to existing
content, and is the reason why previous attempt was reverted. (They
are already special-cased enough that this isn't too weird, for example
mismatched closing tags are hidden.)
Related to T17712 and T58306. I think this brings the PHP parser closer
to Parsoid's interpretation.
It reduces performance somewhat in the worst case, though. Testing with
https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with
3000 opening tags of 15 different types), parsing time rises from
~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN),
where N is bytes of input and k is the number of types of tags present
on the page. Maximum k shouldn't exceed 30 or so in reasonable setups
(depends on installed extensions, it's 20 on English Wikipedia).
Change-Id: Ide8b034e464eefb1b7c9e2a48ed06e21a7f8d434
2016-02-04 01:13:24 +00:00
|
|
|
$xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
|
2008-01-21 16:36:08 +00:00
|
|
|
$enableOnlyinclude = false;
|
|
|
|
|
if ( $forInclusion ) {
|
2016-02-17 09:09:32 +00:00
|
|
|
$ignoredTags = [ 'includeonly', '/includeonly' ];
|
|
|
|
|
$ignoredElements = [ 'noinclude' ];
|
2008-01-21 16:36:08 +00:00
|
|
|
$xmlishElements[] = 'noinclude';
|
2014-05-10 23:03:45 +00:00
|
|
|
if ( strpos( $text, '<onlyinclude>' ) !== false
|
|
|
|
|
&& strpos( $text, '</onlyinclude>' ) !== false
|
|
|
|
|
) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$enableOnlyinclude = true;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
2016-02-17 09:09:32 +00:00
|
|
|
$ignoredTags = [ 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ];
|
|
|
|
|
$ignoredElements = [ 'includeonly' ];
|
2008-01-21 16:36:08 +00:00
|
|
|
$xmlishElements[] = 'includeonly';
|
|
|
|
|
}
|
|
|
|
|
$xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );
|
|
|
|
|
|
|
|
|
|
// Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
|
|
|
|
|
$elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA";
|
2008-04-14 07:45:50 +00:00
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
$stack = new PPDStack;
|
|
|
|
|
|
2015-09-11 13:44:59 +00:00
|
|
|
$searchBase = "[{<\n"; # }
|
2016-09-20 22:26:32 +00:00
|
|
|
if ( !$wgDisableLangConversion ) {
|
2017-01-19 19:58:05 +00:00
|
|
|
$searchBase .= '-';
|
2016-09-20 22:26:32 +00:00
|
|
|
}
|
|
|
|
|
|
2014-05-10 23:03:45 +00:00
|
|
|
// For fast reverse searches
|
|
|
|
|
$revText = strrev( $text );
|
2012-08-06 11:41:28 +00:00
|
|
|
$lengthText = strlen( $text );
|
2008-01-21 16:36:08 +00:00
|
|
|
|
2014-05-10 23:03:45 +00:00
|
|
|
// Input pointer, starts out pointing to a pseudo-newline before the start
|
|
|
|
|
$i = 0;
|
|
|
|
|
// Current accumulator
|
|
|
|
|
$accum =& $stack->getAccum();
|
2008-01-21 16:36:08 +00:00
|
|
|
$accum = '<root>';
|
2014-05-10 23:03:45 +00:00
|
|
|
// True to find equals signs in arguments
|
|
|
|
|
$findEquals = false;
|
|
|
|
|
// True to take notice of pipe characters
|
|
|
|
|
$findPipe = false;
|
2008-01-21 16:36:08 +00:00
|
|
|
$headingIndex = 1;
|
2014-05-10 23:03:45 +00:00
|
|
|
// True if $i is inside a possible heading
|
|
|
|
|
$inHeading = false;
|
|
|
|
|
// True if there are no more greater-than (>) signs right of $i
|
|
|
|
|
$noMoreGT = false;
|
Preprocessor: Don't allow unclosed extension tags (matching until end of input)
(Previously done in f51d0d9a819f8f1c181350ced2f015ce97985fcc and
reverted in 543f46e9c08e0ff8c5e8b4e917fcc045730ef1bc.)
I think it's saner to treat this as invalid syntax, and output the
mismatched tag code verbatim. The current behavior is particularly
annoying for <ref> tags, which often swallow everything afterwards.
This does not affect HTML tags, though. Assuming Tidy is enabled, they
are still auto-closed at the end of the page content. (For tags that
"shadow" a HTML tag name, this results in the tag being treated as a
HTML tag. This currently only affects <pre> tags: if unclosed, they
are still displayed as preformatted text, but without suppressing
wikitext formatting.)
It also does not affect <includeonly>, <noinclude> and <onlyinclude>
tags. Changing this behavior now would be too disruptive to existing
content, and is the reason why previous attempt was reverted. (They
are already special-cased enough that this isn't too weird, for example
mismatched closing tags are hidden.)
Related to T17712 and T58306. I think this brings the PHP parser closer
to Parsoid's interpretation.
It reduces performance somewhat in the worst case, though. Testing with
https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with
3000 opening tags of 15 different types), parsing time rises from
~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN),
where N is bytes of input and k is the number of types of tags present
on the page. Maximum k shouldn't exceed 30 or so in reasonable setups
(depends on installed extensions, it's 20 on English Wikipedia).
Change-Id: Ide8b034e464eefb1b7c9e2a48ed06e21a7f8d434
2016-02-04 01:13:24 +00:00
|
|
|
// Map of tag name => true if there are no more closing tags of given type right of $i
|
|
|
|
|
$noMoreClosingTag = [];
|
2014-05-10 23:03:45 +00:00
|
|
|
// True to ignore all input up to the next <onlyinclude>
|
|
|
|
|
$findOnlyinclude = $enableOnlyinclude;
|
|
|
|
|
// Do a line-start run without outputting an LF character
|
|
|
|
|
$fakeLineStart = true;
|
2008-01-21 16:36:08 +00:00
|
|
|
|
|
|
|
|
while ( true ) {
|
2015-09-11 13:44:59 +00:00
|
|
|
// $this->memCheck();
|
2008-01-24 04:29:56 +00:00
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
if ( $findOnlyinclude ) {
|
|
|
|
|
// Ignore all input up to the next <onlyinclude>
|
|
|
|
|
$startPos = strpos( $text, '<onlyinclude>', $i );
|
|
|
|
|
if ( $startPos === false ) {
|
|
|
|
|
// Ignored section runs to the end
|
|
|
|
|
$accum .= '<ignore>' . htmlspecialchars( substr( $text, $i ) ) . '</ignore>';
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
$tagEndPos = $startPos + strlen( '<onlyinclude>' ); // past-the-end
|
|
|
|
|
$accum .= '<ignore>' . htmlspecialchars( substr( $text, $i, $tagEndPos - $i ) ) . '</ignore>';
|
|
|
|
|
$i = $tagEndPos;
|
|
|
|
|
$findOnlyinclude = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( $fakeLineStart ) {
|
|
|
|
|
$found = 'line-start';
|
|
|
|
|
$curChar = '';
|
|
|
|
|
} else {
|
|
|
|
|
# Find next opening brace, closing brace or pipe
|
|
|
|
|
$search = $searchBase;
|
|
|
|
|
if ( $stack->top === false ) {
|
|
|
|
|
$currentClosing = '';
|
|
|
|
|
} else {
|
|
|
|
|
$currentClosing = $stack->top->close;
|
|
|
|
|
$search .= $currentClosing;
|
|
|
|
|
}
|
|
|
|
|
if ( $findPipe ) {
|
|
|
|
|
$search .= '|';
|
|
|
|
|
}
|
|
|
|
|
if ( $findEquals ) {
|
|
|
|
|
// First equals will be for the template
|
|
|
|
|
$search .= '=';
|
|
|
|
|
}
|
|
|
|
|
$rule = null;
|
|
|
|
|
# Output literal section, advance input counter
|
|
|
|
|
$literalLength = strcspn( $text, $search, $i );
|
|
|
|
|
if ( $literalLength > 0 ) {
|
|
|
|
|
$accum .= htmlspecialchars( substr( $text, $i, $literalLength ) );
|
|
|
|
|
$i += $literalLength;
|
|
|
|
|
}
|
2012-08-06 11:41:28 +00:00
|
|
|
if ( $i >= $lengthText ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
if ( $currentClosing == "\n" ) {
|
|
|
|
|
// Do a past-the-end run to finish off the heading
|
|
|
|
|
$curChar = '';
|
|
|
|
|
$found = 'line-end';
|
|
|
|
|
} else {
|
|
|
|
|
# All done
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
2016-09-20 22:26:32 +00:00
|
|
|
$curChar = $curTwoChar = $text[$i];
|
|
|
|
|
if ( ( $i + 1 ) < $lengthText ) {
|
|
|
|
|
$curTwoChar .= $text[$i + 1];
|
|
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
if ( $curChar == '|' ) {
|
|
|
|
|
$found = 'pipe';
|
|
|
|
|
} elseif ( $curChar == '=' ) {
|
|
|
|
|
$found = 'equals';
|
|
|
|
|
} elseif ( $curChar == '<' ) {
|
|
|
|
|
$found = 'angle';
|
|
|
|
|
} elseif ( $curChar == "\n" ) {
|
|
|
|
|
if ( $inHeading ) {
|
|
|
|
|
$found = 'line-end';
|
|
|
|
|
} else {
|
|
|
|
|
$found = 'line-start';
|
|
|
|
|
}
|
2016-09-20 22:26:32 +00:00
|
|
|
} elseif ( $curTwoChar == $currentClosing ) {
|
|
|
|
|
$found = 'close';
|
|
|
|
|
$curChar = $curTwoChar;
|
2008-01-21 16:36:08 +00:00
|
|
|
} elseif ( $curChar == $currentClosing ) {
|
|
|
|
|
$found = 'close';
|
2016-09-20 22:26:32 +00:00
|
|
|
} elseif ( isset( $this->rules[$curTwoChar] ) ) {
|
|
|
|
|
$curChar = $curTwoChar;
|
|
|
|
|
$found = 'open';
|
|
|
|
|
$rule = $this->rules[$curChar];
|
2015-10-31 23:10:54 +00:00
|
|
|
} elseif ( isset( $this->rules[$curChar] ) ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$found = 'open';
|
2015-10-31 23:10:54 +00:00
|
|
|
$rule = $this->rules[$curChar];
|
2008-01-21 16:36:08 +00:00
|
|
|
} else {
|
2017-01-19 19:58:05 +00:00
|
|
|
# Some versions of PHP have a strcspn which stops on
|
|
|
|
|
# null characters; ignore these and continue.
|
|
|
|
|
# We also may get '-' and '}' characters here which
|
|
|
|
|
# don't match -{ or $currentClosing. Add these to
|
|
|
|
|
# output and continue.
|
|
|
|
|
if ( $curChar == '-' || $curChar == '}' ) {
|
|
|
|
|
$accum .= $curChar;
|
|
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
++$i;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( $found == 'angle' ) {
|
|
|
|
|
$matches = false;
|
|
|
|
|
// Handle </onlyinclude>
|
2014-05-10 23:03:45 +00:00
|
|
|
if ( $enableOnlyinclude
|
|
|
|
|
&& substr( $text, $i, strlen( '</onlyinclude>' ) ) == '</onlyinclude>'
|
|
|
|
|
) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$findOnlyinclude = true;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Determine element name
|
|
|
|
|
if ( !preg_match( $elementsRegex, $text, $matches, 0, $i + 1 ) ) {
|
|
|
|
|
// Element name missing or not listed
|
|
|
|
|
$accum .= '<';
|
|
|
|
|
++$i;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// Handle comments
|
|
|
|
|
if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
|
2013-08-07 00:21:00 +00:00
|
|
|
// To avoid leaving blank lines, when a sequence of
|
|
|
|
|
// space-separated comments is both preceded and followed by
|
|
|
|
|
// a newline (ignoring spaces), then
|
|
|
|
|
// trim leading and trailing spaces and the trailing newline.
|
2008-04-14 07:45:50 +00:00
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
// Find the end
|
|
|
|
|
$endPos = strpos( $text, '-->', $i + 4 );
|
|
|
|
|
if ( $endPos === false ) {
|
|
|
|
|
// Unclosed comment in input, runs to end
|
|
|
|
|
$inner = substr( $text, $i );
|
|
|
|
|
$accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
|
2012-08-06 11:41:28 +00:00
|
|
|
$i = $lengthText;
|
2008-01-21 16:36:08 +00:00
|
|
|
} else {
|
|
|
|
|
// Search backwards for leading whitespace
|
2013-08-08 23:48:16 +00:00
|
|
|
$wsStart = $i ? ( $i - strspn( $revText, " \t", $lengthText - $i ) ) : 0;
|
2013-08-07 00:21:00 +00:00
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
// Search forwards for trailing whitespace
|
2011-01-13 17:30:27 +00:00
|
|
|
// $wsEnd will be the position of the last space (or the '>' if there's none)
|
2013-08-08 23:48:16 +00:00
|
|
|
$wsEnd = $endPos + 2 + strspn( $text, " \t", $endPos + 3 );
|
2013-08-07 00:21:00 +00:00
|
|
|
|
|
|
|
|
// Keep looking forward as long as we're finding more
|
|
|
|
|
// comments.
|
2016-02-17 09:09:32 +00:00
|
|
|
$comments = [ [ $wsStart, $wsEnd ] ];
|
2013-08-07 00:21:00 +00:00
|
|
|
while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) {
|
|
|
|
|
$c = strpos( $text, '-->', $wsEnd + 4 );
|
|
|
|
|
if ( $c === false ) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
2013-08-08 23:48:16 +00:00
|
|
|
$c = $c + 2 + strspn( $text, " \t", $c + 3 );
|
2016-02-17 09:09:32 +00:00
|
|
|
$comments[] = [ $wsEnd + 1, $c ];
|
2013-08-07 00:21:00 +00:00
|
|
|
$wsEnd = $c;
|
|
|
|
|
}
|
|
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
// Eat the line if possible
|
2008-04-14 07:45:50 +00:00
|
|
|
// TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
|
|
|
|
|
// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
|
2008-01-21 16:36:08 +00:00
|
|
|
// it's a possible beneficial b/c break.
|
2008-04-14 07:45:50 +00:00
|
|
|
if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
|
2013-12-01 20:39:00 +00:00
|
|
|
&& substr( $text, $wsEnd + 1, 1 ) == "\n"
|
|
|
|
|
) {
|
2008-01-21 16:36:08 +00:00
|
|
|
// Remove leading whitespace from the end of the accumulator
|
|
|
|
|
// Sanity check first though
|
|
|
|
|
$wsLength = $i - $wsStart;
|
2013-08-08 23:48:16 +00:00
|
|
|
if ( $wsLength > 0
|
2013-12-01 20:39:00 +00:00
|
|
|
&& strspn( $accum, " \t", -$wsLength ) === $wsLength
|
|
|
|
|
) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$accum = substr( $accum, 0, -$wsLength );
|
|
|
|
|
}
|
2013-08-07 00:21:00 +00:00
|
|
|
|
|
|
|
|
// Dump all but the last comment to the accumulator
|
|
|
|
|
foreach ( $comments as $j => $com ) {
|
|
|
|
|
$startPos = $com[0];
|
|
|
|
|
$endPos = $com[1] + 1;
|
2013-08-24 15:06:25 +00:00
|
|
|
if ( $j == ( count( $comments ) - 1 ) ) {
|
2013-08-07 00:21:00 +00:00
|
|
|
break;
|
|
|
|
|
}
|
2013-08-24 15:06:25 +00:00
|
|
|
$inner = substr( $text, $startPos, $endPos - $startPos );
|
2013-08-07 00:21:00 +00:00
|
|
|
$accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
|
|
|
|
|
}
|
|
|
|
|
|
2008-01-24 09:07:47 +00:00
|
|
|
// Do a line-start run next time to look for headings after the comment
|
|
|
|
|
$fakeLineStart = true;
|
2008-01-21 16:36:08 +00:00
|
|
|
} else {
|
|
|
|
|
// No line to eat, just take the comment itself
|
|
|
|
|
$startPos = $i;
|
|
|
|
|
$endPos += 2;
|
|
|
|
|
}
|
|
|
|
|
|
2008-01-24 04:29:56 +00:00
|
|
|
if ( $stack->top ) {
|
2008-01-24 09:07:47 +00:00
|
|
|
$part = $stack->top->getCurrentPart();
|
2013-03-24 10:01:51 +00:00
|
|
|
if ( !( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) ) {
|
2008-01-24 09:07:47 +00:00
|
|
|
$part->visualEnd = $wsStart;
|
2008-01-24 04:29:56 +00:00
|
|
|
}
|
2011-04-21 22:20:48 +00:00
|
|
|
// Else comments abutting, no change in visual end
|
2011-04-22 14:25:17 +00:00
|
|
|
$part->commentEnd = $endPos;
|
2008-01-24 04:29:56 +00:00
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
$i = $endPos + 1;
|
|
|
|
|
$inner = substr( $text, $startPos, $endPos - $startPos + 1 );
|
|
|
|
|
$accum .= '<comment>' . htmlspecialchars( $inner ) . '</comment>';
|
|
|
|
|
}
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
$name = $matches[1];
|
2008-03-05 01:07:47 +00:00
|
|
|
$lowerName = strtolower( $name );
|
2008-01-21 16:36:08 +00:00
|
|
|
$attrStart = $i + strlen( $name ) + 1;
|
|
|
|
|
|
|
|
|
|
// Find end of tag
|
|
|
|
|
$tagEndPos = $noMoreGT ? false : strpos( $text, '>', $attrStart );
|
|
|
|
|
if ( $tagEndPos === false ) {
|
|
|
|
|
// Infinite backtrack
|
|
|
|
|
// Disable tag search to prevent worst-case O(N^2) performance
|
|
|
|
|
$noMoreGT = true;
|
|
|
|
|
$accum .= '<';
|
|
|
|
|
++$i;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Handle ignored tags
|
2008-03-05 01:07:47 +00:00
|
|
|
if ( in_array( $lowerName, $ignoredTags ) ) {
|
2014-05-10 23:03:45 +00:00
|
|
|
$accum .= '<ignore>'
|
|
|
|
|
. htmlspecialchars( substr( $text, $i, $tagEndPos - $i + 1 ) )
|
|
|
|
|
. '</ignore>';
|
2008-01-21 16:36:08 +00:00
|
|
|
$i = $tagEndPos + 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$tagStartPos = $i;
|
2013-04-13 11:36:24 +00:00
|
|
|
if ( $text[$tagEndPos - 1] == '/' ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$attrEnd = $tagEndPos - 1;
|
|
|
|
|
$inner = null;
|
|
|
|
|
$i = $tagEndPos + 1;
|
|
|
|
|
$close = '';
|
|
|
|
|
} else {
|
|
|
|
|
$attrEnd = $tagEndPos;
|
|
|
|
|
// Find closing tag
|
Preprocessor: Don't allow unclosed extension tags (matching until end of input)
(Previously done in f51d0d9a819f8f1c181350ced2f015ce97985fcc and
reverted in 543f46e9c08e0ff8c5e8b4e917fcc045730ef1bc.)
I think it's saner to treat this as invalid syntax, and output the
mismatched tag code verbatim. The current behavior is particularly
annoying for <ref> tags, which often swallow everything afterwards.
This does not affect HTML tags, though. Assuming Tidy is enabled, they
are still auto-closed at the end of the page content. (For tags that
"shadow" a HTML tag name, this results in the tag being treated as a
HTML tag. This currently only affects <pre> tags: if unclosed, they
are still displayed as preformatted text, but without suppressing
wikitext formatting.)
It also does not affect <includeonly>, <noinclude> and <onlyinclude>
tags. Changing this behavior now would be too disruptive to existing
content, and is the reason why previous attempt was reverted. (They
are already special-cased enough that this isn't too weird, for example
mismatched closing tags are hidden.)
Related to T17712 and T58306. I think this brings the PHP parser closer
to Parsoid's interpretation.
It reduces performance somewhat in the worst case, though. Testing with
https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with
3000 opening tags of 15 different types), parsing time rises from
~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN),
where N is bytes of input and k is the number of types of tags present
on the page. Maximum k shouldn't exceed 30 or so in reasonable setups
(depends on installed extensions, it's 20 on English Wikipedia).
Change-Id: Ide8b034e464eefb1b7c9e2a48ed06e21a7f8d434
2016-02-04 01:13:24 +00:00
|
|
|
if (
|
|
|
|
|
!isset( $noMoreClosingTag[$name] ) &&
|
|
|
|
|
preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
|
2013-12-01 20:39:00 +00:00
|
|
|
$text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
|
|
|
|
|
) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
|
|
|
|
|
$i = $matches[0][1] + strlen( $matches[0][0] );
|
|
|
|
|
$close = '<close>' . htmlspecialchars( $matches[0][0] ) . '</close>';
|
|
|
|
|
} else {
|
Preprocessor: Don't allow unclosed extension tags (matching until end of input)
(Previously done in f51d0d9a819f8f1c181350ced2f015ce97985fcc and
reverted in 543f46e9c08e0ff8c5e8b4e917fcc045730ef1bc.)
I think it's saner to treat this as invalid syntax, and output the
mismatched tag code verbatim. The current behavior is particularly
annoying for <ref> tags, which often swallow everything afterwards.
This does not affect HTML tags, though. Assuming Tidy is enabled, they
are still auto-closed at the end of the page content. (For tags that
"shadow" a HTML tag name, this results in the tag being treated as a
HTML tag. This currently only affects <pre> tags: if unclosed, they
are still displayed as preformatted text, but without suppressing
wikitext formatting.)
It also does not affect <includeonly>, <noinclude> and <onlyinclude>
tags. Changing this behavior now would be too disruptive to existing
content, and is the reason why previous attempt was reverted. (They
are already special-cased enough that this isn't too weird, for example
mismatched closing tags are hidden.)
Related to T17712 and T58306. I think this brings the PHP parser closer
to Parsoid's interpretation.
It reduces performance somewhat in the worst case, though. Testing with
https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with
3000 opening tags of 15 different types), parsing time rises from
~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN),
where N is bytes of input and k is the number of types of tags present
on the page. Maximum k shouldn't exceed 30 or so in reasonable setups
(depends on installed extensions, it's 20 on English Wikipedia).
Change-Id: Ide8b034e464eefb1b7c9e2a48ed06e21a7f8d434
2016-02-04 01:13:24 +00:00
|
|
|
// No end tag
|
|
|
|
|
if ( in_array( $name, $xmlishAllowMissingEndTag ) ) {
|
|
|
|
|
// Let it run out to the end of the text.
|
|
|
|
|
$inner = substr( $text, $tagEndPos + 1 );
|
|
|
|
|
$i = $lengthText;
|
|
|
|
|
$close = '';
|
|
|
|
|
} else {
|
|
|
|
|
// Don't match the tag, treat opening tag as literal and resume parsing.
|
|
|
|
|
$i = $tagEndPos + 1;
|
|
|
|
|
$accum .= htmlspecialchars( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
|
|
|
|
|
// Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
|
|
|
|
|
$noMoreClosingTag[$name] = true;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// <includeonly> and <noinclude> just become <ignore> tags
|
2008-03-05 01:07:47 +00:00
|
|
|
if ( in_array( $lowerName, $ignoredElements ) ) {
|
2008-04-14 07:45:50 +00:00
|
|
|
$accum .= '<ignore>' . htmlspecialchars( substr( $text, $tagStartPos, $i - $tagStartPos ) )
|
2008-01-21 16:36:08 +00:00
|
|
|
. '</ignore>';
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$accum .= '<ext>';
|
|
|
|
|
if ( $attrEnd <= $attrStart ) {
|
|
|
|
|
$attr = '';
|
|
|
|
|
} else {
|
|
|
|
|
$attr = substr( $text, $attrStart, $attrEnd - $attrStart );
|
|
|
|
|
}
|
|
|
|
|
$accum .= '<name>' . htmlspecialchars( $name ) . '</name>' .
|
2008-04-14 07:45:50 +00:00
|
|
|
// Note that the attr element contains the whitespace between name and attribute,
|
2008-01-21 16:36:08 +00:00
|
|
|
// this is necessary for precise reconstruction during pre-save transform.
|
|
|
|
|
'<attr>' . htmlspecialchars( $attr ) . '</attr>';
|
|
|
|
|
if ( $inner !== null ) {
|
|
|
|
|
$accum .= '<inner>' . htmlspecialchars( $inner ) . '</inner>';
|
|
|
|
|
}
|
|
|
|
|
$accum .= $close . '</ext>';
|
2011-05-28 14:54:13 +00:00
|
|
|
} elseif ( $found == 'line-start' ) {
|
2008-04-14 07:45:50 +00:00
|
|
|
// Is this the start of a heading?
|
2008-01-21 16:36:08 +00:00
|
|
|
// Line break belongs before the heading element in any case
|
|
|
|
|
if ( $fakeLineStart ) {
|
|
|
|
|
$fakeLineStart = false;
|
|
|
|
|
} else {
|
|
|
|
|
$accum .= $curChar;
|
|
|
|
|
$i++;
|
|
|
|
|
}
|
2008-04-14 07:45:50 +00:00
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
$count = strspn( $text, '=', $i, 6 );
|
2008-01-24 09:07:47 +00:00
|
|
|
if ( $count == 1 && $findEquals ) {
|
2014-05-10 23:03:45 +00:00
|
|
|
// DWIM: This looks kind of like a name/value separator.
|
|
|
|
|
// Let's let the equals handler have it and break the
|
|
|
|
|
// potential heading. This is heuristic, but AFAICT the
|
|
|
|
|
// methods for completely correct disambiguation are very
|
|
|
|
|
// complex.
|
2008-01-24 09:07:47 +00:00
|
|
|
} elseif ( $count > 0 ) {
|
2016-02-17 09:09:32 +00:00
|
|
|
$piece = [
|
2008-01-21 16:36:08 +00:00
|
|
|
'open' => "\n",
|
|
|
|
|
'close' => "\n",
|
2016-02-17 09:09:32 +00:00
|
|
|
'parts' => [ new PPDPart( str_repeat( '=', $count ) ) ],
|
2008-01-21 16:36:08 +00:00
|
|
|
'startPos' => $i,
|
2016-02-17 09:09:32 +00:00
|
|
|
'count' => $count ];
|
2008-01-21 16:36:08 +00:00
|
|
|
$stack->push( $piece );
|
|
|
|
|
$accum =& $stack->getAccum();
|
2017-12-07 21:16:47 +00:00
|
|
|
$stackFlags = $stack->getFlags();
|
|
|
|
|
if ( isset( $stackFlags['findEquals'] ) ) {
|
|
|
|
|
$findEquals = $stackFlags['findEquals'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $stackFlags['findPipe'] ) ) {
|
|
|
|
|
$findPipe = $stackFlags['findPipe'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $stackFlags['inHeading'] ) ) {
|
|
|
|
|
$inHeading = $stackFlags['inHeading'];
|
|
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
$i += $count;
|
|
|
|
|
}
|
2011-05-28 14:54:13 +00:00
|
|
|
} elseif ( $found == 'line-end' ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$piece = $stack->top;
|
|
|
|
|
// A heading must be open, otherwise \n wouldn't have been in the search list
|
2018-05-07 08:09:19 +00:00
|
|
|
// FIXME: Don't use assert()
|
|
|
|
|
// phpcs:ignore MediaWiki.Usage.ForbiddenFunctions.assert
|
2016-08-15 06:02:53 +00:00
|
|
|
assert( $piece->open === "\n" );
|
2008-01-24 09:07:47 +00:00
|
|
|
$part = $piece->getCurrentPart();
|
2014-05-10 23:03:45 +00:00
|
|
|
// Search back through the input to see if it has a proper close.
|
|
|
|
|
// Do this using the reversed string since the other solutions
|
|
|
|
|
// (end anchor, etc.) are inefficient.
|
2012-08-06 11:41:28 +00:00
|
|
|
$wsLength = strspn( $revText, " \t", $lengthText - $i );
|
2008-01-24 09:07:47 +00:00
|
|
|
$searchStart = $i - $wsLength;
|
|
|
|
|
if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) {
|
|
|
|
|
// Comment found at line end
|
|
|
|
|
// Search for equals signs before the comment
|
|
|
|
|
$searchStart = $part->visualEnd;
|
2012-08-06 11:41:28 +00:00
|
|
|
$searchStart -= strspn( $revText, " \t", $lengthText - $searchStart );
|
2008-01-24 09:07:47 +00:00
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
$count = $piece->count;
|
2012-08-06 11:41:28 +00:00
|
|
|
$equalsLength = strspn( $revText, '=', $lengthText - $searchStart );
|
2008-01-24 09:07:47 +00:00
|
|
|
if ( $equalsLength > 0 ) {
|
2010-06-21 20:33:07 +00:00
|
|
|
if ( $searchStart - $equalsLength == $piece->startPos ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
// This is just a single string of equals signs on its own line
|
2013-03-04 08:44:38 +00:00
|
|
|
// Replicate the doHeadings behavior /={count}(.+)={count}/
|
2008-01-21 16:36:08 +00:00
|
|
|
// First find out how many equals signs there really are (don't stop at 6)
|
2008-01-24 09:07:47 +00:00
|
|
|
$count = $equalsLength;
|
2008-01-21 16:36:08 +00:00
|
|
|
if ( $count < 3 ) {
|
|
|
|
|
$count = 0;
|
|
|
|
|
} else {
|
|
|
|
|
$count = min( 6, intval( ( $count - 1 ) / 2 ) );
|
|
|
|
|
}
|
|
|
|
|
} else {
|
2008-01-24 09:07:47 +00:00
|
|
|
$count = min( $equalsLength, $count );
|
2008-01-21 16:36:08 +00:00
|
|
|
}
|
|
|
|
|
if ( $count > 0 ) {
|
|
|
|
|
// Normal match, output <h>
|
|
|
|
|
$element = "<h level=\"$count\" i=\"$headingIndex\">$accum</h>";
|
|
|
|
|
$headingIndex++;
|
|
|
|
|
} else {
|
|
|
|
|
// Single equals sign on its own line, count=0
|
|
|
|
|
$element = $accum;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// No match, no <h>, just pass down the inner text
|
|
|
|
|
$element = $accum;
|
|
|
|
|
}
|
|
|
|
|
// Unwind the stack
|
|
|
|
|
$stack->pop();
|
|
|
|
|
$accum =& $stack->getAccum();
|
2017-12-07 21:16:47 +00:00
|
|
|
$stackFlags = $stack->getFlags();
|
|
|
|
|
if ( isset( $stackFlags['findEquals'] ) ) {
|
|
|
|
|
$findEquals = $stackFlags['findEquals'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $stackFlags['findPipe'] ) ) {
|
|
|
|
|
$findPipe = $stackFlags['findPipe'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $stackFlags['inHeading'] ) ) {
|
|
|
|
|
$inHeading = $stackFlags['inHeading'];
|
|
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
|
|
|
|
|
// Append the result to the enclosing accumulator
|
|
|
|
|
$accum .= $element;
|
|
|
|
|
// Note that we do NOT increment the input pointer.
|
2008-04-14 07:45:50 +00:00
|
|
|
// This is because the closing linebreak could be the opening linebreak of
|
2008-01-21 16:36:08 +00:00
|
|
|
// another heading. Infinite loops are avoided because the next iteration MUST
|
2008-04-14 07:45:50 +00:00
|
|
|
// hit the heading open case above, which unconditionally increments the
|
2008-01-21 16:36:08 +00:00
|
|
|
// input pointer.
|
2010-11-25 22:24:28 +00:00
|
|
|
} elseif ( $found == 'open' ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
# count opening brace characters
|
2016-09-20 22:26:32 +00:00
|
|
|
$curLen = strlen( $curChar );
|
2017-01-19 19:58:05 +00:00
|
|
|
$count = ( $curLen > 1 ) ?
|
|
|
|
|
# allow the final character to repeat
|
2017-08-11 13:53:17 +00:00
|
|
|
strspn( $text, $curChar[$curLen - 1], $i + 1 ) + 1 :
|
2017-01-19 19:58:05 +00:00
|
|
|
strspn( $text, $curChar, $i );
|
2008-01-21 16:36:08 +00:00
|
|
|
|
2018-03-10 00:40:36 +00:00
|
|
|
$savedPrefix = '';
|
|
|
|
|
$lineStart = ( $i > 0 && $text[$i - 1] == "\n" );
|
|
|
|
|
|
|
|
|
|
if ( $curChar === "-{" && $count > $curLen ) {
|
|
|
|
|
// -{ => {{ transition because rightmost wins
|
|
|
|
|
$savedPrefix = '-';
|
|
|
|
|
$i++;
|
|
|
|
|
$curChar = '{';
|
|
|
|
|
$count--;
|
|
|
|
|
$rule = $this->rules[$curChar];
|
|
|
|
|
}
|
|
|
|
|
|
2008-01-21 16:36:08 +00:00
|
|
|
# we need to add to stack only if opening brace count is enough for one of the rules
|
|
|
|
|
if ( $count >= $rule['min'] ) {
|
|
|
|
|
# Add it to the stack
|
2016-02-17 09:09:32 +00:00
|
|
|
$piece = [
|
2008-01-21 16:36:08 +00:00
|
|
|
'open' => $curChar,
|
|
|
|
|
'close' => $rule['end'],
|
2018-03-10 00:40:36 +00:00
|
|
|
'savedPrefix' => $savedPrefix,
|
2008-01-21 16:36:08 +00:00
|
|
|
'count' => $count,
|
2018-03-10 00:40:36 +00:00
|
|
|
'lineStart' => $lineStart,
|
2016-02-17 09:09:32 +00:00
|
|
|
];
|
2008-01-21 16:36:08 +00:00
|
|
|
|
|
|
|
|
$stack->push( $piece );
|
|
|
|
|
$accum =& $stack->getAccum();
|
2017-12-07 21:16:47 +00:00
|
|
|
$stackFlags = $stack->getFlags();
|
|
|
|
|
if ( isset( $stackFlags['findEquals'] ) ) {
|
|
|
|
|
$findEquals = $stackFlags['findEquals'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $stackFlags['findPipe'] ) ) {
|
|
|
|
|
$findPipe = $stackFlags['findPipe'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $stackFlags['inHeading'] ) ) {
|
|
|
|
|
$inHeading = $stackFlags['inHeading'];
|
|
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
} else {
|
|
|
|
|
# Add literal brace(s)
|
2018-03-10 00:40:36 +00:00
|
|
|
$accum .= htmlspecialchars( $savedPrefix . str_repeat( $curChar, $count ) );
|
2008-01-21 16:36:08 +00:00
|
|
|
}
|
2017-01-19 19:58:05 +00:00
|
|
|
$i += $count;
|
2010-11-25 22:24:28 +00:00
|
|
|
} elseif ( $found == 'close' ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$piece = $stack->top;
|
|
|
|
|
# lets check if there are enough characters for closing brace
|
|
|
|
|
$maxCount = $piece->count;
|
2017-01-19 19:58:05 +00:00
|
|
|
if ( $piece->close === '}-' && $curChar === '}' ) {
|
|
|
|
|
$maxCount--; # don't try to match closing '-' as a '}'
|
|
|
|
|
}
|
2016-09-20 22:26:32 +00:00
|
|
|
$curLen = strlen( $curChar );
|
2017-01-19 19:58:05 +00:00
|
|
|
$count = ( $curLen > 1 ) ? $curLen :
|
|
|
|
|
strspn( $text, $curChar, $i, $maxCount );
|
2008-01-21 16:36:08 +00:00
|
|
|
|
|
|
|
|
# check for maximum matching characters (if there are 5 closing
|
|
|
|
|
# characters, we will probably need only 3 - depending on the rules)
|
2015-10-31 23:10:54 +00:00
|
|
|
$rule = $this->rules[$piece->open];
|
2008-01-21 16:36:08 +00:00
|
|
|
if ( $count > $rule['max'] ) {
|
|
|
|
|
# The specified maximum exists in the callback array, unless the caller
|
|
|
|
|
# has made an error
|
|
|
|
|
$matchingCount = $rule['max'];
|
|
|
|
|
} else {
|
|
|
|
|
# Count is less than the maximum
|
|
|
|
|
# Skip any gaps in the callback array to find the true largest match
|
|
|
|
|
# Need to use array_key_exists not isset because the callback can be null
|
|
|
|
|
$matchingCount = $count;
|
|
|
|
|
while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $rule['names'] ) ) {
|
|
|
|
|
--$matchingCount;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2011-05-28 14:54:13 +00:00
|
|
|
if ( $matchingCount <= 0 ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
# No matching element found in callback array
|
|
|
|
|
# Output a literal closing brace and continue
|
2017-01-19 19:58:05 +00:00
|
|
|
$endText = substr( $text, $i, $count );
|
|
|
|
|
$accum .= htmlspecialchars( $endText );
|
|
|
|
|
$i += $count;
|
2008-01-21 16:36:08 +00:00
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
$name = $rule['names'][$matchingCount];
|
|
|
|
|
if ( $name === null ) {
|
|
|
|
|
// No element, just literal text
|
2017-01-19 19:58:05 +00:00
|
|
|
$endText = substr( $text, $i, $matchingCount );
|
|
|
|
|
$element = $piece->breakSyntax( $matchingCount ) . $endText;
|
2008-01-21 16:36:08 +00:00
|
|
|
} else {
|
|
|
|
|
# Create XML element
|
|
|
|
|
# Note: $parts is already XML, does not need to be encoded further
|
|
|
|
|
$parts = $piece->parts;
|
2008-01-24 04:29:56 +00:00
|
|
|
$title = $parts[0]->out;
|
2008-01-21 16:36:08 +00:00
|
|
|
unset( $parts[0] );
|
|
|
|
|
|
2008-04-14 07:45:50 +00:00
|
|
|
# The invocation is at the start of the line if lineStart is set in
|
2008-01-21 16:36:08 +00:00
|
|
|
# the stack, and all opening brackets are used up.
|
2018-03-10 00:40:36 +00:00
|
|
|
if ( $maxCount == $matchingCount &&
|
|
|
|
|
!empty( $piece->lineStart ) &&
|
|
|
|
|
strlen( $piece->savedPrefix ) == 0 ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$attr = ' lineStart="1"';
|
|
|
|
|
} else {
|
|
|
|
|
$attr = '';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$element = "<$name$attr>";
|
|
|
|
|
$element .= "<title>$title</title>";
|
|
|
|
|
$argIndex = 1;
|
2010-10-14 20:53:04 +00:00
|
|
|
foreach ( $parts as $part ) {
|
2008-01-24 04:29:56 +00:00
|
|
|
if ( isset( $part->eqpos ) ) {
|
|
|
|
|
$argName = substr( $part->out, 0, $part->eqpos );
|
|
|
|
|
$argValue = substr( $part->out, $part->eqpos + 1 );
|
2008-01-21 16:36:08 +00:00
|
|
|
$element .= "<part><name>$argName</name>=<value>$argValue</value></part>";
|
|
|
|
|
} else {
|
2008-01-24 04:29:56 +00:00
|
|
|
$element .= "<part><name index=\"$argIndex\" /><value>{$part->out}</value></part>";
|
2008-01-21 16:36:08 +00:00
|
|
|
$argIndex++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
$element .= "</$name>";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Advance input pointer
|
2017-01-19 19:58:05 +00:00
|
|
|
$i += $matchingCount;
|
2008-01-21 16:36:08 +00:00
|
|
|
|
|
|
|
|
# Unwind the stack
|
|
|
|
|
$stack->pop();
|
|
|
|
|
$accum =& $stack->getAccum();
|
|
|
|
|
|
|
|
|
|
# Re-add the old stack element if it still has unmatched opening characters remaining
|
2011-05-28 14:54:13 +00:00
|
|
|
if ( $matchingCount < $piece->count ) {
|
2016-02-17 09:09:32 +00:00
|
|
|
$piece->parts = [ new PPDPart ];
|
2008-01-21 16:36:08 +00:00
|
|
|
$piece->count -= $matchingCount;
|
|
|
|
|
# do we still qualify for any callback with remaining count?
|
2015-10-31 23:10:54 +00:00
|
|
|
$min = $this->rules[$piece->open]['min'];
|
2012-05-22 23:56:33 +00:00
|
|
|
if ( $piece->count >= $min ) {
|
|
|
|
|
$stack->push( $piece );
|
|
|
|
|
$accum =& $stack->getAccum();
|
2018-03-10 00:40:36 +00:00
|
|
|
} elseif ( $piece->count == 1 && $piece->open === '{' && $piece->savedPrefix === '-' ) {
|
|
|
|
|
$piece->savedPrefix = '';
|
|
|
|
|
$piece->open = '-{';
|
|
|
|
|
$piece->count = 2;
|
|
|
|
|
$piece->close = $this->rules[$piece->open]['end'];
|
|
|
|
|
$stack->push( $piece );
|
|
|
|
|
$accum =& $stack->getAccum();
|
2012-05-22 23:56:33 +00:00
|
|
|
} else {
|
2017-01-19 19:58:05 +00:00
|
|
|
$s = substr( $piece->open, 0, -1 );
|
|
|
|
|
$s .= str_repeat(
|
|
|
|
|
substr( $piece->open, -1 ),
|
|
|
|
|
$piece->count - strlen( $s )
|
|
|
|
|
);
|
2018-03-10 00:40:36 +00:00
|
|
|
$accum .= $piece->savedPrefix . $s;
|
2008-01-21 16:36:08 +00:00
|
|
|
}
|
2018-03-10 00:40:36 +00:00
|
|
|
} elseif ( $piece->savedPrefix !== '' ) {
|
|
|
|
|
$accum .= $piece->savedPrefix;
|
2008-01-21 16:36:08 +00:00
|
|
|
}
|
2018-03-10 00:40:36 +00:00
|
|
|
|
2017-12-07 21:16:47 +00:00
|
|
|
$stackFlags = $stack->getFlags();
|
|
|
|
|
if ( isset( $stackFlags['findEquals'] ) ) {
|
|
|
|
|
$findEquals = $stackFlags['findEquals'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $stackFlags['findPipe'] ) ) {
|
|
|
|
|
$findPipe = $stackFlags['findPipe'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $stackFlags['inHeading'] ) ) {
|
|
|
|
|
$inHeading = $stackFlags['inHeading'];
|
|
|
|
|
}
|
2008-01-21 16:36:08 +00:00
|
|
|
|
|
|
|
|
# Add XML element to the enclosing accumulator
|
|
|
|
|
$accum .= $element;
|
2011-05-28 14:54:13 +00:00
|
|
|
} elseif ( $found == 'pipe' ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$findEquals = true; // shortcut for getFlags()
|
2008-01-24 04:29:56 +00:00
|
|
|
$stack->addPart();
|
2008-01-21 16:36:08 +00:00
|
|
|
$accum =& $stack->getAccum();
|
|
|
|
|
++$i;
|
2011-05-28 14:54:13 +00:00
|
|
|
} elseif ( $found == 'equals' ) {
|
2008-01-21 16:36:08 +00:00
|
|
|
$findEquals = false; // shortcut for getFlags()
|
2008-01-24 04:29:56 +00:00
|
|
|
$stack->getCurrentPart()->eqpos = strlen( $accum );
|
2008-01-21 16:36:08 +00:00
|
|
|
$accum .= '=';
|
|
|
|
|
++$i;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Output any remaining unclosed brackets
|
|
|
|
|
foreach ( $stack->stack as $piece ) {
|
2008-01-24 04:29:56 +00:00
|
|
|
$stack->rootAccum .= $piece->breakSyntax();
|
2008-01-21 16:36:08 +00:00
|
|
|
}
|
2008-01-24 04:29:56 +00:00
|
|
|
$stack->rootAccum .= '</root>';
|
|
|
|
|
$xml = $stack->rootAccum;
|
2008-01-21 16:36:08 +00:00
|
|
|
|
2009-02-06 20:27:58 +00:00
|
|
|
return $xml;
|
2008-01-21 16:36:08 +00:00
|
|
|
}
|
|
|
|
|
}
|