<?php
/**
 * Regex-driven tokenizer for wiki markup.
 *
 * preParse() runs a single preg_match_all() over the whole text and records
 * every markup match together with its byte offset. previewToken() and
 * nextToken() then walk those recorded matches, returning the plain text that
 * lies between two matches as a "text" token and each match as a token whose
 * "type" is the matched markup.
 *
 * A token is an array with these keys:
 *   "type"      - "text", "[[" (prefixed link), "----" (horizontal rule), or
 *                 the matched markup string itself
 *   "text"      - the text of a "text" token, or the prefix of a prefixed
 *                 link; not set for other token types
 *   "pos"       - byte offset in the text at which the token starts
 *   "mPos"      - value $mPos takes once the token has been consumed
 *   "mMatchPos" - value $mMatchPos takes once the token has been consumed
 *
 * All offsets and lengths are byte based (strlen/substr).
 */
class Tokenizer {
	# Text to be processed by the tokenizer
	public $mText;

	# Current byte position of the tokenizer in $mText
	public $mPos = 0;

	# Length of $mText in bytes
	public $mTextLength = 0;

	# Token (match) count, computed in preParse()
	public $mCount = 0;

	# Matches of the tokenizer regex, computed in preParse()
	public $mMatch = array();

	# Index of the next unconsumed match. Each match can yield up to two
	# tokens: the text in front of it and the matched token itself.
	public $mMatchPos = 0;

	/**
	 * Legacy PHP 4 style constructor name, kept as a plain method so that any
	 * code calling it explicitly keeps working. It is declared before
	 * __construct() on purpose: old PHP 5 releases complain when the
	 * class-named method follows an already defined constructor.
	 */
	/* private */ function Tokenizer()
	{
		$this->__construct();
	}

	/**
	 * Start at the beginning of the text.
	 *
	 * PHP 8 no longer treats a method named after the class as a constructor,
	 * so the initialisation has to live here. Use newFromString() to obtain a
	 * ready-to-use instance.
	 */
	function __construct()
	{
		$this->mPos = 0;
	}

	/**
	 * Factory function: build a tokenizer for $s and pre-parse it.
	 *
	 * Declared static because it is meant to be called on the class; calling a
	 * non-static method statically is a fatal error on PHP 8.
	 *
	 * @param string $s Text to tokenize
	 * @return Tokenizer
	 */
	public static function newFromString( $s )
	{
		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );
		$t->preParse();
		return $t;
	}

	/**
	 * Build the tokenizer regex, apply it to $mText and store the matches.
	 *
	 * Reads the global $wgLang and calls its linkPrefixExtension() and
	 * tokenizerRegex() methods. Fills $mMatch and $mCount and resets
	 * $mMatchPos to 0.
	 */
	public function preParse()
	{
		global $wgLang;

		# Build up the regex, step by step.
		# Basic features: quotes for <em>/<strong> and hyphens for <hr>
		$regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*";

		# Append regex for linkPrefixExtension
		if ( $wgLang->linkPrefixExtension() ) {
			# The prefix is captured as group 2 (group 1 is the outer group
			# added below), which is what previewToken() inspects.
			# NOTE(review): this alternative needs at least one prefix
			# character, so a bare "[[" is not matched in this mode unless
			# tokenizerRegex() adds it - confirm against the callers.
			$regex .= "|([a-zA-Z\x80-\xff]+)\[\[";
		} else {
			# end tag that can start with 3 [
			$regex .= "|\[\[\[?";
		}

		# Closing link
		$regex .= "|\]\]";

		# Magic words that automatically generate links
		$regex .= "|ISBN |RFC ";

		# Language-specific additions. The returned string is appended verbatim
		# inside the alternation, so presumably it is empty or starts with "|"
		# - TODO confirm against the language classes.
		$regex .= $wgLang->tokenizerRegex();

		# Finalize regex
		$regex = "/(" . $regex . ")/";

		# Apply the regex to the text
		$count = preg_match_all( $regex, $this->mText, $this->mMatch,
			PREG_PATTERN_ORDER | PREG_OFFSET_CAPTURE );

		# preg_match_all() returns false on failure; treat that as "no
		# matches" so the whole text is handed out as one text token.
		$this->mCount = ( $count === false ) ? 0 : $count;
		$this->mMatchPos = 0;
	}

	/**
	 * Return the next token and advance the tokenizer past it.
	 *
	 * @return array|false Token array, or false when the text is exhausted
	 */
	public function nextToken()
	{
		$token = $this->previewToken();
		if ( $token ) {
			$this->mMatchPos = $token["mMatchPos"];
			$this->mPos = $token["mPos"];
		}
		return $token;
	}

	/**
	 * Return the next token without changing the tokenizer state.
	 *
	 * @return array|false Token array, or false when the text is exhausted
	 */
	public function previewToken()
	{
		if ( $this->mMatchPos >= $this->mCount ) {
			# All matches are consumed; whatever is left is plain text.
			if ( $this->mPos >= $this->mTextLength ) {
				return false;
			}
			return array(
				"pos" => $this->mPos,
				"type" => "text",
				"text" => substr( $this->mText, $this->mPos ),
				# What the pointers would change to if this would not just be a preview
				"mPos" => $this->mTextLength,
				"mMatchPos" => $this->mMatchPos
			);
		}

		$matchText = $this->mMatch[0][$this->mMatchPos][0];
		$matchStart = $this->mMatch[0][$this->mMatchPos][1];

		$token = array( "pos" => $this->mPos );

		if ( $this->mPos < $matchStart ) {
			# Plain text in front of the next match. The match itself stays
			# pending, so the match index does not move.
			$token["type"] = "text";
			$token["text"] = substr( $this->mText, $this->mPos, $matchStart - $this->mPos );
			# What the pointers would change to if this would not just be a preview
			$token["mMatchPos"] = $this->mMatchPos;
			$token["mPos"] = $matchStart;
			return $token;
		}

		# If linkPrefixExtension is set, $this->mMatch[2][$this->mMatchPos][0]
		# contains the link prefix, or is empty if no link prefix exists.
		if ( isset( $this->mMatch[2] ) && $this->mMatch[2][$this->mMatchPos][0] ) {
			# prefixed link open tag, the full match is "prefix[["
			$token["type"] = "[[";
			$token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix
		} else {
			$token["type"] = $matchText;
			# The <hr> match starts with a newline, hence the offset of 1.
			if ( substr( $matchText, 1, 4 ) === "----" ) {
				# any number of hyphens bigger than four is a <HR>.
				# strip down to four.
				$token["type"] = "----";
			}
		}

		# What the pointers would change to if this would not just be a preview
		$token["mPos"] = $this->mPos + strlen( $matchText );
		$token["mMatchPos"] = $this->mMatchPos + 1;
		return $token;
	}
}
|
|
|
|
|
|