295 lines
8.9 KiB
PHP
295 lines
8.9 KiB
PHP
<?php
|
|
# Tokenizer
#
# Splits a text into tokens. A token is an array with at least the keys
# 'type' and 'text'. Plain text is returned as tokens of type 'text'; markup
# that needs special attention (links, apostrophe runs, "----", headings,
# "RFC ", "ISBN ", blanks that must not break, <timeline>) gets its own
# token type. Whenever such markup is found, the text collected so far is
# returned first and the special token is queued for the following call.
class Tokenizer {
	# Defaults are given for every member so that the queue is a real array
	# and the position is 0 even before the constructor has run.
	/* private */ var $mText = '',              # Text to be processed by the tokenizer
		$mPos = 0,                              # current position of tokenizer in text
		$mTextLength = 0,                       # Length of $mText (in bytes)
		$mQueuedToken = array(),                # Tokens that were already found, but not
		                                        # returned yet.
		$linkPrefixExtension = false;           # value of $wgLang->linkPrefixExtension()

	# Constructor (old-style, named after the class).
	# Resets the position and the token queue and reads the link prefix
	# setting from the global language object.
	/* private */ function Tokenizer()
	{
		global $wgLang;

		$this->mPos = 0;
		// The queue is read everywhere as mQueuedToken; initialise that
		// member (the former code initialised an unused "mTokenQueue").
		$this->mQueuedToken = array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	# factory function
	# Returns a new Tokenizer working on the string $s.
	function newFromString( $s )
	{
		$fname = 'Tokenizer::newFromString';
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}

	// Return the next token, but do not increase the pointer. The next call
	// to previewToken or nextToken will return the same token again.
	// Actually, the pointer is increased, but the token is queued. The next
	// call to previewToken or nextToken will check the queue and return
	// the stored token.
	function previewToken()
	{
		$fname = 'Tokenizer::previewToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// still one token from the last round around. Return that one first.
			$token = $this->mQueuedToken[0];
		} else {
			$token = $this->nextToken();
			// Put it at the FRONT of the queue: nextToken() may already have
			// queued the special token that follows this one.
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}

	// get the next token
	// proceeds character by character through the text, looking for characters needing
	// special attention. Those are currently: R, I, [, ], ', newline / carriage
	// return, !, ?, :, the digits, the guillemet bytes \302 and \273, and &.
	//
	// Returns the token array, or false if no text is left.
	// When a special sequence is found, its token is appended to the queue and
	// the text collected so far (possibly empty) is returned; the queued token
	// is delivered by the following call.
	function nextToken()
	{
		$fname = 'Tokenizer::nextToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// still one token from the last round around. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos > $this->mTextLength ) {
			// If no text is left, return "false".
			$token = false;
		} else {
			$token = array( 'text' => '', 'type' => 'text' );

			// The loop deliberately runs once with mPos == mTextLength, as it
			// always did: that pass sees an empty character, appends nothing
			// and pushes mPos past the end so that the next call returns false.
			while ( $this->mPos <= $this->mTextLength ) {
				// Explicit bounds check instead of an @-suppressed read past
				// the end of the string.
				if ( $this->mPos < $this->mTextLength ) {
					$ch = $this->mText[$this->mPos];
				} else {
					$ch = '';
				}

				switch ( $ch ) {
					case 'R': // for "RFC "
						if ( $this->continues( 'FC ' ) ) {
							$queueToken['type'] = $queueToken['text'] = 'RFC ';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;
					case 'I': // for "ISBN "
						if ( $this->continues( 'SBN ' ) ) {
							$queueToken['type'] = $queueToken['text'] = 'ISBN ';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 4;
							break 2; // switch + while
						}
						break;
					case '[': // for links "[["
						if ( $this->continues( '[[' ) ) {
							$queueToken['type'] = '[[[';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						} else if ( $this->continues( '[' ) ) {
							$queueToken['type'] = '[[';
							$queueToken['text'] = '';
							// Check for a "prefixed link", e.g. Al[[Khazar]]
							// Mostly for arabic wikipedia
							if ( $this->linkPrefixExtension ) {
								// Move the trailing non-whitespace bytes of the
								// text token into the link's open tag.
								while ( ( $len = strlen( $token['text'] ) ) > 0
									&& !ctype_space( $token['text'][$len-1] ) )
								{
									//prepend the character to the link's open tag
									$queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
									//remove character from the end of the text token
									$token['text'] = substr( $token['text'], 0, -1 );
								}
							}
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case ']': // for end of links "]]"
						if ( $this->continues( ']' ) ) {
							$queueToken['type'] = ']]';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case "'": // for all kind of em's and strong's
						if ( $this->continues( "'" ) ) {
							// The token type is the whole run of apostrophes;
							// 'pos' is only set here, inside the loop.
							$queueToken['type'] = "'";
							$queueToken['text'] = '';
							while ( ( $this->mPos + 1 < $this->mTextLength )
								&& $this->mText[$this->mPos+1] == "'" )
							{
								$queueToken['type'] .= "'";
								$queueToken['pos'] = $this->mPos;
								$this->mPos ++;
							}

							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case "\n": // for block levels, actually, only "----" is handled.
					case "\r": // headings are detected to close any unbalanced em or strong tags in a section
						if ( $this->continues( '----' ) ) {
							$queueToken['type'] = '----';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 5;
							// swallow any further dashes of the rule
							while ( $this->mPos < $this->mTextLength
								&& $this->mText[$this->mPos] == '-' )
							{
								$this->mPos ++;
							}
							break 2;
						} else if (
							$this->continues( '<h' ) && (
								$this->continues( '<h1' ) ||
								$this->continues( '<h2' ) ||
								$this->continues( '<h3' ) ||
								$this->continues( '<h4' ) ||
								$this->continues( '<h5' ) ||
								$this->continues( '<h6' )
							)
						) { // heading
							$queueToken['type'] = 'h';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							// Only the line break itself is skipped (it is not
							// added to the text); the "<hN" is tokenized next.
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '!': // French spacing rules have a space before exclamation
					case '?': // and question marks. Those have to become
					case ':': // And colons, Hashar says ...
						if ( $this->preceeded( ' ' ) ) {
							// strip blank from Token
							// NOTE(review): assumes the blank is the last byte of
							// the current text token; if an earlier token already
							// consumed it, the text token is empty here — confirm.
							$token['text'] = substr( $token['text'], 0, -1 );
							$queueToken['type'] = 'blank';
							$queueToken['text'] = " {$ch}";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '0': // A space between two numbers is used to ease reading
					case '1': // of big numbers, e.g. 1 000 000. Those spaces need
					case '2': // to be unbreakable
					case '3':
					case '4':
					case '5':
					case '6':
					case '7':
					case '8':
					case '9':
						// mPos+2 has to be a valid offset before it is read
						// (the former ">=" test allowed a read past the end).
						if ( ( $this->mPos + 2 < $this->mTextLength )
							&& ( $this->mText[$this->mPos+1] == " " )
							&& ctype_digit( $this->mText[$this->mPos+2] ) )
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = $ch . ' ';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case "\302": // first byte of UTF-8 Character Guillemet-left
						if ( $this->continues( "\253 " ) ) // second byte and a blank
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = "\302\253 ";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;
					case "\273": //last byte of UTF-8 Character Guillemet-right
						if ( $this->preceeded( " \302" ) ) {
							$queueToken['type'] = 'blank';
							$queueToken['text'] = " \302\273";
							// drop the blank and the first guillemet byte from the text
							$token['text'] = substr( $token['text'], 0, -2 );
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '&': //extensions like <timeline>, since HTML stripping has already been done,
					          //those look like &lt;timeline&gt;
						// "&" plus these 15 bytes are the 16 bytes skipped below.
						if ( $this->continues( "lt;timeline&gt;" ) ) {
							$queueToken['type'] = "<timeline>";
							// NOTE(review): other tokens carry the consumed input
							// in 'text'; this may have been meant to be the
							// escaped form — confirm against the consumer.
							$queueToken['text'] = "<timeline>";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 16;
							break 2; // switch + while
						}
						break;
				} /* switch */

				// ordinary character: collect it in the text token
				$token['text'] .= $ch;
				$this->mPos ++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	// function continues
	// checks whether the mText continues with $cont from mPos+1
	/* private */ function continues( $cont )
	{
		$len = strlen( $cont );
		// If string is not long enough to contain $cont, return false.
		// The last byte compared is at offset mPos+$len, which must be
		// smaller than mTextLength.
		if ( $this->mTextLength <= $this->mPos + $len )
			return false;
		for ( $i = 0; $i < $len; $i++ ) {
			if ( $this->mText[$this->mPos+1+$i] != $cont[$i] )
				return false;
		}
		return true;
	}

	// function preceeded
	// checks whether the mText is preceeded by $prec at position mPos
	/* private */ function preceeded( $prec )
	{
		$len = strlen( $prec );
		// if $prec is longer than the text up to mPos, return false
		if ( $this->mPos < $len )
			return false;
		return ( 0 == strcmp( $prec, substr( $this->mText, $this->mPos-$len, $len ) ) );
	}

	// function readAllUntil
	// Returns the text between mPos and the next occurrence of $border and
	// moves mPos behind that occurrence. Returns '' and leaves mPos alone
	// if $border does not occur in the rest of the text.
	function readAllUntil( $border )
	{
		// Nothing can be found behind the end of the text; do not hand an
		// out-of-range offset to strpos().
		if ( $this->mPos > $this->mTextLength )
			return '';
		$n = strpos( $this->mText, $border, $this->mPos );
		if ( $n === false )
			return '';
		$ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
		// NOTE(review): the "+ 1" also skips the byte following $border.
		// Kept as is, since callers may rely on it — confirm.
		$this->mPos = $n + strlen( $border ) + 1;
		return $ret;
	}

}
|