wiki.techinc.nl/includes/Tokenizer.php

<?php
class Tokenizer {
	// Splits wiki text into a stream of tokens.
	//
	// Every token is an array with two keys:
	//   "type" - "text" for plain text, otherwise a marker such as "[[",
	//            "]]", "[[[", "''" (two or more apostrophes), "----",
	//            "blank", "RFC ", "ISBN " or "<timeline>"
	//   "text" - the text carried by the token (may be empty)
	//
	// All offsets are byte offsets; the text is scanned byte by byte.

	# The properties are meant to be private to the tokenizer. They stay
	# public because the class has always exposed them.
	public $mText = "";			# Text to be processed by the tokenizer
	public $mPos = 0;			# Offset of the next unread byte in $mText
	public $mTextLength = 0;		# Length of $mText in bytes
	public $mQueuedToken = array();		# Tokens that were already found, but
						# not returned yet (first in, first out)
	public $linkPrefixExtension = false;	# Result of $wgLang->linkPrefixExtension();
						# when truthy, non-blank characters directly
						# in front of "[[" are moved into the link token

	// Constructor. Use Tokenizer::newFromString() to obtain a tokenizer.
	// Reads the link prefix setting from the global $wgLang.
	public function __construct()
	{
		global $wgLang;

		$this->mPos = 0;
		// This is the queue every other method works on; it has to be an
		// array before count() / array_unshift() touch it.
		$this->mQueuedToken = array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	// Old PHP 4 style constructor name. PHP 8 no longer treats it as a
	// constructor, so it only forwards to __construct() for callers that
	// still invoke it explicitly.
	public function Tokenizer()
	{
		$this->__construct();
	}

	// Factory function: returns a new Tokenizer positioned at the start
	// of the string $s.
	public static function newFromString( $s )
	{
		$fname = "Tokenizer::newFromString";
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}


	// Return the next token without consuming it: the next call to
	// previewToken or nextToken returns the same token again.
	// Internally the text pointer does advance; the token is parked at the
	// head of the queue, which both functions check first.
	public function previewToken()
	{
		$fname = "Tokenizer::previewToken";
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) == 0 ) {
			// nextToken() may append a markup token to the queue while it
			// scans, so the token it returns has to go in front of it.
			array_unshift( $this->mQueuedToken, $this->nextToken() );
		}
		$token = $this->mQueuedToken[0];

		wfProfileOut( $fname );
		return $token;
	}


	// Get the next token, or false when the text is exhausted.
	//
	// Queued tokens are returned first. Otherwise the text is scanned up to
	// the next piece of markup: the plain text in front of it is returned as
	// a "text" token (possibly with empty text) and the markup token is
	// queued for the following call.
	// At the end of the text one last "text" token is returned (empty if
	// nothing is left) before false is returned.
	public function nextToken()
	{
		$fname = "Tokenizer::nextToken";
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// still a token left from an earlier round. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos > $this->mTextLength ) {
			// If no text is left, return "false".
			$token = false;
		} else {
			$token = $this->scanTextToken();
		}

		wfProfileOut( $fname );
		return $token;
	}

	// Scans from mPos byte by byte, collecting plain text until a piece of
	// markup is recognised or the text ends. Recognised markup is appended
	// to the queue and mPos is moved behind it. Returns the "text" token
	// collected so far.
	private function scanTextToken()
	{
		$token = array( "text" => "", "type" => "text" );

		// The loop deliberately runs one step past the last byte, so that
		// mPos ends up greater than mTextLength once everything is consumed;
		// nextToken() uses that to detect the end of the text.
		while ( $this->mPos <= $this->mTextLength ) {
			// On that extra step $ch is "", which matches none of the cases.
			$ch = ( $this->mPos < $this->mTextLength ) ? $this->mText[$this->mPos] : "";

			switch ( $ch ) {
				case 'R': // for "RFC "
					if ( $this->continues( "FC " ) ) {
						$this->enqueue( "RFC ", "RFC " );
						// The trailing blank is left in the text stream.
						$this->mPos += 3;
						break 2; // switch + while
					}
					break;
				case 'I': // for "ISBN "
					if ( $this->continues( "SBN " ) ) {
						$this->enqueue( "ISBN ", "ISBN " );
						// The trailing blank is left in the text stream.
						$this->mPos += 4;
						break 2; // switch + while
					}
					break;
				case "[": // for links "[[" and "[[["
					if ( $this->continues( "[[" ) ) {
						$this->enqueue( "[[[", "" );
						$this->mPos += 3;
						break 2; // switch + while
					} else if ( $this->continues( "[" ) ) {
						$prefix = "";
						// Check for a "prefixed link", e.g. Al[[Khazar]]
						// Mostly for arabic wikipedia
						if ( $this->linkPrefixExtension ) {
							while ( ( $len = strlen( $token["text"] ) ) > 0
								&& !ctype_space( $token["text"][$len - 1] ) )
							{
								// move the last character of the text token
								// to the front of the link's open tag
								$prefix = $token["text"][$len - 1] . $prefix;
								$token["text"] = substr( $token["text"], 0, -1 );
							}
						}
						$this->enqueue( "[[", $prefix );
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case "]": // for end of links "]]"
					if ( $this->continues( "]" ) ) {
						$this->enqueue( "]]", "" );
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case "'": // for all kinds of em's and strong's
					if ( $this->continues( "'" ) ) {
						// The token type is the whole run of apostrophes.
						$quotes = "'";
						while ( $this->mPos + 1 < $this->mTextLength
							&& $this->mText[$this->mPos + 1] == "'" )
						{
							$quotes .= "'";
							$this->mPos++;
						}
						$this->enqueue( $quotes, "" );
						$this->mPos++;
						break 2; // switch + while
					}
					break;
				case "\n": // for block levels, actually, only "----" is handled.
				case "\r":
					if ( $this->continues( "----" ) ) {
						$this->enqueue( "----", "" );
						// skip the line break, the four dashes ...
						$this->mPos += 5;
						// ... and any further dashes
						while ( $this->mPos < $this->mTextLength
							&& $this->mText[$this->mPos] == "-" )
						{
							$this->mPos++;
						}
						break 2; // switch + while
					}
					break;
				case "!": // French spacing rules have a space before exclamation
				case "?": // and question marks. Those have to become &nbsp;
				case ":": // And colons, Hashar says ...
					// Only act when the blank is part of this text token. If
					// it was already emitted with an earlier token there is
					// nothing to strip and it must not be emitted twice.
					if ( $this->textEndsWith( $token["text"], " " ) ) {
						// strip blank from Token
						$token["text"] = substr( $token["text"], 0, -1 );
						$this->enqueue( "blank", " {$ch}" );
						$this->mPos++;
						break 2; // switch + while
					}
					break;
				case "0": // A space between two numbers is used to ease reading
				case "1": // of big numbers, e.g. 1 000 000. Those spaces need
				case "2": // to be unbreakable
				case "3":
				case "4":
				case "5":
				case "6":
				case "7":
				case "8":
				case "9":
					// mPos + 2 has to be a valid offset before it is read
					if ( $this->mTextLength > $this->mPos + 2
						&& $this->mText[$this->mPos + 1] == " "
						&& ctype_digit( $this->mText[$this->mPos + 2] ) )
					{
						$this->enqueue( "blank", $ch . " " );
						$this->mPos += 2;
						break 2; // switch + while
					}
					break;
				case "\302": // first byte of UTF-8 Character Guillemet-left
					if ( $this->continues( "\253 " ) ) // second byte and a blank
					{
						$this->enqueue( "blank", "\302\253 " );
						$this->mPos += 3;
						break 2; // switch + while
					}
					break;
				case "\273": // last byte of UTF-8 Character Guillemet-right
					// As for "!": the blank and the first byte of the
					// character must still be part of this text token.
					if ( $this->textEndsWith( $token["text"], " \302" ) ) {
						$token["text"] = substr( $token["text"], 0, -2 );
						$this->enqueue( "blank", " \302\273" );
						$this->mPos++;
						break 2; // switch + while
					}
					break;
				case "&": // extensions like <timeline>, since HTML stripping has already
					  // been done, those look like &lt;timeline&gt;
					if ( $this->continues( "lt;timeline&gt;" ) ) {
						$this->enqueue( "<timeline>", "&lt;timeline&gt;" );
						$this->mPos += 16;
						break 2; // switch + while
					}
					break;
			} /* switch */

			// Nothing special: the character is plain text.
			$token["text"] .= $ch;
			$this->mPos++;
		} /* while */

		return $token;
	}

	// Appends a token with the given type and text to the queue.
	private function enqueue( $type, $text )
	{
		$this->mQueuedToken[] = array( "type" => $type, "text" => $text );
	}

	// Checks whether $text ends with $suffix.
	private function textEndsWith( $text, $suffix )
	{
		$len = strlen( $suffix );
		return strlen( $text ) >= $len && substr( $text, -$len ) === $suffix;
	}

	// function continues
	// checks whether the mText continues with $cont from mPos+1
	// (intended for internal use)
	public function continues( $cont )
	{
		$contLength = strlen( $cont );
		// The last byte compared is at offset mPos + $contLength. If that is
		// not a valid offset, the text is too short to contain $cont.
		if ( $this->mTextLength <= $this->mPos + $contLength )
			return false;
		for ( $i = 0; $i < $contLength; $i++ )
		{
			if ( $this->mText[$this->mPos + 1 + $i] !== $cont[$i] )
				return false;
		}
		return true;
	}

	// function preceeded
	// checks whether the mText is preceeded by $prec at position mPos
	// (intended for internal use)
	public function preceeded( $prec )
	{
		$len = strlen( $prec );
		// if $prec is longer than the text up to mPos, return false
		if ( $this->mPos < $len )
			return false;
		return substr( $this->mText, $this->mPos - $len, $len ) === $prec;
	}

	// Returns the text between mPos and the next occurrence of $border and
	// moves mPos to the first byte behind $border.
	// Returns "" and leaves mPos untouched when $border does not occur in
	// the remaining text.
	public function readAllUntil( $border )
	{
		// Once the text is exhausted mPos is beyond the end of the string,
		// which strpos() does not accept as an offset.
		if ( $this->mPos > $this->mTextLength )
			return "";
		$n = strpos( $this->mText, $border, $this->mPos );
		if ( $n === false )
			return "";
		$ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
		// Continue right behind the border; no further byte is skipped.
		$this->mPos = $n + strlen( $border );
		return $ret;
	}

}