<?php
/**
* JavaScript Minifier
*
* @file
* @author Paul Copperman <paul.copperman@gmail.com>
* @license Apache-2.0
* @license MIT
* @license GPL-2.0-or-later
* @license LGPL-2.1-or-later
*/
/**
* This class is meant to safely minify javascript code, while leaving syntactically correct
* programs intact. Other libraries, such as JSMin require a certain coding style to work
* correctly. OTOH, libraries like jsminplus, that do parse the code correctly are rather
* slow, because they construct a complete parse tree before outputting the code minified.
* So this class is meant to allow arbitrary (but syntactically correct) input, while being
* fast enough to be used for on-the-fly minifying.
*
* This class was written with ECMA-262 Edition 3 in mind ("ECMAScript 3"). Parsing features
* new to ECMAScript 5 or later might not be supported. However, Edition 5.1 better reflects
* how actual JS engines worked and work and is simpler and more readable prose. As such,
* the below code will refer to sections of the 5.1 specification.
*
* See <https://www.ecma-international.org/ecma-262/5.1/>.
*/
class JavaScriptMinifier {

	/* Parsing states.
	 * The state machine is only necessary to decide whether to parse a slash as division
	 * operator or as regexp literal.
	 * States are named after the next expected item. We only distinguish states when the
	 * distinction is relevant for our purpose.
	 */
	const STATEMENT = 0;
	const CONDITION = 1;
	const PROPERTY_ASSIGNMENT = 2;
	const EXPRESSION = 3;
	const EXPRESSION_NO_NL = 4; // only relevant for semicolon insertion
	const EXPRESSION_OP = 5;
	const EXPRESSION_FUNC = 6;
	const EXPRESSION_TERNARY = 7; // used to determine the role of a colon
	const EXPRESSION_TERNARY_OP = 8;
	const EXPRESSION_TERNARY_FUNC = 9;
	const PAREN_EXPRESSION = 10; // expression which is not on the top level
	const PAREN_EXPRESSION_OP = 11;
	const PAREN_EXPRESSION_FUNC = 12;
	const PROPERTY_EXPRESSION = 13; // expression which is within an object literal
	const PROPERTY_EXPRESSION_OP = 14;
	const PROPERTY_EXPRESSION_FUNC = 15;

	/* Token types */
	const TYPE_UN_OP = 101; // unary operators
	const TYPE_INCR_OP = 102; // ++ and --
	const TYPE_BIN_OP = 103; // binary operators
	const TYPE_ADD_OP = 104; // + and - which can be either unary or binary ops
	const TYPE_HOOK = 105; // ?
	const TYPE_COLON = 106; // :
	const TYPE_COMMA = 107; // ,
	const TYPE_SEMICOLON = 108; // ;
	const TYPE_BRACE_OPEN = 109; // {
	const TYPE_BRACE_CLOSE = 110; // }
	const TYPE_PAREN_OPEN = 111; // ( and [
	const TYPE_PAREN_CLOSE = 112; // ) and ]
	const TYPE_RETURN = 113; // keywords: break, continue, return, throw
	const TYPE_IF = 114; // keywords: catch, for, with, switch, while, if
	const TYPE_DO = 115; // keywords: case, var, finally, else, do, try
	const TYPE_FUNC = 116; // keywords: function
	const TYPE_LITERAL = 117; // all literals, identifiers and unrecognised tokens

	// Sanity limit to avoid excessive memory usage
	const STACK_LIMIT = 1000;

	/**
	 * Maximum line length
	 *
	 * This is not a strict maximum, but a guideline. Longer lines will be
	 * produced when literals (e.g. quoted strings) longer than this are
	 * encountered, or when required to guard against semicolon insertion.
	 *
	 * This is a private member (instead of constant) to allow tests to
	 * set it to 1, to verify ASI and line-breaking behaviour.
	 */
	private static $maxLineLength = 1000;

	/**
	 * Returns minified JavaScript code.
	 *
	 * The input is tokenised in a single pass. Whitespace and comments are dropped,
	 * a newline is kept wherever JavaScript's automatic semicolon insertion would
	 * otherwise have terminated a statement, and the literals `true` / `false` are
	 * shortened to `!0` / `!1` when they appear at the start of an expression.
	 *
	 * @param string $s JavaScript code to minify
	 * @return string|false Minified code, or false when a malformed numeric literal
	 *  is encountered (the value returned by parseError())
	 */
	public static function minify( $s ) {
		// First we declare a few tables that contain our parsing rules

		// $opChars : Characters which can be combined without whitespace between them.
		$opChars = [
			// ECMAScript 5.1 § 7.7 Punctuators
			// Unlike the spec, these are individual symbols, not sequences.
			'{' => true,
			'}' => true,
			'(' => true,
			')' => true,
			'[' => true,
			']' => true,
			'.' => true,
			';' => true,
			',' => true,
			'<' => true,
			'>' => true,
			'=' => true,
			'!' => true,
			'+' => true,
			'-' => true,
			'*' => true,
			'%' => true,
			'&' => true,
			'|' => true,
			'^' => true,
			'~' => true,
			'?' => true,
			':' => true,
			'/' => true,
			// ECMAScript 5.1 § 7.8.4 String Literals
			'"' => true,
			"'" => true,
		];

		// $tokenTypes : Map keywords and operators to their corresponding token type
		$tokenTypes = [
			// ECMAScript 5.1 § 11.4 Unary Operators
			// ECMAScript 5.1 § 11.6 Additive Operators
			// UnaryExpression includes PostfixExpression, which includes 'new'.
			'new' => self::TYPE_UN_OP,
			'delete' => self::TYPE_UN_OP,
			'void' => self::TYPE_UN_OP,
			'typeof' => self::TYPE_UN_OP,
			'++' => self::TYPE_INCR_OP,
			'--' => self::TYPE_INCR_OP,
			'+' => self::TYPE_ADD_OP,
			'-' => self::TYPE_ADD_OP,
			'~' => self::TYPE_UN_OP,
			'!' => self::TYPE_UN_OP,
			// ECMAScript 5.1 § 11.5 Multiplicative Operators
			'*' => self::TYPE_BIN_OP,
			'/' => self::TYPE_BIN_OP,
			'%' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.7 Bitwise Shift Operators
			'<<' => self::TYPE_BIN_OP,
			'>>' => self::TYPE_BIN_OP,
			'>>>' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.8 Relational Operators
			'<' => self::TYPE_BIN_OP,
			'>' => self::TYPE_BIN_OP,
			'<=' => self::TYPE_BIN_OP,
			'>=' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.9 Equality Operators
			'==' => self::TYPE_BIN_OP,
			'!=' => self::TYPE_BIN_OP,
			'===' => self::TYPE_BIN_OP,
			'!==' => self::TYPE_BIN_OP,
			'instanceof' => self::TYPE_BIN_OP,
			'in' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.10 Binary Bitwise Operators
			'&' => self::TYPE_BIN_OP,
			'^' => self::TYPE_BIN_OP,
			'|' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.11 Binary Logical Operators
			'&&' => self::TYPE_BIN_OP,
			'||' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.12 Conditional Operator
			// Also known as ternary.
			'?' => self::TYPE_HOOK,
			':' => self::TYPE_COLON,
			// ECMAScript 5.1 § 11.13 Assignment Operators
			'=' => self::TYPE_BIN_OP,
			'*=' => self::TYPE_BIN_OP,
			'/=' => self::TYPE_BIN_OP,
			'%=' => self::TYPE_BIN_OP,
			'+=' => self::TYPE_BIN_OP,
			'-=' => self::TYPE_BIN_OP,
			'<<=' => self::TYPE_BIN_OP,
			'>>=' => self::TYPE_BIN_OP,
			'>>>=' => self::TYPE_BIN_OP,
			'&=' => self::TYPE_BIN_OP,
			'^=' => self::TYPE_BIN_OP,
			'|=' => self::TYPE_BIN_OP,
			// ECMAScript 5.1 § 11.14 Comma Operator
			',' => self::TYPE_COMMA,

			// The keywords that disallow LineTerminator before their
			// (sometimes optional) Expression or Identifier.
			//
			//    keyword ;
			//    keyword [no LineTerminator here] Identifier ;
			//    keyword [no LineTerminator here] Expression ;
			//
			// See also ECMAScript 5.1:
			// - § 12.7 The continue Statement
			// - § 12.8 The break Statement
			// - § 12.9 The return Statement
			// - § 12.13 The throw Statement
			'continue' => self::TYPE_RETURN,
			'break' => self::TYPE_RETURN,
			'return' => self::TYPE_RETURN,
			'throw' => self::TYPE_RETURN,

			// The keywords require a parenthesised Expression or Identifier
			// before the next Statement.
			//
			//     keyword ( Expression ) Statement
			//     keyword ( Identifier ) Statement
			//
			// See also ECMAScript 5.1:
			// - § 12.5 The if Statement
			// - § 12.6 Iteration Statements (do, while, for)
			// - § 12.10 The with Statement
			// - § 12.11 The switch Statement
			// - § 12.14 The try Statement (catch)
			'if' => self::TYPE_IF,
			'catch' => self::TYPE_IF,
			'while' => self::TYPE_IF,
			'for' => self::TYPE_IF,
			'switch' => self::TYPE_IF,
			'with' => self::TYPE_IF,

			// The keywords followed by an Identifier, Statement,
			// Expression, or Block.
			//
			//     var Identifier
			//     else Statement
			//     do Statement
			//     case Expression
			//     try Block
			//     finally Block
			//
			// See also ECMAScript 5.1:
			// - § 12.2 Variable Statement
			// - § 12.5 The if Statement (else)
			// - § 12.6 Iteration Statements (do, while, for)
			// - § 12.11 The switch Statement (case)
			// - § 12.14 The try Statement
			'var' => self::TYPE_DO,
			'else' => self::TYPE_DO,
			'do' => self::TYPE_DO,
			'case' => self::TYPE_DO,
			'try' => self::TYPE_DO,
			'finally' => self::TYPE_DO,

			// ECMAScript 5.1 § 13 Function Definition
			'function' => self::TYPE_FUNC,

			// Can be one of:
			// - DecimalLiteral (ECMAScript 5.1 § 7.8.3 Numeric Literals)
			// - MemberExpression (ECMAScript 5.1 § 11.2 Left-Hand-Side Expressions)
			'.' => self::TYPE_BIN_OP,

			// Can be one of:
			// - Block (ECMAScript 5.1 § 12.1 Block)
			// - ObjectLiteral (ECMAScript 5.1 § 11.1 Primary Expressions)
			'{' => self::TYPE_BRACE_OPEN,
			'}' => self::TYPE_BRACE_CLOSE,

			// Can be one of:
			// - Parenthesised Identifier or Expression after a
			//   TYPE_IF or TYPE_FUNC keyword.
			// - PrimaryExpression (ECMAScript 5.1 § 11.1 Primary Expressions)
			// - CallExpression (ECMAScript 5.1 § 11.2 Left-Hand-Side Expressions)
			'(' => self::TYPE_PAREN_OPEN,
			')' => self::TYPE_PAREN_CLOSE,

			// Can be one of:
			// - ArrayLiteral (ECMAScript 5.1 § 11.1 Primary Expressions)
			'[' => self::TYPE_PAREN_OPEN,
			']' => self::TYPE_PAREN_CLOSE,

			// Can be one of:
			// - End of any statement
			// - EmptyStatement (ECMAScript 5.1 § 12.3 Empty Statement)
			';' => self::TYPE_SEMICOLON,
		];

		// $goto : This is the main table for our state machine. For every state/token pair
		//         the following state is defined. When no rule exists for a given pair,
		//         the state is left unchanged.
		$goto = [
			self::STATEMENT => [
				self::TYPE_UN_OP => self::EXPRESSION,
				self::TYPE_INCR_OP => self::EXPRESSION,
				self::TYPE_ADD_OP => self::EXPRESSION,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_RETURN => self::EXPRESSION_NO_NL,
				self::TYPE_IF => self::CONDITION,
				self::TYPE_FUNC => self::CONDITION,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::CONDITION => [
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PROPERTY_ASSIGNMENT => [
				self::TYPE_COLON => self::PROPERTY_EXPRESSION,
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::EXPRESSION => [
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::EXPRESSION_NO_NL => [
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_OP
			],
			self::EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::EXPRESSION,
				self::TYPE_ADD_OP => self::EXPRESSION,
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_COLON => self::STATEMENT,
				self::TYPE_COMMA => self::EXPRESSION,
				self::TYPE_SEMICOLON => self::STATEMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::EXPRESSION_TERNARY => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::EXPRESSION_TERNARY_FUNC,
				self::TYPE_LITERAL => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_OP => [
				self::TYPE_BIN_OP => self::EXPRESSION_TERNARY,
				self::TYPE_ADD_OP => self::EXPRESSION_TERNARY,
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_COMMA => self::EXPRESSION_TERNARY,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::EXPRESSION_TERNARY_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::PAREN_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::PAREN_EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::PAREN_EXPRESSION,
				self::TYPE_ADD_OP => self::PAREN_EXPRESSION,
				self::TYPE_HOOK => self::PAREN_EXPRESSION,
				self::TYPE_COLON => self::PAREN_EXPRESSION,
				self::TYPE_COMMA => self::PAREN_EXPRESSION,
				self::TYPE_SEMICOLON => self::PAREN_EXPRESSION,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PAREN_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			],
			self::PROPERTY_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION,
				self::TYPE_FUNC => self::PROPERTY_EXPRESSION_FUNC,
				self::TYPE_LITERAL => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_OP => [
				self::TYPE_BIN_OP => self::PROPERTY_EXPRESSION,
				self::TYPE_ADD_OP => self::PROPERTY_EXPRESSION,
				self::TYPE_HOOK => self::PROPERTY_EXPRESSION,
				self::TYPE_COMMA => self::PROPERTY_ASSIGNMENT,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION
			],
			self::PROPERTY_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::STATEMENT
			]
		];

		// $push : This table contains the rules for when to push a state onto the stack.
		//         The pushed state is the state to return to when the corresponding
		//         closing token is found
		$push = [
			self::STATEMENT => [
				self::TYPE_BRACE_OPEN => self::STATEMENT,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::CONDITION => [
				self::TYPE_PAREN_OPEN => self::STATEMENT
			],
			self::PROPERTY_ASSIGNMENT => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_ASSIGNMENT
			],
			self::EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_NO_NL => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_OP => [
				self::TYPE_HOOK => self::EXPRESSION,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_OP
			],
			self::EXPRESSION_TERNARY => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_TERNARY_OP,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_OP => [
				self::TYPE_HOOK => self::EXPRESSION_TERNARY,
				self::TYPE_PAREN_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::EXPRESSION_TERNARY_FUNC => [
				self::TYPE_BRACE_OPEN => self::EXPRESSION_TERNARY_OP
			],
			self::PAREN_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PAREN_EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_OP => [
				self::TYPE_PAREN_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PAREN_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::PAREN_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_OP => [
				// $pop closes this state on '}', so a '{' seen here must push as well.
				// Otherwise the '}' of a nested '{}' inside a property value would pop
				// the enclosing object literal one level too early (T201606).
				self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP,
				self::TYPE_PAREN_OPEN => self::PROPERTY_EXPRESSION_OP
			],
			self::PROPERTY_EXPRESSION_FUNC => [
				self::TYPE_BRACE_OPEN => self::PROPERTY_EXPRESSION_OP
			]
		];

		// $pop : Rules for when to pop a state from the stack
		$pop = [
			self::STATEMENT => [ self::TYPE_BRACE_CLOSE => true ],
			self::PROPERTY_ASSIGNMENT => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_NO_NL => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_OP => [ self::TYPE_BRACE_CLOSE => true ],
			self::EXPRESSION_TERNARY_OP => [ self::TYPE_COLON => true ],
			self::PAREN_EXPRESSION => [ self::TYPE_PAREN_CLOSE => true ],
			self::PAREN_EXPRESSION_OP => [ self::TYPE_PAREN_CLOSE => true ],
			self::PROPERTY_EXPRESSION => [ self::TYPE_BRACE_CLOSE => true ],
			self::PROPERTY_EXPRESSION_OP => [ self::TYPE_BRACE_CLOSE => true ]
		];

		// $semicolon : Rules for when a semicolon insertion is appropriate
		$semicolon = [
			self::EXPRESSION_NO_NL => [
				self::TYPE_UN_OP => true,
				self::TYPE_INCR_OP => true,
				self::TYPE_ADD_OP => true,
				self::TYPE_BRACE_OPEN => true,
				self::TYPE_PAREN_OPEN => true,
				self::TYPE_RETURN => true,
				self::TYPE_IF => true,
				self::TYPE_DO => true,
				self::TYPE_FUNC => true,
				self::TYPE_LITERAL => true
			],
			self::EXPRESSION_OP => [
				self::TYPE_UN_OP => true,
				self::TYPE_INCR_OP => true,
				self::TYPE_BRACE_OPEN => true,
				self::TYPE_RETURN => true,
				self::TYPE_IF => true,
				self::TYPE_DO => true,
				self::TYPE_FUNC => true,
				self::TYPE_LITERAL => true
			]
		];

		// $divStates : Contains all states that can be followed by a division operator
		$divStates = [
			self::EXPRESSION_OP => true,
			self::EXPRESSION_TERNARY_OP => true,
			self::PAREN_EXPRESSION_OP => true,
			self::PROPERTY_EXPRESSION_OP => true
		];

		// Here's where the minifying takes place: Loop through the input, looking for tokens
		// and output them to $out, taking actions to the above defined rules when appropriate.
		$out = '';
		$pos = 0;
		$length = strlen( $s );
		$lineLength = 0;
		$newlineFound = true;
		$state = self::STATEMENT;
		$stack = [];
		$last = ';'; // Pretend that we have already seen a semicolon

		while ( $pos < $length ) {
			// First, skip over any whitespace and multiline comments, recording whether we
			// found any newline character
			$skip = strspn( $s, " \t\n\r\xb\xc", $pos );
			if ( !$skip ) {
				$ch = $s[$pos];
				if ( $ch === '/' && substr( $s, $pos, 2 ) === '/*' ) {
					// Multiline comment. Search for the end token or EOT.
					$end = strpos( $s, '*/', $pos + 2 );
					$skip = $end === false ? $length - $pos : $end - $pos + 2;
				}
			}
			if ( $skip ) {
				// The semicolon insertion mechanism needs to know whether there was a newline
				// between two tokens, so record it now.
				if ( !$newlineFound && strcspn( $s, "\r\n", $pos, $skip ) !== $skip ) {
					$newlineFound = true;
				}
				$pos += $skip;
				continue;
			}

			// Handle C++-style comments and html comments, which are treated as single line
			// comments by the browser, regardless of whether the end tag is on the same line.
			// Handle --> the same way, but only if it's at the beginning of the line
			if ( ( $ch === '/' && substr( $s, $pos, 2 ) === '//' )
				|| ( $ch === '<' && substr( $s, $pos, 4 ) === '<!--' )
				|| ( $ch === '-' && $newlineFound && substr( $s, $pos, 3 ) === '-->' )
			) {
				// Stop at the line terminator itself; the whitespace skip above consumes it
				// on the next iteration and records the newline.
				$pos += strcspn( $s, "\r\n", $pos );
				continue;
			}

			// Find out which kind of token we're handling.
			// Note: $end must point past the end of the current token
			// so that `substr($s, $pos, $end - $pos)` would be the entire token.
			// In other words, $end will be the offset of the last relevant character
			// in the stream + 1, or simply put: The offset of the first character
			// of any next token in the stream.
			$end = $pos + 1;
			// Handle string literals
			if ( $ch === "'" || $ch === '"' ) {
				// Search to the end of the string literal, skipping over backslash escapes
				$search = $ch . '\\';
				do {
					// Speculatively add 2 to the end so that if we see a backslash,
					// the next iteration will start 2 characters further (one for the
					// backslash, one for the escaped character).
					// We'll correct this outside the loop.
					$end += strcspn( $s, $search, $end ) + 2;
					// If the last character in our search for a quote or a backslash
					// matched a backslash and we haven't reached the end, keep searching..
				} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
				// Correction (1): Undo speculative add, keep only one (end of string literal)
				$end--;
				if ( $end > $length ) {
					// Correction (2): Loop wrongly assumed an end quote ended the search,
					// but search ended because we've reached the end. Correct $end.
					// TODO: This is invalid and should throw.
					$end--;
				}

			// We have to distinguish between regexp literals and division operators
			// A division operator is only possible in certain states
			} elseif ( $ch === '/' && !isset( $divStates[$state] ) ) {
				// Regexp literal
				for ( ; ; ) {
					// Search until we find "/" (end of regexp), "\" (backslash escapes),
					// or "[" (start of character classes).
					do {
						// Speculatively add 2 to ensure next iteration skips
						// over backslash and escaped character.
						// We'll correct this outside the loop.
						$end += strcspn( $s, '/[\\', $end ) + 2;
						// If backslash escape, keep searching...
					} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
					// Correction (1): Undo speculative add, keep only one (end of regexp)
					$end--;
					if ( $end > $length ) {
						// Correction (2): Loop wrongly assumed end slash was seen
						// String ended without end of regexp. Correct $end.
						// TODO: This is invalid and should throw.
						$end--;
						break;
					}
					if ( $s[$end - 1] === '/' ) {
						break;
					}
					// (Implicit else), we must've found the start of a char class,
					// skip until we find "]" (end of char class), or "\" (backslash escape)
					do {
						// Speculatively add 2 for backslash escape.
						// We'll subtract one outside the loop.
						$end += strcspn( $s, ']\\', $end ) + 2;
						// If backslash escape, keep searching...
					} while ( $end - 2 < $length && $s[$end - 2] === '\\' );
					// Correction (1): Undo speculative add, keep only one (end of char class)
					$end--;
					if ( $end > $length ) {
						// Correction (2): Loop wrongly assumed "]" was seen
						// String ended without ending char class or regexp. Correct $end.
						// TODO: This is invalid and should throw.
						$end--;
						break;
					}
				}
				// Search past the regexp modifiers (gi)
				while ( $end < $length && ctype_alpha( $s[$end] ) ) {
					$end++;
				}
			} elseif (
				$ch === '0'
				&& ( $pos + 1 < $length ) && ( $s[$pos + 1] === 'x' || $s[$pos + 1] === 'X' )
			) {
				// Hex numeric literal
				$end++; // x or X
				$len = strspn( $s, '0123456789ABCDEFabcdef', $end );
				if ( !$len ) {
					return self::parseError(
						$s,
						$pos,
						'Expected a hexadecimal number but found ' . substr( $s, $pos, 5 ) . '...'
					);
				}
				$end += $len;
			} elseif (
				ctype_digit( $ch )
				|| ( $ch === '.' && $pos + 1 < $length && ctype_digit( $s[$pos + 1] ) )
			) {
				// Decimal numeric literal: integer part, optional fraction, optional exponent
				$end += strspn( $s, '0123456789', $end );
				$decimal = strspn( $s, '.', $end );
				if ( $decimal ) {
					// Two consecutive dots are tolerated (as in `1..toString()`);
					// three or more are rejected.
					if ( $decimal > 2 ) {
						return self::parseError( $s, $end, 'The number has too many decimal points' );
					}
					$end += strspn( $s, '0123456789', $end + 1 ) + $decimal;
				}
				$exponent = strspn( $s, 'eE', $end );
				if ( $exponent ) {
					if ( $exponent > 1 ) {
						return self::parseError( $s, $end, 'Number with several E' );
					}
					$end++;

					// Any sign characters (+ or -) directly after the exponent marker
					// belong to the literal.
					$end += strspn( $s, '-+', $end );
					$len = strspn( $s, '0123456789', $end );
					if ( !$len ) {
						return self::parseError(
							$s,
							$pos,
							'No decimal digits after e, how many zeroes should be added?'
						);
					}
					$end += $len;
				}
			} elseif ( isset( $opChars[$ch] ) ) {
				// Punctuation character. Search for the longest matching operator.
				while (
					$end < $length
					&& isset( $tokenTypes[substr( $s, $pos, $end - $pos + 1 )] )
				) {
					$end++;
				}
			} else {
				// Identifier or reserved word. Search for the end by excluding whitespace and
				// punctuation.
				$end += strcspn( $s, " \t\n.;,=<>+-{}()[]?:*/%'\"!&|^~\xb\xc\r", $end );
			}

			// Now get the token type from our type array
			$token = substr( $s, $pos, $end - $pos ); // so $end - $pos == strlen( $token )
			$type = $tokenTypes[$token] ?? self::TYPE_LITERAL;

			if ( $newlineFound && isset( $semicolon[$state][$type] ) ) {
				// This token triggers the semicolon insertion mechanism of javascript. While we
				// could add the ; token here ourselves, keeping the newline has a few advantages.
				$out .= "\n";
				$state = self::STATEMENT;
				$lineLength = 0;
			} elseif ( $lineLength + $end - $pos > self::$maxLineLength &&
				!isset( $semicolon[$state][$type] ) && $type !== self::TYPE_INCR_OP ) {
				// This line would get too long if we added $token, so add a newline first.
				// Only do this if it won't trigger semicolon insertion and if it won't
				// put a postfix increment operator on its own line, which is illegal in js.
				$out .= "\n";
				$lineLength = 0;
			// Check, whether we have to separate the token from the last one with whitespace
			} elseif ( !isset( $opChars[$last] ) && !isset( $opChars[$ch] ) ) {
				$out .= ' ';
				$lineLength++;
			// Don't accidentally create ++, -- or // tokens
			} elseif ( $last === $ch && ( $ch === '+' || $ch === '-' || $ch === '/' ) ) {
				$out .= ' ';
				$lineLength++;
			}

			// Shorten boolean literals at the start of an expression. The check on $last
			// leaves member accesses such as `foo.true` untouched.
			if (
				$type === self::TYPE_LITERAL
				&& ( $token === 'true' || $token === 'false' )
				&& ( $state === self::EXPRESSION || $state === self::PROPERTY_EXPRESSION )
				&& $last !== '.'
			) {
				$token = ( $token === 'true' ) ? '!0' : '!1';
			}

			$out .= $token;
			// Deliberately measured on the source token ($end - $pos), which can differ
			// from the emitted length when a boolean literal was shortened above.
			$lineLength += $end - $pos;
			$last = $s[$end - 1];
			$pos = $end;
			$newlineFound = false;

			// Now that we have output our token, transition into the new state.
			if ( isset( $push[$state][$type] ) && count( $stack ) < self::STACK_LIMIT ) {
				$stack[] = $push[$state][$type];
			}
			if ( $stack && isset( $pop[$state][$type] ) ) {
				$state = array_pop( $stack );
			} elseif ( isset( $goto[$state][$type] ) ) {
				$state = $goto[$state][$type];
			}
		}
		return $out;
	}

	/**
	 * Report a parse error found by minify().
	 *
	 * Currently a stub: all arguments are ignored and false is returned, which
	 * minify() passes straight back to its caller.
	 *
	 * @param string $fullJavascript The complete input that was being minified
	 * @param int $position Byte offset in $fullJavascript passed by the caller
	 * @param string $errorMsg Human-readable description of the problem
	 * @return bool Always false
	 */
	public static function parseError( $fullJavascript, $position, $errorMsg ) {
		// TODO: Handle the error: trigger_error, throw exception, return false...
		return false;
	}
}