Code Coverage - tburry/pquery - Measure and Improve Code Quality continuously with Scrutinizer

Code

gan_tokenizer.php

< 40 %

40-60 %

> 60 %

<?php
/**
 * @author Niels A.D.
 * @author Todd Burry <[email protected]>
 * @copyright 2010 Niels A.D., 2014 Todd Burry
 * @license http://opensource.org/licenses/LGPL-2.1 LGPL-2.1
 * @package pQuery
 */

namespace pQuery;

/**
 * Converts a document into tokens
 *
 * Can convert any string into tokens. The base class only supports
 * identifier/whitespace tokens. For more tokens, the class can be
 * easily extended.
 *
 * The constructor already loads the first token, so iterate like:
 * <code>
 *  $a = new TokenizerBase('hello world');
 *  for ($tok = $a->token; $tok !== $a::TOK_NULL; $tok = $a->next()) {
 *    echo $tok, ': ', $a->getTokenString(), "<br>\n";
 *  }
 * </code>
 *
 * @internal The tokenizer works with a character map that connects a certain
 * character to a certain function/token. This class is built with speed in mind,
 * which is why the state is public and the dispatch code is inlined in every
 * next*() method instead of being shared through a helper.
 */
class TokenizerBase {

	/**
	 * NULL Token, used at end of document (parsing should stop after this token)
	 */
	const TOK_NULL = 0;
	/**
	 * Unknown token, used at unidentified character
	 */
	const TOK_UNKNOWN = 1;
	/**
	 * Whitespace token, used with whitespace
	 */
	const TOK_WHITESPACE = 2;
	/**
	 * Identifier token, used with identifiers
	 */
	const TOK_IDENTIFIER = 3;

	/**
	 * The document that is being tokenized
	 * @var string
	 * @internal Public for faster access!
	 * @see setDoc()
	 * @see getDoc()
	 * @access private
	 */
	public $doc = '';

	/**
	 * The size of the document (length of string, in bytes)
	 * @var int
	 * @internal Public for faster access!
	 * @see $doc
	 * @access private
	 */
	public $size = 0;

	/**
	 * Current (character) position in the document
	 * @var int
	 * @internal Public for faster access!
	 * @see setPos()
	 * @see getPos()
	 * @access private
	 */
	public $pos = 0;

	/**
	 * Current (Line/Column) position in document
	 * @var array (Current_Line, Line_Starting_Pos)
	 * @internal Public for faster access!
	 * @see getLinePos()
	 * @access private
	 */
	public $line_pos = array(0, 0);

	/**
	 * Current token
	 * @var int
	 * @internal Public for faster access!
	 * @see getToken()
	 * @access private
	 */
	public $token = self::TOK_NULL;

	/**
	 * Start position of token. If NULL, then current position is used.
	 * @var int|null
	 * @internal Public for faster access!
	 * @see getTokenString()
	 * @access private
	 */
	public $token_start = null;

	/**
	 * List with all the characters that can be considered as whitespace
	 * @var array|string
	 * @internal Variable is public + associated array for faster access!
	 * @internal array(' ' => true) will recognize space (' ') as whitespace
	 * @internal String will be converted to array in constructor
	 * @internal Result token will be {@link self::TOK_WHITESPACE};
	 * @see setWhitespace()
	 * @see getWhitespace()
	 * @access private
	 */
	public $whitespace = " \t\n\r\0\x0B";

	/**
	 * List with all the characters that can be considered as identifier
	 * @var array|string
	 * @internal Variable is public + associated array for faster access!
	 * @internal array('a' => true) will recognize 'a' as identifier
	 * @internal String will be converted to array in constructor
	 * @internal Result token will be {@link self::TOK_IDENTIFIER};
	 * @see setIdentifiers()
	 * @see getIdentifiers()
	 * @access private
	 */
	public $identifiers = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_';

	/**
	 * All characters that should be mapped to a token/function that cannot be considered as whitespace or identifier
	 * @var array
	 * @internal Variable is public + associated array for faster access!
	 * @internal array('a' => 'parse_a') will call $this->parse_a() if it matches the character 'a'
	 * @internal array('a' => self::TOK_A) will set token to TOK_A if it matches the character 'a'
	 * @see mapChar()
	 * @see unmapChar()
	 * @access private
	 */
	public $custom_char_map = array();

	/**
	 * Automatically built character map. Built using {@link $identifiers}, {@link $whitespace} and {@link $custom_char_map}
	 * @var array
	 * @internal Public for faster access!
	 * @access private
	 */
	public $char_map = array();

	/**
	 * All errors found while parsing the document
	 * @var array
	 * @see addError()
	 */
	public $errors = array();

	/**
	 * Class constructor
	 *
	 * Converts the default whitespace/identifier strings into lookup arrays,
	 * then loads the document, which also reads the first token.
	 * @param string $doc Document to be tokenized
	 * @param int $pos Position to start parsing
	 * @see setDoc()
	 * @see setPos()
	 */
	public function __construct($doc = '', $pos = 0) {
		$this->setWhitespace($this->whitespace);
		$this->setIdentifiers($this->identifiers);

		$this->setDoc($doc, $pos);
	}

	#php4 PHP4 class constructor compatibility
	#function TokenizerBase($doc = '', $pos = 0) {return $this->__construct($doc, $pos);}
	#php4e

	/**
	 * Sets target document
	 * @param string $doc Document to be tokenized
	 * @param int $pos Position to start parsing
	 * @see getDoc()
	 * @see setPos()
	 */
	public function setDoc($doc, $pos = 0) {
		$this->doc = $doc;
		$this->size = strlen($doc);
		$this->setPos($pos);
	}

	/**
	 * Returns target document
	 * @return string
	 * @see setDoc()
	 */
	public function getDoc() {
		return $this->doc;
	}

	/**
	 * Sets position in document and reads the token found there
	 *
	 * The line tracking is reset to array(0, 0) regardless of $pos, so the
	 * line counter restarts at this position.
	 * @param int $pos
	 * @see getPos()
	 */
	public function setPos($pos = 0) {
		// next() pre-increments, so park one character before the target.
		$this->pos = $pos - 1;
		$this->line_pos = array(0, 0);
		$this->next();
	}

	/**
	 * Returns current position in document (Index)
	 * @return int
	 * @see setPos()
	 */
	public function getPos() {
		return $this->pos;
	}

	/**
	 * Returns current position in document (Line/Char)
	 * @return array array(Line, Column); Line is zero-based, Column is the
	 * distance between the current position and the last recorded line break
	 */
	public function getLinePos() {
		return array($this->line_pos[0], $this->pos - $this->line_pos[1]);
	}

	/**
	 * Returns current token
	 * @return int
	 * @see $token
	 */
	public function getToken() {
		return $this->token;
	}

	/**
	 * Returns current token as string
	 * @param int $start_offset Offset from token start
	 * @param int $end_offset Offset from token end
	 * @return string Empty string if the resulting length is not positive
	 */
	public function getTokenString($start_offset = 0, $end_offset = 0) {
		$token_start = ((is_int($this->token_start)) ? $this->token_start : $this->pos) + $start_offset;
		// $pos points at the last character of the token, hence the +1.
		$len = $this->pos - $token_start + 1 + $end_offset;
		return (($len > 0) ? substr($this->doc, $token_start, $len) : '');
	}

	/**
	 * Sets characters to be recognized as whitespace
	 *
	 * Used like: setWhitespace('ab') or setWhitespace(array('a' => true, 'b', 'c'));
	 * @param string|array $ws
	 * @see getWhitespace();
	 */
	public function setWhitespace($ws) {
		$this->whitespace = self::normalizeCharList($ws);
		$this->buildCharMap();
	}

	/**
	 * Returns whitespace characters as string/array
	 * @param bool $as_string Should the result be a string or an array?
	 * @return string|array
	 * @see setWhitespace()
	 */
	public function getWhitespace($as_string = true) {
		$ws = array_keys($this->whitespace);
		return (($as_string) ? implode('', $ws) : $ws);
	}

	/**
	 * Sets characters to be recognized as identifier
	 *
	 * Used like: setIdentifiers('ab') or setIdentifiers(array('a' => true, 'b', 'c'));
	 * @param string|array $ident
	 * @see getIdentifiers();
	 */
	public function setIdentifiers($ident) {
		$this->identifiers = self::normalizeCharList($ident);
		$this->buildCharMap();
	}

	/**
	 * Returns identifier characters as string/array
	 * @param bool $as_string Should the result be a string or an array?
	 * @return string|array
	 * @see setIdentifiers()
	 */
	public function getIdentifiers($as_string = true) {
		$ident = array_keys($this->identifiers);
		return (($as_string) ? implode('', $ident) : $ident);
	}

	/**
	 * Converts a character list into a lookup array (character => true)
	 *
	 * Accepts a string (every byte is one character), a plain list of
	 * characters, a map of character => true, or a mix of the last two.
	 * @param string|array $chars
	 * @return array
	 */
	private static function normalizeCharList($chars) {
		if (!is_array($chars)) {
			$chars = (string) $chars;
			// str_split('') yields array('') before PHP 8.2; avoid the bogus '' entry.
			$chars = (($chars === '') ? array() : str_split($chars));
		}

		$set = array();
		foreach ($chars as $key => $value) {
			// "'a' => true" carries the character in the key, a list entry in the value.
			$set[($value === true) ? (string) $key : (string) $value] = true;
		}
		return $set;
	}

	/**
	 * Maps a custom character to a token/function
	 *
	 * Used like: mapChar('a', self::{@link TOK_IDENTIFIER}) or mapChar('a', 'parse_identifier');
	 * @param string $char Character that should be mapped. If set, it will be overridden
	 * @param int|string $map If function name, then $this->function will be called, otherwise token is set to $map
	 * @see unmapChar()
	 */
	public function mapChar($char, $map) {
		$this->custom_char_map[$char] = $map;
		$this->buildCharMap();
	}

	/**
	 * Removes a char mapped with {@link mapChar()}
	 * @param string $char Character that should be unmapped
	 * @see mapChar()
	 */
	public function unmapChar($char) {
		unset($this->custom_char_map[$char]);
		$this->buildCharMap();
	}

	/**
	 * Builds the {@link $char_map} array
	 *
	 * Whitespace and identifier characters are applied after the custom map,
	 * so they take precedence over a custom mapping of the same character.
	 * @internal Builds single array that maps all characters. Gets called if {@link $whitespace}, {@link $identifiers} or {@link $custom_char_map} get modified
	 */
	protected function buildCharMap() {
		$this->char_map = $this->custom_char_map;
		// The is_array() checks matter during construction: the lists still hold
		// their default strings until the matching setter has been called.
		if (is_array($this->whitespace)) {
			foreach($this->whitespace as $w => $v) {
				$this->char_map[$w] = 'parse_whitespace';
			}
		}
		if (is_array($this->identifiers)) {
			foreach($this->identifiers as $i => $v) {
				$this->char_map[$i] = 'parse_identifier';
			}
		}
	}

	/**
	 * Add error to the array and appends current position
	 *
	 * The stored message is passed through htmlentities() and reports a
	 * one-based line and column.
	 * @param string $error
	 */
	public function addError($error) {
		$this->errors[] = htmlentities($error.' at '.($this->line_pos[0] + 1).', '.($this->pos - $this->line_pos[1] + 1).'!');
	}

	/**
	 * Parse line breaks and increase line number
	 *
	 * Handles "\r", "\n" and "\r\n"; for "\r\n" the position is advanced onto
	 * the "\n" so the pair counts as a single line break.
	 * @internal Gets called to process line breaks
	 */
	protected function parse_linebreak() {
		if($this->doc[$this->pos] === "\r") {
			++$this->line_pos[0];
			if ((($this->pos + 1) < $this->size) && ($this->doc[$this->pos + 1] === "\n")) {
				++$this->pos;
			}
			$this->line_pos[1] = $this->pos;
		} elseif($this->doc[$this->pos] === "\n") {
			++$this->line_pos[0];
			$this->line_pos[1] = $this->pos;
		}
	}

	/**
	 * Parse whitespace
	 * @return int Token
	 * @internal Gets called with {@link $whitespace} characters
	 */
	protected function parse_whitespace() {
		$this->token_start = $this->pos;

		// The character that starts the token may itself be a line break; without
		// this call next() would never count it and line numbers would drift.
		$this->parse_linebreak();

		while(++$this->pos < $this->size) {
			if (!isset($this->whitespace[$this->doc[$this->pos]])) {
				break;
			}
			$this->parse_linebreak();
		}

		// Step back onto the last whitespace character.
		--$this->pos;
		return self::TOK_WHITESPACE;
	}

	/**
	 * Parse identifiers
	 * @return int Token
	 * @internal Gets called with {@link $identifiers} characters
	 */
	protected function parse_identifier() {
		$this->token_start = $this->pos;

		while((++$this->pos < $this->size) && isset($this->identifiers[$this->doc[$this->pos]])) {}

		// Step back onto the last identifier character.
		--$this->pos;
		return self::TOK_IDENTIFIER;
	}

	/**
	 * Continues to the next token
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	public function next() {
		$this->token_start = null;

		if (++$this->pos >= $this->size) {
			return ($this->token = self::TOK_NULL);
		}

		$char = $this->doc[$this->pos];
		if (!isset($this->char_map[$char])) {
			return ($this->token = self::TOK_UNKNOWN);
		}

		// A string entry names a parse method, anything else is the token itself.
		$map = $this->char_map[$char];
		if (is_string($map)) {
			return ($this->token = $this->{$map}());
		}
		return ($this->token = $map);
	}

	/**
	 * Finds the next token, but skips whitespace
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	public function next_no_whitespace() {
		$this->token_start = null;

		while (++$this->pos < $this->size) {
			$char = $this->doc[$this->pos];

			if (isset($this->whitespace[$char])) {
				// Skipped whitespace still has to update the line counter.
				$this->parse_linebreak();
				continue;
			}

			if (!isset($this->char_map[$char])) {
				return ($this->token = self::TOK_UNKNOWN);
			}

			$map = $this->char_map[$char];
			if (is_string($map)) {
				return ($this->token = $this->{$map}());
			}
			return ($this->token = $map);
		}

		return ($this->token = self::TOK_NULL);
	}

	/**
	 * Finds the next token using stop characters.
	 *
	 * Used like: next_search('abc') or next_search(array('a' => true, 'b' => true, 'c' => true));
	 * @param string|array $characters Characters to search for
	 * @param bool $callback Should the function check the charmap after finding a character?
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	public function next_search($characters, $callback = true) {
		$this->token_start = $this->pos;
		if (!is_array($characters)) {
			$characters = array_fill_keys(str_split($characters), true);
		}

		while(++$this->pos < $this->size) {
			$char = $this->doc[$this->pos];

			if (!isset($characters[$char])) {
				// Not a stop character: skip it, but keep the line counter right.
				$this->parse_linebreak();
				continue;
			}

			if (!$callback || !isset($this->char_map[$char])) {
				return ($this->token = self::TOK_UNKNOWN);
			}

			$map = $this->char_map[$char];
			if (is_string($map)) {
				return ($this->token = $this->{$map}());
			}
			return ($this->token = $map);
		}

		return ($this->token = self::TOK_NULL);
	}

	/**
	 * Finds the next token by searching for a string
	 *
	 * The search is case-insensitive (stripos) and starts after the current
	 * position. Only "\n" is counted as a line break in the skipped text.
	 * @param string $needle The needle that's being searched for
	 * @param bool $callback Should the function check the charmap after finding the needle?
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	public function next_pos($needle, $callback = true) {
		$this->token_start = $this->pos;

		if (($this->pos >= $this->size) || (($p = stripos($this->doc, $needle, $this->pos + 1)) === false)) {
			$this->pos = $this->size;
			return ($this->token = self::TOK_NULL);
		}

		// Account for every line break in the text that is jumped over.
		$len = $p - $this->pos - 1;
		if ($len > 0) {
			$skipped = substr($this->doc, $this->pos + 1, $len);

			if (($last = strrpos($skipped, "\n")) !== false) {
				$this->line_pos[0] += substr_count($skipped, "\n");
				$this->line_pos[1] = $last + $this->pos + 1;
			}
		}

		$this->pos = $p;
		$char = $this->doc[$this->pos];
		if (!$callback || !isset($this->char_map[$char])) {
			return ($this->token = self::TOK_UNKNOWN);
		}

		$map = $this->char_map[$char];
		if (is_string($map)) {
			return ($this->token = $this->{$map}());
		}
		return ($this->token = $map);
	}

	/**
	 * Expect a specific token or character. Adds error if token doesn't match.
	 *
	 * Reaching the end of the document while looking for a character counts as
	 * a mismatch. For all three flags, any truthy value other than 1 behaves
	 * like true.
	 * @param string|int $token Character or token to expect
	 * @param bool|int $do_next Go to next character before evaluating. 1 for next char, true to ignore whitespace
	 * @param bool|int $try_next Try next character if current doesn't match. 1 for next char, true to ignore whitespace
	 * @param bool|int $next_on_match Go to next character after evaluating. 1 for next char, true to ignore whitespace
	 * @return bool
	 */
	protected function expect($token, $do_next = true, $try_next = false, $next_on_match = 1) {
		if ($do_next) {
			if ($do_next === 1) {
				$this->next();
			} else {
				$this->next_no_whitespace();
			}
		}

		$matched = $this->currentMatches($token);
		if (!$matched && $try_next) {
			if ($try_next === 1) {
				$this->next();
			} else {
				$this->next_no_whitespace();
			}
			$matched = $this->currentMatches($token);
		}

		if (!$matched) {
			if (is_int($token)) {
				$this->addError('Unexpected "'.$this->getTokenString().'"');
			} else {
				$this->addError('Expected "'.$token.'", but found "'.$this->getTokenString().'"');
			}
			return false;
		}

		if ($next_on_match) {
			if ($next_on_match === 1) {
				$this->next();
			} else {
				$this->next_no_whitespace();
			}
		}
		return true;
	}

	/**
	 * Checks the current token (int) or current character (string) against $token
	 *
	 * A position outside the document never matches a character, which also
	 * avoids reading an invalid string offset at end of document.
	 * @param string|int $token Character or token to compare with
	 * @return bool
	 */
	private function currentMatches($token) {
		if (is_int($token)) {
			return ($this->token === $token);
		}
		return (($this->pos >= 0) && ($this->pos < $this->size) && ($this->doc[$this->pos] === $token));
	}
}

?>

1		<?php
2		/**
3		* @author Niels A.D.
4		* @author Todd Burry <[email protected]>
5		* @copyright 2010 Niels A.D., 2014 Todd Burry
6		* @license http://opensource.org/licenses/LGPL-2.1 LGPL-2.1
7		* @package pQuery
8		*/
9
10		namespace pQuery;
11
12		/**
13		* Converts a document into tokens
14		*
15		* Can convert any string into tokens. The base class only supports
16		* identifier/whitespace tokens. For more tokens, the class can be
17		* easily extended.
18		*
19		* Use like:
20		* <code>
21		* <?php
22		* $a = new TokenizerBase('hello word');
23		* while ($a->next() !== $a::TOK_NULL) {
24		* echo $a->token, ': ',$a->getTokenString(), "<br>\n";
25		* }
26		* ?>
27		* </code>
28		*
29		* @internal The tokenizer works with a character map that connects a certain
30		* character to a certain function/token. This class is build with speed in mind.
31		*/
32		class TokenizerBase {
33
34		/**
35		* NULL Token, used at end of document (parsing should stop after this token)
36		*/
37		const TOK_NULL = 0;
38		/**
39		* Unknown token, used at unidentified character
40		*/
41		const TOK_UNKNOWN = 1;
42		/**
43		* Whitespace token, used with whitespace
44		*/
45		const TOK_WHITESPACE = 2;
46		/**
47		* Identifier token, used with identifiers
48		*/
49		const TOK_IDENTIFIER = 3;
50
51		/**
52		* The document that is being tokenized
53		* @var string
54		* @internal Public for faster access!
55		* @see setDoc()
56		* @see getDoc()
57		* @access private
58		*/
59		var $doc = '';
60
61		/**
62		* The size of the document (length of string)
63		* @var int
64		* @internal Public for faster access!
65		* @see $doc
66		* @access private
67		*/
68		var $size = 0;
69
70		/**
71		* Current (character) position in the document
72		* @var int
73		* @internal Public for faster access!
74		* @see setPos()
75		* @see getPos()
76		* @access private
77		*/
78		var $pos = 0;
79
80		/**
81		* Current (Line/Column) position in document
82		* @var array (Current_Line, Line_Starting_Pos)
83		* @internal Public for faster access!
84		* @see getLinePos()
85		* @access private
86		*/
87		var $line_pos = array(0, 0);
88
89		/**
90		* Current token
91		* @var int
92		* @internal Public for faster access!
93		* @see getToken()
94		* @access private
95		*/
96		var $token = self::TOK_NULL;
97
98		/**
99		* Start position of token. If NULL, then current position is used.
100		* @var int
101		* @internal Public for faster access!
102		* @see getTokenString()
103		* @access private
104		*/
105		var $token_start = null;
106
107		/**
108		* List with all the character that can be considered as whitespace
109		* @var array\|string
110		* @internal Variable is public + associated array for faster access!
111		* @internal array(' ' => true) will recognize space (' ') as whitespace
112		* @internal String will be converted to array in constructor
113		* @internal Result token will be {@link self::TOK_WHITESPACE};
114		* @see setWhitespace()
115		* @see getWhitespace()
116		* @access private
117		*/
118		var $whitespace = " \t\n\r\0\x0B";
119
120		/**
121		* List with all the character that can be considered as identifier
122		* @var array\|string
123		* @internal Variable is public + associated array for faster access!
124		* @internal array('a' => true) will recognize 'a' as identifier
125		* @internal String will be converted to array in constructor
126		* @internal Result token will be {@link self::TOK_IDENTIFIER};
127		* @see setIdentifiers()
128		* @see getIdentifiers()
129		* @access private
130		*/
131		var $identifiers = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890_';
132
133		/**
134		* All characters that should be mapped to a token/function that cannot be considered as whitespace or identifier
135		* @var array
136		* @internal Variable is public + associated array for faster access!
137		* @internal array('a' => 'parse_a') will call $this->parse_a() if it matches the character 'a'
138		* @internal array('a' => self::TOK_A) will set token to TOK_A if it matches the character 'a'
139		* @see mapChar()
140		* @see unmapChar()
141		* @access private
142		*/
143		var $custom_char_map = array();
144
145		/**
146		* Automatically built character map. Built using {@link $identifiers}, {@link $whitespace} and {@link $custom_char_map}
147		* @var array
148		* @internal Public for faster access!
149		* @access private
150		*/
151		var $char_map = array();
152
153		/**
154		* All errors found while parsing the document
155		* @var array
156		* @see addError()
157		*/
158		var $errors = array();
159
160		/**
161		* Class constructor
162		* @param string $doc Document to be tokenized
163		* @param int $pos Position to start parsing
164		* @see setDoc()
165		* @see setPos()
166		*/
167	37	function __construct($doc = '', $pos = 0) {
168	37	$this->setWhitespace($this->whitespace);
169	37	$this->setIdentifiers($this->identifiers);
170
171	37	$this->setDoc($doc, $pos);
172	37	}
173
174		#php4 PHP4 class constructor compatibility
175		#function TokenizerBase($doc = '', $pos = 0) {return $this->__construct($doc, $pos);}
176		#php4e
177
178		/**
179		* Sets target document
180		* @param string $doc Document to be tokenized
181		* @param int $pos Position to start parsing
182		* @see getDoc()
183		* @see setPos()
184		*/
185	37	function setDoc($doc, $pos = 0) {
186	37	$this->doc = $doc;
187	37	$this->size = strlen($doc);
188	37	$this->setPos($pos);
189	37	}
190
191		/**
192		* Returns target document
193		* @return string
194		* @see setDoc()
195		*/
196		function getDoc() {
197		return $this->doc;
198		}
199
200		/**
201		* Sets position in document
202		* @param int $pos
203		* @see getPos()
204		*/
205	37	function setPos($pos = 0) {
206	37	$this->pos = $pos - 1;
207	37	$this->line_pos = array(0, 0);
208	37	$this->next();
209	37	}
210
211		/**
212		* Returns current position in document (Index)
213		* @return int
214		* @see setPos()
215		*/
216	1	function getPos() {
217	1	return $this->pos;
218		}
219
220		/**
221		* Returns current position in document (Line/Char)
222		* @return array array(Line, Column)
223		*/
224		function getLinePos() {
225		return array($this->line_pos[0], $this->pos - $this->line_pos[1]);
226		}
227
228		/**
229		* Returns current token
230		* @return int
231		* @see $token
232		*/
233		function getToken() {
234		return $this->token;
235		}
236
237		/**
238		* Returns current token as string
239		* @param int $start_offset Offset from token start
240		* @param int $end_offset Offset from token end
241		* @return string
242		*/
243	37	function getTokenString($start_offset = 0, $end_offset = 0) {
244	37	$token_start = ((is_int($this->token_start)) ? $this->token_start : $this->pos) + $start_offset;
245	37	$len = $this->pos - $token_start + 1 + $end_offset;
246	37	return (($len > 0) ? substr($this->doc, $token_start, $len) : '');
247		}
248
249		/**
250		* Sets characters to be recognized as whitespace
251		*
252		* Used like: setWhitespace('ab') or setWhitespace(array('a' => true, 'b', 'c'));
253		* @param string\|array $ws
254		* @see getWhitespace();
255		*/
256	37	function setWhitespace($ws) {
257	37	if (is_array($ws)) {
258	37	$this->whitespace = array_fill_keys(array_values($ws), true);
259	37	$this->buildCharMap();
260	37	} else {
261	37	$this->setWhiteSpace(str_split($ws));
262		}
263	37	}
264
265		/**
266		* Returns whitespace characters as string/array
267		* @param bool $as_string Should the result be a string or an array?
268		* @return string\|array
269		* @see setWhitespace()
270		*/
271		function getWhitespace($as_string = true) {
272		$ws = array_keys($this->whitespace);
273		return (($as_string) ? implode('', $ws) : $ws);
274		}
275
276		/**
277		* Sets characters to be recognized as identifier
278		*
279		* Used like: setIdentifiers('ab') or setIdentifiers(array('a' => true, 'b', 'c'));
280		* @param string\|array $ident
281		* @see getIdentifiers();
282		*/
283	37	function setIdentifiers($ident) {
284	37	if (is_array($ident)) {
285	37	$this->identifiers = array_fill_keys(array_values($ident), true);
286	37	$this->buildCharMap();
287	37	} else {
288	37	$this->setIdentifiers(str_split($ident));
289		}
290	37	}
291
292		/**
293		* Returns identifier characters as string/array
294		* @param bool $as_string Should the result be a string or an array?
295		* @return string\|array
296		* @see setIdentifiers()
297		*/
298		function getIdentifiers($as_string = true) {
299		$ident = array_keys($this->identifiers);
300		return (($as_string) ? implode('', $ident) : $ident);
301		}
302
303		/**
304		* Maps a custom character to a token/function
305		*
306		* Used like: mapChar('a', self::{@link TOK_IDENTIFIER}) or mapChar('a', 'parse_identifier');
307		* @param string $char Character that should be mapped. If set, it will be overridden
308		* @param int\|string $map If function name, then $this->function will be called, otherwise token is set to $map
309		* @see unmapChar()
310		*/
311		function mapChar($char, $map) {
312		$this->custom_char_map[$char] = $map;
313		$this->buildCharMap();
314		}
315
316		/**
317		* Removes a char mapped with {@link mapChar()}
318		* @param string $char Character that should be unmapped
319		* @see mapChar()
320		*/
321		function unmapChar($char) {
322		unset($this->custom_char_map[$char]);
323		$this->buildCharMap();
324		}
325
326		/**
327		* Builds the {@link $map_char} array
328		* @internal Builds single array that maps all characters. Gets called if {@link $whitespace}, {@link $identifiers} or {@link $custom_char_map} get modified
329		*/
330	37	protected function buildCharMap() {
331	37	$this->char_map = $this->custom_char_map;
332	37	if (is_array($this->whitespace)) {
333	37	foreach($this->whitespace as $w => $v) {
334	37	$this->char_map[$w] = 'parse_whitespace';
335	37	}
336	37	}
337	37	if (is_array($this->identifiers)) {
338	37	foreach($this->identifiers as $i => $v) {
339	37	$this->char_map[$i] = 'parse_identifier';
340	37	}
341	37	}
342	37	}
343
344		/**
345		* Add error to the array and appends current position
346		* @param string $error
347		*/
348		function addError($error) {
349		$this->errors[] = htmlentities($error.' at '.($this->line_pos[0] + 1).', '.($this->pos - $this->line_pos[1] + 1).'!');
350		}
351
352		/**
353		* Parse line breaks and increase line number
354		* @internal Gets called to process line breaks
355		*/
356	34	protected function parse_linebreak() {
357	34	if($this->doc[$this->pos] === "\r") {
358		++$this->line_pos[0];
359		if ((($this->pos + 1) < $this->size) && ($this->doc[$this->pos + 1] === "\n")) {
360		++$this->pos;
361		}
362		$this->line_pos[1] = $this->pos;
363	34	} elseif($this->doc[$this->pos] === "\n") {
364		++$this->line_pos[0];
365		$this->line_pos[1] = $this->pos;
366		}
367	34	}
368
369		/**
370		* Parse whitespace
371		* @return int Token
372		* @internal Gets called with {@link $whitespace} characters
373		*/
374	4	protected function parse_whitespace() {
375	4	$this->token_start = $this->pos;
376
377	4	while(++$this->pos < $this->size) {
378	4	if (!isset($this->whitespace[$this->doc[$this->pos]])) {
379	4	break;
380		} else {
381		$this->parse_linebreak();
382		}
383		}
384
385	4	--$this->pos;
386	4	return self::TOK_WHITESPACE;
387		}
388
389		/**
390		* Parse identifiers
391		* @return int Token
392		* @internal Gets called with {@link $identifiers} characters
393		*/
394	37	protected function parse_identifier() {
395	37	$this->token_start = $this->pos;
396
397	37	while((++$this->pos < $this->size) && isset($this->identifiers[$this->doc[$this->pos]])) {}
398
399	37	--$this->pos;
400	37	return self::TOK_IDENTIFIER;
401		}
402
403		/**
404		* Continues to the next token
405		* @return int Next token ({@link TOK_NULL} if none)
406		*/
407	37	function next() {
408	37	$this->token_start = null;
409
410	37	if (++$this->pos < $this->size) {
411	37	if (isset($this->char_map[$this->doc[$this->pos]])) {
412	37	if (is_string($this->char_map[$this->doc[$this->pos]])) {
413	37	return ($this->token = $this->{$this->char_map[$this->doc[$this->pos]]}());
414		} else {
415	35	return ($this->token = $this->char_map[$this->doc[$this->pos]]);
416		}
417		} else {
418		return ($this->token = self::TOK_UNKNOWN);
419		}
420		} else {
421	37	return ($this->token = self::TOK_NULL);
422		}
423		}
424
425		/**
426		* Finds the next token, but skips whitespace
427		* @return int Next token ({@link TOK_NULL} if none)
428		*/
429	37	function next_no_whitespace() {
430	37	$this->token_start = null;
431
432	37	while (++$this->pos < $this->size) {
433	37	if (!isset($this->whitespace[$this->doc[$this->pos]])) {
434	37	if (isset($this->char_map[$this->doc[$this->pos]])) {
435	37	if (is_string($this->char_map[$this->doc[$this->pos]])) {
436	34	return ($this->token = $this->{$this->char_map[$this->doc[$this->pos]]}());
437		} else {
438	37	return ($this->token = $this->char_map[$this->doc[$this->pos]]);
439		}
440		} else {
441		return ($this->token = self::TOK_UNKNOWN);
442		}
443		} else {
444	34	$this->parse_linebreak();
445		}
446	34	}
447
448		return ($this->token = self::TOK_NULL);
449		}
450
451		/**
452		* Finds the next token using stop characters.
453		*
454		* Used like: next_search('abc') or next_search(array('a' => true, 'b' => true, 'c' => true));
455		* @param string\|array $characters Characters to search for
456		* @param bool $callback Should the function check the charmap after finding a character?
457		* @return int Next token ({@link TOK_NULL} if none)
458		*/
459	9	function next_search($characters, $callback = true) {
460	9	$this->token_start = $this->pos;
461	9	if (!is_array($characters)) {
462	9	$characters = array_fill_keys(str_split($characters), true);
463	9	}
464
465	9	while(++$this->pos < $this->size) {
466	9	if (isset($characters[$this->doc[$this->pos]])) {
467	9	if ($callback && isset($this->char_map[$this->doc[$this->pos]])) {
468		if (is_string($this->char_map[$this->doc[$this->pos]])) {
469		return ($this->token = $this->{$this->char_map[$this->doc[$this->pos]]}());
470		} else {
471		return ($this->token = $this->char_map[$this->doc[$this->pos]]);
472		}
473		} else {
474	9	return ($this->token = self::TOK_UNKNOWN);
475		}
476		} else {
477	9	$this->parse_linebreak();
478		}
479	9	}
480
481		return ($this->token = self::TOK_NULL);
482		}
483
484		/**
485		* Finds the next token by searching for a string
486		* @param string $needle The needle that's being searched for
487		* @param bool $callback Should the function check the charmap after finding the needle?
488		* @return int Next token ({@link TOK_NULL} if none)
489		*/
490	37	function next_pos($needle, $callback = true) {
491	37	$this->token_start = $this->pos;
492	37	if (($this->pos < $this->size) && (($p = stripos($this->doc, $needle, $this->pos + 1)) !== false)) {
493
494	37	$len = $p - $this->pos - 1;
495	37	if ($len > 0) {
496	37	$str = substr($this->doc, $this->pos + 1, $len);
497
498	37	if (($l = strrpos($str, "\n")) !== false) {
499	33	++$this->line_pos[0];
500	33	$this->line_pos[1] = $l + $this->pos + 1;
501
502	33	$len -= $l;
503	33	if ($len > 0) {
504	33	$str = substr($str, 0, -$len);
505	33	$this->line_pos[0] += substr_count($str, "\n");
506	33	}
507	33	}
508	37	}
509
510	37	$this->pos = $p;
511	37	if ($callback && isset($this->char_map[$this->doc[$this->pos]])) {
512	37	if (is_string($this->char_map[$this->doc[$this->pos]])) {
513		return ($this->token = $this->{$this->char_map[$this->doc[$this->pos]]}());
514		} else {
515	37	return ($this->token = $this->char_map[$this->doc[$this->pos]]);
516		}
517		} else {
518	34	return ($this->token = self::TOK_UNKNOWN);
519		}
520		} else {
521	37	$this->pos = $this->size;
522	37	return ($this->token = self::TOK_NULL);
523		}
524		}
525
526		/**
527		* Expect a specific token or character. Adds error if token doesn't match.
528		* @param string\|int $token Character or token to expect
529		* @param bool\|int $do_next Go to next character before evaluating. 1 for next char, true to ignore whitespace
530		* @param bool\|int $try_next Try next character if current doesn't match. 1 for next char, true to ignore whitespace
531		* @param bool\|int $next_on_match Go to next character after evaluating. 1 for next char, true to ignore whitespace
532		* @return bool
533		*/
534		protected function expect($token, $do_next = true, $try_next = false, $next_on_match = 1) {
535		if ($do_next) {
536		if ($do_next === 1) {
537		$this->next();
538		} else {
539		$this->next_no_whitespace();
540		}
541		}
542
543		if (is_int($token)) {
544		if (($this->token !== $token) && ((!$try_next) \|\| ((($try_next === 1) && ($this->next() !== $token)) \|\| (($try_next === true) && ($this->next_no_whitespace() !== $token))))) {
545		$this->addError('Unexpected "'.$this->getTokenString().'"');
546		return false;
547		}
548		} else {
549		if (($this->doc[$this->pos] !== $token) && ((!$try_next) \|\| (((($try_next === 1) && ($this->next() !== self::TOK_NULL)) \|\| (($try_next === true) && ($this->next_no_whitespace() !== self::TOK_NULL))) && ($this->doc[$this->pos] !== $token)))) {
550		$this->addError('Expected "'.$token.'", but found "'.$this->getTokenString().'"');
551		return false;
552		}
553		}
554
555		if ($next_on_match) {
556		if ($next_on_match === 1) {
557		$this->next();
558		} else {
559		$this->next_no_whitespace();
560		}
561		}
562		return true;
563		}
564		}
565
566		?>

tburry / pquery

Code