Code

< 40 %
40-60 %
> 60 %
1
<?php
2
/**
3
 * @author Niels A.D.
4
 * @author Todd Burry <[email protected]>
5
 * @copyright 2010 Niels A.D., 2014 Todd Burry
6
 * @license http://opensource.org/licenses/LGPL-2.1 LGPL-2.1
7
 * @package pQuery
8
 */
9
10
namespace pQuery;
11
12
/**
13
 * Converts a document into tokens
14
 *
15
 * Can convert any string into tokens. The base class only supports
16
 * identifier/whitespace tokens. For more tokens, the class can be
17
 * easily extended.
18
 *
19
 * Use like:
20
 * <code>
21
 * <?php
22
 *  $a = new TokenizerBase('hello world');
23
 *  while ($a->next() !== $a::TOK_NULL) {
24
 *    echo $a->token, ': ',$a->getTokenString(), "<br>\n";
25
 *  }
26
 * ?>
27
 * </code>
28
 *
29
 * @internal The tokenizer works with a character map that connects a certain
30
 * character to a certain function/token. This class is built with speed in mind.
31
 */
32
class TokenizerBase {

	/**
	 * NULL Token, used at end of document (parsing should stop after this token)
	 */
	const TOK_NULL = 0;
	/**
	 * Unknown token, used at unidentified character
	 */
	const TOK_UNKNOWN = 1;
	/**
	 * Whitespace token, used with whitespace
	 */
	const TOK_WHITESPACE = 2;
	/**
	 * Identifier token, used with identifiers
	 */
	const TOK_IDENTIFIER = 3;

	/**
	 * The document that is being tokenized
	 * @var string
	 * @internal Public for faster access!
	 * @see setDoc()
	 * @see getDoc()
	 * @access private
	 */
	public $doc = '';

	/**
	 * The size of the document (length of string, in bytes)
	 * @var int
	 * @internal Public for faster access!
	 * @see $doc
	 * @access private
	 */
	public $size = 0;

	/**
	 * Current (character) position in the document
	 * @var int
	 * @internal Public for faster access!
	 * @see setPos()
	 * @see getPos()
	 * @access private
	 */
	public $pos = 0;

	/**
	 * Current (Line/Column) position in document
	 * @var array (Current_Line, Line_Starting_Pos)
	 * @internal Public for faster access!
	 * @see getLinePos()
	 * @access private
	 */
	public $line_pos = array(0, 0);

	/**
	 * Current token
	 * @var int
	 * @internal Public for faster access!
	 * @see getToken()
	 * @access private
	 */
	public $token = self::TOK_NULL;

	/**
	 * Start position of token. If NULL, then current position is used.
	 * @var int
	 * @internal Public for faster access!
	 * @see getTokenString()
	 * @access private
	 */
	public $token_start = null;

	/**
	 * List with all the characters that can be considered as whitespace
	 * @var array|string
	 * @internal Variable is public + associated array for faster access!
	 * @internal array(' ' => true) will recognize space (' ') as whitespace
	 * @internal String will be converted to array in constructor
	 * @internal Result token will be {@link self::TOK_WHITESPACE};
	 * @see setWhitespace()
	 * @see getWhitespace()
	 * @access private
	 */
	public $whitespace = " \t\n\r\0\x0B";

	/**
	 * List with all the characters that can be considered as identifier
	 * @var array|string
	 * @internal Variable is public + associated array for faster access!
	 * @internal array('a' => true) will recognize 'a' as identifier
	 * @internal String will be converted to array in constructor
	 * @internal Result token will be {@link self::TOK_IDENTIFIER};
	 * @see setIdentifiers()
	 * @see getIdentifiers()
	 * @access private
	 */
	public $identifiers = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_';

	/**
	 * All characters that should be mapped to a token/function that cannot be considered as whitespace or identifier
	 * @var array
	 * @internal Variable is public + associated array for faster access!
	 * @internal array('a' => 'parse_a') will call $this->parse_a() if it matches the character 'a'
	 * @internal array('a' => self::TOK_A) will set token to TOK_A if it matches the character 'a'
	 * @see mapChar()
	 * @see unmapChar()
	 * @access private
	 */
	public $custom_char_map = array();

	/**
	 * Automatically built character map. Built using {@link $identifiers}, {@link $whitespace} and {@link $custom_char_map}
	 * @var array
	 * @internal Public for faster access!
	 * @access private
	 */
	public $char_map = array();

	/**
	 * All errors found while parsing the document
	 * @var array
	 * @see addError()
	 */
	public $errors = array();

	/**
	 * Class constructor
	 *
	 * Converts the whitespace/identifier lists to lookup arrays, then loads
	 * the document and moves to its first token.
	 * @param string $doc Document to be tokenized
	 * @param int $pos Position to start parsing
	 * @see setDoc()
	 * @see setPos()
	 */
	public function __construct($doc = '', $pos = 0) {
		$this->setWhitespace($this->whitespace);
		$this->setIdentifiers($this->identifiers);

		$this->setDoc($doc, $pos);
	}

	#php4 PHP4 class constructor compatibility
	#function TokenizerBase($doc = '', $pos = 0) {return $this->__construct($doc, $pos);}
	#php4e

	/**
	 * Sets target document
	 * @param string $doc Document to be tokenized
	 * @param int $pos Position to start parsing
	 * @see getDoc()
	 * @see setPos()
	 */
	public function setDoc($doc, $pos = 0) {
		// Cast so that a null/scalar document cannot break strlen() or offset access
		$this->doc = (string) $doc;
		$this->size = strlen($this->doc);
		$this->setPos($pos);
	}

	/**
	 * Returns target document
	 * @return string
	 * @see setDoc()
	 */
	public function getDoc() {
		return $this->doc;
	}

	/**
	 * Sets position in document
	 *
	 * Resets the line counter and immediately reads the token at $pos.
	 * @param int $pos
	 * @see getPos()
	 */
	public function setPos($pos = 0) {
		// next() pre-increments, so start one character before the target
		$this->pos = $pos - 1;
		$this->line_pos = array(0, 0);
		$this->next();
	}

	/**
	 * Returns current position in document (Index)
	 * @return int
	 * @see setPos()
	 */
	public function getPos() {
		return $this->pos;
	}

	/**
	 * Returns current position in document (Line/Char)
	 * @return array array(Line, Column)
	 */
	public function getLinePos() {
		return array($this->line_pos[0], $this->pos - $this->line_pos[1]);
	}

	/**
	 * Returns current token
	 * @return int
	 * @see $token
	 */
	public function getToken() {
		return $this->token;
	}

	/**
	 * Returns current token as string
	 * @param int $start_offset Offset from token start
	 * @param int $end_offset Offset from token end
	 * @return string Empty string if the resulting range is empty or out of bounds
	 */
	public function getTokenString($start_offset = 0, $end_offset = 0) {
		$from = (is_int($this->token_start) ? $this->token_start : $this->pos) + $start_offset;
		$len = $this->pos - $from + 1 + $end_offset;
		if ($len <= 0) {
			return '';
		}

		// substr() yields false on older PHP versions when $from lies past the end
		return (string) substr($this->doc, $from, $len);
	}

	/**
	 * Sets characters to be recognized as whitespace
	 *
	 * Used like: setWhitespace('ab') or setWhitespace(array('a' => true, 'b', 'c'));
	 * @param string|array $ws
	 * @see getWhitespace();
	 */
	public function setWhitespace($ws) {
		$this->whitespace = self::char_set($ws);
		$this->buildCharMap();
	}

	/**
	 * Returns whitespace characters as string/array
	 * @param bool $as_string Should the result be a string or an array?
	 * @return string|array
	 * @see setWhitespace()
	 */
	public function getWhitespace($as_string = true) {
		// PHP stores digit characters as integer keys; hand them back as strings
		$ws = array_map('strval', array_keys($this->whitespace));
		return (($as_string) ? implode('', $ws) : $ws);
	}

	/**
	 * Sets characters to be recognized as identifier
	 *
	 * Used like: setIdentifiers('ab') or setIdentifiers(array('a' => true, 'b', 'c'));
	 * @param string|array $ident
	 * @see getIdentifiers();
	 */
	public function setIdentifiers($ident) {
		$this->identifiers = self::char_set($ident);
		$this->buildCharMap();
	}

	/**
	 * Returns identifier characters as string/array
	 * @param bool $as_string Should the result be a string or an array?
	 * @return string|array
	 * @see setIdentifiers()
	 */
	public function getIdentifiers($as_string = true) {
		// PHP stores digit characters as integer keys; hand them back as strings
		$ident = array_map('strval', array_keys($this->identifiers));
		return (($as_string) ? implode('', $ident) : $ident);
	}

	/**
	 * Converts a character list into a lookup array (character => true)
	 *
	 * Accepts a string ('ab'), a plain list (array('a', 'b')), a lookup array
	 * (array('a' => true)) or a mix of the last two.
	 * @param string|array $chars
	 * @return array
	 */
	private static function char_set($chars) {
		if (!is_array($chars)) {
			$chars = (string) $chars;
			// str_split('') yields array('') on older PHP versions
			$chars = ($chars === '') ? array() : str_split($chars);
		}

		$set = array();
		foreach ($chars as $key => $value) {
			// "'x' => true" carries the character in the key, list entries carry it in the value
			$char = ($value === true) ? (string) $key : (string) $value;
			if ($char !== '') {
				$set[$char] = true;
			}
		}
		return $set;
	}

	/**
	 * Maps a custom character to a token/function
	 *
	 * Used like: mapChar('a', self::{@link TOK_IDENTIFIER}) or mapChar('a', 'parse_identifier');
	 * @param string $char Character that should be mapped. If set, it will be overridden
	 * @param int|string $map If function name, then $this->function will be called, otherwise token is set to $map
	 * @see unmapChar()
	 */
	public function mapChar($char, $map) {
		$this->custom_char_map[$char] = $map;
		$this->buildCharMap();
	}

	/**
	 * Removes a char mapped with {@link mapChar()}
	 * @param string $char Character that should be unmapped
	 * @see mapChar()
	 */
	public function unmapChar($char) {
		unset($this->custom_char_map[$char]);
		$this->buildCharMap();
	}

	/**
	 * Builds the {@link $char_map} array
	 *
	 * Entries are applied in the order custom map, whitespace, identifiers;
	 * a later entry replaces an earlier one for the same character.
	 * @internal Builds single array that maps all characters. Gets called if {@link $whitespace}, {@link $identifiers} or {@link $custom_char_map} get modified
	 */
	protected function buildCharMap() {
		$map = $this->custom_char_map;

		// The lists are still plain strings while the constructor is converting them
		if (is_array($this->whitespace)) {
			foreach (array_keys($this->whitespace) as $char) {
				$map[$char] = 'parse_whitespace';
			}
		}
		if (is_array($this->identifiers)) {
			foreach (array_keys($this->identifiers) as $char) {
				$map[$char] = 'parse_identifier';
			}
		}

		$this->char_map = $map;
	}

	/**
	 * Add error to the array and appends current position
	 *
	 * The stored message is passed through htmlentities() and uses 1-based
	 * line/column numbers.
	 * @param string $error
	 */
	public function addError($error) {
		$line = $this->line_pos[0] + 1;
		$column = $this->pos - $this->line_pos[1] + 1;
		$this->errors[] = htmlentities($error.' at '.$line.', '.$column.'!');
	}

	/**
	 * Parse line breaks and increase line number
	 *
	 * Treats "\r", "\n" and "\r\n" as one line break each. For "\r\n" the
	 * position is advanced onto the "\n".
	 * @internal Gets called to process line breaks
	 */
	protected function parse_linebreak() {
		$char = $this->doc[$this->pos];

		if ($char === "\r") {
			++$this->line_pos[0];
			// Swallow the "\n" of a "\r\n" pair so it is not counted twice
			if ((($this->pos + 1) < $this->size) && ($this->doc[$this->pos + 1] === "\n")) {
				++$this->pos;
			}
			$this->line_pos[1] = $this->pos;
		} elseif ($char === "\n") {
			++$this->line_pos[0];
			$this->line_pos[1] = $this->pos;
		}
	}

	/**
	 * Parse whitespace
	 * @return int Token
	 * @internal Gets called with {@link $whitespace} characters
	 */
	protected function parse_whitespace() {
		$this->token_start = $this->pos;

		// The character that triggered this handler may itself be a line break
		$this->parse_linebreak();

		while (++$this->pos < $this->size) {
			if (!isset($this->whitespace[$this->doc[$this->pos]])) {
				break;
			}
			$this->parse_linebreak();
		}

		// Step back onto the last whitespace character
		--$this->pos;
		return self::TOK_WHITESPACE;
	}

	/**
	 * Parse identifiers
	 * @return int Token
	 * @internal Gets called with {@link $identifiers} characters
	 */
	protected function parse_identifier() {
		$this->token_start = $this->pos;

		do {
			++$this->pos;
		} while (($this->pos < $this->size) && isset($this->identifiers[$this->doc[$this->pos]]));

		// Step back onto the last identifier character
		--$this->pos;
		return self::TOK_IDENTIFIER;
	}

	/**
	 * Determines the token for the character at the current position
	 *
	 * A string entry in {@link $char_map} is called as a method of this
	 * object, any other entry is used as the token itself.
	 * @param bool $use_map If false, the character map is ignored
	 * @return int Token ({@link TOK_UNKNOWN} if the character is not mapped)
	 */
	private function map_current_char($use_map = true) {
		$char = $this->doc[$this->pos];
		if (!$use_map || !isset($this->char_map[$char])) {
			return ($this->token = self::TOK_UNKNOWN);
		}

		$map = $this->char_map[$char];
		return ($this->token = (is_string($map) ? $this->{$map}() : $map));
	}

	/**
	 * Continues to the next token
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	public function next() {
		$this->token_start = null;

		if (++$this->pos < $this->size) {
			return $this->map_current_char();
		}
		return ($this->token = self::TOK_NULL);
	}

	/**
	 * Finds the next token, but skips whitespace
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	public function next_no_whitespace() {
		$this->token_start = null;

		while (++$this->pos < $this->size) {
			if (!isset($this->whitespace[$this->doc[$this->pos]])) {
				return $this->map_current_char();
			}
			// Skipped whitespace still has to advance the line counter
			$this->parse_linebreak();
		}

		return ($this->token = self::TOK_NULL);
	}

	/**
	 * Finds the next token using stop characters.
	 *
	 * Used like: next_search('abc') or next_search(array('a' => true, 'b' => true, 'c' => true));
	 * @param string|array $characters Characters to search for
	 * @param bool $callback Should the function check the charmap after finding a character?
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	public function next_search($characters, $callback = true) {
		$this->token_start = $this->pos;
		if (!is_array($characters)) {
			$characters = array_fill_keys(str_split($characters), true);
		}

		while (++$this->pos < $this->size) {
			if (isset($characters[$this->doc[$this->pos]])) {
				return $this->map_current_char($callback);
			}
			$this->parse_linebreak();
		}

		return ($this->token = self::TOK_NULL);
	}

	/**
	 * Finds the next token by searching for a string
	 *
	 * The search is case-insensitive and starts after the current position.
	 * If the needle is not found, the position is moved to the end of the
	 * document.
	 * @param string $needle The needle that's being searched for
	 * @param bool $callback Should the function check the charmap after finding the needle?
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	public function next_pos($needle, $callback = true) {
		$this->token_start = $this->pos;

		$found = false;
		if ($this->pos < $this->size) {
			$found = stripos($this->doc, $needle, $this->pos + 1);
		}
		if ($found === false) {
			$this->pos = $this->size;
			return ($this->token = self::TOK_NULL);
		}

		// Account for the line breaks in the text that is being skipped
		$len = $found - $this->pos - 1;
		if ($len > 0) {
			$skipped = substr($this->doc, $this->pos + 1, $len);
			$last_break = strrpos($skipped, "\n");
			if ($last_break !== false) {
				$this->line_pos[0] += substr_count($skipped, "\n");
				$this->line_pos[1] = $this->pos + 1 + $last_break;
			}
		}

		$this->pos = $found;
		return $this->map_current_char($callback);
	}

	/**
	 * Moves to the next token
	 * @param bool|int $mode 1 for next char, anything else to ignore whitespace
	 * @return int Next token ({@link TOK_NULL} if none)
	 */
	private function advance($mode) {
		return (($mode === 1) ? $this->next() : $this->next_no_whitespace());
	}

	/**
	 * Checks if the character at the current position equals $char
	 * @param string $char
	 * @return bool False if the current position is outside the document
	 */
	private function current_char_is($char) {
		return (($this->pos >= 0) && ($this->pos < $this->size) && ($this->doc[$this->pos] === $char));
	}

	/**
	 * Expect a specific token or character. Adds error if token doesn't match.
	 *
	 * Reaching the end of the document while retrying counts as a mismatch.
	 * @param string|int $token Character or token to expect
	 * @param bool|int $do_next Go to next character before evaluating. 1 for next char, true to ignore whitespace
	 * @param bool|int $try_next Try next character if current doesn't match. 1 for next char, true to ignore whitespace
	 * @param bool|int $next_on_match Go to next character after evaluating. 1 for next char, true to ignore whitespace
	 * @return bool
	 */
	protected function expect($token, $do_next = true, $try_next = false, $next_on_match = 1) {
		if ($do_next) {
			$this->advance($do_next);
		}

		if (is_int($token)) {
			// Integer: compare against the token type
			$matched = ($this->token === $token);
			if (!$matched && $try_next) {
				$matched = ($this->advance($try_next) === $token);
			}
			if (!$matched) {
				$this->addError('Unexpected "'.$this->getTokenString().'"');
				return false;
			}
		} else {
			// String: compare against the character at the current position
			$matched = $this->current_char_is($token);
			if (!$matched && $try_next) {
				$matched = ($this->advance($try_next) !== self::TOK_NULL) && $this->current_char_is($token);
			}
			if (!$matched) {
				$this->addError('Expected "'.$token.'", but found "'.$this->getTokenString().'"');
				return false;
			}
		}

		if ($next_on_match) {
			$this->advance($next_on_match);
		}
		return true;
	}
}
565
566
?>