1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Spiral Framework. |
4
|
|
|
* |
5
|
|
|
* @license MIT |
6
|
|
|
* @author Anton Titov (Wolfy-J) |
7
|
|
|
*/ |
8
|
|
|
namespace Spiral\Stempler; |
9
|
|
|
|
10
|
|
|
use Spiral\Tokenizer\Isolator; |
11
|
|
|
|
12
|
|
|
/**
 * Perform html code tokenization. Class used for spiral Stempler and can be used for other html
 * related operations. HtmlTokenizer is pretty slow! Please don't forget that this is tokenizer,
 * not parser.
 *
 * Typical usage: call parse() to split a source into tokens and compile() (or compileToken()) to
 * turn tokens back into html.
 *
 * @todo very old class, improvement is required
 */
class HtmlTokenizer
{
    /**
     * Current tokenizer position. Tokenizer is a linear processor (no regular expression is
     * involved). This slows it down, but the results are much more reliable.
     */
    const POSITION_PLAIN_TEXT = 0x001;
    const POSITION_IN_TAG     = 0x002;
    const POSITION_IN_QUOTAS  = 0x003;

    /**
     * Token types detected and processed by tokenizer.
     */
    const PLAIN_TEXT = 'plain';
    const TAG_OPEN   = 'open';
    const TAG_CLOSE  = 'close';
    const TAG_SHORT  = 'short';
    const TAG_VOID   = 'void';

    /**
     * Token fields. There are a lot of tokens in HTML (up to 10,000 different ones). We better to
     * use numeric keys for array than any text fields or even objects.
     */
    const TOKEN_NAME       = 0;
    const TOKEN_TYPE       = 1;
    const TOKEN_CONTENT    = 2;
    const TOKEN_ATTRIBUTES = 3;

    /**
     * List of void tags.
     *
     * @link http://www.w3.org/TR/html5/syntax.html#void-elements
     * @var array
     */
    protected $voidTags = [
        'area',
        'base',
        'br',
        'col',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
    ];

    /**
     * Array of parsed tokens. Plain text tokens carry type and content only, close tags carry
     * name, type and content, every other tag additionally carries its attributes.
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * PHP block should be isolated while parsing, Keep enabled.
     *
     * @var bool
     */
    protected $isolatePHP = false;

    /**
     * PHP Blocks isolator, which holds all existing PHP blocks and restores them in output.
     *
     * @var Isolator|null
     */
    protected $isolator = null;

    /**
     * @param bool          $isolatePHP PHP block should be isolated and enabled by default
     * @param Isolator|null $isolator   Isolator to be used, a new instance is created when omitted.
     */
    public function __construct(bool $isolatePHP = true, Isolator $isolator = null)
    {
        $this->isolatePHP = $isolatePHP;
        $this->isolator = $isolator ?? new Isolator();
    }

    /**
     * Parse HTML content and return it's tokens. Previously parsed tokens are discarded.
     *
     * @param string $source HTML source.
     *
     * @return array
     */
    public function parse(string $source): array
    {
        //Cleaning list of already parsed tokens
        $this->tokens = [];

        if ($this->isolatePHP) {
            $source = $this->isolator->isolatePHP($source);
        }

        //Quote character which opened currently processed attribute value
        $quotas = '';

        //Content collected for the token being currently processed
        $buffer = '';

        $length = strlen($source);
        $position = self::POSITION_PLAIN_TEXT;
        for ($pointer = 0; $pointer < $length; $pointer++) {
            $char = $source[$pointer];
            switch ($char) {
                case '<':
                    if ($position === self::POSITION_IN_QUOTAS) {
                        //Part of quoted attribute value
                        $buffer .= $char;
                        break;
                    }

                    if ($position === self::POSITION_IN_TAG) {
                        //Previous "<" never got closed, it is a plain text
                        $buffer = '<' . $buffer;
                    }

                    //Handling previous token
                    $this->handleToken(self::PLAIN_TEXT, $buffer);

                    //We are in tag now
                    $position = self::POSITION_IN_TAG;
                    $buffer = '';
                    break;
                case '>':
                    if ($position !== self::POSITION_IN_TAG) {
                        //Plain text or part of quoted attribute value
                        $buffer .= $char;
                        break;
                    }

                    //Token ended
                    $this->handleToken(null, $buffer);

                    //We are in a plain text now
                    $position = self::POSITION_PLAIN_TEXT;
                    $buffer = '';
                    break;
                case '"':
                    //no break
                case "'":
                    if ($position === self::POSITION_IN_TAG) {
                        //Jumping into argument
                        $position = self::POSITION_IN_QUOTAS;
                        $quotas = $char;
                    } elseif ($position === self::POSITION_IN_QUOTAS && $char === $quotas) {
                        //Jumping from argument
                        $position = self::POSITION_IN_TAG;
                        $quotas = '';
                    }
                    //no break, quote itself has to be added to the buffer
                default:
                    //Checking for invalid characters in tag name or arguments
                    if (
                        $position === self::POSITION_IN_TAG
                        && !preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)
                    ) {
                        //Not a tag, restoring "<" and continuing as a plain text
                        $buffer = '<' . $buffer;
                        $position = self::POSITION_PLAIN_TEXT;
                    }

                    $buffer .= $char;
            }
        }

        if ($position !== self::POSITION_PLAIN_TEXT) {
            //Source ended inside of unfinished tag, restoring "<" removed when the tag was opened
            $buffer = '<' . $buffer;
        }

        $this->handleToken(self::PLAIN_TEXT, $buffer);

        return $this->tokens;
    }

    /**
     * Compile all parsed tokens back into html form.
     *
     * @return string
     */
    public function compile(): string
    {
        $result = '';
        foreach ($this->tokens as $token) {
            $result .= $this->compileToken($token);
        }

        return $result;
    }

    /**
     * Compile parsed token. Plain text and close tags are returned using their original content,
     * other tags are rebuilt from their name and attributes.
     *
     * @param array $token
     *
     * @return string
     */
    public function compileToken(array $token): string
    {
        if (in_array($token[self::TOKEN_TYPE], [self::PLAIN_TEXT, self::TAG_CLOSE], true)) {
            //Nothing to compile
            return $token[self::TOKEN_CONTENT];
        }

        $result = $token[self::TOKEN_NAME];

        $attributes = [];
        //Token may have no attributes key (for example a close tag which was written as short one)
        foreach ($token[self::TOKEN_ATTRIBUTES] ?? [] as $attribute => $value) {
            //Attribute without a value (no "=" in source)
            $attributes[] = $value === null ? $attribute : $attribute . '="' . $value . '"';
        }

        if (!empty($attributes)) {
            $result .= ' ' . implode(' ', $attributes);
        }

        if ($token[self::TOKEN_TYPE] === self::TAG_SHORT) {
            $result .= '/';
        }

        return '<' . $result . '>';
    }

    /**
     * Parses tag body for arguments, name, etc.
     *
     * @param string $content Tag content to be parsed (from < till >).
     *
     * @return array
     */
    protected function parseToken(string $content): array
    {
        $content = $this->repairPHP($content);

        $token = [
            self::TOKEN_NAME       => '',
            self::TOKEN_TYPE       => self::TAG_OPEN,
            self::TOKEN_CONTENT    => '<' . $content . '>',
            self::TOKEN_ATTRIBUTES => []
        ];

        //Some parts of text just looks like tags, but their not
        if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
            //Same shape as any other plain text token: type and content only
            $token[self::TOKEN_TYPE] = self::PLAIN_TEXT;
            unset($token[self::TOKEN_NAME], $token[self::TOKEN_ATTRIBUTES]);

            return $token;
        }

        //Local PHP isolation
        $isolator = new Isolator('-argument-', '-block-', true);

        //No PHP blocks
        $content = $isolator->isolatePHP($content);

        //Parsing arguments, due they already checked for open-close quotas we can use regular expression
        $attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r]*(?:(?P<equal>=)[ \n\t\r]*'
            . '(?P<value>[a-z0-9\-]+|\'[^\']+\'|\"[^\"]+\"|\"\"))?/si';

        preg_match_all($attribute, $content, $attributes);

        foreach ($attributes['value'] as $index => $value) {
            if ($value !== '' && ($value[0] === "'" || $value[0] === '"')) {
                //Removing wrapping quotes
                $value = trim($value, $value[0]);
            }

            //Local and global php isolation restore
            $name = $this->repairPHP($isolator->repairPHP($attributes['name'][$index]));

            //Attributes declared without "=" have no value at all
            $token[self::TOKEN_ATTRIBUTES][$name] = empty($attributes['equal'][$index])
                ? null
                : $this->repairPHP($isolator->repairPHP($value));
        }

        //Fetching name, it ends on a first whitespace character (attributes can be separated from
        //the name using line breaks and tabs, not only spaces)
        $name = $isolator->repairPHP(substr($content, 0, strcspn($content, " \t\r\n")));

        if ($name !== '' && $name[0] === '/') {
            $token[self::TOKEN_TYPE] = self::TAG_CLOSE;
            unset($token[self::TOKEN_ATTRIBUTES]);
        }

        if (substr($content, -1) === '/') {
            $token[self::TOKEN_TYPE] = self::TAG_SHORT;
        }

        $name = trim($name, '/');

        if (isset($token[self::TOKEN_ATTRIBUTES])) {
            //Attribute expression matches tag name as well, it must not be treated as attribute
            unset($token[self::TOKEN_ATTRIBUTES][$name]);
        }

        $token[self::TOKEN_NAME] = trim($name);

        if (
            $token[self::TOKEN_TYPE] === self::TAG_OPEN
            && in_array($token[self::TOKEN_NAME], $this->voidTags, true)
        ) {
            $token[self::TOKEN_TYPE] = self::TAG_VOID;
        }

        return $token;
    }

    /**
     * Handles single token: plain text is stored as is (empty chunks are skipped), anything else
     * is parsed as a tag. Resulted token is added to the list of parsed tokens.
     *
     * @param string|null $tokenType Token type, PLAIN_TEXT or null when content is a tag body.
     * @param string      $content   Non parsed token content.
     */
    protected function handleToken($tokenType, string $content)
    {
        if ($tokenType === self::PLAIN_TEXT) {
            //Strict check, empty() would also drop a text chunk consisting of a single "0"
            if ($content === '') {
                return;
            }

            $token = [
                self::TOKEN_TYPE    => self::PLAIN_TEXT,
                self::TOKEN_CONTENT => $this->repairPHP($content)
            ];
        } else {
            $token = $this->parseToken($content);
        }

        $this->tokens[] = $token;
    }

    /**
     * Will restore all existing PHP blocks to their original content. Source is returned untouched
     * when PHP isolation is disabled.
     *
     * @param string $source
     *
     * @return string
     */
    protected function repairPHP(string $source): string
    {
        if (!$this->isolatePHP) {
            return $source;
        }

        return $this->isolator->repairPHP($source);
    }
}
353
|
|
|
|
This check compares calls to functions or methods with their respective definitions. If a call passes more arguments than the definition declares, it raises an issue.
If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this is known to happen is WordPress.
In this case, you can add the @ignore PHPDoc annotation to the duplicate definition and it will be ignored.