1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* Spiral Framework. |
4
|
|
|
* |
5
|
|
|
* @license MIT |
6
|
|
|
* @author Anton Titov (Wolfy-J) |
7
|
|
|
*/ |
8
|
|
|
namespace Spiral\Stempler; |
9
|
|
|
|
10
|
|
|
use Spiral\Tokenizer\Isolator; |
11
|
|
|
|
12
|
|
|
/**
 * Perform html code tokenization. Class used for spiral Stempler and can be used for other html
 * related operations. HtmlTokenizer is pretty slow! Please don't forget that this is tokenizer,
 * not parser.
 *
 * Every produced token is an array indexed by the TOKEN_* constants. Plain text tokens carry only
 * TOKEN_TYPE and TOKEN_CONTENT; closing tags carry no TOKEN_ATTRIBUTES entry.
 *
 * @todo very old class, improvement required
 */
class HtmlTokenizer
{
    /**
     * Current tokenizer position. Tokenizer is a linear processor (no regular expression is
     * involved). This slows it down, but the results are much more reliable.
     */
    const POSITION_PLAIN_TEXT = 0x001;
    const POSITION_IN_TAG = 0x002;
    const POSITION_IN_QUOTAS = 0x003;

    /**
     * Token types detected and processed by tokenizer.
     */
    const PLAIN_TEXT = 'plain';
    const TAG_OPEN = 'open';
    const TAG_CLOSE = 'close';
    const TAG_SHORT = 'short';
    const TAG_VOID = 'void';

    /**
     * Token fields. There are a lot of tokens in HTML (up to 10,000 different ones). We better to
     * use numeric keys for array than any text fields or even objects.
     */
    const TOKEN_NAME = 0;
    const TOKEN_TYPE = 1;
    const TOKEN_CONTENT = 2;
    const TOKEN_ATTRIBUTES = 3;

    /**
     * List of void tags.
     *
     * @link http://www.w3.org/TR/html5/syntax.html#void-elements
     * @var array
     */
    protected $voidTags = [
        'area',
        'base',
        'br',
        'col',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
    ];

    /**
     * Array of parsed tokens. Every token has fields name, type, content and arguments.
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * PHP block should be isolated while parsing, Keep enabled. The property default is false,
     * the constructor overwrites it (with true unless the caller says otherwise).
     *
     * @var bool
     */
    protected $isolatePHP = false;

    /**
     * PHP Blocks isolator, which holds all existing PHP blocks and restores them in output.
     *
     * @var Isolator|null
     */
    protected $isolator = null;

    /**
     * @param bool          $isolatePHP PHP block should be isolated and enabled by default
     * @param Isolator|null $isolator   Isolator to use, a fresh one is created when omitted.
     */
    public function __construct($isolatePHP = true, Isolator $isolator = null)
    {
        $this->isolatePHP = $isolatePHP;
        $this->isolator = $isolator !== null ? $isolator : new Isolator();
    }

    /**
     * Parse HTML content and return it's tokens.
     *
     * The source is walked byte by byte through three states (plain text, inside a tag, inside a
     * quoted attribute value). Anything which starts like a tag but turns out not to be one is
     * emitted as plain text with its leading "<" restored.
     *
     * @param string $source HTML source.
     * @return array List of tokens, each indexed by the TOKEN_* constants.
     */
    public function parse($source)
    {
        //Cleaning list of already parsed tokens
        $this->tokens = [];

        if ($this->isolatePHP) {
            $source = $this->isolator->isolatePHP($source);
        }

        $quotas = '';
        $buffer = '';

        $length = strlen($source);
        $position = self::POSITION_PLAIN_TEXT;
        for ($pointer = 0; $pointer < $length; $pointer++) {
            $char = $source[$pointer];
            switch ($char) {
                case '<':
                    if ($position === self::POSITION_IN_QUOTAS) {
                        //Part of an attribute value
                        $buffer .= $char;
                        break;
                    }

                    if ($position === self::POSITION_IN_TAG) {
                        //Previous "<" did not open a real tag, giving it back to the text
                        $buffer = '<' . $buffer;
                    }

                    //Handling previous token
                    $this->handleToken(self::PLAIN_TEXT, $buffer);

                    //We are in tag now
                    $position = self::POSITION_IN_TAG;
                    $buffer = '';
                    break;
                case '>':
                    if ($position !== self::POSITION_IN_TAG) {
                        //Plain text or quoted value, not a tag ending
                        $buffer .= $char;
                        break;
                    }

                    //Token ended, any non plain type makes handleToken parse the tag body
                    $this->handleToken(self::TAG_OPEN, $buffer);

                    //We are in a plain text now
                    $position = self::POSITION_PLAIN_TEXT;
                    $buffer = '';
                    break;
                case '"':
                    //no break
                case "'":
                    if ($position === self::POSITION_IN_TAG) {
                        //Jumping into argument
                        $position = self::POSITION_IN_QUOTAS;
                        $quotas = $char;
                    } elseif ($position === self::POSITION_IN_QUOTAS && $char === $quotas) {
                        //Jumping from argument
                        $position = self::POSITION_IN_TAG;
                        $quotas = '';
                    }
                    //no break, quote itself has to be added to the buffer
                default:
                    //Checking for invalid characters in tag name or arguments
                    if (
                        $position === self::POSITION_IN_TAG
                        && !preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)
                    ) {
                        $buffer = '<' . $buffer;
                        $position = self::POSITION_PLAIN_TEXT;
                    }

                    $buffer .= $char;
            }
        }

        if ($position !== self::POSITION_PLAIN_TEXT) {
            //Source ended inside of a never closed tag, "<" consumed earlier belongs to the text
            $buffer = '<' . $buffer;
        }

        $this->handleToken(self::PLAIN_TEXT, $buffer);

        return $this->tokens;
    }

    /**
     * Compile token and all it's attributes into string.
     *
     * Plain text and closing tags are returned as their original content. Other tags are rebuilt
     * from name and attributes; attribute values are always wrapped in double quotes and
     * attributes with null value are rendered by name only.
     *
     * @param array $token Token indexed by the TOKEN_* constants.
     * @return string
     */
    public function compile(array $token)
    {
        if (in_array($token[self::TOKEN_TYPE], [self::PLAIN_TEXT, self::TAG_CLOSE], true)) {
            //Nothing to compile
            return $token[self::TOKEN_CONTENT];
        }

        $result = $token[self::TOKEN_NAME];

        $attributes = [];
        if (!empty($token[self::TOKEN_ATTRIBUTES])) {
            foreach ($token[self::TOKEN_ATTRIBUTES] as $attribute => $value) {
                $attributes[] = $value === null
                    ? $attribute
                    : $attribute . '="' . $value . '"';
            }
        }

        if (!empty($attributes)) {
            $result .= ' ' . implode(' ', $attributes);
        }

        if ($token[self::TOKEN_TYPE] === self::TAG_SHORT) {
            $result .= '/';
        }

        return '<' . $result . '>';
    }

    /**
     * Parses tag body for arguments, name, etc.
     *
     * @param string $content Tag content to be parsed (from < till >).
     * @return array Token indexed by the TOKEN_* constants. Content which only looks like a tag
     *               is returned as plain text token (type and content only).
     */
    protected function parseToken($content)
    {
        $content = $this->repairPHP($content);

        $token = [
            self::TOKEN_NAME       => '',
            self::TOKEN_TYPE       => self::TAG_OPEN,
            self::TOKEN_CONTENT    => '<' . $content . '>',
            self::TOKEN_ATTRIBUTES => []
        ];

        //Some parts of text just looks like tags, but their not
        if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
            $token[self::TOKEN_TYPE] = self::PLAIN_TEXT;

            //Same shape as plain text tokens created by handleToken()
            unset($token[self::TOKEN_NAME], $token[self::TOKEN_ATTRIBUTES]);

            return $token;
        }

        //Local PHP isolation, PHP blocks must not be seen by name and attribute detection
        $isolator = new Isolator('-argument-', '-block-', true);

        //No PHP blocks
        $content = $isolator->isolatePHP($content);

        //Parsing arguments, due they already checked for open-close quotas we can use regular expression
        $attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r]*(?:(?P<equal>=)[ \n\t\r]*'
            . '(?P<value>[a-z0-9\-]+|\'[^\']+\'|\'\'|\"[^\"]+\"|\"\"))?/si';
        //todo: need better regexp for quotes

        preg_match_all($attribute, $content, $attributes);

        foreach ($attributes['value'] as $index => $value) {
            if ($value !== '' && ($value[0] === "'" || $value[0] === '"')) {
                //Dropping wrapping quotes
                $value = trim($value, $value[0]);
            }

            //Restoring local php isolation first and global one after it
            $name = $this->repairPHP($isolator->repairPHP($attributes['name'][$index]));

            //Attribute without "=" has no value at all
            $token[self::TOKEN_ATTRIBUTES][$name] = !empty($attributes['equal'][$index])
                ? $this->repairPHP($isolator->repairPHP($value))
                : null;
        }

        //Fetching name: everything before the first whitespace (same whitespace set as in the
        //attribute expression, so multi-line tags are handled as well)
        $name = $isolator->repairPHP(substr($content, 0, strcspn($content, " \n\t\r")));

        if (substr($name, 0, 1) === '/') {
            $token[self::TOKEN_TYPE] = self::TAG_CLOSE;
            unset($token[self::TOKEN_ATTRIBUTES]);
        }

        if (substr($content, -1) === '/') {
            $token[self::TOKEN_TYPE] = self::TAG_SHORT;
        }

        $name = trim($name, '/');
        if (isset($token[self::TOKEN_ATTRIBUTES])) {
            //Tag name is matched by the attribute expression too, it is not an attribute
            unset($token[self::TOKEN_ATTRIBUTES][$name]);
        }

        $token[self::TOKEN_NAME] = trim($name);

        if (
            $token[self::TOKEN_TYPE] === self::TAG_OPEN
            && in_array($token[self::TOKEN_NAME], $this->voidTags, true)
        ) {
            $token[self::TOKEN_TYPE] = self::TAG_VOID;
        }

        return $token;
    }

    /**
     * Handles single token: converts it into token array and adds it to the list of parsed
     * tokens. Empty plain text is skipped.
     *
     * @param string|bool $tokenType self::PLAIN_TEXT for plain text, any other value makes the
     *                               content to be parsed as tag body.
     * @param string      $content   Non parsed token content.
     */
    protected function handleToken($tokenType, $content)
    {
        if ($tokenType === self::PLAIN_TEXT) {
            //Strict check, text "0" is valid content and must not be dropped
            if ((string)$content === '') {
                return;
            }

            $token = [
                self::TOKEN_TYPE    => self::PLAIN_TEXT,
                self::TOKEN_CONTENT => $this->repairPHP($content)
            ];
        } else {
            $token = $this->parseToken($content);
        }

        $this->tokens[] = $token;
    }

    /**
     * Will restore all existing PHP blocks to their original content. Source is returned as is
     * when PHP isolation is disabled.
     *
     * @param string $source
     * @return string
     */
    protected function repairPHP($source)
    {
        if (!$this->isolatePHP) {
            return $source;
        }

        return $this->isolator->repairPHP($source);
    }
}
338
|
|
|
|
It seems that the type of the argument is not accepted by the function/method you are calling.
In some cases, in particular when PHP’s automatic type-juggling kicks in, this might be fine. In other cases, however, it might be a bug.
We suggest adding an explicit type cast, as in the following example: