|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* Spiral Framework. |
|
4
|
|
|
* |
|
5
|
|
|
* @license MIT |
|
6
|
|
|
* @author Anton Titov (Wolfy-J) |
|
7
|
|
|
*/ |
|
8
|
|
|
namespace Spiral\Stempler; |
|
9
|
|
|
|
|
10
|
|
|
use Spiral\Tokenizer\Isolator; |
|
11
|
|
|
|
|
12
|
|
|
/**
 * Performs HTML code tokenization. The class is used by spiral Stempler and can be used for other
 * HTML related operations. HtmlTokenizer is pretty slow! Please don't forget that this is a
 * tokenizer, not a parser: it splits source into a flat list of text and tag tokens and does not
 * build or validate any tree.
 *
 * @todo very old class, improvement required
 */
class HtmlTokenizer
{
    /**
     * Current tokenizer position. Tokenizer is a linear processor (no regular expression is
     * involved in the scanning itself). This slows it down, but the results are much more reliable.
     */
    const POSITION_PLAIN_TEXT = 0x001;
    const POSITION_IN_TAG     = 0x002;
    const POSITION_IN_QUOTAS  = 0x003;

    /**
     * Token types detected and processed by tokenizer.
     */
    const PLAIN_TEXT = 'plain';
    const TAG_OPEN   = 'open';
    const TAG_CLOSE  = 'close';
    const TAG_SHORT  = 'short';
    const TAG_VOID   = 'void';

    /**
     * Token fields. HTML sources can produce a lot of tokens, so every token is a plain array with
     * numeric keys instead of text keys or objects.
     */
    const TOKEN_NAME       = 0;
    const TOKEN_TYPE       = 1;
    const TOKEN_CONTENT    = 2;
    const TOKEN_ATTRIBUTES = 3;

    /**
     * List of void tags.
     *
     * @link http://www.w3.org/TR/html5/syntax.html#void-elements
     * @var array
     */
    protected $voidTags = [
        'area',
        'base',
        'br',
        'col',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
    ];

    /**
     * Array of parsed tokens. Tag tokens have the fields name, type, content and attributes;
     * plain text tokens only have type and content.
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * PHP blocks should be isolated while parsing. Keep enabled.
     *
     * @var bool
     */
    protected $isolatePHP = false;

    /**
     * PHP blocks isolator, which holds all existing PHP blocks and restores them in output.
     *
     * @var Isolator|null
     */
    protected $isolator = null;

    /**
     * @param bool          $isolatePHP When true (default) PHP blocks are isolated before
     *                                  tokenization and restored in the produced tokens.
     * @param Isolator|null $isolator   Isolator to use; a new instance is created when omitted.
     */
    public function __construct($isolatePHP = true, Isolator $isolator = null)
    {
        $this->isolatePHP = $isolatePHP;

        if (empty($isolator)) {
            $isolator = new Isolator();
        }

        $this->isolator = $isolator;
    }

    /**
     * Parse HTML content and return its tokens. Any tokens collected by a previous call are
     * discarded.
     *
     * @param string $source HTML source.
     * @return array List of tokens, each one is an array indexed by the TOKEN_* constants.
     */
    public function parse($source)
    {
        //Cleaning list of already parsed tokens
        $this->tokens = [];

        if ($this->isolatePHP) {
            $source = $this->isolator->isolatePHP($source);
        }

        //Quote character which opened the attribute value we are currently in
        $quotas = '';

        //Text collected since the last emitted token
        $buffer = '';

        $length = strlen($source);
        $position = self::POSITION_PLAIN_TEXT;
        for ($pointer = 0; $pointer < $length; $pointer++) {
            $char = $source[$pointer];
            switch ($char) {
                case '<':
                    if ($position == self::POSITION_IN_QUOTAS) {
                        //Part of an attribute value
                        $buffer .= $char;
                        break;
                    }

                    if ($position == self::POSITION_IN_TAG) {
                        //Previous "<" did not start a real tag, give it back to the text
                        $buffer = '<' . $buffer;
                    }

                    //Handling previous token
                    $this->handleToken(self::PLAIN_TEXT, $buffer);

                    //We are in tag now
                    $position = self::POSITION_IN_TAG;
                    $buffer = '';
                    break;
                case '>':
                    if ($position != self::POSITION_IN_TAG) {
                        //Plain text or part of an attribute value
                        $buffer .= $char;
                        break;
                    }

                    //Token ended, type will be detected from the tag content
                    $this->handleToken(false, $buffer);

                    //We are in a plain text now
                    $position = self::POSITION_PLAIN_TEXT;
                    $buffer = '';
                    break;
                case '"':
                    //no break
                case "'":
                    if ($position == self::POSITION_IN_TAG) {
                        //Jumping into argument
                        $position = self::POSITION_IN_QUOTAS;
                        $quotas = $char;
                    } elseif ($position == self::POSITION_IN_QUOTAS && $char == $quotas) {
                        //Jumping from argument
                        $position = self::POSITION_IN_TAG;
                        $quotas = '';
                    }
                    //no break, quote itself has to be added to the buffer
                default:
                    //Checking for invalid characters in tag name or arguments
                    if ($position == self::POSITION_IN_TAG) {
                        if (!preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)) {
                            //Not a tag after all, continue as a plain text
                            $buffer = '<' . $buffer;
                            $position = self::POSITION_PLAIN_TEXT;
                        }
                    }
                    $buffer .= $char;
            }
        }

        if ($position != self::POSITION_PLAIN_TEXT) {
            //Source ended inside of unfinished tag (or its quoted value), the "<" consumed on
            //entering the tag belongs to the text and must not be lost
            $buffer = '<' . $buffer;
        }

        $this->handleToken(self::PLAIN_TEXT, $buffer);

        return $this->tokens;
    }

    /**
     * Compile token and all its attributes into string.
     *
     * Plain text and closing tag tokens are returned using their stored content; every other
     * token is rebuilt from its name and attributes (attributes with null value are rendered
     * without a value, short tags get a trailing slash).
     *
     * @param array $token Token indexed by the TOKEN_* constants.
     * @return string
     */
    public function compile(array $token)
    {
        $type = $token[self::TOKEN_TYPE];
        if (in_array($type, [self::PLAIN_TEXT, self::TAG_CLOSE], true)) {
            //Nothing to compile
            return $token[self::TOKEN_CONTENT];
        }

        $compiled = $token[self::TOKEN_NAME];

        $pairs = [];
        foreach ($token[self::TOKEN_ATTRIBUTES] as $attribute => $value) {
            //Null value means attribute without a value (for example "disabled")
            $pairs[] = ($value === null) ? $attribute : $attribute . '="' . $value . '"';
        }

        if (!empty($pairs)) {
            $compiled .= ' ' . implode(' ', $pairs);
        }

        if ($type == self::TAG_SHORT) {
            $compiled .= '/';
        }

        return '<' . $compiled . '>';
    }

    /**
     * Parses tag body for attributes, name, etc.
     *
     * Content which does not look like a tag is returned as a plain text token (type and content
     * fields only). Closing tag tokens do not carry the attributes field.
     *
     * @param string $content Tag content to be parsed (from < till >, both excluded).
     * @return array Token indexed by the TOKEN_* constants.
     */
    protected function parseToken($content)
    {
        $token = [
            self::TOKEN_NAME       => '',
            self::TOKEN_TYPE       => self::TAG_OPEN,
            self::TOKEN_CONTENT    => '<' . ($content = $this->repairPHP($content)) . '>',
            self::TOKEN_ATTRIBUTES => []
        ];

        //Some parts of text just look like tags, but they are not
        if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
            $token[self::TOKEN_TYPE] = self::PLAIN_TEXT;

            //Plain text tokens consist of type and content only
            unset($token[self::TOKEN_NAME], $token[self::TOKEN_ATTRIBUTES]);

            return $token;
        }

        //Local PHP isolation
        $isolator = new Isolator('-argument-', '-block-', true);

        //No PHP blocks
        $content = $isolator->isolatePHP($content);

        //Parsing arguments, due they already checked for open-close quotas we can use regular expression
        $attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r]*(?:(?P<equal>=)[ \n\t\r]*'
            . '(?P<value>[a-z0-9\-]+|\'[^\']+\'|\"[^\"]+\"|\"\"))?/si';
        //todo: need better regexp for quotes

        preg_match_all($attribute, $content, $attributes);

        foreach ($attributes['value'] as $index => $value) {
            if ($value && ($value[0] == "'" || $value[0] == '"')) {
                //Removing wrapping quotes
                $value = trim($value, $value[0]);
            }

            //Restoring global php isolation
            $name = $this->repairPHP(
                //Restoring local php isolation
                $isolator->repairPHP($attributes['name'][$index])
            );

            $token[self::TOKEN_ATTRIBUTES][$name] = $this->repairPHP($isolator->repairPHP($value));

            if (empty($attributes['equal'][$index])) {
                //Attribute without a value
                $token[self::TOKEN_ATTRIBUTES][$name] = null;
            }
        }

        //Fetching name, it ends at the first whitespace character (same set of whitespaces
        //as used by the attribute expression above, not only a space)
        $name = $isolator->repairPHP(current(preg_split('/[ \n\t\r]+/', $content, 2)));
        if ($name[0] == '/') {
            $token[self::TOKEN_TYPE] = self::TAG_CLOSE;
            unset($token[self::TOKEN_ATTRIBUTES]);
        }

        if (substr($content, -1) == '/') {
            $token[self::TOKEN_TYPE] = self::TAG_SHORT;
        }

        $token[self::TOKEN_NAME] = $name = trim($name, '/');

        //Attribute expression matches tag name as a value-less attribute, dropping it
        unset($token[self::TOKEN_ATTRIBUTES][$name]);

        $token[self::TOKEN_NAME] = trim($token[self::TOKEN_NAME]);

        if (
            $token[self::TOKEN_TYPE] == self::TAG_OPEN
            && in_array($token[self::TOKEN_NAME], $this->voidTags, true)
        ) {
            $token[self::TOKEN_TYPE] = self::TAG_VOID;
        }

        return $token;
    }

    /**
     * Handles single token and appends it to the list of parsed tokens. Empty plain text is
     * skipped.
     *
     * @param string|bool $tokenType self::PLAIN_TEXT for text; any other value (parse() passes
     *                               false) means tag content whose type has to be detected.
     * @param string      $content   Non parsed token content.
     */
    protected function handleToken($tokenType, $content)
    {
        if ($tokenType === self::PLAIN_TEXT) {
            //Only really empty text is skipped, empty() would also drop the text "0"
            if ((string)$content === '') {
                return;
            }

            $token = [
                self::TOKEN_TYPE    => self::PLAIN_TEXT,
                self::TOKEN_CONTENT => $this->repairPHP($content)
            ];
        } else {
            $token = $this->parseToken($content);
        }

        $this->tokens[] = $token;
    }

    /**
     * Will restore all existing PHP blocks to their original content. Source is returned as is
     * when PHP isolation is disabled.
     *
     * @param string $source
     * @return string
     */
    protected function repairPHP($source)
    {
        if (!$this->isolatePHP) {
            return $source;
        }

        return $this->isolator->repairPHP($source);
    }
}
|
338
|
|
|
|
It seems like the type of the argument is not accepted by the function/method which you are calling.
In some cases, in particular if PHP’s automatic type-juggling kicks in, this might be fine. In other cases, however, this might be a bug.
We suggest adding an explicit type cast, as in the following example: