Completed
Branch feature/pre-split (8b986a)
by Anton
06:31
created

HtmlTokenizer::compile()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 5
nc 2
nop 0
dl 0
loc 9
rs 9.6666
c 0
b 0
f 0
1
<?php
2
/**
3
 * Spiral Framework.
4
 *
5
 * @license   MIT
6
 * @author    Anton Titov (Wolfy-J)
7
 */
8
namespace Spiral\Stempler;
9
10
use Spiral\Tokenizer\Isolator;
11
12
/**
13
 * Perform html code tokenization. Class used for spiral Stempler and can be used for other html
14
 * related operations. HtmlTokenizer is pretty slow! Please don't forget that this is tokenizer,
15
 * not parser.
16
 *
17
 * @todo very old class, improvement is required
18
 */
19
class HtmlTokenizer
{
    /**
     * Current tokenizer position. Tokenizer is a linear processor (no regular expression is
     * involved in the main scan loop, only for per-character validation and attribute parsing).
     * This slows it down, but the results are much more reliable.
     */
    const POSITION_PLAIN_TEXT = 0x001;
    const POSITION_IN_TAG     = 0x002;
    const POSITION_IN_QUOTAS  = 0x003;

    /**
     * Token types detected and processed by tokenizer.
     */
    const PLAIN_TEXT = 'plain';
    const TAG_OPEN   = 'open';
    const TAG_CLOSE  = 'close';
    const TAG_SHORT  = 'short';
    const TAG_VOID   = 'void';

    /**
     * Token fields. There are a lot of tokens in HTML (up to 10,000 different ones). We better to
     * use numeric keys for array than any text fields or even objects.
     *
     * Plain text tokens carry only TOKEN_TYPE and TOKEN_CONTENT; closing tags carry no
     * TOKEN_ATTRIBUTES.
     */
    const TOKEN_NAME       = 0;
    const TOKEN_TYPE       = 1;
    const TOKEN_CONTENT    = 2;
    const TOKEN_ATTRIBUTES = 3;

    /**
     * List of void tags.
     *
     * @link http://www.w3.org/TR/html5/syntax.html#void-elements
     * @var array
     */
    protected $voidTags = [
        'area',
        'base',
        'br',
        'col',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
    ];

    /**
     * Array of parsed tokens. Every token has fields name, type, content and arguments
     * (see the TOKEN_* constants for the keys).
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * PHP block should be isolated while parsing, Keep enabled. The declared default is false,
     * but the constructor always overwrites it (constructor default is true).
     *
     * @var bool
     */
    protected $isolatePHP = false;

    /**
     * PHP Blocks isolator, which holds all existing PHP blocks and restores them in output.
     *
     * @var Isolator|null
     */
    protected $isolator = null;

    /**
     * @param bool     $isolatePHP PHP block should be isolated and enabled by default
     * @param Isolator $isolator   Isolator to use; a new instance is created when omitted.
     */
    public function __construct(bool $isolatePHP = true, Isolator $isolator = null)
    {
        $this->isolatePHP = $isolatePHP;
        $this->isolator = $isolator ?? new Isolator();
    }

    /**
     * Parse HTML content and return it's tokens. Previously parsed tokens are discarded.
     *
     * @param string $source HTML source.
     *
     * @return array List of tokens, each one an array indexed by the TOKEN_* constants.
     */
    public function parse(string $source): array
    {
        //Cleaning list of already parsed tokens
        $this->tokens = [];

        if ($this->isolatePHP) {
            $source = $this->isolator->isolatePHP($source);
        }

        //Quote character which opened the attribute value we are currently in
        $quotas = '';

        //Content collected since the last emitted token
        $buffer = '';

        $length = strlen($source);
        $position = self::POSITION_PLAIN_TEXT;
        for ($pointer = 0; $pointer < $length; $pointer++) {
            $char = $source[$pointer];
            switch ($char) {
                case '<':
                    if ($position === self::POSITION_IN_QUOTAS) {
                        //Part of the attribute value
                        $buffer .= $char;
                        break;
                    }

                    if ($position === self::POSITION_IN_TAG) {
                        //Previous "<" never got closed, it was just a text
                        $buffer = '<' . $buffer;
                    }

                    //Handling previous token
                    $this->handleToken(self::PLAIN_TEXT, $buffer);

                    //We are in tag now
                    $position = self::POSITION_IN_TAG;
                    $buffer = '';
                    break;
                case '>':
                    if ($position !== self::POSITION_IN_TAG) {
                        //Plain text or attribute value
                        $buffer .= $char;
                        break;
                    }

                    //Token ended
                    $this->handleToken(null, $buffer);

                    //We are in a plain text now
                    $position = self::POSITION_PLAIN_TEXT;
                    $buffer = '';
                    break;
                case '"':
                    //no break
                case "'":
                    if ($position === self::POSITION_IN_TAG) {
                        //Jumping into argument
                        $position = self::POSITION_IN_QUOTAS;
                        $quotas = $char;
                    } elseif ($position === self::POSITION_IN_QUOTAS && $char === $quotas) {
                        //Jumping from argument
                        $position = self::POSITION_IN_TAG;
                        $quotas = '';
                    }
                    //no break, quote character itself must be stored in buffer
                default:
                    //Checking for invalid characters in tag name or arguments
                    if ($position === self::POSITION_IN_TAG) {
                        if (!preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)) {
                            //Not a tag, restoring consumed "<" and going back to text
                            $buffer = '<' . $buffer;
                            $position = self::POSITION_PLAIN_TEXT;
                        }
                    }
                    $buffer .= $char;
            }
        }

        if ($position !== self::POSITION_PLAIN_TEXT) {
            //Source ended inside of non closed tag (or it's quoted value), the "<" which opened
            //it was consumed by the loop and must be restored to keep source intact
            $buffer = '<' . $buffer;
        }

        $this->handleToken(self::PLAIN_TEXT, $buffer);

        return $this->tokens;
    }

    /**
     * Compile all parsed tokens back into html form.
     *
     * @return string
     */
    public function compile(): string
    {
        return implode('', array_map([$this, 'compileToken'], $this->tokens));
    }

    /**
     * Compile parsed token. Plain text and closing tags are returned using their original
     * content, other tags are rebuilt from token name and attributes.
     *
     * @param array $token Token indexed by the TOKEN_* constants.
     *
     * @return string
     */
    public function compileToken(array $token): string
    {
        if (in_array($token[self::TOKEN_TYPE], [self::PLAIN_TEXT, self::TAG_CLOSE], true)) {
            //Nothing to compile
            return $token[self::TOKEN_CONTENT];
        }

        $result = $token[self::TOKEN_NAME];

        $attributes = [];
        foreach ($token[self::TOKEN_ATTRIBUTES] as $attribute => $value) {
            //Attributes without value (null) are rendered using their name only
            $attributes[] = $value === null ? $attribute : $attribute . '="' . $value . '"';
        }

        if (!empty($attributes)) {
            $result .= ' ' . join(' ', $attributes);
        }

        if ($token[self::TOKEN_TYPE] === self::TAG_SHORT) {
            $result .= '/';
        }

        return '<' . $result . '>';
    }

    /**
     * Parses tag body for arguments, name, etc.
     *
     * @param string $content Tag content to be parsed (from < till >).
     *
     * @return array Token indexed by the TOKEN_* constants. Content which does not look like
     *               a tag is returned as plain text token (type and content only).
     */
    protected function parseToken(string $content): array
    {
        $content = $this->repairPHP($content);

        $token = [
            self::TOKEN_NAME       => '',
            self::TOKEN_TYPE       => self::TAG_OPEN,
            self::TOKEN_CONTENT    => '<' . $content . '>',
            self::TOKEN_ATTRIBUTES => []
        ];

        //Some parts of text just looks like tags, but their not
        if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
            $token[self::TOKEN_TYPE] = self::PLAIN_TEXT;

            //Same shape as plain text tokens created by handleToken(): type and content only
            unset($token[self::TOKEN_NAME], $token[self::TOKEN_ATTRIBUTES]);

            return $token;
        }

        //Local PHP isolation
        //NOTE(review): static analysis report attached to this file states that this call passes
        //more arguments than Isolator::__construct() declares (starting with true) - confirm
        //against the Isolator class.
        $isolator = new Isolator('-argument-', '-block-', true);

        //No PHP blocks
        $content = $isolator->isolatePHP($content);

        //Parsing arguments, due they already checked for open-close quotas we can use regular expression
        $attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r]*(?:(?P<equal>=)[ \n\t\r]*'
            . '(?P<value>[a-z0-9\-]+|\'[^\']+\'|\"[^\"]+\"|\"\"))?/si';

        preg_match_all($attribute, $content, $attributes);

        foreach ($attributes['value'] as $index => $value) {
            if ($value !== '' && ($value[0] === "'" || $value[0] === '"')) {
                //Dropping surrounding quotes
                $value = trim($value, $value[0]);
            }

            //Local and global php isolation restore
            $name = $this->repairPHP($isolator->repairPHP($attributes['name'][$index]));
            $value = $this->repairPHP($isolator->repairPHP($value));

            //Attributes without "=" have no value at all
            $token[self::TOKEN_ATTRIBUTES][$name] = empty($attributes['equal'][$index])
                ? null
                : $value;
        }

        //Fetching name, it ends on a first whitespace character (tags can be multi-line)
        $parts = preg_split('/[ \n\t\r]/', $content);
        $name = $isolator->repairPHP($parts[0]);

        if (strncmp($name, '/', 1) === 0) {
            $token[self::TOKEN_TYPE] = self::TAG_CLOSE;
            unset($token[self::TOKEN_ATTRIBUTES]);
        }

        if (substr($content, -1) === '/') {
            $token[self::TOKEN_TYPE] = self::TAG_SHORT;
        }

        $token[self::TOKEN_NAME] = $name = trim($name, '/');

        //Attribute expression matches tag name as well, it has to be excluded
        unset($token[self::TOKEN_ATTRIBUTES][$name]);

        $token[self::TOKEN_NAME] = trim($token[self::TOKEN_NAME]);

        if (
            $token[self::TOKEN_TYPE] === self::TAG_OPEN
            && in_array($token[self::TOKEN_NAME], $this->voidTags, true)
        ) {
            $token[self::TOKEN_TYPE] = self::TAG_VOID;
        }

        return $token;
    }

    /**
     * Handles single token and stores it in the list of parsed tokens. Empty plain text is
     * skipped, anything which is not a plain text is parsed as a tag.
     *
     * @param string|null $tokenType Token type, self::PLAIN_TEXT or null for tag content.
     * @param string      $content   Non parsed token content.
     */
    protected function handleToken($tokenType, string $content)
    {
        if ($tokenType === self::PLAIN_TEXT) {
            //Strict check is required, empty() would also drop text "0"
            if ($content === '') {
                return;
            }

            $token = [
                self::TOKEN_TYPE    => self::PLAIN_TEXT,
                self::TOKEN_CONTENT => $this->repairPHP($content)
            ];
        } else {
            $token = $this->parseToken($content);
        }

        $this->tokens[] = $token;
    }

    /**
     * Will restore all existing PHP blocks to their original content. Source is returned as is
     * when PHP isolation is disabled.
     *
     * @param string $source
     *
     * @return string
     */
    protected function repairPHP(string $source): string
    {
        if (!$this->isolatePHP) {
            return $source;
        }

        return $this->isolator->repairPHP($source);
    }
}
353