Completed
Branch develop (c2aa4c)
by Anton
05:17
created

HtmlTokenizer::parse()   C

Complexity

Conditions 15
Paths 54

Size

Total Lines 75
Code Lines 45

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 75
rs 5.3122
cc 15
eloc 45
nc 54
nop 1

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/**
3
 * Spiral Framework.
4
 *
5
 * @license   MIT
6
 * @author    Anton Titov (Wolfy-J)
7
 */
8
namespace Spiral\Stempler;
9
10
use Spiral\Tokenizer\Isolator;
11
12
/**
13
 * Performs HTML code tokenization. The class is used by Spiral Stempler and can be used for other HTML
14
 * related operations. HtmlTokenizer is pretty slow! Please don't forget that this is a tokenizer,
15
 * not parser.
16
 *
17
 * @todo very old class, improvement required
18
 */
19
class HtmlTokenizer
20
{
21
    /**
22
     * Current tokenizer position. Tokenizer is a linear processor (no regular expression is
23
     * involved). This slows it down, but the results are much more reliable.
24
     */
25
    const POSITION_PLAIN_TEXT = 0x001;
26
    const POSITION_IN_TAG     = 0x002;
27
    const POSITION_IN_QUOTAS  = 0x003;
28
29
    /**
30
     * Token types detected and processed by tokenizer.
31
     */
32
    const PLAIN_TEXT = 'plain';
33
    const TAG_OPEN   = 'open';
34
    const TAG_CLOSE  = 'close';
35
    const TAG_SHORT  = 'short';
36
    const TAG_VOID   = 'void';
37
38
    /**
39
     * Token fields. There can be a lot of tokens in HTML (up to 10,000 different ones), so it is better to
40
     * use numeric array keys than any text fields or even objects.
41
     */
42
    const TOKEN_NAME       = 0;
43
    const TOKEN_TYPE       = 1;
44
    const TOKEN_CONTENT    = 2;
45
    const TOKEN_ATTRIBUTES = 3;
46
47
    /**
48
     * List of void tags.
49
     *
50
     * @link http://www.w3.org/TR/html5/syntax.html#void-elements
51
     * @var array
52
     */
53
    protected $voidTags = [
54
        'area',
55
        'base',
56
        'br',
57
        'col',
58
        'embed',
59
        'hr',
60
        'img',
61
        'input',
62
        'keygen',
63
        'link',
64
        'meta',
65
        'param',
66
        'source',
67
        'track',
68
        'wbr'
69
    ];
70
71
    /**
72
     * Array of parsed tokens. Every token has the fields name, type, content and attributes.
73
     *
74
     * @var array
75
     */
76
    protected $tokens = [];
77
78
    /**
79
     * PHP blocks should be isolated while parsing. Keep enabled.
80
     *
81
     * @var bool
82
     */
83
    protected $isolatePHP = false;
84
85
    /**
86
     * PHP Blocks isolator, which holds all existing PHP blocks and restores them in output.
87
     *
88
     * @var Isolator|null
89
     */
90
    protected $isolator = null;
91
92
    /**
     * Configure the tokenizer.
     *
     * @param bool          $isolatePHP Whether PHP blocks are isolated before parsing (enabled by
     *                                  default).
     * @param Isolator|null $isolator   Isolator to work with; a fresh Isolator instance is created
     *                                  when none (or an empty value) is given.
     */
    public function __construct($isolatePHP = true, Isolator $isolator = null)
    {
        if (empty($isolator)) {
            //No isolator supplied, falling back to a default one
            $isolator = new Isolator();
        }

        $this->isolator = $isolator;
        $this->isolatePHP = $isolatePHP;
    }
101
102
    /**
     * Parse HTML content and return its tokens.
     *
     * The source is walked character by character while tracking whether the pointer is in plain
     * text, inside a tag or inside a quoted attribute value. Previously collected tokens are
     * discarded at the beginning of every call.
     *
     * @param string $source HTML source.
     * @return array List of tokens, every token is an array indexed by the TOKEN_* constants.
     */
    public function parse($source)
    {
        //Cleaning list of already parsed tokens
        $this->tokens = [];

        if ($this->isolatePHP) {
            $source = $this->isolator->isolatePHP($source);
        }

        //Quote character which opened the current attribute value (if any)
        $quotas = '';
        $buffer = '';

        $length = strlen($source);
        $position = self::POSITION_PLAIN_TEXT;
        for ($pointer = 0; $pointer < $length; $pointer++) {
            $char = $source[$pointer];
            switch ($char) {
                case '<':
                    if ($position == self::POSITION_IN_QUOTAS) {
                        //Part of an attribute value
                        $buffer .= $char;
                        break;
                    }

                    if ($position == self::POSITION_IN_TAG) {
                        //Previous "<" did not start a real tag, giving it back to the text
                        $buffer = '<' . $buffer;
                    }

                    //Handling previous token
                    $this->handleToken(self::PLAIN_TEXT, $buffer);

                    //We are in tag now
                    $position = self::POSITION_IN_TAG;
                    $buffer = '';
                    break;
                case '>':
                    if ($position != self::POSITION_IN_TAG) {
                        $buffer .= $char;
                        break;
                    }

                    //Token ended; any type other than PLAIN_TEXT sends the buffer to the tag
                    //parser, which detects the final token type by itself
                    $this->handleToken(self::TAG_OPEN, $buffer);

                    //We are in a plain text now
                    $position = self::POSITION_PLAIN_TEXT;
                    $buffer = '';
                    break;
                case '"':
                    //no break
                case "'":
                    if ($position == self::POSITION_IN_TAG) {
                        //Jumping into argument
                        $position = self::POSITION_IN_QUOTAS;
                        $quotas = $char;
                    } elseif ($position == self::POSITION_IN_QUOTAS && $char == $quotas) {
                        //Jumping from argument
                        $position = self::POSITION_IN_TAG;
                        $quotas = '';
                    }
                    //no break, the quote itself has to be added to the buffer
                default:
                    //Checking for invalid characters in tag name or arguments
                    if ($position == self::POSITION_IN_TAG) {
                        if (!preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)) {
                            //Not a tag after all, restoring "<" and treating everything as text
                            $buffer = '<' . $buffer;
                            $position = self::POSITION_PLAIN_TEXT;
                        }
                    }
                    $buffer .= $char;
            }
        }

        if ($position != self::POSITION_PLAIN_TEXT) {
            //Source ended inside of an unclosed tag, the "<" which opened it was never added to
            //the buffer and must be restored, otherwise it will vanish from the output
            $buffer = '<' . $buffer;
        }

        $this->handleToken(self::PLAIN_TEXT, $buffer);

        return $this->tokens;
    }
183
184
    /**
     * Build the string representation of a token, including all of its attributes.
     *
     * Plain text and closing tag tokens are returned as their stored content; every other token
     * is assembled again from its name and attributes.
     *
     * @param array $token Token indexed by the TOKEN_* constants.
     * @return string
     */
    public function compile(array $token)
    {
        $type = $token[self::TOKEN_TYPE];
        if ($type == self::PLAIN_TEXT || $type == self::TAG_CLOSE) {
            //Nothing to compile
            return $token[self::TOKEN_CONTENT];
        }

        //Tag name goes first, followed by the attributes
        $chunks = [$token[self::TOKEN_NAME]];
        foreach ($token[self::TOKEN_ATTRIBUTES] as $attribute => $value) {
            //Attributes with null value are rendered without a value part
            $chunks[] = ($value === null) ? $attribute : $attribute . '="' . $value . '"';
        }

        $body = implode(' ', $chunks);
        if ($type == self::TAG_SHORT) {
            $body .= '/';
        }

        return '<' . $body . '>';
    }
218
219
    /**
     * Parses tag body for arguments, name, etc.
     *
     * Content which only looks like a tag is returned as a PLAIN_TEXT token carrying only the
     * type and content fields. Closing tag tokens carry no attributes field. Attributes declared
     * without a value (no "=") are stored with null value.
     *
     * @param string $content Tag content to be parsed (from < till >).
     * @return array Token indexed by the TOKEN_* constants.
     */
    protected function parseToken($content)
    {
        $token = [
            self::TOKEN_NAME       => '',
            self::TOKEN_TYPE       => self::TAG_OPEN,
            self::TOKEN_CONTENT    => '<' . ($content = $this->repairPHP($content)) . '>',
            self::TOKEN_ATTRIBUTES => []
        ];

        //Some parts of text just look like tags, but they are not
        if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
            $token[self::TOKEN_TYPE] = self::PLAIN_TEXT;

            //Plain text tokens only carry type and content
            unset($token[self::TOKEN_NAME], $token[self::TOKEN_ATTRIBUTES]);

            return $token;
        }

        //Local PHP isolation
        $isolator = new Isolator('-argument-', '-block-', true);

        //No PHP blocks
        $content = $isolator->isolatePHP($content);

        //Parsing arguments, due they already checked for open-close quotas we can use regular
        //expression. Quoted values may be empty for both quote types.
        $attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r]*(?:(?P<equal>=)[ \n\t\r]*'
            . '(?P<value>[a-z0-9\-]+|\'[^\']*\'|"[^"]*"))?/si';
        //todo: need better regexp for quotes

        preg_match_all($attribute, $content, $attributes);

        foreach ($attributes['name'] as $index => $attributeName) {
            //Restoring local and then global php isolation
            $attributeName = $this->repairPHP($isolator->repairPHP($attributeName));

            if (empty($attributes['equal'][$index])) {
                //Attribute without value
                $token[self::TOKEN_ATTRIBUTES][$attributeName] = null;
                continue;
            }

            $value = $attributes['value'][$index];
            if ($value !== '' && ($value[0] == "'" || $value[0] == '"')) {
                //Removing wrapping quotes
                $value = trim($value, $value[0]);
            }

            $token[self::TOKEN_ATTRIBUTES][$attributeName] = $this->repairPHP(
                $isolator->repairPHP($value)
            );
        }

        //Fetching name: everything before the first whitespace (space, tab or line break).
        //The expression check above guarantees that content starts with a non whitespace
        //character, so the name is never empty.
        $rawName = $isolator->repairPHP(substr($content, 0, strcspn($content, " \t\r\n")));
        $name = trim($rawName, '/');

        //Attribute expression matches the tag name as well, it has to be dropped
        unset($token[self::TOKEN_ATTRIBUTES][$name]);
        $token[self::TOKEN_NAME] = trim($name);

        if ($rawName[0] == '/') {
            $token[self::TOKEN_TYPE] = self::TAG_CLOSE;
            unset($token[self::TOKEN_ATTRIBUTES]);
        }

        if (substr($content, -1) == '/') {
            $token[self::TOKEN_TYPE] = self::TAG_SHORT;
        }

        if (
            $token[self::TOKEN_TYPE] == self::TAG_OPEN
            && in_array($token[self::TOKEN_NAME], $this->voidTags, true)
        ) {
            $token[self::TOKEN_TYPE] = self::TAG_VOID;
        }

        return $token;
    }
298
299
    /**
     * Handles single token and adds it to the list of parsed tokens.
     *
     * Plain text is stored as is (with PHP blocks restored), empty plain text is skipped. Content
     * given with any other token type is sent to parseToken().
     *
     * @param string|bool $tokenType Token type, self::PLAIN_TEXT for plain text; any other value
     *                               (including false) marks tag content.
     * @param string      $content   Non parsed token content.
     */
    protected function handleToken($tokenType, $content)
    {
        if ($tokenType == self::PLAIN_TEXT) {
            //Only truly empty buffers are skipped, empty() would also drop the text "0"
            if ((string)$content === '') {
                return;
            }

            $token = [
                self::TOKEN_TYPE    => self::PLAIN_TEXT,
                self::TOKEN_CONTENT => $this->repairPHP($content)
            ];
        } else {
            $token = $this->parseToken($content);
        }

        $this->tokens[] = $token;
    }
322
323
    /**
     * Restore PHP blocks in the given source using the tokenizer isolator. The source is returned
     * untouched when PHP isolation is disabled.
     *
     * @param string $source
     * @return string
     */
    protected function repairPHP($source)
    {
        return $this->isolatePHP ? $this->isolator->repairPHP($source) : $source;
    }
337
}
338