HtmlTokenizer - Code Metrics - Inspection of "more tests" - spiral/components - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — feature/pre-split (8b986a)

by Anton

created 2017-01-19 15:00 UTC

HtmlTokenizer B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	334
Duplicated Lines	0 %

Coupling/Cohesion

Components	1
Dependencies	1

Importance

Changes

Metric	Value
dl	0
loc	334
rs	8.2769
c	0
b	0
f	0
wmc	41
lcom	1
cbo	1

7 Methods

Rating	Name	Size	Complexity
A	__construct()	5	2
C	parse()	75	15
A	compile()	9	2
B	compileToken()	28	6
C	parseToken()	69	11
A	handleToken()	17	3
A	repairPHP()	8	2

How to fix Complexity

<?php
/**
 * Spiral Framework.
 *
 * @license   MIT
 * @author    Anton Titov (Wolfy-J)
 */
namespace Spiral\Stempler;

use Spiral\Tokenizer\Isolator;

/**
 * Perform html code tokenization. Class used for spiral Stempler and can be used for other html
 * related operations. HtmlTokenizer is pretty slow! Please don't forget that this is tokenizer,
 * not parser.
 *
 * @todo very old class, improvement is required
 */
class HtmlTokenizer
{
    /**
     * Current tokenizer position. The main scanning loop is a linear, character-by-character
     * state machine; these constants are its states.
     */
    const POSITION_PLAIN_TEXT = 0x001;
    const POSITION_IN_TAG     = 0x002;
    const POSITION_IN_QUOTAS  = 0x003;

    /**
     * Token types detected and processed by tokenizer.
     */
    const PLAIN_TEXT = 'plain';
    const TAG_OPEN   = 'open';
    const TAG_CLOSE  = 'close';
    const TAG_SHORT  = 'short';
    const TAG_VOID   = 'void';

    /**
     * Token fields. A document may produce a lot of tokens, so tokens are plain arrays indexed by
     * these numeric keys rather than by text keys or objects.
     */
    const TOKEN_NAME       = 0;
    const TOKEN_TYPE       = 1;
    const TOKEN_CONTENT    = 2;
    const TOKEN_ATTRIBUTES = 3;

    /**
     * List of void tags (tags which never have a closing pair).
     *
     * @link http://www.w3.org/TR/html5/syntax.html#void-elements
     * @var array
     */
    protected $voidTags = [
        'area',
        'base',
        'br',
        'col',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
    ];

    /**
     * Array of parsed tokens. Tag tokens carry name, type, content and attributes; plain text
     * tokens carry only type and content.
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * When true, PHP blocks are isolated before parsing and restored in the produced tokens.
     * The constructor enables this by default.
     *
     * @var bool
     */
    protected $isolatePHP = false;

    /**
     * PHP Blocks isolator, which holds all existing PHP blocks and restores them in output.
     *
     * @var Isolator|null
     */
    protected $isolator = null;

    /**
     * @param bool     $isolatePHP PHP blocks should be isolated while parsing (enabled by default).
     * @param Isolator $isolator   Isolator to use; a new instance is created when omitted.
     */
    public function __construct(bool $isolatePHP = true, Isolator $isolator = null)
    {
        $this->isolatePHP = $isolatePHP;
        $this->isolator = $isolator ?? new Isolator();
    }

    /**
     * Parse HTML content and return its tokens. Previously parsed tokens are discarded.
     *
     * @param string $source HTML source.
     *
     * @return array List of tokens, every token is an array indexed by the TOKEN_* constants.
     */
    public function parse(string $source): array
    {
        //Cleaning list of already parsed tokens
        $this->tokens = [];

        if ($this->isolatePHP) {
            $source = $this->isolator->isolatePHP($source);
        }

        $quotas = '';
        $buffer = '';

        $length = strlen($source);
        $position = self::POSITION_PLAIN_TEXT;
        for ($pointer = 0; $pointer < $length; $pointer++) {
            $char = $source[$pointer];
            switch ($char) {
                case '<':
                    if ($position === self::POSITION_IN_QUOTAS) {
                        //Part of an attribute value
                        $buffer .= $char;
                        break;
                    }

                    if ($position === self::POSITION_IN_TAG) {
                        //Previous "<" did not open a real tag, giving it back to the text
                        $buffer = '<' . $buffer;
                    }

                    //Handling previous token
                    $this->handleToken(self::PLAIN_TEXT, $buffer);

                    //We are in tag now
                    $position = self::POSITION_IN_TAG;
                    $buffer = '';
                    break;
                case '>':
                    if ($position !== self::POSITION_IN_TAG) {
                        //Plain text or attribute value
                        $buffer .= $char;
                        break;
                    }

                    //Token ended
                    $this->handleToken(null, $buffer);

                    //We are in a plain text now
                    $position = self::POSITION_PLAIN_TEXT;
                    $buffer = '';
                    break;
                case '"':
                    //no break
                case "'":
                    if ($position === self::POSITION_IN_TAG) {
                        //Jumping into argument
                        $position = self::POSITION_IN_QUOTAS;
                        $quotas = $char;
                    } elseif ($position === self::POSITION_IN_QUOTAS && $char === $quotas) {
                        //Jumping from argument
                        $position = self::POSITION_IN_TAG;
                        $quotas = '';
                    }
                    //no break, quote itself must be validated and buffered as any other character
                default:
                    //Checking for invalid characters in tag name or arguments
                    if (
                        $position === self::POSITION_IN_TAG
                        && !preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)
                    ) {
                        //Not a tag, restoring consumed "<" and treating everything as text
                        $buffer = '<' . $buffer;
                        $position = self::POSITION_PLAIN_TEXT;
                    }

                    $buffer .= $char;
            }
        }

        if ($position !== self::POSITION_PLAIN_TEXT) {
            //Source ended inside of unterminated tag (or its quoted value), consumed "<" must be
            //restored or it will be lost in the resulted tokens
            $buffer = '<' . $buffer;
        }

        $this->handleToken(self::PLAIN_TEXT, $buffer);

        return $this->tokens;
    }

    /**
     * Compile all parsed tokens back into html form.
     *
     * @return string
     */
    public function compile(): string
    {
        $result = '';
        foreach ($this->tokens as $token) {
            $result .= $this->compileToken($token);
        }

        return $result;
    }

    /**
     * Compile parsed token. Plain text and closing tags are returned as their original content,
     * other tags are rebuilt from their name and attributes.
     *
     * @param array $token Token indexed by the TOKEN_* constants.
     *
     * @return string
     */
    public function compileToken(array $token): string
    {
        if (in_array($token[self::TOKEN_TYPE], [self::PLAIN_TEXT, self::TAG_CLOSE], true)) {
            //Nothing to compile
            return $token[self::TOKEN_CONTENT];
        }

        $result = $token[self::TOKEN_NAME];

        //Attributes may be missing (for example closing tag written in a short form)
        $attributes = [];
        foreach ($token[self::TOKEN_ATTRIBUTES] ?? [] as $attribute => $value) {
            if ($value === null) {
                //Attribute without value
                $attributes[] = $attribute;
                continue;
            }

            $attributes[] = $attribute . '="' . $value . '"';
        }

        if (!empty($attributes)) {
            $result .= ' ' . join(' ', $attributes);
        }

        if ($token[self::TOKEN_TYPE] === self::TAG_SHORT) {
            $result .= '/';
        }

        return '<' . $result . '>';
    }

    /**
     * Parses tag body for arguments, name, etc.
     *
     * @param string $content Tag content to be parsed (from < till >).
     *
     * @return array Token indexed by the TOKEN_* constants. Content which only looks like a tag
     *               is returned as plain text token (type and content only).
     */
    protected function parseToken(string $content): array
    {
        $content = $this->repairPHP($content);

        $token = [
            self::TOKEN_NAME       => '',
            self::TOKEN_TYPE       => self::TAG_OPEN,
            self::TOKEN_CONTENT    => '<' . $content . '>',
            self::TOKEN_ATTRIBUTES => []
        ];

        //Some parts of text just look like tags, but they are not
        if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
            //Same shape as plain text tokens created in handleToken()
            $token[self::TOKEN_TYPE] = self::PLAIN_TEXT;
            unset($token[self::TOKEN_NAME], $token[self::TOKEN_ATTRIBUTES]);

            return $token;
        }

        //Local PHP isolation (PHP blocks were restored above and must not confuse attribute regexp)
        $isolator = new Isolator('-argument-', '-block-');

        //No PHP blocks
        $content = $isolator->isolatePHP($content);

        //Parsing arguments, due they already checked for open-close quotas we can use regular expression
        $attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r]*(?:(?P<equal>=)[ \n\t\r]*'
            . '(?P<value>[a-z0-9\-]+|\'[^\']+\'|\"[^\"]+\"|\"\"))?/si';

        preg_match_all($attribute, $content, $attributes);

        foreach ($attributes['value'] as $index => $value) {
            if ($value !== '' && ($value[0] === "'" || $value[0] === '"')) {
                //Removing wrapping quotes
                $value = trim($value, $value[0]);
            }

            //Local and global php isolation restore
            $name = $this->repairPHP($isolator->repairPHP($attributes['name'][$index]));

            $token[self::TOKEN_ATTRIBUTES][$name] = $this->repairPHP($isolator->repairPHP($value));

            if (empty($attributes['equal'][$index])) {
                //Attribute without value
                $token[self::TOKEN_ATTRIBUTES][$name] = null;
            }
        }

        //Fetching name
        $name = $isolator->repairPHP(current(explode(' ', $content)));
        if (isset($name[0]) && $name[0] === '/') {
            $token[self::TOKEN_TYPE] = self::TAG_CLOSE;
            unset($token[self::TOKEN_ATTRIBUTES]);
        }

        if (substr($content, -1) === '/') {
            $token[self::TOKEN_TYPE] = self::TAG_SHORT;
        }

        //Attribute regexp matches tag name as value-less attribute, dropping it
        $token[self::TOKEN_NAME] = $name = trim($name, '/');
        unset($token[self::TOKEN_ATTRIBUTES][$name]);

        $token[self::TOKEN_NAME] = trim($token[self::TOKEN_NAME]);

        if (
            $token[self::TOKEN_TYPE] === self::TAG_OPEN
            && in_array($token[self::TOKEN_NAME], $this->voidTags, true)
        ) {
            $token[self::TOKEN_TYPE] = self::TAG_VOID;
        }

        return $token;
    }

    /**
     * Handles single token and appends it to the list of parsed tokens. Empty plain text is
     * skipped, anything which is not plain text is parsed as a tag.
     *
     * @param string|null $tokenType Token type, self::PLAIN_TEXT or null for tags.
     * @param string      $content   Non parsed token content.
     */
    protected function handleToken($tokenType, string $content)
    {
        if ($tokenType === self::PLAIN_TEXT) {
            //Strict check, text like "0" is valid content and must not be dropped
            if ($content === '') {
                return;
            }

            $token = [
                self::TOKEN_TYPE    => self::PLAIN_TEXT,
                self::TOKEN_CONTENT => $this->repairPHP($content)
            ];
        } else {
            $token = $this->parseToken($content);
        }

        $this->tokens[] = $token;
    }

    /**
     * Will restore all existing PHP blocks to their original content. Source is returned as is
     * when PHP isolation is disabled.
     *
     * @param string $source
     *
     * @return string
     */
    protected function repairPHP(string $source): string
    {
        if (!$this->isolatePHP) {
            return $source;
        }

        return $this->isolator->repairPHP($source);
    }
}


1			<?php
2			/**
3			* Spiral Framework.
4			*
5			* @license MIT
6			* @author Anton Titov (Wolfy-J)
7			*/
8			namespace Spiral\Stempler;
9
10			use Spiral\Tokenizer\Isolator;
11
12			/**
13			* Perform html code tokenization. Class used for spiral Stempler and can be used for other html
14			* related operations. HtmlTokenizer is pretty slow! Please don't forget that this is tokenizer,
15			* not parser.
16			*
17			* @todo very old class, improvement is required
18			*/
19			class HtmlTokenizer
20			{
21			/**
22			* Current tokenizer position. Tokenizer is a linear processor (no regular expression is
23			* involved). This slows it down, but the results are much more reliable.
24			*/
25			const POSITION_PLAIN_TEXT = 0x001;
26			const POSITION_IN_TAG = 0x002;
27			const POSITION_IN_QUOTAS = 0x003;
28
29			/**
30			* Token types detected and processed by tokenizer.
31			*/
32			const PLAIN_TEXT = 'plain';
33			const TAG_OPEN = 'open';
34			const TAG_CLOSE = 'close';
35			const TAG_SHORT = 'short';
36			const TAG_VOID = 'void';
37
38			/**
39			* Token fields. There are a lot of tokens in HTML (up to 10,000 different ones). We better to
40			* use numeric keys for array than any text fields or even objects.
41			*/
42			const TOKEN_NAME = 0;
43			const TOKEN_TYPE = 1;
44			const TOKEN_CONTENT = 2;
45			const TOKEN_ATTRIBUTES = 3;
46
47			/**
48			* List of void tags.
49			*
50			* @link http://www.w3.org/TR/html5/syntax.html#void-elements
51			* @var array
52			*/
53			protected $voidTags = [
54			'area',
55			'base',
56			'br',
57			'col',
58			'embed',
59			'hr',
60			'img',
61			'input',
62			'keygen',
63			'link',
64			'meta',
65			'param',
66			'source',
67			'track',
68			'wbr'
69			];
70
71			/**
72			* Array of parsed tokens. Every token has fields name, type, content and arguments.
73			*
74			* @var array
75			*/
76			protected $tokens = [];
77
78			/**
79			* PHP block should be isolated while parsing, Keep enabled.
80			*
81			* @var bool
82			*/
83			protected $isolatePHP = false;
84
85			/**
86			* PHP Blocks isolator, which holds all existing PHP blocks and restores them in output.
87			*
88			* @var Isolator|null
89			*/
90			protected $isolator = null;
91
92			/**
93			* @param bool $isolatePHP PHP block should be isolated and enabled by default
94			* @param Isolator $isolator
95			*/
96			public function __construct(bool $isolatePHP = true, Isolator $isolator = null)
97			{
98			$this->isolatePHP = $isolatePHP;
99			$this->isolator = !empty($isolator) ? $isolator : new Isolator();
100			}
101
102			/**
103			* Parse HTML content and return it's tokens.
104			*
105			* @param string $source HTML source.
106			*
107			* @return array
108			*/
109			public function parse(string $source): array
110			{
111			//Cleaning list of already parsed tokens
112			$this->tokens = [];
113
114			if ($this->isolatePHP) {
115			$source = $this->isolator->isolatePHP($source);
116			}
117
118			$quotas = '';
119			$buffer = '';
120
121			$length = strlen($source);
122			$position = self::POSITION_PLAIN_TEXT;
123			for ($pointer = 0; $pointer < $length; $pointer++) {
124			$char = $source[$pointer];
125			switch ($char) {
126			case '<':
127			if ($position == self::POSITION_IN_QUOTAS) {
128			$buffer .= $char;
129			break;
130			}
131
132			if ($position == self::POSITION_IN_TAG) {
133			$buffer = '<' . $buffer;
134			}
135
136			//Handling previous token
137			$this->handleToken(self::PLAIN_TEXT, $buffer);
138
139			//We are in tag now
140			$position = self::POSITION_IN_TAG;
141			$buffer = '';
142			break;
143			case '>':
144			if ($position != self::POSITION_IN_TAG) {
145			$buffer .= $char;
146			break;
147			}
148
149			//Token ended
150			$this->handleToken(null, $buffer);
151
152			//We are in a plain text now
153			$position = self::POSITION_PLAIN_TEXT;
154			$buffer = '';
155			break;
156			case '"':
157			//no break
158			case "'":
159			if ($position == self::POSITION_IN_TAG) {
160			//Jumping into argument
161			$position = self::POSITION_IN_QUOTAS;
162			$quotas = $char;
163			} elseif ($position == self::POSITION_IN_QUOTAS && $char == $quotas) {
164			//Jumping from argument
165			$position = self::POSITION_IN_TAG;
166			$quotas = '';
167			}
168			default:
169			//Checking for invalid characters in tag name or arguments
170			if ($position == self::POSITION_IN_TAG) {
171			if (!preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)) {
172			$buffer = '<' . $buffer;
173			$position = self::POSITION_PLAIN_TEXT;
174			}
175			}
176			$buffer .= $char;
177			}
178			}
179
180			$this->handleToken(self::PLAIN_TEXT, $buffer);
181
182			return $this->tokens;
183			}
184
185			/**
186			* Compile all parsed tokens back into html form.
187			*
188			* @return string
189			*/
190			public function compile(): string
191			{
192			$result = '';
193			foreach ($this->tokens as $token) {
194			$result .= $this->compileToken($token);
195			}
196
197			return $result;
198			}
199
200			/**
201			* Compile parsed token.
202			*
203			* @param array $token
204			*
205			* @return string
206			*/
207			public function compileToken(array $token): string
208			{
209			if (in_array($token[self::TOKEN_TYPE], [self::PLAIN_TEXT, self::TAG_CLOSE])) {
210			//Nothing to compile
211			return $token[HtmlTokenizer::TOKEN_CONTENT];
212			}
213
214			$result = $token[HtmlTokenizer::TOKEN_NAME];
215			$attributes = [];
216			foreach ($token[self::TOKEN_ATTRIBUTES] as $attribute => $value) {
217			if ($value === null) {
218			$attributes[] = $attribute;
219			continue;
220			}
221
222			$attributes[] = $attribute . '="' . $value . '"';
223			}
224
225			if (!empty($attributes)) {
226			$result .= ' ' . join(' ', $attributes);
227			}
228
229			if ($token[HtmlTokenizer::TOKEN_TYPE] == HtmlTokenizer::TAG_SHORT) {
230			$result .= '/';
231			}
232
233			return '<' . $result . '>';
234			}
235
236			/**
237			* Parses tag body for arguments, name, etc.
238			*
239			* @param string $content Tag content to be parsed (from < till >).
240			*
241			* @return array
242			*/
243			protected function parseToken(string $content): array
244			{
245			$token = [
246			self::TOKEN_NAME => '',
247			self::TOKEN_TYPE => self::TAG_OPEN,
248			self::TOKEN_CONTENT => '<' . ($content = $this->repairPHP($content)) . '>',
249			self::TOKEN_ATTRIBUTES => []
250			];
251
252			//Some parts of text just looks like tags, but their not
253			if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
254			$token[self::TOKEN_TYPE] = self::PLAIN_TEXT;
255			unset($token[self::TOKEN_NAME], $token[self::TOKEN_NAME]);
256
257			return $token;
258			}
259
260			//Local PHP isolation
261			$isolator = new Isolator('-argument-', '-block-', true);
			0 ignored issues – show Unused Code introduced 2016-01-22 00:54 UTC by Report Bug Copy Issue Report The call to `Isolator::__construct()` has too many arguments starting with `true`. This check compares calls to functions or methods with their respective definitions. If the call has more arguments than are defined, it raises an issue. If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this has been known to happen is Wordpress. In this case you can add the `@ignore` PhpDoc annotation to the duplicate definition and it will be ignored. Loading history...
262
263			//No PHP blocks
264			$content = $isolator->isolatePHP($content);
265
266			//Parsing arguments, due they already checked for open-close quotas we can use regular expression
267			$attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r]*(?:(?P<equal>=)[ \n\t\r]*'
268			. '(?P<value>[a-z0-9\-]+|\'[^\']+\'|\"[^\"]+\"|\"\"))?/si';
269
270			preg_match_all($attribute, $content, $attributes);
271
272			foreach ($attributes['value'] as $index => $value) {
273			if ($value && ($value{0} == "'" || $value{0} == '"')) {
274			$value = trim($value, $value{0});
275			}
276
277			//Local and global php isolation restore
278			$name = $this->repairPHP($isolator->repairPHP($attributes['name'][$index]));
279
280			$token[self::TOKEN_ATTRIBUTES][$name] = $this->repairPHP($isolator->repairPHP($value));
281
282			if (empty($attributes['equal'][$index])) {
283			$token[self::TOKEN_ATTRIBUTES][$name] = null;
284			}
285			}
286
287			//Fetching name
288			$name = $isolator->repairPHP(current(explode(' ', $content)));
289			if ($name{0} == '/') {
290			$token[self::TOKEN_TYPE] = self::TAG_CLOSE;
291			unset($token[self::TOKEN_ATTRIBUTES]);
292			}
293
294			if ($content{strlen($content) - 1} == '/') {
295			$token[self::TOKEN_TYPE] = self::TAG_SHORT;
296			}
297
298			$token[self::TOKEN_NAME] = $name = trim($name, '/');
299			unset($token[self::TOKEN_ATTRIBUTES][$name]);
300
301			$token[self::TOKEN_NAME] = trim($token[self::TOKEN_NAME]);
302
303			if (
304			$token[self::TOKEN_TYPE] == self::TAG_OPEN
305			&& in_array($token[self::TOKEN_NAME], $this->voidTags)
306			) {
307			$token[self::TOKEN_TYPE] = self::TAG_VOID;
308			}
309
310			return $token;
311			}
312
313			/**
314			* Handles single token and passes it to a callback function if specified.
315			*
316			* @param int|null $tokenType Token type.
317			* @param string $content Non parsed token content.
318			*/
319			protected function handleToken($tokenType, string $content)
320			{
321			if ($tokenType == self::PLAIN_TEXT) {
322			if (empty($content)) {
323			return;
324			}
325
326			$token = [
327			self::TOKEN_TYPE => self::PLAIN_TEXT,
328			self::TOKEN_CONTENT => $this->repairPHP($content)
329			];
330			} else {
331			$token = $this->parseToken($content);
332			}
333
334			$this->tokens[] = $token;
335			}
336
337			/**
338			* Will restore all existing PHP blocks to their original content.
339			*
340			* @param string $source
341			*
342			* @return string
343			*/
344			protected function repairPHP(string $source): string
345			{
346			if (!$this->isolatePHP) {
347			return $source;
348			}
349
350			return $this->isolator->repairPHP($source);
351			}
352			}
353

spiral / components

Branch — feature/pre-split (8b986a)

HtmlTokenizer B

Complexity

Size/Duplication

Coupling/Cohesion

Importance

7 Methods

How to fix Complexity

Complex Class

Duplication Side-by-Side

Filter issues like