HtmlTokenizer - Code Metrics - Inspection of "minor refactoring in storages" - spiral/components - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — dbal-improvement (06db1a)

by Anton

created 2015-12-04 23:00 UTC

HtmlTokenizer B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	319
Duplicated Lines	0 %

Coupling/Cohesion

Components	1
Dependencies	1

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
wmc	39
c	1
b	0
f	0
lcom	1
cbo	1
dl	0
loc	319
rs	8.2857

6 Methods

Rating	Name	Size	Complexity
A	__construct()	5	2
C	parse()	75	15
B	compile()	28	6
C	parseToken()	73	11
A	handleToken()	17	3
A	repairPHP()	8	2

<?php
/**
 * Spiral Framework.
 *
 * @license   MIT
 * @author    Anton Titov (Wolfy-J)
 */
namespace Spiral\Stempler;

use Spiral\Tokenizer\Isolator;

/**
 * Perform html code tokenization. Class used for spiral Stempler and can be used for other html
 * related operations. HtmlTokenizer is pretty slow! Please don't forget that this is tokenizer,
 * not parser.
 *
 * @todo very old class, improvement required
 */
class HtmlTokenizer
{
    /**
     * Current tokenizer position. The main scan in parse() walks the source one character at a
     * time and switches between these states.
     */
    const POSITION_PLAIN_TEXT = 0x001;
    const POSITION_IN_TAG     = 0x002;
    const POSITION_IN_QUOTAS  = 0x003;

    /**
     * Token types detected and processed by tokenizer.
     */
    const PLAIN_TEXT = 'plain';
    const TAG_OPEN   = 'open';
    const TAG_CLOSE  = 'close';
    const TAG_SHORT  = 'short';
    const TAG_VOID   = 'void';

    /**
     * Token fields. Tokens are plain arrays indexed by these numeric keys (instead of text keys
     * or objects) because a document can produce a very large number of them.
     */
    const TOKEN_NAME       = 0;
    const TOKEN_TYPE       = 1;
    const TOKEN_CONTENT    = 2;
    const TOKEN_ATTRIBUTES = 3;

    /**
     * List of void tags.
     *
     * @link http://www.w3.org/TR/html5/syntax.html#void-elements
     * @var array
     */
    protected $voidTags = [
        'area',
        'base',
        'br',
        'col',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
    ];

    /**
     * Array of parsed tokens. Tag tokens have fields name, type, content and attributes; plain
     * text tokens only have type and content.
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * When true, source is passed through the isolator before scanning and every produced token
     * is passed through the isolator's repairPHP() on the way out.
     *
     * @var bool
     */
    protected $isolatePHP = false;

    /**
     * PHP blocks isolator used by parse() and repairPHP().
     *
     * @var Isolator|null
     */
    protected $isolator = null;

    /**
     * @param bool     $isolatePHP Isolate PHP blocks while parsing (enabled by default).
     * @param Isolator $isolator   Isolator to use; a new default instance is created when omitted.
     */
    public function __construct($isolatePHP = true, Isolator $isolator = null)
    {
        $this->isolatePHP = $isolatePHP;
        $this->isolator = !empty($isolator) ? $isolator : new Isolator();
    }

    /**
     * Parse HTML content and return its tokens.
     *
     * The token list is reset on every call, so the instance can be reused.
     *
     * @param string $source HTML source.
     * @return array List of tokens, see TOKEN_* constants for the fields of each token.
     */
    public function parse($source)
    {
        //Cleaning list of already parsed tokens
        $this->tokens = [];

        if ($this->isolatePHP) {
            $source = $this->isolator->isolatePHP($source);
        }

        //Quote character which opened the attribute value we are currently in (if any)
        $quotas = '';

        //Content collected for the token being scanned
        $buffer = '';

        $length = strlen($source);
        $position = self::POSITION_PLAIN_TEXT;
        for ($pointer = 0; $pointer < $length; $pointer++) {
            $char = $source[$pointer];
            switch ($char) {
                case '<':
                    if ($position == self::POSITION_IN_QUOTAS) {
                        //Part of an attribute value
                        $buffer .= $char;
                        break;
                    }

                    if ($position == self::POSITION_IN_TAG) {
                        //Previous "<" never got closed, it was just a text
                        $buffer = '<' . $buffer;
                    }

                    //Handling previous token
                    $this->handleToken(self::PLAIN_TEXT, $buffer);

                    //We are in tag now
                    $position = self::POSITION_IN_TAG;
                    $buffer = '';
                    break;
                case '>':
                    if ($position != self::POSITION_IN_TAG) {
                        $buffer .= $char;
                        break;
                    }

                    //Token ended, anything except PLAIN_TEXT makes handleToken() parse a tag body
                    $this->handleToken(false, $buffer);

                    //We are in a plain text now
                    $position = self::POSITION_PLAIN_TEXT;
                    $buffer = '';
                    break;
                case '"':
                    //no break
                case "'":
                    if ($position == self::POSITION_IN_TAG) {
                        //Jumping into argument
                        $position = self::POSITION_IN_QUOTAS;
                        $quotas = $char;
                    } elseif ($position == self::POSITION_IN_QUOTAS && $char == $quotas) {
                        //Jumping from argument
                        $position = self::POSITION_IN_TAG;
                        $quotas = '';
                    }
                    //Quote itself has to be added to buffer as any other character
                    //no break
                default:
                    //Checking for invalid characters in tag name or arguments
                    if ($position == self::POSITION_IN_TAG) {
                        if (!preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)) {
                            //Not a tag after all, giving "<" back to the text
                            $buffer = '<' . $buffer;
                            $position = self::POSITION_PLAIN_TEXT;
                        }
                    }
                    $buffer .= $char;
            }
        }

        if ($position != self::POSITION_PLAIN_TEXT) {
            //Source ended inside of unclosed tag, "<" which opened it has to be restored
            $buffer = '<' . $buffer;
        }

        $this->handleToken(self::PLAIN_TEXT, $buffer);

        return $this->tokens;
    }

    /**
     * Compile token and all its attributes into string.
     *
     * Plain text and closing tag tokens are returned as their original content, other tokens are
     * rebuilt from their name and attributes. Attributes with null value are rendered without
     * "=" part, all other values are wrapped into double quotes as is (no escaping is applied).
     *
     * @param array $token
     * @return string
     */
    public function compile(array $token)
    {
        if (in_array($token[self::TOKEN_TYPE], [self::PLAIN_TEXT, self::TAG_CLOSE], true)) {
            //Nothing to compile
            return $token[self::TOKEN_CONTENT];
        }

        $result = $token[self::TOKEN_NAME];
        $attributes = [];
        foreach ($token[self::TOKEN_ATTRIBUTES] as $attribute => $value) {
            if ($value === null) {
                $attributes[] = $attribute;
                continue;
            }

            $attributes[] = $attribute . '="' . $value . '"';
        }

        if (!empty($attributes)) {
            $result .= ' ' . join(' ', $attributes);
        }

        if ($token[self::TOKEN_TYPE] == self::TAG_SHORT) {
            $result .= '/';
        }

        return '<' . $result . '>';
    }

    /**
     * Parses tag body for arguments, name, etc.
     *
     * @param string $content Tag content to be parsed (from < till >, both excluded).
     * @return array Tag token, or plain text token (type and content only) when content does not
     *               look like a tag.
     */
    protected function parseToken($content)
    {
        $content = $this->repairPHP($content);

        $token = [
            self::TOKEN_NAME       => '',
            self::TOKEN_TYPE       => self::TAG_OPEN,
            self::TOKEN_CONTENT    => '<' . $content . '>',
            self::TOKEN_ATTRIBUTES => []
        ];

        //Some parts of text just look like tags, but they are not
        if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
            $token[self::TOKEN_TYPE] = self::PLAIN_TEXT;

            //Same shape as plain text tokens created in handleToken(): type and content only
            unset($token[self::TOKEN_NAME], $token[self::TOKEN_ATTRIBUTES]);

            return $token;
        }

        //Local PHP isolation, presumably keeps PHP blocks away from the attribute expression
        $isolator = new Isolator('-argument-', '-block-', true);

        //No PHP blocks
        $content = $isolator->isolatePHP($content);

        //Parsing arguments, due they already checked for open-close quotas we can use regular expression
        $attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r]*(?:(?P<equal>=)[ \n\t\r]*'
            . '(?P<value>[a-z0-9\-]+|\'[^\']+\'|\"[^\"]+\"|\"\"))?/si';
        //todo: need better regexp for quotes

        preg_match_all($attribute, $content, $attributes);

        foreach ($attributes['value'] as $index => $value) {
            if ($value && ($value[0] == "'" || $value[0] == '"')) {
                //Removing wrapping quotes
                $value = trim($value, $value[0]);
            }

            //Restoring global php isolation
            $name = $this->repairPHP(
            //Restoring local php isolation
                $isolator->repairPHP($attributes['name'][$index])
            );

            $token[self::TOKEN_ATTRIBUTES][$name] = $this->repairPHP($isolator->repairPHP($value));

            if (empty($attributes['equal'][$index])) {
                //Attribute without value
                $token[self::TOKEN_ATTRIBUTES][$name] = null;
            }
        }

        //Fetching name, tag name ends at first whitespace (parse() allows space, tab, CR and LF)
        $parts = preg_split('/[ \n\t\r]+/', $content, 2);
        $name = $isolator->repairPHP($parts[0]);
        if ($name !== '' && $name[0] == '/') {
            $token[self::TOKEN_TYPE] = self::TAG_CLOSE;
            unset($token[self::TOKEN_ATTRIBUTES]);
        }

        if (substr($content, -1) == '/') {
            $token[self::TOKEN_TYPE] = self::TAG_SHORT;
        }

        //Attribute expression matches tag name as attribute without value, dropping it
        $name = trim($name, '/');
        unset($token[self::TOKEN_ATTRIBUTES][$name]);

        $token[self::TOKEN_NAME] = trim($name);

        if (
            $token[self::TOKEN_TYPE] == self::TAG_OPEN
            && in_array($token[self::TOKEN_NAME], $this->voidTags, true)
        ) {
            $token[self::TOKEN_TYPE] = self::TAG_VOID;
        }

        return $token;
    }

    /**
     * Handles single token and adds it to the list of parsed tokens.
     *
     * @param string|bool $tokenType self::PLAIN_TEXT for text, any other value (parse() passes
     *                               false) to treat content as tag body.
     * @param string      $content   Non parsed token content.
     */
    protected function handleToken($tokenType, $content)
    {
        if ($tokenType === self::PLAIN_TEXT) {
            //Only empty string is skipped, text like "0" is a valid content
            if ((string)$content === '') {
                return;
            }

            $token = [
                self::TOKEN_TYPE    => self::PLAIN_TEXT,
                self::TOKEN_CONTENT => $this->repairPHP($content)
            ];
        } else {
            $token = $this->parseToken($content);
        }

        $this->tokens[] = $token;
    }

    /**
     * Passes source through the isolator's repairPHP() when PHP isolation is enabled, returns it
     * untouched otherwise.
     *
     * @param string $source
     * @return string
     */
    protected function repairPHP($source)
    {
        if (!$this->isolatePHP) {
            return $source;
        }

        return $this->isolator->repairPHP($source);
    }
}


1			<?php
2			/**
3			* Spiral Framework.
4			*
5			* @license MIT
6			* @author Anton Titov (Wolfy-J)
7			*/
8			namespace Spiral\Stempler;
9
10			use Spiral\Tokenizer\Isolator;
11
12			/**
13			* Perform html code tokenization. Class used for spiral Stempler and can be used for other html
14			* related operations. HtmlTokenizer is pretty slow! Please don't forget that this is tokenizer,
15			* not parser.
16			*
17			* @todo very old class, improvement required
18			*/
19			class HtmlTokenizer
20			{
21			/**
22			* Current tokenizer position. Tokenizer is a linear processor (no regular expression is
23			* involved). This slows it down, but the results are much more reliable.
24			*/
25			const POSITION_PLAIN_TEXT = 0x001;
26			const POSITION_IN_TAG = 0x002;
27			const POSITION_IN_QUOTAS = 0x003;
28
29			/**
30			* Token types detected and processed by tokenizer.
31			*/
32			const PLAIN_TEXT = 'plain';
33			const TAG_OPEN = 'open';
34			const TAG_CLOSE = 'close';
35			const TAG_SHORT = 'short';
36			const TAG_VOID = 'void';
37
38			/**
39			* Token fields. There are a lot of tokens in HTML (up to 10,000 different ones). We better to
40			* use numeric keys for array than any text fields or even objects.
41			*/
42			const TOKEN_NAME = 0;
43			const TOKEN_TYPE = 1;
44			const TOKEN_CONTENT = 2;
45			const TOKEN_ATTRIBUTES = 3;
46
47			/**
48			* List of void tags.
49			*
50			* @link http://www.w3.org/TR/html5/syntax.html#void-elements
51			* @var array
52			*/
53			protected $voidTags = [
54			'area',
55			'base',
56			'br',
57			'col',
58			'embed',
59			'hr',
60			'img',
61			'input',
62			'keygen',
63			'link',
64			'meta',
65			'param',
66			'source',
67			'track',
68			'wbr'
69			];
70
71			/**
72			* Array of parsed tokens. Every token has fields name, type, content and arguments.
73			*
74			* @var array
75			*/
76			protected $tokens = [];
77
78			/**
79			* PHP block should be isolated while parsing, Keep enabled.
80			*
81			* @var bool
82			*/
83			protected $isolatePHP = false;
84
85			/**
86			* PHP Blocks isolator, which holds all existing PHP blocks and restores them in output.
87			*
88			* @var Isolator\|null
89			*/
90			protected $isolator = null;
91
92			/**
93			* @param bool $isolatePHP PHP block should be isolated and enabled by default
94			* @param Isolator $isolator
95			*/
96			public function __construct($isolatePHP = true, Isolator $isolator = null)
97			{
98			$this->isolatePHP = $isolatePHP;
99			$this->isolator = !empty($isolator) ? $isolator : new Isolator();
100			}
101
102			/**
103			* Parse HTML content and return it's tokens.
104			*
105			* @param string $source HTML source.
106			* @return array
107			*/
108			public function parse($source)
109			{
110			//Cleaning list of already parsed tokens
111			$this->tokens = [];
112
113			if ($this->isolatePHP) {
114			$source = $this->isolator->isolatePHP($source);
115			}
116
117			$quotas = '';
118			$buffer = '';
119
120			$length = strlen($source);
121			$position = self::POSITION_PLAIN_TEXT;
122			for ($pointer = 0; $pointer < $length; $pointer++) {
123			$char = $source[$pointer];
124			switch ($char) {
125			case '<':
126			if ($position == self::POSITION_IN_QUOTAS) {
127			$buffer .= $char;
128			break;
129			}
130
131			if ($position == self::POSITION_IN_TAG) {
132			$buffer = '<' . $buffer;
133			}
134
135			//Handling previous token
136			$this->handleToken(self::PLAIN_TEXT, $buffer);
137
138			//We are in tag now
139			$position = self::POSITION_IN_TAG;
140			$buffer = '';
141			break;
142			case '>':
143			if ($position != self::POSITION_IN_TAG) {
144			$buffer .= $char;
145			break;
146			}
147
148			//Token ended
149			$this->handleToken(false, $buffer);
			0 ignored issues – show Documentation introduced 2015-12-04 23:04 UTC by Report Bug Copy Issue Report `false` is of type `boolean`, but the function expects an `integer`. It seems like the type of the argument is not accepted by the function/method which you are calling. In some cases, in particular if PHP’s automatic type-juggling kicks in, this might be fine. In other cases, however, this might be a bug. We suggest to add an explicit type cast like in the following example: function acceptsInteger($int) { } $x = '123'; // string "123" // Instead of acceptsInteger($x); // we recommend to use acceptsInteger((integer) $x); Loading history...
150
151			//We are in a plain text now
152			$position = self::POSITION_PLAIN_TEXT;
153			$buffer = '';
154			break;
155			case '"':
156			//no break
157			case "'":
			1 ignored issue – show Coding Style introduced 2015-12-04 23:04 UTC by Report Bug Copy Issue Report There must be a comment when fall-through is intentional in a non-empty case body Loading history...
158			if ($position == self::POSITION_IN_TAG) {
159			//Jumping into argument
160			$position = self::POSITION_IN_QUOTAS;
161			$quotas = $char;
162			} elseif ($position == self::POSITION_IN_QUOTAS && $char == $quotas) {
163			//Jumping from argument
164			$position = self::POSITION_IN_TAG;
165			$quotas = '';
166			}
167			default:
168			//Checking for invalid characters in tag name or arguments
169			if ($position == self::POSITION_IN_TAG) {
170			if (!preg_match('/[a-z0-9 \._\-="\':\/\r\n\t]/i', $char)) {
171			$buffer = '<' . $buffer;
172			$position = self::POSITION_PLAIN_TEXT;
173			}
174			}
175			$buffer .= $char;
176			}
177			}
178
179			$this->handleToken(self::PLAIN_TEXT, $buffer);
180
181			return $this->tokens;
182			}
183
184			/**
185			* Compile token and all it's attributes into string.
186			*
187			* @param array $token
188			* @return string
189			*/
190			public function compile(array $token)
191			{
192			if (in_array($token[self::TOKEN_TYPE], [self::PLAIN_TEXT, self::TAG_CLOSE])) {
193			//Nothing to compile
194			return $token[HtmlTokenizer::TOKEN_CONTENT];
195			}
196
197			$result = $token[HtmlTokenizer::TOKEN_NAME];
198			$attributes = [];
199			foreach ($token[self::TOKEN_ATTRIBUTES] as $attribute => $value) {
200			if ($value === null) {
201			$attributes[] = $attribute;
202			continue;
203			}
204
205			$attributes[] = $attribute . '="' . $value . '"';
206			}
207
208			if (!empty($attributes)) {
209			$result .= ' ' . join(' ', $attributes);
210			}
211
212			if ($token[HtmlTokenizer::TOKEN_TYPE] == HtmlTokenizer::TAG_SHORT) {
213			$result .= '/';
214			}
215
216			return '<' . $result . '>';
217			}
218
219			/**
220			* Parses tag body for arguments, name, etc.
221			*
222			* @param string $content Tag content to be parsed (from < till >).
223			* @return array
224			*/
225			protected function parseToken($content)
226			{
227			$token = [
228			self::TOKEN_NAME => '',
229			self::TOKEN_TYPE => self::TAG_OPEN,
230			self::TOKEN_CONTENT => '<' . ($content = $this->repairPHP($content)) . '>',
231			self::TOKEN_ATTRIBUTES => []
232			];
233
234			//Some parts of text just looks like tags, but their not
235			if (!preg_match('/^\/?[a-z0-9_:\/][a-z 0-9\._\-:\/]*/i', $content)) {
236			$token[self::TOKEN_TYPE] = self::PLAIN_TEXT;
237			unset($token[self::TOKEN_NAME], $token[self::TOKEN_NAME]);
238
239			return $token;
240			}
241
242			//Local PHP isolation
243			$isolator = new Isolator('-argument-', '-block-', true);
244
245			//No PHP blocks
246			$content = $isolator->isolatePHP($content);
247
248			//Parsing arguments, due they already checked for open-close quotas we can use regular expression
249			$attribute = '/(?P<name>[a-z0-9_\-\.\:]+)[ \n\t\r](?:(?P<equal>=)[ \n\t\r]'
250			. '(?P<value>[a-z0-9\-]+\|\'[^\']+\'\|\"[^\"]+\"\|\"\"))?/si';
251			//todo: need better regexp for quotes
252
253			preg_match_all($attribute, $content, $attributes);
254
255			foreach ($attributes['value'] as $index => $value) {
256			if ($value && ($value{0} == "'" \|\| $value{0} == '"')) {
257			$value = trim($value, $value{0});
258			}
259
260			//Restoring global php isolation
261			$name = $this->repairPHP(
262			//Restoring local php isolation
263			$isolator->repairPHP($attributes['name'][$index])
264			);
265
266			$token[self::TOKEN_ATTRIBUTES][$name] = $this->repairPHP($isolator->repairPHP($value));
267
268			if (empty($attributes['equal'][$index])) {
269			$token[self::TOKEN_ATTRIBUTES][$name] = null;
270			}
271			}
272
273			//Fetching name
274			$name = $isolator->repairPHP(current(explode(' ', $content)));
275			if ($name{0} == '/') {
276			$token[self::TOKEN_TYPE] = self::TAG_CLOSE;
277			unset($token[self::TOKEN_ATTRIBUTES]);
278			}
279
280			if ($content{strlen($content) - 1} == '/') {
281			$token[self::TOKEN_TYPE] = self::TAG_SHORT;
282			}
283
284			$token[self::TOKEN_NAME] = $name = trim($name, '/');
285			unset($token[self::TOKEN_ATTRIBUTES][$name]);
286
287			$token[self::TOKEN_NAME] = trim($token[self::TOKEN_NAME]);
288
289			if (
290			$token[self::TOKEN_TYPE] == self::TAG_OPEN
291			&& in_array($token[self::TOKEN_NAME], $this->voidTags)
292			) {
293			$token[self::TOKEN_TYPE] = self::TAG_VOID;
294			}
295
296			return $token;
297			}
298
299			/**
300			* Handles single token and passes it to a callback function if specified.
301			*
302			* @param int $tokenType Token type.
303			* @param string $content Non parsed token content.
304			*/
305			protected function handleToken($tokenType, $content)
306			{
307			if ($tokenType == self::PLAIN_TEXT) {
308			if (empty($content)) {
309			return;
310			}
311
312			$token = [
313			self::TOKEN_TYPE => self::PLAIN_TEXT,
314			self::TOKEN_CONTENT => $this->repairPHP($content)
315			];
316			} else {
317			$token = $this->parseToken($content);
318			}
319
320			$this->tokens[] = $token;
321			}
322
323			/**
324			* Will restore all existing PHP blocks to their original content.
325			*
326			* @param string $source
327			* @return string
328			*/
329			protected function repairPHP($source)
330			{
331			if (!$this->isolatePHP) {
332			return $source;
333			}
334
335			return $this->isolator->repairPHP($source);
336			}
337			}
338

spiral / components

Branch — dbal-improvement (06db1a)

HtmlTokenizer B

Complexity

Size/Duplication

Coupling/Cohesion

Importance

6 Methods

Duplication Side-by-Side

Filter issues like