PHPTAL_Dom_SaxXmlParser::parseString()   F
last analyzed

Complexity

Conditions 84
Paths 233

Size

Total Lines 260

Duplication

Lines 39
Ratio 15 %

Importance

Changes 0
Metric Value
cc 84
nc 233
nop 3
dl 39
loc 260
rs 2.3365
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

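Commonly applied refactorings include Extract Method and, when the method has many parameters or temporary variables, Replace Temp with Query, Introduce Parameter Object and Preserve Whole Object.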

1
<?php
2
/**
3
 * PHPTAL templating engine
4
 *
5
 * PHP Version 5
6
 *
7
 * @category HTML
8
 * @package  PHPTAL
9
 * @author   Laurent Bedubourg <[email protected]>
10
 * @author   Kornel Lesiński <[email protected]>
11
 * @license  http://www.gnu.org/licenses/lgpl.html GNU Lesser General Public License
12
 * @version  SVN: $Id$
13
 * @link     http://phptal.org/
14
 */
15
16
/**
17
 * Simple sax like xml parser for PHPTAL
18
 * ("Dom" in the class name comes from name of the directory, not mode of operation)
19
 *
20
 * At the time this parser was created, standard PHP libraries were not suitable
21
 * (could not retrieve doctypes, xml declaration, problems with comments and CDATA).
22
 *
23
 * There are still some problems: XML parsers don't care about exact format of entities
24
 * or CDATA sections (PHPTAL tries to preserve them),
25
 * <?php ?> blocks are not allowed in attributes.
26
 *
27
 * This parser failed to enforce some XML well-formedness constraints,
28
 * and there are ill-formed templates "in the wild" because of this.
29
 *
30
 * @package PHPTAL
31
 * @subpackage Dom
32
 * @see PHPTAL_DOM_DocumentBuilder
33
 */
34
class PHPTAL_Dom_SaxXmlParser
35
{
36
    /** @var string name of the source being parsed; reported to the builder and attached to errors */
    private $_file;

    /** @var int current line number (1-based), incremented on every "\n" seen by parseString() */
    private $_line;

    /** @var mixed not referenced by any method visible in this class */
    private $_source;

    // Parser states. parseString() is a character-by-character state machine
    // and is always in exactly one of these states.
    const ST_ROOT = 0;
    const ST_TEXT = 1;
    const ST_LT   = 2;
    const ST_TAG_NAME = 3;
    const ST_TAG_CLOSE = 4;
    const ST_TAG_SINGLE = 5;
    const ST_TAG_ATTRIBUTES = 6;
    const ST_TAG_BETWEEN_ATTRIBUTE = 7;
    const ST_CDATA = 8;
    const ST_COMMENT = 9;
    const ST_DOCTYPE = 10;
    const ST_XMLDEC = 11;
    const ST_PREPROC = 12;
    const ST_ATTR_KEY = 13;
    const ST_ATTR_EQ = 14;
    const ST_ATTR_QUOTE = 15;
    const ST_ATTR_VALUE = 16;

    // UTF-8 byte order mark; skipped by parseString() when the source starts with it.
    const BOM_STR = "\xef\xbb\xbf";

    /**
     * Human-readable state names, used by parseString() to build the
     * "Finished document in unexpected state" error message.
     *
     * Visibility is spelled out explicitly; a static property declared
     * without a modifier is public, so this does not change access.
     *
     * @var array map of ST_* constant => description
     */
    public static $state_names = array(
      self::ST_ROOT => 'root node',
      self::ST_TEXT => 'text',
      self::ST_LT   => 'start of tag',
      self::ST_TAG_NAME => 'tag name',
      self::ST_TAG_CLOSE => 'closing tag',
      self::ST_TAG_SINGLE => 'self-closing tag',
      self::ST_TAG_ATTRIBUTES => 'tag',
      self::ST_TAG_BETWEEN_ATTRIBUTE => 'tag attributes',
      self::ST_CDATA => 'CDATA',
      self::ST_COMMENT => 'comment',
      self::ST_DOCTYPE => 'doctype',
      self::ST_XMLDEC => 'XML declaration',
      self::ST_PREPROC => 'preprocessor directive',
      self::ST_ATTR_KEY => 'attribute name',
      self::ST_ATTR_EQ => 'attribute value',
      self::ST_ATTR_QUOTE => 'quoted attribute value',
      self::ST_ATTR_VALUE => 'unquoted attribute value',
    );

    /** @var string encoding name given to the constructor; selects the checks done by checkEncoding() */
    private $input_encoding;
83
    /**
     * Creates a parser for sources in the given encoding.
     *
     * Until a parse starts, the source name reported by getSourceFile() is "<string>".
     *
     * @param string $input_encoding encoding name; checkEncoding() only validates
     *                               input when this is 'UTF-8' or 'ISO-8859-1'
     */
    public function __construct($input_encoding)
    {
        $this->_file = '<string>';
        $this->input_encoding = $input_encoding;
    }
88
89
    /**
     * Reads a template file and parses its contents with parseString().
     *
     * @param PHPTAL_Dom_DocumentBuilder $builder receives the parse events
     * @param string                     $src     path of the file to parse
     *
     * @return PHPTAL_Dom_DocumentBuilder the builder passed in
     *
     * @throws PHPTAL_IOException when the file does not exist or cannot be read
     */
    public function parseFile(PHPTAL_Dom_DocumentBuilder $builder, $src)
    {
        if (!file_exists($src)) {
            throw new PHPTAL_IOException("file $src not found");
        }

        // file_get_contents() returns false on failure (e.g. unreadable file or
        // a directory); do not hand that to the parser as if it were a document.
        $contents = file_get_contents($src);
        if ($contents === false) {
            throw new PHPTAL_IOException("file $src could not be read");
        }

        return $this->parseString($builder, $contents, $src);
    }
96
97
    /**
     * Parses an XML source string and reports every construct found to the builder.
     *
     * The source is scanned one byte at a time by a state machine (see the ST_*
     * constants). Elements, text, comments, CDATA sections, the XML declaration,
     * the doctype and processing instructions are each forwarded to the matching
     * on*() method of the builder.
     *
     * @param PHPTAL_Dom_DocumentBuilder $builder  receives the parse events
     * @param string                     $src      source text to parse
     * @param string                     $filename name used for error reporting
     *
     * @return PHPTAL_Dom_DocumentBuilder the builder passed in
     *
     * @throws PHPTAL_TemplateException any such exception is rethrown after the
     *                                  current file and line were hinted on it
     */
    public function parseString(PHPTAL_Dom_DocumentBuilder $builder, $src, $filename = '<string>')
    {
        try {
            $builder->setEncoding($this->input_encoding);

            $this->_file = $filename;
            $this->_line = 1;

            $state  = self::ST_ROOT;
            $start  = 0;             // offset where the construct being scanned begins
            $length = strlen($src);

            $quote    = '"';         // quote character that opened the current attribute value
            $tagName  = '';
            $attrName = '';
            $attrs    = array();

            $inInternalSubset = false; // true once '[' has been seen inside a doctype

            $builder->setSource($this->_file, $this->_line);
            $builder->onDocumentStart();

            // Skip the UTF-8 byte order mark when the source starts with one.
            $pos = (substr($src, 0, 3) === self::BOM_STR) ? 3 : 0;

            while ($pos < $length) {
                $char = $src[$pos]; // Change to substr($src, $pos, 1); if you want to use mb_string.func_overload

                if ($char === "\n") {
                    $this->_line++;
                    $builder->setSource($this->_file, $this->_line);
                }

                switch ($state) {
                    case self::ST_ROOT:
                        if ($char === '<') {
                            $start = $pos; // tag starts here
                            $state = self::ST_LT;
                        } elseif (!self::isWhiteChar($char)) {
                            $this->raiseError("Characters found before beginning of the document! (wrap document in < tal:block > to avoid this error)");
                        }
                        break;

                    case self::ST_TEXT:
                        if ($char !== '<') {
                            break;
                        }
                        if ($start !== $pos) {
                            $text = substr($src, $start, $pos - $start);
                            $builder->onElementData($this->sanitizeEscapedText($this->checkEncoding($text)));
                        }
                        $start = $pos;
                        $state = self::ST_LT;
                        break;

                    case self::ST_LT:
                        if ($char === '/') {
                            $start = $pos + 1;
                            $state = self::ST_TAG_CLOSE;
                        } elseif ($char === '?') {
                            // "<?xml " opens the XML declaration, any other "<?" a processing instruction
                            $state = (strtolower(substr($src, $pos, 5)) === '?xml ')
                                ? self::ST_XMLDEC
                                : self::ST_PREPROC;
                        } elseif ($char === '!' && substr($src, $pos, 3) === '!--') {
                            $state = self::ST_COMMENT;
                        } elseif ($char === '!' && substr($src, $pos, 8) === '![CDATA[') {
                            $state = self::ST_CDATA;
                            $start = $pos + 8; // content begins after the opening marker
                        } elseif ($char === '!' && strtoupper(substr($src, $pos, 8)) === '!DOCTYPE') {
                            $state = self::ST_DOCTYPE;
                        } elseif (self::isWhiteChar($char)) {
                            // '<' followed by whitespace is kept as text
                            $state = self::ST_TEXT;
                        } else {
                            $start    = $pos; // element name starts here
                            $attrs    = array();
                            $attrName = '';
                            $state    = self::ST_TAG_NAME;
                        }
                        break;

                    case self::ST_TAG_NAME:
                        if (!($char === '/' || $char === '>' || self::isWhiteChar($char))) {
                            break; // still inside the name
                        }

                        $tagName = substr($src, $start, $pos - $start);
                        if (!$this->isValidQName($tagName)) {
                            $this->raiseError("Invalid tag name '{$tagName}'");
                        }

                        if ($char === '>') {
                            $start = $pos + 1; // text starts after the tag
                            $state = self::ST_TEXT;
                            $builder->onElementStart($tagName, $attrs);
                        } elseif ($char === '/') {
                            $state = self::ST_TAG_SINGLE;
                        } else {
                            // whitespace: attributes may follow
                            $state = self::ST_TAG_ATTRIBUTES;
                        }
                        break;

                    case self::ST_TAG_CLOSE:
                        if ($char === '>') {
                            $tagName = rtrim(substr($src, $start, $pos - $start));
                            $builder->onElementClose($tagName);
                            $start = $pos + 1; // text starts after the tag
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_TAG_SINGLE:
                        if ($char !== '>') {
                            $this->raiseError("Expected '/>', but found '/{$char}' inside tag < {$tagName} >");
                        }
                        $start = $pos + 1; // text starts after the tag
                        $state = self::ST_TEXT;
                        $builder->onElementStart($tagName, $attrs);
                        $builder->onElementClose($tagName);
                        break;

                    case self::ST_TAG_BETWEEN_ATTRIBUTE:
                    case self::ST_TAG_ATTRIBUTES:
                        if ($char === '>') {
                            $start = $pos + 1; // text starts after the tag
                            $state = self::ST_TEXT;
                            $builder->onElementStart($tagName, $attrs);
                        } elseif ($char === '/') {
                            $state = self::ST_TAG_SINGLE;
                        } elseif (self::isWhiteChar($char)) {
                            $state = self::ST_TAG_ATTRIBUTES;
                        } elseif ($state === self::ST_TAG_ATTRIBUTES && $this->isValidQName($char)) {
                            // a new attribute may only begin after whitespace
                            $start = $pos; // attribute name starts here
                            $state = self::ST_ATTR_KEY;
                        } else {
                            $this->raiseError("Unexpected character '{$char}' between attributes of < {$tagName} >");
                        }
                        break;

                    case self::ST_COMMENT:
                        if ($char === '>' && $pos > $start + 4 && substr($src, $pos - 2, 2) === '--') {
                            // text between "<!--" and "-->"
                            $comment = substr($src, $start + 4, $pos - $start - 6);

                            if (preg_match('/^-|--|-$/', $comment)) {
                                $this->raiseError("Ill-formed comment. XML comments are not allowed to contain '--' or start/end with '-': " . $comment);
                            }

                            $builder->onComment($this->checkEncoding($comment));
                            $start = $pos + 1; // text starts after the comment
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_CDATA:
                        if ($char === '>' && substr($src, $pos - 2, 2) === ']]') {
                            $builder->onCDATASection($this->checkEncoding(substr($src, $start, $pos - $start - 2)));
                            $start = $pos + 1; // text starts after the section
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_XMLDEC:
                        if ($char === '?' && substr($src, $pos, 2) === '?>') {
                            $builder->onXmlDecl($this->checkEncoding(substr($src, $start, $pos - $start + 2)));
                            $pos++; // step over '>'
                            $start = $pos + 1; // text starts after the declaration
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_DOCTYPE:
                        if ($char === '[') {
                            $inInternalSubset = true;
                        } elseif ($char === '>' && (!$inInternalSubset || substr($src, $pos - 1, 2) === ']>')) {
                            // once '[' was seen, only "]>" ends the doctype
                            $inInternalSubset = false;
                            $builder->onDocType($this->checkEncoding(substr($src, $start, $pos - $start + 1)));
                            $start = $pos + 1; // text starts after the doctype
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_PREPROC:
                        if ($char === '>' && substr($src, $pos - 1, 1) === '?') {
                            $builder->onProcessingInstruction($this->checkEncoding(substr($src, $start, $pos - $start + 1)));
                            $start = $pos + 1; // text starts after the instruction
                            $state = self::ST_TEXT;
                        }
                        break;

                    case self::ST_ATTR_KEY:
                        if ($char === '=' || self::isWhiteChar($char)) {
                            $attrName = substr($src, $start, $pos - $start);
                            if (!$this->isValidQName($attrName)) {
                                $this->raiseError("Invalid attribute name '{$attrName}' in < {$tagName} >");
                            }
                            if (isset($attrs[$attrName])) {
                                $this->raiseError("Attribute {$attrName} in < {$tagName} > is defined more than once");
                            }

                            // after whitespace an '=' is still required
                            $state = ($char === '=') ? self::ST_ATTR_VALUE : self::ST_ATTR_EQ;
                        } elseif ($char === '/' || $char === '>') {
                            $attrName = substr($src, $start, $pos - $start);
                            if (!$this->isValidQName($attrName)) {
                                $this->raiseError("Invalid attribute name '{$attrName}'");
                            }
                            $this->raiseError("Attribute {$attrName} does not have value (found end of tag instead of '=')");
                        }
                        break;

                    case self::ST_ATTR_EQ:
                        if ($char === '=') {
                            $state = self::ST_ATTR_VALUE;
                        } elseif (!self::isWhiteChar($char)) {
                            $this->raiseError("Attribute {$attrName} in < {$tagName} > does not have value (found character '{$char}' instead of '=')");
                        }
                        break;

                    case self::ST_ATTR_VALUE:
                        if ($char === '"' || $char === '\'') {
                            $quote = $char;
                            $state = self::ST_ATTR_QUOTE;
                            $start = $pos + 1; // value starts after the quote
                        } elseif (!self::isWhiteChar($char)) {
                            $this->raiseError("Value of attribute {$attrName} in < {$tagName} > is not in quotes (found character '{$char}' instead of quote)");
                        }
                        break;

                    case self::ST_ATTR_QUOTE:
                        if ($char === $quote) {
                            $value = $this->sanitizeEscapedText($this->checkEncoding(substr($src, $start, $pos - $start)));

                            // PHPTAL's code generator assumes input is escaped for double-quoted strings. Single-quoted attributes need to be converted.
                            // FIXME: it should be escaped at later stage.
                            $attrs[$attrName] = str_replace('"', "&quot;", $value);
                            $state = self::ST_TAG_BETWEEN_ATTRIBUTE;
                        }
                        break;
                }

                $pos++;
            }

            if ($state !== self::ST_TEXT) {
                if ($state === self::ST_ROOT) {
                    $message = "Document does not have any tags";
                } else {
                    $message = "Finished document in unexpected state: " . self::$state_names[$state] . " is not finished";
                }
                $this->raiseError($message);
            } else {
                // allows text past root node, which is in violation of XML spec
                if ($pos > $start) {
                    $trailing = substr($src, $start, $pos - $start);
                    if (!ctype_space($trailing)) {
                        $this->raiseError("Characters found after end of the root element (wrap document in < tal:block > to avoid this error)");
                    }
                }
            }

            $builder->onDocumentEnd();
        } catch (PHPTAL_TemplateException $e) {
            $e->hintSrcPosition($this->_file, $this->_line);
            throw $e;
        }

        return $builder;
    }
357
358
    /**
     * Tells whether a string is acceptable as an element or attribute name:
     * one name part, optionally preceded by a prefix part and a colon.
     *
     * The name is first run through checkEncoding().
     *
     * @param string $name candidate name
     *
     * @return int|false result of preg_match(): 1 when the name is accepted, 0 otherwise
     */
    private function isValidQName($name)
    {
        // One name part: starts with a letter, '_' or a byte >= 0x80; digits, '.' and '-' may follow.
        $part = '[a-z_\x80-\xff]+[a-z0-9._\x80-\xff-]*';
        $checked = $this->checkEncoding($name);

        return preg_match('/^(' . $part . ':)?' . $part . '$/i', $checked);
    }
363
364
    /**
     * Verifies that a string is valid in the parser's input encoding.
     *
     * Checks are only performed when the encoding is exactly 'UTF-8' or
     * 'ISO-8859-1'; for any other encoding the string is returned unchecked.
     *
     * @param string $str text to verify
     *
     * @return string the input string, unchanged
     *
     * @throws PHPTAL_ParserException (via raiseError()) when invalid bytes are
     *         found; the message shows the offending bytes as hexadecimal entities
     */
    private function checkEncoding($str)
    {
        if ($str === '') return '';

        if ($this->input_encoding === 'UTF-8') {

            // $match expression below somehow triggers quite deep recursion and stack overflow in preg
            // to avoid this, check string bit by bit, omitting ASCII fragments.
            if (strlen($str) > 200) {
                // -1 means "no limit"; passing null for the limit is deprecated since PHP 8.1
                $chunks = preg_split('/(?>[\x09\x0A\x0D\x20-\x7F]+)/', $str, -1, PREG_SPLIT_NO_EMPTY);
                foreach ($chunks as $chunk) {
                    // Chunks of up to 200 bytes take the direct path below. Longer
                    // non-ASCII runs would be split into themselves again (endless
                    // recursion), so they are left unchecked.
                    if (strlen($chunk) <= 200) {
                        $this->checkEncoding($chunk);
                    }
                }
                return $str;
            }

            // http://www.w3.org/International/questions/qa-forms-utf-8
            $match = '[\x09\x0A\x0D\x20-\x7F]'        // ASCII
               . '|[\xC2-\xDF][\x80-\xBF]'            // non-overlong 2-byte
               . '|\xE0[\xA0-\xBF][\x80-\xBF]'        // excluding overlongs
               . '|[\xE1-\xEC\xEE][\x80-\xBF]{2}'     // straight 3-byte (\xEF is handled below to exclude FFFE and FFFF)
               . '|\xEF[\x80-\xBE][\x80-\xBF]'        // straight 3-byte
               . '|\xEF\xBF[\x80-\xBD]'               // straight 3-byte
               . '|\xED[\x80-\x9F][\x80-\xBF]'        // excluding surrogates
               . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'     // planes 1-3
               . '|[\xF1-\xF3][\x80-\xBF]{3}'         // planes 4-15
               . '|\xF4[\x80-\x8F][\x80-\xBF]{2}';    // plane 16

            if (!preg_match('/^(?:(?>'.$match.'))+$/s', $str)) {
                // Split on runs of valid characters: even indexes hold the invalid
                // bytes in between, odd indexes hold the captured valid runs.
                $res = preg_split('/((?>'.$match.')+)/s', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
                for ($i = 0, $n = count($res); $i < $n; $i += 2) {
                    $res[$i] = self::convertBytesToEntities(array(1 => $res[$i]));
                }
                $this->raiseError("Invalid UTF-8 bytes: ".implode('', $res));
            }
        }
        if ($this->input_encoding === 'ISO-8859-1') {

            // http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
            $forbid = '/((?>[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x84\x86-\x9F]+))/s';

            if (preg_match($forbid, $str)) {
                // __CLASS__ instead of the 'self' string callable, which is deprecated since PHP 8.2
                $str = preg_replace_callback($forbid, array(__CLASS__, 'convertBytesToEntities'), $str);
                $this->raiseError("Invalid ISO-8859-1 characters: ".$str);
            }
        }

        return $str;
    }
416
417
    /**
418
     * preg callback
419
     * Changes all bytes to hexadecimal XML entities
420
     *
421
     * @param array $m first array element is used for input
422
     *
423
     * @return string
424
     */
425
    private static function convertBytesToEntities(array $m)
    {
        // Works as a preg callback: the text to convert is capture group 1.
        $bytes  = $m[1];
        $total  = strlen($bytes);
        $result = '';

        // Every byte becomes "&#X<HEX>;" with upper-case hex digits and no zero padding.
        for ($k = 0; $k < $total; $k++) {
            $result .= sprintf('&#X%X;', ord($bytes[$k]));
        }

        return $result;
    }
435
436
    /**
437
     * This is where this parser violates XML and refuses to be an annoying bastard.
438
     */
439
    private function sanitizeEscapedText($str)
    {
        // Cleans up text or attribute content before it reaches the builder:
        // normalizes &apos;, turns embedded PHP blocks into TALES expressions and
        // escapes stray '&', '<' and the CDATA end marker. Returns the cleaned string.

        $str = str_replace('&apos;', '&#39;', $str); // PHP's html_entity_decode doesn't seem to support that!

        /* <?php ?> blocks can't reliably work in attributes (due to escaping impossible in XML)
           so they have to be converted into special TALES expression
        */
        // Before PHP 5.4 the "<?=" and bare "<?" forms are only recognized when short_open_tag is on.
        $types = version_compare(PHP_VERSION, '5.4.0') < 0 ? (ini_get('short_open_tag') ? 'php|=|' : 'php') : 'php|=';
        // __CLASS__ instead of the 'self' string callable, which is deprecated since PHP 8.2
        $str = preg_replace_callback("/<\?($types)(.*?)\?>/", array(__CLASS__, 'convertPHPBlockToTALES'), $str);

        // corrects all non-entities and neutralizes potentially problematic CDATA end marker
        $str = strtr(preg_replace('/&(?!(?:#x?[a-f0-9]+|[a-z][a-z0-9]*);)/i', '&amp;', $str), array('<'=>'&lt;', ']]>'=>']]&gt;'));

        return $str;
    }
454
455
    /**
     * preg callback used by sanitizeEscapedText()
     * Rewrites one embedded PHP block as a TALES expression.
     *
     * @param array $m match: element 1 is the open-tag type, element 2 the code
     *
     * @return string "${structure phptal-internal-php-block:...}" with the code URL-encoded
     */
    private static function convertPHPBlockToTALES($m)
    {
        $type = $m[1];
        $code = $m[2];

        // the "<?=" form prints its expression
        $php = ($type === '=') ? 'echo ' . $code : $code;

        return '${structure phptal-internal-php-block:' . rawurlencode($php) . '}';
    }
461
462
    /**
     * Name of the source used for error reporting: "<string>" after construction,
     * then the $filename given to the most recent parseString() call.
     *
     * @return string
     */
    public function getSourceFile()
    {
        return $this->_file;
    }
466
467
    /**
     * Line number the parser has reached; parseString() starts at 1 and adds one
     * for every "\n" it reads. Not set before the first parse.
     *
     * @return int|null
     */
    public function getLineNumber()
    {
        return $this->_line;
    }
471
472
    /**
     * Tells whether a character is treated as whitespace by this parser:
     * space, tab, line feed, carriage return or NUL.
     *
     * @param string $c character to test
     *
     * @return bool false for an empty string or null
     */
    public static function isWhiteChar($c)
    {
        // strpos() with an empty needle returns 0 on PHP 8 but false (with a
        // warning) on earlier versions; answer "no" consistently instead.
        if ($c === '' || $c === null) {
            return false;
        }

        return strpos(" \t\n\r\0", $c) !== false;
    }
476
477
    /**
     * Reports a parse error at the current file and line.
     *
     * @param string $errStr error message
     *
     * @throws PHPTAL_ParserException always
     */
    protected function raiseError($errStr)
    {
        $error = new PHPTAL_ParserException($errStr, $this->_file, $this->_line);

        throw $error;
    }
481
}
482