Issues (146)

src/DOM.php (10 issues)

1
<?php
2
3
namespace QueryPath;
4
5
6
use DOMNode;
7
use QueryPath\CSS\DOMTraverser;
8
use QueryPath\Entities;
9
10
/**
11
 * Class DOM
12
 *
13
 * @package QueryPath
14
 *
15
 * @property \Traversable|array|\SplObjectStorage matches
16
 */
17
abstract class DOM implements Query, \IteratorAggregate, \Countable
18
{
19
20
    /**
21
     * The array of matches.
22
     */
23
    protected $matches = [];
24
25
    /**
26
     * Default parser flags.
27
     *
28
     * These are flags that will be used if no global or local flags override them.
29
     *
30
     * @since 2.0
31
     */
32
    public const DEFAULT_PARSER_FLAGS = NULL;
33
34
    public const JS_CSS_ESCAPE_CDATA             = '\\1';
35
    public const JS_CSS_ESCAPE_CDATA_CCOMMENT    = '/* \\1 */';
36
    public const JS_CSS_ESCAPE_CDATA_DOUBLESLASH = '// \\1';
37
    public const JS_CSS_ESCAPE_NONE              = '';
38
39
    protected $errTypes = 771; // E_ERROR | E_WARNING | E_USER_ERROR | E_USER_WARNING
40
41
    protected $document;
42
    /**
43
     * The base DOMDocument.
44
     */
45
    protected $options = [
46
        'parser_flags'                 => NULL,
47
        'omit_xml_declaration'         => false,
48
        'replace_entities'             => false,
49
        'exception_level'              => 771, // E_ERROR | E_USER_ERROR | E_USER_WARNING | E_WARNING
50
        'ignore_parser_warnings'       => false,
51
        'escape_xhtml_js_css_sections' => self::JS_CSS_ESCAPE_CDATA_CCOMMENT,
52
    ];
53
54
    /**
55
     * Constructor.
56
     *
57
     * Typically, a new DOMQuery is created by QueryPath::with(), QueryPath::withHTML(),
58
     * qp(), or htmlqp().
59
     *
60
     * @param mixed $document
61
     *   A document-like object.
62
     * @param string $string
63
     *   A CSS 3 Selector
64
     * @param array $options
65
     *   An associative array of options.
66
     * @see qp()
67
     * @throws Exception
68
     */
69
    public function __construct($document = NULL, $string = NULL, $options = [])
70
    {
71
        $string = trim($string);
0 ignored issues
show
It seems like $string can also be of type null; however, parameter $string of trim() seems to accept only string. Maybe add an additional type check? (Ignorable by Annotation)

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

71
        $string = trim(/** @scrutinizer ignore-type */ $string);
Loading history...
72
        $this->options = $options + Options::get() + $this->options;
73
74
        $parser_flags = $options['parser_flags'] ?? self::DEFAULT_PARSER_FLAGS;
75
        if (!empty($this->options['ignore_parser_warnings'])) {
76
            // Don't convert parser warnings into exceptions.
77
            $this->errTypes = 257; //E_ERROR | E_USER_ERROR;
78
        } elseif (isset($this->options['exception_level'])) {
79
            // Set the error level at which exceptions will be thrown. By default,
80
            // QueryPath will throw exceptions for
81
            // E_ERROR | E_USER_ERROR | E_WARNING | E_USER_WARNING.
82
            $this->errTypes = $this->options['exception_level'];
83
        }
84
85
        // Empty: Just create an empty QP.
86
        if (empty($document)) {
87
            $this->document = isset($this->options['encoding']) ? new \DOMDocument('1.0',
88
                $this->options['encoding']) : new \DOMDocument();
89
            $this->setMatches(new \SplObjectStorage());
90
        } // Figure out if document is DOM, HTML/XML, or a filename
91
        elseif (is_object($document)) {
92
93
            // This is the most frequent object type.
94
            if ($document instanceof \SplObjectStorage) {
95
                $this->matches = $document;
96
                if ($document->count() !== 0) {
97
                    $first = $this->getFirstMatch();
98
                    if (!empty($first->ownerDocument)) {
99
                        $this->document = $first->ownerDocument;
100
                    }
101
                }
102
            } elseif ($document instanceof self) {
103
                //$this->matches = $document->get(NULL, TRUE);
104
                $this->setMatches($document->get(NULL, true));
105
                if ($this->matches->count() > 0) {
106
                    $this->document = $this->getFirstMatch()->ownerDocument;
107
                }
108
            } elseif ($document instanceof \DOMDocument) {
109
                $this->document = $document;
110
                //$this->matches = $this->matches($document->documentElement);
111
                $this->setMatches($document->documentElement);
112
            } elseif ($document instanceof \DOMNode) {
113
                $this->document = $document->ownerDocument;
114
                //$this->matches = array($document);
115
                $this->setMatches($document);
116
            } elseif ($document instanceof \Masterminds\HTML5) {
117
                $this->document = $document;
118
                $this->setMatches($document->documentElement);
0 ignored issues
show
The property documentElement does not seem to exist on Masterminds\HTML5.
Loading history...
119
            } elseif ($document instanceof \SimpleXMLElement) {
120
                $import = dom_import_simplexml($document);
121
                $this->document = $import->ownerDocument;
122
                //$this->matches = array($import);
123
                $this->setMatches($import);
124
            } else {
125
                throw new \QueryPath\Exception('Unsupported class type: ' . get_class($document));
126
            }
127
        } elseif (is_array($document)) {
128
            //trigger_error('Detected deprecated array support', E_USER_NOTICE);
129
            if (!empty($document) && $document[0] instanceof \DOMNode) {
130
                $found = new \SplObjectStorage();
131
                foreach ($document as $item) {
132
                    $found->attach($item);
133
                }
134
                //$this->matches = $found;
135
                $this->setMatches($found);
136
                $this->document = $this->getFirstMatch()->ownerDocument;
137
            }
138
        } elseif ($this->isXMLish($document)) {
139
            // $document is a string with XML
140
            $this->document = $this->parseXMLString($document);
141
            $this->setMatches($this->document->documentElement);
142
        } else {
143
144
            // $document is a filename
145
            $context = empty($options['context']) ? NULL : $options['context'];
146
            $this->document = $this->parseXMLFile($document, $parser_flags, $context);
147
            $this->setMatches($this->document->documentElement);
148
        }
149
150
        // Globally set the output option.
151
        $this->document->formatOutput = true;
152
        if (isset($this->options['format_output']) && $this->options['format_output'] === false) {
153
            $this->document->formatOutput = false;
154
        }
155
156
        // Do a find if the second param was set.
157
        if (strlen($string) > 0) {
158
            // We don't issue a find because that creates a new DOMQuery.
159
            //$this->find($string);
160
161
            $query = new DOMTraverser($this->matches);
0 ignored issues
show
It seems like $this->matches can also be of type array; however, parameter $splos of QueryPath\CSS\DOMTraverser::__construct() seems to accept only SPLObjectStorage. Maybe add an additional type check? (Ignorable by Annotation)

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

161
            $query = new DOMTraverser(/** @scrutinizer ignore-type */ $this->matches);
Loading history...
162
            $query->find($string);
163
            $this->setMatches($query->matches());
164
        }
165
    }
166
167
    private function parseXMLString($string, $flags = NULL)
168
    {
169
        $document = new \DOMDocument('1.0');
170
        $lead = strtolower(substr($string, 0, 5)); // <?xml
171
        try {
172
            set_error_handler([ParseException::class, 'initializeFromError'], $this->errTypes);
173
174
            if (isset($this->options['convert_to_encoding'])) {
175
                // Is there another way to do this?
176
177
                $from_enc = $this->options['convert_from_encoding'] ?? 'auto';
178
                $to_enc = $this->options['convert_to_encoding'];
179
180
                if (function_exists('mb_convert_encoding')) {
181
                    $string = mb_convert_encoding($string, $to_enc, $from_enc);
182
                }
183
184
            }
185
186
            // This is to avoid cases where low ascii digits have slipped into HTML.
187
            // AFAIK, it should not adversely affect UTF-8 documents.
188
            if (!empty($this->options['strip_low_ascii'])) {
189
                $string = filter_var($string, FILTER_UNSAFE_RAW, FILTER_FLAG_ENCODE_LOW);
190
            }
191
192
            // Allow users to override parser settings.
193
            $useParser = '';
194
            if (!empty($this->options['use_parser'])) {
195
                $useParser = strtolower($this->options['use_parser']);
196
            }
197
198
            // If HTML parser is requested, we use it.
199
            if ($useParser === 'html') {
200
                $document->loadHTML($string);
201
            } // Parse as XML if it looks like XML, or if XML parser is requested.
202
            elseif ($lead === '<?xml' || $useParser === 'xml') {
203
                if ($this->options['replace_entities']) {
204
                    $string = Entities::replaceAllEntities($string);
205
                }
206
                $document->loadXML($string, $flags);
207
            } // In all other cases, we try the HTML parser.
208
            else {
209
                $document->loadHTML($string);
210
            }
211
        } // Emulate 'finally' behavior.
212
        catch (Exception $e) {
213
            restore_error_handler();
214
            throw $e;
215
        }
216
        restore_error_handler();
217
218
        if (empty($document)) {
219
            throw new \QueryPath\ParseException('Unknown parser exception.');
220
        }
221
222
        return $document;
223
    }
224
225
    /**
226
     * EXPERT: Be very, very careful using this.
227
     * A utility function for setting the current set of matches.
228
     * It makes sure the last matches buffer is set (for end() and andSelf()).
229
     *
230
     * @since 2.0
231
     * @param $matches
232
     */
233
    public function setMatches($matches)
234
    {
235
        // This causes a lot of overhead....
236
        //if ($unique) $matches = self::unique($matches);
237
        $this->last = $this->matches;
0 ignored issues
show
Bug Best Practice introduced by
The property last does not exist. Although not strictly required by PHP, it is generally a best practice to declare properties explicitly.
Loading history...
238
239
        // Just set current matches.
240
        if ($matches instanceof \SplObjectStorage) {
241
            $this->matches = $matches;
242
        } // This is likely legacy code that needs conversion.
243
        elseif (is_array($matches)) {
244
            trigger_error('Legacy array detected.');
245
            $tmp = new \SplObjectStorage();
246
            foreach ($matches as $m) {
247
                $tmp->attach($m);
248
            }
249
            $this->matches = $tmp;
250
        }
251
        // For non-arrays, try to create a new match set and
252
        // add this object.
253
        else {
254
            $found = new \SplObjectStorage();
255
            if (isset($matches)) {
256
                $found->attach($matches);
257
            }
258
            $this->matches = $found;
259
        }
260
261
        // EXPERIMENTAL: Support for qp()->length.
262
        $this->length = $this->matches->count();
0 ignored issues
show
Bug Best Practice introduced by
The property length does not exist. Although not strictly required by PHP, it is generally a best practice to declare properties explicitly.
Loading history...
263
    }
264
265
    /**
266
     * A depth-checking function. Typically, it only needs to be
267
     * invoked with the first parameter. The rest are used for recursion.
268
     *
269
     * @see deepest();
270
     * @param DOMNode $ele
271
     *  The element.
272
     * @param int $depth
273
     *  The depth gauge
274
     * @param mixed $current
275
     *  The current set.
276
     * @param int $deepest
276
     *  A reference to the greatest depth found so far.
278
     * @return array
279
     *  Returns an array of DOM nodes.
280
     */
281
    protected function deepestNode(\DOMNode $ele, $depth = 0, $current = NULL, &$deepest = NULL)
282
    {
283
        // FIXME: Should this use SplObjectStorage?
284
        if (!isset($current)) {
285
            $current = [$ele];
286
        }
287
        if (!isset($deepest)) {
288
            $deepest = $depth;
289
        }
290
        if ($ele->hasChildNodes()) {
291
            foreach ($ele->childNodes as $child) {
292
                if ($child->nodeType === XML_ELEMENT_NODE) {
293
                    $current = $this->deepestNode($child, $depth + 1, $current, $deepest);
0 ignored issues
show
It seems like $deepest can also be of type integer; however, parameter $deepest of QueryPath\DOM::deepestNode() seems to accept only DOMNode. Maybe add an additional type check? (Ignorable by Annotation)

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

293
                    $current = $this->deepestNode($child, $depth + 1, $current, /** @scrutinizer ignore-type */ $deepest);
Loading history...
294
                }
295
            }
296
        } elseif ($depth > $deepest) {
297
            $current = [$ele];
298
            $deepest = $depth;
299
        } elseif ($depth === $deepest) {
300
            $current[] = $ele;
301
        }
302
303
        return $current;
304
    }
305
306
    /**
307
     * Prepare an item for insertion into a DOM.
308
     *
309
     * This handles a variety of boilerplate tasks that need doing before an
310
     * indeterminate object can be inserted into a DOM tree.
311
     * - If item is a string, this is converted into a document fragment and returned.
312
     * - If item is a DOMQuery, then all items are retrieved and converted into
313
     *   a document fragment and returned.
314
     * - If the item is a DOMNode, it is imported into the current DOM if necessary.
315
     * - If the item is a SimpleXMLElement, it is converted into a DOM node and then
316
     *   imported.
317
     *
318
     * @param mixed $item
319
     *  Item to prepare for insert.
320
     * @return mixed
321
     *  Returns the prepared item.
322
     * @throws \QueryPath\Exception
323
     *  Thrown if the object passed in is not of a supported object type.
324
     * @throws Exception
325
     */
326
    protected function prepareInsert($item)
327
    {
328
        if (empty($item)) {
329
            return NULL;
330
        }
331
332
        if (is_string($item)) {
333
            // If configured to do so, replace all entities.
334
            if ($this->options['replace_entities']) {
335
                $item = Entities::replaceAllEntities($item);
336
            }
337
338
            $frag = $this->document->createDocumentFragment();
339
            try {
340
                set_error_handler([ParseException::class, 'initializeFromError'], $this->errTypes);
341
                $frag->appendXML($item);
342
            } // Simulate a finally block.
343
            catch (Exception $e) {
344
                restore_error_handler();
345
                throw $e;
346
            }
347
            restore_error_handler();
348
349
            return $frag;
350
        }
351
352
        if ($item instanceof self) {
353
            if ($item->count() === 0) {
354
                return NULL;
355
            }
356
357
            $frag = $this->document->createDocumentFragment();
358
            foreach ($item->matches as $m) {
359
                $frag->appendXML($item->document->saveXML($m));
360
            }
361
362
            return $frag;
363
        }
364
365
        if ($item instanceof \DOMNode) {
366
            if ($item->ownerDocument !== $this->document) {
367
                // Deep clone this and attach it to this document
368
                $item = $this->document->importNode($item, true);
369
            }
370
371
            return $item;
372
        }
373
374
        if ($item instanceof \SimpleXMLElement) {
375
            $element = dom_import_simplexml($item);
376
377
            return $this->document->importNode($element, true);
378
        }
379
        // What should we do here?
380
        //var_dump($item);
381
        throw new \QueryPath\Exception('Cannot prepare item of unsupported type: ' . gettype($item));
382
    }
383
384
    /**
385
     * Convenience function for getNthMatch(0).
386
     */
387
    protected function getFirstMatch()
388
    {
389
        $this->matches->rewind();
0 ignored issues
show
The method rewind() does not exist on Traversable. It seems like you code against a sub-type of Traversable such as Yaf_Config_Simple or Yaf\Session or SimpleXMLElement or Yaf_Session or Yaf\Config\Simple or Yaf\Config\Ini or Iterator or Yaf_Config_Ini or MongoGridFSCursor or Mockery\Generator\TestTraversableInterface2 or SplFixedArray or QueryPath\DOMQuery. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

389
        $this->matches->/** @scrutinizer ignore-call */ 
390
                        rewind();
Loading history...
390
391
        return $this->matches->current();
0 ignored issues
show
The method current() does not exist on Traversable. It seems like you code against a sub-type of Traversable such as IntlCodePointBreakIterator or Yaf_Config_Simple or Yaf\Session or SimpleXMLElement or IntlRuleBasedBreakIterator or Yaf_Session or Yaf\Config\Simple or Yaf\Config\Ini or Iterator or Yaf_Config_Ini or MongoGridFSCursor or Mockery\Generator\TestTraversableInterface2 or SplFixedArray or IntlBreakIterator or QueryPath\DOMQuery. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

391
        return $this->matches->/** @scrutinizer ignore-call */ current();
Loading history...
392
    }
393
394
    /**
395
     * Parse an XML or HTML file.
396
     *
397
     * This attempts to autodetect the type of file, and then parse it.
398
     *
399
     * @param string $filename
400
     *  The file name to parse.
401
     * @param int $flags
402
     *  The OR-combined flags accepted by the DOM parser. See the PHP documentation
403
     *  for DOM or for libxml.
404
     * @param resource $context
405
     *  The stream context for the file IO. If this is set, then an alternate
406
     *  parsing path is followed: The file is loaded by PHP's stream-aware IO
407
     *  facilities, read entirely into memory, and then handed off to
408
     *  {@link parseXMLString()}. On large files, this can have a performance impact.
409
     * @throws \QueryPath\ParseException
410
     *  Thrown when a file cannot be loaded or parsed.
411
     */
412
    private function parseXMLFile($filename, $flags = NULL, $context = NULL)
413
    {
414
415
        // If a context is specified, we basically have to do the reading in
416
        // two steps:
417
        if (!empty($context)) {
418
            try {
419
                set_error_handler(['\QueryPath\ParseException', 'initializeFromError'], $this->errTypes);
420
                $contents = file_get_contents($filename, false, $context);
421
            }
422
                // No 'finally' block is used here, so we have to restore the error
423
                // handler this way:
424
            catch (Exception $e) {
425
                restore_error_handler();
426
                throw $e;
427
            }
428
            restore_error_handler();
429
430
            if ($contents == false) {
0 ignored issues
show
Bug Best Practice introduced by
It seems like you are loosely comparing $contents of type string to the boolean false. If you are specifically checking for an empty string, consider using the more explicit === '' instead.
Loading history...
431
                throw new \QueryPath\ParseException(sprintf('Contents of the file %s could not be retrieved.',
432
                    $filename));
433
            }
434
435
            return $this->parseXMLString($contents, $flags);
436
        }
437
438
        $document = new \DOMDocument();
439
        $lastDot = strrpos($filename, '.');
440
441
        $htmlExtensions = [
442
            '.html' => 1,
443
            '.htm'  => 1,
444
        ];
445
446
        // Allow users to override parser settings.
447
        if (empty($this->options['use_parser'])) {
448
            $useParser = '';
449
        } else {
450
            $useParser = strtolower($this->options['use_parser']);
451
        }
452
453
        $ext = $lastDot !== false ? strtolower(substr($filename, $lastDot)) : '';
454
455
        try {
456
            set_error_handler([ParseException::class, 'initializeFromError'], $this->errTypes);
457
458
            // If the parser is explicitly set to XML, use that parser.
459
            if ($useParser === 'xml') {
460
                $document->load($filename, $flags);
461
            } // Otherwise, see if it looks like HTML.
462
            elseif ($useParser === 'html' || isset($htmlExtensions[$ext])) {
463
                // Try parsing it as HTML.
464
                $document->loadHTMLFile($filename);
465
            } // Default to XML.
466
            else {
467
                $document->load($filename, $flags);
468
            }
469
470
        } // Emulate 'finally' behavior.
471
        catch (Exception $e) {
472
            restore_error_handler();
473
            throw $e;
474
        }
475
        restore_error_handler();
476
477
        return $document;
478
    }
479
480
    /**
481
     * Determine whether a given string looks like XML or not.
482
     *
483
     * Basically, this scans a portion of the supplied string, checking to see
484
     * if it has a tag-like structure. It is possible to "confuse" this, which
485
     * may subsequently result in parse errors, but in the vast majority of
486
     * cases, this method serves as a valid inicator of whether or not the
487
     * content looks like XML.
488
     *
489
     * Things that are intentionally excluded:
490
     * - plain text with no markup.
491
     * - strings that look like filesystem paths.
492
     *
493
     * Subclasses SHOULD NOT OVERRIDE THIS. Altering it may be altering
494
     * core assumptions about how things work. Instead, classes should
495
     * override the constructor and pass in only one of the parsed types
496
     * that this class expects.
497
     */
498
    protected function isXMLish($string)
499
    {
500
        return (strpos($string, '<') !== false && strpos($string, '>') !== false);
501
    }
502
503
    /**
504
     * A utility function for retrieving a match by index.
505
     *
506
     * The internal data structure used in DOMQuery does not have
507
     * strong random access support, so we supplement it with this method.
508
     *
509
     * @param $index
510
     * @return object|void
511
     */
512
    protected function getNthMatch(int $index)
513
    {
514
        if ($index < 0 || $index > $this->matches->count()) {
0 ignored issues
show
The method count() does not exist on Traversable. It seems like you code against a sub-type of Traversable such as parallel\Events or Yaf_Config_Simple or Yaf\Session or Threaded or Volatile or MockeryTest_InterfaceWithTraversable or SimpleXMLElement or Thread or Yaf_Session or Ds\Collection or pq\Result or Yaf\Config\Simple or Yaf\Config\Ini or Worker or Yaf_Config_Ini or MongoGridFSCursor or DOMNodeList or PharIo\Manifest\AuthorCollection or SplFixedArray or QueryPath\CSS\Selector or QueryPath\DOM or WeakMap or DOMNamedNodeMap or PharIo\Manifest\BundledComponentCollection or SebastianBergmann\CodeCoverage\Node\Directory or ArrayObject or PHPUnit\Framework\TestSuite or PharIo\Manifest\RequirementCollection or ResourceBundle or SplDoublyLinkedList or HttpMessage or HttpRequestPool or RdKafka\Metadata\Collection or Yaf_Config_Simple or SplFixedArray or SplObjectStorage or Yaf\Session or SQLiteResult or Cassandra\UserTypeValue or Imagick or Cassandra\Collection or SimpleXMLElement or TheSeer\Tokenizer\TokenCollection or http\Message or Yaf_Session or SplPriorityQueue or Cassandra\Rows or Cassandra\Map or Yaf\Config\Simple or Yaf\Config\Ini or MongoCursor or Yaf_Config_Ini or SplHeap or Cassandra\Set or MongoGridFSCursor or Cassandra\Tuple or CachingIterator or PHP_Token_Stream or Phar or ArrayIterator or GlobIterator or Phar or Phar or RecursiveCachingIterator or SimpleXMLElement or RecursiveArrayIterator or SimpleXMLIterator or Phar. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

514
        if ($index < 0 || $index > $this->matches->/** @scrutinizer ignore-call */ count()) {
Loading history...
515
            return;
516
        }
517
518
        $i = 0;
519
        foreach ($this->matches as $m) {
520
            if ($i++ === $index) {
521
                return $m;
522
            }
523
        }
524
    }
525
}