Issues (146)

src/DOM.php (3 issues)

1
<?php
2
3
namespace QueryPath;
4
5
6
use DOMNode;
7
use QueryPath\CSS\DOMTraverser;
8
use QueryPath\Entities;
9
10
/**
11
 * Class DOM
12
 *
13
 * @package QueryPath
14
 *
15
 * @property \Traversable|array|\SplObjectStorage matches
16
 */
17
abstract class DOM implements Query, \IteratorAggregate, \Countable
18
{
19
20
    /**
     * The current set of matches.
     *
     * Starts out as an empty array and is replaced by an SplObjectStorage
     * once setMatches() has been called.
     */
    protected $matches = [];

    /**
     * Default parser flags.
     *
     * These are flags that will be used if no global or local flags override them.
     *
     * @since 2.0
     */
    public const DEFAULT_PARSER_FLAGS = NULL;

    public const JS_CSS_ESCAPE_CDATA             = '\\1';
    public const JS_CSS_ESCAPE_CDATA_CCOMMENT    = '/* \\1 */';
    public const JS_CSS_ESCAPE_CDATA_DOUBLESLASH = '// \\1';
    public const JS_CSS_ESCAPE_NONE              = '';

    /**
     * Bitmask of PHP error levels handed to set_error_handler() while parsing.
     *
     * 771 = E_ERROR | E_WARNING | E_USER_ERROR | E_USER_WARNING.
     */
    protected $errTypes = 771;

    /**
     * The base DOMDocument.
     */
    protected $document;

    /**
     * Built-in option defaults.
     *
     * The constructor merges these underneath the global options and the
     * options passed by the caller, so they only apply when nothing else
     * sets the same key.
     */
    protected $options = [
        'parser_flags'                 => NULL,
        'omit_xml_declaration'         => false,
        'replace_entities'             => false,
        'exception_level'              => 771, // E_ERROR | E_USER_ERROR | E_USER_WARNING | E_WARNING
        'ignore_parser_warnings'       => false,
        'escape_xhtml_js_css_sections' => self::JS_CSS_ESCAPE_CDATA_CCOMMENT,
    ];
53
54
    /**
     * Constructor.
     *
     * Typically, a new DOMQuery is created by QueryPath::with(), QueryPath::withHTML(),
     * qp(), or htmlqp().
     *
     * The source is resolved in this order: an empty value yields a fresh, empty
     * document; an object must be an SplObjectStorage of nodes, another DOM
     * instance, a DOMDocument, a DOMNode, a Masterminds HTML5 object or a
     * SimpleXMLElement; an array is accepted when its first entry is a DOMNode;
     * a string containing both '<' and '>' is parsed as markup; any other value
     * is treated as a filename.
     *
     * @param mixed $document
     *   A document-like object.
     * @param string|null $string
     *   A CSS 3 Selector. When non-empty, the initial matches are narrowed to it.
     * @param array $options
     *   An associative array of options.
     * @see qp()
     * @throws Exception
     *   Thrown when $document is an object of an unsupported class.
     */
    public function __construct($document = NULL, $string = NULL, $options = [])
    {
        // Passing NULL to trim() is deprecated as of PHP 8.1; normalise first.
        $string = trim((string) $string);
        $this->options = $options + Options::get() + $this->options;

        $parser_flags = $options['parser_flags'] ?? self::DEFAULT_PARSER_FLAGS;
        if (!empty($this->options['ignore_parser_warnings'])) {
            // Don't convert parser warnings into exceptions.
            $this->errTypes = 257; //E_ERROR | E_USER_ERROR;
        } elseif (isset($this->options['exception_level'])) {
            // Set the error level at which exceptions will be thrown. By default,
            // QueryPath will throw exceptions for
            // E_ERROR | E_USER_ERROR | E_WARNING | E_USER_WARNING.
            $this->errTypes = $this->options['exception_level'];
        }

        // Empty: Just create an empty QP.
        if (empty($document)) {
            $this->document = isset($this->options['encoding'])
                ? new \DOMDocument('1.0', $this->options['encoding'])
                : new \DOMDocument();
            $this->setMatches(new \SplObjectStorage());
        } // Figure out if document is DOM, HTML/XML, or a filename
        elseif (is_object($document)) {

            // This is the most frequent object type.
            if ($document instanceof \SplObjectStorage) {
                $this->matches = $document;
                if ($document->count() !== 0) {
                    $first = $this->getFirstMatch();
                    if (!empty($first->ownerDocument)) {
                        $this->document = $first->ownerDocument;
                    }
                }
            } elseif ($document instanceof self) {
                $this->setMatches($document->get(NULL, true));
                if ($this->matches->count() > 0) {
                    $this->document = $this->getFirstMatch()->ownerDocument;
                }
            } elseif ($document instanceof \DOMDocument) {
                $this->document = $document;
                $this->setMatches($document->documentElement);
            } elseif ($document instanceof \DOMNode) {
                $this->document = $document->ownerDocument;
                $this->setMatches($document);
            } elseif ($document instanceof \Masterminds\HTML5) {
                $this->document = $document;
                $this->setMatches($document->documentElement);
            } elseif ($document instanceof \SimpleXMLElement) {
                $import = dom_import_simplexml($document);
                $this->document = $import->ownerDocument;
                $this->setMatches($import);
            } else {
                throw new \QueryPath\Exception('Unsupported class type: ' . get_class($document));
            }
        } elseif (is_array($document)) {
            //trigger_error('Detected deprecated array support', E_USER_NOTICE);
            if (!empty($document) && $document[0] instanceof \DOMNode) {
                $found = new \SplObjectStorage();
                foreach ($document as $item) {
                    $found->attach($item);
                }
                $this->setMatches($found);
                $this->document = $this->getFirstMatch()->ownerDocument;
            }
        } elseif ($this->isXMLish($document)) {
            // $document is a string with XML
            $this->document = $this->parseXMLString($document);
            $this->setMatches($this->document->documentElement);
        } else {

            // $document is a filename
            $context = empty($options['context']) ? NULL : $options['context'];
            $this->document = $this->parseXMLFile($document, $parser_flags, $context);
            $this->setMatches($this->document->documentElement);
        }

        // Globally set the output option. Several branches above can finish
        // without a document (an empty SplObjectStorage, an empty DOM instance,
        // an array that holds no DOM nodes); assigning a property on NULL is an
        // Error on PHP 8, so only configure a document that actually exists.
        if ($this->document !== NULL) {
            $formatDisabled = isset($this->options['format_output'])
                && $this->options['format_output'] === false;
            $this->document->formatOutput = !$formatDisabled;
        }

        // Do a find if the second param was set.
        if (strlen($string) > 0) {
            // We don't issue a find because that creates a new DOMQuery.
            $query = new DOMTraverser($this->matches);
            $query->find($string);
            $this->setMatches($query->matches());
        }
    }
166
167
    /**
     * Parse a string of XML or HTML markup into a new DOMDocument.
     *
     * The parser is chosen as follows: the 'use_parser' option wins when it is
     * 'html' or 'xml'; otherwise a string that starts with an XML declaration
     * is parsed as XML and everything else is parsed as HTML.
     *
     * While parsing, PHP errors matching $this->errTypes are routed to
     * ParseException::initializeFromError (presumably raising a
     * ParseException -- confirm against that class).
     *
     * @param string $string
     *  The markup to parse.
     * @param int|null $flags
     *  libxml option flags. Only used by the XML parser; NULL means none.
     * @return \DOMDocument
     *  The document the markup was loaded into.
     */
    private function parseXMLString($string, $flags = NULL)
    {
        $document = new \DOMDocument('1.0');
        $lead = strtolower(substr($string, 0, 5)); // <?xml

        set_error_handler([ParseException::class, 'initializeFromError'], $this->errTypes);
        try {
            if (isset($this->options['convert_to_encoding'])) {
                // Is there another way to do this?
                $from_enc = $this->options['convert_from_encoding'] ?? 'auto';
                $to_enc = $this->options['convert_to_encoding'];

                if (function_exists('mb_convert_encoding')) {
                    $string = mb_convert_encoding($string, $to_enc, $from_enc);
                }
            }

            // This is to avoid cases where low ascii digits have slipped into HTML.
            // AFAIK, it should not adversely affect UTF-8 documents.
            if (!empty($this->options['strip_low_ascii'])) {
                $string = filter_var($string, FILTER_UNSAFE_RAW, FILTER_FLAG_ENCODE_LOW);
            }

            // Allow users to override parser settings.
            $useParser = '';
            if (!empty($this->options['use_parser'])) {
                $useParser = strtolower($this->options['use_parser']);
            }

            // Parse as XML only when HTML was not explicitly requested and the
            // input either looks like XML or the XML parser was requested.
            $parseAsXml = $useParser !== 'html' && ($lead === '<?xml' || $useParser === 'xml');

            if ($parseAsXml) {
                if (!empty($this->options['replace_entities'])) {
                    $string = Entities::replaceAllEntities($string);
                }
                // Passing NULL for the integer options argument is deprecated
                // as of PHP 8.1, so fall back to "no flags".
                $document->loadXML($string, $flags ?? 0);
            } else {
                $document->loadHTML($string);
            }
        } finally {
            // Always put the previous handler back, whatever was thrown. The
            // old catch block only matched \QueryPath\Exception, so any other
            // throwable left the parser handler installed.
            restore_error_handler();
        }

        return $document;
    }
224
225
    /**
     * EXPERT: Be very, very careful using this.
     *
     * Replaces the current set of matches. The outgoing set is first copied to
     * $this->last (the buffer behind end() and andSelf()), and $this->length is
     * refreshed afterwards.
     *
     * @since 2.0
     * @param mixed $matches
     *  An SplObjectStorage is used as-is. An array is converted item by item
     *  (and triggers a "Legacy array detected." notice). Any other non-NULL
     *  value becomes the single member of a new set; NULL yields an empty set.
     */
    public function setMatches($matches)
    {
        // De-duplicating here was considered too expensive, so the incoming
        // set is taken as given.
        $this->last = $this->matches;

        if ($matches instanceof \SplObjectStorage) {
            // Just set current matches.
            $storage = $matches;
        } elseif (is_array($matches)) {
            // This is likely legacy code that needs conversion.
            trigger_error('Legacy array detected.');
            $storage = new \SplObjectStorage();
            foreach ($matches as $node) {
                $storage->attach($node);
            }
        } else {
            // For non-arrays, wrap the single object (if any) in a new set.
            $storage = new \SplObjectStorage();
            if (isset($matches)) {
                $storage->attach($matches);
            }
        }

        $this->matches = $storage;

        // EXPERIMENTAL: Support for qp()->length.
        $this->length = $this->matches->count();
    }
264
265
    /**
     * A depth-checking function. Typically, it only needs to be
     * invoked with the first parameter. The rest are used for recursion.
     *
     * Only element nodes count towards depth. An element that has no element
     * children is a leaf, even when it contains text or comments.
     *
     * @see deepest();
     * @param DOMNode $ele
     *  The element.
     * @param int $depth
     *  The depth gauge: the depth of $ele relative to the starting element.
     * @param mixed $current
     *  The current set of deepest elements found so far.
     * @param int|null $deepest
     *  A reference to the greatest depth found so far. It is updated in place
     *  so the caller can read the final depth after the call.
     * @return array
     *  Returns an array of DOM nodes.
     */
    protected function deepestNode(\DOMNode $ele, $depth = 0, $current = NULL, &$deepest = NULL)
    {
        // FIXME: Should this use SplObjectStorage?
        $isEntryCall = !isset($current);
        if ($isEntryCall) {
            $current = [$ele];
        }
        if (!isset($deepest)) {
            $deepest = $depth;
        }

        // Descend into element children only. Track whether there were any:
        // hasChildNodes() alone is also true for text-only elements, which
        // previously stopped such elements from ever being treated as leaves.
        $hasElementChild = false;
        if ($ele->hasChildNodes()) {
            foreach ($ele->childNodes as $child) {
                if ($child->nodeType === XML_ELEMENT_NODE) {
                    $hasElementChild = true;
                    $current = $this->deepestNode($child, $depth + 1, $current, $deepest);
                }
            }
        }

        if ($hasElementChild) {
            return $current;
        }

        if ($depth > $deepest) {
            // A new record depth: start the result set over.
            $current = [$ele];
            $deepest = $depth;
        } elseif ($depth === $deepest && !$isEntryCall) {
            // Ties are collected. The entry node was already seeded into
            // $current above, so it is not added a second time.
            $current[] = $ele;
        }

        return $current;
    }
305
306
    /**
     * Prepare an item for insertion into a DOM.
     *
     * This handles a variety of boilerplate tasks that need doing before an
     * indeterminate object can be inserted into a DOM tree.
     * - If item is a string, this is converted into a document fragment and returned.
     * - If item is a DOMQuery, then all items are retrieved and converted into
     *   a document fragment and returned.
     * - If the item is a DOMNode, it is imported into the current DOM if necessary.
     * - If the item is a SimpleXMLElement, it is converted into a DOM node and then
     *   imported.
     *
     * @param mixed $item
     *  Item to prepare for insert.
     * @return mixed
     *  Returns the prepared item, or NULL when there is nothing to insert
     *  (an empty value or a DOM instance without matches).
     * @throws \QueryPath\Exception
     *  Thrown if the object passed in is not of a supported object type.
     */
    protected function prepareInsert($item)
    {
        // empty() also reports the string '0' as empty, but that is perfectly
        // valid content to insert, so let it through.
        if ($item !== '0' && empty($item)) {
            return NULL;
        }

        if (is_string($item)) {
            // If configured to do so, replace all entities.
            if (!empty($this->options['replace_entities'])) {
                $item = Entities::replaceAllEntities($item);
            }

            $frag = $this->document->createDocumentFragment();
            set_error_handler([ParseException::class, 'initializeFromError'], $this->errTypes);
            try {
                $frag->appendXML($item);
            } finally {
                // Restore the previous handler no matter what was thrown; the
                // old catch block only matched \QueryPath\Exception.
                restore_error_handler();
            }

            return $frag;
        }

        if ($item instanceof self) {
            if ($item->count() === 0) {
                return NULL;
            }

            // Serialise each match from its own document and re-parse it into
            // a fragment that belongs to this document.
            $frag = $this->document->createDocumentFragment();
            foreach ($item->matches as $m) {
                $frag->appendXML($item->document->saveXML($m));
            }

            return $frag;
        }

        if ($item instanceof \DOMNode) {
            if ($item->ownerDocument !== $this->document) {
                // Deep clone this and attach it to this document
                $item = $this->document->importNode($item, true);
            }

            return $item;
        }

        if ($item instanceof \SimpleXMLElement) {
            $element = dom_import_simplexml($item);

            return $this->document->importNode($element, true);
        }

        throw new \QueryPath\Exception('Cannot prepare item of unsupported type: ' . gettype($item));
    }
383
384
    /**
     * Return the first item in the current set of matches.
     *
     * Rewinds the match storage's internal iterator as a side effect.
     */
    protected function getFirstMatch()
    {
        $storage = $this->matches;
        $storage->rewind();

        return $storage->current();
    }
393
394
    /**
     * Parse an XML or HTML file.
     *
     * This attempts to autodetect the type of file, and then parse it.
     *
     * @param string $filename
     *  The file name to parse.
     * @param int|null $flags
     *  The OR-combined flags accepted by the DOM parser. See the PHP documentation
     *  for DOM or for libxml. NULL means no flags.
     * @param resource $context
     *  The stream context for the file IO. If this is set, then an alternate
     *  parsing path is followed: The file is loaded by PHP's stream-aware IO
     *  facilities, read entirely into memory, and then handed off to
     *  {@link parseXMLString()}. On large files, this can have a performance impact.
     * @return \DOMDocument
     *  The document the file was loaded into.
     * @throws \QueryPath\ParseException
     *  Thrown when a file cannot be loaded or parsed.
     */
    private function parseXMLFile($filename, $flags = NULL, $context = NULL)
    {
        // If a context is specified, we basically have to do the reading in
        // two steps:
        if (!empty($context)) {
            set_error_handler(['\QueryPath\ParseException', 'initializeFromError'], $this->errTypes);
            try {
                $contents = file_get_contents($filename, false, $context);
            } finally {
                // Restore the previous handler whatever was thrown.
                restore_error_handler();
            }

            // Reject a failed read and an empty body explicitly. The former
            // loose "== false" test also rejected the valid content '0'.
            if ($contents === false || $contents === '') {
                throw new \QueryPath\ParseException(sprintf('Contents of the file %s could not be retrieved.',
                    $filename));
            }

            return $this->parseXMLString($contents, $flags);
        }

        $document = new \DOMDocument();

        $htmlExtensions = [
            '.html' => 1,
            '.htm'  => 1,
        ];

        // Allow users to override parser settings.
        $useParser = empty($this->options['use_parser'])
            ? ''
            : strtolower($this->options['use_parser']);

        // Extension including the leading dot, lower-cased; '' when there is none.
        $lastDot = strrpos($filename, '.');
        $ext = $lastDot !== false ? strtolower(substr($filename, $lastDot)) : '';

        // An explicit 'xml' parser always wins. Otherwise HTML is used when it
        // is requested or the extension looks like HTML. XML is the default.
        $parseAsHtml = $useParser !== 'xml'
            && ($useParser === 'html' || isset($htmlExtensions[$ext]));

        set_error_handler([ParseException::class, 'initializeFromError'], $this->errTypes);
        try {
            if ($parseAsHtml) {
                $document->loadHTMLFile($filename);
            } else {
                // Passing NULL for the integer options argument is deprecated
                // as of PHP 8.1, so fall back to "no flags".
                $document->load($filename, $flags ?? 0);
            }
        } finally {
            // The old catch block only matched \QueryPath\Exception and left
            // the parser handler installed for any other throwable.
            restore_error_handler();
        }

        return $document;
    }
479
480
    /**
     * Determine whether a given string looks like XML or not.
     *
     * The check is deliberately cheap: the string qualifies as soon as it
     * contains both a '<' and a '>' somewhere, in any order. It is possible to
     * "confuse" this, which may subsequently result in parse errors, but in
     * the vast majority of cases it is a valid indicator of whether or not
     * the content looks like XML.
     *
     * Things that are intentionally excluded:
     * - plain text with no markup.
     * - strings that look like filesystem paths.
     *
     * Subclasses SHOULD NOT OVERRIDE THIS. Altering it may be altering
     * core assumptions about how things work. Instead, classes should
     * override the constructor and pass in only one of the parsed types
     * that this class expects.
     */
    protected function isXMLish($string)
    {
        if (strpos($string, '<') === false) {
            return false;
        }

        return strpos($string, '>') !== false;
    }
502
503
    /**
     * A utility function for retrieving a match by index.
     *
     * The internal data structure used in DOMQuery does not have
     * strong random access support, so we supplement it with this method.
     *
     * @param int $index
     *  Zero-based position in the current set of matches.
     * @return object|null
     *  The match at that position, or NULL when the index is out of range.
     */
    protected function getNthMatch(int $index)
    {
        // Valid indexes run from 0 to count() - 1. The old guard used ">" and
        // let $index === count() fall through to a pointless full iteration.
        if ($index < 0 || $index >= $this->matches->count()) {
            return NULL;
        }

        $i = 0;
        foreach ($this->matches as $m) {
            if ($i++ === $index) {
                return $m;
            }
        }

        return NULL;
    }
525
}