Parser::elementName()   B
last analyzed

Complexity

Conditions 7
Paths 12

Size

Total Lines 32
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 23
nc 12
nop 0
dl 0
loc 32
rs 8.6186
c 0
b 0
f 0
1
<?php
2
/**
3
 * @file
4
 *
5
 * The CSS parser
6
 */
7
8
namespace QueryPath\CSS;
9
10
use QueryPath\Exception;
11
12
/**
13
 * Parse a CSS selector.
14
 *
15
 * In CSS, a selector is used to identify which element or elements
16
 * in a DOM are being selected for the application of a particular style.
17
 * Effectively, selectors function as a query language for a structured
18
 * document -- almost always HTML or XML.
19
 *
20
 * This class provides an event-based parser for CSS selectors. It can be
21
 * used, for example, as a basis for writing a DOM query engine based on
22
 * CSS.
23
 *
24
 * @ingroup querypath_css
25
 */
26
class Parser
27
{
28
    /**
     * Scanner that tokenizes the selector string.
     *
     * @var Scanner
     */
    protected $scanner;

    /**
     * Scratch buffer. None of the methods in this class read or write it.
     *
     * @var string
     */
    protected $buffer = '';

    /**
     * Receiver of the parse events fired by this parser.
     *
     * @var EventHandler
     */
    protected $handler;

    /**
     * When true, a leading '@' inside an attribute selector is rejected
     * instead of being silently skipped.
     *
     * @var bool
     */
    private $strict = false;

    /**
     * When true, the parsing methods echo a trace of what they are doing.
     *
     * @var bool
     */
    protected $DEBUG = false;

    /**
     * The selector string exactly as it was handed to the constructor.
     *
     * Declared explicitly because assigning to an undeclared property
     * creates a dynamic property, which is deprecated as of PHP 8.2. It is
     * public because the dynamically created property was public as well.
     *
     * @var string|null
     */
    public $originalString;

    /**
     * Construct a new CSS parser object.
     *
     * The constructor only wires up the scanner and the handler; no parsing
     * happens until parse() is called. While parsing, events are sent to
     * the given EventHandler implementation.
     *
     * @param string $string
     *  The CSS selector to parse.
     * @param EventHandler $handler
     *  The object that receives the parse events.
     */
    public function __construct(string $string, EventHandler $handler)
    {
        $this->originalString = $string;
        $this->scanner = new Scanner(new InputStream($string));
        $this->handler = $handler;
    }
50
51
    /**
     * Parse the selector.
     *
     * Runs the event-based parse: selector() is invoked repeatedly until the
     * scanner reports no further token, and the EventHandler receives an
     * event for every construct that is recognized along the way.
     *
     * @throws ParseException
     *  If a full pass over the grammar consumed no input, i.e. the
     *  remaining input matches no recognizable pattern.
     * @throws Exception
     */
    public function parse(): void
    {
        $this->scanner->nextToken();

        while (false !== $this->scanner->token) {
            // Remember where this pass started so a stalled parser can be detected.
            $startPosition = $this->scanner->position();

            if ($this->DEBUG) {
                echo 'PARSE ' . $this->scanner->token . PHP_EOL;
            }

            $this->selector();

            $madeProgress = $this->scanner->position() !== $startPosition;
            if ($madeProgress || false === $this->scanner->token) {
                continue;
            }

            // Not a single character was taken off the input stream during a
            // complete run of the grammar, so looping again would never end.
            throw new ParseException('CSS selector is not well formed.');
        }
    }
84
85
    /**
86
     * A restricted parser that can only parse simple selectors.
87
     * The pseudoClass handler for this parser will throw an
88
     * exception if it encounters a pseudo-element or the
89
     * negation pseudo-class.
90
     *
91
     * @deprecated This is not used anywhere in QueryPath and
92
     *  may be removed.
93
     *//*
94
  public function parseSimpleSelector() {
95
    while ($this->scanner->token !== FALSE) {
96
      if ($this->DEBUG) print "SIMPLE SELECTOR\n";
97
      $this->allElements();
98
      $this->elementName();
99
      $this->elementClass();
100
      $this->elementID();
101
      $this->pseudoClass(TRUE); // Operate in restricted mode.
102
      $this->attribute();
103
104
      // TODO: Need to add failure conditions here.
105
    }
106
  }*/
107
108
    /**
     * Handle one complete selector step.
     *
     * Skips leading whitespace, then parses a run of simple selectors
     * followed by an optional combinator.
     *
     * @throws ParseException
     * @throws Exception
     */
    private function selector(): void
    {
        if ($this->DEBUG) {
            echo 'SELECTOR' . $this->scanner->position() . PHP_EOL;
        }

        // Leading whitespace carries no meaning at the start of a selector.
        $this->consumeWhitespace();

        $this->simpleSelectors();
        $this->combinator();
    }
124
125
    /**
     * Skip over consecutive whitespace tokens.
     *
     * @return int
     *  The number of whitespace tokens that were skipped (0 if the current
     *  token is not whitespace).
     *
     * @throws \QueryPath\CSS\ParseException
     * @throws Exception
     */
    private function consumeWhitespace(): int
    {
        if ($this->DEBUG) {
            echo 'CONSUME WHITESPACE' . PHP_EOL;
        }

        // Advance the scanner once per whitespace token, counting as we go.
        for ($skipped = 0; $this->scanner->token === Token::WHITE; ++$skipped) {
            $this->scanner->nextToken();
        }

        return $skipped;
    }
145
146
    /**
     * Handle one of the five combinators: '>', '+', ' ', '~', and ','.
     * This will call the appropriate event handler.
     *
     * ' ' and ' > ' are both valid, so the amount of whitespace consumed
     * before the next token is tracked: whitespace only counts as the
     * descendant combinator when no explicit combinator follows it.
     *
     * @see EventHandler::directDescendant()
     * @see EventHandler::adjacent()
     * @see EventHandler::sibling()
     * @see EventHandler::anyDescendant()
     * @see EventHandler::anotherSelector()
     *
     * @throws ParseException
     *  If an explicit combinator is directly followed by another one.
     * @throws \QueryPath\Exception
     */
    private function combinator(): void
    {
        if ($this->DEBUG) {
            echo 'COMBINATOR' . PHP_EOL;
        }

        $white = $this->consumeWhitespace();
        $t = $this->scanner->token;

        // Notify the handler about an explicit combinator, if there is one.
        if ($t === Token::RANGLE) {
            $this->handler->directDescendant();
        } elseif ($t === Token::PLUS) {
            $this->handler->adjacent();
        } elseif ($t === Token::COMMA) {
            $this->handler->anotherSelector();
        } elseif ($t === Token::TILDE) {
            $this->handler->sibling();
        } else {
            // No explicit combinator: any whitespace seen is the descendant combinator.
            if ($white > 0) {
                if ($this->DEBUG) {
                    echo 'COMBINATOR: any descendant' . PHP_EOL;
                }
                $this->handler->anyDescendant();
            } elseif ($this->DEBUG) {
                echo 'COMBINATOR: no combinator found.' . PHP_EOL;
            }

            return;
        }

        // Step past the combinator token itself.
        $this->scanner->nextToken();

        if ($this->DEBUG) {
            echo 'COMBINATOR: ' . Token::name($t) . "\n";
        }

        // Two combinators in a row (ignoring padding whitespace) are illegal.
        $this->consumeWhitespace();
        if ($this->isCombinator($this->scanner->token)) {
            throw new ParseException('Illegal combinator: Cannot have two combinators in sequence.');
        }
    }
216
217
    /**
     * Tell whether a token is one of the explicit combinator tokens
     * ('+', '>', ',' or '~').
     *
     * @param int $tok
     *  The token to test.
     * @return bool
     *  True if the token is an explicit combinator.
     */
    private function isCombinator(int $tok): bool
    {
        return $tok === Token::PLUS
            || $tok === Token::RANGLE
            || $tok === Token::COMMA
            || $tok === Token::TILDE;
    }
227
228
    /**
     * Handle a run of simple selectors.
     *
     * Each sub-parser looks at the current token and does nothing unless
     * the token is the one it is responsible for. They are tried once
     * each, in a fixed order.
     *
     * @throws ParseException
     */
    private function simpleSelectors(): void
    {
        if ($this->DEBUG) {
            echo 'SIMPLE SELECTOR' . PHP_EOL;
        }

        $this->allElements();   // '*', '*|*', '*|name'
        $this->elementName();   // 'name', '|name', 'ns|name', 'ns|*'
        $this->elementClass();  // '.class'
        $this->elementID();     // '#id'
        $this->pseudoClass();   // ':pseudo', '::pseudo'
        $this->attribute();     // '[attr]', '[attr=value]'
    }
245
246
    /**
     * Handle a CSS ID selector ('#id').
     * This will call EventHandler::elementID() with the parsed ID.
     *
     * Does nothing when the current token is not '#'.
     *
     * @throws \QueryPath\CSS\ParseException
     *  If the '#' is not followed by a character token.
     * @throws Exception
     */
    private function elementID(): void
    {
        if ($this->DEBUG) {
            echo 'ELEMENT ID' . PHP_EOL;
        }

        if ($this->scanner->token !== Token::OCTO) {
            return;
        }

        // Move past the '#'; a name has to start right here.
        $this->scanner->nextToken();
        if ($this->scanner->token !== Token::CHAR) {
            throw new ParseException('Expected string after #');
        }

        $this->handler->elementID($this->scanner->getNameString());
    }
268
269
    /**
     * Handle a CSS class selector ('.class').
     * This will call the EventHandler::elementClass() method with the
     * parsed class name.
     *
     * Does nothing when the current token is not '.'.
     *
     * @throws \QueryPath\CSS\ParseException
     * @throws Exception
     */
    private function elementClass(): void
    {
        if ($this->DEBUG) {
            print 'ELEMENT CLASS' . PHP_EOL;
        }

        // Strict comparison, like every other token check in this class:
        // the token is false at the end of the input, and a loose comparison
        // of false against an integer token constant is not reliable.
        if ($this->scanner->token === Token::DOT) {
            $this->scanner->nextToken();
            // We are very fault tolerant here: whitespace between the dot and
            // the class name is skipped, although it should probably be an error.
            $this->consumeWhitespace();
            $cssClass = $this->scanner->getNameString();
            $this->handler->elementClass($cssClass);
        }
    }
285
286
    /**
     * Handle a pseudo-class (':name') or a pseudo-element ('::name').
     *
     * CSS 3 separates pseudo-elements from pseudo-classes by using '::'
     * instead of ':'. A pseudo-element is reported through
     * EventHandler::pseudoElement(); a pseudo-class is reported through
     * EventHandler::pseudoClass() together with its parenthesized value,
     * or null if it has none.
     *
     * Does nothing when the current token is not ':'.
     *
     * @param bool $restricted
     *  When true, the 'not' pseudo-class and all pseudo-elements are
     *  rejected with a ParseException.
     *
     * @throws ParseException
     * @throws Exception
     */
    private function pseudoClass($restricted = false): void
    {
        if ($this->DEBUG) {
            echo 'PSEUDO-CLASS' . PHP_EOL;
        }

        if ($this->scanner->token !== Token::COLON) {
            return;
        }

        // A second colon directly after the first marks a CSS 3 pseudo-element.
        $isPseudoElement = $this->scanner->nextToken() === Token::COLON;
        if ($isPseudoElement) {
            $this->scanner->nextToken();
        }

        $name = $this->scanner->getNameString();
        if ($restricted && $name === 'not') {
            throw new ParseException("The 'not' pseudo-class is illegal in this context.");
        }

        // Only pseudo-classes may carry an argument list.
        $value = null;
        if ($this->scanner->token === Token::LPAREN) {
            if ($isPseudoElement) {
                throw new ParseException('Illegal left paren. Pseudo-Element cannot have arguments.');
            }
            $value = $this->pseudoClassValue();
        }

        if (!$isPseudoElement) {
            $this->handler->pseudoClass($name, $value);

            return;
        }

        if ($restricted) {
            throw new ParseException('Pseudo-Elements are illegal in this context.');
        }
        $this->handler->pseudoElement($name);
        $this->consumeWhitespace();

        // Per the spec, a pseudo-element must be the last item in a selector,
        // and only one is allowed per selector. So the only things that may
        // follow are the end of the stream or the start of a new selector.
        $following = $this->scanner->token;
        if ($following !== false && $following !== Token::COMMA) {
            throw new ParseException('A Pseudo-Element must be the last item in a selector.');
        }
    }
344
345
    /**
     * Get the value (the parenthesized argument) of a pseudo-class.
     *
     * The value is deliberately left vague for now: whatever
     * Scanner::getPseudoClassString() yields is passed along as a string,
     * without interpreting its contents.
     *
     * @return string|null
     *  The value found for the pseudo-class, or null when the current
     *  token is not a left parenthesis.
     *
     * @todo Pseudoclasses can be passed pseudo-elements and
     *  other pseudo-classes as values, which means :pseudo(::pseudo)
     *  is legal.
     */
    private function pseudoClassValue()
    {
        if ($this->scanner->token !== Token::LPAREN) {
            return null;
        }

        // Concatenating onto an empty string guarantees a string result.
        return '' . $this->scanner->getPseudoClassString();
    }
398
399
    /**
     * Handle element names, with or without a namespace prefix.
     *
     * This handles:
     * <code>
     *  name (EventHandler::element())
     *  |name (EventHandler::element())
     *  ns|name (EventHandler::elementNS())
     *  ns|* (EventHandler::anyElementInNS())
     * </code>
     *
     * Does nothing when the current token is neither '|' nor a character
     * token.
     *
     * @throws ParseException
     *  If 'ns|' is followed by something other than '*' or a name.
     * @throws Exception
     */
    private function elementName(): void
    {
        if ($this->DEBUG) {
            print "ELEMENT NAME\n";
        }

        if ($this->scanner->token === Token::PIPE) {
            // We have '|name', which is equivalent to 'name'.
            $this->scanner->nextToken();
            $this->consumeWhitespace();
            $this->handler->element($this->scanner->getNameString());

            return;
        }

        if ($this->scanner->token !== Token::CHAR) {
            return;
        }

        $elementName = $this->scanner->getNameString();

        // Strict comparison, like every other token check in this class:
        // the token is false at the end of the input, and a loose comparison
        // of false against an integer token constant is not reliable.
        if ($this->scanner->token !== Token::PIPE) {
            // A plain, un-namespaced element name.
            $this->handler->element($elementName);

            return;
        }

        // What was read so far is the namespace prefix of 'ns|name' or 'ns|*'.
        $elementNS = $elementName;
        $this->scanner->nextToken();
        $this->consumeWhitespace();

        if ($this->scanner->token === Token::STAR) {
            // We have ns|*
            $this->handler->anyElementInNS($elementNS);
            $this->scanner->nextToken();
        } elseif ($this->scanner->token !== Token::CHAR) {
            $this->throwError(Token::CHAR, $this->scanner->token);
        } else {
            // We have ns|name
            $elementName = $this->scanner->getNameString();
            $this->handler->elementNS($elementName, $elementNS);
        }
    }
446
447
    /**
     * Check for the "all elements" designator. Because of the CSS 3
     * namespace support this covers the *|name and *|* cases as well
     * as the plain *.
     *
     * Calls EventHandler::anyElement() for '*',
     * EventHandler::anyElementInNS() for '*|*' and
     * EventHandler::elementNS() for '*|name'.
     *
     * Does nothing when the current token is not '*'.
     */
    private function allElements()
    {
        if ($this->scanner->token !== Token::STAR) {
            return;
        }

        $this->scanner->nextToken();
        if ($this->scanner->token !== Token::PIPE) {
            // A lone '*': any element at all.
            $this->handler->anyElement();

            return;
        }

        // Step over the '|' and look at what the wildcard namespace applies to.
        $this->scanner->nextToken();
        if ($this->scanner->token === Token::STAR) {
            // We got *|*. According to the spec this requires that the
            // element has a namespace, so that is passed on to the handler.
            $this->scanner->nextToken();
            $this->handler->anyElementInNS('*');

            return;
        }

        // We got *|name, which means the name MUST be in a namespace.
        $this->handler->elementNS($this->scanner->getNameString(), '*');
    }
477
478
    /**
     * Handle an attribute selector.
     * An attribute can be in one of two forms:
     * <code>[attrName]</code>
     * or
     * <code>[attrName="AttrValue"]</code>
     *
     * The attribute name may carry a namespace prefix ('ns|attr', '*|attr'
     * or '|attr'), and the operator may be any of '=', '~=', '|=', '*=',
     * '$=' and '^='. A leading '@' is skipped unless strict mode is on.
     *
     * This may call the following event handlers:
     * EventHandler::attributeNS() when a namespace was given, and
     * EventHandler::attribute() otherwise.
     *
     * Does nothing when the current token is not '['.
     *
     * @throws \QueryPath\CSS\ParseException
     * @throws Exception
     */
    private function attribute(): void
    {
        if ($this->scanner->token !== Token::LSQUARE) {
            return;
        }

        $attrVal = $op = $ns = null;

        $this->scanner->nextToken();
        $this->consumeWhitespace();

        if ($this->scanner->token === Token::AT) {
            if ($this->strict) {
                throw new ParseException('The @ is illegal in attributes.');
            }

            $this->scanner->nextToken();
            $this->consumeWhitespace();
        }

        if ($this->scanner->token === Token::STAR) {
            // Global namespace... requires that attr be prefixed,
            // so we pass this on to a namespace handler.
            $ns = '*';
            $this->scanner->nextToken();
        }
        if ($this->scanner->token === Token::PIPE) {
            // Skip this. It's a global namespace.
            $this->scanner->nextToken();
            $this->consumeWhitespace();
        }

        $attrName = $this->scanner->getNameString();
        $this->consumeWhitespace();

        // Check for namespace attribute: ns|attr. We have to peek() to make
        // sure that we haven't hit the |= operator, which looks the same.
        if ($this->scanner->token === Token::PIPE && $this->scanner->peek() !== '=') {
            // We have a namespaced attribute.
            $ns = $attrName;
            $this->scanner->nextToken();
            $attrName = $this->scanner->getNameString();
            $this->consumeWhitespace();
        }

        // Note: We require that operators do not have spaces
        // between characters, e.g. ~= , not ~ =.

        // Get the operator. Every two-character operator leaves the scanner
        // on its '=' so that the shared code below can step over it.
        switch ($this->scanner->token) {
            case Token::EQ:
                // NOTE(review): this looks like a no-op, since
                // consumeWhitespace() only skips whitespace tokens and the
                // current token is '='. Kept so the trace output is unchanged.
                $this->consumeWhitespace();
                $op = EventHandler::IS_EXACTLY;
                break;
            case Token::TILDE:
                $this->requireEqualsAfterOperator();
                $op = EventHandler::CONTAINS_WITH_SPACE;
                break;
            case Token::PIPE:
                $this->requireEqualsAfterOperator();
                $op = EventHandler::CONTAINS_WITH_HYPHEN;
                break;
            case Token::STAR:
                $this->requireEqualsAfterOperator();
                $op = EventHandler::CONTAINS_IN_STRING;
                break;
            case Token::DOLLAR:
                $this->requireEqualsAfterOperator();
                $op = EventHandler::ENDS_WITH;
                break;
            case Token::CARAT:
                $this->requireEqualsAfterOperator();
                $op = EventHandler::BEGINS_WITH;
                break;
        }

        if (isset($op)) {
            // Consume '=' and go on.
            $this->scanner->nextToken();
            $this->consumeWhitespace();

            // So... here we have a problem. The grammar suggests that the
            // value here is String1 or String2, both of which are enclosed
            // in quotes of some sort, and both of which allow lots of special
            // characters. But the spec itself includes examples like this:
            //   [lang=fr]
            // So some bareword support is assumed. To get around this, we assume
            // that bare words follow the NAME rules, while quoted strings follow
            // the String1/String2 rules.
            if ($this->scanner->token === Token::QUOTE || $this->scanner->token === Token::SQUOTE) {
                $attrVal = $this->scanner->getQuotedString();
            } else {
                $attrVal = $this->scanner->getNameString();
            }

            if ($this->DEBUG) {
                print "ATTR: $attrVal AND OP: $op\n";
            }
        }

        $this->consumeWhitespace();

        if ($this->scanner->token !== Token::RSQUARE) {
            $this->throwError(Token::RSQUARE, $this->scanner->token);
        }

        if (isset($ns)) {
            $this->handler->attributeNS($attrName, $ns, $attrVal, $op);
        } elseif (isset($attrVal)) {
            $this->handler->attribute($attrName, $attrVal, $op);
        } else {
            $this->handler->attribute($attrName);
        }

        // Step over the closing ']'.
        $this->scanner->nextToken();
    }

    /**
     * Advance the scanner by one token and insist that the new token is '='.
     *
     * Used for the second character of the two-character attribute
     * operators ('~=', '|=', '*=', '$=', '^=').
     *
     * @throws \QueryPath\CSS\ParseException
     *  If the next token is anything other than '='.
     * @throws Exception
     */
    private function requireEqualsAfterOperator(): void
    {
        if ($this->scanner->nextToken() !== Token::EQ) {
            $this->throwError(Token::EQ, $this->scanner->token);
        }
    }
614
615
    /**
     * Utility for throwing a consistently-formatted parse error.
     *
     * @param mixed $expected
     *  The token that was expected.
     * @param mixed $got
     *  The token that was actually encountered.
     *
     * @throws ParseException
     *  Always.
     */
    private function throwError($expected, $got)
    {
        $message = 'Expected ' . Token::name($expected) . ', got ' . Token::name($got);

        throw new ParseException($message);
    }
623
624
    /**
     * Give access to the scanner this parser reads its tokens from.
     *
     * @return Scanner
     *  The scanner created in the constructor.
     */
    public function getScanner(): Scanner
    {
        return $this->scanner;
    }
631
632
}
633
634