Complex classes like DOMTreeBuilder often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use DOMTreeBuilder, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 26 | class DOMTreeBuilder implements EventHandler |
||
| 27 | { |
||
| 28 | /** |
||
| 29 | * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0. |
||
| 30 | */ |
||
| 31 | const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml'; |
||
| 32 | |||
| 33 | const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML'; |
||
| 34 | |||
| 35 | const NAMESPACE_SVG = 'http://www.w3.org/2000/svg'; |
||
| 36 | |||
| 37 | const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink'; |
||
| 38 | |||
| 39 | const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace'; |
||
| 40 | |||
| 41 | const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; |
||
| 42 | |||
| 43 | const OPT_DISABLE_HTML_NS = 'disable_html_ns'; |
||
| 44 | |||
| 45 | const OPT_TARGET_DOC = 'target_document'; |
||
| 46 | |||
| 47 | const OPT_IMPLICIT_NS = 'implicit_namespaces'; |
||
| 48 | |||
| 49 | /** |
||
| 50 | * Holds the HTML5 element names that cause a namespace switch. |
||
| 51 | * |
||
| 52 | * @var array |
||
| 53 | */ |
||
| 54 | protected $nsRoots = array( |
||
| 55 | 'html' => self::NAMESPACE_HTML, |
||
| 56 | 'svg' => self::NAMESPACE_SVG, |
||
| 57 | 'math' => self::NAMESPACE_MATHML, |
||
| 58 | ); |
||
| 59 | |||
| 60 | /** |
||
| 61 | * Holds the always available namespaces (which do not require the XMLNS declaration). |
||
| 62 | * |
||
| 63 | * @var array |
||
| 64 | */ |
||
| 65 | protected $implicitNamespaces = array( |
||
| 66 | 'xml' => self::NAMESPACE_XML, |
||
| 67 | 'xmlns' => self::NAMESPACE_XMLNS, |
||
| 68 | 'xlink' => self::NAMESPACE_XLINK, |
||
| 69 | ); |
||
| 70 | |||
| 71 | /** |
||
| 72 | * Holds a stack of currently active namespaces. |
||
| 73 | * |
||
| 74 | * @var array |
||
| 75 | */ |
||
| 76 | protected $nsStack = array(); |
||
| 77 | |||
| 78 | /** |
||
| 79 | * Holds the number of namespaces declared by a node. |
||
| 80 | * |
||
| 81 | * @var array |
||
| 82 | */ |
||
| 83 | protected $pushes = array(); |
||
| 84 | |||
| 85 | /** |
||
| 86 | * Defined in 8.2.5. |
||
| 87 | */ |
||
| 88 | const IM_INITIAL = 0; |
||
| 89 | |||
| 90 | const IM_BEFORE_HTML = 1; |
||
| 91 | |||
| 92 | const IM_BEFORE_HEAD = 2; |
||
| 93 | |||
| 94 | const IM_IN_HEAD = 3; |
||
| 95 | |||
| 96 | const IM_IN_HEAD_NOSCRIPT = 4; |
||
| 97 | |||
| 98 | const IM_AFTER_HEAD = 5; |
||
| 99 | |||
| 100 | const IM_IN_BODY = 6; |
||
| 101 | |||
| 102 | const IM_TEXT = 7; |
||
| 103 | |||
| 104 | const IM_IN_TABLE = 8; |
||
| 105 | |||
| 106 | const IM_IN_TABLE_TEXT = 9; |
||
| 107 | |||
| 108 | const IM_IN_CAPTION = 10; |
||
| 109 | |||
| 110 | const IM_IN_COLUMN_GROUP = 11; |
||
| 111 | |||
| 112 | const IM_IN_TABLE_BODY = 12; |
||
| 113 | |||
| 114 | const IM_IN_ROW = 13; |
||
| 115 | |||
| 116 | const IM_IN_CELL = 14; |
||
| 117 | |||
| 118 | const IM_IN_SELECT = 15; |
||
| 119 | |||
| 120 | const IM_IN_SELECT_IN_TABLE = 16; |
||
| 121 | |||
| 122 | const IM_AFTER_BODY = 17; |
||
| 123 | |||
| 124 | const IM_IN_FRAMESET = 18; |
||
| 125 | |||
| 126 | const IM_AFTER_FRAMESET = 19; |
||
| 127 | |||
| 128 | const IM_AFTER_AFTER_BODY = 20; |
||
| 129 | |||
| 130 | const IM_AFTER_AFTER_FRAMESET = 21; |
||
| 131 | |||
| 132 | const IM_IN_SVG = 22; |
||
| 133 | |||
| 134 | const IM_IN_MATHML = 23; |
||
| 135 | |||
| 136 | protected $options = array(); |
||
| 137 | |||
| 138 | protected $stack = array(); |
||
| 139 | |||
| 140 | protected $current; // Pointer in the tag hierarchy. |
||
| 141 | protected $rules; |
||
| 142 | protected $doc; |
||
| 143 | |||
| 144 | protected $frag; |
||
| 145 | |||
| 146 | protected $processor; |
||
| 147 | |||
| 148 | protected $insertMode = 0; |
||
| 149 | |||
| 150 | /** |
||
| 151 | * Track if we are in an element that allows only inline child nodes. |
||
| 152 | * |
||
| 153 | * @var string|null |
||
| 154 | */ |
||
| 155 | protected $onlyInline; |
||
| 156 | |||
| 157 | /** |
||
| 158 | * Quirks mode is enabled by default. |
||
| 159 | * Any document that is missing the DT will be considered to be in quirks mode. |
||
| 160 | */ |
||
| 161 | protected $quirks = true; |
||
| 162 | |||
| 163 | protected $errors = array(); |
||
| 164 | |||
| 165 | 114 | public function __construct($isFragment = false, array $options = array()) |
|
| 205 | |||
| 206 | /** |
||
| 207 | * Get the document. |
||
| 208 | */ |
||
| 209 | 103 | public function document() |
|
| 213 | |||
| 214 | /** |
||
| 215 | * Get the DOM fragment for the body. |
||
| 216 | * |
||
| 217 | * This returns a DOMDocumentFragment because a fragment may have zero or more |
||
| 218 | * DOMNodes at its root. |
||
| 219 | * |
||
| 220 | * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context |
||
| 221 | * |
||
| 222 | * @return \DOMDocumentFragment |
||
| 223 | */ |
||
| 224 | 19 | public function fragment() |
|
| 228 | |||
| 229 | /** |
||
| 230 | * Provide an instruction processor. |
||
| 231 | * |
||
| 232 | * This is used for handling processing instructions (PIs) as they are |
||
| 233 | * inserted. If omitted, PIs are inserted directly into the DOM tree. |
||
| 234 | * |
||
| 235 | * @param InstructionProcessor $proc |
||
| 236 | */ |
||
| 237 | 1 | public function setInstructionProcessor(InstructionProcessor $proc) |
|
| 241 | |||
| 242 | 97 | public function doctype($name, $idType = 0, $id = null, $quirks = false) |
|
| 256 | |||
| 257 | /** |
||
| 258 | * Process the start tag. |
||
| 259 | * |
||
| 260 | * @todo - XMLNS namespace handling (we need to parse, even if it's not valid) |
||
| 261 | * - XLink, MathML and SVG namespace handling |
||
| 262 | * - Omission rules: 8.1.2.4 Optional tags |
||
| 263 | * |
||
| 264 | * @param string $name |
||
| 265 | * @param array $attributes |
||
| 266 | * @param bool $selfClosing |
||
| 267 | * |
||
| 268 | * @return int |
||
| 269 | */ |
||
| 270 | 109 | public function startTag($name, $attributes = array(), $selfClosing = false) |
|
| 472 | |||
/**
 * Process an end tag.
 *
 * Handles the "br" special case, ignores end tags of other void elements,
 * deals with end tags seen before the root element exists, drops the
 * namespaces declared by the current node, closes the matching open element
 * and finally updates the insertion mode.
 *
 * @param string $name The tag name as reported by the caller.
 */
public function endTag($name)
{
    $lname = $this->normalizeTagName($name);

    // Special case within 12.2.6.4.7: An end tag whose tag name is "br" should be treated as an opening tag
    if ($name === 'br') {
        $this->parseError('Closing tag encountered for void element br.');

        $this->startTag('br');
    }
    // Ignore closing tags for other unary elements.
    elseif (Elements::isA($name, Elements::VOID_TAG)) {
        return;
    }

    if ($this->insertMode <= static::IM_BEFORE_HTML) {
        // 8.2.5.4.2
        // Strict comparison: only these exact tag-name strings qualify.
        if (in_array($name, array(
            'html',
            'br',
            'head',
            'title',
        ), true)) {
            // Open the root element first, then process this end tag again.
            $this->startTag('html');
            $this->endTag($name);
            $this->insertMode = static::IM_BEFORE_HEAD;

            return;
        }

        // Ignore the tag.
        $this->parseError('Illegal closing tag at global scope.');

        return;
    }

    // Special case handling for SVG.
    if ($this->insertMode === static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // Key under which the namespace pushes of the current node are tracked.
    $cid = spl_object_hash($this->current);

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ('html' === $lname) {
        return;
    }

    // Remove the namespaces defined by the current node.
    if (isset($this->pushes[$cid])) {
        for ($i = 0; $i < $this->pushes[$cid][0]; ++$i) {
            array_shift($this->nsStack);
        }
        unset($this->pushes[$cid]);
    }

    if (!$this->autoclose($lname)) {
        $this->parseError('Could not find closing tag for ' . $lname);
    }

    switch ($lname) {
        case 'head':
            $this->insertMode = static::IM_AFTER_HEAD;
            break;
        case 'body':
            $this->insertMode = static::IM_AFTER_BODY;
            break;
        case 'svg':
        // The MathML root element is named "math" (see $nsRoots); without
        // this case the builder never leaves IM_IN_MATHML after </math>.
        case 'math':
        // Kept for backward compatibility with the previous behaviour.
        case 'mathml':
            $this->insertMode = static::IM_IN_BODY;
            break;
    }
}
| 547 | |||
| 548 | 5 | public function comment($cdata) |
|
| 554 | |||
| 555 | 91 | public function text($data) |
|
| 575 | |||
| 576 | 114 | public function eof() |
|
| 580 | |||
| 581 | 13 | public function parseError($msg, $line = 0, $col = 0) |
|
| 585 | |||
| 586 | 108 | public function getErrors() |
|
| 590 | |||
| 591 | 3 | public function cdata($data) |
|
| 596 | |||
| 597 | 5 | public function processingInstruction($name, $data = null) |
|
| 619 | |||
| 620 | // ========================================================================== |
||
| 621 | // UTILITIES |
||
| 622 | // ========================================================================== |
||
| 623 | |||
| 624 | /** |
||
| 625 | * Apply normalization rules to a tag name. |
||
| 626 | * See sections 2.9 and 8.1.2. |
||
| 627 | * |
||
| 628 | * @param string $tagName |
||
| 629 | * |
||
| 630 | * @return string The normalized tag name. |
||
| 631 | */ |
||
| 632 | 109 | protected function normalizeTagName($tagName) |
|
| 639 | |||
| 640 | protected function quirksTreeResolver($name) |
||
| 644 | |||
| 645 | /** |
||
| 646 | * Automatically climb the tree and close the closest node with the matching $tag. |
||
| 647 | * |
||
| 648 | * @param string $tagName |
||
| 649 | * |
||
| 650 | * @return bool |
||
| 651 | */ |
||
| 652 | 101 | protected function autoclose($tagName) |
|
| 668 | |||
| 669 | /** |
||
| 670 | * Checks if the given tagname is an ancestor of the present candidate. |
||
| 671 | * |
||
| 672 | * If $this->current or anything above $this->current matches the given tag |
||
| 673 | * name, this returns true. |
||
| 674 | * |
||
| 675 | * @param string $tagName |
||
| 676 | * |
||
| 677 | * @return bool |
||
| 678 | */ |
||
| 679 | protected function isAncestor($tagName) |
||
| 691 | |||
| 692 | /** |
||
| 693 | * Returns true if the immediate parent element is of the given tagname. |
||
| 694 | * |
||
| 695 | * @param string $tagName |
||
| 696 | * |
||
| 697 | * @return bool |
||
| 698 | */ |
||
| 699 | protected function isParent($tagName) |
||
| 703 | } |
||
| 704 |
If a method or function can return several different types of value, and you cannot be sure that only one of them can occur in this context, we recommend adding an additional type check:
If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.