| Total Complexity | 78 |
| Total Lines | 506 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like DOM often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use DOM, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 15 | abstract class DOM implements Query, \IteratorAggregate, \Countable |
||
| 16 | { |
||
| 17 | |||
| 18 | /** |
||
| 19 | * The array of matches. |
||
| 20 | */ |
||
| 21 | protected $matches = []; |
||
| 22 | |||
| 23 | /** |
||
| 24 | * Default parser flags. |
||
| 25 | * |
||
| 26 | * These are flags that will be used if no global or local flags override them. |
||
| 27 | * |
||
| 28 | * @since 2.0 |
||
| 29 | */ |
||
| 30 | public const DEFAULT_PARSER_FLAGS = NULL; |
||
| 31 | |||
| 32 | public const JS_CSS_ESCAPE_CDATA = '\\1'; |
||
| 33 | public const JS_CSS_ESCAPE_CDATA_CCOMMENT = '/* \\1 */'; |
||
| 34 | public const JS_CSS_ESCAPE_CDATA_DOUBLESLASH = '// \\1'; |
||
| 35 | public const JS_CSS_ESCAPE_NONE = ''; |
||
| 36 | |||
| 37 | protected $errTypes = 771; // E_ERROR | E_WARNING | E_USER_ERROR | E_USER_WARNING |
||
| 38 | |||
| 39 | protected $document; |
||
| 40 | /** |
||
| 41 | * Default option values (the base DOMDocument is held in $document, declared above). |
||
| 42 | */ |
||
| 43 | protected $options = [ |
||
| 44 | 'parser_flags' => NULL, |
||
| 45 | 'omit_xml_declaration' => false, |
||
| 46 | 'replace_entities' => false, |
||
| 47 | 'exception_level' => 771, // E_ERROR | E_USER_ERROR | E_USER_WARNING | E_WARNING |
||
| 48 | 'ignore_parser_warnings' => false, |
||
| 49 | 'escape_xhtml_js_css_sections' => self::JS_CSS_ESCAPE_CDATA_CCOMMENT, |
||
| 50 | ]; |
||
| 51 | |||
/**
 * Constructor.
 *
 * Typically, a new DOMQuery is created by QueryPath::with(), QueryPath::withHTML(),
 * qp(), or htmlqp().
 *
 * The first argument is dispatched on its type:
 *  - empty value: an empty document with an empty match set;
 *  - \SplObjectStorage: used directly as the match set;
 *  - instance of this class: its matches are fetched with get(NULL, true);
 *  - \DOMDocument: the document element becomes the single match;
 *  - \DOMNode: the node itself becomes the single match;
 *  - \SimpleXMLElement: imported with dom_import_simplexml() and used as the match;
 *  - array whose first element is a \DOMNode: every item is attached to a new match set;
 *  - string for which isXMLish() returns true: parsed with parseXMLString();
 *  - anything else: handed to parseXMLFile() as a file name.
 *
 * If no document could be derived from the input (for example an empty
 * \SplObjectStorage), an empty \DOMDocument is created so that the object
 * is always backed by a document.
 *
 * @param mixed $document
 *   A document-like object.
 * @param string $string
 *   A CSS 3 Selector. When non-empty (after trimming), the initial matches
 *   are narrowed with this selector.
 * @param array $options
 *   An associative array of options. These take precedence over the values
 *   returned by Options::get(), which in turn take precedence over the
 *   class defaults.
 * @see qp()
 * @throws Exception
 */
public function __construct($document = NULL, $string = NULL, $options = [])
{
    // Cast first: $string defaults to NULL, and passing NULL to trim() is
    // deprecated as of PHP 8.1.
    $string = trim((string) $string);
    $this->options = $options + Options::get() + $this->options;

    $parser_flags = $options['parser_flags'] ?? self::DEFAULT_PARSER_FLAGS;
    if (!empty($this->options['ignore_parser_warnings'])) {
        // Don't convert parser warnings into exceptions.
        $this->errTypes = 257; // E_ERROR | E_USER_ERROR
    } elseif (isset($this->options['exception_level'])) {
        // Set the error level at which exceptions will be thrown. By default,
        // QueryPath will throw exceptions for
        // E_ERROR | E_USER_ERROR | E_WARNING | E_USER_WARNING.
        $this->errTypes = $this->options['exception_level'];
    }

    // Builds the empty backing document, honouring the 'encoding' option.
    $newEmptyDocument = function () {
        return isset($this->options['encoding'])
            ? new \DOMDocument('1.0', $this->options['encoding'])
            : new \DOMDocument();
    };

    if (empty($document)) {
        // Empty: Just create an empty QP.
        $this->document = $newEmptyDocument();
        $this->setMatches(new \SplObjectStorage());
    } elseif (is_object($document)) {
        // Figure out which kind of object we were given.
        if ($document instanceof \SplObjectStorage) {
            // This is the most frequent object type.
            $this->matches = $document;
            if ($document->count() !== 0) {
                $first = $this->getFirstMatch();
                if (!empty($first->ownerDocument)) {
                    $this->document = $first->ownerDocument;
                }
            }
        } elseif ($document instanceof self) {
            $this->setMatches($document->get(NULL, true));
            if ($this->matches->count() > 0) {
                $this->document = $this->getFirstMatch()->ownerDocument;
            }
        } elseif ($document instanceof \DOMDocument) {
            $this->document = $document;
            $this->setMatches($document->documentElement);
        } elseif ($document instanceof \DOMNode) {
            $this->document = $document->ownerDocument;
            $this->setMatches($document);
        } elseif ($document instanceof \Masterminds\HTML5) {
            // NOTE(review): this stores the HTML5 object itself as the document and
            // reads ->documentElement from it; confirm that \Masterminds\HTML5
            // actually exposes that property. Behavior kept as-is.
            $this->document = $document;
            $this->setMatches($document->documentElement);
        } elseif ($document instanceof \SimpleXMLElement) {
            $import = dom_import_simplexml($document);
            $this->document = $import->ownerDocument;
            $this->setMatches($import);
        } else {
            throw new \QueryPath\Exception('Unsupported class type: ' . get_class($document));
        }
    } elseif (is_array($document)) {
        // Deprecated array support. The null-coalescing read avoids an
        // "undefined offset" warning for arrays that have no key 0.
        if (($document[0] ?? null) instanceof \DOMNode) {
            $found = new \SplObjectStorage();
            foreach ($document as $item) {
                $found->attach($item);
            }
            $this->setMatches($found);
            $this->document = $this->getFirstMatch()->ownerDocument;
        } else {
            // Not a list of DOM nodes: start from an empty match set rather
            // than leaving $this->matches as a plain array.
            $this->setMatches(new \SplObjectStorage());
        }
    } elseif ($this->isXMLish($document)) {
        // $document is a string with XML.
        // NOTE(review): $parser_flags is not forwarded to parseXMLString() here,
        // so the 'parser_flags' option only affects the file branch below.
        // Confirm whether that is intentional.
        $this->document = $this->parseXMLString($document);
        $this->setMatches($this->document->documentElement);
    } else {
        // $document is a filename
        $context = empty($options['context']) ? NULL : $options['context'];
        $this->document = $this->parseXMLFile($document, $parser_flags, $context);
        $this->setMatches($this->document->documentElement);
    }

    // Several branches above can finish without a document (empty match set,
    // a match without an owner document, an unsupported array). Assigning
    // formatOutput on null is an Error on PHP 8, so fall back to an empty
    // document.
    if ($this->document === null) {
        $this->document = $newEmptyDocument();
    }

    // Globally set the output option.
    $this->document->formatOutput = true;
    if (isset($this->options['format_output']) && $this->options['format_output'] == false) {
        $this->document->formatOutput = false;
    }

    // Do a find if the second param was set.
    if ($string !== '') {
        // We don't issue a find() because that creates a new DOMQuery.
        $query = new \QueryPath\CSS\DOMTraverser($this->matches);
        $query->find($string);
        $this->setMatches($query->matches());
    }
}
||
| 164 | |||
/**
 * Parse a string of markup into a new DOMDocument.
 *
 * While parsing, PHP errors matching $this->errTypes are routed to
 * \QueryPath\ParseException::initializeFromError(). The previously active
 * error handler is always restored before this method returns or throws.
 *
 * Options consulted (all read from $this->options):
 *  - 'convert_to_encoding' / 'convert_from_encoding': re-encode the string
 *    with mb_convert_encoding() when that function exists ('auto' is used
 *    as the source encoding if none is given);
 *  - 'strip_low_ascii': run the string through filter_var() with
 *    FILTER_FLAG_ENCODE_LOW;
 *  - 'use_parser': 'html' forces DOMDocument::loadHTML(), 'xml' forces
 *    DOMDocument::loadXML() (compared case-insensitively);
 *  - 'replace_entities': on the XML path, pass the string through
 *    \QueryPath\Entities::replaceAllEntities() first.
 *
 * Without 'use_parser', a string whose first five characters are '<?xml'
 * (case-insensitively) is parsed as XML; everything else is parsed as HTML.
 *
 * @param string $string
 *   The markup to parse.
 * @param int|null $flags
 *   libxml option flags; only used on the XML path. NULL means no flags.
 * @return \DOMDocument
 *   The document the markup was loaded into.
 */
private function parseXMLString($string, $flags = NULL)
{
    $document = new \DOMDocument('1.0');
    $lead = strtolower(substr($string, 0, 5)); // <?xml

    // Install the handler before entering try, so that finally only runs
    // (and pops a handler) once one has actually been pushed.
    set_error_handler(['\QueryPath\ParseException', 'initializeFromError'], $this->errTypes);
    try {
        if (isset($this->options['convert_to_encoding'])) {
            $from_enc = $this->options['convert_from_encoding'] ?? 'auto';
            $to_enc = $this->options['convert_to_encoding'];

            if (function_exists('mb_convert_encoding')) {
                $string = mb_convert_encoding($string, $to_enc, $from_enc);
            }
        }

        // This is to avoid cases where low ascii digits have slipped into HTML.
        // AFAIK, it should not adversely affect UTF-8 documents.
        if (!empty($this->options['strip_low_ascii'])) {
            $string = filter_var($string, FILTER_UNSAFE_RAW, FILTER_FLAG_ENCODE_LOW);
        }

        // Allow users to override parser settings.
        $useParser = empty($this->options['use_parser'])
            ? ''
            : strtolower($this->options['use_parser']);

        if ($useParser === 'html') {
            // If HTML parser is requested, we use it.
            $document->loadHTML($string);
        } elseif ($lead === '<?xml' || $useParser === 'xml') {
            // Parse as XML if it looks like XML, or if XML parser is requested.
            if ($this->options['replace_entities']) {
                $string = \QueryPath\Entities::replaceAllEntities($string);
            }
            // loadXML() takes an int; passing NULL is deprecated as of PHP 8.1.
            $document->loadXML($string, $flags ?? 0);
        } else {
            // In all other cases, we try the HTML parser.
            $document->loadHTML($string);
        }
    } finally {
        // Runs for every Throwable, not just Exception, so the custom handler
        // can never leak out of this method.
        restore_error_handler();
    }

    // NOTE(review): the boolean results of loadHTML()/loadXML() are not
    // inspected, so a failed load that raised no handled error still yields
    // an (empty) document. The former empty($document) check could never
    // fire on an object and has been dropped.
    return $document;
}
||
| 223 | |||
/**
 * EXPERT: Be very, very careful using this.
 *
 * Replaces the current set of matches. The set that was current before the
 * call is stored in $this->last first (the buffer used by end() and
 * andSelf()), and $this->length is refreshed afterwards.
 *
 * Accepted inputs:
 *  - \SplObjectStorage: adopted as-is (not copied);
 *  - array: legacy form; triggers a notice via trigger_error() and every
 *    element is attached to a new \SplObjectStorage;
 *  - any other non-null value: attached as the only member of a new
 *    \SplObjectStorage;
 *  - NULL: results in an empty \SplObjectStorage.
 *
 * @since 2.0
 * @param mixed $matches
 *   The new matches, in one of the forms listed above.
 */
public function setMatches($matches)
{
    // Keep the outgoing set around before it is replaced.
    $this->last = $this->matches;

    if ($matches instanceof \SplObjectStorage) {
        $storage = $matches;
    } elseif (is_array($matches)) {
        // This is likely legacy code that needs conversion.
        trigger_error('Legacy array detected.');
        $storage = new \SplObjectStorage();
        foreach ($matches as $node) {
            $storage->attach($node);
        }
    } else {
        // A single item (or nothing at all): wrap it in a fresh set.
        $storage = new \SplObjectStorage();
        if (isset($matches)) {
            $storage->attach($matches);
        }
    }

    $this->matches = $storage;

    // EXPERIMENTAL: Support for qp()->length.
    $this->length = $storage->count();
}
||
| 263 | |||
/**
 * A depth-checking function. Typically, it only needs to be
 * invoked with the first parameter. The rest are used for recursion.
 *
 * Walks the element children of $ele recursively and collects the nodes
 * without any child nodes that sit at the greatest depth seen so far.
 *
 * Notes on the exact behavior:
 *  - only children of type XML_ELEMENT_NODE are descended into;
 *  - a node that has child nodes (of any type) is never added itself;
 *  - when called with defaults on a node that has no child nodes, that node
 *    is appended to the initial set a second time.
 *
 * @see deepest();
 * @param \DOMNode $ele
 *   The element.
 * @param int $depth
 *   The depth gauge: the depth of $ele relative to the starting node.
 * @param array|null $current
 *   The current set of deepest nodes; defaults to a set holding $ele.
 * @param int|null $deepest
 *   Passed by reference: the greatest depth recorded so far. Defaults to
 *   $depth and is updated whenever a deeper node is found.
 * @return array
 *   Returns an array of DOM nodes.
 */
protected function deepestNode(\DOMNode $ele, $depth = 0, $current = NULL, &$deepest = NULL)
{
    // FIXME: Should this use SplObjectStorage?
    $current = $current ?? [$ele];
    $deepest = $deepest ?? $depth;

    if (!$ele->hasChildNodes()) {
        // Leaf: either it sets a new depth record, ties the record, or is ignored.
        if ($depth > $deepest) {
            $deepest = $depth;

            return [$ele];
        }
        if ($depth === $deepest) {
            $current[] = $ele;
        }

        return $current;
    }

    // Not a leaf: recurse into element children only.
    foreach ($ele->childNodes as $child) {
        if ($child->nodeType !== XML_ELEMENT_NODE) {
            continue;
        }
        $current = $this->deepestNode($child, $depth + 1, $current, $deepest);
    }

    return $current;
}
||
| 304 | |||
| 305 | /** |
||
| 306 | * Prepare an item for insertion into a DOM. |
||
| 307 | * |
||
| 308 | * This handles a variety of boilerplate tasks that need doing before an |
||
| 309 | * indeterminate object can be inserted into a DOM tree. |
||
| 310 | * - If item is a string, this is converted into a document fragment and returned. |
||
| 311 | * - If item is a DOMQuery, then all items are retrieved and converted into |
||
| 312 | * a document fragment and returned. |
||
| 313 | * - If the item is a DOMNode, it is imported into the current DOM if necessary. |
||
| 314 | * - If the item is a SimpleXMLElement, it is converted into a DOM node and then |
||
| 315 | * imported. |
||
| 316 | * |
||
| 317 | * @param mixed $item |
||
| 318 | * Item to prepare for insert. |
||
| 319 | * @return mixed |
||
| 320 | * Returns the prepared item. |
||
| 321 | * @throws QueryPath::Exception |
||
| 322 | * Thrown if the object passed in is not of a supported object type. |
||
| 323 | * @throws Exception |
||
| 324 | */ |
||
| 325 | protected function prepareInsert($item) |
||
| 381 | } |
||
| 382 | |||
| 383 | /** |
||
| 384 | * Convenience function for getNthMatch(0). |
||
| 385 | */ |
||
| 386 | protected function getFirstMatch() |
||
| 391 | } |
||
| 392 | |||
| 393 | /** |
||
| 394 | * Parse an XML or HTML file. |
||
| 395 | * |
||
| 396 | * This attempts to autodetect the type of file, and then parse it. |
||
| 397 | * |
||
| 398 | * @param string $filename |
||
| 399 | * The file name to parse. |
||
| 400 | * @param int $flags |
||
| 401 | * The OR-combined flags accepted by the DOM parser. See the PHP documentation |
||
| 402 | * for DOM or for libxml. |
||
| 403 | * @param resource $context |
||
| 404 | * The stream context for the file IO. If this is set, then an alternate |
||
| 405 | * parsing path is followed: The file is loaded by PHP's stream-aware IO |
||
| 406 | * facilities, read entirely into memory, and then handed off to |
||
| 407 | * {@link parseXMLString()}. On large files, this can have a performance impact. |
||
| 408 | * @throws \QueryPath\ParseException |
||
| 409 | * Thrown when a file cannot be loaded or parsed. |
||
| 410 | */ |
||
| 411 | private function parseXMLFile($filename, $flags = NULL, $context = NULL) |
||
| 477 | } |
||
| 478 | |||
/**
 * Determine whether a given string looks like XML or not.
 *
 * The test is deliberately cheap: the string is considered markup when it
 * contains both a '<' and a '>' character anywhere. It is possible to
 * "confuse" this, which may subsequently result in parse errors, but in
 * the vast majority of cases it serves as a valid indicator of whether or
 * not the content looks like XML.
 *
 * Things that are intentionally excluded:
 * - plain text with no markup.
 * - strings that look like filesystem paths.
 *
 * Subclasses SHOULD NOT OVERRIDE THIS. Altering it may be altering
 * core assumptions about how things work. Instead, classes should
 * override the constructor and pass in only one of the parsed types
 * that this class expects.
 *
 * @param string $string
 *   The string to inspect.
 * @return bool
 *   True if both '<' and '>' occur in the string, false otherwise.
 */
protected function isXMLish($string)
{
    // No opening angle bracket means there is no need to look any further.
    if (strpos($string, '<') === false) {
        return false;
    }

    return strpos($string, '>') !== false;
}
||
| 501 | |||
| 502 | /** |
||
| 503 | * A utility function for retrieving a match by index. |
||
| 504 | * |
||
| 505 | * The internal data structure used in DOMQuery does not have |
||
| 506 | * strong random access support, so we supplement it with this method. |
||
| 507 | * |
||
| 508 | * @param $index |
||
| 509 | * @return object|void |
||
| 510 | */ |
||
| 511 | protected function getNthMatch(int $index) |
||
| 521 | } |
||
| 522 | } |
||
| 524 | } |