Duplicate code is one of the most pungent code smells. A commonly used rule is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like HtmlDomParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HtmlDomParser and, based on these observations, apply Extract Interface as well.
1 | <?php |
||
31 | class HtmlDomParser |
||
32 | { |
||
33 | /** |
||
34 | * @var array |
||
35 | */ |
||
36 | protected static $functionAliases = array( |
||
37 | 'outertext' => 'html', |
||
38 | 'outerhtml' => 'html', |
||
39 | 'innertext' => 'innerHtml', |
||
40 | 'innerhtml' => 'innerHtml', |
||
41 | 'load' => 'loadHtml', |
||
42 | 'load_file' => 'loadHtmlFile', |
||
43 | ); |
||
44 | |||
45 | /** |
||
46 | * @var array |
||
47 | */ |
||
48 | private static $domLinkReplaceHelper = array( |
||
49 | 'orig' => array('[', ']', '{', '}',), |
||
50 | 'tmp' => array( |
||
51 | '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!', |
||
52 | '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!', |
||
53 | '!!!!HTML_DOM__BRACKET_LEFT!!!!', |
||
54 | '!!!!HTML_DOM__BRACKET_RIGHT!!!!', |
||
55 | ), |
||
56 | ); |
||
57 | |||
58 | /** |
||
59 | * @var array |
||
60 | */ |
||
61 | protected static $domReplaceHelper = array( |
||
62 | 'orig' => array('&', '|', '+', '%'), |
||
63 | 'tmp' => array( |
||
64 | '!!!!HTML_DOM__AMP!!!!', |
||
65 | '!!!!HTML_DOM__PIPE!!!!', |
||
66 | '!!!!HTML_DOM__PLUS!!!!', |
||
67 | '!!!!HTML_DOM__PERCENT!!!!', |
||
68 | ), |
||
69 | ); |
||
70 | |||
71 | /** |
||
72 | * @var Callable |
||
73 | */ |
||
74 | protected static $callback; |
||
75 | |||
76 | /** |
||
77 | * @var DOMDocument |
||
78 | */ |
||
79 | protected $document; |
||
80 | |||
81 | /** |
||
82 | * @var string |
||
83 | */ |
||
84 | protected $encoding = 'UTF-8'; |
||
85 | |||
86 | /** |
||
87 | * @var bool |
||
88 | */ |
||
89 | protected $isDOMDocumentCreatedWithoutHtml = false; |
||
90 | |||
91 | /** |
||
92 | * @var bool |
||
93 | */ |
||
94 | protected $isDOMDocumentCreatedWithoutHtmlWrapper = false; |
||
95 | |||
96 | /** |
||
97 | * Constructor |
||
98 | * |
||
99 | * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode |
||
100 | */ |
||
101 | 116 | public function __construct($element = null) |
|
127 | |||
128 | /** |
||
129 | * @param $name |
||
130 | * @param $arguments |
||
131 | * |
||
132 | * @return bool|mixed |
||
133 | */ |
||
134 | 33 | public function __call($name, $arguments) |
|
144 | |||
/**
 * Static dispatcher for the legacy factory helpers.
 *
 * "str_get_html" parses an HTML string and "file_get_html" parses a file;
 * both build a fresh parser and forward the first two call arguments
 * (null is substituted for any argument that was not supplied).
 *
 * @param string $name      name of the static method that was invoked
 * @param array  $arguments arguments given to that invocation
 *
 * @return HtmlDomParser
 *
 * @throws BadMethodCallException when $name is not a supported helper
 */
public static function __callStatic($name, $arguments)
{
    $first = isset($arguments[0]) ? $arguments[0] : null;
    $second = isset($arguments[1]) ? $arguments[1] : null;

    switch ($name) {
        case 'str_get_html':
            $instance = new self();

            return $instance->loadHtml($first, $second);

        case 'file_get_html':
            $instance = new self();

            return $instance->loadHtmlFile($first, $second);
    }

    throw new BadMethodCallException('Method does not exist');
}
||
177 | |||
/**
 * Magic read access for the legacy property-style API.
 *
 * The property name is matched case-insensitively:
 * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" map to
 * innerHtml(), and "text"/"plaintext" map to text().
 *
 * @param string $name requested property name
 *
 * @return string|null the rendered content, or null for an unknown property
 */
public function __get($name)
{
    $property = strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
        return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
        return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
        return $this->text();
    }

    return null;
}
||
201 | |||
/**
 * Allows the parser object to be called like a function; the call is
 * forwarded unchanged to find().
 *
 * @param string   $selector CSS selector
 * @param int|null $idx      optional index of a single match (see find())
 *
 * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank whatever find() returns
 */
public function __invoke($selector, $idx = null)
{
    $result = $this->find($selector, $idx);

    return $result;
}
||
212 | |||
213 | /** |
||
214 | * @return string |
||
215 | */ |
||
216 | 14 | public function __toString() |
|
220 | |||
221 | /** |
||
222 | * does nothing (only for api-compatibility-reasons) |
||
223 | * |
||
224 | * @return bool |
||
225 | */ |
||
226 | 1 | public function clear() |
|
230 | |||
/**
 * Swaps characters that must survive parsing for placeholder tokens.
 *
 * Every "&", "|", "+" and "%" in the input is replaced by its token from
 * $domReplaceHelper. In addition, each http(s) link found in the input is
 * replaced by a copy whose "[", "]", "{" and "}" characters are swapped for
 * the tokens from $domLinkReplaceHelper. The link substitutions are listed
 * before the generic ones, so they are applied first.
 *
 * @param string $html
 *
 * @return string the input with placeholders substituted
 */
private function replaceToPreserveHtmlEntities($html)
{
    // Generic substitutions are always applied.
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    if (!empty($matches[1])) {
        $links = $matches[1];

        // str_replace() on an array subject keeps the keys, giving one
        // masked link per original link.
        $maskedLinks = str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $links
        );

        if (count($links) === count($maskedLinks)) {
            $search = array_merge($links, $search);
            $replace = array_merge($maskedLinks, $replace);
        }
    }

    return str_replace($search, $replace, $html);
}
||
263 | |||
/**
 * Reverses the placeholder substitution: every token from
 * $domLinkReplaceHelper and $domReplaceHelper is turned back into its
 * original character, and one extra search string is removed entirely.
 *
 * @param string $html
 *
 * @return string
 */
public static function putReplacedBackToPreserveHtmlEntities($html)
{
    // NOTE(review): the extra search string below reads as a single space in
    // this listing, which would strip every space from the output. It may be
    // an entity that was decoded when the listing was produced - confirm
    // against the real file.
    $tokens = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array(' ')
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($tokens, $originals, $html);
}
||
285 | |||
/**
 * Builds the DOM document from an HTML string.
 *
 * The input is first parsed strictly with simplexml_load_string(); when that
 * succeeds without libxml errors the resulting document is used as-is.
 * Otherwise the input is loaded leniently through DOMDocument::loadHTML(),
 * with an "<?xml encoding=...?>" declaration prepended (unless the input
 * already starts with one) so that the configured encoding is honoured, and
 * with selected characters swapped for placeholders beforehand.
 *
 * Side effects: raises the "created without html" / "created without html
 * wrapper" flags, replaces or fills $this->document, and temporarily changes
 * the libxml error and entity-loader settings (restored before returning).
 *
 * @param string   $html
 * @param int|null $libXMLExtraOptions extra LIBXML_* flags OR-ed into the defaults
 *
 * @return \DOMDocument
 */
private function createDOMDocument($html, $libXMLExtraOptions = null)
{
    // NOTE(review): both flags are only ever raised in this method, never
    // cleared, so loading a second document on the same instance keeps the
    // values of the first - confirm that is intended.
    if (strpos($html, '<') === false) {
        $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
        $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // Collect libxml errors internally and block external entity loading
    // while parsing; the previous settings are restored at the end.
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

    // These flags do not exist in every libxml build, so add them only
    // when they are defined.
    foreach (array('LIBXML_COMPACT', 'LIBXML_HTML_NOIMPLIED', 'LIBXML_HTML_NODEFDTD') as $optionalFlag) {
        if (defined($optionalFlag)) {
            $options |= constant($optionalFlag);
        }
    }

    if ($libXMLExtraOptions !== null) {
        $options |= $libXMLExtraOptions;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
        $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {
        // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
        $html = trim($html);
        $xmlHackUsed = false;

        // Fix: the haystack/needle arguments were swapped, so the declaration
        // was prepended even when the input already began with "<?xml".
        if (stripos($html, '<?xml') !== 0) {
            $xmlHackUsed = true;
            $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
        }

        $html = $this->replaceToPreserveHtmlEntities($html);

        // Presumably $this->document already holds a DOMDocument here (it is
        // not created in this method) - verify against the constructor.
        $this->document->loadHTML($html);

        // Remove the "xml-encoding" hack again.
        if ($xmlHackUsed === true) {
            // childNodes is a live list: collect the nodes first, then remove
            // them, so that removal cannot disturb the iteration.
            $processingInstructions = array();
            foreach ($this->document->childNodes as $child) {
                if ($child->nodeType === XML_PI_NODE) {
                    $processingInstructions[] = $child;
                }
            }

            foreach ($processingInstructions as $processingInstruction) {
                $this->document->removeChild($processingInstruction);
            }
        }

        libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
}
||
365 | |||
/**
 * Looks up the first element matching the id selector "#<id>".
 *
 * @param string $id element id (without the leading "#")
 *
 * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank the result of find() for index 0
 */
public function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
||
377 | |||
/**
 * Returns the first element in the document with the given tag name.
 *
 * @param string $name tag name
 *
 * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank the wrapped node, or a blank
 *                                              node when nothing matches
 */
public function getElementByTagName($name)
{
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    return $firstMatch === null
        ? new SimpleHtmlDomNodeBlank()
        : new SimpleHtmlDom($firstMatch);
}
||
395 | |||
/**
 * Finds elements through the id selector "#<id>".
 *
 * @param string   $id  element id (without the leading "#")
 * @param null|int $idx optional index of a single match (see find())
 *
 * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
 */
public function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
||
408 | |||
409 | /** |
||
410 | * Returns Elements by tag name |
||
411 | * |
||
412 | * @param string $name |
||
413 | * @param null|int $idx |
||
414 | * |
||
415 | * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank |
||
416 | */ |
||
417 | 3 | View Code Duplication | public function getElementsByTagName($name, $idx = null) |
441 | |||
/**
 * Finds nodes with a CSS selector.
 *
 * The selector is converted to XPath and evaluated against the document.
 * Without an index the whole collection of matches is returned. With an
 * index a single match is returned; a negative index counts from the end
 * of the collection.
 *
 * @param string   $selector CSS selector
 * @param int|null $idx      index of the wanted match, or null for all matches
 *
 * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeBlank a blank node is
 *         returned when the requested index does not exist
 */
public function find($selector, $idx = null)
{
    $expression = SelectorConverter::toXPath($selector);

    $evaluator = new DOMXPath($this->document);
    $matches = $evaluator->query($expression);

    $collection = new SimpleHtmlDomNode();
    foreach ($matches as $match) {
        $collection[] = new SimpleHtmlDom($match);
    }

    if ($idx === null) {
        return $collection;
    }

    // Negative positions are resolved relative to the end of the collection.
    $position = ($idx < 0) ? count($collection) + $idx : $idx;

    return isset($collection[$position])
        ? $collection[$position]
        : new SimpleHtmlDomNodeBlank();
}
||
476 | |||
/**
 * Cleans up serialized output before it is handed to the caller.
 *
 * Depending on how the document was created, line breaks and wrapper tags
 * are stripped; afterwards HTML entities are decoded, the text is trimmed
 * and URL-decoded, and the placeholder tokens are converted back.
 *
 * @param string $content serialized markup or text
 *
 * @return string
 */
protected function fixHtmlOutput($content)
{
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //       so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
        $wrapperArtifacts = array(
            "\n",
            "\r\n",
            "\r",
            '<simpleHtmlDomP>',
            '</simpleHtmlDomP>',
            '<body>',
            '</body>',
            '<html>',
            '</html>',
        );

        $content = str_replace($wrapperArtifacts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
        $plainTextArtifacts = array(
            '<p>',
            '</p>',
            '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
        );

        $content = str_replace($plainTextArtifacts, '', $content);
    }

    $decoded = UTF8::html_entity_decode($content);
    $decoded = UTF8::urldecode(trim($decoded));

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
}
||
524 | |||
/**
 * Gives access to the underlying DOM document held by this parser.
 *
 * @return DOMDocument
 */
public function getDocument()
{
    return $this->document;
}
||
532 | |||
/**
 * Returns the value of the $encoding property (declared with the default
 * "UTF-8").
 *
 * @return string
 */
private function getEncoding()
{
    return $this->encoding;
}
||
542 | |||
/**
 * Reports the "created without html" flag, which is raised when the loaded
 * input contained no "<" character at all.
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHtml()
{
    return $this->isDOMDocumentCreatedWithoutHtml;
}
||
550 | |||
/**
 * Reports the "created without html wrapper" flag, which is raised when the
 * loaded input contained no "<html" substring.
 *
 * @return bool
 */
public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
{
    return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
}
||
558 | |||
/**
 * Serializes the document to (outer) HTML.
 *
 * If a callback is registered it is invoked first with this parser as its
 * only argument. When the input had no "<html" wrapper only the document
 * element is serialized; otherwise the whole document is. The result is
 * passed through fixHtmlOutput().
 *
 * @return string
 */
public function html()
{
    $callback = $this::$callback;
    if ($callback !== null) {
        call_user_func_array($callback, array($this));
    }

    $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($markup);
}
||
578 | |||
/**
 * Serializes the document as XML (empty tags are written out in full),
 * strips the XML declaration and cleans the result with fixHtmlOutput().
 *
 * @return string
 */
public function xml()
{
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);
    $withoutHeader = ltrim($withoutHeader);

    return $this->fixHtmlOutput($withoutHeader);
}
||
593 | |||
/**
 * Serializes the children of the document element (inner HTML).
 *
 * Each child node is serialized and cleaned with fixHtmlOutput()
 * individually; the pieces are concatenated in document order.
 *
 * @return string
 */
public function innerHtml()
{
    $pieces = array();

    foreach ($this->document->documentElement->childNodes as $childNode) {
        $pieces[] = $this->fixHtmlOutput($this->document->saveHTML($childNode));
    }

    return implode('', $pieces);
}
||
609 | |||
/**
 * Parses an HTML string and makes it the current document.
 *
 * @param string   $html
 * @param int|null $libXMLExtraOptions extra LIBXML_* flags passed on to the parser
 *
 * @return HtmlDomParser this instance, to allow chaining
 *
 * @throws InvalidArgumentException if $html is not a string
 */
public function loadHtml($html, $libXMLExtraOptions = null)
{
    if (is_string($html) === false) {
        throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $document = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $document;

    return $this;
}
||
630 | |||
/**
 * Reads HTML from a file path or http(s) URL and loads it.
 *
 * Paths that do not start with "http://" or "https://" must exist on disk.
 *
 * @param string   $filePath           local path or http(s) URL
 * @param int|null $libXMLExtraOptions extra LIBXML_* flags passed on to loadHtml()
 *
 * @return HtmlDomParser this instance, to allow chaining
 *
 * @throws InvalidArgumentException if $filePath is not a string
 * @throws RuntimeException         if the file does not exist or cannot be read
 */
public function loadHtmlFile($filePath, $libXMLExtraOptions = null)
{
    if (!is_string($filePath)) {
        throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    // Remote URLs cannot be checked with file_exists(), so only local
    // paths are verified up front.
    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
        throw new RuntimeException("File $filePath not found");
    }

    try {
        $html = UTF8::file_get_contents($filePath);
    } catch (\Exception $e) {
        // Chain the original exception so the root cause is not lost.
        throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
        throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
}
||
664 | |||
/**
 * Returns the inner HTML of the document and, when a path is given, also
 * writes it to that file (using an exclusive lock).
 *
 * @param string $filepath target file; an empty string skips the write
 *
 * @return string the inner HTML
 */
public function save($filepath = '')
{
    $output = $this->innerHtml();

    if ($filepath === '') {
        return $output;
    }

    file_put_contents($filepath, $output, LOCK_EX);

    return $output;
}
||
681 | |||
682 | /** |
||
683 | * @param $functionName |
||
684 | */ |
||
685 | public function set_callback($functionName) |
||
689 | |||
/**
 * Returns the plain text of the document: its textContent, cleaned with
 * fixHtmlOutput().
 *
 * @return string
 */
public function text()
{
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText);
}
||
699 | } |
||
700 |
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.