Complex classes like HtmlPageCrawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HtmlPageCrawler, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
15 | class HtmlPageCrawler extends Crawler |
||
16 | { |
||
17 | /** |
||
18 | * the (internal) root element name used when importing html fragments |
||
19 | * */ |
||
20 | const FRAGMENT_ROOT_TAGNAME = '_root'; |
||
21 | |||
22 | /** |
||
23 | * Get an HtmlPageCrawler object from a HTML string, DOMNode, DOMNodeList or HtmlPageCrawler |
||
24 | * |
||
25 | * This is the equivalent to jQuery's $() function when used for wrapping DOMNodes or creating DOMElements from HTML code. |
||
26 | * |
||
27 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList|array $content |
||
28 | * @return HtmlPageCrawler |
||
29 | * @api |
||
30 | */ |
||
31 | 17 | public static function create($content) |
|
39 | |||
40 | /** |
||
41 | * Adds the specified class(es) to each element in the set of matched elements. |
||
42 | * |
||
43 | * @param string $name One or more space-separated classes to be added to the class attribute of each matched element. |
||
44 | * @return HtmlPageCrawler $this for chaining |
||
45 | * @api |
||
46 | */ |
||
47 | 1 | public function addClass($name) |
|
68 | |||
69 | /** |
||
70 | * Insert content, specified by the parameter, after each element in the set of matched elements. |
||
71 | * |
||
72 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content |
||
73 | * @return HtmlPageCrawler $this for chaining |
||
74 | * @api |
||
75 | */ |
||
76 | 3 | public function after($content) |
|
98 | |||
99 | /** |
||
100 | * Insert HTML content as child nodes of each element after existing children |
||
101 | * |
||
102 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment or DOMNode to append |
||
103 | * @return HtmlPageCrawler $this for chaining |
||
104 | * @api |
||
105 | */ |
||
106 | 2 | public function append($content) |
|
123 | |||
124 | /** |
||
125 | * Insert every element in the set of matched elements to the end of the target. |
||
126 | * |
||
127 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element |
||
128 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements appended to the target elements |
||
129 | * @api |
||
130 | */ |
||
131 | 2 | public function appendTo($element) |
|
148 | |||
149 | /** |
||
150 | * Returns the attribute value of the first node of the list, or sets an attribute on each element |
||
151 | * |
||
152 | * @see HtmlPageCrawler::getAttribute() |
||
153 | * @see HtmlPageCrawler::setAttribute |
||
154 | * |
||
155 | * @param string $name |
||
156 | * @param null|string $value |
||
157 | * @return null|string|HtmlPageCrawler |
||
158 | * @api |
||
159 | */ |
||
160 | 2 | public function attr($name, $value = null) |
|
168 | |||
169 | /** |
||
170 | * Sets an attribute on each element |
||
171 | * |
||
172 | * @param string $name |
||
173 | * @param string $value |
||
174 | * @return HtmlPageCrawler $this for chaining |
||
175 | */ |
||
176 | 3 | public function setAttribute($name, $value) |
|
186 | |||
187 | /** |
||
188 | * Returns the attribute value of the first node of the list. |
||
189 | * |
||
190 | * @param string $name The attribute name |
||
191 | * @return string|null The attribute value or null if the attribute does not exist |
||
192 | * @throws \InvalidArgumentException When current node is empty |
||
193 | * |
||
194 | */ |
||
195 | 2 | public function getAttribute($name) |
|
203 | |||
204 | /** |
||
205 | * Insert content, specified by the parameter, before each element in the set of matched elements. |
||
206 | * |
||
207 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content |
||
208 | * @return HtmlPageCrawler $this for chaining |
||
209 | * @api |
||
210 | */ |
||
211 | 2 | public function before($content) |
|
230 | |||
231 | /** |
||
232 | * Create a deep copy of the set of matched elements. |
||
233 | * |
||
234 | * Equivalent to clone() in jQuery (clone is not a valid PHP function name) |
||
235 | * |
||
236 | * @return HtmlPageCrawler |
||
237 | * @api |
||
238 | */ |
||
239 | 1 | public function makeClone() |
|
243 | |||
244 | 1 | public function __clone() |
|
254 | |||
255 | /** |
||
256 | * Get one CSS style property of the first element or set it for all elements in the list |
||
257 | * |
||
258 | * Function is here for compatibility with jQuery; it is the same as getStyle() and setStyle() |
||
259 | * |
||
260 | * @see HtmlPageCrawler::getStyle() |
||
261 | * @see HtmlPageCrawler::setStyle() |
||
262 | * |
||
263 | * @param string $key The name of the style property |
||
264 | * @param null|string $value The CSS value to set, or NULL to get the current value |
||
265 | * @return HtmlPageCrawler|string If no param is provided, returns the CSS styles of the first element |
||
266 | * @api |
||
267 | */ |
||
268 | 1 | public function css($key, $value = null) |
|
276 | |||
277 | /** |
||
278 | * get one CSS style property of the first element |
||
279 | * |
||
280 | * @param string $key name of the property |
||
281 | * @return string|null value of the property |
||
282 | */ |
||
283 | 1 | public function getStyle($key) |
|
288 | |||
289 | /** |
||
290 | * set one CSS style property for all elements in the list |
||
291 | * |
||
292 | * @param string $key name of the property |
||
293 | * @param string $value value of the property |
||
294 | * @return HtmlPageCrawler $this for chaining |
||
295 | */ |
||
296 | 1 | public function setStyle($key, $value) |
|
312 | |||
313 | /** |
||
314 | * Removes all child nodes and text from all nodes in set |
||
315 | * |
||
316 | * Equivalent to jQuery's empty() function which is not a valid function name in PHP |
||
317 | * @return HtmlPageCrawler $this |
||
318 | * @api |
||
319 | */ |
||
320 | 1 | public function makeEmpty() |
|
327 | |||
328 | /** |
||
329 | * Determine whether any of the matched elements are assigned the given class. |
||
330 | * |
||
331 | * @param string $name |
||
332 | * @return bool |
||
333 | * @api |
||
334 | */ |
||
335 | 2 | public function hasClass($name) |
|
347 | |||
348 | /** |
||
349 | * Set the HTML contents of each element |
||
350 | * |
||
351 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment |
||
352 | * @return HtmlPageCrawler $this for chaining |
||
353 | */ |
||
354 | 3 | public function setInnerHtml($content) |
|
368 | |||
369 | /** |
||
370 | * Insert every element in the set of matched elements after the target. |
||
371 | * |
||
372 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element |
||
373 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements inserted after the target elements |
||
374 | * @api |
||
375 | */ |
||
376 | 2 | public function insertAfter($element) |
|
396 | |||
397 | /** |
||
398 | * Insert every element in the set of matched elements before the target. |
||
399 | * |
||
400 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element |
||
401 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements inserted before the target elements |
||
402 | * @api |
||
403 | */ |
||
404 | 2 | public function insertBefore($element) |
|
421 | |||
422 | /** |
||
423 | * Insert content, specified by the parameter, to the beginning of each element in the set of matched elements. |
||
424 | * |
||
425 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content HTML code fragment |
||
426 | * @return HtmlPageCrawler $this for chaining |
||
427 | * @api |
||
428 | */ |
||
429 | 2 | public function prepend($content) |
|
451 | |||
452 | /** |
||
453 | * Insert every element in the set of matched elements to the beginning of the target. |
||
454 | * |
||
455 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element |
||
456 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements prepended to the target elements |
||
457 | * @api |
||
458 | */ |
||
459 | 1 | public function prependTo($element) |
|
481 | |||
482 | /** |
||
483 | * Remove the set of matched elements from the DOM. |
||
484 | * |
||
485 | * (as opposed to Crawler::clear() which detaches the nodes only from Crawler |
||
486 | * but leaves them in the DOM) |
||
487 | * |
||
488 | * @api |
||
489 | */ |
||
490 | 2 | public function remove() |
|
502 | |||
503 | /** |
||
504 | * Remove an attribute from each element in the set of matched elements. |
||
505 | * |
||
506 | * Alias for removeAttribute for compatibility with jQuery |
||
507 | * |
||
508 | * @param string $name |
||
509 | * @return HtmlPageCrawler |
||
510 | * @api |
||
511 | */ |
||
512 | 1 | public function removeAttr($name) |
|
516 | |||
517 | /** |
||
518 | * Remove an attribute from each element in the set of matched elements. |
||
519 | * |
||
520 | * @param string $name |
||
521 | * @return HtmlPageCrawler |
||
522 | */ |
||
523 | 1 | public function removeAttribute($name) |
|
535 | |||
536 | /** |
||
537 | * Remove a class from each element in the list |
||
538 | * |
||
539 | * @param string $name |
||
540 | * @return HtmlPageCrawler $this for chaining |
||
541 | * @api |
||
542 | */ |
||
543 | 2 | public function removeClass($name) |
|
560 | |||
561 | /** |
||
562 | * Replace each target element with the set of matched elements. |
||
563 | * |
||
564 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $element |
||
565 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler A new Crawler object containing all elements appended to the target elements |
||
566 | * @api |
||
567 | */ |
||
568 | 2 | public function replaceAll($element) |
|
589 | |||
590 | /** |
||
591 | * Replace each element in the set of matched elements with the provided new content and return the set of elements that was removed. |
||
592 | * |
||
593 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content |
||
594 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining |
||
595 | * @api |
||
596 | */ |
||
597 | 2 | public function replaceWith($content) |
|
620 | |||
621 | /** |
||
622 | * Get the combined text contents of each element in the set of matched elements, including their descendants. |
||
623 | * This is what the jQuery text() function does, contrary to the Crawler::text() method that returns only |
||
624 | * the text of the first node. |
||
625 | * |
||
626 | * @return string |
||
627 | * @api |
||
628 | */ |
||
629 | 1 | public function getCombinedText() |
|
638 | |||
639 | /** |
||
640 | * Set the text contents of the matched elements. |
||
641 | * |
||
642 | * @param string $text |
||
643 | * @return HtmlPageCrawler |
||
644 | * @api |
||
645 | */ |
||
646 | 1 | public function setText($text) |
|
654 | |||
655 | /** |
||
656 | * Add or remove one or more classes from each element in the set of matched elements, depending on the class’s presence. |
||
657 | * |
||
658 | * @param string $classname One or more classnames separated by spaces |
||
659 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining |
||
660 | * @api |
||
661 | */ |
||
662 | 1 | public function toggleClass($classname) |
|
678 | |||
679 | /** |
||
680 | * Remove the parents of the set of matched elements from the DOM, leaving the matched elements in their place. |
||
681 | * |
||
682 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining |
||
683 | * @api |
||
684 | */ |
||
685 | 1 | public function unwrap() |
|
695 | |||
696 | /** |
||
697 | * Remove the matched elements, but promote the children to take their place. |
||
698 | * |
||
699 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining |
||
700 | * @api |
||
701 | */ |
||
702 | 2 | public function unwrapInner() |
|
718 | |||
719 | |||
720 | /** |
||
721 | * Wrap an HTML structure around each element in the set of matched elements |
||
722 | * |
||
723 | * The HTML structure must contain only one root node, e.g.: |
||
724 | * Works: <div><div></div></div> |
||
725 | * Does not work: <div></div><div></div> |
||
726 | * |
||
727 | * @param string|HtmlPageCrawler|\DOMNode $wrappingElement |
||
728 | * @return HtmlPageCrawler $this for chaining |
||
729 | * @api |
||
730 | */ |
||
731 | 1 | public function wrap($wrappingElement) |
|
768 | |||
769 | /** |
||
770 | * Wrap an HTML structure around all elements in the set of matched elements. |
||
771 | * |
||
772 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content |
||
773 | * @throws \LogicException |
||
774 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining |
||
775 | * @api |
||
776 | */ |
||
777 | 1 | public function wrapAll($content) |
|
815 | |||
816 | /** |
||
817 | * Wrap an HTML structure around the content of each element in the set of matched elements. |
||
818 | * |
||
819 | * @param string|HtmlPageCrawler|\DOMNode|\DOMNodeList $content |
||
820 | * @return \Wa72\HtmlPageDom\HtmlPageCrawler $this for chaining |
||
821 | * @api |
||
822 | */ |
||
823 | 1 | public function wrapInner($content) |
|
831 | |||
832 | /** |
||
833 | * Get the HTML code fragment of all elements and their contents. |
||
834 | * |
||
835 | * If the first node contains a complete HTML document return only |
||
836 | * the full code of this document. |
||
837 | * |
||
838 | * @return string HTML code (fragment) |
||
839 | * @api |
||
840 | */ |
||
841 | 8 | public function saveHTML() |
|
855 | |||
856 | 4 | public function __toString() |
|
860 | |||
861 | /** |
||
862 | * checks whether the first node contains a complete html document |
||
863 | * (as opposed to a document fragment) |
||
864 | * |
||
865 | * @return boolean |
||
866 | */ |
||
867 | 8 | public function isHtmlDocument() |
|
880 | |||
881 | /** |
||
882 | * get ownerDocument of the first element |
||
883 | * |
||
884 | * @return \DOMDocument|null |
||
885 | */ |
||
886 | 1 | public function getDOMDocument() |
|
897 | |||
898 | /** |
||
899 | * Filters the list of nodes with a CSS selector. |
||
900 | * |
||
901 | * @param string $selector |
||
902 | * @return HtmlPageCrawler |
||
903 | */ |
||
904 | 8 | public function filter($selector) |
|
908 | |||
909 | /** |
||
910 | * Filters the list of nodes with an XPath expression. |
||
911 | * |
||
912 | * @param string $xpath An XPath expression |
||
913 | * |
||
914 | * @return HtmlPageCrawler A new instance of Crawler with the filtered list of nodes |
||
915 | * |
||
916 | * @api |
||
917 | */ |
||
918 | 2 | public function filterXPath($xpath) |
|
922 | |||
923 | /** |
||
924 | * Adds HTML/XML content to the HtmlPageCrawler object (but not to the DOM of an already attached node). |
||
925 | * |
||
926 | * Function overridden from Crawler because HTML fragments are always added as complete documents there |
||
927 | * |
||
928 | * |
||
929 | * @param string $content A string to parse as HTML/XML |
||
930 | * @param null|string $type The content type of the string |
||
931 | * |
||
932 | * @return null|void |
||
933 | */ |
||
934 | 17 | public function addContent($content, $type = null) |
|
946 | |||
947 | 15 | public function addHtmlFragment($content, $charset = 'UTF-8') |
|
960 | |||
961 | // /** |
||
962 | // * returns the first node |
||
963 | // * deprecated, use getNode(0) instead |
||
964 | // * |
||
965 | // * @return \DOMNode|null |
||
966 | // * @deprecated |
||
967 | // * @see Crawler::getNode |
||
968 | // */ |
||
969 | // public function getFirstNode() |
||
970 | // { |
||
971 | // return $this->getNode(0); |
||
972 | // } |
||
973 | |||
974 | // /** |
||
975 | // * @param int $position |
||
976 | // * |
||
977 | // * overridden from Crawler because it is not public in Symfony 2.3 |
||
978 | // * TODO: throw away as soon as we don't need to support SF 2.3 any more |
||
979 | // * |
||
980 | // * @return \DOMElement|null |
||
981 | // */ |
||
982 | // public function getNode($position) |
||
983 | // { |
||
984 | // return parent::getNode($position); |
||
985 | // } |
||
986 | // |
||
987 | // /** |
||
988 | // * Returns the node name of the first node of the list. |
||
989 | // * |
||
990 | // * in Crawler (parent), this function will be available starting with 2.6.0, |
||
991 | // * therefore this method can be removed from here as soon as we don't need to keep compatibility |
||
992 | // * with Symfony < 2.6 |
||
993 | // * |
||
994 | // * TODO: throw away as soon as we don't need to support SF 2.3 any more |
||
995 | // * |
||
996 | // * @return string The node name |
||
997 | // * |
||
998 | // * @throws \InvalidArgumentException When current node is empty |
||
999 | // */ |
||
1000 | // public function nodeName() |
||
1001 | // { |
||
1002 | // if (!count($this)) { |
||
1003 | // throw new \InvalidArgumentException('The current node list is empty.'); |
||
1004 | // } |
||
1005 | // return $this->getNode(0)->nodeName; |
||
1006 | // } |
||
1007 | |||
1008 | /** |
||
1009 | * Adds a node to the current list of nodes. |
||
1010 | * |
||
1011 | * This method uses the appropriate specialized add*() method based |
||
1012 | * on the type of the argument. |
||
1013 | * |
||
1014 | * Overwritten from parent to allow Crawler to be added |
||
1015 | * |
||
1016 | * @param null|\DOMNodeList|array|\DOMNode|Crawler $node A node |
||
1017 | * |
||
1018 | * @api |
||
1019 | */ |
||
1020 | 29 | public function add($node) |
|
1030 | |||
1031 | /** |
||
1032 | * @param \DOMNode $newnode |
||
1033 | * @param \DOMNode $referencenode |
||
1034 | * @param int $clone |
||
1035 | * @return \DOMNode |
||
1036 | */ |
||
1037 | 6 | protected static function importNewnode(\DOMNode $newnode, \DOMNode $referencenode, $clone = 0) { |
|
1048 | |||
1049 | /** |
||
1050 | * Checks whether the first node in the set is disconnected (has no parent node) |
||
1051 | * |
||
1052 | * @return bool |
||
1053 | */ |
||
1054 | 1 | public function isDisconnected() |
|
1059 | |||
1060 | 1 | public function __get($name) |
|
1069 | } |
||
1070 |
Really long classes often contain too much logic and violate the single responsibility principle.
We suggest taking a look at the “Code” section for options on how to refactor this code.