Complex classes like HtmlPage often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HtmlPage and, based on these observations, to apply Extract Interface, too.
| 1 | <?php | ||
| 18 | class HtmlPage | ||
| 19 | { | ||
| 20 | /** | ||
| 21 | * | ||
| 22 | * @var \DOMDocument | ||
| 23 | */ | ||
| 24 | protected $dom; | ||
| 25 | |||
| 26 | /** | ||
| 27 | * @var string | ||
| 28 | */ | ||
| 29 | protected $charset; | ||
| 30 | |||
| 31 | /** | ||
| 32 | * @var string | ||
| 33 | */ | ||
| 34 | protected $url; | ||
| 35 | |||
| 36 | /** | ||
| 37 | * | ||
| 38 | * @var HtmlPageCrawler | ||
| 39 | */ | ||
| 40 | protected $crawler; | ||
| 41 | |||
| 42 | 5 | public function __construct($content = '', $url = '', $charset = 'UTF-8') | |
| 67 | |||
| 68 | /** | ||
| 69 | * Get a HtmlPageCrawler object containing the root node of the HTML document | ||
| 70 | * | ||
| 71 | * @return HtmlPageCrawler | ||
| 72 | */ | ||
| 73 | public function getCrawler() | ||
| 77 | |||
| 78 | /** | ||
| 79 | * Get a DOMDocument object for the HTML document | ||
| 80 | * | ||
| 81 | * @return \DOMDocument | ||
| 82 | */ | ||
| 83 | public function getDOMDocument() | ||
| 87 | |||
| 88 | /** | ||
| 89 | * Sets the page title of the HTML document | ||
| 90 | * | ||
| 91 | * @param string $title | ||
| 92 | */ | ||
| 93 | 2 | public function setTitle($title) | |
| 102 | |||
| 103 | /** | ||
| 104 | * Get the page title of the HTML document | ||
| 105 | * | ||
| 106 | * @return null|string | ||
| 107 | */ | ||
| 108 | 2 | public function getTitle() | |
| 117 | |||
| 118 | /** | ||
| 119 | * Set a META tag with specified 'name' and 'content' attributes | ||
| 120 | * | ||
| 121 | * @TODO: add support for multiple meta tags with the same name but different languages | ||
| 122 | * | ||
| 123 | * @param $name | ||
| 124 | * @param $content | ||
| 125 | */ | ||
| 126 | 1 | public function setMeta($name, $content) | |
| 137 | |||
| 138 | /** | ||
| 139 | * Remove all meta tags with the specified name attribute | ||
| 140 | * | ||
| 141 | * @param string $name | ||
| 142 | */ | ||
| 143 | 1 | public function removeMeta($name) | |
| 148 | |||
| 149 | /** | ||
| 150 | * Get the content attribute of a meta tag with the specified name attribute | ||
| 151 | * | ||
| 152 | * @param string $name | ||
| 153 | * @return null|string | ||
| 154 | */ | ||
| 155 | 1 | public function getMeta($name) | |
| 164 | |||
| 165 | /** | ||
| 166 | * Set the base tag with href attribute set to parameter $url | ||
| 167 | * | ||
| 168 | * @param string $url | ||
| 169 | */ | ||
| 170 | public function setBaseHref($url) | ||
| 179 | |||
| 180 | /** | ||
| 181 | * Get the href attribute from the base tag, null if not present in document | ||
| 182 | * | ||
| 183 | * @return null|string | ||
| 184 | */ | ||
| 185 | public function getBaseHref() | ||
| 194 | |||
| 195 | /** | ||
| 196 | * Sets innerHTML content of an element specified by elementId | ||
| 197 | * | ||
| 198 | * @param string $elementId | ||
| 199 | * @param string $html | ||
| 200 | */ | ||
| 201 | 1 | public function setHtmlById($elementId, $html) | |
| 205 | |||
| 206 | /** | ||
| 207 | * Get the document's HEAD section as DOMElement | ||
| 208 | * | ||
| 209 | * @return \DOMElement | ||
| 210 | */ | ||
| 211 | 1 | public function getHeadNode() | |
| 220 | |||
| 221 | /** | ||
| 222 | * Get the document's body as DOMElement | ||
| 223 | * | ||
| 224 | * @return \DOMElement | ||
| 225 | */ | ||
| 226 | 1 | public function getBodyNode() | |
| 235 | |||
| 236 | /** | ||
| 237 | * Get the document's HEAD section wrapped in a HtmlPageCrawler instance | ||
| 238 | * | ||
| 239 | * @return HtmlPageCrawler | ||
| 240 | */ | ||
| 241 | public function getHead() | ||
| 245 | |||
| 246 | /** | ||
| 247 | * Get the document's body wrapped in a HtmlPageCrawler instance | ||
| 248 | * | ||
| 249 | * @return HtmlPageCrawler | ||
| 250 | */ | ||
| 251 | 1 | public function getBody() | |
| 255 | |||
/**
 * Render the document as an HTML string
 *
 * The markup is produced by saveHTML() on the wrapped DOM document. If the
 * mbstring function mb_convert_encoding() exists and the page charset is one
 * of the encodings reported by mb_list_encodings() (compared case-insensitively),
 * the markup is converted from 'HTML-ENTITIES' to that charset; otherwise it is
 * returned exactly as saveHTML() produced it.
 *
 * @return string
 */
public function __toString()
{
    $markup = $this->dom->saveHTML();

    // Without mbstring there is nothing to convert with.
    if (!function_exists('mb_convert_encoding')) {
        return $markup;
    }

    // Only convert when the target charset is known to mbstring.
    $knownEncodings = array_map('strtolower', mb_list_encodings());
    if (!in_array(strtolower($this->charset), $knownEncodings)) {
        return $markup;
    }

    return mb_convert_encoding($markup, $this->charset, 'HTML-ENTITIES');
}
| 264 | |||
/**
 * Write the HTML of this document to a file, or hand it back as a string
 *
 * The HTML is obtained from __toString(). The result of the file write is
 * not inspected.
 *
 * @param string $filename Target file; when empty (the default) nothing is written and the HTML is returned
 * @return string|void HTML code if no filename was given, nothing otherwise
 */
public function save($filename = '')
{
    $html = $this->__toString();

    // No target file: the caller wants the markup itself.
    if ($filename == '') {
        return $html;
    }

    file_put_contents($filename, $html);
}
| 280 | |||
| 281 | /** | ||
| 282 | * Get an element in the document by its id attribute | ||
| 283 | * | ||
| 284 | * @param string $id | ||
| 285 | * @return HtmlPageCrawler | ||
| 286 | */ | ||
| 287 | 1 | public function getElementById($id) | |
| 291 | |||
| 292 | /** | ||
| 293 | * Filter nodes by using a CSS selector | ||
| 294 | * | ||
| 295 | * @param string $selector CSS selector | ||
| 296 | * @return HtmlPageCrawler | ||
| 297 | */ | ||
| 298 | 1 | public function filter($selector) | |
| 303 | |||
| 304 | /** | ||
| 305 | * Filter nodes by XPath expression | ||
| 306 | * | ||
| 307 | * @param string $xpath XPath expression | ||
| 308 | * @return HtmlPageCrawler | ||
| 309 | */ | ||
| 310 | 1 | public function filterXPath($xpath) | |
| 314 | |||
| 315 | /** | ||
| 316 | * remove newlines from string and minimize whitespace (multiple whitespace characters replaced by one space) | ||
| 317 | * | ||
| 318 | * useful for cleaning up text retrieved by HtmlPageCrawler::text() (nodeValue of a DOMNode) | ||
| 319 | * | ||
| 320 | * @param string $string | ||
| 321 | * @return string | ||
| 322 | */ | ||
| 323 | public static function trimNewlines($string) | ||
| 327 | |||
| 328 | 1 | public function __clone() | |
| 333 | |||
| 334 | /** | ||
| 335 | * minify the HTML document | ||
| 336 | * | ||
| 337 | * @param array $options Options passed to PrettyMin::__construct() | ||
| 338 | * @return HtmlPage | ||
| 339 | * @throws \Exception | ||
| 340 | */ | ||
| 341 | 1 | public function minify(array $options = array()) | |
| 350 | |||
| 351 | /** | ||
| 352 | * indent the HTML document | ||
| 353 | * | ||
| 354 | * @param array $options Options passed to PrettyMin::__construct() | ||
| 355 | * @return HtmlPage | ||
| 356 | * @throws \Exception | ||
| 357 | */ | ||
| 358 | 1 | public function indent(array $options = array()) | |
| 367 | } | ||
| 368 | 
If you suppress an error, we recommend checking for the error condition explicitly: