Total Complexity | 64 |
Total Lines | 444 |
Duplicated Lines | 0 % |
Changes | 1 | ||
Bugs | 0 | Features | 0 |
Complex classes like AbstractDomParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use AbstractDomParser, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
22 | abstract class AbstractDomParser |
||
23 | { |
||
24 | /** @var object The object that holds the dom */ |
||
25 | public $document; |
||
26 | |||
27 | /** @var bool If we are using the internal or external parser */ |
||
28 | public $internalParser; |
||
29 | |||
30 | /** @var string Line end character */ |
||
31 | public $line_end = "\n"; |
||
32 | |||
33 | /** @var string Line break character */ |
||
34 | public $line_break = " \n\n"; |
||
35 | |||
36 | /** @var int Wordwrap output, set to 0 to skip wrapping */ |
||
37 | public $body_width = 76; |
||
38 | |||
/**
 * For a given node, checks if it is anywhere nested inside a code block
 *
 * - Prevents converting anything that's inside a code block
 * - Walks up the ancestor chain until a pre/code element is found or there
 *   are no more parents to inspect
 *
 * @param object|null $node The node whose ancestors are inspected
 * @param bool $internalParser True to read parentNode / nodeName as properties,
 * false to call them as methods
 *
 * @return bool True when any ancestor is a pre or code element
 */
public static function hasParentCode($node, $internalParser)
{
	// No node means no ancestors, so it cannot be inside a code block
	if ($node === null)
	{
		return false;
	}

	$parent = $internalParser ? $node->parentNode : $node->parentNode();
	while ($parent)
	{
		// Anywhere nested inside a code/pre block we don't render tags
		$tag = $internalParser ? $parent->nodeName : $parent->nodeName();
		if (in_array($tag, ['pre', 'code'], true))
		{
			return true;
		}

		// Back out another level, until we are done
		$parent = $internalParser ? $parent->parentNode : $parent->parentNode();
	}

	return false;
}
||
65 | |||
66 | /** |
||
67 | * Set the DOM parser for class, loads the supplied HTML |
||
68 | */ |
||
69 | public function setParser() |
||
78 | } |
||
79 | } |
||
80 | |||
81 | /** |
||
82 | * Loads a string of HTML into the parser for processing |
||
83 | * |
||
84 | * @param string $html |
||
85 | */ |
||
86 | public function loadHTML($html) |
||
110 | } |
||
111 | } |
||
112 | |||
/**
 * Reduces a html string to its body content, dropping the head and anything
 * above it. When several body sections exist (multipart/mixed) their contents
 * are joined with a newline.
 *
 * Tried in order:
 * - every complete <body>...</body> section
 * - a complete <html>...</html> wrapper
 * - an opening <body> whose closing tag is missing
 * - otherwise the text is returned as supplied
 *
 * @param string $text
 *
 * @return string
 */
public function getBodyText($text)
{
	// Fully closed sections, body first and then the html wrapper
	$closedSections = [
		'~<body[^>]*?>(.*?)</body>~su',
		'~<html[^>]*?>(.*)</html>~su',
	];

	foreach ($closedSections as $pattern)
	{
		$found = preg_match_all($pattern, $text, $sections);
		if ($found)
		{
			return implode("\n", $sections[1]);
		}
	}

	// Parsers may have clipped the ending body or html tag off with the quote/signature
	$unterminated = preg_match('~<body[^>]*?>(.*)~su', $text, $remainder);

	return $unterminated ? $remainder[1] : $text;
}
||
141 | |||
142 | /** |
||
143 | * Returns just the body of a dom object such that we are not dealing with head |
||
144 | * and any above head markup |
||
145 | * |
||
146 | * @return object |
||
147 | */ |
||
148 | public function getDOMBodyNode() |
||
167 | } |
||
168 | |||
169 | /** |
||
170 | * Remove any <head> node from the DOM |
||
171 | * |
||
172 | * This is done due to poor structure of some received HTML via email etc. |
||
173 | */ |
||
174 | private function _removeHeadNode() |
||
187 | } |
||
188 | } |
||
189 | } |
||
190 | |||
/**
 * Breaks a string up so its no more than width characters long
 *
 * - Will break at word boundaries (whitespace, comma, period or end of line)
 * - If no natural break is found will break mid-word
 * - Lines that start with > are treated as quotes, any continuation line
 *   created from them is prefixed with "> "
 * - A width below 76 returns the string unchanged
 *
 * @param string $string
 * @param int $width
 * @param string $break
 * @return string
 */
public function utf8Wordwrap($string, $width = 76, $break = "\n")
{
	if ($width < 76)
	{
		return $string;
	}

	// The width is spliced into a regex quantifier below, so force a plain integer
	$width = (int) $width;

	$lines = [];
	foreach (explode($break, $string) as $line)
	{
		$in_quote = isset($line[0]) && $line[0] === '>';

		// Preserve blank lines. Compare against '' since empty() is also
		// true for the string "0", which would turn that line into a blank one
		if ($line === '')
		{
			$lines[] = '';
			continue;
		}

		// Same reasoning for the loop test, a remainder of "0" must still be output
		while ($line !== '')
		{
			// Get the next #width characters before a break (space, punctuation tab etc)
			if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $line, $matches))
			{
				// Add the #width to the output and set up for the next pass
				$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . $matches[1];
				$line = (string) Util::substr($line, Util::strlen($matches[1]));
			}
			// Just a long word with no place to break, so we simply cut it after width characters
			else
			{
				$lines[] = ($in_quote && $line[0] !== '>' ? '> ' : '') . Util::substr($line, 0, $width);
				$line = (string) Util::substr($line, $width);
			}
		}
	}

	// Join all the shortened sections up on our break characters
	return implode($break, $lines);
}
||
241 | |||
/**
 * Counts how many list elements (ul / ol) a node is nested inside of
 *
 * - Climbs the ancestors of the node one parent at a time
 * - Every ul or ol found on the way up adds one to the depth
 *
 * @param object $node
 *
 * @return int the nesting depth, 0 when the node is not inside any list
 */
public function hasParentList($node)
{
	$levels = 0;

	// Step outward one ancestor per pass until there is no parent left
	for ($ancestor = $this->getParent($node); $ancestor; $ancestor = $this->getParent($ancestor))
	{
		$name = $this->getName($ancestor);

		// Each enclosing list is one more level of nesting
		$levels += in_array($name, ['ul', 'ol']) ? 1 : 0;
	}

	return $levels;
}
||
269 | |||
/**
 * Fetches the parent of a node using whichever parser is active
 *
 * - Internal parser: reads the parentNode property
 * - External parser: calls the parentNode() method
 *
 * @param object|null $node
 * @return object|null null when no node was supplied
 */
public function getParent($node)
{
	if ($node !== null)
	{
		if ($this->internalParser)
		{
			return $node->parentNode;
		}

		return $node->parentNode();
	}

	return null;
}
||
285 | |||
286 | /** |
||
287 | * Returns the node Name of a node |
||
288 | * |
||
289 | * @param $node |
||
290 | * @return string |
||
291 | */ |
||
292 | public function getName($node) |
||
300 | } |
||
301 | |||
302 | /** |
||
303 | * Returns the HTML of the document |
||
304 | * |
||
305 | * @return string |
||
306 | */ |
||
307 | public function getHTML() |
||
315 | } |
||
316 | |||
/**
 * Gets a single entry from a collection of nodes
 *
 * - Internal parser: uses the item() method of the collection
 * - External parser: the collection is accessed by array offset; an offset
 *   that is not set yields null rather than raising an undefined offset
 *   warning, mirroring the null the internal branch gives for a missing item
 *
 * @param object|array $node the collection to read from
 * @param int $item zero based position of the wanted entry
 * @return object|null
 */
public function getItem($node, $item)
{
	if ($this->internalParser)
	{
		return $node->item($item);
	}

	return isset($node[$item]) ? $node[$item] : null;
}
||
328 | |||
/**
 * Reports how many entries a node collection holds
 *
 * - Internal parser: reads the length property
 * - External parser: counts the collection
 *
 * @param object|array $node
 * @return int
 */
public function getLength($node)
{
	if ($this->internalParser)
	{
		return $node->length;
	}

	return count($node);
}
||
339 | |||
/**
 * Fetches every child of a node
 *
 * - Internal parser: reads the childNodes property
 * - External parser: calls the childNodes() method
 *
 * @param object|array $node
 * @return object
 */
public function getChildren($node)
{
	if ($this->internalParser)
	{
		return $node->childNodes;
	}

	return $node->childNodes();
}
||
350 | |||
/**
 * Fetches one particular child of a node
 *
 * - Internal parser: takes the item from the childNodes collection
 * - External parser: passes the position to the childNodes() method
 *
 * @param object|array $node
 * @param int $child child number to return
 * @return object
 */
public function getChild($node, $child)
{
	if ($this->internalParser)
	{
		$children = $node->childNodes;

		return $children->item($child);
	}

	return $node->childNodes($child);
}
||
362 | |||
/**
 * Fetches the sibling that follows a node
 *
 * - Internal parser: reads the nextSibling property
 * - External parser: calls the next_sibling() method
 *
 * @param object|array $node
 * @return object
 */
public function getSibling($node)
{
	if ($this->internalParser)
	{
		return $node->nextSibling;
	}

	return $node->next_sibling();
}
||
373 | |||
/**
 * Reads the value held by a node
 *
 * - Internal parser: returns the nodeValue property as is
 * - External parser: returns innertext after running it through
 *   htmlspecialchars_decode and then html_entity_decode (UTF-8, ENT_QUOTES)
 *
 * @param object|null $node
 * @return string an empty string when no node was supplied
 */
public function getValue($node)
{
	if ($node === null)
	{
		return '';
	}

	if (!$this->internalParser)
	{
		// Special characters first, then whatever named/numeric entities remain
		$decoded = htmlspecialchars_decode($node->innertext, ENT_QUOTES);

		return html_entity_decode($decoded, ENT_QUOTES, 'UTF-8');
	}

	return $node->nodeValue;
}
||
394 | |||
/**
 * Swaps a node for plain text, discarding what the node held before
 *
 * - Internal parser: creates a text node on the document and has the
 *   parent replace the supplied node with it
 * - External parser: assigns the text to the outertext of the node
 *
 * @param object $node
 * @param string $text
 */
public function setTextNode($node, $text)
{
	if (!$this->internalParser)
	{
		$node->outertext = $text;

		return;
	}

	$replacement = $this->document->createTextNode($text);
	$node->parentNode->replaceChild($replacement, $node);
}
||
413 | |||
/**
 * Returns the markup found inside a node, without the tag of the node itself
 *
 * - External parser: the innertext property is returned directly
 * - Internal parser: the node is copied into a scratch document, serialized,
 *   and the opening / closing tag of the node is removed from the result
 *
 * @param \DOMNode|object $node
 * @return string
 */
public function getInnerHTML($node)
{
	if (!$this->internalParser)
	{
		return $node->innertext;
	}

	// Serialize a deep copy of the node on its own
	$scratch = new \DOMDocument();
	$scratch->preserveWhiteSpace = true;
	$copy = $scratch->importNode($node, true);
	$scratch->appendChild($copy);
	$markup = trim($scratch->saveHTML());

	// Drop the wrapping tag so only the contents remain
	$name = $node->nodeName;
	$wrapper = '@^<' . $name . '[^>]*>|</' . $name . '>$@';

	return preg_replace($wrapper, '', $markup);
}
||
435 | |||
/**
 * Returns the markup of a node including its own tag
 *
 * - Internal parser: the document serializes the node and the result is
 *   passed through htmlspecialchars_decode
 * - External parser: the outertext property is returned
 *
 * @param \DOMNode|object $node
 * @return string
 */
public function getOuterHTML($node)
{
	if ($this->internalParser)
	{
		$markup = $this->document->saveHTML($node);

		return htmlspecialchars_decode($markup);
	}

	return $node->outertext;
}
||
446 | |||
447 | /** |
||
448 | * Gets the inner html of a node |
||
449 | * |
||
450 | * @param \DOMNode|object $node |
||
451 | * @return string |
||
452 | */ |
||
453 | public function setInnerHTML($node) |
||
466 | } |
||
467 | } |