godbout /
htmlpagedom
| 1 | <?php |
||||
| 2 | |||||
| 3 | namespace Wa72\HtmlPageDom; |
||||
| 4 | |||||
/**
 * Static helper functions for HtmlPageDom
 *
 * @package Wa72\HtmlPageDom
 */
class Helpers
{
    /**
     * Collapse every run of whitespace (including newlines and carriage returns)
     * into a single space and strip leading/trailing whitespace.
     *
     * Useful for cleaning up text retrieved by HtmlPageCrawler::text()
     * (the nodeValue of a DOMNode).
     *
     * @param string $string
     * @return string
     */
    public static function trimNewlines($string)
    {
        // "\s" already matches "\n" and "\r", so a single pass is enough.
        return trim(preg_replace('/\s+/', ' ', $string));
    }

    /**
     * Convert a CSS declaration list (e.g. the content of a style attribute) to an array.
     *
     * Whitespace runs are collapsed to one space first. Declarations are separated
     * by ";", except where the semicolon sits inside a quoted string or inside
     * parentheses (for example "url(data:image/png;base64,...)"). Empty statements
     * and statements without a property name or without a colon are ignored.
     * When a property occurs more than once, the last occurrence wins.
     *
     * @param string $css list of CSS properties separated by ;
     * @return array name=>value pairs of CSS properties
     */
    public static function cssStringToArray($css)
    {
        $styles = [];
        $normalized = preg_replace('/\s+/', ' ', $css);

        foreach (self::splitCssStatements($normalized) as $statement) {
            $statement = trim($statement);
            if ('' === $statement) {
                continue;
            }

            $colon = strpos($statement, ':');
            if (false === $colon || 0 === $colon) {
                // invalid statement (no colon, or no property name), just ignore it
                continue;
            }

            $key = trim(substr($statement, 0, $colon));
            $value = trim(substr($statement, $colon + 1));
            $styles[$key] = $value;
        }

        return $styles;
    }

    /**
     * Convert CSS name->value array to string
     *
     * Every pair is rendered as "name: value;" and the pairs are concatenated
     * without any separator between them.
     *
     * @param array $array name=>value pairs of CSS properties
     * @return string list of CSS properties separated by ;
     */
    public static function cssArrayToString($array)
    {
        $declarations = [];
        foreach ($array as $property => $value) {
            $declarations[] = $property . ': ' . $value . ';';
        }

        return implode('', $declarations);
    }

    /**
     * Helper function for getting a body element
     * from an HTML fragment
     *
     * The fragment is wrapped in <html><body>...</body></html> and parsed with
     * DOMDocument::loadHTML(). libxml error reporting (and, before PHP 8, the
     * entity loader) is switched for the duration of the parse and restored
     * afterwards, even if parsing throws.
     *
     * @param string $html A fragment of HTML code
     * @param string $charset
     * @return \DOMNode|null The body node containing child nodes created from the HTML fragment,
     *                       or null if the parsed document contains no body element
     */
    public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
    {
        $html = '<html><body>' . $html . '</body></html>';

        $previousErrorMode = libxml_use_internal_errors(true);
        $previousEntityLoader = null;
        if (\PHP_VERSION_ID < 80000) {
            // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so only call it on older versions
            $previousEntityLoader = libxml_disable_entity_loader(true);
        }

        try {
            $document = new \DOMDocument('1.0', $charset);
            $document->validateOnParse = true;

            // Parse errors are collected by libxml (see above); "@" is kept so that
            // any other warning raised by loadHTML() stays silent, as it always has.
            @$document->loadHTML(self::encodeNonAsciiAsEntities($html, $charset));
        } finally {
            libxml_use_internal_errors($previousErrorMode);
            if (\PHP_VERSION_ID < 80000) {
                libxml_disable_entity_loader($previousEntityLoader);
            }
        }

        return $document->getElementsByTagName('body')->item(0);
    }

    /**
     * Split a CSS declaration list on the semicolons that actually separate declarations.
     *
     * Semicolons inside single/double quoted strings or inside parentheses are kept
     * as part of the current declaration. If the input ends while a quote or a
     * parenthesis is still open it is malformed; in that case the plain explode(';')
     * result is returned instead.
     *
     * @param string $css
     * @return string[]
     */
    private static function splitCssStatements($css)
    {
        $statements = [];
        $current = '';
        $quote = null; // the quote character we are inside of, or null when outside a string
        $depth = 0;    // nesting level of parentheses
        $length = strlen($css);

        for ($i = 0; $i < $length; $i++) {
            $char = $css[$i];

            if (null !== $quote) {
                if ('\\' === $char && $i + 1 < $length) {
                    // escaped character inside a string: copy both bytes verbatim
                    $current .= $char . $css[++$i];
                    continue;
                }
                if ($char === $quote) {
                    $quote = null;
                }
            } elseif ('"' === $char || "'" === $char) {
                $quote = $char;
            } elseif ('(' === $char) {
                $depth++;
            } elseif (')' === $char) {
                if ($depth > 0) {
                    $depth--;
                }
            } elseif (';' === $char && 0 === $depth) {
                $statements[] = $current;
                $current = '';
                continue;
            }

            $current .= $char;
        }

        if (null !== $quote || $depth > 0) {
            // unbalanced input: fall back to the simple split
            return explode(';', $css);
        }

        $statements[] = $current;

        return $statements;
    }

    /**
     * Replace all non-ASCII characters by numeric HTML entities so that
     * DOMDocument::loadHTML() does not have to guess the input encoding.
     *
     * The string is returned unchanged when the mbstring extension is missing
     * or does not list the given charset.
     *
     * @param string $html
     * @param string $charset
     * @return string
     */
    private static function encodeNonAsciiAsEntities($html, $charset)
    {
        if (!function_exists('mb_encode_numericentity')) {
            return $html;
        }

        $supported = array_map('strtolower', mb_list_encodings());
        if (!in_array(strtolower($charset), $supported, true)) {
            return $html;
        }

        // Convert every code point from U+0080 upwards. This replaces the 'HTML-ENTITIES'
        // pseudo-encoding of mb_convert_encoding(), which is deprecated as of PHP 8.2.
        return mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset);
    }
}
||||
| 107 |
If you suppress an error, we recommend checking for the error condition explicitly: