1 | <?php |
||
2 | |||
3 | namespace Wa72\HtmlPageDom; |
||
4 | |||
/**
 * Static helper functions for HtmlPageDom
 *
 * @package Wa72\HtmlPageDom
 */
class Helpers
{
    /**
     * Collapse whitespace in a string.
     *
     * Every run of whitespace characters (including newlines and carriage
     * returns) is replaced by a single space, and leading/trailing whitespace
     * is removed. Useful for cleaning up text retrieved by
     * HtmlPageCrawler::text() (nodeValue of a DOMNode).
     *
     * @param string $string
     * @return string
     */
    public static function trimNewlines($string)
    {
        // "\s" already matches "\n" and "\r", so a single regex pass is enough.
        return trim(preg_replace('/\s+/', ' ', (string) $string));
    }

    /**
     * Convert CSS string to array
     *
     * Declarations are separated by ";". A semicolon inside a quoted string or
     * inside parentheses (e.g. "url(data:image/png;base64,...)") belongs to the
     * value and does not end the declaration. Declarations without a property
     * name or without a ":" are silently ignored. When the same property occurs
     * more than once, the last occurrence wins.
     *
     * @param string $css list of CSS properties separated by ;
     * @return array name=>value pairs of CSS properties
     */
    public static function cssStringToArray($css)
    {
        $normalized = preg_replace('/\s+/', ' ', (string) $css);

        $styles = [];
        foreach (self::splitCssDeclarations($normalized) as $statement) {
            $statement = trim($statement);
            if ('' === $statement) {
                continue;
            }

            $p = strpos($statement, ':');
            if (false === $p || 0 === $p) {
                // no colon at all, or no property name before it: invalid, ignore
                continue;
            }

            $key = trim(substr($statement, 0, $p));
            $value = trim(substr($statement, $p + 1));
            $styles[$key] = $value;
        }

        return $styles;
    }

    /**
     * Split a CSS declaration list on top-level semicolons.
     *
     * Semicolons inside single/double quoted strings or inside parentheses are
     * kept as part of the current declaration. If the input turns out to have
     * an unterminated quote or unbalanced parentheses, the plain
     * explode(';', ...) result is returned instead, so malformed input is
     * handled exactly like a naive split.
     *
     * @param string $css
     * @return string[] raw (untrimmed) declarations
     */
    private static function splitCssDeclarations($css)
    {
        $parts = [];
        $buffer = '';
        $depth = 0;    // nesting level of "(" ... ")"
        $quote = null; // currently open quote character, or null
        $length = strlen($css);

        // Byte-wise scanning is safe for UTF-8: all delimiters are ASCII and
        // never occur inside a multi-byte sequence.
        for ($i = 0; $i < $length; $i++) {
            $char = $css[$i];

            if (null !== $quote) {
                if ('\\' === $char && $i + 1 < $length) {
                    // keep the escaped character, it cannot close the string
                    $buffer .= $char . $css[++$i];
                    continue;
                }
                if ($char === $quote) {
                    $quote = null;
                }
            } elseif ('"' === $char || "'" === $char) {
                $quote = $char;
            } elseif ('(' === $char) {
                $depth++;
            } elseif (')' === $char) {
                if ($depth > 0) {
                    $depth--;
                }
            } elseif (';' === $char && 0 === $depth) {
                $parts[] = $buffer;
                $buffer = '';
                continue;
            }

            $buffer .= $char;
        }
        $parts[] = $buffer;

        if (null !== $quote || 0 !== $depth) {
            // malformed input: fall back to the simple split
            return explode(';', $css);
        }

        return $parts;
    }

    /**
     * Convert CSS name->value array to string
     *
     * Each pair is rendered as "name: value;" and the pairs are concatenated
     * without any separator in between.
     *
     * @param array $array name=>value pairs of CSS properties
     * @return string list of CSS properties separated by ;
     */
    public static function cssArrayToString($array)
    {
        $styles = '';
        foreach ($array as $key => $value) {
            $styles .= $key . ': ' . $value . ';';
        }

        return $styles;
    }

    /**
     * Helper function for getting a body element
     * from an HTML fragment
     *
     * The fragment is wrapped in "<html><body>...</body></html>" and parsed
     * with DOMDocument::loadHTML(). Parse errors are collected internally by
     * libxml instead of being emitted; the previous libxml error mode (and, on
     * PHP < 8, the previous entity loader setting) is restored before
     * returning, even if parsing throws.
     *
     * @param string $html A fragment of HTML code
     * @param string $charset
     * @return \DOMNode The body node containing child nodes created from the HTML fragment
     */
    public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')
    {
        $html = '<html><body>' . $html . '</body></html>';

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0,
        // so it is only touched on older versions.
        $legacyEntityLoader = \PHP_VERSION_ID < 80000;

        $previousErrorMode = libxml_use_internal_errors(true);
        $previousEntityLoader = $legacyEntityLoader ? libxml_disable_entity_loader(true) : null;

        try {
            $d = new \DOMDocument('1.0', $charset);
            $d->validateOnParse = true;

            if (function_exists('mb_encode_numericentity') && in_array(
                strtolower($charset),
                array_map('strtolower', mb_list_encodings()),
                true
            )) {
                // Turn every non-ASCII character into a numeric entity so that
                // loadHTML() does not have to guess the input encoding. This
                // replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...),
                // which is deprecated since PHP 8.2.
                $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], $charset);
            }

            // Deliberately best-effort: malformed fragments must not emit warnings.
            @$d->loadHTML($html);
        } finally {
            libxml_use_internal_errors($previousErrorMode);
            if ($legacyEntityLoader) {
                libxml_disable_entity_loader($previousEntityLoader);
            }
        }

        return $d->getElementsByTagName('body')->item(0);
    }
}
||
107 |