1 | <?php |
||
2 | |||
3 | namespace Wa72\HtmlPageDom; |
||
4 | |||
5 | use Symfony\Component\CssSelector\CssSelector; |
||
6 | use Wa72\HtmlPrettymin\PrettyMin; |
||
7 | |||
/**
 * This class represents a complete HTML document.
 *
 * It offers convenience functions for getting and setting elements of the document
 * such as setTitle(), getTitle(), setMeta($name, $value), getBody().
 *
 * It uses HtmlPageCrawler to navigate and manipulate the DOM tree.
 *
 * @author Christoph Singer
 * @license MIT
 */
class HtmlPage
{
    /**
     * The DOM document holding the parsed HTML.
     *
     * @var \DOMDocument
     */
    protected $dom;

    /**
     * Character set passed to the constructor.
     *
     * @var string
     */
    protected $charset;

    /**
     * URL passed to the constructor.
     *
     * @var string
     */
    protected $url;

    /**
     * Crawler wrapping the DOM document; used by filter() and filterXPath().
     *
     * @var HtmlPageCrawler
     */
    protected $crawler;

    /**
     * Parse the given HTML into a DOM document and wrap it in a HtmlPageCrawler.
     *
     * When $content is empty, a minimal HTML5 skeleton (doctype, empty title,
     * empty body) is used instead.
     *
     * @param string $content HTML source of the document
     * @param string $url     URL of the document (only stored)
     * @param string $charset Character set of $content
     */
    public function __construct($content = '', $url = '', $charset = 'UTF-8')
    {
        $this->charset = $charset;
        $this->url = $url;
        if ($content == '') {
            $content = '<!DOCTYPE html><html><head><title></title></head><body></body></html>';
        }

        // Collect libxml parse errors internally instead of emitting PHP warnings.
        $previousUseErrors = libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is only called on PHP < 8.0.
        $legacyPhp = \PHP_VERSION_ID < 80000;
        $previousEntityLoader = null;
        if ($legacyPhp) {
            $previousEntityLoader = libxml_disable_entity_loader(true);
        }

        try {
            $this->dom = new \DOMDocument('1.0', $charset);
            $this->dom->validateOnParse = true;

            $content = self::encodeNonAsciiAsEntities($content, $charset);

            // Suppression kept from the original implementation so that parsing
            // malformed markup stays silent for callers.
            @$this->dom->loadHTML($content);
        } finally {
            // Always restore the global libxml state, even if parsing throws.
            libxml_use_internal_errors($previousUseErrors);
            if ($legacyPhp) {
                libxml_disable_entity_loader($previousEntityLoader);
            }
        }

        $this->crawler = new HtmlPageCrawler($this->dom);
    }

    /**
     * Replace every character outside the ASCII range by a numeric HTML entity.
     *
     * The content is first converted from $charset to UTF-8 and then encoded with
     * mb_encode_numericentity(). This replaces the conversion to the
     * 'HTML-ENTITIES' pseudo encoding, which is deprecated as of PHP 8.2.
     *
     * The content is returned unchanged when the mbstring functions are not
     * available or when $charset is not listed by mb_list_encodings().
     *
     * @param string $content
     * @param string $charset
     * @return string
     */
    private static function encodeNonAsciiAsEntities($content, $charset)
    {
        if (!function_exists('mb_convert_encoding') || !function_exists('mb_encode_numericentity')) {
            return $content;
        }

        $knownEncodings = array_map('strtolower', mb_list_encodings());
        if (!in_array(strtolower($charset), $knownEncodings, true)) {
            return $content;
        }

        $utf8 = mb_convert_encoding($content, 'UTF-8', $charset);

        // Map: code points 0x80..0x10FFFF, no offset, full mask.
        return mb_encode_numericentity($utf8, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
    }

    /**
     * Build an XPath 1.0 string literal for an arbitrary value.
     *
     * XPath 1.0 string literals have no escape sequences, so the quote style is
     * chosen according to the quotes contained in the value; a value containing
     * both kinds of quotes is assembled with concat().
     *
     * @param string $value
     * @return string XPath expression evaluating to $value
     */
    private static function xpathLiteral($value)
    {
        $value = (string) $value;

        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }

        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        // Split on single quotes and glue the pieces back with a double-quoted "'".
        return "concat('" . implode("', \"'\", '", explode("'", $value)) . "')";
    }

    /**
     * Get a HtmlPageCrawler object containing the root node of the HTML document
     *
     * @return HtmlPageCrawler
     */
    public function getCrawler()
    {
        return $this->crawler;
    }

    /**
     * Get a DOMDocument object for the HTML document
     *
     * @return \DOMDocument
     */
    public function getDOMDocument()
    {
        return $this->dom;
    }

    /**
     * Sets the page title of the HTML document
     *
     * A title element is created and appended to the head if the document has none.
     *
     * @param string $title
     */
    public function setTitle($title)
    {
        $titleNode = $this->dom->getElementsByTagName('title')->item(0);
        if ($titleNode === null) {
            $titleNode = $this->dom->createElement('title');
            $this->getHeadNode()->appendChild($titleNode);
        }

        // NOTE(review): the value is escaped before assignment, presumably because
        // the nodeValue setter interprets entity references - confirm.
        $titleNode->nodeValue = htmlspecialchars($title);
    }

    /**
     * Get the page title of the HTML document
     *
     * @return null|string null if the document has no title element
     */
    public function getTitle()
    {
        $titleNode = $this->dom->getElementsByTagName('title')->item(0);
        if ($titleNode === null) {
            return null;
        }

        return $titleNode->nodeValue;
    }

    /**
     * Set a META tag with specified 'name' and 'content' attributes
     *
     * If no meta tag with this name exists, one is created and appended to the head.
     *
     * @TODO: add support for multiple meta tags with the same name but different languages
     *
     * @param string $name
     * @param string $content
     */
    public function setMeta($name, $content)
    {
        $metas = $this->filterXPath('descendant-or-self::meta[@name = ' . self::xpathLiteral($name) . ']');
        if (count($metas) == 0) {
            $node = $this->dom->createElement('meta');
            $node->setAttribute('name', $name);
            $this->getHeadNode()->appendChild($node);
            $metas->addNode($node);
        }
        $metas->setAttribute('content', $content);
    }

    /**
     * Remove all meta tags with the specified name attribute
     *
     * @param string $name
     */
    public function removeMeta($name)
    {
        $metas = $this->filterXPath('descendant-or-self::meta[@name = ' . self::xpathLiteral($name) . ']');
        $metas->remove();
    }

    /**
     * Get the content attribute of a meta tag with the specified name attribute
     *
     * @param string $name
     * @return null|string null if there is no meta tag with this name
     */
    public function getMeta($name)
    {
        $node = $this->filterXPath('descendant-or-self::meta[@name = ' . self::xpathLiteral($name) . ']')->getNode(0);
        if ($node instanceof \DOMElement) {
            return $node->getAttribute('content');
        }

        return null;
    }

    /**
     * Set the base tag with href attribute set to parameter $url
     *
     * A base element is created and appended to the head if the document has none.
     *
     * @param string $url
     */
    public function setBaseHref($url)
    {
        $node = $this->filterXPath('descendant-or-self::base')->getNode(0);
        if ($node === null) {
            $node = $this->dom->createElement('base');
            $this->getHeadNode()->appendChild($node);
        }
        $node->setAttribute('href', $url);
    }

    /**
     * Get the href attribute from the base tag, null if not present in document
     *
     * @return null|string
     */
    public function getBaseHref()
    {
        $node = $this->filterXPath('descendant-or-self::base')->getNode(0);
        if ($node instanceof \DOMElement) {
            return $node->getAttribute('href');
        }

        return null;
    }

    /**
     * Sets innerHTML content of an element specified by elementId
     *
     * @param string $elementId
     * @param string $html
     */
    public function setHtmlById($elementId, $html)
    {
        $this->getElementById($elementId)->setInnerHtml($html);
    }

    /**
     * Get the document's HEAD section as DOMElement
     *
     * If the document has no head element, one is created and inserted before the body.
     *
     * @return \DOMElement
     */
    public function getHeadNode()
    {
        $head = $this->dom->getElementsByTagName('head')->item(0);
        if ($head === null) {
            $head = $this->dom->createElement('head');
            $head = $this->dom->documentElement->insertBefore($head, $this->getBodyNode());
        }

        return $head;
    }

    /**
     * Get the document's body as DOMElement
     *
     * If the document has no body element, one is created and appended to the root element.
     *
     * @return \DOMElement
     */
    public function getBodyNode()
    {
        $body = $this->dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            $body = $this->dom->createElement('body');
            $body = $this->dom->documentElement->appendChild($body);
        }

        return $body;
    }

    /**
     * Get the document's HEAD section wrapped in a HtmlPageCrawler instance
     *
     * @return HtmlPageCrawler
     */
    public function getHead()
    {
        return new HtmlPageCrawler($this->getHeadNode());
    }

    /**
     * Get the document's body wrapped in a HtmlPageCrawler instance
     *
     * @return HtmlPageCrawler
     */
    public function getBody()
    {
        return new HtmlPageCrawler($this->getBodyNode());
    }

    /**
     * Serialize the document to HTML code
     *
     * @return string
     */
    public function __toString()
    {
        return $this->dom->saveHTML();
    }

    /**
     * Save this document to a HTML file or return HTML code as string
     *
     * @param string $filename If provided, output will be saved to this file, otherwise returned
     * @return string|void
     */
    public function save($filename = '')
    {
        if ($filename != '') {
            file_put_contents($filename, (string) $this);

            return;
        }

        return (string) $this;
    }

    /**
     * Get an element in the document by its id attribute
     *
     * @param string $id
     * @return HtmlPageCrawler
     */
    public function getElementById($id)
    {
        return $this->filterXPath('descendant-or-self::*[@id = ' . self::xpathLiteral($id) . ']');
    }

    /**
     * Filter nodes by using a CSS selector
     *
     * @param string $selector CSS selector
     * @return HtmlPageCrawler
     */
    public function filter($selector)
    {
        return $this->crawler->filter($selector);
    }

    /**
     * Filter nodes by XPath expression
     *
     * @param string $xpath XPath expression
     * @return HtmlPageCrawler
     */
    public function filterXPath($xpath)
    {
        return $this->crawler->filterXPath($xpath);
    }

    /**
     * remove newlines from string and minimize whitespace (multiple whitespace characters replaced by one space)
     *
     * useful for cleaning up text retrieved by HtmlPageCrawler::text() (nodeValue of a DOMNode)
     *
     * @param string $string
     * @return string
     */
    public static function trimNewlines($string)
    {
        return Helpers::trimNewlines($string);
    }

    /**
     * Deep-copy the DOM document and create a new crawler for the copy, so that
     * a clone does not share its DOM with the original page.
     */
    public function __clone()
    {
        $this->dom = $this->dom->cloneNode(true);
        $this->crawler = new HtmlPageCrawler($this->dom);
    }

    /**
     * minify the HTML document
     *
     * @param array $options Options passed to PrettyMin::__construct()
     * @throws \Exception if the PrettyMin class is not available
     * @return HtmlPage
     */
    public function minify(array $options = [])
    {
        $this->loadPrettyMin($options, 'minify')->minify();

        return $this;
    }

    /**
     * indent the HTML document
     *
     * @param array $options Options passed to PrettyMin::__construct()
     * @throws \Exception if the PrettyMin class is not available
     * @return HtmlPage
     */
    public function indent(array $options = [])
    {
        $this->loadPrettyMin($options, 'indent')->indent();

        return $this;
    }

    /**
     * Create a PrettyMin instance and load this page's DOM document into it.
     *
     * @param array  $options  Options passed to PrettyMin::__construct()
     * @param string $function Name of the calling function, used in the exception message
     * @throws \Exception if the PrettyMin class is not available
     * @return mixed The value returned by PrettyMin::load()
     */
    private function loadPrettyMin(array $options, $function)
    {
        if (!class_exists('Wa72\\HtmlPrettymin\\PrettyMin')) {
            throw new \Exception('Function ' . $function . ' needs composer package wa72/html-pretty-min');
        }

        $prettyMin = new PrettyMin($options);

        return $prettyMin->load($this->dom);
    }
}
||
376 |
If you suppress an error, we recommend checking for the error condition explicitly: