Issues (14)

src/HtmlPage.php (1 issue)

1
<?php
2
3
namespace Wa72\HtmlPageDom;
4
5
use Symfony\Component\CssSelector\CssSelector;
6
use Wa72\HtmlPrettymin\PrettyMin;
7
8
/**
9
 * This class represents a complete HTML document.
10
 *
11
 * It offers convenience functions for getting and setting elements of the document
12
 * such as setTitle(), getTitle(), setMeta($name, $value), getBody().
13
 *
14
 * It uses HtmlPageCrawler to navigate and manipulate the DOM tree.
15
 *
16
 * @author Christoph Singer
17
 * @license MIT
18
 */
19
class HtmlPage
20
{
21
    /**
22
     *
23
     * @var \DOMDocument
24
     */
25
    protected $dom;
26
27
    /**
28
     * @var string
29
     */
30
    protected $charset;
31
32
    /**
33
     * @var string
34
     */
35
    protected $url;
36
37
    /**
38
     *
39
     * @var HtmlPageCrawler
40
     */
41
    protected $crawler;
42
43 104
    /**
     * Parse an HTML string into a DOMDocument and wrap it in a HtmlPageCrawler.
     *
     * When $content is empty, a minimal skeleton document (doctype, empty title,
     * empty body) is parsed instead.
     *
     * @param string $content HTML source of the document
     * @param string $url     stored in the $url property
     * @param string $charset character set of $content; also passed to the DOMDocument constructor
     *
     * @throws \InvalidArgumentException if DOMDocument::loadHTML() reports failure
     */
    public function __construct($content = '', $url = '', $charset = 'UTF-8')
    {
        $this->charset = $charset;
        $this->url = $url;
        if ($content == '') {
            $content = '<!DOCTYPE html><html><head><title></title></head><body></body></html>';
        }

        // Switch libxml to internal error collection while parsing; the previous
        // settings are remembered so they can be restored afterwards.
        $current = libxml_use_internal_errors(true);
        $disableEntities = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntities = libxml_disable_entity_loader(true);
        }

        try {
            $this->dom = new \DOMDocument('1.0', $charset);
            $this->dom->validateOnParse = true;

            if (function_exists('mb_convert_encoding') && in_array(strtolower($charset), array_map('strtolower', mb_list_encodings()))) {
                // NOTE(review): the 'HTML-ENTITIES' target encoding is reported as deprecated
                // by newer PHP releases - confirm and plan a replacement.
                $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
            }

            // Warnings stay suppressed as before, but a hard failure is no longer ignored:
            // without a parsed document every later accessor would operate on an empty DOM.
            if (@$this->dom->loadHTML($content) === false) {
                throw new \InvalidArgumentException('The HTML content could not be parsed.');
            }
        } finally {
            // Restore the global libxml state even when parsing throws.
            libxml_use_internal_errors($current);
            if (\PHP_VERSION_ID < 80000) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        $this->crawler = new HtmlPageCrawler($this->dom);
    }
74
75
    /**
     * Return the HtmlPageCrawler instance held by this page.
     *
     * The crawler is created from the page's DOMDocument in the constructor
     * and re-created in __clone().
     *
     * @return HtmlPageCrawler
     */
    public function getCrawler()
    {
        return $this->crawler;
    }
84
85
    /**
     * Return the underlying DOMDocument of this page.
     *
     * The object itself is returned, not a copy.
     *
     * @return \DOMDocument
     */
    public function getDOMDocument()
    {
        return $this->dom;
    }
94
95
    /**
     * Set the text of the document's first <title> element.
     *
     * If the document has no <title> element, one is created and appended to
     * the head node. The title is passed through htmlspecialchars() before it
     * is assigned to the element's nodeValue.
     *
     * @param string $title
     */
    public function setTitle($title)
    {
        $titleNode = $this->dom->getElementsByTagName('title')->item(0);
        if (null == $titleNode) {
            $titleNode = $this->dom->createElement('title');
            $this->getHeadNode()->appendChild($titleNode);
        }

        $escaped = htmlspecialchars($title);
        $titleNode->nodeValue = $escaped;
    }
109
110
    /**
     * Return the nodeValue of the document's first <title> element.
     *
     * @return null|string null when the document has no <title> element
     */
    public function getTitle()
    {
        $titleNode = $this->dom->getElementsByTagName('title')->item(0);

        return $titleNode == null ? null : $titleNode->nodeValue;
    }
124
125
    /**
     * Set a META tag with specified 'name' and 'content' attributes
     *
     * If no meta element with this name exists, one is created and appended to
     * the head node; otherwise the 'content' attribute is set on the matched
     * element(s).
     *
     * @TODO: add support for multiple meta tags with the same name but different languages
     *
     * @param string $name
     * @param string $content
     */
    public function setMeta($name, $content)
    {
        $c = $this->filterXPath('descendant-or-self::meta[@name = ' . $this->quoteXPathLiteral($name) . ']');
        if (count($c) == 0) {
            $node = $this->dom->createElement('meta');
            $node->setAttribute('name', $name);
            $this->getHeadNode()->appendChild($node);
            $c->addNode($node);
        }
        $c->setAttribute('content', $content);
    }

    /**
     * Remove all meta tags with the specified name attribute
     *
     * @param string $name
     */
    public function removeMeta($name)
    {
        $meta = $this->filterXPath('descendant-or-self::meta[@name = ' . $this->quoteXPathLiteral($name) . ']');
        $meta->remove();
    }

    /**
     * Get the content attribute of a meta tag with the specified name attribute
     *
     * @param string $name
     * @return null|string null when no matching meta element is found
     */
    public function getMeta($name)
    {
        $node = $this->filterXPath('descendant-or-self::meta[@name = ' . $this->quoteXPathLiteral($name) . ']')->getNode(0);
        if ($node instanceof \DOMElement) {
            return $node->getAttribute('content');
        }

        return null;
    }

    /**
     * Turn an arbitrary string into a valid XPath 1.0 string literal.
     *
     * XPath 1.0 literals have no escape sequence, so a value containing the
     * delimiter quote must use the other quote, or concat() when it contains both.
     * Values without an apostrophe yield the same single-quoted literal that was
     * previously built by plain concatenation.
     *
     * @param string $value
     * @return string
     */
    private function quoteXPathLiteral($value)
    {
        $value = (string) $value;
        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        // Both quote kinds present: split at every apostrophe and glue the pieces with concat().
        return "concat('" . str_replace("'", "', \"'\", '", $value) . "')";
    }
171
172
    /**
     * Point the document's <base> element at the given URL.
     *
     * The first <base> element found is used; if there is none, a new one is
     * created and appended to the head node before its href attribute is set.
     *
     * @param string $url
     */
    public function setBaseHref($url)
    {
        $base = $this->filterXPath('descendant-or-self::base')->getNode(0);
        if (null == $base) {
            $base = $this->dom->createElement('base');
            $this->getHeadNode()->appendChild($base);
        }

        $base->setAttribute('href', $url);
    }
186
187
    /**
     * Return the href attribute of the first <base> element.
     *
     * @return null|string null when the document contains no <base> element
     */
    public function getBaseHref()
    {
        $base = $this->filterXPath('descendant-or-self::base')->getNode(0);

        return $base instanceof \DOMElement ? $base->getAttribute('href') : null;
    }
201
202
    /**
     * Replace the inner HTML of the element(s) carrying the given id.
     *
     * The lookup is delegated to getElementById() and the replacement to
     * setInnerHtml() on the returned crawler.
     *
     * @param string $elementId
     * @param string $html
     */
    public function setHtmlById($elementId, $html)
    {
        $target = $this->getElementById($elementId);
        $target->setInnerHtml($html);
    }
212
213
    /**
     * Return the document's first <head> element, creating it when missing.
     *
     * A newly created <head> is inserted into the document element directly
     * before the node returned by getBodyNode().
     *
     * @return \DOMElement
     */
    public function getHeadNode()
    {
        $existing = $this->dom->getElementsByTagName('head')->item(0);
        if ($existing != null) {
            return $existing;
        }

        $created = $this->dom->createElement('head');

        return $this->dom->documentElement->insertBefore($created, $this->getBodyNode());
    }
228
229
    /**
     * Return the document's first <body> element, creating it when missing.
     *
     * A newly created <body> is appended to the document element.
     *
     * @return \DOMElement
     */
    public function getBodyNode()
    {
        $existing = $this->dom->getElementsByTagName('body')->item(0);
        if ($existing != null) {
            return $existing;
        }

        $created = $this->dom->createElement('body');

        return $this->dom->documentElement->appendChild($created);
    }
244
245
    /**
     * Return a new HtmlPageCrawler wrapping the node from getHeadNode().
     *
     * @return HtmlPageCrawler
     */
    public function getHead()
    {
        $headNode = $this->getHeadNode();

        return new HtmlPageCrawler($headNode);
    }
254
255
    /**
     * Return a new HtmlPageCrawler wrapping the node from getBodyNode().
     *
     * @return HtmlPageCrawler
     */
    public function getBody()
    {
        $bodyNode = $this->getBodyNode();

        return new HtmlPageCrawler($bodyNode);
    }
264
265 48
    /**
     * Serialize the document by calling saveHTML() on the underlying DOMDocument.
     *
     * @return string
     */
    public function __toString()
    {
        return $this->dom->saveHTML();
    }
269
270
    /**
     * Write the document's HTML to a file, or hand it back as a string.
     *
     * @param string $filename Target file; when empty, nothing is written and the HTML is returned
     * @return string|void HTML code when no filename was given, nothing otherwise
     */
    public function save($filename = '')
    {
        $html = (string) $this;
        if ($filename == '') {
            return $html;
        }

        file_put_contents($filename, $html);
    }
286
287
    /**
     * Get an element in the document by its id attribute
     *
     * The id is embedded in the XPath expression as a properly quoted string
     * literal, so ids containing apostrophes or double quotes are matched
     * literally instead of breaking (or altering) the expression.
     *
     * @param string $id
     * @return HtmlPageCrawler crawler over all elements whose id attribute equals $id
     */
    public function getElementById($id)
    {
        $id = (string) $id;
        // XPath 1.0 literals have no escape sequence: pick a delimiter the value
        // does not contain, or fall back to concat() when it contains both kinds.
        if (strpos($id, "'") === false) {
            $literal = "'" . $id . "'";
        } elseif (strpos($id, '"') === false) {
            $literal = '"' . $id . '"';
        } else {
            $literal = "concat('" . str_replace("'", "', \"'\", '", $id) . "')";
        }

        return $this->filterXPath('descendant-or-self::*[@id = ' . $literal . ']');
    }
297
298
    /**
     * Select nodes of the document with a CSS selector.
     *
     * The call is forwarded to filter() on this page's crawler.
     *
     * @param string $selector CSS selector
     * @return HtmlPageCrawler
     */
    public function filter($selector)
    {
        $matches = $this->crawler->filter($selector);

        return $matches;
    }
309
310
    /**
     * Select nodes of the document with an XPath expression.
     *
     * The call is forwarded to filterXPath() on this page's crawler.
     *
     * @param string $xpath XPath expression
     * @return HtmlPageCrawler
     */
    public function filterXPath($xpath)
    {
        $matches = $this->crawler->filterXPath($xpath);

        return $matches;
    }
320
321
    /**
     * Remove newlines from a string and collapse runs of whitespace into a single space.
     *
     * Handy for tidying text obtained from HtmlPageCrawler::text() (the nodeValue of a DOMNode).
     * The work is delegated to Helpers::trimNewlines().
     *
     * @param string $string
     * @return string
     */
    public static function trimNewlines($string)
    {
        $cleaned = Helpers::trimNewlines($string);

        return $cleaned;
    }
333
334 8
    /**
     * Give the cloned page its own deep copy of the DOM and a crawler built on that copy.
     */
    public function __clone()
    {
        $copy = $this->dom->cloneNode(true);

        $this->dom = $copy;
        $this->crawler = new HtmlPageCrawler($copy);
    }
339
340
    /**
     * Minify the HTML document in place using PrettyMin.
     *
     * @param array $options Options passed to PrettyMin::__construct()
     * @throws \Exception if the PrettyMin class is not available
     * @return HtmlPage this page
     */
    public function minify(array $options = [])
    {
        if (!class_exists(PrettyMin::class)) {
            throw new \Exception('Function minify needs composer package wa72/html-pretty-min');
        }

        $prettyMin = new PrettyMin($options);
        $loaded = $prettyMin->load($this->dom);
        $loaded->minify();

        return $this;
    }
357
358
    /**
     * Indent (pretty-print) the HTML document in place using PrettyMin.
     *
     * @param array $options Options passed to PrettyMin::__construct()
     * @throws \Exception if the PrettyMin class is not available
     * @return HtmlPage this page
     */
    public function indent(array $options = [])
    {
        if (!class_exists(PrettyMin::class)) {
            throw new \Exception('Function indent needs composer package wa72/html-pretty-min');
        }

        $prettyMin = new PrettyMin($options);
        $loaded = $prettyMin->load($this->dom);
        $loaded->indent();

        return $this;
    }
375
}
376