Issues (29)

src/Document.php (6 issues)

1
<?php declare(strict_types=1);
2
3
namespace DOMWrap;
4
5
use DOMWrap\Traits\{
6
    CommonTrait,
7
    TraversalTrait,
8
    ManipulationTrait
9
};
10
11
/**
12
 * Document Node
13
 *
14
 * @package DOMWrap
15
 * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause
16
 */
17
class Document extends \DOMDocument
18
{
19
    use CommonTrait;
20
    use TraversalTrait;
21
    use ManipulationTrait;
22
23 140
    /** @var int */
24 140
    protected $libxmlOptions = 0;
25
26 140
    /** @var string|null */
27 140
    protected $documentEncoding = null;
28 140
29 140
    /**
     * Create the document and register the DOMWrap node classes.
     *
     * Every built-in DOM base class listed below is mapped to its DOMWrap
     * counterpart through registerNodeClass().
     *
     * @param string $version  Forwarded to \DOMDocument::__construct().
     * @param string $encoding Forwarded to \DOMDocument::__construct().
     */
    public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
        parent::__construct($version, $encoding);

        // Base DOM class => DOMWrap class registered in its place.
        $nodeClasses = [
            'DOMText' => 'DOMWrap\\Text',
            'DOMElement' => 'DOMWrap\\Element',
            'DOMComment' => 'DOMWrap\\Comment',
            'DOMDocument' => 'DOMWrap\\Document',
            'DOMDocumentType' => 'DOMWrap\\DocumentType',
            'DOMProcessingInstruction' => 'DOMWrap\\ProcessingInstruction',
        ];

        foreach ($nodeClasses as $baseClass => $wrapperClass) {
            $this->registerNodeClass($baseClass, $wrapperClass);
        }
    }
39
40
    /**
     * Store the libxml option flags used when markup is parsed.
     *
     * The value is a bitmask: combine several flags with bitwise OR,
     * e.g. LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD.
     *
     * @link http://php.net/manual/en/libxml.constants.php
     *
     * @param int $libxmlOptions Bitmask of LIBXML_* constants.
     */
    public function setLibxmlOptions(int $libxmlOptions): void {
        $this->libxmlOptions = $libxmlOptions;
    }
53
54
    /**
     * {@inheritdoc}
     *
     * For a document node this is the instance itself.
     *
     * @return \DOMDocument|null Always $this.
     */
    public function document(): ?\DOMDocument {
        return $this;
    }
60
61 1
    /**
     * {@inheritdoc}
     *
     * Wraps this document in a node list whose only entry is the document.
     *
     * @return NodeList
     */
    public function collection(): NodeList {
        $nodes = [$this];

        return $this->newNodeList($nodes);
    }
67
68 1
    /**
     * {@inheritdoc}
     *
     * Reduces a node list to a single value: its first entry when the list
     * reports a non-zero count, otherwise null.
     *
     * @param NodeList $nodeList List to take the result from.
     *
     * @return mixed|null The value of $nodeList->first(), or null for an empty list.
     */
    public function result(NodeList $nodeList) {
        return $nodeList->count() ? $nodeList->first() : null;
    }
78
79
    /**
     * {@inheritdoc}
     *
     * This implementation never reports a parent for the document.
     *
     * @return null Always null.
     */
    public function parent() {
        return null;
    }
85 1
86
    /**
     * {@inheritdoc}
     *
     * Returns a node list created without any nodes.
     *
     * @return mixed The value of newNodeList() called with no arguments.
     */
    public function parents() {
        $ancestors = $this->newNodeList();

        return $ancestors;
    }
92 2
93
    /**
     * {@inheritdoc}
     *
     * Calls replaceChild() with $newNode as the replacement and this document
     * as the node to replace, then returns the document for chaining.
     *
     * NOTE(review): the document is passed as its own child to replace;
     * confirm this is intended, since a document is not a child of itself.
     *
     * @param mixed $newNode Replacement node handed to replaceChild().
     *
     * @return self
     */
    public function substituteWith($newNode): self {
        $oldNode = $this;
        $this->replaceChild($newNode, $oldNode);

        return $this;
    }
101
102
    /**
     * {@inheritdoc}
     *
     * This implementation does not produce a copy of the document.
     *
     * @return null Always null.
     */
    public function _clone() {
        return null;
    }
108
109 140
    /**
     * {@inheritdoc}
     *
     * For a document this is the same markup that getOuterHtml() returns.
     *
     * @return string
     */
    public function getHtml(): string {
        $markup = $this->getOuterHtml();

        return $markup;
    }
115
116 140
    /**
     * {@inheritdoc}
     *
     * Parses $html into this document. Non-string or whitespace-only input is
     * ignored and the document is returned untouched.
     *
     * While parsing, libxml's internal error handling is switched on and, on
     * PHP versions before 8.0, the external entity loader is disabled. The
     * previous settings are restored afterwards, including when parsing throws.
     *
     * @param mixed $html Markup to load; only non-blank strings are parsed.
     *
     * @return self
     */
    public function setHtml($html): self {
        if (!is_string($html) || trim($html) === '') {
            return $this;
        }

        // libxml_disable_entity_loader() is deprecated from PHP 8.0, so it is
        // only touched on older versions.
        $legacyEntityLoader = \PHP_VERSION_ID < 80000;

        $internalErrors = libxml_use_internal_errors(true);
        $disableEntities = false;

        if ($legacyEntityLoader) {
            $disableEntities = libxml_disable_entity_loader(true);
        }

        try {
            $this->composeXmlNode($html);
        } finally {
            // Always put the process-wide libxml state back, even on failure.
            libxml_use_internal_errors($internalErrors);

            if ($legacyEntityLoader) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        return $this;
    }
137
138
    /**
     * Load HTML markup into the document.
     *
     * An XML encoding declaration built from getEncoding() (UTF-8 when no
     * encoding is set) is prepended to the markup before it is handed to
     * \DOMDocument::loadHTML().
     *
     * When LIBXML_HTML_NOIMPLIED is present in $options, libxml insists on a
     * single root node and appends every following node to the first one. To
     * counter this a placeholder <domwrap> element is put in front of the
     * markup, and after parsing its contents are moved back to the document
     * and the placeholder is removed.
     *
     * @param string $html    Markup to parse.
     * @param int    $options Bitmask of LIBXML_* constants.
     *
     * @return bool Result of the underlying \DOMDocument::loadHTML() call.
     */
    public function loadHTML($html, $options = 0): bool {
        // The same flag must decide both wrapping and unwrapping; otherwise a
        // direct call whose $options differ from the stored libxml options
        // would leave the placeholder behind or unwrap a real element.
        $useWrapper = (bool) ($options & LIBXML_HTML_NOIMPLIED);

        if ($useWrapper) {
            $html = '<domwrap></domwrap>' . $html;
        }

        $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

        $result = parent::loadHTML($html, $options);

        // Do our re-shuffling of nodes.
        if ($useWrapper) {
            $wrapper = $this->children()->first();

            // Nothing to unwrap if parsing produced no child at all.
            if ($wrapper !== null) {
                $wrapper->contents()->each(function ($node) {
                    $this->appendWith($node);
                });

                $this->removeChild($wrapper);
            }
        }

        // Normalise to the declared bool return type.
        return (bool) $result;
    }
170
171
    /**
     * Set the encoding used when markup is loaded into the document.
     *
     * @param string|null $encoding Encoding name, or null to clear the stored value.
     */
    public function setEncoding(?string $encoding = null) {
        $this->documentEncoding = $encoding;
    }
177
178
    /**
     * Get the stored document encoding.
     *
     * @return string|null The stored encoding, or null when none is stored.
     */
    public function getEncoding(): ?string {
        $encoding = $this->documentEncoding;

        return $encoding;
    }
184
185
    /**
     * Extract the charset declared in a <meta> tag of the markup.
     *
     * @param string $html Markup to inspect.
     *
     * @return string|null Upper-cased charset name, or null when no <meta> charset is found.
     */
    private function getCharset(string $html): ?string {
        if (!preg_match('@<meta[^>]*?charset=["\']?([^"\'\s>]+)@im', $html, $found)) {
            return null;
        }

        return mb_strtoupper($found[1]);
    }
199
200
    /**
     * Work out the encoding of the markup and store it via setEncoding().
     *
     * Order of precedence:
     *  1. the encoding already stored on the document,
     *  2. the charset declared in a <meta> tag,
     *  3. UTF-8, when strict multibyte detection reports exactly that.
     * If none applies, null is stored.
     *
     * @param string $html Markup to inspect.
     */
    private function detectEncoding(string $html) {
        $charset = $this->getEncoding() ?? $this->getCharset($html);

        // Only scan the markup when nothing was declared. mb_detect_encoding()
        // returns false on failure, hence the strict comparison.
        if ($charset === null && mb_detect_encoding($html, mb_detect_order(), true) === 'UTF-8') {
            $charset = 'UTF-8';
        }

        $this->setEncoding($charset);
    }
218
219
    /**
     * Convert the markup to UTF-8 based on the stored document encoding.
     *
     * When an encoding is stored, charset declarations in the markup are
     * rewritten to UTF-8 and the markup is converted with mbstring, or with
     * iconv when mbstring does not list the encoding and iconv is loaded.
     * When no encoding is stored, or the conversion fails, characters from
     * 0x80 upwards are turned into numeric entities instead.
     *
     * @param string $html Markup to convert.
     *
     * @return string Converted markup.
     */
    private function convertToUtf8(string $html): string {
        $charset = $this->getEncoding();

        if ($charset !== null) {
            // preg_replace() returns null on a PCRE failure; keep the markup
            // as it was rather than passing null on.
            $rewritten = preg_replace('@(charset=["]?)([^"\s]+)([^"]*["]?)@im', '$1UTF-8$3', $html);

            if ($rewritten !== null) {
                $html = $rewritten;
            }

            $mbHasCharset = in_array($charset, array_map('mb_strtoupper', mb_list_encodings()), true);

            if ($mbHasCharset) {
                $htmlMb = mb_convert_encoding($html, 'UTF-8', $charset);

                // A non-string result means the conversion failed; fall back
                // to numeric entities like the iconv branch does.
                if (is_string($htmlMb)) {
                    $html = $htmlMb;
                } else {
                    $charset = null;
                }

            // Fallback to iconv if available.
            } elseif (extension_loaded('iconv')) {
                $htmlIconv = iconv($charset, 'UTF-8', $html);

                if ($htmlIconv !== false) {
                    $html = $htmlIconv;
                } else {
                    $charset = null;
                }
            }
        }

        if ($charset === null) {
            $html = htmlspecialchars_decode(mb_encode_numericentity(htmlentities($html, ENT_QUOTES, 'UTF-8'), [0x80, 0x10FFFF, 0, ~0], 'UTF-8'));
        }

        return $html;
    }
252
253
    /**
     * Detect the encoding of the markup, convert it to UTF-8 and load it.
     *
     * The markup is loaded with the stored libxml options. Afterwards every
     * top-level "xml" processing instruction is destroyed, which drops the
     * encoding declaration that loadHTML() prepends.
     *
     * @param string $html Markup to load.
     */
    private function composeXmlNode($html) {
        $this->detectEncoding($html);

        $utf8Html = $this->convertToUtf8($html);
        $this->loadHTML($utf8Html, $this->libxmlOptions);

        // Remove <?xml ...> processing instruction.
        $this->contents()->each(function ($child) {
            $isXmlInstruction = $child instanceof ProcessingInstruction && $child->nodeName === 'xml';

            if ($isXmlInstruction) {
                $child->destroy();
            }
        });
    }
271
}
272