Issues (29)

src/Document.php (2 issues)

1
<?php declare(strict_types=1);
2
3
namespace DOMWrap;
4
5
use DOMWrap\Traits\{
6
    CommonTrait,
7
    TraversalTrait,
8
    ManipulationTrait
9
};
10
11
/**
12
 * Document Node
13
 *
14
 * @package DOMWrap
15
 * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause
16
 */
17
class Document extends \DOMDocument
18
{
19
    use CommonTrait;
20
    use TraversalTrait;
21
    use ManipulationTrait;
22
23 140
    /**
     * Bitmask of libxml option constants. Assigned by setLibxmlOptions() and
     * handed to loadHTML() when markup is parsed through composeXmlNode().
     *
     * @var int
     */
24 140
    protected $libxmlOptions = 0;
25
26 140
    /**
     * Character encoding of the document as stored by setEncoding() and read
     * back by getEncoding(); null until an encoding has been set or detected.
     *
     * @var string|null
     */
27 140
    protected $documentEncoding = null;
28 140
29 140
    /**
     * Create the document and register the DOMWrap node classes so that nodes
     * created by this document are instantiated as the DOMWrap subclasses.
     *
     * @param string $version  XML version passed to \DOMDocument.
     * @param string $encoding Document encoding passed to \DOMDocument.
     */
    public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
        parent::__construct($version, $encoding);

        // Built-in DOM class => DOMWrap replacement, registered in this order.
        $nodeClasses = [
            'DOMText' => 'DOMWrap\\Text',
            'DOMElement' => 'DOMWrap\\Element',
            'DOMComment' => 'DOMWrap\\Comment',
            'DOMDocument' => 'DOMWrap\\Document',
            'DOMDocumentType' => 'DOMWrap\\DocumentType',
            'DOMProcessingInstruction' => 'DOMWrap\\ProcessingInstruction',
        ];

        foreach ($nodeClasses as $baseClass => $extendedClass) {
            $this->registerNodeClass($baseClass, $extendedClass);
        }
    }
39
40
    /**
     * Store the libxml options that are used when HTML is parsed via setHtml().
     *
     * Combine several options with bitwise OR,
     * e.g. LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD.
     *
     * @link https://www.php.net/manual/en/libxml.constants.php
     *
     * @param int $libxmlOptions Bitmask of LIBXML_* constants.
     */
    public function setLibxmlOptions(int $libxmlOptions): void {
        $this->libxmlOptions = $libxmlOptions;
    }
53
54
    /**
     * {@inheritdoc}
     *
     * A document is its own owner document, so this returns the instance itself.
     */
    public function document(): ?\DOMDocument {
        return $this;
    }
60
61 1
    /**
     * {@inheritdoc}
     *
     * Wraps this document in a node list that contains it as the only entry.
     */
    public function collection(): NodeList {
        $self = [$this];

        return $this->newNodeList($self);
    }
67
68 1
    /**
     * {@inheritdoc}
     *
     * Reduces a node list to its first entry, or null when the list is empty.
     */
    public function result(NodeList $nodeList) {
        return $nodeList->count() ? $nodeList->first() : null;
    }
78
79
    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function parent() {
        return null;
    }
85 1
86
    /**
     * {@inheritdoc}
     *
     * Returns a node list created without any nodes.
     */
    public function parents() {
        $ancestors = $this->newNodeList();

        return $ancestors;
    }
92 2
93
    /**
     * {@inheritdoc}
     *
     * NOTE(review): the document passes itself as the child to be replaced;
     * confirm this is intended, since a document is not normally among its
     * own children.
     *
     * @return self This document.
     */
    public function substituteWith($newNode): self {
        $oldNode = $this;
        $this->replaceChild($newNode, $oldNode);

        return $this;
    }
101
102
    /**
     * {@inheritdoc}
     *
     * Always returns null for a document.
     */
    public function _clone() {
        return null;
    }
108
109 140
    /**
     * {@inheritdoc}
     *
     * For a document this is the same markup that getOuterHtml() produces.
     */
    public function getHtml(): string {
        $markup = $this->getOuterHtml();

        return $markup;
    }
115
116 140
    /**
     * {@inheritdoc}
     *
     * Parses the given markup into this document. Anything that is not a
     * non-blank string is ignored and the document is left untouched.
     *
     * While parsing, libxml errors are collected internally instead of being
     * raised, and on PHP < 8.0 the external entity loader is disabled. Both
     * settings are restored to their previous values afterwards, even when
     * parsing throws.
     *
     * @param mixed $html HTML markup to load.
     *
     * @return self This document.
     */
    public function setHtml($html): self {
        if (!is_string($html) || trim($html) === '') {
            return $this;
        }

        $internalErrors = libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
        // only toggled on older runtimes; null marks "nothing to restore".
        $disableEntities = \PHP_VERSION_ID < 80000 ? libxml_disable_entity_loader(true) : null;

        try {
            $this->composeXmlNode($html);
        } finally {
            // Restore global libxml state regardless of how parsing ended.
            libxml_use_internal_errors($internalErrors);

            if ($disableEntities !== null) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        return $this;
    }
137
138
    /**
     * Load HTML markup into this document.
     *
     * The markup is prefixed with an XML encoding declaration built from
     * getEncoding() (UTF-8 when no encoding is set) before it is handed to
     * \DOMDocument::loadHTML().
     *
     * With LIBXML_HTML_NOIMPLIED libxml insists on a single root node and
     * nests every following node inside the first one. To counter that, a
     * placeholder <domwrap> element is put in front of the markup; after
     * parsing, its contents are moved back to the document root and the
     * placeholder is removed. Both steps are driven by the same $options
     * value, so the placeholder is never added without being removed again
     * (or removed without having been added).
     *
     * @param string $html    HTML markup to parse.
     * @param int    $options Bitmask of LIBXML_* constants.
     *
     * @return bool True when the parent loader reported success.
     */
    public function loadHTML($html, $options = 0): bool {
        $unwrapRoot = (bool) ($options & LIBXML_HTML_NOIMPLIED);

        if ($unwrapRoot) {
            $html = '<domwrap></domwrap>' . $html;
        }

        $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

        // Normalise to a strict boolean so the declared return type always holds.
        $loaded = parent::loadHTML($html, $options) !== false;

        if ($unwrapRoot) {
            $wrapper = $this->children()->first();

            // Nothing to unwrap when parsing produced no element at all.
            if ($wrapper instanceof \DOMNode) {
                $wrapper->contents()->each(function ($node) {
                    $this->appendWith($node);
                });

                $this->removeChild($wrapper);
            }
        }

        return $loaded;
    }
170
171
    /**
     * Set the character encoding of the document's markup.
     *
     * Passing null clears the stored value; detectEncoding() then tries to
     * work the encoding out from the markup itself.
     *
     * @param string|null $encoding Encoding name, or null to clear it.
     */
    public function setEncoding(?string $encoding = null) {
        $this->documentEncoding = $encoding;
    }
177
178
    /**
     * Get the character encoding stored for this document.
     *
     * @return string|null The stored encoding name, or null when none is set.
     */
    public function getEncoding(): ?string {
        $encoding = $this->documentEncoding;

        return $encoding;
    }
184
185
    /**
     * Look for a charset declared in a <meta> tag of the given markup.
     *
     * @param string $html Markup to inspect.
     *
     * @return string|null Upper-cased charset name, or null when no
     *                     declaration is found.
     */
    private function getCharset(string $html): ?string {
        $found = preg_match('@<meta[^>]*?charset=["\']?([^"\'\s>]+)@im', $html, $matches);

        // preg_match() gives 1 on a match; 0 (no match) and false (error) both mean "none".
        return $found ? mb_strtoupper($matches[1]) : null;
    }
199
200
    /**
     * Work out the encoding of the given markup and store it via setEncoding().
     *
     * Order of precedence: an encoding that is already set, then a charset
     * declared in a <meta> tag, then UTF-8 if strict detection identifies the
     * markup as UTF-8. When none of these applies the encoding stays null.
     *
     * @param string $html Markup to inspect.
     */
    private function detectEncoding(string $html): void {
        $charset = $this->getEncoding() ?? $this->getCharset($html);

        // Only scan the markup when no charset is known yet; the detection
        // result is irrelevant otherwise.
        if ($charset === null && mb_detect_encoding($html, mb_detect_order(), true) === 'UTF-8') {
            $charset = 'UTF-8';
        }

        $this->setEncoding($charset);
    }
218
219
    /**
     * Convert the given markup to UTF-8 based on the stored encoding.
     *
     * When an encoding is set, charset declarations inside the markup are
     * rewritten to UTF-8 and the markup is converted with mbstring if it
     * knows the encoding (compared case-insensitively), otherwise with iconv
     * if that extension is loaded. When no encoding is set, or conversion
     * fails, characters outside ASCII are turned into numeric entities.
     *
     * @param string $html Markup in the stored encoding.
     *
     * @return string Converted markup.
     */
    private function convertToUtf8(string $html): string {
        $charset = $this->getEncoding();

        if ($charset !== null) {
            // preg_replace() yields null on a PCRE error; keep the markup as it was then.
            $html = preg_replace('@(charset=["]?)([^"\s]+)([^"]*["]?)@im', '$1UTF-8$3', $html) ?? $html;

            // Encodings set through setEncoding() are not necessarily upper case,
            // so normalise both sides before comparing.
            $mbEncodings = array_map('mb_strtoupper', mb_list_encodings());
            $mbHasCharset = in_array(mb_strtoupper($charset), $mbEncodings, true);

            if ($mbHasCharset) {
                $htmlMb = mb_convert_encoding($html, 'UTF-8', $charset);

                if (is_string($htmlMb)) {
                    $html = $htmlMb;
                } else {
                    $charset = null;
                }

            // Fallback to iconv if available.
            } elseif (extension_loaded('iconv')) {
                $htmlIconv = iconv($charset, 'UTF-8', $html);

                if ($htmlIconv !== false) {
                    $html = $htmlIconv;
                } else {
                    $charset = null;
                }
            }
        }

        if ($charset === null) {
            // Entity-encode everything, turn non-ASCII into numeric entities,
            // then decode the special characters again so the markup stays markup.
            $html = htmlspecialchars_decode(mb_encode_numericentity(htmlentities($html, ENT_QUOTES, 'UTF-8'), [0x80, 0x10FFFF, 0, ~0], 'UTF-8'));
        }

        return $html;
    }
252
253
    /**
     * Parse markup into this document: detect its encoding, convert it to
     * UTF-8, load it with the stored libxml options and finally drop the
     * "xml" processing instruction left among the document's contents.
     *
     * @param string $html Markup to parse.
     */
    private function composeXmlNode($html) {
        $this->detectEncoding($html);

        $utf8Html = $this->convertToUtf8($html);
        $this->loadHTML($utf8Html, $this->libxmlOptions);

        // Remove <?xml ...> processing instruction.
        $this->contents()->each(function ($child) {
            $isXmlInstruction = $child instanceof ProcessingInstruction && $child->nodeName == 'xml';

            if (!$isXmlInstruction) {
                return;
            }

            $child->destroy();
        });
    }
271
}
272