1 | <?php declare(strict_types=1); |
||
2 | |||
3 | namespace DOMWrap; |
||
4 | |||
5 | use DOMWrap\Traits\{ |
||
6 | CommonTrait, |
||
7 | TraversalTrait, |
||
8 | ManipulationTrait |
||
9 | }; |
||
10 | |||
11 | /** |
||
12 | * Document Node |
||
13 | * |
||
14 | * @package DOMWrap |
||
15 | * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause |
||
16 | */ |
||
17 | class Document extends \DOMDocument |
||
18 | { |
||
19 | use CommonTrait; |
||
20 | use TraversalTrait; |
||
21 | use ManipulationTrait; |
||
22 | |||
/**
 * Libxml option bitmask handed to loadHTML() when markup is parsed
 * through setHtml(). Changed via setLibxmlOptions().
 *
 * @var int
 */
protected $libxmlOptions = 0;

/**
 * Encoding of the markup being loaded; null until it is set through
 * setEncoding() or filled in by encoding detection.
 *
 * @var string|null
 */
protected $documentEncoding = null;
|
28 | 140 | ||
/**
 * Create the document and register the DOMWrap node classes that should
 * be used in place of the built-in DOM classes.
 *
 * @param string $version  Version forwarded to the DOMDocument constructor.
 * @param string $encoding Encoding forwarded to the DOMDocument constructor.
 */
public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
    parent::__construct($version, $encoding);

    // Built-in DOM class => DOMWrap replacement, registered in this order.
    $nodeClasses = [
        'DOMText' => 'DOMWrap\\Text',
        'DOMElement' => 'DOMWrap\\Element',
        'DOMComment' => 'DOMWrap\\Comment',
        'DOMDocument' => 'DOMWrap\\Document',
        'DOMDocumentType' => 'DOMWrap\\DocumentType',
        'DOMProcessingInstruction' => 'DOMWrap\\ProcessingInstruction',
    ];

    foreach ($nodeClasses as $baseClass => $wrappedClass) {
        $this->registerNodeClass($baseClass, $wrappedClass);
    }
}
||
39 | |||
/**
 * Store the libxml options used when HTML is parsed through setHtml().
 *
 * Combine several options with bitwise OR,
 * eg: LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
 *
 * @link http://php.net/manual/en/libxml.constants.php
 *
 * @param int $libxmlOptions Bitmask of LIBXML_* constants.
 */
public function setLibxmlOptions(int $libxmlOptions): void {
    $this->libxmlOptions = $libxmlOptions;
}
||
53 | |||
/**
 * Return the owning document, which for a Document is the instance itself.
 *
 * {@inheritdoc}
 *
 * @return \DOMDocument|null
 */
public function document(): ?\DOMDocument {
    return $this;
}
||
60 | |||
/**
 * Return a node list, built with newNodeList(), whose only entry is this document.
 *
 * {@inheritdoc}
 *
 * @return NodeList
 */
public function collection(): NodeList {
    $nodes = [$this];

    return $this->newNodeList($nodes);
}
||
67 | |||
/**
 * Reduce a node list to a single result: its first node, or null when
 * the list is empty.
 *
 * {@inheritdoc}
 *
 * @param NodeList $nodeList
 *
 * @return mixed|null Whatever $nodeList->first() returns, or null.
 */
public function result(NodeList $nodeList) {
    return $nodeList->count() ? $nodeList->first() : null;
}
||
78 | |||
/**
 * Always returns null for a document.
 *
 * {@inheritdoc}
 *
 * @return null
 */
public function parent() {
    return null;
}
|
85 | 1 | ||
/**
 * Return the node list produced by newNodeList() when called without nodes.
 *
 * {@inheritdoc}
 */
public function parents() {
    return $this->newNodeList();
}
|
92 | 2 | ||
/**
 * Ask the document to replace itself with $newNode, then return the document.
 *
 * NOTE(review): this passes the document itself as the "old child" to
 * replaceChild(); a document is not normally one of its own children, so
 * confirm this call behaves as intended.
 *
 * {@inheritdoc}
 *
 * @param mixed $newNode Node forwarded to replaceChild() as the replacement.
 *
 * @return self
 */
public function substituteWith($newNode): self {
    $this->replaceChild($newNode, $this);

    return $this;
}
|
101 | |||
/**
 * Always returns null; this method produces no copy of the document.
 *
 * {@inheritdoc}
 *
 * @return null
 */
public function _clone() {
    return null;
}
|
108 | |||
/**
 * Return the document's markup; this is the result of getOuterHtml().
 *
 * {@inheritdoc}
 *
 * @return string
 */
public function getHtml(): string {
    $markup = $this->getOuterHtml();

    return $markup;
}
||
115 | |||
/**
 * Parse an HTML string and load it into this document.
 *
 * Input that is not a string, or that is empty after trimming, is ignored
 * and the document is left untouched.
 *
 * While parsing, libxml errors are collected internally rather than raised
 * as PHP warnings, and on PHP < 8.0 the libxml entity loader is disabled.
 * Both global settings are put back to their previous values afterwards,
 * including when parsing throws.
 *
 * {@inheritdoc}
 *
 * @param string $html Markup to parse.
 *
 * @return self
 */
public function setHtml($html): self {
    if (!is_string($html) || trim($html) === '') {
        return $this;
    }

    // libxml_disable_entity_loader() is deprecated from PHP 8.0, so it is
    // only called on older versions.
    $legacyEntityLoader = \PHP_VERSION_ID < 80000;

    $internalErrors = libxml_use_internal_errors(true);
    $disableEntities = $legacyEntityLoader ? libxml_disable_entity_loader(true) : null;

    try {
        $this->composeXmlNode($html);
    } finally {
        // These are process-wide libxml settings: always restore them so a
        // failed parse cannot leak this method's configuration to other code.
        libxml_use_internal_errors($internalErrors);

        if ($legacyEntityLoader) {
            libxml_disable_entity_loader($disableEntities);
        }
    }

    return $this;
}
||
137 | |||
/**
 * Load HTML into the document, working around libxml's single-root
 * behaviour when LIBXML_HTML_NOIMPLIED is requested.
 *
 * An XML encoding declaration built from getEncoding() (falling back to
 * UTF-8) is prepended to the markup before it is handed to the parent
 * implementation.
 *
 * @param string $html    Markup to parse.
 * @param int    $options Bitmask of LIBXML_* constants.
 *
 * @return bool The parent loader's success flag.
 */
public function loadHTML($html, $options = 0): bool {
    // Fix LibXML's crazy-ness RE root nodes
    // While importing HTML using the LIBXML_HTML_NOIMPLIED option LibXML insists
    // on having one root node. All subsequent nodes are appended to this first node.
    // To counter this we will create a fake element, allow LibXML to 'do its thing'
    // then undo it by taking the contents of the fake element, placing it back into
    // the root and then remove our fake element.
    //
    // The decision is taken once, from the options actually used for this parse,
    // so adding the fake element and removing it can never disagree.
    $useWrapper = (bool) ($options & LIBXML_HTML_NOIMPLIED);

    if ($useWrapper) {
        $html = '<domwrap></domwrap>' . $html;
    }

    $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

    $result = parent::loadHTML($html, $options);

    // Do our re-shuffling of nodes.
    if ($useWrapper) {
        $wrapper = $this->children()->first();

        // Nothing to unwrap if parsing produced no element at all.
        if ($wrapper !== null) {
            $wrapper->contents()->each(function ($node) {
                $this->appendWith($node);
            });

            $this->removeChild($wrapper);
        }
    }

    // Normalise to a real boolean to satisfy the declared return type.
    return $result !== false;
}
||
170 | |||
/**
 * Set the encoding used for the markup loaded into this document.
 *
 * @param string|null $encoding Encoding name, or null to clear the stored value.
 */
public function setEncoding(?string $encoding = null) {
    $this->documentEncoding = $encoding;
}
||
177 | |||
/**
 * Return the encoding currently stored for this document.
 *
 * @return string|null The stored encoding, or null when none is set.
 */
public function getEncoding(): ?string {
    return $this->documentEncoding;
}
||
184 | |||
/**
 * Extract the charset declared in a <meta> tag of the given markup.
 *
 * @param string $html Markup to inspect.
 *
 * @return string|null The declared charset in upper case, or null when
 *                     no <meta ... charset=...> declaration is found.
 */
private function getCharset(string $html): ?string {
    $pattern = '@<meta[^>]*?charset=["\']?([^"\'\s>]+)@im';

    if (!preg_match($pattern, $html, $found)) {
        return null;
    }

    return mb_strtoupper($found[1]);
}
||
199 | |||
/**
 * Work out the encoding of the markup and store it via setEncoding().
 *
 * Precedence: an encoding already set on the document, then a charset
 * declared in a <meta> tag, then UTF-8 if mb_detect_encoding() reports
 * exactly that. If none apply, the stored encoding is null.
 *
 * @param string $html Markup to inspect.
 */
private function detectEncoding(string $html) {
    $encoding = $this->getEncoding() ?? $this->getCharset($html);

    $sniffed = mb_detect_encoding($html, mb_detect_order(), true);

    // Only a strict UTF-8 detection is trusted as a last resort.
    if ($encoding === null && $sniffed === 'UTF-8') {
        $encoding = $sniffed;
    }

    $this->setEncoding($encoding);
}
||
218 | |||
/**
 * Convert markup to UTF-8 based on the document's stored encoding.
 *
 * When an encoding is known, charset declarations in the markup are
 * rewritten to UTF-8 and the bytes are converted with mbstring, or with
 * iconv when mbstring does not list the encoding. When no encoding is
 * known, or iconv fails, characters from 0x80 upwards are replaced by
 * numeric entities instead.
 *
 * @param string $html Markup to convert.
 *
 * @return string The converted markup.
 */
private function convertToUtf8(string $html): string {
    $charset = $this->getEncoding();

    if ($charset !== null) {
        $rewritten = preg_replace('@(charset=["]?)([^"\s]+)([^"]*["]?)@im', '$1UTF-8$3', $html);

        // preg_replace() returns null on a PCRE error; keep the original
        // markup then, instead of passing null on under strict_types.
        if ($rewritten !== null) {
            $html = $rewritten;
        }

        $mbHasCharset = in_array($charset, array_map('mb_strtoupper', mb_list_encodings()), true);

        if ($mbHasCharset) {
            $html = mb_convert_encoding($html, 'UTF-8', $charset);

        // Fallback to iconv if available.
        } elseif (extension_loaded('iconv')) {
            $htmlIconv = iconv($charset, 'UTF-8', $html);

            if ($htmlIconv !== false) {
                $html = $htmlIconv;
            } else {
                // Conversion failed: fall through to the entity encoding below.
                $charset = null;
            }
        }
    }

    if ($charset === null) {
        // Entity-encode everything, turn the remaining non-ASCII characters
        // into numeric entities, then restore the markup characters.
        // ENT_QUOTES is passed to the decode step so the &#039; produced by
        // htmlentities(..., ENT_QUOTES) is reversed on every PHP version;
        // before PHP 8.1 the default decode flags left single quotes encoded.
        $html = htmlspecialchars_decode(
            mb_encode_numericentity(htmlentities($html, ENT_QUOTES, 'UTF-8'), [0x80, 0x10FFFF, 0, ~0], 'UTF-8'),
            ENT_QUOTES
        );
    }

    return $html;
}
||
252 | |||
/**
 * Detect the markup's encoding, convert it to UTF-8, load it with the
 * configured libxml options, then remove the "xml" processing instruction
 * nodes from the document's contents.
 *
 * @param string $html Markup to load.
 */
private function composeXmlNode($html) {
    $this->detectEncoding($html);

    $utf8Html = $this->convertToUtf8($html);

    $this->loadHTML($utf8Html, $this->libxmlOptions);

    // Drops the <?xml ...> processing instruction nodes.
    $dropXmlDeclaration = function ($node) {
        if (!($node instanceof ProcessingInstruction) || $node->nodeName != 'xml') {
            return;
        }

        $node->destroy();
    };

    $this->contents()->each($dropXmlDeclaration);
}
||
271 | } |
||
272 |