1 | <?php declare(strict_types=1); |
||||
2 | |||||
3 | namespace DOMWrap; |
||||
4 | |||||
5 | use DOMWrap\Traits\{ |
||||
6 | CommonTrait, |
||||
7 | TraversalTrait, |
||||
8 | ManipulationTrait |
||||
9 | }; |
||||
10 | |||||
/**
 * Document Node
 *
 * DOMDocument subclass that registers the DOMWrap node classes for the
 * standard DOM node types and adds encoding-aware HTML loading.
 *
 * @package DOMWrap
 * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause
 */
class Document extends \DOMDocument
{
    use CommonTrait;
    use TraversalTrait;
    use ManipulationTrait;

    /** @var int Bitmask of LIBXML_* options that setHtml() hands to loadHTML(). */
    protected $libxmlOptions = 0;

    /** @var string|null Encoding of the source markup; null until set or detected. */
    protected $documentEncoding = null;

    /**
     * Create the document and map the built-in DOM node classes onto their
     * DOMWrap counterparts.
     *
     * @param string $version  Forwarded to \DOMDocument::__construct().
     * @param string $encoding Forwarded to \DOMDocument::__construct().
     */
    public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
        parent::__construct($version, $encoding);

        $this->registerNodeClass(\DOMText::class, Text::class);
        $this->registerNodeClass(\DOMElement::class, Element::class);
        $this->registerNodeClass(\DOMComment::class, Comment::class);
        $this->registerNodeClass(\DOMDocument::class, self::class);
        $this->registerNodeClass(\DOMDocumentType::class, DocumentType::class);
        $this->registerNodeClass(\DOMProcessingInstruction::class, ProcessingInstruction::class);
    }

    /**
     * Set libxml options.
     *
     * Multiple values must use bitwise OR.
     * eg: LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
     *
     * @link http://php.net/manual/en/libxml.constants.php
     *
     * @param int $libxmlOptions
     */
    public function setLibxmlOptions(int $libxmlOptions): void {
        $this->libxmlOptions = $libxmlOptions;
    }

    /**
     * {@inheritdoc}
     *
     * A document is its own owner document, so this returns $this.
     */
    public function document(): ?\DOMDocument {
        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a node list containing only this document.
     */
    public function collection(): NodeList {
        return $this->newNodeList([$this]);
    }

    /**
     * {@inheritdoc}
     *
     * Returns the first node of the list, or null when the list is empty.
     */
    public function result(NodeList $nodeList) {
        if ($nodeList->count()) {
            return $nodeList->first();
        }

        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function parent() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Returns the node list produced by newNodeList() with no arguments.
     */
    public function parents() {
        return $this->newNodeList();
    }

    /**
     * {@inheritdoc}
     */
    public function substituteWith($newNode): self {
        // NOTE(review): this passes the document itself as the child to be
        // replaced; behaviour is kept as-is, but the intent should be confirmed.
        $this->replaceChild($newNode, $this);

        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Always returns null for a document.
     */
    public function _clone() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Delegates to getOuterHtml().
     */
    public function getHtml(): string {
        return $this->getOuterHtml();
    }

    /**
     * {@inheritdoc}
     *
     * Non-string or whitespace-only input is ignored. Otherwise the markup is
     * loaded with libxml internal error handling enabled (and, before PHP 8,
     * with the entity loader disabled); the previous libxml settings are
     * restored afterwards, even when loading throws.
     */
    public function setHtml($html): self {
        if (!is_string($html) || trim($html) === '') {
            return $this;
        }

        // libxml_disable_entity_loader() is only called before PHP 8.
        $legacyEntityLoader = \PHP_VERSION_ID < 80000;

        $internalErrors = libxml_use_internal_errors(true);
        $disableEntities = $legacyEntityLoader ? libxml_disable_entity_loader(true) : null;

        try {
            $this->composeXmlNode($html);
        } finally {
            // These are process-wide switches, so they must be put back on
            // every exit path.
            libxml_use_internal_errors($internalErrors);

            if ($legacyEntityLoader) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        return $this;
    }

    /**
     * Load HTML, prefixing it with an `<?xml encoding="...">` declaration built
     * from getEncoding() (UTF-8 when no encoding is set).
     *
     * @param string $html
     * @param int $options Bitmask of LIBXML_* options.
     *
     * @return bool Result of \DOMDocument::loadHTML().
     */
    public function loadHTML($html, $options = 0): bool {
        // Fix LibXML's crazy-ness RE root nodes
        // While importing HTML using the LIBXML_HTML_NOIMPLIED option LibXML insists
        // on having one root node. All subsequent nodes are appended to this first node.
        // To counter this we will create a fake element, allow LibXML to 'do its thing'
        // then undo it by taking the contents of the fake element, placing it back into
        // the root and then remove our fake element.
        $useWrapper = (bool) ($options & LIBXML_HTML_NOIMPLIED);

        if ($useWrapper) {
            $html = '<domwrap></domwrap>' . $html;
        }

        $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

        $result = parent::loadHTML($html, $options);

        // Do our re-shuffling of nodes. This is keyed on the same $options value
        // that decided whether the fake element was added, so the two steps
        // cannot disagree when loadHTML() is called directly.
        if ($useWrapper) {
            $wrapper = $this->children()->first();

            // Nothing to unwrap if there is no first child.
            if ($wrapper !== null) {
                $wrapper->contents()->each(function ($node) {
                    $this->appendWith($node);
                });

                $this->removeChild($wrapper);
            }
        }

        return $result;
    }

    /**
     * Set the encoding used for the source markup.
     *
     * @param string|null $encoding
     */
    public function setEncoding(?string $encoding = null) {
        $this->documentEncoding = $encoding;
    }

    /**
     * Get the encoding set via setEncoding(), if any.
     *
     * @return string|null
     */
    public function getEncoding(): ?string {
        return $this->documentEncoding;
    }

    /**
     * Extract the charset named in the first matching `<meta ... charset=...>` tag.
     *
     * @param string $html
     *
     * @return string|null Upper-cased charset name, or null when none is found.
     */
    private function getCharset(string $html): ?string {
        if (preg_match('@<meta[^>]*?charset=["\']?([^"\'\s>]+)@im', $html, $matches)) {
            return mb_strtoupper($matches[1]);
        }

        return null;
    }

    /**
     * Determine the document encoding and store it via setEncoding().
     *
     * Order of precedence: an encoding that is already set, then a `<meta>`
     * charset, then UTF-8 if strict multibyte detection reports exactly that.
     * Otherwise the encoding stays null.
     *
     * @param string $html
     */
    private function detectEncoding(string $html): void {
        $charset = $this->getEncoding() ?? $this->getCharset($html);

        if ($charset === null
            && mb_detect_encoding($html, mb_detect_order(), true) === 'UTF-8'
        ) {
            $charset = 'UTF-8';
        }

        $this->setEncoding($charset);
    }

    /**
     * Convert the markup to UTF-8 based on getEncoding().
     *
     * With a known encoding, `charset=` declarations are rewritten to UTF-8 and
     * the bytes are converted with mbstring, or with iconv when mbstring does
     * not list the encoding. With no encoding, or when iconv fails, characters
     * from 0x80 upwards are turned into numeric entities instead.
     *
     * @param string $html
     *
     * @return string
     */
    private function convertToUtf8(string $html): string {
        $charset = $this->getEncoding();

        if ($charset !== null) {
            // Uses the same value character class as getCharset(), so quoted
            // (single or double) and unquoted values are replaced without
            // consuming the closing '>' or any following markup.
            $rewritten = preg_replace('@(charset=["\']?)[^"\'\s>]+@im', '$1UTF-8', $html);

            // preg_replace() returns null on a PCRE error; keep the input then.
            if (is_string($rewritten)) {
                $html = $rewritten;
            }

            $mbHasCharset = in_array(
                mb_strtoupper($charset),
                array_map('mb_strtoupper', mb_list_encodings()),
                true
            );

            if ($mbHasCharset) {
                $converted = mb_convert_encoding($html, 'UTF-8', $charset);

                if (is_string($converted)) {
                    $html = $converted;
                }

            // Fallback to iconv if available.
            } elseif (extension_loaded('iconv')) {
                $htmlIconv = iconv($charset, 'UTF-8', $html);

                if ($htmlIconv !== false) {
                    $html = $htmlIconv;
                } else {
                    // Conversion failed: fall through to the entity encoding below.
                    $charset = null;
                }
            }
        }

        if ($charset === null) {
            $html = htmlspecialchars_decode(mb_encode_numericentity(htmlentities($html, ENT_QUOTES, 'UTF-8'), [0x80, 0x10FFFF, 0, ~0], 'UTF-8'));
        }

        return $html;
    }

    /**
     * Detect the encoding, convert the markup to UTF-8, load it with the
     * configured libxml options, then remove the `<?xml ...>` processing
     * instruction that loadHTML() prepends.
     *
     * @param string $html
     */
    private function composeXmlNode(string $html): void {
        $this->detectEncoding($html);

        $html = $this->convertToUtf8($html);

        $this->loadHTML($html, $this->libxmlOptions);

        // Remove <?xml ...> processing instruction.
        $this->contents()->each(function ($node) {
            if ($node instanceof ProcessingInstruction && $node->nodeName === 'xml') {
                $node->destroy();
            }
        });
    }
}
||||
272 |