1 | <?php declare(strict_types=1); |
||||
2 | |||||
3 | namespace DOMWrap; |
||||
4 | |||||
5 | use DOMWrap\Traits\{ |
||||
6 | CommonTrait, |
||||
7 | TraversalTrait, |
||||
8 | ManipulationTrait |
||||
9 | }; |
||||
10 | |||||
/**
 * Document Node
 *
 * DOMDocument subclass that registers the DOMWrap node classes and adds
 * HTML loading helpers with charset detection and UTF-8 normalisation.
 *
 * @package DOMWrap
 * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause
 */
class Document extends \DOMDocument
{
    use CommonTrait;
    use TraversalTrait;
    use ManipulationTrait;

    /** @var int Bitmask of LIBXML_* options passed to loadHTML() by setHtml(). */
    protected $libxmlOptions = 0;

    /** @var string|null Encoding set explicitly or detected while loading markup. */
    protected $documentEncoding = null;

    /**
     * Create the document and register the DOMWrap node classes so that
     * nodes created by this document are instances of the wrapper classes.
     *
     * @param string $version
     * @param string $encoding
     */
    public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
        parent::__construct($version, $encoding);

        $this->registerNodeClass('DOMText', 'DOMWrap\\Text');
        $this->registerNodeClass('DOMElement', 'DOMWrap\\Element');
        $this->registerNodeClass('DOMComment', 'DOMWrap\\Comment');
        $this->registerNodeClass('DOMDocument', 'DOMWrap\\Document');
        $this->registerNodeClass('DOMDocumentType', 'DOMWrap\\DocumentType');
        $this->registerNodeClass('DOMProcessingInstruction', 'DOMWrap\\ProcessingInstruction');
    }

    /**
     * Set libxml options.
     *
     * Multiple values must use bitwise OR.
     * eg: LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
     *
     * @link http://php.net/manual/en/libxml.constants.php
     *
     * @param int $libxmlOptions
     */
    public function setLibxmlOptions(int $libxmlOptions): void {
        $this->libxmlOptions = $libxmlOptions;
    }

    /**
     * {@inheritdoc}
     *
     * A document is its own owner document, so this returns the instance itself.
     */
    public function document(): ?\DOMDocument {
        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a node list containing only this document.
     */
    public function collection(): NodeList {
        return $this->newNodeList([$this]);
    }

    /**
     * {@inheritdoc}
     *
     * Returns the first node of the list, or null when the list is empty.
     */
    public function result(NodeList $nodeList) {
        if ($nodeList->count()) {
            return $nodeList->first();
        }

        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function parent() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a node list created without any nodes.
     */
    public function parents() {
        return $this->newNodeList();
    }

    /**
     * {@inheritdoc}
     */
    public function substituteWith($newNode): self {
        // NOTE(review): replaceChild() is called with the document itself as the
        // node to replace -- confirm this is the intended behaviour.
        $this->replaceChild($newNode, $this);

        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Always returns null for a document.
     */
    public function _clone() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Delegates to getOuterHtml().
     */
    public function getHtml(): string {
        return $this->getOuterHtml();
    }

    /**
     * {@inheritdoc}
     *
     * Non-string or blank input is ignored. While parsing, libxml errors are
     * collected internally and (on PHP < 8.0) the external entity loader is
     * disabled; the previous libxml state is restored afterwards, even when
     * parsing throws.
     */
    public function setHtml($html): self {
        if (!is_string($html) || trim($html) === '') {
            return $this;
        }

        $internalErrors = libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
        // only touched on older runtimes.
        $toggleEntityLoader = \PHP_VERSION_ID < 80000;
        $disableEntities = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

        try {
            $this->composeXmlNode($html);
        } finally {
            libxml_use_internal_errors($internalErrors);

            if ($toggleEntityLoader) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        return $this;
    }

    /**
     * Load HTML markup, prefixing it with an XML encoding declaration.
     *
     * @param string $html
     * @param int $options Bitmask of LIBXML_* options.
     *
     * @return bool Result of DOMDocument::loadHTML().
     */
    public function loadHTML($html, $options = 0): bool {
        // Fix LibXML's crazy-ness RE root nodes
        // While importing HTML using the LIBXML_HTML_NOIMPLIED option LibXML insists
        // on having one root node. All subsequent nodes are appended to this first node.
        // To counter this we will create a fake element, allow LibXML to 'do its thing'
        // then undo it by taking the contents of the fake element, placing it back into
        // the root and then remove our fake element.
        //
        // The decision is taken once, from the options actually used for this load,
        // so the fake element is removed exactly when it was added.
        $unwrapRoot = (bool) ($options & LIBXML_HTML_NOIMPLIED);

        if ($unwrapRoot) {
            $html = '<domwrap></domwrap>' . $html;
        }

        $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

        $result = parent::loadHTML($html, $options);

        // Do our re-shuffling of nodes.
        if ($unwrapRoot) {
            $wrapper = $this->children()->first();

            // Nothing to unwrap when parsing produced no child node.
            if ($wrapper instanceof \DOMNode) {
                $wrapper->contents()->each(function ($node) {
                    $this->appendWith($node);
                });

                $this->removeChild($wrapper);
            }
        }

        return $result;
    }

    /**
     * Set the encoding used when loading markup.
     *
     * @param string|null $encoding
     */
    public function setEncoding(?string $encoding = null) {
        $this->documentEncoding = $encoding;
    }

    /**
     * Get the encoding set via setEncoding() or detected while loading markup.
     *
     * @return string|null
     */
    public function getEncoding(): ?string {
        return $this->documentEncoding;
    }

    /**
     * Extract the charset declared in a <meta> tag of the markup.
     *
     * @param string $html
     *
     * @return string|null Upper-cased charset name, or null when none is declared.
     */
    private function getCharset(string $html): ?string {
        if (preg_match('@<meta[^>]*?charset=["\']?([^"\'\s>]+)@im', $html, $matches)) {
            return mb_strtoupper($matches[1]);
        }

        return null;
    }

    /**
     * Determine the encoding of the markup and store it via setEncoding().
     *
     * Precedence: an encoding that is already set, then the <meta> charset,
     * then UTF-8 if mbstring strictly detects it. Otherwise the encoding
     * is stored as null.
     *
     * @param string $html
     */
    private function detectEncoding(string $html): void {
        $charset = $this->getEncoding() ?? $this->getCharset($html);

        if ($charset === null && mb_detect_encoding($html, mb_detect_order(), true) === 'UTF-8') {
            $charset = 'UTF-8';
        }

        $this->setEncoding($charset);
    }

    /**
     * Convert the markup to UTF-8 based on the current document encoding.
     *
     * When an encoding is known, charset declarations in the markup are rewritten
     * to UTF-8 and the text is converted with mbstring, falling back to iconv.
     * When no encoding is known (or iconv fails), characters from 0x80 upwards
     * are turned into numeric entities instead.
     *
     * @param string $html
     *
     * @return string
     */
    private function convertToUtf8(string $html): string {
        $charset = $this->getEncoding();

        if ($charset !== null) {
            // preg_replace() returns null on a PCRE failure; keep the markup unchanged then.
            $html = preg_replace('@(charset=["]?)([^"\s]+)([^"]*["]?)@im', '$1UTF-8$3', $html) ?? $html;

            // Compare upper-cased names on both sides so e.g. "utf-8" is recognised too.
            $mbHasCharset = in_array(
                mb_strtoupper($charset),
                array_map('mb_strtoupper', mb_list_encodings()),
                true
            );

            if ($mbHasCharset) {
                $html = mb_convert_encoding($html, 'UTF-8', $charset);

            // Fallback to iconv if available.
            } elseif (extension_loaded('iconv')) {
                $htmlIconv = iconv($charset, 'UTF-8', $html);

                if ($htmlIconv !== false) {
                    $html = $htmlIconv;
                } else {
                    // Conversion failed; fall through to the numeric entity encoding below.
                    $charset = null;
                }
            }
        }

        if ($charset === null) {
            $html = htmlspecialchars_decode(mb_encode_numericentity(htmlentities($html, ENT_QUOTES, 'UTF-8'), [0x80, 0x10FFFF, 0, ~0], 'UTF-8'));
        }

        return $html;
    }

    /**
     * Detect the encoding, normalise the markup to UTF-8 and load it using the
     * configured libxml options.
     *
     * @param string $html
     */
    private function composeXmlNode(string $html): void {
        // Detection must run first: convertToUtf8() reads the stored encoding.
        $this->detectEncoding($html);

        $html = $this->convertToUtf8($html);

        $this->loadHTML($html, $this->libxmlOptions);

        // Remove <?xml ...> processing instruction.
        $this->contents()->each(function ($node) {
            if ($node instanceof ProcessingInstruction && $node->nodeName === 'xml') {
                $node->destroy();
            }
        });
    }
}
||||
272 |