1
|
|
|
<?php declare(strict_types=1); |
2
|
|
|
|
3
|
|
|
namespace DOMWrap; |
4
|
|
|
|
5
|
|
|
use DOMWrap\Traits\{ |
6
|
|
|
CommonTrait, |
7
|
|
|
TraversalTrait, |
8
|
|
|
ManipulationTrait |
9
|
|
|
}; |
10
|
|
|
|
11
|
|
|
/** |
12
|
|
|
* Document Node |
13
|
|
|
* |
14
|
|
|
* @package DOMWrap |
15
|
|
|
* @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause |
16
|
|
|
*/ |
17
|
|
|
class Document extends \DOMDocument
{
    use CommonTrait;
    use TraversalTrait;
    use ManipulationTrait;

    /** @var int Bitmask of LIBXML_* constants handed to loadHTML() by setHtml(). */
    protected $libxmlOptions = 0;

    /** @var string|null Source encoding; null until set explicitly or detected by setHtml(). */
    protected $documentEncoding = null;

    /**
     * Create the document and register the DOMWrap node classes so that nodes
     * created by this document are instances of the DOMWrap wrappers.
     *
     * @param string $version  Passed through to \DOMDocument::__construct().
     * @param string $encoding Passed through to \DOMDocument::__construct().
     */
    public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
        parent::__construct($version, $encoding);

        $this->registerNodeClass('DOMText', 'DOMWrap\\Text');
        $this->registerNodeClass('DOMElement', 'DOMWrap\\Element');
        $this->registerNodeClass('DOMComment', 'DOMWrap\\Comment');
        $this->registerNodeClass('DOMDocument', 'DOMWrap\\Document');
        $this->registerNodeClass('DOMDocumentType', 'DOMWrap\\DocumentType');
        $this->registerNodeClass('DOMProcessingInstruction', 'DOMWrap\\ProcessingInstruction');
    }

    /**
     * Set libxml options.
     *
     * Multiple values must use bitwise OR.
     * eg: LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
     *
     * @link http://php.net/manual/en/libxml.constants.php
     *
     * @param int $libxmlOptions
     */
    public function setLibxmlOptions(int $libxmlOptions): void {
        $this->libxmlOptions = $libxmlOptions;
    }

    /**
     * {@inheritdoc}
     *
     * The document is its own owner document, so this returns $this.
     */
    public function document(): ?\DOMDocument {
        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a node list containing only this document.
     */
    public function collection(): NodeList {
        return $this->newNodeList([$this]);
    }

    /**
     * {@inheritdoc}
     *
     * Returns the first node of the list, or null when the list is empty.
     */
    public function result(NodeList $nodeList) {
        return $nodeList->count() ? $nodeList->first() : null;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function parent() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Always an empty node list for a document.
     */
    public function parents() {
        return $this->newNodeList();
    }

    /**
     * {@inheritdoc}
     */
    public function replaceWith($newNode): self {
        // NOTE(review): the document is passed as the child to replace on itself.
        // A document is not its own child, so replaceChild() is expected to raise
        // a DOMException here -- confirm the intended behaviour before relying on it.
        $this->replaceChild($newNode, $this);

        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function _clone() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Delegates to getOuterHtml().
     */
    public function getHtml(): string {
        return $this->getOuterHtml();
    }

    /**
     * {@inheritdoc}
     *
     * Non-string or blank input is ignored and the document is left untouched.
     * Otherwise the encoding is detected, the markup converted to UTF-8 and
     * parsed with the options given to setLibxmlOptions(). The libxml error and
     * entity-loader settings are restored even when parsing throws.
     */
    public function setHtml($html): self {
        if (!is_string($html) || trim($html) === '') {
            return $this;
        }

        $internalErrors = libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
        // only toggled on older runtimes.
        $disableEntities = null;
        if (\PHP_VERSION_ID < 80000) {
            $disableEntities = libxml_disable_entity_loader(true);
        }

        try {
            $this->detectEncoding($html);

            $html = $this->convertToUtf8($html);

            $this->loadHTML($html, $this->libxmlOptions);

            // Remove the <?xml ...> processing instruction that loadHTML() prepends.
            $this->contents()->each(function($node) {
                if ($node instanceof ProcessingInstruction && $node->nodeName === 'xml') {
                    $node->remove();
                }
            });
        } finally {
            libxml_use_internal_errors($internalErrors);

            if ($disableEntities !== null) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        return $this;
    }

    /**
     * Parse HTML into this document.
     *
     * An <?xml encoding="..."> declaration (getEncoding(), falling back to UTF-8)
     * is prepended before parsing. It is not removed here; setHtml() strips it.
     *
     * @param string $html
     * @param int $options Bitmask of LIBXML_* constants.
     *
     * @return bool Result of \DOMDocument::loadHTML().
     */
    public function loadHTML($html, $options = 0): bool {
        // Fix LibXML's crazy-ness RE root nodes
        // While importing HTML using the LIBXML_HTML_NOIMPLIED option LibXML insists
        // on having one root node. All subsequent nodes are appended to this first node.
        // To counter this we will create a fake element, allow LibXML to 'do its thing'
        // then undo it by taking the contents of the fake element, placing it back into
        // the root and then remove our fake element.
        //
        // The decision is taken once, from the $options actually used for this parse,
        // so the wrapping and the un-wrapping can never disagree.
        $unwrapRoot = (bool) ($options & LIBXML_HTML_NOIMPLIED);

        if ($unwrapRoot) {
            $html = '<domwrap></domwrap>' . $html;
        }

        $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

        $result = parent::loadHTML($html, $options);

        // Do our re-shuffling of nodes.
        if ($unwrapRoot) {
            $wrapper = $this->children()->first();

            // Nothing to un-wrap when the parse produced no element at all.
            if ($wrapper instanceof \DOMNode) {
                $wrapper->contents()->each(function($node) {
                    $this->append($node);
                });

                $this->removeChild($wrapper);
            }
        }

        return $result;
    }

    /**
     * Set the encoding the source markup is assumed to be in.
     *
     * @param string|null $encoding Null clears the stored encoding.
     */
    public function setEncoding(?string $encoding = null) {
        $this->documentEncoding = $encoding;
    }

    /**
     * @return string|null The stored source encoding, or null when none is set.
     */
    public function getEncoding(): ?string {
        return $this->documentEncoding;
    }

    /**
     * Extract the charset declared by a <meta> tag, upper-cased.
     *
     * The search stays inside a single <meta ...> tag (which may span several
     * lines) so that a "charset=" appearing later in the markup is not picked up.
     *
     * @param string $html
     *
     * @return string|null Null when no <meta> charset declaration is found.
     */
    private function getCharset(string $html): ?string {
        if (preg_match('@<meta[^>]*?charset=["\']?([^"\'\s>]+)@i', $html, $matches)) {
            return mb_strtoupper($matches[1]);
        }

        return null;
    }

    /**
     * Decide on the source encoding and store it via setEncoding().
     *
     * Order of precedence: the encoding already stored, then a <meta> charset
     * declaration, then UTF-8 if mbstring detects the markup as UTF-8. When none
     * applies the stored encoding is set to null.
     *
     * @param string $html
     */
    private function detectEncoding(string $html): void {
        $charset = $this->getEncoding() ?? $this->getCharset($html);

        // Detection is only consulted when nothing was declared.
        if ($charset === null && mb_detect_encoding($html, mb_detect_order(), true) === 'UTF-8') {
            $charset = 'UTF-8';
        }

        $this->setEncoding($charset);
    }

    /**
     * Convert the markup from the stored encoding to UTF-8.
     *
     * With a stored encoding, charset declarations in the markup are rewritten to
     * UTF-8 and the bytes are converted with mbstring, or with iconv when mbstring
     * does not list the encoding. Without one (or when iconv fails) non-ASCII
     * characters are turned into numeric character references instead.
     *
     * @param string $html
     *
     * @return string
     */
    private function convertToUtf8(string $html): string {
        $charset = $this->getEncoding();

        if ($charset !== null) {
            $rewritten = preg_replace('@(charset=["]?)([^"\s]+)([^"]*["]?)@im', '$1UTF-8$3', $html);

            // preg_replace() returns null on a PCRE error; keep the markup as-is then.
            if ($rewritten !== null) {
                $html = $rewritten;
            }

            // Compared upper-cased so an encoding given in lower case is recognised too.
            $mbEncodings = array_map('mb_strtoupper', mb_list_encodings());

            if (in_array(mb_strtoupper($charset), $mbEncodings, true)) {
                $html = mb_convert_encoding($html, 'UTF-8', $charset);

            // Fallback to iconv if available.
            } elseif (extension_loaded('iconv')) {
                $htmlIconv = iconv($charset, 'UTF-8', $html);

                if ($htmlIconv !== false) {
                    $html = $htmlIconv;
                } else {
                    $charset = null;
                }
            }
        }

        if ($charset === null) {
            // The 'HTML-ENTITIES' mbstring target is deprecated as of PHP 8.2;
            // encode everything outside ASCII as numeric character references.
            $html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
        }

        return $html;
    }
}
261
|
|
|
|
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.