scotteh /
php-dom-wrapper
| 1 | <?php declare(strict_types=1); |
||
| 2 | |||
| 3 | namespace DOMWrap; |
||
| 4 | |||
| 5 | use DOMWrap\Traits\{ |
||
| 6 | CommonTrait, |
||
| 7 | TraversalTrait, |
||
| 8 | ManipulationTrait |
||
| 9 | }; |
||
| 10 | |||
| 11 | /** |
||
| 12 | * Document Node |
||
| 13 | * |
||
| 14 | * @package DOMWrap |
||
| 15 | * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause |
||
| 16 | */ |
||
class Document extends \DOMDocument
{
    use CommonTrait;
    use TraversalTrait;
    use ManipulationTrait;

    /** @var int libxml option bitmask handed to loadHTML() when markup is set through setHtml(). */
    protected $libxmlOptions = 0;

    /** @var string|null Explicitly set or detected source encoding; null when unknown. */
    protected $documentEncoding = null;

    /**
     * Create the document and register the DOMWrap node classes so that nodes
     * created by this document are instances of the wrapper classes.
     *
     * @param string $version
     * @param string $encoding
     */
    public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
        parent::__construct($version, $encoding);

        $this->registerNodeClass('DOMText', 'DOMWrap\\Text');
        $this->registerNodeClass('DOMElement', 'DOMWrap\\Element');
        $this->registerNodeClass('DOMComment', 'DOMWrap\\Comment');
        $this->registerNodeClass('DOMDocument', 'DOMWrap\\Document');
        $this->registerNodeClass('DOMDocumentType', 'DOMWrap\\DocumentType');
        $this->registerNodeClass('DOMProcessingInstruction', 'DOMWrap\\ProcessingInstruction');
    }

    /**
     * Set libxml options.
     *
     * Multiple values must use bitwise OR.
     * eg: LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
     *
     * The stored value is passed to loadHTML() whenever markup is loaded via setHtml().
     *
     * @link http://php.net/manual/en/libxml.constants.php
     *
     * @param int $libxmlOptions
     */
    public function setLibxmlOptions(int $libxmlOptions): void {
        $this->libxmlOptions = $libxmlOptions;
    }

    /**
     * {@inheritdoc}
     *
     * A document is its own owner document, so this returns the instance itself.
     */
    public function document(): ?\DOMDocument {
        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a node list containing only this document.
     */
    public function collection(): NodeList {
        return $this->newNodeList([$this]);
    }

    /**
     * {@inheritdoc}
     *
     * Returns the first node of the list, or null when the list is empty.
     */
    public function result(NodeList $nodeList) {
        if ($nodeList->count()) {
            return $nodeList->first();
        }

        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function parent() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Always a node list created without any nodes.
     */
    public function parents() {
        return $this->newNodeList();
    }

    /**
     * {@inheritdoc}
     */
    public function substituteWith($newNode): self {
        // NOTE(review): the document itself is passed as the child to replace;
        // confirm this is the intended behaviour for document nodes.
        $this->replaceChild($newNode, $this);

        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function _clone() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Delegates to getOuterHtml().
     */
    public function getHtml(): string {
        return $this->getOuterHtml();
    }

    /**
     * {@inheritdoc}
     *
     * Non-string or blank input is ignored. libxml error reporting (and, before
     * PHP 8.0, the entity loader switch) is changed only for the duration of the
     * parse and is restored even when parsing throws.
     */
    public function setHtml($html): self {
        if (!is_string($html) || trim($html) === '') {
            return $this;
        }

        $internalErrors = libxml_use_internal_errors(true);
        $disableEntities = null;

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
        // only called on older versions.
        if (\PHP_VERSION_ID < 80000) {
            $disableEntities = libxml_disable_entity_loader(true);
        }

        try {
            $this->composeXmlNode($html);
        } finally {
            libxml_use_internal_errors($internalErrors);

            if ($disableEntities !== null) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        return $this;
    }

    /**
     * Load HTML markup into this document.
     *
     * An XML encoding declaration built from getEncoding() (falling back to
     * UTF-8) is prepended to the markup before it is handed to libxml.
     *
     * @param string $html
     * @param int $options
     *
     * @return bool
     */
    public function loadHTML($html, $options = 0): bool {
        // Fix LibXML's crazy-ness RE root nodes
        // While importing HTML using the LIBXML_HTML_NOIMPLIED option LibXML insists
        // on having one root node. All subsequent nodes are appended to this first node.
        // To counter this we will create a fake element, allow LibXML to 'do its thing'
        // then undo it by taking the contents of the fake element, placing it back into
        // the root and then remove our fake element.
        $useFakeRoot = (bool) ($options & LIBXML_HTML_NOIMPLIED);

        if ($useFakeRoot) {
            $html = '<domwrap></domwrap>' . $html;
        }

        $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

        $result = parent::loadHTML($html, $options);

        // Do our re-shuffling of nodes. The decision is based on the same $options
        // value that controlled whether the fake element was added, so the two
        // steps can never disagree.
        if ($useFakeRoot) {
            $wrapper = $this->children()->first();

            if ($wrapper instanceof \DOMNode) {
                $wrapper->contents()->each(function($node) {
                    $this->appendWith($node);
                });

                $this->removeChild($wrapper);
            }
        }

        return $result;
    }

    /**
     * Set the encoding used when loading markup.
     *
     * @param string|null $encoding
     */
    public function setEncoding(?string $encoding = null) {
        $this->documentEncoding = $encoding;
    }

    /**
     * Get the encoding set through setEncoding() or stored by detectEncoding().
     *
     * @return string|null
     */
    public function getEncoding(): ?string {
        return $this->documentEncoding;
    }

    /**
     * Extract the charset named in the first matching <meta ... charset=...> tag.
     *
     * @param string $html
     *
     * @return string|null Upper-cased charset name, or null when no tag matches.
     */
    private function getCharset(string $html): ?string {
        if (preg_match('@<meta[^>]*?charset=["\']?([^"\'\s>]+)@im', $html, $matches)) {
            return mb_strtoupper($matches[1]);
        }

        return null;
    }

    /**
     * Decide which encoding the markup uses and store it via setEncoding().
     *
     * Order of precedence: an encoding that is already set, then the charset
     * declared in a <meta> tag, then UTF-8 if mb_detect_encoding() reports it.
     * If none apply the stored encoding is null.
     *
     * @param string $html
     */
    private function detectEncoding(string $html) {
        $charset = $this->getEncoding() ?? $this->getCharset($html);

        if ($charset === null) {
            $detectedCharset = mb_detect_encoding($html, mb_detect_order(), true);

            if ($detectedCharset === 'UTF-8') {
                $charset = $detectedCharset;
            }
        }

        $this->setEncoding($charset);
    }

    /**
     * Convert markup to UTF-8 based on the encoding returned by getEncoding().
     *
     * When an encoding is known, charset declarations in the markup are rewritten
     * to UTF-8 and the bytes are converted with mbstring, or with iconv when
     * mbstring does not list the encoding. When no encoding is known (or iconv
     * fails), characters from 0x80 upwards are turned into numeric entities instead.
     *
     * @param string $html
     *
     * @return string
     */
    private function convertToUtf8(string $html): string {
        $charset = $this->getEncoding();

        if ($charset !== null) {
            // preg_replace() returns null on a PCRE error; keep the input in that case.
            $rewritten = preg_replace('@(charset=["]?)([^"\s]+)([^"]*["]?)@im', '$1UTF-8$3', $html);

            if ($rewritten !== null) {
                $html = $rewritten;
            }

            // The encoding list is upper-cased, so compare an upper-cased charset:
            // setEncoding() accepts names in any case.
            $knownEncodings = array_map('mb_strtoupper', mb_list_encodings());
            $mbHasCharset = in_array(mb_strtoupper($charset), $knownEncodings, true);

            if ($mbHasCharset) {
                $html = mb_convert_encoding($html, 'UTF-8', $charset);

            // Fallback to iconv if available.
            } elseif (extension_loaded('iconv')) {
                $htmlIconv = iconv($charset, 'UTF-8', $html);

                if ($htmlIconv !== false) {
                    $html = $htmlIconv;
                } else {
                    $charset = null;
                }
            }
        }

        if ($charset === null) {
            $html = htmlspecialchars_decode(mb_encode_numericentity(htmlentities($html, ENT_QUOTES, 'UTF-8'), [0x80, 0x10FFFF, 0, ~0], 'UTF-8'));
        }

        return $html;
    }

    /**
     * Detect the encoding, convert the markup to UTF-8, load it with the stored
     * libxml options, then drop the helper XML declaration that loadHTML() prepends.
     *
     * @param string $html
     */
    private function composeXmlNode($html) {
        $this->detectEncoding($html);

        $html = $this->convertToUtf8($html);

        $this->loadHTML($html, $this->libxmlOptions);

        // Remove <?xml ...> processing instruction.
        $this->contents()->each(function($node) {
            if ($node instanceof ProcessingInstruction && $node->nodeName === 'xml') {
                $node->destroy();
            }
        });
    }
}
||
| 272 |