scotteh /
php-dom-wrapper
| 1 | <?php declare(strict_types=1); |
||||
| 2 | |||||
| 3 | namespace DOMWrap; |
||||
| 4 | |||||
| 5 | use DOMWrap\Traits\{ |
||||
| 6 | CommonTrait, |
||||
| 7 | TraversalTrait, |
||||
| 8 | ManipulationTrait |
||||
| 9 | }; |
||||
| 10 | |||||
/**
 * Document Node
 *
 * Extends the native DOMDocument: registers the DOMWrap node classes for
 * every node created by the document and adds encoding-aware HTML loading
 * (see setHtml() / loadHTML()).
 *
 * @package DOMWrap
 * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause
 */
class Document extends \DOMDocument
{
    use CommonTrait;
    use TraversalTrait;
    use ManipulationTrait;

    /** @var int libxml option bitmask handed to loadHTML() by setHtml(). */
    protected $libxmlOptions = 0;

    /** @var string|null Explicitly set or detected source encoding; null when unknown. */
    protected $documentEncoding = null;

    /**
     * Create the document and register the DOMWrap replacements for the
     * built-in DOM node classes.
     *
     * @param string $version
     * @param string $encoding
     */
    public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
        parent::__construct($version, $encoding);

        $this->registerNodeClass('DOMText', 'DOMWrap\\Text');
        $this->registerNodeClass('DOMElement', 'DOMWrap\\Element');
        $this->registerNodeClass('DOMComment', 'DOMWrap\\Comment');
        $this->registerNodeClass('DOMDocument', 'DOMWrap\\Document');
        $this->registerNodeClass('DOMDocumentType', 'DOMWrap\\DocumentType');
        $this->registerNodeClass('DOMProcessingInstruction', 'DOMWrap\\ProcessingInstruction');
    }

    /**
     * Set libxml options.
     *
     * Multiple values must use bitwise OR.
     * eg: LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
     *
     * The stored value is what setHtml() passes on to loadHTML().
     *
     * @link http://php.net/manual/en/libxml.constants.php
     *
     * @param int $libxmlOptions
     */
    public function setLibxmlOptions(int $libxmlOptions): void {
        $this->libxmlOptions = $libxmlOptions;
    }

    /**
     * {@inheritdoc}
     *
     * A document is its own owner document, so this returns $this.
     */
    public function document(): ?\DOMDocument {
        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a node list containing only this document.
     */
    public function collection(): NodeList {
        return $this->newNodeList([$this]);
    }

    /**
     * {@inheritdoc}
     *
     * Reduces a node list to its first node, or null when the list is empty.
     */
    public function result(NodeList $nodeList) {
        if ($nodeList->count()) {
            return $nodeList->first();
        }

        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function parent() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Always an empty node list for a document.
     */
    public function parents() {
        return $this->newNodeList();
    }

    /**
     * {@inheritdoc}
     */
    public function substituteWith($newNode): self {
        // NOTE(review): this asks the document to replace itself as one of its
        // own children; a document is presumably never its own child, so this
        // call looks like it cannot succeed. Confirm the intended behaviour.
        $this->replaceChild($newNode, $this);

        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function _clone() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * For a document this is the same as getOuterHtml().
     */
    public function getHtml(): string {
        return $this->getOuterHtml();
    }

    /**
     * {@inheritdoc}
     *
     * Non-string or blank input is ignored and leaves the document untouched.
     * libxml's internal-error flag (and, before PHP 8, the entity loader
     * switch) is changed only for the duration of the parse.
     */
    public function setHtml($html): self {
        if (!is_string($html) || trim($html) === '') {
            return $this;
        }

        $internalErrors = libxml_use_internal_errors(true);
        // libxml_disable_entity_loader() is deprecated from PHP 8.0, so it is
        // only touched on older runtimes; null marks "nothing to restore".
        $disableEntities = \PHP_VERSION_ID < 80000 ? libxml_disable_entity_loader(true) : null;

        try {
            $this->composeXmlNode($html);
        } finally {
            // These are process-wide settings: restore them even when parsing throws.
            libxml_use_internal_errors($internalErrors);

            if ($disableEntities !== null) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        return $this;
    }

    /**
     * Load HTML, prefixed with an XML encoding declaration built from
     * getEncoding() (UTF-8 when no encoding is set).
     *
     * @param string $html
     * @param int $options libxml option bitmask
     *
     * @return bool
     */
    public function loadHTML($html, $options = 0): bool {
        // Fix LibXML's crazy-ness RE root nodes
        // While importing HTML using the LIBXML_HTML_NOIMPLIED option LibXML insists
        // on having one root node. All subsequent nodes are appended to this first node.
        // To counter this we will create a fake element, allow LibXML to 'do its thing'
        // then undo it by taking the contents of the fake element, placing it back into
        // the root and then remove our fake element.
        //
        // The decision is taken once, from $options, so the wrapper is removed
        // exactly when it was added.
        $useWrapper = (bool) ($options & LIBXML_HTML_NOIMPLIED);

        if ($useWrapper) {
            $html = '<domwrap></domwrap>' . $html;
        }

        $html = '<?xml encoding="' . ($this->getEncoding() ?? 'UTF-8') . '">' . $html;

        $result = parent::loadHTML($html, $options);

        // Do our re-shuffling of nodes.
        if ($useWrapper) {
            $this->children()->first()->contents()->each(function($node){
                $this->appendWith($node);
            });

            $this->removeChild($this->children()->first());
        }

        return $result;
    }

    /**
     * Set the source encoding used when loading HTML; null clears it.
     *
     * @param string|null $encoding
     */
    public function setEncoding(?string $encoding = null) {
        $this->documentEncoding = $encoding;
    }

    /**
     * @return string|null The encoding set via setEncoding() or detected
     *                     while loading, null when none is known.
     */
    public function getEncoding(): ?string {
        return $this->documentEncoding;
    }

    /**
     * Extract the charset declared in the first <meta ... charset=...> found.
     *
     * @param string $html
     *
     * @return string|null Upper-cased charset name, or null when none is declared.
     */
    private function getCharset(string $html): ?string {
        $charset = null;

        if (preg_match('@<meta[^>]*?charset=["\']?([^"\'\s>]+)@im', $html, $matches)) {
            $charset = mb_strtoupper($matches[1]);
        }

        return $charset;
    }

    /**
     * Decide the document encoding and store it via setEncoding().
     *
     * Precedence: the encoding already set, then the <meta> charset, then
     * UTF-8 if mbstring strictly detects it; otherwise the encoding stays null.
     *
     * @param string $html
     */
    private function detectEncoding(string $html) {
        $charset = $this->getEncoding() ?? $this->getCharset($html);

        // Detection is only consulted when nothing else told us the charset.
        if ($charset === null && mb_detect_encoding($html, mb_detect_order(), true) === 'UTF-8') {
            $charset = 'UTF-8';
        }

        $this->setEncoding($charset);
    }

    /**
     * Convert the markup to UTF-8 based on getEncoding().
     *
     * With a known charset, any charset=... declaration is rewritten to UTF-8
     * and the bytes are converted with mbstring, or with iconv when mbstring
     * does not list the charset. With no usable charset, characters from
     * U+0080 upwards are turned into numeric entities instead.
     *
     * @param string $html
     *
     * @return string
     */
    private function convertToUtf8(string $html): string {
        $charset = $this->getEncoding();

        if ($charset !== null) {
            // Accept double-quoted, single-quoted and unquoted values, and stop
            // at quote, whitespace or '>' so the rest of the tag is left alone.
            $rewritten = preg_replace('@(charset=["\']?)([^"\'\s>]+)@im', '$1UTF-8', $html);

            // preg_replace() yields null on a PCRE error; keep the input then.
            if ($rewritten !== null) {
                $html = $rewritten;
            }

            // Compare upper-cased names so a lower-case charset set by the
            // caller still matches mbstring's list.
            $mbHasCharset = in_array(
                mb_strtoupper($charset),
                array_map('mb_strtoupper', mb_list_encodings()),
                true
            );

            if ($mbHasCharset) {
                $html = mb_convert_encoding($html, 'UTF-8', $charset);

            // Fallback to iconv if available.
            } elseif (extension_loaded('iconv')) {
                $htmlIconv = iconv($charset, 'UTF-8', $html);

                if ($htmlIconv !== false) {
                    $html = $htmlIconv;
                } else {
                    // Conversion failed: fall through to the entity encoding below.
                    $charset = null;
                }
            }
        }

        if ($charset === null) {
            $html = htmlspecialchars_decode(mb_encode_numericentity(htmlentities($html, ENT_QUOTES, 'UTF-8'), [0x80, 0x10FFFF, 0, ~0], 'UTF-8'));
        }

        return $html;
    }

    /**
     * Detect the encoding, convert the markup to UTF-8, load it with the
     * stored libxml options, then drop the xml processing instruction that
     * loadHTML() prepends.
     *
     * @param string $html
     */
    private function composeXmlNode($html) {
        $this->detectEncoding($html);

        $html = $this->convertToUtf8($html);

        $this->loadHTML($html, $this->libxmlOptions);

        // Remove <?xml ...> processing instruction.
        $this->contents()->each(function($node) {
            if ($node instanceof ProcessingInstruction && $node->nodeName === 'xml') {
                $node->destroy();
            }
        });
    }
}
||||
| 272 |