Passed
Push — master ( a82f13...c7ab15 )
by Andrew
01:31
created

Document::loadHTML()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 23
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 0
Metric Value
eloc 8
dl 0
loc 23
ccs 0
cts 0
cp 0
rs 10
c 0
b 0
f 0
cc 3
nc 4
nop 2
crap 12
1
<?php declare(strict_types=1);
2
3
namespace DOMWrap;
4
5
use DOMWrap\Traits\{
6
    CommonTrait,
7
    TraversalTrait,
8
    ManipulationTrait
9
};
10
11
/**
 * Document Node
 *
 * A \DOMDocument subclass that registers the DOMWrap node classes for the
 * nodes it creates and exposes the shared DOMWrap helper API through the
 * Common, Traversal and Manipulation traits.
 *
 * @package DOMWrap
 * @license http://opensource.org/licenses/BSD-3-Clause BSD 3 Clause
 */
class Document extends \DOMDocument
{
    use CommonTrait;
    use TraversalTrait;
    use ManipulationTrait;

    /** @var int Bitmask of LIBXML_* options that setHtml() forwards to loadHTML(). */
    protected $libxmlOptions = 0;

    /**
     * Create the document and register the DOMWrap replacement node classes.
     *
     * @param string $version  Version passed to \DOMDocument::__construct().
     * @param string $encoding Encoding passed to \DOMDocument::__construct().
     */
    public function __construct(string $version = '1.0', string $encoding = 'UTF-8') {
        parent::__construct($version, $encoding);

        // Make the DOM extension instantiate DOMWrap classes instead of the base ones.
        $this->registerNodeClass('DOMText', 'DOMWrap\\Text');
        $this->registerNodeClass('DOMElement', 'DOMWrap\\Element');
        $this->registerNodeClass('DOMComment', 'DOMWrap\\Comment');
        $this->registerNodeClass('DOMDocument', 'DOMWrap\\Document');
        $this->registerNodeClass('DOMDocumentType', 'DOMWrap\\DocumentType');
        $this->registerNodeClass('DOMProcessingInstruction', 'DOMWrap\\ProcessingInstruction');
    }

    /**
     * Set libxml options.
     *
     * Multiple values must use bitwise OR.
     * eg: LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD
     *
     * @link http://php.net/manual/en/libxml.constants.php
     *
     * @param int $libxmlOptions
     */
    public function setLibxmlOptions(int $libxmlOptions): void {
        $this->libxmlOptions = $libxmlOptions;
    }

    /**
     * {@inheritdoc}
     *
     * A document is its own owning document, so this returns the instance itself.
     */
    public function document(): ?\DOMDocument {
        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Returns a node list containing only this document.
     */
    public function collection(): NodeList {
        return $this->newNodeList([$this]);
    }

    /**
     * {@inheritdoc}
     *
     * Returns the first node of the given list, or null when the list is empty.
     */
    public function result(NodeList $nodeList) {
        if ($nodeList->count()) {
            return $nodeList->first();
        }

        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function parent() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Returns the node list produced by newNodeList() with no nodes supplied.
     */
    public function parents() {
        return $this->newNodeList();
    }

    /**
     * {@inheritdoc}
     */
    public function replaceWith($newNode): self {
        // NOTE(review): this passes the document itself as the "old child" of
        // replaceChild(). A document is not normally a child of itself, so this
        // call looks like it cannot succeed - confirm the intended behaviour
        // before relying on it. Left unchanged to preserve the public contract.
        $this->replaceChild($newNode, $this);

        return $this;
    }

    /**
     * {@inheritdoc}
     *
     * Always null for a document.
     */
    public function _clone() {
        return null;
    }

    /**
     * {@inheritdoc}
     *
     * Delegates to getOuterHtml().
     */
    public function getHtml(): string {
        return $this->getOuterHtml();
    }

    /**
     * {@inheritdoc}
     *
     * Loads the given markup into this document using the configured libxml
     * options. Non-string or blank input is ignored. The global libxml
     * error-handling state is restored afterwards, even if loading throws.
     *
     * @param mixed $html Markup to load; anything but a non-blank string is a no-op.
     *
     * @return self
     */
    public function setHtml($html): self {
        if (!is_string($html) || trim($html) === '') {
            return $this;
        }

        $internalErrors = libxml_use_internal_errors(true);

        // libxml_disable_entity_loader() is deprecated as of PHP 8.0, where
        // external entity loading is already disabled by default. Only toggle
        // it on older runtimes.
        $disableEntities = \PHP_VERSION_ID < 80000 ? libxml_disable_entity_loader(true) : null;

        try {
            $this->loadHTML($this->convertToUtf8($html), $this->libxmlOptions);
        } finally {
            // Always hand the process-wide libxml settings back to the caller.
            libxml_use_internal_errors($internalErrors);

            if ($disableEntities !== null) {
                libxml_disable_entity_loader($disableEntities);
            }
        }

        return $this;
    }

    /**
     * Load HTML, working around libxml's single-root behaviour.
     *
     * @param string $html
     * @param int $options Bitmask of LIBXML_* options for this call.
     * @return bool Result of \DOMDocument::loadHTML().
     */
    public function loadHTML($html, $options = 0): bool {
        // Fix LibXML's crazy-ness RE root nodes
        // While importing HTML using the LIBXML_HTML_NOIMPLIED option LibXML insists
        //  on having one root node. All subsequent nodes are appended to this first node.
        // To counter this we will create a fake element, allow LibXML to 'do its thing'
        //  then undo it by taking the contents of the fake element, placing it back into
        //  the root and then remove our fake element.
        $useWrapper = (bool) ($options & LIBXML_HTML_NOIMPLIED);

        if ($useWrapper) {
            $html = '<domwrap></domwrap>' . $html;
        }

        $result = parent::loadHTML($html, $options);

        // Do our re-shuffling of nodes. This is keyed off the same $options that
        // decided whether the wrapper was added, so the two steps cannot disagree
        // when loadHTML() is called directly with options that differ from the
        // ones stored via setLibxmlOptions().
        if ($useWrapper) {
            $wrapper = $this->children()->first();

            // Nothing to unwrap if parsing produced no child node.
            if ($wrapper instanceof \DOMNode) {
                $wrapper->contents()->each(function ($node) {
                    $this->append($node);
                });

                $this->removeChild($wrapper);
            }
        }

        return $result;
    }

    /**
     * Extract the charset declared in a <meta> tag, upper-cased.
     *
     * Accepts double-quoted, single-quoted and unquoted values.
     *
     * @param string $html
     *
     * @return string|null Upper-cased charset name, or null when none is declared.
     */
    private function getCharset(string $html): ?string {
        $charset = null;

        // Stop at quotes, whitespace, "/" and ">" so that neither the quotes of
        // a single-quoted value nor the end of the tag leak into the charset name.
        if (preg_match('@<meta.*?charset=["\']?([^"\'\s/>]+)@im', $html, $matches)) {
            $charset = strtoupper($matches[1]);
        }

        return $charset;
    }

    /**
     * Convert markup to UTF-8 so it can be handed to the HTML parser.
     *
     * Input already detected as UTF-8 is returned untouched. Otherwise the
     * declared charset (if any) is rewritten to UTF-8 and the bytes are
     * converted with mbstring, falling back to iconv. When no usable charset
     * is known the markup is converted to HTML entities instead.
     *
     * @param string $html
     *
     * @return string
     */
    private function convertToUtf8(string $html): string {
        if (mb_detect_encoding($html, mb_detect_order(), true) === 'UTF-8') {
            return $html;
        }

        $charset = $this->getCharset($html);

        if ($charset !== null) {
            // Point the declaration at UTF-8, since that is what the bytes are about to become.
            $rewritten = preg_replace('@(charset=["\']?)([^"\'\s/>]+)([^"\']*["\']?)@im', '$1UTF-8$3', $html);

            // preg_replace() returns null on a PCRE error; keep the original markup then.
            if ($rewritten !== null) {
                $html = $rewritten;
            }

            $mbHasCharset = in_array($charset, array_map('strtoupper', mb_list_encodings()), true);

            if ($mbHasCharset) {
                $html = mb_convert_encoding($html, 'UTF-8', $charset);

            // Fallback to iconv if available.
            } elseif (extension_loaded('iconv')) {
                $htmlIconv = iconv($charset, 'UTF-8', $html);

                if ($htmlIconv !== false) {
                    $html = $htmlIconv;
                } else {
                    // Conversion failed; treat the charset as unknown.
                    $charset = null;
                }
            }
            // NOTE(review): when mbstring does not know the charset and iconv is
            // not loaded, the bytes are left unconverted - confirm this is intended.
        }

        if ($charset === null) {
            // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of PHP 8.2;
            // kept as-is to preserve the exact output.
            $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
        }

        return $html;
    }
}
207