Completed
Pull Request — master (#182)
by
unknown
01:57
created

Normalizer::loadHtml()   B

Complexity

Conditions 9
Paths 6

Size

Total Lines 31

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 21
CRAP Score 9

Importance

Changes 0
Metric Value
dl 0
loc 31
ccs 21
cts 21
cp 1
rs 8.0555
c 0
b 0
f 0
cc 9
nc 6
nop 1
crap 9
1
<?php
2
3
namespace Masterminds\HTML5\Parser;
4
5
/**
6
 * Normalizes HTML.
7
 *
8
 * This class adds missing root elements, namely <html>, <head>, <body>. <!DOCTYPE> can optionally be added
9
 * if specified in the tree structure - by default this is disabled.
10
 *
11
 * This library treats input HTML as a document fragment rather than a complete document (even if it has a DOCTYPE).
12
 * DOMDocument automatically adds missing root elements so this class aims to replicate that functionality.
13
 *
14
 * @author Kieran Brahney <[email protected]>
15
 * @see    https://github.com/Masterminds/html5-php/issues/166
16
 */
17
class Normalizer
{
    /**
     * Structure of a basic HTML document.
     *
     * 'doctype' holds the raw doctype tag (empty string when none was seen).
     * Each root element keeps the raw text of its start tag, its end tag and
     * an ordered list of the raw nodes (tags or text runs) that belong to it.
     *
     * @var array
     */
    protected $tree = array(
        'doctype' => '',
        'html' => array(
            'start' => '<html>',
            'end' => '</html>',
            'content' => array(),
        ),
        'head' => array(
            'start' => '<head>',
            'end' => '</head>',
            'content' => array(),
        ),
        'body' => array(
            'start' => '<body>',
            'end' => '</body>',
            'content' => array(),
        ),
    );

    /**
     * What root element did we last add to.
     *
     * Null means "no open root element"; nodes then default to 'body'.
     *
     * @var string|null
     */
    protected $previousKey = null;

    /**
     * Parse a HTML document.
     *
     * The input is split into tag nodes (from '<' up to and including the
     * first following '>', plus any whitespace directly after it) and text
     * nodes (everything up to the next '<'). Each node is handed to
     * addToTree(). A tag that is never closed gets a '>' appended.
     *
     * @param  string $html
     * @return void
     */
    public function loadHtml($html)
    {
        $html = (string) $html;
        $len = \strlen($html);
        $i = 0;

        while ($i < $len) {
            if ($html[$i] !== '<') {
                // Text run: everything up to the next tag (or end of input).
                $next = \strpos($html, '<', $i);
                if ($next === false) {
                    $next = $len;
                }

                $this->addToTree(\substr($html, $i, $next - $i));
                $i = $next;

                continue;
            }

            // Found a tag, take everything up to its closing '>'.
            $close = \strpos($html, '>', $i);
            if ($close === false) {
                // Missing closing '>': consume the rest of the input and close the tag.
                $this->addToTree(\substr($html, $i) . '>');

                break;
            }

            // Keep whitespace that directly follows the tag attached to the tag
            // node; nodeName() strips it again when the tag name is extracted.
            $end = $close + 1;
            if (\preg_match('/\G\s+/', $html, $match, 0, $end)) {
                $end += \strlen($match[0]);
            }

            $this->addToTree(\substr($html, $i, $end - $i));
            $i = $end;
        }
    }

    /**
     * Format the document in a structured way: doctype (if any), <html>,
     * the <head> section, the <body> section and the closing </html>.
     *
     * @return string
     */
    public function saveHtml()
    {
        $head = $this->tree['head'];
        $body = $this->tree['body'];

        return $this->tree['doctype']
            . $this->tree['html']['start']
            . $head['start'] . \implode('', $head['content']) . $head['end']
            . $body['start'] . \implode('', $body['content']) . $body['end']
            . $this->tree['html']['end'];
    }

    /**
     * Add a node into the tree for the correct parent.
     *
     * Root tags (doctype, html, head, body) are routed to their own slot;
     * every other tag and all text go to the root element that was last
     * added to, falling back to 'body'.
     *
     * @param string $node
     * @return void
     */
    protected function addToTree($node)
    {
        $node = (string) $node;
        if ($node === '') {
            return;
        }

        $current = isset($this->previousKey) ? $this->previousKey : 'body';

        if ($node[0] !== '<') {
            // text node
            $this->addTo($current, $node, true);

            return;
        }

        switch (\strtolower($this->nodeName($node))) {
            case '!doctype':
                // Don't overwrite if we've already got a doctype definition.
                if (empty($this->tree['doctype'])) {
                    $this->tree['doctype'] = $node;
                }

                return;

            case 'html':
                $this->addTo('html', $node, false);

                return;

            case 'head':
                $this->addTo('head', $node, true);

                return;

            case 'body':
                // Route explicitly so that a <body> tag following an unclosed
                // <head> does not end up inside the head content.
                $this->addTo('body', $node, true);

                return;

            default:
                $this->addTo($current, $node, true);

                return;
        }
    }

    /**
     * Add a node to the the tree.
     *
     * When the node is the start or end tag of the root element $key it
     * replaces the stored start/end tag; anything else is appended to the
     * element's content. Seeing the end tag clears the "current" element.
     *
     * @param  string $key
     * @param  string $node
     * @param  bool   $setPrevious
     * @return void
     */
    protected function addTo($key, $node, $setPrevious)
    {
        $previousKey = $key;

        // Compare the parsed tag name instead of searching for substrings, so
        // that e.g. <header> or an attribute value ending in "/body" is not
        // mistaken for a root tag.
        $isOwnTag = $node !== ''
            && $node[0] === '<'
            && \strtolower($this->nodeName($node)) === \strtolower($key);

        if (!$isOwnTag) {
            $this->tree[$key]['content'][] = $node;
        } elseif (\strncmp($node, '</', 2) === 0) {
            $this->tree[$key]['end'] = $node;
            $previousKey = null;
        } else {
            $this->tree[$key]['start'] = $node;
        }

        if ($setPrevious) {
            $this->previousKey = $previousKey;
        }
    }

    /**
     * Get the name of a node without </>
     *
     * Leading '<' and '/' characters, the closing '>' and any whitespace after
     * it are removed; the name is whatever precedes the first whitespace
     * character (space, tab, newline, ...).
     *
     * @param string $node
     * @return string
     */
    protected function nodeName($node)
    {
        $name = \preg_replace('/>\s*/', '', \ltrim($node, '</'));

        // Attributes may be separated from the name by any whitespace, not just a space.
        $chunks = \preg_split('/\s+/', (string) $name, 2);

        return $chunks === false ? (string) $name : $chunks[0];
    }
}
204