Completed
Push — master ( e162f1...95ce61 )
by Richard
13s
created

HTMLPurifier_Lexer_DOMLex::wrapHTML()   C

Complexity

Conditions 7
Paths 20

Size

Total Lines 26
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 18
nc 20
nop 4
dl 0
loc 26
rs 6.7272
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * Parser that uses PHP 5's DOM extension (part of the core).
5
 *
6
 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
7
 * It gives us a forgiving HTML parser, which we use to transform the HTML
8
 * into a DOM, and then into the tokens.  It is blazingly fast (for large
9
 * documents, it performs twenty times faster than
10
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
11
 *
12
 * @note Any empty elements will have empty tokens associated with them, even if
13
 * this is prohibited by the spec. This cannot be fixed until the spec
14
 * comes into play.
15
 *
16
 * @note PHP's DOM extension does not actually parse any entities, we use
17
 *       our own function to do that.
18
 *
19
 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
20
 *          If this is a huge problem, due to the fact that HTML is hand
21
 *          edited and you are unable to get a parser cache that caches the
22
 *          output of HTML Purifier while keeping the original HTML lying
23
 *          around, you may want to run Tidy on the resulting output or use
24
 *          HTMLPurifier_Lexer_DirectLex
25
 */
26
27
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
28
{
29
30
    /**
31
     * @type HTMLPurifier_TokenFactory
32
     */
33
    private $factory;
34
35
    public function __construct()
    {
        // Let the base lexer initialise itself first.
        parent::__construct();

        // Every token this lexer emits is built through this factory.
        $this->factory = new HTMLPurifier_TokenFactory();
    }
41
42
    /**
43
     * @param string $html
44
     * @param HTMLPurifier_Config $config
45
     * @param HTMLPurifier_Context $context
46
     * @return HTMLPurifier_Token[]
47
     */
48
    public function tokenizeHTML($html, $config, $context)
    {
        $markup = $this->normalize($html, $config, $context);

        // Optionally armor stray angled brackets that cannot possibly form
        // tags and are therefore probably being used as emoticons.
        if ($config->get('Core.AggressivelyFixLt')) {
            $nonTagChar = '[^a-z!\/]';
            $commentPattern = "/<!--(.*?)(-->|\z)/is";

            // Entity-ize ampersands inside comments first, so that the undo
            // pass at the end does not clobber entities that were already there.
            $markup = preg_replace_callback(
                $commentPattern,
                array($this, 'callbackArmorCommentEntities'),
                $markup
            );

            // Keep escaping until a pass changes nothing.
            do {
                $before = $markup;
                $markup = preg_replace("/<($nonTagChar)/i", '&lt;\\1', $markup);
            } while ($markup !== $before);

            // Restore the original content of comments.
            $markup = preg_replace_callback(
                $commentPattern,
                array($this, 'callbackUndoCommentSubst'),
                $markup
            );
        }

        // Wrap the fragment in a full document; essential for UTF-8.
        $wrapped = $this->wrapHTML($markup, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the wrapper has this covered

        // Silence anything reported while the document is being parsed.
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($wrapped);
        restore_error_handler();

        $htmlElement = $doc->getElementsByTagName('html')->item(0); // <html>
        $body = $htmlElement->getElementsByTagName('body')->item(0); // <body>
        $div = $body->getElementsByTagName('div')->item(0);          // <div>

        $tokens = array();
        $this->tokenizeDOM($div, $tokens, $config);

        // A sibling after the wrapper div means the input contained a
        // premature </div>. Drop the div that was already tokenized and
        // tokenize what is left of the body; tokenizing the sibling directly
        // would lose its own tags.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens, $config);
        }

        return $tokens;
    }
91
92
    /**
93
     * Iterative function that tokenizes a node, putting it into an accumulator.
94
     * To iterate is human, to recurse divine - L. Peter Deutsch
95
     * @param DOMNode $node DOMNode to be tokenized.
96
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
97
     * @return void The node's tokens are appended to $tokens, which is passed by reference.
0 ignored issues
show
Documentation introduced by
Should the return type not be HTMLPurifier_Token|null?

This check compares the return type specified in the @return annotation of a function or method doc comment with the types returned by the function and raises an issue if they mismatch.

Loading history...
98
     */
99
    protected function tokenizeDOM($node, &$tokens, $config)
    {
        // One FIFO queue of pending nodes per nesting depth; depth 0 holds
        // only the node that was passed in.
        $depth = 0;
        $queues = array($depth => new HTMLPurifier_Queue(array($node)));
        // Nodes whose end token still has to be emitted, grouped by depth.
        $pendingEnds = array();

        do {
            while (!$queues[$depth]->isEmpty()) {
                $current = $queues[$depth]->shift(); // FIFO

                // The node at depth 0 is never collected itself; only its
                // descendants produce start/empty tokens.
                $collect = $depth > 0;
                if ($this->createStartNode($current, $tokens, $collect, $config)) {
                    $pendingEnds[$depth][] = $current;
                }

                // Descend: the next loop iteration drains the children's queue.
                $children = $current->childNodes;
                if ($children && $children->length) {
                    $depth++;
                    $queues[$depth] = new HTMLPurifier_Queue();
                    foreach ($children as $child) {
                        $queues[$depth]->push($child);
                    }
                }
            }

            // The queue at this depth is exhausted: step back up and close
            // whatever was opened at the parent depth.
            $depth--;
            if ($depth && isset($pendingEnds[$depth])) {
                while ($current = array_pop($pendingEnds[$depth])) {
                    $this->createEndNode($current, $tokens);
                }
            }
        } while ($depth > 0);
    }
128
129
    /**
130
     * @param DOMNode $node DOMNode to be tokenized.
131
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
132
     * @param bool $collect  Says whether or not start and close tokens are collected, set to
133
     *                    false at first recursion because it's the implicit DIV
134
     *                    tag you're dealing with.
135
     * @return bool if the token needs an endtoken
136
     * @todo data and tagName properties don't seem to exist in DOMNode?
137
     */
138
    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        // Intercept non-element nodes. WE MUST catch all of them; character
        // reference nodes are not expected here because those should have
        // been preprocessed.
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        }

        if ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // Undo libxml's special treatment of <script> and <style> tags.
            $last = end($tokens);
            $data = $node->data;
            $insideRawTextElement = $last instanceof HTMLPurifier_Token_Start
                && in_array($last->name, array('script', 'style'), true);
            if ($insideRawTextElement) {
                $trimmed = trim($data);
                if (substr($trimmed, 0, 4) === '<!--') {
                    // Strip the leading comment opener and, when present,
                    // the matching closer. An opener without a closer is
                    // highly suspicious, but the data is kept as it is.
                    $data = substr($trimmed, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        }

        if ($node->nodeType === XML_COMMENT_NODE) {
            // This code is only invoked for comments in script/style in
            // versions of libxml pre-2.6.28 (regular comments, of course,
            // are still handled regularly).
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        }

        if ($node->nodeType !== XML_ELEMENT_NODE) {
            // Not well tested: there may be other node types we have to grab.
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // An element without child nodes becomes a single empty token and
        // needs no end token; any other element gets a start token now and
        // an end token later.
        $hasChildren = (bool) $node->childNodes->length;
        if ($collect) {
            $tokens[] = $hasChildren
                ? $this->factory->createStart($node->tagName, $attr)
                : $this->factory->createEmpty($node->tagName, $attr);
        }

        return $hasChildren;
    }
193
194
    /**
195
     * @param DOMNode $node
196
     * @param HTMLPurifier_Token[] $tokens
197
     */
198
    protected function createEndNode($node, &$tokens)
    {
        // Build the closing token for this element and append it to the
        // caller's accumulator.
        $endToken = $this->factory->createEnd($node->tagName);
        $tokens[] = $endToken;
    }
202
203
204
    /**
205
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
206
     *
207
     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
208
     * @return array Associative array of attributes.
209
     */
210
    protected function transformAttrToAssoc($node_map)
    {
        // Relies on the map being iterable and exposing a ->length property.
        $assoc = array();
        if ($node_map->length !== 0) {
            foreach ($node_map as $attribute) {
                // Attribute name => attribute value.
                $assoc[$attribute->name] = $attribute->value;
            }
        }
        return $assoc;
    }
224
225
    /**
226
     * An error handler that mutes all errors
227
     * @param int $errno
228
     * @param string $errstr
229
     */
230
    public function muteErrorHandler($errno, $errstr)
    {
        // Intentionally empty. tokenizeHTML() installs this handler around
        // DOMDocument::loadHTML() so that whatever is reported while parsing
        // is discarded; both arguments are deliberately ignored.
    }
233
234
    /**
235
     * Callback function for undoing escaping of stray angled brackets
236
     * in comments
237
     * @param array $matches
238
     * @return string
239
     */
240
    public function callbackUndoCommentSubst($matches)
    {
        // $matches[1] is the comment body, $matches[2] the terminator that
        // was matched (which may be empty). strtr() replaces in a single
        // pass, so the "&" produced from "&amp;" is never re-read as the
        // start of "&lt;".
        $restored = strtr($matches[1], array('&amp;' => '&', '&lt;' => '<'));

        return '<!--' . $restored . $matches[2];
    }
244
245
    /**
246
     * Callback function that entity-izes ampersands in comments so that
247
     * callbackUndoCommentSubst doesn't clobber them
248
     * @param array $matches
249
     * @return string
250
     */
251
    public function callbackArmorCommentEntities($matches)
    {
        // Escape every ampersand in the comment body ($matches[1]); the
        // matched terminator ($matches[2]) is passed through untouched.
        $armored = str_replace('&', '&amp;', $matches[1]);

        return '<!--' . $armored . $matches[2];
    }
255
256
    /**
257
     * Wraps an HTML fragment in the necessary HTML
258
     * @param string $html
259
     * @param HTMLPurifier_Config $config
260
     * @param HTMLPurifier_Context $context
261
     * @return string
262
     */
263
    protected function wrapHTML($html, $config, $context, $use_div = true)
    {
        $def = $config->getDefinition('HTML');
        $hasPublicId = !empty($def->doctype->dtdPublic);
        $hasSystemId = !empty($def->doctype->dtdSystem);

        $parts = array();

        // Emit a DOCTYPE only when the definition carries at least one
        // identifier for it.
        if ($hasPublicId || $hasSystemId) {
            $doctype = '<!DOCTYPE html ';
            if ($hasPublicId) {
                $doctype .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if ($hasSystemId) {
                $doctype .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $parts[] = $doctype . '>';
        }

        $parts[] = '<html><head>';
        $parts[] = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        $parts[] = '</head><body>';
        // No protection if $html contains a stray </div>!
        $parts[] = $use_div ? '<div>' . $html . '</div>' : $html;
        $parts[] = '</body></html>';

        return implode('', $parts);
    }
289
}
290
291
// vim: et sw=4 sts=4
292