| Conditions | 41 |
| Paths | 8496 |
| Total Lines | 302 |
| Code Lines | 182 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method

If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
| 1 | <?php |
||
/**
 * Tokenizes an HTML string into a flat list of HTMLPurifier_Token_* objects
 * (Text, Comment, Start, End and Empty) by scanning for '<' and '>'.
 *
 * While running, the current line/column are registered in $context under
 * 'CurrentLine' and 'CurrentCol'; both entries are destroyed before returning.
 * Position tracking is enabled by Core.MaintainLineNumbers, or, when that
 * directive is null, by Core.CollectErrors.
 *
 * @param string $html HTML to tokenize
 * @param HTMLPurifier_Config $config read for HTML.Trusted,
 *        Core.MaintainLineNumbers, Core.CollectErrors and
 *        Core.DirectLexLineNumberSyncInterval
 * @param HTMLPurifier_Context $context receives CurrentLine/CurrentCol and
 *        supplies ErrorCollector when error collection is on
 * @return array tokens in document order
 */
public function tokenizeHTML($html, $config, $context)
{
    // special normalization for script tags without any armor
    // our "armor" heuristic is a < sign any number of whitespaces after
    // the first script tag
    if ($config->get('HTML.Trusted')) {
        $html = preg_replace_callback(
            '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'),
            $html
        );
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    // This is also treated to mean maintain *column* numbers too
    $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

    if ($maintain_line_numbers === null) {
        // automatically determine line numbering by checking
        // if error collection is on
        $maintain_line_numbers = $config->get('Core.CollectErrors');
    }

    if ($maintain_line_numbers) {
        $current_line = 1;
        $current_col = 0;
        $length = strlen($html);
    } else {
        $current_line = false;
        $current_col = false;
        $length = false;
    }
    // NOTE(review): the loop below keeps writing to these two locals after
    // registering them, which presumably relies on register() binding by
    // reference -- confirm against HTMLPurifier_Context.
    $context->register('CurrentLine', $current_line);
    $context->register('CurrentCol', $current_col);
    $nl = "\n";
    // how often to manually recalculate. This will ALWAYS be right,
    // but it's pretty wasteful. Set to 0 to turn off
    $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // for testing synchronization
    $loops = 0;

    while (++$loops) {
        // $cursor is either at the start of a token, or inside of
        // a tag (i.e. there was a < immediately before it), as indicated
        // by $inside_tag

        if ($maintain_line_numbers) {
            // $rcursor, however, is always at the start of a token.
            $rcursor = $cursor - (int)$inside_tag;

            // Column number is cheap, so we calculate it every round.
            // We're interested at the *end* of the newline string, so
            // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
            // from our "rcursor" position.
            // The negative offset makes strrpos() look for the last newline
            // that starts at or before $rcursor.
            $nl_pos = strrpos($html, $nl, $rcursor - $length);
            $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

            // recalculate lines
            if ($synchronize_interval && // synchronization is on
                $cursor > 0 && // cursor is further than zero
                $loops % $synchronize_interval === 0) { // time to synchronize!
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }
        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        // special case to set up context
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $token = new
            HTMLPurifier_Token_Text(
                $this->parseText(
                    substr(
                        $html,
                        $cursor,
                        $position_next_lt - $cursor
                    ), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($html)) {
                break;
            }
            // Create Text of rest of string
            $token = new
            HTMLPurifier_Token_Text(
                $this->parseText(
                    substr(
                        $html,
                        $cursor
                    ), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // there's nothing to process!
                // NOTE(review): this token is never appended to $array and
                // $inside_tag stays true, so "<>" itself emits nothing here.
                // Left untouched because changing it would alter tokenizer
                // output -- confirm the intended behaviour before fixing.
                $token = new HTMLPurifier_Token_Text('<');
                $cursor++;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // somehow, we attempted to access beyond the end of
                // the string, defense-in-depth, reported by Nate Abele
                break;
            }

            // Check if it's a comment
            if (substr($segment, 0, 3) === '!--') {
                // re-determine segment length, looking for -->
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // uh oh, we have a comment that extends to
                    // infinity. Can't be helped: set comment
                    // end position to end of string
                    if ($e) {
                        $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    }
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                // skip the leading "!--" to keep only the comment body
                $token = new
                HTMLPurifier_Token_Comment(
                    substr(
                        $segment,
                        3,
                        $strlen_segment - 3
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // step over the closing "-->" unless the comment was unclosed
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = (strpos($segment, '/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check leading character is alphabetic, if not, we may
            // have accidentally grabbed an emoticon. Translate into
            // text and go our merry way
            if (!ctype_alpha($segment[0])) {
                // XML: $segment[0] !== '_' && $segment[0] !== ':'
                if ($e) {
                    $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                }
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    // Only record the position. $cursor is deliberately not
                    // advanced in this branch, so everything after the '<'
                    // is re-scanned as text on the next iteration and its
                    // newlines are counted there. Counting them here as well
                    // would add them to $current_line twice.
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Check if there are any attributes
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                // no whitespace in the segment: the whole segment is the tag name
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment,
                        $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                    $attribute_string,
                    $config,
                    $context
                );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside tag, but there's no ending > sign
            if ($e) {
                $e->send(E_WARNING, 'Lexer: Missing gt');
            }
            // emit the dangling '<' plus the remainder of the input as text
            $token = new
            HTMLPurifier_Token_Text(
                '<' .
                $this->parseText(
                    substr($html, $cursor), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            // no cursor scroll? Hmm...
            $array[] = $token;
            break;
        }
        // every branch above either continues or breaks; this is a safety net
        break;
    }

    $context->destroy('CurrentLine');
    $context->destroy('CurrentCol');
    return $array;
}
||
| 540 |