Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it has been duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
| 1 | <?php |
||
/**
 * Byte-oriented input stream over a UTF-8 document.
 *
 * The constructor normalises the raw input (invalid-byte removal via iconv,
 * BOM stripping, NUL replacement, newline normalisation) and records parse
 * errors in $errors. The remaining methods consume the normalised data one
 * byte, or one run of bytes, at a time and report line/column positions.
 */
class HTML5_InputStream {
    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data.
     *
     * char() increments this even when it returns false, so the value may
     * exceed $EOF once the end of the input has been read.
     */
    private $char;

    /**
     * Length of $data; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * @param $data Data to parse
     * @throws Exception If the iconv extension is not loaded.
     */
    public function __construct($data) {

        /* Given an encoding, the bytes in the input stream must be
        converted to Unicode characters for the tokeniser, as
        described by the rules for that encoding, except that the
        leading U+FEFF BYTE ORDER MARK character, if any, must not
        be stripped by the encoding layer (it is stripped by the rule below).

        Bytes or sequences of bytes in the original byte stream that
        could not be converted to Unicode characters must be converted
        to U+FFFD REPLACEMENT CHARACTER code points. */

        // XXX currently assuming input data is UTF-8; once we
        // build encoding detection this will no longer be the case
        //
        // We previously had an mbstring implementation here, but that
        // implementation is heavily non-conforming, so it's been
        // omitted.
        if (extension_loaded('iconv')) {
            // non-conforming: invalid sequences are dropped (//IGNORE)
            // rather than replaced with U+FFFD.
            // NOTE(review): iconv() is documented to return false on
            // failure; that value is not checked here and would be treated
            // as an empty document by the string functions below.
            $data = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
        } else {
            // we can make a conforming native implementation
            // Only iconv is consulted above, so it is the only extension
            // whose installation resolves this error.
            throw new Exception('Not implemented, please install iconv');
        }

        /* One leading U+FEFF BYTE ORDER MARK character must be
        ignored if any are present. */
        if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
            $data = substr($data, 3);
        }

        /* All U+0000 NULL characters in the input must be replaced
        by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
        characters is a parse error. */
        for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
            $this->errors[] = array(
                'type' => HTML5_Tokenizer::PARSEERROR,
                'data' => 'null-character'
            );
        }

        /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
        (LF) characters are treated specially. Any CR characters
        that are followed by LF characters must be removed, and any
        CR characters not followed by LF characters must be converted
        to LF characters. Thus, newlines in HTML DOMs are represented
        by LF characters, and there are never any CR characters in the
        input to the tokenization stage. */
        // "\r\n" is listed before "\r" so that a CRLF pair collapses to a
        // single LF instead of two.
        $data = str_replace(
            array(
                "\0",
                "\r\n",
                "\r"
            ),
            array(
                "\xEF\xBF\xBD",
                "\n",
                "\n"
            ),
            $data
        );

        /* Any occurrences of any characters in the ranges U+0001 to
        U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
        U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
        characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
        U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
        U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
        U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
        U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
        U+10FFFF are parse errors. (These are all control characters
        or permanently undefined Unicode characters.) */
        // Check PCRE is loaded.
        if (extension_loaded('pcre')) {
            // The pattern works on raw bytes (no /u modifier); each
            // alternative spells out the UTF-8 encoding of a range above.
            $count = preg_match_all(
                '/(?:
                    [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
                |
                    \xC2[\x80-\x9F] # U+0080 to U+009F
                |
                    \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFF
                |
                    \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
                |
                    \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
                |
                    [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
                )/x',
                $data,
                $matches
            );
            for ($i = 0; $i < $count; $i++) {
                $this->errors[] = array(
                    'type' => HTML5_Tokenizer::PARSEERROR,
                    'data' => 'invalid-codepoint'
                );
            }
        } else {
            // XXX: Need non-PCRE impl, probably using substr_count
        }

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return int 1-based line number: one more than the number of "\n"
     *             bytes consumed so far.
     */
    public function getCurrentLine() {
        // $char may have run past $EOF (see char()), so clamp it.
        $consumed = min($this->char, $this->EOF);

        // Nothing consumed yet (or empty input): we are on the first line.
        // Returning early also avoids a zero-length substr_count() call.
        if ($consumed <= 0) {
            return 1;
        }

        return substr_count($this->data, "\n", 0, $consumed) + 1;
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * @return int Number of characters consumed since the last consumed
     *             "\n" (or since the start of the input if there is none).
     */
    public function getColumnOffset() {
        // $char may have run past $EOF (see char()), so clamp it; this also
        // keeps the strrpos() offset below within the bounds of the string.
        $consumed = min($this->char, $this->EOF);

        // Short circuit before anything has been consumed. The offset
        // computed below would otherwise be -(length + 1), which strrpos()
        // rejects (ValueError as of PHP 8.0, a warning before that).
        if ($consumed <= 0) {
            return 0;
        }

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n at or before the last consumed byte).
        // No extra one is added because strrpos's behaviour includes the
        // final offset byte.
        $lastLine = strrpos($this->data, "\n", $consumed - 1 - $this->EOF);

        // Take everything consumed after that newline, or everything
        // consumed so far when there is no newline yet.
        if ($lastLine !== false) {
            $findLengthOf = (string) substr($this->data, $lastLine + 1, $consumed - 1 - $lastLine);
        } else {
            $findLengthOf = (string) substr($this->data, 0, $consumed);
        }

        // Get the length (in characters, not bytes) for the string we need.
        if (extension_loaded('iconv')) {
            return iconv_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('mbstring')) {
            return mb_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('xml')) {
            return strlen(utf8_decode($findLengthOf));
        } else {
            // Count only bytes that start a character: ASCII bytes plus
            // multi-byte lead bytes; continuation bytes are skipped.
            $count = count_chars($findLengthOf);
            // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
            // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
            return array_sum(array_slice($count, 0, 0x80)) +
                   array_sum(array_slice($count, 0xC2, 0x33));
        }
    }

    /**
     * Retrieve the currently consumed character.
     *
     * The position is advanced on every call, including calls made at or
     * after end-of-file.
     *
     * @note This performs bounds checking
     * @return string|false The next byte, or false at end-of-file.
     */
    public function char() {
        return ($this->char++ < $this->EOF)
            ? $this->data[$this->char - 1]
            : false;
    }

    /**
     * Get all characters until EOF.
     *
     * @note This performs bounds checking
     * @return string|false The unconsumed remainder, or false if nothing is
     *                      left.
     */
    public function remainingChars() {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;
            return $data;
        } else {
            return false;
        }
    }

    /**
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring.
     *
     * @param $bytes Bytes to match.
     * @param $max Optional maximum number of bytes to consume.
     * @return string|false The consumed bytes (possibly ''), or false at
     *                      end-of-file.
     */
    public function charsUntil($bytes, $max = null) {
        return $this->consumeSpan($bytes, $max, true);
    }

    /**
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring.
     *
     * @param $bytes Bytes to match.
     * @param $max Optional maximum number of bytes to consume.
     * @return string|false The consumed bytes (possibly ''), or false at
     *                      end-of-file.
     */
    public function charsWhile($bytes, $max = null) {
        return $this->consumeSpan($bytes, $max, false);
    }

    /**
     * Unconsume one character.
     *
     * Does nothing when the position is already past end-of-file or at the
     * very start of the input.
     */
    public function unget() {
        // The lower bound keeps $char from going negative; a negative
        // position would make char() index $data from its end.
        if ($this->char > 0 && $this->char <= $this->EOF) {
            $this->char--;
        }
    }

    /**
     * Shared implementation of charsUntil() and charsWhile(): consumes the
     * run of bytes at the current position that are inside (or, when
     * $complement is true, outside) the set $bytes.
     *
     * @param $bytes Set of bytes delimiting the run.
     * @param $max Maximum run length, or null for no limit.
     * @param $complement True to measure with strcspn(), false for strspn().
     * @return string|false The consumed bytes, or false at end-of-file.
     */
    private function consumeSpan($bytes, $max, $complement) {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length when the caller supplied one: the length
        // argument of strspn()/strcspn() became nullable only in PHP 8.0.
        $limited = ($max === 0 || $max);

        if ($complement) {
            $len = $limited
                ? strcspn($this->data, $bytes, $this->char, $max)
                : strcspn($this->data, $bytes, $this->char);
        } else {
            $len = $limited
                ? strspn($this->data, $bytes, $this->char, $max)
                : strspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;
        return $string;
    }
}
||
| 285 |