Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
1 | <?php |
||
/**
 * Byte-oriented input stream for the HTML5 tokenizer.
 *
 * The constructor pre-processes the raw input (UTF-8 sanitising, BOM removal,
 * NULL replacement, newline normalisation) and records the parse errors it
 * finds in $errors. The remaining methods hand the processed bytes out to the
 * caller and report the current line/column.
 *
 * Error entries reference HTML5_Tokenizer::PARSEERROR, which is declared
 * outside this class.
 */
class HTML5_InputStream {
    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char === $EOF, we are at the
     * end-of-file.
     */
    private $EOF;

    /**
     * Parse errors. Each entry is array('type' => ..., 'data' => ...).
     */
    public $errors = array();

    /**
     * @param $data Data to parse
     * @throws Exception When the input cannot be sanitised as UTF-8 because
     *                   neither iconv nor mbstring is able to do so.
     */
    public function __construct($data) {
        /* Given an encoding, the bytes in the input stream must be
        converted to Unicode characters for the tokeniser, as
        described by the rules for that encoding, except that the
        leading U+FEFF BYTE ORDER MARK character, if any, must not
        be stripped by the encoding layer (it is stripped by the rule below).

        Bytes or sequences of bytes in the original byte stream that
        could not be converted to Unicode characters must be converted
        to U+FFFD REPLACEMENT CHARACTER code points. */

        // XXX currently assuming input data is UTF-8; once we
        // build encoding detection this will no longer be the case
        $data = self::sanitizeUtf8($data);

        /* One leading U+FEFF BYTE ORDER MARK character must be
        ignored if any are present. */
        if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
            // Cast: older PHP returns false when nothing follows the BOM.
            $data = (string) substr($data, 3);
        }

        /* All U+0000 NULL characters in the input must be replaced
        by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
        characters is a parse error. */
        $this->recordErrors('null-character', substr_count($data, "\0"));

        /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
        (LF) characters are treated specially. Any CR characters
        that are followed by LF characters must be removed, and any
        CR characters not followed by LF characters must be converted
        to LF characters. Thus, newlines in HTML DOMs are represented
        by LF characters, and there are never any CR characters in the
        input to the tokenization stage. */
        $data = str_replace(
            array("\0", "\r\n", "\r"),
            array("\xEF\xBF\xBD", "\n", "\n"),
            $data
        );

        /* Any occurrences of any characters in the ranges U+0001 to
        U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
        U+D800 to U+DFFF, U+FDD0 to U+FDEF, and the characters
        U+nFFFE and U+nFFFF for every plane n from 0 to 10 (hex)
        are parse errors. (These are all control characters
        or permanently undefined Unicode characters.) */
        if (extension_loaded('pcre')) {
            // The pattern works on raw bytes (no /u modifier). In the last
            // alternative the second byte of a plane-n xFFFE/xFFFF sequence
            // can only be 8F, 9F, AF or BF; a full 8F-BF range would also
            // flag valid code points such as U+10FFE.
            $count = preg_match_all(
                '/(?:
                    [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B,  U+000E to U+001F and U+007F
                |
                    \xC2[\x80-\x9F] # U+0080 to U+009F
                |
                    \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFF
                |
                    \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
                |
                    \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
                |
                    [\xF0-\xF4][\x8F\x9F\xAF\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
                )/x',
                $data,
                $matches
            );
            // preg_match_all() yields false on failure; the helper's loop
            // then simply records nothing.
            $this->recordErrors('invalid-codepoint', $count);
        } else {
            // XXX: Need non-PCRE impl, probably using substr_count
        }

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Drops byte sequences that are not valid UTF-8.
     *
     * iconv is tried first. Its //IGNORE mode can return false on invalid
     * input, so mbstring is used as a fallback with substitution disabled,
     * which also drops the offending bytes. Both are non-conforming: the
     * specification asks for U+FFFD instead of removal.
     *
     * @param $data Raw input bytes
     * @return string Input with invalid sequences removed
     * @throws Exception When no available extension can sanitise the input
     */
    private static function sanitizeUtf8($data) {
        if (extension_loaded('iconv')) {
            // Suppress the notice iconv raises for illegal sequences.
            $clean = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
            if ($clean !== false) {
                return $clean;
            }
        }

        if (extension_loaded('mbstring')) {
            // The substitute character is global state: save and restore it.
            $previous = mb_substitute_character();
            mb_substitute_character('none');
            $clean = mb_convert_encoding($data, 'UTF-8', 'UTF-8');
            mb_substitute_character($previous);
            return (string) $clean;
        }

        if (extension_loaded('iconv')) {
            throw new Exception('Unable to sanitise input as UTF-8, please install mbstring');
        }
        // we can make a conforming native implementation
        throw new Exception('Not implemented, please install mbstring or iconv');
    }

    /**
     * Appends $count parse errors carrying the error code $code.
     *
     * @param $code  Error identifier stored under the 'data' key
     * @param $count Number of entries to append
     */
    private function recordErrors($code, $count) {
        for ($i = 0; $i < $count; $i++) {
            $this->errors[] = array(
                'type' => HTML5_Tokenizer::PARSEERROR,
                'data' => $code
            );
        }
    }

    /**
     * Returns the byte position clamped to the range 0..$EOF.
     *
     * char() may push $char past $EOF, so line/column lookups use this
     * bounded value.
     */
    private function boundedPosition() {
        return max(0, min($this->char, $this->EOF));
    }

    /**
     * Returns the current line that the tokenizer is at (1-based).
     */
    public function getCurrentLine() {
        $position = $this->boundedPosition();
        if ($position === 0) {
            // Nothing consumed (or empty input): we are on the first line.
            // This also avoids calling substr_count() with a zero length,
            // which older PHP versions warn about.
            return 1;
        }
        return substr_count($this->data, "\n", 0, $position) + 1;
    }

    /**
     * Returns the current column of the current line that the tokenizer is
     * at, counted in characters consumed since the last line feed.
     */
    public function getColumnOffset() {
        $position = $this->boundedPosition();
        if ($position === 0) {
            // Nothing consumed yet. The strrpos() offset below would be out
            // of range here, so answer directly.
            return 0;
        }

        // A negative strrpos() offset searches backwards and includes the
        // byte at that offset, so this finds the last "\n" at an index
        // <= $position - 1. The offset is always within -$EOF..-1.
        $lastLine = strrpos($this->data, "\n", $position - 1 - $this->EOF);

        // Take the bytes between that line feed (exclusive) and the next
        // byte to be processed (exclusive).
        if ($lastLine !== false) {
            $findLengthOf = (string) substr($this->data, $lastLine + 1, $position - 1 - $lastLine);
        } else {
            $findLengthOf = (string) substr($this->data, 0, $position);
        }

        // Get the character length of the string we need.
        if (extension_loaded('iconv')) {
            return iconv_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('mbstring')) {
            return mb_strlen($findLengthOf, 'utf-8');
        }

        // Native fallback: count ASCII bytes plus UTF-8 lead bytes.
        $count = count_chars($findLengthOf);
        // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
        // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
        return array_sum(array_slice($count, 0, 0x80)) +
               array_sum(array_slice($count, 0xC2, 0x33));
    }

    /**
     * Retrieve the currently consumed character (one byte).
     *
     * The position is advanced even at end-of-file, so after a false return
     * $char is greater than $EOF and unget() becomes a no-op.
     *
     * @note This performs bounds checking
     * @return string|false The byte read, or false at end-of-file
     */
    public function char() {
        return ($this->char++ < $this->EOF)
            ? $this->data[$this->char - 1]
            : false;
    }

    /**
     * Get all characters until EOF and move the position to EOF.
     *
     * @note This performs bounds checking
     * @return string|false The remaining bytes, or false at end-of-file
     */
    public function remainingChars() {
        if ($this->char >= $this->EOF) {
            return false;
        }
        $rest = substr($this->data, $this->char);
        $this->char = $this->EOF;
        return $rest;
    }

    /**
     * Returns the next $len bytes and advances the position past them.
     *
     * @param $len Number of bytes to consume
     * @return string The consumed bytes (possibly empty)
     */
    private function consume($len) {
        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;
        return $string;
    }

    /**
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring.
     *
     * @param $bytes Bytes to match.
     * @param $max   Optional maximum number of bytes to examine
     * @return string|false The matched bytes, or false at end-of-file
     */
    public function charsUntil($bytes, $max = null) {
        if ($this->char >= $this->EOF) {
            return false;
        }
        // Only forward $max when the caller supplied a usable value.
        if ($max === 0 || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }
        return $this->consume($len);
    }

    /**
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring.
     *
     * @param $bytes Bytes to match.
     * @param $max   Optional maximum number of bytes to examine
     * @return string|false The matched bytes, or false at end-of-file
     */
    public function charsWhile($bytes, $max = null) {
        if ($this->char >= $this->EOF) {
            return false;
        }
        // Only forward $max when the caller supplied a usable value.
        if ($max === 0 || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }
        return $this->consume($len);
    }

    /**
     * Unconsume one character.
     *
     * Does nothing at the start of the input (the position never goes
     * negative) or once the position has moved past $EOF.
     */
    public function unget() {
        if ($this->char > 0 && $this->char <= $this->EOF) {
            $this->char--;
        }
    }
}
||
285 |