Complex classes like Tokenizer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Tokenizer, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 12 | class Tokenizer |
||
| 13 | { |
||
| 14 | protected $source; |
||
| 15 | protected $pos = 0; |
||
| 16 | protected $line = 1; |
||
| 17 | protected $lineStart = 0; |
||
| 18 | |||
| 19 | /** @var Token */ |
||
| 20 | protected $lookAhead; |
||
| 21 | |||
| 22 | 49 | public function setSource($source) |
|
| 27 | |||
| 28 | 49 | protected function next() |
|
| 37 | |||
/**
 * Advances the cursor past insignificant input: spaces, tabs, line breaks
 * and `#` comments.
 *
 * Each line break ("\n", "\r" or "\r\n") increments $this->line and sets
 * $this->lineStart to the offset of the first byte after the break.
 * A `#` comment runs up to, but does not include, the next "\r" or "\n".
 * Scanning stops at the first byte that is none of the above, or at the
 * end of the source.
 *
 * @return void
 */
protected function skipWhitespace()
{
    // The source does not change while scanning, so measure it once.
    $length = strlen($this->source);

    while ($this->pos < $length) {
        $ch = $this->source[$this->pos];

        if ($ch === ' ' || $ch === "\t") {
            $this->pos++;
        } elseif ($ch === '#') {
            // Step over the marker, then over the rest of the comment.
            // A NUL byte also ends the comment, which keeps the behaviour of
            // the previous implementation. The line break itself is left for
            // the next iteration so that line accounting happens in one place.
            $this->pos++;
            $this->pos += strcspn($this->source, "\r\n\0", $this->pos);
        } elseif ($ch === "\r" || $ch === "\n") {
            $this->pos++;

            // Treat "\r\n" as a single line break. The bounds check prevents
            // reading past the end when the source ends with a lone "\r".
            if ($ch === "\r" && $this->pos < $length && $this->source[$this->pos] === "\n") {
                $this->pos++;
            }

            $this->line++;
            $this->lineStart = $this->pos;
        } else {
            break;
        }
    }
}
|
| 69 | |||
| 70 | /** |
||
| 71 | * @return Token |
||
| 72 | */ |
||
| 73 | 49 | protected function scan() |
|
| 153 | |||
| 154 | 8 | protected function checkFragment() |
|
| 172 | |||
/**
 * Reads a word starting at the current position and wraps it in a Token.
 *
 * The byte at the current position is consumed unconditionally; every
 * following byte is consumed while it is an ASCII letter, an ASCII digit,
 * `_` or `$`. The token type is whatever getKeyword() returns for the
 * consumed text, and the token value is the text itself.
 *
 * @return Token
 */
protected function scanWord()
{
    $begin = $this->pos;

    // The first byte is always part of the word; count how many allowed
    // bytes follow it.
    $tail = strspn(
        $this->source,
        '_$abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',
        $begin + 1
    );
    $this->pos = $begin + 1 + $tail;

    $word = substr($this->source, $begin, $this->pos - $begin);

    return new Token($this->getKeyword($word), $word);
}
||
| 192 | |||
| 193 | 48 | protected function getKeyword($name) |
|
| 220 | |||
/**
 * Reads a numeric literal starting at the current position.
 *
 * Accepted shape: an optional leading `-`, one or more digits, and an
 * optional fraction made of `.` followed by one or more digits. The token
 * value is an int when there is no fraction and a float otherwise.
 *
 * skipInteger() throws the result of createIllegal() when a digit run is
 * empty, so a bare `-` or a trailing `.` is rejected.
 *
 * @return Token token of type Token::TYPE_NUMBER
 */
protected function scanNumber()
{
    $start = $this->pos;
    $length = strlen($this->source);

    if ($this->pos < $length && $this->source[$this->pos] === '-') {
        $this->pos++;
    }

    $this->skipInteger();

    // The bounds check matters when the number is the last thing in the
    // source: without it the look-ahead reads an uninitialized string offset.
    $isFloat = false;
    if ($this->pos < $length && $this->source[$this->pos] === '.') {
        $isFloat = true;
        $this->pos++;
        $this->skipInteger();
    }

    $literal = substr($this->source, $start, $this->pos - $start);

    return new Token(
        Token::TYPE_NUMBER,
        $isFloat ? (float)$literal : (int)$literal
    );
}
||
| 245 | |||
/**
 * Moves the cursor past a run of ASCII digits (`0`-`9`).
 *
 * At least one digit is required. When none is present at the current
 * position, the cursor is left untouched and the result of createIllegal()
 * is thrown.
 *
 * @return void
 */
protected function skipInteger()
{
    $digitCount = strspn($this->source, '0123456789', $this->pos);

    if ($digitCount === 0) {
        throw $this->createIllegal();
    }

    $this->pos += $digitCount;
}
|
| 263 | |||
| 264 | protected function createIllegal() |
||
| 270 | |||
| 271 | 1 | protected function createError($message) |
|
| 275 | |||
| 276 | 1 | protected function getColumn() |
|
| 280 | |||
/**
 * Reads a double-quoted string literal starting at the opening quote.
 *
 * The token value is the raw text between the quotes: escape sequences are
 * kept verbatim (backslash included) and are not decoded. A backslash
 * escapes the byte that follows it, so `\"` does not close the string,
 * while a quote after an escaped backslash (`\\"`) does.
 *
 * Throws the result of createIllegal() when the end of the source is
 * reached before the closing quote.
 *
 * @return Token token of type Token::TYPE_STRING
 */
protected function scanString()
{
    $length = strlen($this->source);

    // Step over the opening quote.
    $this->pos++;

    $value = '';
    while ($this->pos < $length) {
        $ch = $this->source[$this->pos];
        $this->pos++;

        if ($ch === '"') {
            return new Token(Token::TYPE_STRING, $value);
        }

        $value .= $ch;

        // Consume the escaped byte together with its backslash. This way a
        // quote is only protected by an unescaped backslash, and an escaped
        // backslash directly before the closing quote no longer prevents
        // the string from terminating.
        if ($ch === '\\' && $this->pos < $length) {
            $value .= $this->source[$this->pos];
            $this->pos++;
        }
    }

    throw $this->createIllegal();
}
||
| 300 | |||
| 301 | 49 | protected function end() |
|
| 305 | |||
| 306 | 49 | protected function peek() |
|
| 310 | |||
| 311 | 48 | protected function lex() |
|
| 318 | |||
| 319 | 6 | protected function createUnexpected(Token $token) |
|
| 320 | { |
||
| 321 | 6 | switch ($token->getType()) { |
|
| 322 | 6 | case Token::TYPE_END: |
|
| 323 | return $this->createError('Unexpected end of input'); |
||
| 334 | } |
||
| 335 |