Duplicate code is one of the most pungent code smells. A commonly used rule is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like Lexer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Lexer and, based on these observations, to apply Extract Interface as well.
| 1 | <?php  | 
            ||
| 7 | class Lexer  | 
            ||
| 8 | { | 
            ||
| 9 | protected $code;  | 
            ||
| 10 | protected $tokens;  | 
            ||
| 11 | protected $pos;  | 
            ||
| 12 | protected $line;  | 
            ||
| 13 | protected $filePos;  | 
            ||
| 14 | |||
| 15 | protected $tokenMap;  | 
            ||
| 16 | protected $dropTokens;  | 
            ||
| 17 | |||
| 18 | protected $usedAttributes;  | 
            ||
| 19 | |||
| 20 | /**  | 
            ||
| 21 | * Creates a Lexer.  | 
            ||
| 22 | *  | 
            ||
| 23 | * @param array $options Options array. Currently only the 'usedAttributes' option is supported,  | 
            ||
| 24 | * which is an array of attributes to add to the AST nodes. Possible attributes  | 
            ||
| 25 | * are: 'comments', 'startLine', 'endLine', 'startTokenPos', 'endTokenPos',  | 
            ||
| 26 | * 'startFilePos', 'endFilePos'. The option defaults to the first three.  | 
            ||
| 27 | * For more info see getNextToken() docs.  | 
            ||
| 28 | */  | 
            ||
| 29 |     public function __construct(array $options = array()) { | 
            ||
| 30 | // map from internal tokens to PhpParser tokens  | 
            ||
| 31 | $this->tokenMap = $this->createTokenMap();  | 
            ||
| 32 | |||
| 33 | // map of tokens to drop while lexing (the map is only used for isset lookup,  | 
            ||
| 34 | // that's why the value is simply set to 1; the value is never actually used.)  | 
            ||
| 35 | $this->dropTokens = array_fill_keys(array(T_WHITESPACE, T_OPEN_TAG), 1);  | 
            ||
| 36 | |||
| 37 | // the usedAttributes member is a map of the used attribute names to a dummy  | 
            ||
| 38 | // value (here "true")  | 
            ||
| 39 | $options += array(  | 
            ||
| 40 |             'usedAttributes' => array('comments', 'startLine', 'endLine'), | 
            ||
| 41 | );  | 
            ||
| 42 | $this->usedAttributes = array_fill_keys($options['usedAttributes'], true);  | 
            ||
| 43 | }  | 
            ||
| 44 | |||
| 45 | /**  | 
            ||
| 46 | * Initializes the lexer for lexing the provided source code.  | 
            ||
| 47 | *  | 
            ||
| 48 | * @param string $code The source code to lex  | 
            ||
| 49 | *  | 
            ||
| 50 | * @throws Error on lexing errors (unterminated comment or unexpected character)  | 
            ||
| 51 | */  | 
            ||
| 52 |     public function startLexing($code) { | 
            ||
| 53 |         $scream = ini_set('xdebug.scream', '0'); | 
            ||
| 54 | |||
| 55 | $this->resetErrors();  | 
            ||
| 56 | $this->tokens = @token_get_all($code);  | 
            ||
| 57 | $this->handleErrors();  | 
            ||
| 58 | |||
| 59 |         if (false !== $scream) { | 
            ||
| 60 |             ini_set('xdebug.scream', $scream); | 
            ||
| 61 | }  | 
            ||
| 62 | |||
| 63 | $this->code = $code; // keep the code around for __halt_compiler() handling  | 
            ||
| 64 | $this->pos = -1;  | 
            ||
| 65 | $this->line = 1;  | 
            ||
| 66 | $this->filePos = 0;  | 
            ||
| 67 | }  | 
            ||
| 68 | |||
| 69 |     protected function resetErrors() { | 
            ||
| 79 | |||
| 80 |     protected function handleErrors() { | 
            ||
| 108 | |||
| 109 | /**  | 
            ||
| 110 | * Fetches the next token.  | 
            ||
| 111 | *  | 
            ||
| 112 | * The available attributes are determined by the 'usedAttributes' option, which can  | 
            ||
| 113 | * be specified in the constructor. The following attributes are supported:  | 
            ||
| 114 | *  | 
            ||
| 115 | * * 'comments' => Array of PhpParser\Comment or PhpParser\Comment\Doc instances,  | 
            ||
| 116 | * representing all comments that occurred between the previous  | 
            ||
| 117 | * non-discarded token and the current one.  | 
            ||
| 118 | * * 'startLine' => Line in which the node starts.  | 
            ||
| 119 | * * 'endLine' => Line in which the node ends.  | 
            ||
| 120 | * * 'startTokenPos' => Offset into the token array of the first token in the node.  | 
            ||
| 121 | * * 'endTokenPos' => Offset into the token array of the last token in the node.  | 
            ||
| 122 | * * 'startFilePos' => Offset into the code string of the first character that is part of the node.  | 
            ||
| 123 | * * 'endFilePos' => Offset into the code string of the last character that is part of the node  | 
            ||
| 124 | *  | 
            ||
| 125 | * @param mixed $value Variable to store token content in  | 
            ||
| 126 | * @param mixed $startAttributes Variable to store start attributes in  | 
            ||
| 127 | * @param mixed $endAttributes Variable to store end attributes in  | 
            ||
| 128 | *  | 
            ||
| 129 | * @return int Token id  | 
            ||
| 130 | */  | 
            ||
| 131 |     public function getNextToken(&$value = null, &$startAttributes = null, &$endAttributes = null) { | 
            ||
| 132 | $startAttributes = array();  | 
            ||
| 133 | $endAttributes = array();  | 
            ||
| 134 | |||
| 135 |         while (1) { | 
            ||
| 136 |             if (isset($this->tokens[++$this->pos])) { | 
            ||
| 137 | $token = $this->tokens[$this->pos];  | 
            ||
| 138 |             } else { | 
            ||
| 139 | // EOF token with ID 0  | 
            ||
| 140 | $token = "\0";  | 
            ||
| 141 | }  | 
            ||
| 142 | |||
| 143 |             if (isset($this->usedAttributes['startTokenPos'])) { | 
            ||
| 144 | $startAttributes['startTokenPos'] = $this->pos;  | 
            ||
| 145 | }  | 
            ||
| 146 |             if (isset($this->usedAttributes['startFilePos'])) { | 
            ||
| 147 | $startAttributes['startFilePos'] = $this->filePos;  | 
            ||
| 148 | }  | 
            ||
| 149 | |||
| 150 |             if (is_string($token)) { | 
            ||
| 151 | // bug in token_get_all  | 
            ||
| 152 |                 if ('b"' === $token) { | 
            ||
| 153 | $value = 'b"';  | 
            ||
| 154 | $this->filePos += 2;  | 
            ||
| 155 |                     $id = ord('"'); | 
            ||
| 156 |                 } else { | 
            ||
| 157 | $value = $token;  | 
            ||
| 158 | $this->filePos += 1;  | 
            ||
| 159 | $id = ord($token);  | 
            ||
| 160 | }  | 
            ||
| 161 | |||
| 162 |                 if (isset($this->usedAttributes['startLine'])) { | 
            ||
| 163 | $startAttributes['startLine'] = $this->line;  | 
            ||
| 164 | }  | 
            ||
| 165 |                 if (isset($this->usedAttributes['endLine'])) { | 
            ||
| 166 | $endAttributes['endLine'] = $this->line;  | 
            ||
| 167 | }  | 
            ||
| 168 |                 if (isset($this->usedAttributes['endTokenPos'])) { | 
            ||
| 169 | $endAttributes['endTokenPos'] = $this->pos;  | 
            ||
| 170 | }  | 
            ||
| 171 |                 if (isset($this->usedAttributes['endFilePos'])) { | 
            ||
| 172 | $endAttributes['endFilePos'] = $this->filePos - 1;  | 
            ||
| 173 | }  | 
            ||
| 174 | |||
| 175 | return $id;  | 
            ||
| 176 |             } else { | 
            ||
| 177 | $this->line += substr_count($token[1], "\n");  | 
            ||
| 178 | $this->filePos += strlen($token[1]);  | 
            ||
| 179 | |||
| 180 |                 if (T_COMMENT === $token[0]) { | 
            ||
| 181 | View Code Duplication |                     if (isset($this->usedAttributes['comments'])) { | 
            |
| 182 | $startAttributes['comments'][] = new Comment($token[1], $token[2]);  | 
            ||
| 183 | }  | 
            ||
| 184 |                 } elseif (T_DOC_COMMENT === $token[0]) { | 
            ||
| 185 | View Code Duplication |                     if (isset($this->usedAttributes['comments'])) { | 
            |
| 186 | $startAttributes['comments'][] = new Comment\Doc($token[1], $token[2]);  | 
            ||
| 187 | }  | 
            ||
| 188 |                 } elseif (!isset($this->dropTokens[$token[0]])) { | 
            ||
| 189 | $value = $token[1];  | 
            ||
| 190 | |||
| 191 |                     if (isset($this->usedAttributes['startLine'])) { | 
            ||
| 192 | $startAttributes['startLine'] = $token[2];  | 
            ||
| 193 | }  | 
            ||
| 194 |                     if (isset($this->usedAttributes['endLine'])) { | 
            ||
| 195 | $endAttributes['endLine'] = $this->line;  | 
            ||
| 196 | }  | 
            ||
| 197 |                     if (isset($this->usedAttributes['endTokenPos'])) { | 
            ||
| 198 | $endAttributes['endTokenPos'] = $this->pos;  | 
            ||
| 199 | }  | 
            ||
| 200 |                     if (isset($this->usedAttributes['endFilePos'])) { | 
            ||
| 201 | $endAttributes['endFilePos'] = $this->filePos - 1;  | 
            ||
| 202 | }  | 
            ||
| 203 | |||
| 204 | return $this->tokenMap[$token[0]];  | 
            ||
| 205 | }  | 
            ||
| 206 | }  | 
            ||
| 207 | }  | 
            ||
| 208 | |||
| 209 |         throw new \RuntimeException('Reached end of lexer loop'); | 
            ||
| 210 | }  | 
            ||
| 211 | |||
| 212 | /**  | 
            ||
| 213 | * Returns the token array for current code.  | 
            ||
| 214 | *  | 
            ||
| 215 | * The token array is in the same format as provided by the  | 
            ||
| 216 | * token_get_all() function and does not discard tokens (i.e.  | 
            ||
| 217 | * whitespace and comments are included). The token position  | 
            ||
| 218 | * attributes are against this token array.  | 
            ||
| 219 | *  | 
            ||
| 220 | * @return array Array of tokens in token_get_all() format  | 
            ||
| 221 | */  | 
            ||
| 222 |     public function getTokens() { | 
            ||
| 223 | return $this->tokens;  | 
            ||
| 224 | }  | 
            ||
| 225 | |||
| 226 | /**  | 
            ||
| 227 | * Handles __halt_compiler() by returning the text after it.  | 
            ||
| 228 | *  | 
            ||
| 229 | * @return string Remaining text  | 
            ||
| 230 | */  | 
            ||
| 231 |     public function handleHaltCompiler() { | 
            ||
| 248 | |||
| 249 | /**  | 
            ||
| 250 | * Creates the token map.  | 
            ||
| 251 | *  | 
            ||
| 252 | * The token map maps the PHP internal token identifiers  | 
            ||
| 253 | * to the identifiers used by the Parser. Additionally it  | 
            ||
| 254 | * maps T_OPEN_TAG_WITH_ECHO to T_ECHO and T_CLOSE_TAG to ';'.  | 
            ||
| 255 | *  | 
            ||
| 256 | * @return array The token map  | 
            ||
| 257 | */  | 
            ||
| 258 |     protected function createTokenMap() { | 
            ||
| 295 | }  | 
            ||
| 296 |