Complex classes like Parser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Parser, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 14 | class Parser extends ParserBase |
||
| 15 | { |
||
| 16 | /** |
||
| 17 | * @var bool Whether current text contains escape characters |
||
| 18 | */ |
||
| 19 | protected $hasEscapedChars; |
||
| 20 | |||
| 21 | /** |
||
| 22 | * @var bool Whether current text contains references |
||
| 23 | */ |
||
| 24 | protected $hasRefs; |
||
| 25 | |||
| 26 | /** |
||
| 27 | * @var array Array of [label => link info] |
||
| 28 | */ |
||
| 29 | protected $refs; |
||
| 30 | |||
| 31 | /** |
||
| 32 | * @var string Text being parsed |
||
| 33 | */ |
||
| 34 | protected $text; |
||
| 35 | |||
| 36 | /** |
||
| 37 | * {@inheritdoc} |
||
| 38 | */ |
||
public function parse($text, array $matches)
{
    // Prepare this parser for the given text ($matches is not read by this method)
    $this->init($text);

    // Block-level markup is handled before anything else
    $this->matchBlockLevelMarkup();

    // Link references are captured once block markup has been overwritten
    $this->matchLinkReferences();

    // Inline code runs ahead of the other inline markup to avoid false positives
    $this->matchInlineCode();

    // Remaining inline markup. Images must be matched before links
    $this->matchImages();
    $this->matchLinks();
    $this->matchStrikethrough();
    $this->matchSuperscript();
    $this->matchEmphasis();
    $this->matchForcedLineBreaks();

    // Drop the text so that its memory can be reclaimed
    unset($this->text);
}
|
| 63 | |||
| 64 | /** |
||
| 65 | * Add an image tag for given text span |
||
| 66 | * |
||
| 67 | * @param integer $startTagPos Start tag position |
||
| 68 | * @param integer $endTagPos End tag position |
||
| 69 | * @param integer $endTagLen End tag length |
||
| 70 | * @param string $linkInfo URL optionally followed by space and a title |
||
| 71 | * @param string $alt Value for the alt attribute |
||
| 72 | * @return void |
||
| 73 | */ |
||
protected function addImageTag($startTagPos, $endTagPos, $endTagLen, $linkInfo, $alt)
{
    // The start tag is created with a fixed length of 2
    $imgTag = $this->parser->addTagPair('IMG', $startTagPos, 2, $endTagPos, $endTagLen);
    $this->setLinkAttributes($imgTag, $linkInfo, 'src');

    $altText = $this->decode($alt);
    $imgTag->setAttribute('alt', $altText);

    // Overwrite the whole span, from the start tag through the end of the end tag
    $markupLen = $endTagPos + $endTagLen - $startTagPos;
    $this->overwrite($startTagPos, $markupLen);
}
|
| 83 | |||
| 84 | /** |
||
| 85 | * Add the tag pair for an inline code span |
||
| 86 | * |
||
| 87 | * @param array $left Left marker |
||
| 88 | * @param array $right Right marker |
||
| 89 | * @return void |
||
| 90 | */ |
||
protected function addInlineCodeTags($left, $right)
{
    // The left marker's tag also covers the text flagged as "trimAfter"
    $openPos = $left['pos'];
    $openLen = $left['len'] + $left['trimAfter'];

    // The right marker's tag is moved back and widened by its "trimBefore" amount
    $trimBefore = $right['trimBefore'];
    $closePos   = $right['pos'] - $trimBefore;
    $closeLen   = $right['len'] + $trimBefore;

    $this->parser->addTagPair('C', $openPos, $openLen, $closePos, $closeLen);

    // Overwrite everything from the start of the left tag to the end of the right tag
    $spanLen = $closePos + $closeLen - $openPos;
    $this->overwrite($openPos, $spanLen);
}
|
| 100 | |||
| 101 | /** |
||
| 102 | * Add a link tag for given text span |
||
| 103 | * |
||
| 104 | * @param integer $startTagPos Start tag position |
||
| 105 | * @param integer $endTagPos End tag position |
||
| 106 | * @param integer $endTagLen End tag length |
||
| 107 | * @param string $linkInfo URL optionally followed by space and a title |
||
| 108 | * @return void |
||
| 109 | */ |
||
protected function addLinkTag($startTagPos, $endTagPos, $endTagLen, $linkInfo)
{
    // An end tag of length 1 gets priority 1, anything else gets priority -1. The intent is
    // to give implicit references a slightly worse priority, and explicit references and
    // inline links a slightly better one so that they take precedence over possible BBCodes
    // such as [b](https://en.wikipedia.org/wiki/B)
    $priority = -1;
    if ($endTagLen === 1)
    {
        $priority = 1;
    }

    // The start tag is created with a fixed length of 1
    $urlTag = $this->parser->addTagPair('URL', $startTagPos, 1, $endTagPos, $endTagLen, $priority);
    $this->setLinkAttributes($urlTag, $linkInfo, 'url');

    // Overwrite only the start and end markup, leaving the link's text untouched
    $this->overwrite($startTagPos, 1);
    $this->overwrite($endTagPos, $endTagLen);
}
|
| 124 | |||
| 125 | /** |
||
| 126 | * Close a list at given offset |
||
| 127 | * |
||
| 128 | * @param array $list |
||
| 129 | * @param integer $textBoundary |
||
| 130 | * @return void |
||
| 131 | */ |
||
| 132 | 27 | protected function closeList(array $list, $textBoundary) |
|
| 145 | |||
| 146 | /** |
||
| 147 | * Compute the amount of text to ignore at the start of a quote line |
||
| 148 | * |
||
| 149 | * @param string $str Original quote markup |
||
| 150 | * @param integer $maxQuoteDepth Maximum quote depth |
||
| 151 | * @return integer Number of characters to ignore |
||
| 152 | */ |
||
protected function computeQuoteIgnoreLen($str, $maxQuoteDepth)
{
    $stripped = $str;
    for ($depth = 0; $depth < $maxQuoteDepth; ++$depth)
    {
        // Remove one level of quote markup: optional spaces, a ">" and an optional space
        $stripped = preg_replace('/^ *> ?/', '', $stripped);
    }

    // Whatever was stripped from the start of the string is what should be ignored
    return strlen($str) - strlen($stripped);
}
||
| 163 | |||
| 164 | /** |
||
| 165 | * Decode a chunk of encoded text to be used as an attribute value |
||
| 166 | * |
||
| 167 | * Decodes escaped literals and removes slashes and 0x1A characters |
||
| 168 | * |
||
| 169 | * @param string $str Encoded text |
||
| 170 | * @return string Decoded text |
||
| 171 | */ |
||
protected function decode($str)
{
    // HTML entities are only decoded if configured, and only if there's an ampersand
    if ($this->config['decodeHtmlEntities'])
    {
        if (strpos($str, '&') !== false)
        {
            $str = html_entity_decode($str, ENT_QUOTES, 'UTF-8');
        }
    }

    // Remove the substitution characters (0x1A)
    $str = str_replace("\x1A", '', $str);

    // Nothing else to do if the current text contains no escaped characters
    if (!$this->hasEscapedChars)
    {
        return $str;
    }

    // Map each 0x1B-prefixed sequence back to its literal character
    $literals = [
        "\x1B0" => '!', "\x1B1" => '"', "\x1B2" => "'", "\x1B3" => '(',
        "\x1B4" => ')', "\x1B5" => '*', "\x1B6" => '[', "\x1B7" => '\\',
        "\x1B8" => ']', "\x1B9" => '^', "\x1BA" => '_', "\x1BB" => '`',
        "\x1BC" => '~'
    ];

    return strtr($str, $literals);
}
||
| 195 | |||
| 196 | /** |
||
| 197 | * Encode escaped literals that have a special meaning |
||
| 198 | * |
||
| 199 | * @param string $str Original text |
||
| 200 | * @return string Encoded text |
||
| 201 | */ |
||
protected function encode($str)
{
    // Each backslash-escaped literal is replaced with a 2-byte sequence that starts with 0x1B
    $sequences = [
        '\\!' => "\x1B0", '\\"' => "\x1B1", "\\'" => "\x1B2", '\\(' => "\x1B3",
        '\\)' => "\x1B4", '\\*' => "\x1B5", '\\[' => "\x1B6", '\\\\' => "\x1B7",
        '\\]' => "\x1B8", '\\^' => "\x1B9", '\\_' => "\x1BA", '\\`' => "\x1BB",
        '\\~' => "\x1BC"
    ];

    return strtr($str, $sequences);
}
||
| 214 | |||
| 215 | /** |
||
| 216 | * Return the length of the markup at the end of an ATX header |
||
| 217 | * |
||
| 218 | * @param integer $startPos Start of the header's text |
||
| 219 | * @param integer $endPos End of the header's text |
||
| 220 | * @return integer |
||
| 221 | */ |
||
| 222 | 17 | protected function getAtxHeaderEndTagLen($startPos, $endPos) |
|
| 229 | |||
| 230 | /** |
||
| 231 | * Capture lines that contain a Setext-style header |
||
| 232 | * |
||
| 233 | * @return array |
||
| 234 | */ |
||
| 235 | 263 | protected function getSetextLines() |
|
| 271 | |||
| 272 | /** |
||
| 273 | * Get emphasis markup split by block |
||
| 274 | * |
||
| 275 | * @param string $regexp Regexp used to match emphasis |
||
| 276 | * @param integer $pos Position in the text of the first emphasis character |
||
| 277 | * @return array[] Each array contains a list of [matchPos, matchLen] pairs |
||
| 278 | */ |
||
| 279 | 60 | protected function getEmphasisByBlock($regexp, $pos) |
|
| 309 | |||
| 310 | /** |
||
| 311 | * Capture and return inline code markers |
||
| 312 | * |
||
| 313 | * @return array |
||
| 314 | */ |
||
protected function getInlineCodeMarkers()
{
    // No backtick means no markers
    $firstBacktick = strpos($this->text, '`');
    if ($firstBacktick === false)
    {
        return [];
    }

    // Match against a copy of the text in which each "\x1BB" sequence is replaced with a
    // backslash followed by a backtick. Both are 2 bytes long, so the captured offsets
    // still line up with $this->text
    $subject = str_replace("\x1BB", '\\`', $this->text);
    preg_match_all(
        '/(`+)(\\s*)[^\\x17`]*/',
        $subject,
        $matches,
        PREG_OFFSET_CAPTURE | PREG_SET_ORDER,
        $firstBacktick
    );

    $markers       = [];
    $trailingSpace = 0;
    foreach ($matches as $match)
    {
        $matchText = $match[0][0];
        $matchPos  = $match[0][1];
        $matchLen  = strlen($matchText);

        $markers[] = [
            'pos'        => $matchPos,
            'len'        => strlen($match[1][0]),
            'trimBefore' => $trailingSpace,
            'trimAfter'  => strlen($match[2][0]),
            'next'       => $matchPos + $matchLen
        ];

        // The amount of whitespace that ends this match is carried over to the next
        // marker as its "trimBefore" value
        $trailingSpace = $matchLen - strlen(rtrim($matchText));
    }

    return $markers;
}
||
| 346 | |||
| 347 | /** |
||
| 348 | * Capture and return labels used in current text |
||
| 349 | * |
||
| 350 | * @return array Labels' text position as keys, lowercased text content as values |
||
| 351 | */ |
||
protected function getLabels()
{
    $regexp = '/\\[((?:[^\\x17[\\]]|\\[[^\\x17[\\]]*\\])*)\\]/';
    preg_match_all($regexp, $this->text, $matches, PREG_OFFSET_CAPTURE);

    $labels = [];
    foreach ($matches[1] as $capture)
    {
        $labelText = $capture[0];
        $labelPos  = $capture[1];

        // The key is the position of the opening bracket, one byte before the captured text
        $labels[$labelPos - 1] = strtolower($labelText);
    }

    return $labels;
}
||
| 368 | |||
| 369 | /** |
||
| 370 | * Test whether emphasis should be ignored at the given position in the text |
||
| 371 | * |
||
| 372 | * @param integer $matchPos Position of the emphasis in the text |
||
| 373 | * @param integer $matchLen Length of the emphasis |
||
| 374 | * @return bool |
||
| 375 | */ |
||
protected function ignoreEmphasis($matchPos, $matchLen)
{
    // Only single underscores can be ignored
    if ($this->text[$matchPos] !== '_')
    {
        return false;
    }
    if ($matchLen !== 1)
    {
        return false;
    }

    // Ignore them when they sit between alphanumeric characters
    return (bool) $this->isSurroundedByAlnum($matchPos, $matchLen);
}
||
| 381 | |||
| 382 | /** |
||
| 383 | * Initialize this parser with given text |
||
| 384 | * |
||
| 385 | * @param string $text Text to be parsed |
||
| 386 | * @return void |
||
| 387 | */ |
||
| 388 | 263 | protected function init($text) |
|
| 409 | |||
| 410 | /** |
||
| 411 | * Test whether given position is preceded by whitespace |
||
| 412 | * |
||
| 413 | * @param integer $pos |
||
| 414 | * @return bool |
||
| 415 | */ |
||
protected function isAfterWhitespace($pos)
{
    // There is no preceding character at the very start of the text
    if ($pos <= 0)
    {
        return false;
    }

    $previousChar = $this->text[$pos - 1];

    return (bool) $this->isWhitespace($previousChar);
}
||
| 420 | |||
| 421 | /** |
||
| 422 | * Test whether given character is alphanumeric |
||
| 423 | * |
||
| 424 | * @param string $chr |
||
| 425 | * @return bool |
||
| 426 | */ |
||
| 427 | 8 | protected function isAlnum($chr) |
|
| 431 | |||
| 432 | /** |
||
| 433 | * Test whether given position is followed by whitespace |
||
| 434 | * |
||
| 435 | * @param integer $pos |
||
| 436 | * @return bool |
||
| 437 | */ |
||
| 438 | 59 | protected function isBeforeWhitespace($pos) |
|
| 442 | |||
| 443 | /** |
||
| 444 | * Test whether a length of text is surrounded by alphanumeric characters |
||
| 445 | * |
||
| 446 | * @param integer $matchPos Start of the text |
||
| 447 | * @param integer $matchLen Length of the text |
||
| 448 | * @return bool |
||
| 449 | */ |
||
| 450 | 8 | protected function isSurroundedByAlnum($matchPos, $matchLen) |
|
| 454 | |||
| 455 | /** |
||
| 456 | * Test whether given character is an ASCII whitespace character |
||
| 457 | * |
||
| 458 | * NOTE: newlines are normalized to LF before parsing so we don't have to check for CR |
||
| 459 | * |
||
| 460 | * @param string $chr |
||
| 461 | * @return bool |
||
| 462 | */ |
||
| 463 | 59 | protected function isWhitespace($chr) |
|
| 467 | |||
| 468 | /** |
||
| 469 | * Mark the boundary of a block in the original text |
||
| 470 | * |
||
| 471 | * @param integer $pos |
||
| 472 | * @return void |
||
| 473 | */ |
||
| 474 | 263 | protected function markBoundary($pos) |
|
| 478 | |||
| 479 | /** |
||
| 480 | * Match block-level markup, as well as forced line breaks and headers |
||
| 481 | * |
||
| 482 | * @return void |
||
| 483 | */ |
||
| 484 | 263 | protected function matchBlockLevelMarkup() |
|
| 882 | |||
| 883 | /** |
||
| 884 | * Match all forms of emphasis (emphasis and strong, using underscores or asterisks) |
||
| 885 | * |
||
| 886 | * @return void |
||
| 887 | */ |
||
| 888 | 263 | protected function matchEmphasis() |
|
| 893 | |||
| 894 | /** |
||
| 895 | * Match emphasis and strong applied using given character |
||
| 896 | * |
||
| 897 | * @param string $character Markup character, either * or _ |
||
| 898 | * @param string $regexp Regexp used to match the series of emphasis character |
||
| 899 | * @return void |
||
| 900 | */ |
||
| 901 | 263 | protected function matchEmphasisByCharacter($character, $regexp) |
|
| 914 | |||
| 915 | /** |
||
| 916 | * Match forced line breaks |
||
| 917 | * |
||
| 918 | * @return void |
||
| 919 | */ |
||
| 920 | 263 | protected function matchForcedLineBreaks() |
|
| 929 | |||
| 930 | /** |
||
| 931 | * Match images markup |
||
| 932 | * |
||
| 933 | * @return void |
||
| 934 | */ |
||
| 935 | 263 | protected function matchImages() |
|
| 951 | |||
| 952 | /** |
||
| 953 | * Match inline images markup |
||
| 954 | * |
||
| 955 | * @return void |
||
| 956 | */ |
||
| 957 | 13 | protected function matchInlineImages() |
|
| 976 | |||
| 977 | /** |
||
| 978 | * Match reference images markup |
||
| 979 | * |
||
| 980 | * @return void |
||
| 981 | */ |
||
| 982 | 11 | protected function matchReferenceImages() |
|
| 1011 | |||
| 1012 | /** |
||
| 1013 | * Match inline code spans |
||
| 1014 | * |
||
| 1015 | * @return void |
||
| 1016 | */ |
||
| 1017 | 263 | protected function matchInlineCode() |
|
| 1044 | |||
| 1045 | /** |
||
| 1046 | * Match inline links markup |
||
| 1047 | * |
||
| 1048 | * @return void |
||
| 1049 | */ |
||
| 1050 | 32 | protected function matchInlineLinks() |
|
| 1068 | |||
| 1069 | /** |
||
| 1070 | * Capture link reference definitions in current text |
||
| 1071 | * |
||
| 1072 | * @return void |
||
| 1073 | */ |
||
| 1074 | 263 | protected function matchLinkReferences() |
|
| 1100 | |||
| 1101 | /** |
||
| 1102 | * Match inline and reference links |
||
| 1103 | * |
||
| 1104 | * @return void |
||
| 1105 | */ |
||
| 1106 | 263 | protected function matchLinks() |
|
| 1117 | |||
| 1118 | /** |
||
| 1119 | * Match reference links markup |
||
| 1120 | * |
||
| 1121 | * @return void |
||
| 1122 | */ |
||
| 1123 | 26 | protected function matchReferenceLinks() |
|
| 1147 | |||
| 1148 | /** |
||
| 1149 | * Match strikethrough |
||
| 1150 | * |
||
| 1151 | * @return void |
||
| 1152 | */ |
||
| 1153 | 263 | protected function matchStrikethrough() |
|
| 1175 | |||
| 1176 | /** |
||
| 1177 | * Match superscript |
||
| 1178 | * |
||
| 1179 | * @return void |
||
| 1180 | */ |
||
| 1181 | 263 | protected function matchSuperscript() |
|
| 1212 | |||
| 1213 | /** |
||
| 1214 | * Overwrite part of the text with substitution characters ^Z (0x1A) |
||
| 1215 | * |
||
| 1216 | * @param integer $pos Start of the range |
||
| 1217 | * @param integer $len Length of text to overwrite |
||
| 1218 | * @return void |
||
| 1219 | */ |
||
| 1220 | 169 | protected function overwrite($pos, $len) |
|
| 1227 | |||
| 1228 | /** |
||
| 1229 | * Process a list of emphasis markup strings |
||
| 1230 | * |
||
| 1231 | * @param array[] $block List of [matchPos, matchLen] pairs |
||
| 1232 | * @return void |
||
| 1233 | */ |
||
| 1234 | 60 | protected function processEmphasisBlock(array $block) |
|
| 1300 | |||
| 1301 | /** |
||
| 1302 | * Set a URL or IMG tag's attributes |
||
| 1303 | * |
||
| 1304 | * @param Tag $tag URL or IMG tag |
||
| 1305 | * @param string $linkInfo Link's info: an URL optionally followed by spaces and a title |
||
| 1306 | * @param string $attrName Name of the URL attribute |
||
| 1307 | * @return void |
||
| 1308 | */ |
||
| 1309 | 69 | protected function setLinkAttributes(Tag $tag, $linkInfo, $attrName) |
|
| 1326 | } |