Complex classes like Parser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Parser, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
14 | class Parser extends ParserBase |
||
15 | { |
||
16 | /** |
||
17 | * @var bool Whether current text contains escape characters |
||
18 | */ |
||
19 | protected $hasEscapedChars; |
||
20 | |||
21 | /** |
||
22 | * @var bool Whether current text contains references |
||
23 | */ |
||
24 | protected $hasRefs; |
||
25 | |||
26 | /** |
||
27 | * @var array Array of [label => link info] |
||
28 | */ |
||
29 | protected $refs; |
||
30 | |||
31 | /** |
||
32 | * @var string Text being parsed |
||
33 | */ |
||
34 | protected $text; |
||
35 | |||
36 | /** |
||
37 | * {@inheritdoc} |
||
38 | */ |
||
public function parse($text, array $matches)
{
	// Load the text to be parsed into this instance
	$this->init($text);

	// Passes run strictly in this order:
	//  - block-level markup (as well as forced line breaks) is matched first
	//  - link references are captured after block markup has been overwritten
	//  - inline code precedes the other inline markup to avoid false positives
	//  - images must be matched before links
	$passes = [
		'matchBlockLevelMarkup',
		'matchLinkReferences',
		'matchInlineCode',
		'matchImages',
		'matchLinks',
		'matchStrikethrough',
		'matchSuperscript',
		'matchEmphasis',
		'matchForcedLineBreaks'
	];
	foreach ($passes as $methodName)
	{
		$this->$methodName();
	}

	// Drop the text so that its memory can be reclaimed
	unset($this->text);
}
|
63 | |||
64 | /** |
||
65 | * Add an image tag for given text span |
||
66 | * |
||
67 | * @param integer $startTagPos Start tag position |
||
68 | * @param integer $endTagPos End tag position |
||
69 | * @param integer $endTagLen End tag length |
||
70 | * @param string $linkInfo URL optionally followed by space and a title |
||
71 | * @param string $alt Value for the alt attribute |
||
72 | * @return void |
||
73 | */ |
||
protected function addImageTag($startTagPos, $endTagPos, $endTagLen, $linkInfo, $alt)
{
	// The start tag of an image always spans 2 characters
	$imgTag = $this->parser->addTagPair('IMG', $startTagPos, 2, $endTagPos, $endTagLen);

	// The link info goes into the "src" attribute, the decoded alt text into "alt"
	$this->setLinkAttributes($imgTag, $linkInfo, 'src');
	$altText = $this->decode($alt);
	$imgTag->setAttribute('alt', $altText);

	// Overwrite everything from the start of the start tag to the end of the end tag
	$markupLen = $endTagPos + $endTagLen - $startTagPos;
	$this->overwrite($startTagPos, $markupLen);
}
|
83 | |||
84 | /** |
||
85 | * Add the tag pair for an inline code span |
||
86 | * |
||
87 | * @param array $left Left marker |
||
88 | * @param array $right Right marker |
||
89 | * @return void |
||
90 | */ |
||
protected function addInlineCodeTags($left, $right)
{
	// The start tag covers the left marker plus the whitespace trimmed after it
	$openPos = $left['pos'];
	$openLen = $left['len'] + $left['trimAfter'];

	// The end tag is extended backwards over the whitespace trimmed before the right marker
	$trimBefore = $right['trimBefore'];
	$closePos   = $right['pos'] - $trimBefore;
	$closeLen   = $right['len'] + $trimBefore;

	$this->parser->addTagPair('C', $openPos, $openLen, $closePos, $closeLen);

	// Overwrite the whole span, content included
	$spanLen = $closePos + $closeLen - $openPos;
	$this->overwrite($openPos, $spanLen);
}
|
100 | |||
101 | /** |
||
102 | * Add a link tag for given text span
||
103 | * |
||
104 | * @param integer $startTagPos Start tag position |
||
105 | * @param integer $endTagPos End tag position |
||
106 | * @param integer $endTagLen End tag length |
||
107 | * @param string $linkInfo URL optionally followed by space and a title |
||
108 | * @return void |
||
109 | */ |
||
protected function addLinkTag($startTagPos, $endTagPos, $endTagLen, $linkInfo)
{
	// An end tag of length 1 denotes an implicit reference, which gets a slightly worse
	// priority. Explicit references and inline links get a slightly better priority so that
	// they take precedence over possible BBCodes such as [b](https://en.wikipedia.org/wiki/B)
	if ($endTagLen === 1)
	{
		$priority = 1;
	}
	else
	{
		$priority = -1;
	}

	// The start tag of a link always spans 1 character
	$urlTag = $this->parser->addTagPair('URL', $startTagPos, 1, $endTagPos, $endTagLen, $priority);
	$this->setLinkAttributes($urlTag, $linkInfo, 'url');

	// Only the two tags are overwritten; the link's text in between is left untouched
	$this->overwrite($startTagPos, 1);
	$this->overwrite($endTagPos, $endTagLen);
}
|
124 | |||
125 | /** |
||
126 | * Close a list at given offset |
||
127 | * |
||
128 | * @param array $list |
||
129 | * @param integer $textBoundary |
||
130 | * @return void |
||
131 | */ |
||
132 | 27 | protected function closeList(array $list, $textBoundary) |
|
145 | |||
146 | /** |
||
147 | * Compute the amount of text to ignore at the start of a quote line |
||
148 | * |
||
149 | * @param string $str Original quote markup |
||
150 | * @param integer $maxQuoteDepth Maximum quote depth |
||
151 | * @return integer Number of characters to ignore |
||
152 | */ |
||
protected function computeQuoteIgnoreLen($str, $maxQuoteDepth)
{
	// Strip one quote marker (optional spaces, ">", optional space) per allowed level
	$stripped = $str;
	for ($level = $maxQuoteDepth - 1; $level >= 0; --$level)
	{
		$stripped = preg_replace('/^ *> ?/', '', $stripped);
	}

	// Whatever was stripped from the start is the part to ignore
	$ignoreLen = strlen($str) - strlen($stripped);

	return $ignoreLen;
}
||
163 | |||
164 | /** |
||
165 | * Decode a chunk of encoded text to be used as an attribute value |
||
166 | * |
||
167 | * Decodes escaped literals and removes slashes and 0x1A characters |
||
168 | * |
||
169 | * @param string $str Encoded text |
||
170 | * @return string Decoded text |
||
171 | */ |
||
protected function decode($str)
{
	// HTML entities are only decoded if configured, and only if the text contains an ampersand
	if ($this->config['decodeHtmlEntities'] && strpos($str, '&') !== false)
	{
		$str = html_entity_decode($str, ENT_QUOTES, 'UTF-8');
	}

	// Remove the 0x1A substitution characters
	$str = str_replace("\x1A", '', $str);

	// Nothing else to do if the current text contains no escaped characters
	if (!$this->hasEscapedChars)
	{
		return $str;
	}

	// Turn each 0x1B sequence back into the literal character it stands for
	$literals = [
		"\x1B0" => '!', "\x1B1" => '"', "\x1B2" => "'", "\x1B3" => '(',
		"\x1B4" => ')', "\x1B5" => '*', "\x1B6" => '[', "\x1B7" => '\\',
		"\x1B8" => ']', "\x1B9" => '^', "\x1BA" => '_', "\x1BB" => '`',
		"\x1BC" => '~'
	];

	return strtr($str, $literals);
}
||
195 | |||
196 | /** |
||
197 | * Encode escaped literals that have a special meaning |
||
198 | * |
||
199 | * @param string $str Original text |
||
200 | * @return string Encoded text |
||
201 | */ |
||
protected function encode($str)
{
	// Each backslash-escaped literal is replaced by a two-byte sequence that starts with 0x1B,
	// which keeps the length of the text unchanged
	$sequences = [
		'\\!' => "\x1B0", '\\"' => "\x1B1", "\\'" => "\x1B2", '\\(' => "\x1B3",
		'\\)' => "\x1B4", '\\*' => "\x1B5", '\\[' => "\x1B6", '\\\\' => "\x1B7",
		'\\]' => "\x1B8", '\\^' => "\x1B9", '\\_' => "\x1BA", '\\`' => "\x1BB",
		'\\~' => "\x1BC"
	];
	$encoded = strtr($str, $sequences);

	return $encoded;
}
||
214 | |||
215 | /** |
||
216 | * Return the length of the markup at the end of an ATX header |
||
217 | * |
||
218 | * @param integer $startPos Start of the header's text |
||
219 | * @param integer $endPos End of the header's text |
||
220 | * @return integer |
||
221 | */ |
||
222 | 17 | protected function getAtxHeaderEndTagLen($startPos, $endPos) |
|
229 | |||
230 | /** |
||
231 | * Capture lines that contain a Setext-style header
||
232 | * |
||
233 | * @return array |
||
234 | */ |
||
235 | 263 | protected function getSetextLines() |
|
271 | |||
272 | /** |
||
273 | * Get emphasis markup split by block |
||
274 | * |
||
275 | * @param string $regexp Regexp used to match emphasis |
||
276 | * @param integer $pos Position in the text of the first emphasis character |
||
277 | * @return array[] Each array contains a list of [matchPos, matchLen] pairs |
||
278 | */ |
||
279 | 60 | protected function getEmphasisByBlock($regexp, $pos) |
|
309 | |||
310 | /** |
||
311 | * Capture and return inline code markers |
||
312 | * |
||
313 | * @return array |
||
314 | */ |
||
protected function getInlineCodeMarkers()
{
	// No backtick means no marker
	$firstBacktick = strpos($this->text, '`');
	if ($firstBacktick === false)
	{
		return [];
	}

	// Encoded backticks (0x1B followed by "B") are turned back into a backslash and a backtick
	// for matching. Both forms are two bytes long, so the offsets still apply to the text
	$subject = str_replace("\x1BB", '\\`', $this->text);
	$flags   = PREG_OFFSET_CAPTURE | PREG_SET_ORDER;
	preg_match_all('/(`+)(\\s*)[^\\x17`]*/', $subject, $matches, $flags, $firstBacktick);

	$markers        = [];
	$trailingSpaces = 0;
	foreach ($matches as $match)
	{
		$chunk    = $match[0][0];
		$chunkPos = $match[0][1];

		$markers[] = [
			'pos'        => $chunkPos,
			'len'        => strlen($match[1][0]),
			// Amount of whitespace found at the end of the previous match
			'trimBefore' => $trailingSpaces,
			// Amount of whitespace that immediately follows the backticks
			'trimAfter'  => strlen($match[2][0]),
			'next'       => $chunkPos + strlen($chunk)
		];

		// Remember how much whitespace ends this match, for the next marker
		$trailingSpaces = strlen($chunk) - strlen(rtrim($chunk));
	}

	return $markers;
}
||
346 | |||
347 | /** |
||
348 | * Capture and return labels used in current text |
||
349 | * |
||
350 | * @return array Labels' text position as keys, lowercased text content as values |
||
351 | */ |
||
protected function getLabels()
{
	// Match bracketed text, allowing for one level of nested brackets
	$regexp = '/\\[((?:[^\\x17[\\]]|\\[[^\\x17[\\]]*\\])*)\\]/';
	preg_match_all($regexp, $this->text, $captures, PREG_OFFSET_CAPTURE);

	$labels = [];
	foreach ($captures[1] as $capture)
	{
		$content    = $capture[0];
		$contentPos = $capture[1];

		// The key is the position of the opening bracket, one byte before the content
		$labels[$contentPos - 1] = strtolower($content);
	}

	return $labels;
}
||
368 | |||
369 | /** |
||
370 | * Test whether emphasis should be ignored at the given position in the text |
||
371 | * |
||
372 | * @param integer $matchPos Position of the emphasis in the text |
||
373 | * @param integer $matchLen Length of the emphasis |
||
374 | * @return bool |
||
375 | */ |
||
protected function ignoreEmphasis($matchPos, $matchLen)
{
	// Only underscores can be ignored
	if ($this->text[$matchPos] !== '_')
	{
		return false;
	}

	// Only single underscores can be ignored
	if ($matchLen !== 1)
	{
		return false;
	}

	// Ignore it if it sits between alphanumeric characters
	return (bool) $this->isSurroundedByAlnum($matchPos, $matchLen);
}
||
381 | |||
382 | /** |
||
383 | * Initialize this parser with given text |
||
384 | * |
||
385 | * @param string $text Text to be parsed |
||
386 | * @return void |
||
387 | */ |
||
388 | 263 | protected function init($text) |
|
409 | |||
410 | /** |
||
411 | * Test whether given position is preceded by whitespace |
||
412 | * |
||
413 | * @param integer $pos |
||
414 | * @return bool |
||
415 | */ |
||
protected function isAfterWhitespace($pos)
{
	// Nothing precedes the very start of the text
	if ($pos <= 0)
	{
		return false;
	}

	$precedingChr = $this->text[$pos - 1];

	return (bool) $this->isWhitespace($precedingChr);
}
||
420 | |||
421 | /** |
||
422 | * Test whether given character is alphanumeric |
||
423 | * |
||
424 | * @param string $chr |
||
425 | * @return bool |
||
426 | */ |
||
427 | 8 | protected function isAlnum($chr) |
|
431 | |||
432 | /** |
||
433 | * Test whether given position is followed by whitespace |
||
434 | * |
||
435 | * @param integer $pos |
||
436 | * @return bool |
||
437 | */ |
||
438 | 59 | protected function isBeforeWhitespace($pos) |
|
442 | |||
443 | /** |
||
444 | * Test whether a length of text is surrounded by alphanumeric characters |
||
445 | * |
||
446 | * @param integer $matchPos Start of the text |
||
447 | * @param integer $matchLen Length of the text |
||
448 | * @return bool |
||
449 | */ |
||
450 | 8 | protected function isSurroundedByAlnum($matchPos, $matchLen) |
|
454 | |||
455 | /** |
||
456 | * Test whether given character is an ASCII whitespace character |
||
457 | * |
||
458 | * NOTE: newlines are normalized to LF before parsing so we don't have to check for CR |
||
459 | * |
||
460 | * @param string $chr |
||
461 | * @return bool |
||
462 | */ |
||
463 | 59 | protected function isWhitespace($chr) |
|
467 | |||
468 | /** |
||
469 | * Mark the boundary of a block in the original text |
||
470 | * |
||
471 | * @param integer $pos |
||
472 | * @return void |
||
473 | */ |
||
474 | 263 | protected function markBoundary($pos) |
|
478 | |||
479 | /** |
||
480 | * Match block-level markup, as well as forced line breaks and headers |
||
481 | * |
||
482 | * @return void |
||
483 | */ |
||
484 | 263 | protected function matchBlockLevelMarkup() |
|
882 | |||
883 | /** |
||
884 | * Match all forms of emphasis (emphasis and strong, using underscores or asterisks) |
||
885 | * |
||
886 | * @return void |
||
887 | */ |
||
888 | 263 | protected function matchEmphasis() |
|
893 | |||
894 | /** |
||
895 | * Match emphasis and strong applied using given character |
||
896 | * |
||
897 | * @param string $character Markup character, either * or _ |
||
898 | * @param string $regexp Regexp used to match the series of emphasis character |
||
899 | * @return void |
||
900 | */ |
||
901 | 263 | protected function matchEmphasisByCharacter($character, $regexp) |
|
914 | |||
915 | /** |
||
916 | * Match forced line breaks |
||
917 | * |
||
918 | * @return void |
||
919 | */ |
||
920 | 263 | protected function matchForcedLineBreaks() |
|
929 | |||
930 | /** |
||
931 | * Match images markup |
||
932 | * |
||
933 | * @return void |
||
934 | */ |
||
935 | 263 | protected function matchImages() |
|
951 | |||
952 | /** |
||
953 | * Match inline images markup |
||
954 | * |
||
955 | * @return void |
||
956 | */ |
||
957 | 13 | protected function matchInlineImages() |
|
976 | |||
977 | /** |
||
978 | * Match reference images markup |
||
979 | * |
||
980 | * @return void |
||
981 | */ |
||
982 | 11 | protected function matchReferenceImages() |
|
1011 | |||
1012 | /** |
||
1013 | * Match inline code spans |
||
1014 | * |
||
1015 | * @return void |
||
1016 | */ |
||
1017 | 263 | protected function matchInlineCode() |
|
1044 | |||
1045 | /** |
||
1046 | * Match inline links markup |
||
1047 | * |
||
1048 | * @return void |
||
1049 | */ |
||
1050 | 32 | protected function matchInlineLinks() |
|
1068 | |||
1069 | /** |
||
1070 | * Capture link reference definitions in current text |
||
1071 | * |
||
1072 | * @return void |
||
1073 | */ |
||
1074 | 263 | protected function matchLinkReferences() |
|
1100 | |||
1101 | /** |
||
1102 | * Match inline and reference links |
||
1103 | * |
||
1104 | * @return void |
||
1105 | */ |
||
1106 | 263 | protected function matchLinks() |
|
1117 | |||
1118 | /** |
||
1119 | * Match reference links markup |
||
1120 | * |
||
1121 | * @return void |
||
1122 | */ |
||
1123 | 26 | protected function matchReferenceLinks() |
|
1147 | |||
1148 | /** |
||
1149 | * Match strikethrough |
||
1150 | * |
||
1151 | * @return void |
||
1152 | */ |
||
1153 | 263 | protected function matchStrikethrough() |
|
1175 | |||
1176 | /** |
||
1177 | * Match superscript |
||
1178 | * |
||
1179 | * @return void |
||
1180 | */ |
||
1181 | 263 | protected function matchSuperscript() |
|
1212 | |||
1213 | /** |
||
1214 | * Overwrite part of the text with substitution characters ^Z (0x1A) |
||
1215 | * |
||
1216 | * @param integer $pos Start of the range |
||
1217 | * @param integer $len Length of text to overwrite |
||
1218 | * @return void |
||
1219 | */ |
||
1220 | 169 | protected function overwrite($pos, $len) |
|
1227 | |||
1228 | /** |
||
1229 | * Process a list of emphasis markup strings |
||
1230 | * |
||
1231 | * @param array[] $block List of [matchPos, matchLen] pairs |
||
1232 | * @return void |
||
1233 | */ |
||
1234 | 60 | protected function processEmphasisBlock(array $block) |
|
1300 | |||
1301 | /** |
||
1302 | * Set a URL or IMG tag's attributes |
||
1303 | * |
||
1304 | * @param Tag $tag URL or IMG tag |
||
1305 | * @param string $linkInfo Link's info: an URL optionally followed by spaces and a title |
||
1306 | * @param string $attrName Name of the URL attribute |
||
1307 | * @return void |
||
1308 | */ |
||
1309 | 69 | protected function setLinkAttributes(Tag $tag, $linkInfo, $attrName) |
|
1326 | } |