Complex classes like BrillTagger often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use BrillTagger and, based on these observations, to apply Extract Interface as well.
1 | <?php declare(strict_types=1); |
||
13 | class BrillTagger |
||
14 | { |
||
15 | private $dictionary = LEXICON; |
||
16 | |||
/**
 * Split $text into tokens and assign a part-of-speech tag to each one.
 *
 * Every token starts out as a common noun ('NN'). That tag may then be
 * overwritten, in this order, by: the dictionary entry (when tokenExists()
 * accepts the token), transformNumerics(), the adverb rule, transformNoun()
 * for nouns that are not proper nouns, and - for every token after the
 * first - transformBetweenNounAndVerb().
 *
 * @param string $text raw text to tag
 * @return array list of ['token' => string, 'tag' => string] in input order
 */
public function tag($text): array
{
    preg_match_all("/[\w\.'%@]+/", $text, $matches);

    $tagged = [];

    foreach ($matches[0] as $position => $rawToken) {
        // Rules are evaluated on the token without trailing full stops;
        // the stored 'token' keeps the text exactly as it was matched.
        $token = rtrim($rawToken, '.');

        // Default to a common noun unless the dictionary has the token,
        // in which case the entry at index 0 is used.
        $tag = $this->tokenExists($token) ? $this->dictionary[$token][0] : 'NN';
        $tag = $this->transformNumerics($tag, $token);

        // Anything that ends 'ly' is an adverb
        if ($this->isAdverb($token)) {
            $tag = 'RB';
        }

        if ($this->isNoun($tag) && !$this->isProperNoun($tag)) {
            $tag = $this->transformNoun($tag, $token);
        }

        // Store the entry before the context rule runs: that rule receives
        // the whole list, including the current token's provisional tag.
        $tagged[$position] = ['token' => $rawToken, 'tag' => $tag];

        if ($position > 0) {
            $tagged[$position]['tag'] = $this->transformBetweenNounAndVerb($tagged, $position, $token);
        }
    }

    return $tagged;
}
||
63 | |||
64 | /** |
||
65 | * @param string $token |
||
66 | * @return bool |
||
67 | */ |
||
68 | 27 | public function tokenExists($token): bool |
|
72 | |||
73 | /** |
||
74 | * @param string $tag |
||
75 | * @return bool |
||
76 | */ |
||
77 | 26 | public function isNoun($tag): bool |
|
81 | |||
82 | /** |
||
83 | * @param string $tag |
||
84 | * @return bool |
||
85 | */ |
||
86 | 17 | public function isProperNoun($tag): bool |
|
90 | |||
91 | /** |
||
92 | * @param string $tag |
||
93 | * @return bool |
||
94 | */ |
||
95 | 1 | public function isSingularNoun($tag): bool |
|
99 | |||
100 | /** |
||
101 | * @param string $tag |
||
102 | * @param string $token |
||
103 | * @return bool |
||
104 | */ |
||
105 | 17 | public function isPluralNoun($tag, $token): bool |
|
109 | |||
110 | /** |
||
111 | * @param string $tag |
||
112 | * @return bool |
||
113 | */ |
||
114 | 2 | public function isVerb($tag): bool |
|
118 | |||
119 | /** |
||
120 | * @param string $tag |
||
121 | * @return bool |
||
122 | */ |
||
123 | 1 | public function isPronoun($tag): bool |
|
127 | |||
128 | /** |
||
129 | * @param string $token |
||
130 | * @return bool |
||
131 | */ |
||
132 | 4 | public function isPastTenseVerb($token): bool |
|
136 | |||
137 | /** |
||
138 | * @param string $token |
||
139 | * @return bool |
||
140 | */ |
||
141 | 4 | public function isPresentTenseVerb($token): bool |
|
145 | |||
146 | /** it him me us you 'em thee we'uns |
||
147 | * |
||
148 | * @param string $tag |
||
149 | * @return bool |
||
150 | */ |
||
151 | 1 | public function isAccusativePronoun($tag): bool |
|
155 | |||
156 | /** it he she thee |
||
157 | * |
||
158 | * @param string $tag |
||
159 | * @return bool |
||
160 | */ |
||
161 | 1 | public function isThirdPersonPronoun($tag): bool |
|
165 | |||
166 | /** they we I you ye thou you'uns |
||
167 | * |
||
168 | * @param string $tag |
||
169 | * @return bool |
||
170 | */ |
||
171 | 1 | public function isSingularPersonalPronoun($tag): bool |
|
175 | |||
176 | /** itself himself myself yourself herself oneself ownself |
||
177 | * |
||
178 | * @param string $tag |
||
179 | * @return bool |
||
180 | */ |
||
181 | 1 | public function isSingularReflexivePronoun($tag): bool |
|
185 | |||
186 | /** themselves ourselves yourselves |
||
187 | * |
||
188 | * @param string $tag |
||
189 | * @return bool |
||
190 | */ |
||
191 | 1 | public function isPluralReflexivePronoun($tag): bool |
|
195 | |||
196 | /** ours mine his her/hers their/theirs our its my your/yours out thy thine |
||
197 | * |
||
198 | * @param string $tag |
||
199 | * @return bool |
||
200 | */ |
||
201 | 1 | public function isPossessivePronoun($tag): bool |
|
205 | |||
206 | /** |
||
207 | * @param string $token |
||
208 | * @return bool |
||
209 | */ |
||
210 | 18 | public function isAdjective($token): bool |
|
214 | |||
215 | /** |
||
216 | * @param string $token |
||
217 | * @return bool |
||
218 | */ |
||
219 | 18 | public function isGerund($token): bool |
|
223 | |||
224 | /** |
||
225 | * @param string $token |
||
226 | * @return bool |
||
227 | */ |
||
228 | 18 | public function isPastParticiple($token): bool |
|
232 | |||
233 | /** |
||
234 | * @param string $token |
||
235 | * @return bool |
||
236 | */ |
||
237 | 27 | public function isAdverb($token): bool |
|
241 | |||
242 | /** Common noun to adj. if it ends with 'al', |
||
243 | * to gerund if 'ing', to past tense if 'ed' |
||
244 | * |
||
245 | * @param string $tag |
||
246 | * @param string $token |
||
247 | * @return string |
||
248 | */ |
||
249 | 17 | public function transformNoun($tag, $token): string |
|
271 | |||
272 | /** |
||
273 | * @param array $tags |
||
274 | * @param int $i |
||
275 | * @param string $token |
||
276 | * @return mixed |
||
277 | */ |
||
278 | 13 | public function transformBetweenNounAndVerb($tags, $i, $token) |
|
304 | |||
305 | /** |
||
306 | * @param string $tag |
||
307 | * @param string $token |
||
308 | * @return string |
||
309 | */ |
||
310 | 26 | public function transformNumerics($tag, $token): string |
|
329 | } |
||
330 |