Complex classes like BrillTagger often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use BrillTagger, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
13 | class BrillTagger |
||
14 | { |
||
15 | private $dictionary = LEXICON; |
||
16 | |||
/**
 * Tag every token found in the text with a part-of-speech tag.
 *
 * Tokens are runs of word characters, digits, full stops, apostrophes,
 * percent signs and at-signs. Every token starts out as a common noun
 * ('NN'); the dictionary lookup and the transformation rules below may
 * then replace that tag.
 *
 * @param string $text
 * @return array one ['token' => ..., 'tag' => ...] entry per token, in text order
 */
public function tag($text) {
    preg_match_all("/[\w\d\.'%@]+/", $text, $matches);

    $tags = [];

    foreach ($matches[0] as $i => $rawToken) {
        // Lookups and rules work on the token without trailing full stops;
        // the token stored in the result keeps them.
        $token = $rawToken;
        if (substr(trim($rawToken), -1) === '.') {
            $token = preg_replace('/\.+$/', '', $rawToken);
        }

        // Default to a common noun; use the first dictionary tag if the token is known.
        $tag = 'NN';
        if ($this->tokenExists($token)) {
            $tag = $this->dictionary[strtolower($token)][0];
        }

        $tag = $this->transformNumerics($tag, $token);

        // Anything that ends 'ly' is an adverb
        if ($this->isAdverb($token)) {
            $tag = 'RB';
        }

        if ($this->isNoun($tag)) {
            $tag = $this->transformNoun($tag, $token);
        }

        // The entry must be in place before the next two rules run: they
        // receive the whole tag list together with the current index.
        $tags[$i] = ['token' => $rawToken, 'tag' => $tag];

        if ($i > 0) {
            $tags[$i]['tag'] = $this->transformNounToVerb($tags, $i, $token);
            $tags[$i]['tag'] = $this->transformVerbToNoun($tags, $i);
        }
    }

    return $tags;
}
||
63 | |||
/**
 * Tell whether the dictionary has an entry for the token.
 *
 * The lookup key is the lowercased token.
 *
 * @param string $token
 * @return bool
 */
public function tokenExists($token) {
    $key = strtolower($token);

    return isset($this->dictionary[$key]);
}
||
71 | |||
/**
 * Tell whether the tag, ignoring surrounding whitespace, starts with 'N'.
 *
 * @param string $tag
 * @return bool
 */
public function isNoun($tag) {
    $normalized = trim($tag);

    // Compare only the first byte against the noun prefix.
    return strncmp($normalized, 'N', 1) === 0;
}
||
79 | |||
/**
 * Tell whether the tag is exactly 'NN'.
 *
 * Uses a strict comparison, like the other exact-tag checks in this class,
 * so a non-string value such as integer 0 never matches.
 *
 * @param string $tag
 * @return bool
 */
public function isSingularNoun($tag) {
    return $tag === 'NN';
}
||
87 | |||
88 | /** |
||
89 | * @param string $tag |
||
90 | * @param string $token |
||
91 | * @return bool |
||
92 | */ |
||
93 | 15 | public function isPluralNoun($tag, $token) { |
|
96 | |||
/**
 * Tell whether the tag, ignoring surrounding whitespace, starts with 'VB'.
 *
 * @param string $tag
 * @return bool
 */
public function isVerb($tag) {
    $normalized = trim($tag);

    // Compare only the first two bytes against the verb prefix.
    return strncmp($normalized, 'VB', 2) === 0;
}
||
104 | |||
/**
 * Tell whether the tag, ignoring surrounding whitespace, starts with 'P'.
 *
 * @param string $tag
 * @return bool
 */
public function isPronoun($tag) {
    $normalized = trim($tag);

    // Compare only the first byte against the pronoun prefix.
    return strncmp($normalized, 'P', 1) === 0;
}
||
112 | |||
/**
 * Tell whether 'VBN' is among the dictionary tags recorded for the token.
 *
 * The lookup key is the lowercased token. A token without a dictionary
 * entry yields false instead of passing a missing entry to in_array().
 *
 * @param string $token
 * @return bool
 */
public function isPastTenseVerb($token) {
    $key = strtolower($token);

    // Unknown tokens have no tag list to search.
    if (!isset($this->dictionary[$key])) {
        return false;
    }

    return in_array('VBN', $this->dictionary[$key], true);
}
||
120 | |||
/**
 * Tell whether 'VBZ' is among the dictionary tags recorded for the token.
 *
 * The lookup key is the lowercased token. A token without a dictionary
 * entry yields false instead of passing a missing entry to in_array().
 *
 * @param string $token
 * @return bool
 */
public function isPresentTenseVerb($token) {
    $key = strtolower($token);

    // Unknown tokens have no tag list to search.
    if (!isset($this->dictionary[$key])) {
        return false;
    }

    return in_array('VBZ', $this->dictionary[$key], true);
}
||
128 | |||
/**
 * Tell whether the tag is exactly 'PPO'.
 *
 * Example words from the original note: it him me us you 'em thee we'uns
 *
 * @param string $tag
 * @return bool
 */
public function isAccusativePronoun($tag) {
    $accusativeTag = 'PPO';

    return $accusativeTag === $tag;
}
||
136 | |||
/**
 * Tell whether the tag is exactly 'PPS'.
 *
 * Example words from the original note: it he she thee
 *
 * @param string $tag
 * @return bool
 */
public function isThirdPersonPronoun($tag) {
    $thirdPersonTag = 'PPS';

    return $thirdPersonTag === $tag;
}
||
144 | |||
/**
 * Tell whether the tag is exactly 'PPSS'.
 *
 * Example words from the original note: they we I you ye thou you'uns
 *
 * @param string $tag
 * @return bool
 */
public function isSingularPersonalPronoun($tag) {
    $personalTag = 'PPSS';

    return $personalTag === $tag;
}
||
152 | |||
153 | /** itself himself myself yourself herself oneself ownself |
||
154 | * @param string $tag |
||
155 | * @return bool |
||
156 | */ |
||
157 | 1 | public function isSingularReflexivePronoun($tag) { |
|
160 | |||
161 | /** themselves ourselves yourselves |
||
162 | * @param string $tag |
||
163 | * @return bool |
||
164 | */ |
||
165 | 1 | public function isPluralReflexivePronoun($tag) { |
|
168 | |||
/**
 * Tell whether the tag is exactly 'PP$$' or 'PP$'.
 *
 * Example words from the original note: ours mine his her/hers
 * their/theirs our its my your/yours thy thine
 *
 * The membership test is strict, like the other exact-tag checks in this
 * class, so a non-string value such as integer 0 never matches.
 *
 * @param string $tag
 * @return bool
 */
public function isPossessivePronoun($tag) {
    return in_array($tag, ['PP$$', 'PP$'], true);
}
||
176 | |||
177 | /** |
||
178 | * @param string $token |
||
179 | * @return bool |
||
180 | */ |
||
181 | 19 | public function isAdjective($token) { |
|
184 | |||
185 | /** |
||
186 | * @param string $token |
||
187 | * @return bool |
||
188 | */ |
||
189 | 19 | public function isGerund($token) { |
|
192 | |||
193 | /** |
||
194 | * @param string $token |
||
195 | * @return bool |
||
196 | */ |
||
197 | 19 | public function isPastParticiple($token) { |
|
200 | |||
201 | /** |
||
202 | * @param string $token |
||
203 | * @return bool |
||
204 | */ |
||
205 | 22 | public function isAdverb($token) { |
|
208 | |||
209 | /** Common noun to adj. if it ends with 'al', |
||
210 | * to gerund if 'ing', to past tense if 'ed' |
||
211 | * |
||
212 | * @param string $tag |
||
213 | * @param string $token |
||
214 | * @return string |
||
215 | */ |
||
216 | 18 | public function transformNoun($tag, $token) { |
|
237 | |||
238 | /** |
||
239 | * @param array $tags |
||
240 | * @param int $i |
||
241 | * @param string $token |
||
242 | * @return mixed |
||
243 | */ |
||
244 | 12 | public function transformNounToVerb($tags, $i, $token) { |
|
264 | |||
265 | /** |
||
266 | * @param array $tags |
||
267 | * @param int $i |
||
268 | * @return mixed |
||
269 | */ |
||
270 | 12 | public function transformVerbToNoun($tags, $i) { |
|
278 | |||
279 | /** |
||
280 | * @param string $tag |
||
281 | * @param string $token |
||
282 | * @return string |
||
283 | */ |
||
284 | 21 | public function transformNumerics($tag, $token) { |
|
302 | } |
||
303 |