Completed
Branch personify (053d57)
by Ekin
02:24
created

BrillTagger::tag()   D

Complexity

Conditions 9
Paths 65

Size

Total Lines 45
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 22
CRAP Score 9

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 45
ccs 22
cts 22
cp 1
rs 4.909
cc 9
eloc 22
nc 65
nop 1
crap 9
1
<?php
2
/**
3
 * Part Of Speech Tagging
4
 * Brill Tagger
5
 *
6
 * @category   BrillTagger
7
 * @author     Ekin H. Bayar <[email protected]>
8
 * @version    0.1.0
9
 */
10
11
namespace BrillTagger;
12
13
class BrillTagger
{
    /**
     * Lexicon keyed by lower-cased token. Each entry is read as a list of
     * candidate tags, the first of which is used as the token's default tag.
     */
    private $dictionary = LEXICON;

    /**
     * Splits the text into tokens and assigns a part-of-speech tag to each.
     *
     * Every token starts as a common noun ('NN'), is then looked up in the
     * lexicon, and is finally refined by the numeric, suffix and context rules
     * below. The returned 'token' keeps any trailing full stops; only the copy
     * used for the lookups and rules has them stripped.
     *
     * @param string $text
     * @return array list of ['token' => string, 'tag' => string]
     */
    public function tag($text) {
        preg_match_all("/[\w\d\.'%@]+/", $text, $matches);

        $tags = [];

        # $matches[0] is a 0-based list, so its keys double as the tag index
        foreach ($matches[0] as $i => $token) {
            # default to a common noun
            $tags[$i] = ['token' => $token, 'tag' => 'NN'];

            # remove trailing full stops
            $token = rtrim($token, '.');

            # get from dictionary if set (and the entry is not empty)
            $candidates = $this->lookupTags($token);
            if (isset($candidates[0])) {
                $tags[$i]['tag'] = $candidates[0];
            }

            $tags[$i]['tag'] = $this->transformNumerics($tags[$i]['tag'], $token);

            # Anything that ends 'ly' is an adverb
            if ($this->isAdverb($token)) {
                $tags[$i]['tag'] = 'RB';
            }

            if ($this->isNoun($tags[$i]['tag'])) {
                $tags[$i]['tag'] = $this->transformNoun($tags[$i]['tag'], $token);
            }

            # context rules need a preceding token
            if ($i > 0) {
                if ($this->isNoun($tags[$i]['tag'])) {
                    $tags[$i]['tag'] = $this->transformNounToVerb($tags, $i, $token);
                } elseif ($this->isVerb($tags[$i]['tag'])) {
                    $tags[$i]['tag'] = $this->transformVerbToNoun($tags, $i);
                }
            }
        }

        return $tags;
    }

    /**
     * Returns the lexicon entry for a token, or an empty array when the token
     * is not in the lexicon, so callers can pass the result to in_array()
     * without first checking that the key exists.
     *
     * @param string $token
     * @return array
     */
    private function lookupTags($token) {
        $key = strtolower($token);

        return isset($this->dictionary[$key]) ? $this->dictionary[$key] : [];
    }

    /**
     * Whether the lexicon has an entry for the token (case-insensitive).
     *
     * @param string $token
     * @return bool
     */
    public function tokenExists($token) {
        return isset($this->dictionary[strtolower($token)]);
    }

    /**
     * Whether the tag starts with 'N'.
     *
     * @param string $tag
     * @return bool
     */
    public function isNoun($tag) {
        return substr(trim($tag), 0, 1) === 'N';
    }

    /**
     * Whether the tag is exactly 'NN'.
     *
     * @param string $tag
     * @return bool
     */
    public function isSingularNoun($tag) {
        return $tag === 'NN';
    }

    /**
     * Whether the tag is a noun tag and the token ends in 's'.
     *
     * @param string $tag
     * @param string $token
     * @return bool
     */
    public function isPluralNoun($tag, $token) {
        return $this->isNoun($tag) && substr($token, -1) === 's';
    }

    /**
     * Whether the tag starts with 'VB' or is one of the "to have" tags.
     *
     * @param string $tag
     * @return bool
     */
    public function isVerb($tag) {
        return substr(trim($tag), 0, 2) === 'VB' || $this->isVerbToHave($tag);
    }

    /**
     * Whether the tag starts with 'HVD' or 'HVN'.
     *
     * @param string $tag
     * @return bool
     */
    public function isVerbToHave($tag) {
        return in_array(substr(trim($tag), 0, 3), ['HVD', 'HVN'], true);
    }

    /**
     * Whether the tag starts with 'P'.
     *
     * @param string $tag
     * @return bool
     */
    public function isPronoun($tag) {
        return substr(trim($tag), 0, 1) === 'P';
    }

    /**
     * Whether the lexicon lists 'VBN' among the token's tags.
     * Tokens missing from the lexicon yield false.
     *
     * @param string $token
     * @return bool
     */
    public function isPastTenseVerb($token) {
        return in_array('VBN', $this->lookupTags($token), true);
    }

    /**
     * Whether the lexicon lists 'VBZ' among the token's tags.
     * Tokens missing from the lexicon yield false.
     *
     * @param string $token
     * @return bool
     */
    public function isPresentTenseVerb($token) {
        return in_array('VBZ', $this->lookupTags($token), true);
    }

    /** it him me us you 'em thee we'uns
     * @param string $tag
     * @return bool true when the tag is 'PPO'
     */
    public function isAccusativePronoun($tag) {
        return $tag === 'PPO';
    }

    /** it he she thee
     * @param string $tag
     * @return bool true when the tag is 'PPS'
     */
    public function isThirdPersonPronoun($tag) {
        return $tag === 'PPS';
    }

    /** they we I you ye thou you'uns
     * @param string $tag
     * @return bool true when the tag is 'PPSS'
     */
    public function isSingularPersonalPronoun($tag) {
        return $tag === 'PPSS';
    }

    /** itself himself myself yourself herself oneself ownself
     * @param string $tag
     * @return bool true when the tag is 'PPL'
     */
    public function isSingularReflexivePronoun($tag) {
        return $tag === 'PPL';
    }

    /** themselves ourselves yourselves
     * @param string $tag
     * @return bool true when the tag is 'PPLS'
     */
    public function isPluralReflexivePronoun($tag) {
        return $tag === 'PPLS';
    }

    /** ours mine his her/hers their/theirs our its my your/yours out thy thine
     * @param string $tag
     * @return bool true when the tag is 'PP$$' or 'PP$'
     */
    public function isPossessivePronoun($tag) {
        return in_array($tag, ['PP$$', 'PP$'], true);
    }

    /**
     * Whether the token ends in 'al' or the lexicon lists 'JJ' for it.
     * Tokens missing from the lexicon are judged on the suffix alone.
     *
     * @param string $token
     * @return bool
     */
    public function isAdjective($token) {
        return substr($token, -2) === 'al' || in_array('JJ', $this->lookupTags($token), true);
    }

    /**
     * Whether the token ends in 'ing'.
     *
     * @param string $token
     * @return bool
     */
    public function isGerund($token) {
        return substr($token, -3) === 'ing';
    }

    /**
     * Whether the token ends in 'ed'.
     *
     * @param string $token
     * @return bool
     */
    public function isPastParticiple($token) {
        return substr($token, -2) === 'ed';
    }

    /**
     * Whether the token ends in 'ly'. Three-character tokens are excluded.
     *
     * @param string $token
     * @return bool
     */
    public function isAdverb($token) {
        return substr($token, -2) === 'ly' && strlen($token) !== 3;
    }

    /** Common noun to adj. if it ends with 'al',
     * to gerund if 'ing', to past tense if 'ed'
     *
     * The first matching rule wins: adjective, gerund, past participle, the
     * pronoun 'I', then plural noun. A token containing a full stop is tagged
     * as a number ('CD') regardless of the rules above.
     *
     * @param string $tag
     * @param string $token
     * @return string
     */
    public function transformNoun($tag, $token) {
        if ($this->isAdjective($token)) {
            $tag = 'JJ';
        } elseif ($this->isGerund($token)) {
            $tag = 'VBG';
        } elseif ($this->isPastParticiple($token)) {
            $tag = 'VBN';
        } elseif ($token === 'I') {
            $tag = 'PPSS';
        } elseif ($this->isPluralNoun($tag, $token)) {
            $tag = 'NNS';
        }

        # Convert noun to number if . appears
        if (strpos($token, '.') !== false) {
            $tag = 'CD';
        }

        return $tag;
    }

    /**
     * Re-tags the noun at position $i as a verb when the context suggests it.
     * $tags[$i - 1] must exist, so callers must pass $i > 0.
     *
     * @param array $tags tags assigned so far, including position $i
     * @param int $i
     * @param string $token token at position $i, trailing full stops removed
     * @return string the (possibly changed) tag for position $i
     */
    public function transformNounToVerb($tags, $i, $token) {
        $tag = $tags[$i]['tag'];
        $previous = $tags[$i - 1];

        # Noun to verb if the word before is 'would'
        if ($this->isSingularNoun($tag) && strtolower($previous['token']) === 'would') {
            $tag = 'VB';
        }

        # If we get noun noun, and the 2nd can be a verb, convert to verb
        if ($this->isNoun($tag) && $this->isNoun($previous['tag']) && $this->tokenExists($token)) {
            if ($this->isPastTenseVerb($token)) {
                $tag = 'VBN';
            } elseif ($this->isPresentTenseVerb($token)) {
                $tag = 'VBZ';
            }
        }

        return $tag;
    }

    /**
     * Re-tags the verb at position $i as a common noun when the previous
     * token is tagged 'DT'. $tags[$i - 1] must exist, so callers must pass
     * $i > 0.
     *
     * @param array $tags tags assigned so far, including position $i
     * @param int $i
     * @return string the (possibly changed) tag for position $i
     */
    public function transformVerbToNoun($tags, $i) {
        $tag = $tags[$i]['tag'];

        # Converts verbs after 'the' to nouns
        if ($tags[$i - 1]['tag'] === 'DT' && $this->isVerb($tag)) {
            $tag = 'NN';
        }

        return $tag;
    }

    /**
     * Applies the numeric patterns in order; a later match overrides an
     * earlier one. The NUMERAL, YEAR and PERCENTAGE patterns are constants
     * defined outside this class.
     *
     * @param string $tag
     * @param string $token
     * @return string
     */
    public function transformNumerics($tag, $token) {
        # tag numerals, cardinals, money (NNS)
        if (preg_match(NUMERAL, $token)) {
            $tag = 'NNS';
        }

        # tag years
        # NOTE(review): isset() is also true for a named group that matched
        # nothing when a later group matched - confirm against the YEAR pattern.
        if (preg_match(YEAR, $token, $matches)) {
            $tag = isset($matches['nns']) ? 'NNS' : 'CD';
        }

        # tag percentages
        if (preg_match(PERCENTAGE, $token)) {
            $tag = 'NN';
        }

        return $tag;
    }

    /**
     * Puts a verb into its third-person singular form.
     *
     * The token is treated as a verb when its tag is a verb tag or when it is
     * one of a fixed list of irregular/auxiliary verbs; anything else is
     * returned unchanged. The rules are tried in this order: '-o' takes 'es',
     * a final 'y' becomes 'ies' (or just takes 's' after a vowel), tags that
     * need no suffix keep the token as is, and everything else takes 's'.
     *
     * @param string $tag
     * @param string $token
     * @return string
     */
    public function transformVerbsToThirdPerson($tag, $token) {
        $verbs = ['can', 'shall', 'am', 'was', 'were', 'haz', 'said', 'made', 'do', 'go'];

        if (!$this->isVerb($tag) && !in_array($token, $verbs, true)) {
            return $token;
        }

        $lastChar = substr($token, -1);

        if ($lastChar === 'o') {
            return $token . 'es';
        }

        if ($lastChar === 'y') {
            # vowel + y just takes 's' (play -> plays)
            if (preg_match('/[aeiou]y$/i', $token) === 1) {
                return $token . 's';
            }

            # consonant + y: drop only the final 'y' (carry -> carries)
            return substr($token, 0, -1) . 'ies';
        }

        # Disregard verb tags that don't need s|es|ies
        if (in_array($tag, ['VBD', 'VBG', 'VBN', 'VBZ', 'MD'], true)) {
            return $token;
        }

        return $token . 's';
    }
}
339