BrillTagger::transformNoun()   B
last analyzed

Complexity

Conditions 7
Paths 12

Size

Total Lines 22

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 7.4822

Importance

Changes 0
Metric Value
dl 0
loc 22
ccs 11
cts 14
cp 0.7856
rs 8.6346
c 0
b 0
f 0
cc 7
nc 12
nop 2
crap 7.4822
1
<?php declare(strict_types=1);
2
/**
3
 * Part Of Speech Tagging
4
 * Brill Tagger
5
 *
6
 * @category   BrillTagger
7
 * @author     Ekin H. Bayar <[email protected]>
8
 * @version    0.2.0
9
 */
10
11
namespace BrillTagger;
12
13
class BrillTagger
14
{
15
    /**
     * Lexicon used for tag lookups, initialised from the LEXICON constant.
     *
     * The methods of this class index it by token and read each entry as a
     * list of tags; tag() uses the first element as the token's initial tag.
     *
     * @var array
     */
    private $dictionary = LEXICON;
16
17
    /**
     * Splits the text into tokens and assigns a part-of-speech tag to each one.
     *
     * Every token starts with its first lexicon tag (or 'NN' when the lexicon
     * does not know it) and is then refined by the numeric, adverb, noun and
     * previous-token rules, in that order.
     *
     * @param string $text Text to tag.
     * @return array List of ['token' => ..., 'tag' => ...] entries in text order.
     */
    public function tag($text): array
    {
        preg_match_all("/[\w\.'%@]+/", $text, $matches);

        $tags = [];

        foreach ($matches[0] as $i => $rawToken) {
            # the lookup form drops trailing full stops; the reported token keeps them
            $token = $rawToken;
            if (substr(trim($rawToken), -1) === '.') {
                $token = preg_replace('/\.+$/', '', $rawToken);
            }

            # lexicon tag when known, otherwise default to a common noun
            $tag = $this->tokenExists($token) ? $this->dictionary[$token][0] : 'NN';
            $tag = $this->transformNumerics($tag, $token);

            # anything that ends in 'ly' is treated as an adverb
            if ($this->isAdverb($token)) {
                $tag = 'RB';
            }

            if ($this->isNoun($tag) && !$this->isProperNoun($tag)) {
                $tag = $this->transformNoun($tag, $token);
            }

            $tags[$i] = ['token' => $rawToken, 'tag' => $tag];

            # the context rules look at the previous entry, so skip the first token
            if ($i > 0) {
                $tags[$i]['tag'] = $this->transformBetweenNounAndVerb($tags, $i, $token);
            }
        }

        return $tags;
    }
63
64
    /**
     * Tells whether the lexicon holds a (non-null) entry for the token.
     *
     * @param string $token
     * @return bool
     */
    public function tokenExists($token): bool
    {
        $known = isset($this->dictionary[$token]);

        return $known;
    }
72
73
    /**
     * True when the tag, ignoring surrounding whitespace, begins with 'N'.
     *
     * @param string $tag
     * @return bool
     */
    public function isNoun($tag): bool
    {
        $normalized = trim($tag);

        return strncmp($normalized, 'N', 1) === 0;
    }
81
82
    /**
     * True when the tag, ignoring surrounding whitespace, begins with 'NP'.
     *
     * @param string $tag
     * @return bool
     */
    public function isProperNoun($tag): bool
    {
        $prefix = substr(trim($tag), 0, 2);

        return $prefix === 'NP';
    }
90
91
    /**
     * True only for the exact tag 'NN' (no trimming, no prefix matching).
     *
     * @param string $tag
     * @return bool
     */
    public function isSingularNoun($tag): bool
    {
        return 'NN' === $tag;
    }
99
100
    /**
     * True when the tag is a noun tag and the token's last character is 's'.
     *
     * @param string $tag
     * @param string $token
     * @return bool
     */
    public function isPluralNoun($tag, $token): bool
    {
        if (!$this->isNoun($tag)) {
            return false;
        }

        return substr($token, -1) === 's';
    }
109
110
    /**
     * True when the tag, ignoring surrounding whitespace, begins with 'VB'.
     *
     * @param string $tag
     * @return bool
     */
    public function isVerb($tag): bool
    {
        $normalized = trim($tag);

        return strncmp($normalized, 'VB', 2) === 0;
    }
118
119
    /**
     * True when the tag, ignoring surrounding whitespace, begins with 'P'.
     *
     * @param string $tag
     * @return bool
     */
    public function isPronoun($tag): bool
    {
        $firstLetter = substr(trim($tag), 0, 1);

        return $firstLetter === 'P';
    }
127
128
    /**
     * True when the lexicon lists 'VBN' among the tags of the token.
     *
     * A token without a lexicon entry yields false instead of an error.
     *
     * @param string $token
     * @return bool
     */
    public function isPastTenseVerb($token): bool
    {
        # unknown tokens have no tag list; treat that as an empty one
        $candidates = $this->dictionary[$token] ?? [];

        return in_array('VBN', $candidates, true);
    }
136
137
    /**
     * True when the lexicon lists 'VBZ' among the tags of the token.
     *
     * A token without a lexicon entry yields false instead of an error.
     *
     * @param string $token
     * @return bool
     */
    public function isPresentTenseVerb($token): bool
    {
        # unknown tokens have no tag list; treat that as an empty one
        $candidates = $this->dictionary[$token] ?? [];

        return in_array('VBZ', $candidates, true);
    }
145
146
    /**
     * Accusative pronoun check (e.g. it him me us you 'em thee we'uns).
     *
     * True when the tag, ignoring surrounding whitespace, begins with 'PPO'.
     *
     * @param string $tag
     * @return bool
     */
    public function isAccusativePronoun($tag): bool
    {
        $normalized = trim($tag);

        return strncmp($normalized, 'PPO', 3) === 0;
    }
155
156
    /**
     * Third person singular pronoun check (it he she thee).
     *
     * True when the tag, ignoring surrounding whitespace, begins with 'PPS'
     * but does not continue as 'PPSS'; 'PPSS' is a separate tag that
     * isSingularPersonalPronoun() tests for.
     *
     * @param string $tag
     * @return bool
     */
    public function isThirdPersonPronoun($tag): bool
    {
        $normalized = trim($tag);

        # a bare prefix test would also accept 'PPSS', so rule that tag out
        return strpos($normalized, 'PPS') === 0 && strpos($normalized, 'PPSS') !== 0;
    }
165
166
    /**
     * Personal pronoun check for the 'PPSS' tag (they we I you ye thou you'uns).
     *
     * True when the tag, ignoring surrounding whitespace, begins with 'PPSS'.
     *
     * @param string $tag
     * @return bool
     */
    public function isSingularPersonalPronoun($tag): bool
    {
        $prefix = substr(trim($tag), 0, 4);

        return $prefix === 'PPSS';
    }
175
176
    /**
     * Singular reflexive pronoun check
     * (itself himself myself yourself herself oneself ownself).
     *
     * True when the tag, ignoring surrounding whitespace, begins with 'PPL'
     * but does not continue as 'PPLS'; 'PPLS' is the plural tag that
     * isPluralReflexivePronoun() tests for.
     *
     * @param string $tag
     * @return bool
     */
    public function isSingularReflexivePronoun($tag): bool
    {
        $normalized = trim($tag);

        # a bare prefix test would also accept the plural tag 'PPLS'
        return strpos($normalized, 'PPL') === 0 && strpos($normalized, 'PPLS') !== 0;
    }
185
186
    /**
     * Plural reflexive pronoun check (themselves ourselves yourselves).
     *
     * True when the tag, ignoring surrounding whitespace, begins with 'PPLS'.
     *
     * @param string $tag
     * @return bool
     */
    public function isPluralReflexivePronoun($tag): bool
    {
        $normalized = trim($tag);

        return strncmp($normalized, 'PPLS', 4) === 0;
    }
195
196
    /**
     * Possessive pronoun check
     * (ours mine his her/hers their/theirs our its my your/yours out thy thine).
     *
     * True only when the tag is exactly 'PP$$' or 'PP$'.
     *
     * @param string $tag
     * @return bool
     */
    public function isPossessivePronoun($tag): bool
    {
        return $tag === 'PP$$' || $tag === 'PP$';
    }
205
206
    /**
     * True when the token ends in 'al' or the lexicon lists 'JJ' among its tags.
     *
     * A token without a lexicon entry is simply not an adjective (unless it
     * ends in 'al'); it no longer triggers an error on the missing entry.
     *
     * @param string $token
     * @return bool
     */
    public function isAdjective($token): bool
    {
        if (substr($token, -2) === 'al') {
            return true;
        }

        # unknown tokens have no tag list; treat that as an empty one
        $candidates = $this->dictionary[$token] ?? [];

        return in_array('JJ', $candidates, true);
    }
214
215
    /**
     * True when the last three characters of the token are 'ing'.
     *
     * @param string $token
     * @return bool
     */
    public function isGerund($token): bool
    {
        $ending = substr($token, -3);

        return $ending === 'ing';
    }
223
224
    /**
     * True when the last two characters of the token are 'ed'.
     *
     * @param string $token
     * @return bool
     */
    public function isPastParticiple($token): bool
    {
        return 'ed' === substr($token, -2);
    }
232
233
    /**
     * True when the last two characters of the token are 'ly'.
     *
     * @param string $token
     * @return bool
     */
    public function isAdverb($token): bool
    {
        $ending = substr($token, -2);

        return 'ly' === $ending;
    }
241
242
    /**
     * Re-tags a noun based on the shape of its token.
     *
     * The first matching rule wins: adjective => 'JJ', 'ing' ending => 'VBG',
     * 'ed' ending => 'VBN', the token 'I' => 'PPSS', plural noun => 'NNS'.
     * Whatever that yields, a token containing a '.' always ends up as 'CD'.
     *
     * @param string $tag
     * @param string $token
     * @return string
     */
    public function transformNoun($tag, $token): string
    {
        # cases are evaluated top to bottom and stop at the first true one
        switch (true) {
            case $this->isAdjective($token):
                $tag = 'JJ';
                break;
            case $this->isGerund($token):
                $tag = 'VBG';
                break;
            case $this->isPastParticiple($token):
                $tag = 'VBN';
                break;
            case $token === 'I':
                $tag = 'PPSS';
                break;
            case $this->isPluralNoun($tag, $token):
                $tag = 'NNS';
                break;
        }

        # a '.' inside the token overrides everything above: treat it as a number
        return strpos($token, '.') !== false ? 'CD' : $tag;
    }
271
272
    /**
     * Adjusts the tag at position $i using the entry directly before it.
     *
     * Applied in order: a singular noun after the token 'would' becomes 'VB';
     * a lexicon-known noun following a noun becomes 'VBN' or 'VBZ' when the
     * lexicon lists that tag for it; a verb following a 'DT' tag becomes 'NN'.
     *
     * @param array  $tags  Entries tagged so far; $i - 1 and $i must both exist.
     * @param int    $i     Position of the entry to adjust.
     * @param string $token Lookup form of the token at position $i.
     * @return mixed The resulting tag for position $i.
     */
    public function transformBetweenNounAndVerb($tags, $i, $token)
    {
        $previous = $tags[$i - 1];
        $current  = $tags[$i]['tag'];

        # noun to verb if the word before is 'would'
        if ($previous['token'] === 'would' && $this->isSingularNoun($current)) {
            $current = 'VB';
        }

        # noun followed by a noun whose lexicon entry allows a verb reading
        $nounAfterNoun = $this->tokenExists($token)
            && $this->isNoun($current)
            && $this->isNoun($previous['tag']);

        if ($nounAfterNoun) {
            if ($this->isPastTenseVerb($token)) {
                $current = 'VBN';
            } elseif ($this->isPresentTenseVerb($token)) {
                $current = 'VBZ';
            }
        }

        # a verb directly after a 'DT' tag is turned back into a noun
        if ($previous['tag'] === 'DT' && $this->isVerb($current)) {
            $current = 'NN';
        }

        return $current;
    }
304
305
    /**
     * Overrides the tag for tokens matching the NUMERAL, YEAR or PERCENTAGE
     * patterns.
     *
     * The three patterns are tested in that order and a later match overrides
     * an earlier one. A YEAR match gives 'NNS' when the pattern's 'nns' group
     * is set in the match result, otherwise 'CD'.
     *
     * @param string $tag
     * @param string $token
     * @return string
     */
    public function transformNumerics($tag, $token): string
    {
        # numerals, cardinals, money
        $tag = preg_match(NUMERAL, $token) ? 'NNS' : $tag;

        # years
        if (preg_match(YEAR, $token, $matches)) {
            if (isset($matches['nns'])) {
                $tag = 'NNS';
            } else {
                $tag = 'CD';
            }
        }

        # percentages
        $tag = preg_match(PERCENTAGE, $token) ? 'NN' : $tag;

        return $tag;
    }
329
}
330