Completed
Branch unit-tests (e061ab)
by Ekin
03:45
created

BrillTagger::isAdverb()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 3
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 1
crap 1
1
<?php
2
/**
3
 * Part Of Speech Tagging
4
 * Brill Tagger
5
 *
6
 * @category   BrillTagger
7
 * @author     Ekin H. Bayar <[email protected]>
8
 * @version    0.1.0
9
 */
10
11
namespace BrillTagger;
12
13
/**
 * Rule-based part-of-speech tagger.
 *
 * Every token first receives a default tag (common noun, or the first tag the
 * lexicon lists for it) and is then re-tagged by a fixed sequence of suffix
 * and context rules. The lexic1on and the NUMERAL / YEAR / PERCENTAGE patterns
 * are constants that are defined outside of this class.
 */
class BrillTagger
{
    /**
     * Lexicon looked up by lower-cased token. Each entry is used as a list of
     * tags: index 0 is taken as the default tag, and the whole list is searched
     * when checking whether a word can act as a verb.
     *
     * @var array
     */
    private $dictionary = LEXICON;

    /**
     * Splits $text into tokens and assigns a part-of-speech tag to each one.
     *
     * The stored 'token' is the text exactly as matched (trailing full stops
     * included); the rules themselves run on the token with trailing full
     * stops removed.
     *
     * @param string $text Text to tag.
     *
     * @return array List of ['token' => string, 'tag' => string] in input order.
     */
    public function tag($text) {
        preg_match_all("/[\w\d\.'%@]+/", $text, $matches);

        $tags = [];

        foreach ($matches[0] as $i => $token) {
            # default to a common noun
            $tags[$i] = ['token' => $token, 'tag' => 'NN'];

            # remove trailing full stops
            if (substr(trim($token), -1) === '.') {
                $token = preg_replace('/\.+$/', '', $token);
            }

            # get from dictionary if set
            $key = strtolower($token);
            if (isset($this->dictionary[$key])) {
                $tags[$i]['tag'] = $this->dictionary[$key][0];
            }

            # tag numerals, cardinals, money (NNS)
            if (preg_match(NUMERAL, $token)) {
                $tags[$i]['tag'] = 'NNS';
            }

            # tag years
            # (a dedicated variable is used so the token list being iterated is not overwritten)
            if (preg_match(YEAR, $token, $yearMatch)) {
                $tags[$i]['tag'] = (isset($yearMatch['nns'])) ? 'NNS' : 'CD';
            }

            # tag percentages
            if (preg_match(PERCENTAGE, $token)) {
                $tags[$i]['tag'] = 'NN';
            }

            # Anything that ends 'ly' is an adverb
            if ($this->isAdverb($token)) {
                $tags[$i]['tag'] = 'RB';
            }

            # Common noun to adj. if it ends with 'al', to gerund if 'ing', to past tense if 'ed'
            if ($this->isNoun($tags[$i]['tag'])) {
                if ($this->isAdjective($token)) {
                    $tags[$i]['tag'] = 'JJ';
                } elseif ($this->isGerund($token)) {
                    $tags[$i]['tag'] = 'VBG';
                } elseif ($this->isPastParticiple($token)) {
                    $tags[$i]['tag'] = 'VBN';
                } elseif ($token === 'I') {
                    $tags[$i]['tag'] = 'PPSS';
                }

                # Convert noun to number if . appears
                # (trailing stops are already gone, so this only sees inner dots)
                if (strpos($token, '.') !== false) {
                    $tags[$i]['tag'] = 'CD';
                }
            }

            # Noun to plural if it ends with an 's'
            if ($this->isPluralNoun($tags[$i]['tag'], $token)) {
                $tags[$i]['tag'] = 'NNS';
            }

            # The remaining rules look at the previous token, so skip them for the first one
            if ($i > 0) {
                $previous = $tags[$i - 1];

                # Converts verbs after 'the' to nouns
                if ($previous['tag'] === 'DT' && $this->isVerb($tags[$i]['tag'])) {
                    $tags[$i]['tag'] = 'NN';
                }

                # Noun to verb if the word before is 'would'
                if ($this->isSingularNoun($tags[$i]['tag']) && strtolower($previous['token']) === 'would') {
                    $tags[$i]['tag'] = 'VB';
                }

                # If we get noun noun, and the 2nd can be a verb, convert to verb
                if ($this->isNoun($tags[$i]['tag']) &&
                    $this->isNoun($previous['tag']) &&
                    $this->tokenExists($token)
                ) {
                    if ($this->isPastTenseVerb($token)) {
                        $tags[$i]['tag'] = 'VBN';
                    } elseif ($this->isPresentTenseVerb($token)) {
                        $tags[$i]['tag'] = 'VBZ';
                    }
                }
            }
        }

        return $tags;
    }

    /**
     * Tells whether the lexicon has an entry for $token (case-insensitive).
     *
     * @param string $token
     *
     * @return bool
     */
    public function tokenExists($token) {
        return isset($this->dictionary[strtolower($token)]);
    }

    /**
     * Tells whether $tag is a noun tag, i.e. starts with 'N'.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isNoun($tag) {
        return substr(trim($tag), 0, 1) === 'N';
    }

    /**
     * Tells whether $tag is exactly 'NN'.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isSingularNoun($tag) {
        return $tag === 'NN';
    }

    /**
     * Tells whether a noun-tagged token should be treated as plural: the tag
     * is a noun tag and the token ends with 's'.
     *
     * @param string $tag
     * @param string $token
     *
     * @return bool
     */
    public function isPluralNoun($tag, $token) {
        return ($this->isNoun($tag) && substr($token, -1) === 's');
    }

    /**
     * Tells whether $tag is a verb tag, i.e. starts with 'VB'.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isVerb($tag) {
        return substr(trim($tag), 0, 2) === 'VB';
    }

    /**
     * Tells whether $tag is a pronoun tag, i.e. starts with 'P'.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isPronoun($tag) {
        return substr(trim($tag), 0, 1) === 'P';
    }

    /**
     * Tells whether the lexicon lists 'VBN' among the tags of $token.
     * Tokens that are missing from the lexicon yield false.
     *
     * @param string $token
     *
     * @return bool
     */
    public function isPastTenseVerb($token) {
        return $this->hasDictionaryTag($token, 'VBN');
    }

    /**
     * Tells whether the lexicon lists 'VBZ' among the tags of $token.
     * Tokens that are missing from the lexicon yield false.
     *
     * @param string $token
     *
     * @return bool
     */
    public function isPresentTenseVerb($token) {
        return $this->hasDictionaryTag($token, 'VBZ');
    }

    # it him me us you 'em thee we'uns
    public function isAccusativePronoun($tag) {
        return $tag === 'PPO';
    }

    # it he she thee
    public function isThirdPersonPronoun($tag) {
        return $tag === 'PPS';
    }

    # they we I you ye thou you'uns
    public function isSingularPersonalPronoun($tag) {
        return $tag === 'PPSS';
    }

    # itself himself myself yourself herself oneself ownself
    public function isSingularReflexivePronoun($tag) {
        return $tag === 'PPL';
    }

    # themselves ourselves yourselves
    public function isPluralReflexivePronoun($tag) {
        return $tag === 'PPLS';
    }

    #  ours mine his her/hers their/theirs our its my your/yours out thy thine
    public function isPossessivePronoun($tag) {
        # strict comparison, in line with the other pronoun predicates
        return in_array($tag, ['PP$$', 'PP$'], true);
    }

    /**
     * Suffix heuristic: the token ends with 'al'.
     *
     * @param string $token
     *
     * @return bool
     */
    public function isAdjective($token) {
        return substr($token, -2) === 'al';
    }

    /**
     * Suffix heuristic: the token ends with 'ing'.
     *
     * @param string $token
     *
     * @return bool
     */
    public function isGerund($token) {
        return substr($token, -3) === 'ing';
    }

    /**
     * Suffix heuristic: the token ends with 'ed'.
     *
     * @param string $token
     *
     * @return bool
     */
    public function isPastParticiple($token) {
        return substr($token, -2) === 'ed';
    }

    /**
     * Suffix heuristic: the token ends with 'ly'.
     *
     * @param string $token
     *
     * @return bool
     */
    public function isAdverb($token) {
        return substr($token, -2) === 'ly';
    }

    /**
     * Tells whether the lexicon entry of $token contains $tag.
     *
     * Guards the lookup so that a token without a lexicon entry gives false
     * instead of handing a missing entry to in_array().
     *
     * @param string $token
     * @param string $tag
     *
     * @return bool
     */
    private function hasDictionaryTag($token, $tag) {
        $key = strtolower($token);

        return isset($this->dictionary[$key])
            && in_array($tag, $this->dictionary[$key], true);
    }
}
190