|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* Part Of Speech Tagging |
|
4
|
|
|
* Brill Tagger |
|
5
|
|
|
* |
|
6
|
|
|
* @category BrillTagger |
|
7
|
|
|
* @author Ekin H. Bayar <[email protected]> |
|
8
|
|
|
* @version 0.1.0 |
|
9
|
|
|
*/ |
|
10
|
|
|
|
|
11
|
|
|
namespace BrillTagger; |
|
12
|
|
|
|
|
13
|
|
|
/**
 * Rule-based part-of-speech tagger.
 *
 * Every token starts out as a common noun ('NN'), is looked up in the
 * lexicon, and is then passed through a fixed sequence of correction rules.
 * The order of the rules matters: later rules see the tag produced by
 * earlier ones.
 *
 * The LEXICON, NUMERAL, YEAR and PERCENTAGE constants are not defined in
 * this class; they must be defined before the tagger is used.
 */
class BrillTagger
{
    /**
     * Lexicon keyed by lower-cased word. Each entry is read as a list of
     * tags; index 0 is used as the word's default tag.
     */
    private $dictionary = LEXICON;

    /**
     * Splits the text into tokens and assigns a part-of-speech tag to each.
     *
     * @param string $text Text to tag.
     *
     * @return array List of ['token' => string, 'tag' => string] entries in
     *               input order. 'token' is the token exactly as matched
     *               (trailing full stops included).
     */
    public function tag($text)
    {
        // preg_match_all() yields 0 when nothing matches and false on error;
        // in both cases there is nothing to tag.
        if (!preg_match_all("/[\w\d\.'%@]+/", $text, $found)) {
            return [];
        }

        $tags = [];

        foreach ($found[0] as $i => $rawToken) {
            $previous = $i > 0 ? $tags[$i - 1] : null;

            // Trailing full stops are ignored by the rules below, but the
            // reported token keeps them.
            $token = rtrim($rawToken, '.');

            // Look the word up once; several rules need the entry.
            $lower = strtolower($token);
            $known = isset($this->dictionary[$lower]) ? $this->dictionary[$lower] : null;

            // Default to a common noun unless the lexicon knows better.
            $tag = $known !== null ? $known[0] : 'NN';

            // A verb directly after a determiner is read as a noun.
            if ($previous !== null && $previous['tag'] === 'DT' && $this->isVerb($tag)) {
                $tag = 'NN';
            }

            // Noun-like tag with a '.' left inside the token: number.
            if (substr($tag, 0, 1) === 'N' && strpos($token, '.') !== false) {
                $tag = 'CD';
            }

            // Tokens matching the NUMERAL pattern are tagged 'NNS'.
            if (preg_match(NUMERAL, $token)) {
                $tag = 'NNS';
            }

            // Years: 'NNS' when the pattern's 'nns' group is present,
            // otherwise 'CD'. A dedicated variable keeps the token list
            // being iterated untouched.
            if (preg_match(YEAR, $token, $yearParts)) {
                $tag = isset($yearParts['nns']) ? 'NNS' : 'CD';
            }

            // Percentages are tagged as common nouns.
            if (preg_match(PERCENTAGE, $token)) {
                $tag = 'NN';
            }

            // Noun-like tag ending in 'ed': past participle.
            if (substr($tag, 0, 1) === 'N' && substr($token, -2) === 'ed') {
                $tag = 'VBN';
            }

            // Anything ending in 'ly' is an adverb.
            if (substr($token, -2) === 'ly') {
                $tag = 'RB';
            }

            // Noun ending in 'al' -> adjective, in 'ing' -> gerund;
            // the word 'I' -> personal pronoun.
            if ($this->isNoun($tag)) {
                if (substr($token, -2) === 'al') {
                    $tag = 'JJ';
                } elseif (substr($token, -3) === 'ing') {
                    $tag = 'VBG';
                } elseif ($token === 'I') {
                    $tag = 'PPSS';
                }
            }

            // Common noun directly after 'would' is a verb. The comparison
            // uses the previous token as matched, so 'would.' does not count.
            if ($previous !== null && $tag === 'NN' && strtolower($previous['token']) === 'would') {
                $tag = 'VB';
            }

            // Common noun ending in 's' is a plural.
            if ($tag === 'NN' && substr($token, -1) === 's') {
                $tag = 'NNS';
            }

            // Noun followed by noun: if the lexicon says the second one can
            // be a verb, prefer the verb reading.
            if ($previous !== null
                && $known !== null
                && $this->isNoun($tag)
                && $this->isNoun($previous['tag'])
            ) {
                if (in_array('VBN', $known, true)) {
                    $tag = 'VBN';
                } elseif (in_array('VBZ', $known, true)) {
                    $tag = 'VBZ';
                }
            }

            $tags[$i] = ['token' => $rawToken, 'tag' => $tag];
        }

        return $tags;
    }

    /**
     * @param string $tag
     *
     * @return bool True when the tag is exactly 'NN' or 'NNS'.
     */
    public function isNoun($tag)
    {
        return in_array($tag, ['NN', 'NNS'], true);
    }

    /**
     * @param string $tag
     *
     * @return bool True when the whitespace-trimmed tag starts with 'VB'.
     */
    public function isVerb($tag)
    {
        return substr(trim($tag), 0, 2) === 'VB';
    }

    /**
     * @param string $tag
     *
     * @return bool True when the whitespace-trimmed tag starts with 'P'.
     */
    public function isPronoun($tag)
    {
        return substr(trim($tag), 0, 1) === 'P';
    }

    /**
     * Tag 'PPO', e.g. it, him, me, us, you, 'em, thee.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isAccusativePronoun($tag)
    {
        return $tag === 'PPO';
    }

    /**
     * Tag 'PPS', e.g. it, he, she, thee.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isThirdPersonPronoun($tag)
    {
        return $tag === 'PPS';
    }

    /**
     * Tag 'PPSS', e.g. they, we, I, you, ye, thou.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isSingularPersonalPronoun($tag)
    {
        return $tag === 'PPSS';
    }

    /**
     * Tag 'PPL', e.g. itself, himself, myself, yourself, herself, oneself.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isSingularReflexivePronoun($tag)
    {
        return $tag === 'PPL';
    }

    /**
     * Tag 'PPLS', e.g. themselves, ourselves, yourselves.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isPluralReflexivePronoun($tag)
    {
        return $tag === 'PPLS';
    }

    /**
     * Tags 'PP$$' and 'PP$', e.g. ours, mine, his, her/hers, their/theirs,
     * our, its, my, your/yours, thy, thine.
     *
     * @param string $tag
     *
     * @return bool
     */
    public function isPossessivePronoun($tag)
    {
        return in_array($tag, ['PP$$', 'PP$'], true);
    }
}
|
162
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.