1
|
|
|
<?php |
2
|
|
|
namespace Mystem; |
3
|
|
|
|
4
|
|
|
/** |
5
|
|
|
* Class Word |
6
|
|
|
* @property array[] $variants lexical interpretation variants: |
7
|
|
|
* - string $normalized - normalized word representation |
8
|
|
|
* - boolean $strict - true when mystem reported the variant with qual = 'bastard', false otherwise |
9
|
|
|
* - array $grammems - lexical information (constants from MystemConst) |
10
|
|
|
*/ |
11
|
|
|
class Word |
12
|
|
|
{ |
13
|
|
|
/** |
14
|
|
|
* @var string $grammemeRegexp cached constants regular expression from MystemConst |
15
|
|
|
*/ |
16
|
|
|
private static $grammemeRegexp = null; |
17
|
|
|
|
18
|
|
|
/** @var string $constructorClass fully qualified name of the class that the newFrom* factory methods instantiate */ |
19
|
|
|
protected static $constructorClass = '\Mystem\Word'; |
20
|
|
|
|
21
|
|
|
/** |
22
|
|
|
* @var string $original original string |
23
|
|
|
*/ |
24
|
|
|
public $original; |
25
|
|
|
|
26
|
|
|
public $variants = array(); |
27
|
|
|
|
28
|
|
|
|
29
|
|
|
/** @var string[] $falsePositiveList lower-cased original words that carry the vulgarism grammeme but that isBadWord() should accept */ |
30
|
|
|
public static $falsePositiveList = array(); |
31
|
|
|
|
32
|
|
|
/** @var string[] $falsePositiveNormalizedList normalized forms that carry the vulgarism grammeme but that isBadWord() should accept */ |
33
|
|
|
public static $falsePositiveNormalizedList = array(); |
34
|
|
|
|
35
|
|
|
/** @var string[] $falseNegativeList lower-cased original words that lack the vulgarism grammeme but that isBadWord() should reject */ |
36
|
|
|
public static $falseNegativeList = array(); |
37
|
|
|
|
38
|
|
|
/** @var string[] $falseNegativeNormalizedList normalized forms that lack the vulgarism grammeme but that isBadWord() should reject */ |
39
|
|
|
public static $falseNegativeNormalizedList = array(); |
40
|
|
|
|
41
|
37 |
|
/**
 * Lazily builds the shared regular expression used to pick grammemes out of
 * the "gr" strings handled by parse().
 *
 * The pattern is built once and cached in self::$grammemeRegexp; later
 * constructions return immediately.
 *
 * Every grammeme is escaped with preg_quote() so it is matched literally, and
 * the alternatives are ordered longest first: PCRE takes the first alternative
 * that matches at a position, so a short grammeme listed before a longer one
 * that starts with it would otherwise shadow the longer one.
 */
public function __construct()
{
    if (self::$grammemeRegexp !== null) {
        return;
    }

    // array_values(): only the grammeme strings matter, not the keys they came with.
    $grammemes = array_values(MystemConst::grammemeList());
    usort($grammemes, function ($left, $right) {
        return mb_strlen($right, 'UTF-8') - mb_strlen($left, 'UTF-8');
    });

    $alternatives = array();
    foreach ($grammemes as $grammeme) {
        $alternatives[] = preg_quote($grammeme, '#');
    }

    self::$grammemeRegexp = '#(' . implode('|', $alternatives) . ')#u';
}
47
|
|
|
|
48
|
|
|
/**
 * Factory: creates an instance of static::$constructorClass from mystem data.
 *
 * An array argument is handed to parse(); any other value is stored as-is in
 * $original and the variants list is left empty.
 *
 * @param array|string $lexicalString - prepared structure from Mystem, or the raw word
 * @param int $maxVariants - forwarded to parse(); null means no limit
 * @return Word
 */
public static function newFromLexicalString($lexicalString, $maxVariants = null)
{
    $className = static::$constructorClass;

    /* @var Word $instance */
    $instance = new $className();

    if (!is_array($lexicalString)) {
        $instance->original = $lexicalString;

        return $instance;
    }

    $instance->parse($lexicalString, $maxVariants);

    return $instance;
}
64
|
|
|
|
65
|
|
|
/**
 * Runs a word through Mystem::stemm() and wraps the first result in a Word.
 *
 * When the result has no element at index 0, the input word itself is passed
 * on, so the returned object only carries $original.
 *
 * @param string $word
 * @param int $maxVariants - forwarded to newFromLexicalString()
 * @return Word
 */
public static function stemm($word, $maxVariants = null)
{
    $tokens = Mystem::stemm($word);

    if (isset($tokens[0])) {
        return self::newFromLexicalString($tokens[0], $maxVariants);
    }

    return self::newFromLexicalString($word, $maxVariants);
}
75
|
|
|
|
76
|
|
|
/**
 * Normalized form taken from the first variant.
 *
 * @return string the 'normalized' entry of variant 0, or '' when it is not set
 */
public function normalized()
{
    return isset($this->variants[0]['normalized'])
        ? $this->variants[0]['normalized']
        : '';
}
88
|
|
|
|
89
|
2 |
|
/**
 * String conversion yields the same value as normalized().
 *
 * @return string
 */
public function __toString()
{
    $normalizedForm = $this->normalized();

    return $normalizedForm;
}
93
|
|
|
|
94
|
|
|
/**
 * Parse raw morphological data from mystem and fill this Word.
 *
 * Copies the token's 'text' into $original and converts each entry of its
 * 'analysis' list into a variant with three keys:
 *  - 'normalized': the entry's 'lex' value (null when absent),
 *  - 'strict':     true only when the entry's 'qual' equals 'bastard',
 *  - 'grammems':   every match of the cached grammeme regexp in the entry's 'gr'.
 *
 * Optional keys ('text', 'analysis', 'lex', 'gr') are guarded so that a token
 * lacking them yields empty data instead of notices or a foreach over null.
 *
 * @param array $lexicalString - prepared structure from Mystem
 * @param int $maxVariants - stop after this many variants; null means no limit.
 *                           The limit is tested after a variant has been stored,
 *                           so at least one variant is kept when any exists.
 */
protected function parse($lexicalString, $maxVariants = null)
{
    $this->original = isset($lexicalString['text']) ? $lexicalString['text'] : null;

    $analysis = array();
    if (isset($lexicalString['analysis']) && is_array($lexicalString['analysis'])) {
        $analysis = $lexicalString['analysis'];
    }

    $counter = 0;
    foreach ($analysis as $aVariant) {
        $variant = array(
            'normalized' => isset($aVariant['lex']) ? $aVariant['lex'] : null,
            // NOTE(review): 'strict' is true for qual === 'bastard'; confirm this
            // polarity is what callers expect before relying on the flag.
            'strict' => isset($aVariant['qual']) && $aVariant['qual'] === 'bastard',
            'grammems' => array(),
        );

        // preg_match_all() returns the number of matches (0 or false when there is
        // nothing usable), so $match[0] is only read when it holds at least one hit.
        if (isset($aVariant['gr']) && preg_match_all(self::$grammemeRegexp, $aVariant['gr'], $match)) {
            $variant['grammems'] = $match[0];
        }

        $this->variants[$counter++] = $variant;
        if ($maxVariants !== null && $counter >= $maxVariants) {
            break;
        }
    }
}
120
|
|
|
|
121
|
|
|
/**
 * Adds a grammeme to every variant that does not already contain it.
 *
 * Variants are addressed by position 0 .. count-1 and each one is delegated to
 * addGrammemeInVariant().
 *
 * @param string $gramm - grammar primitive from MystemConst
 * @return int sum of the addGrammemeInVariant() results, i.e. how many variants were changed
 */
public function addGrammeme($gramm)
{
    $added = 0;
    $total = count($this->variants);
    $level = 0;

    while ($level < $total) {
        $added += $this->addGrammemeInVariant($gramm, $level);
        $level++;
    }

    return $added;
}
134
|
|
|
|
135
|
|
|
/**
 * Appends a grammeme to a single variant unless it is already listed there.
 *
 * @param string $gramm - grammar primitive from MystemConst
 * @param int $level - index of the variant to modify
 * @return bool true when the grammeme was appended; false when the variant
 *              does not exist or already contains the grammeme
 */
protected function addGrammemeInVariant($gramm, $level = null)
{
    if (!isset($this->variants[$level])) {
        return false;
    }

    $alreadyListed = in_array($gramm, $this->variants[$level]['grammems']);
    if ($alreadyListed) {
        return false;
    }

    $this->variants[$level]['grammems'][] = $gramm;

    return true;
}
148
|
|
|
|
149
|
|
|
/**
 * Removes a grammeme from the variants of this word.
 *
 * Variants are addressed by position 0 .. count-1 and each one is delegated to
 * removeGrammemeInVariant().
 *
 * @param string $gramm - grammar primitive from MystemConst
 * @return int sum of the removeGrammemeInVariant() results, i.e. how many variants reported a removal
 */
public function removeGrammeme($gramm)
{
    $removed = 0;
    $total = count($this->variants);
    $level = 0;

    while ($level < $total) {
        $removed += $this->removeGrammemeInVariant($gramm, $level);
        $level++;
    }

    return $removed;
}
162
|
|
|
|
163
|
|
|
/**
 * Removes every occurrence of a grammeme from a single variant.
 *
 * Matching is strict (===). When the grammeme is not present the variant is
 * left untouched: unset() is only ever called with keys that array_keys()
 * actually found, never with a "not found" value that PHP would cast to key 0.
 * Remaining grammemes keep their original keys.
 *
 * @param string $gramm - grammar primitive from MystemConst
 * @param int $level - index of the variant to modify
 * @return bool true when at least one occurrence was removed; false when the
 *              variant has no 'grammems' entry or does not contain the grammeme
 */
protected function removeGrammemeInVariant($gramm, $level)
{
    if (!isset($this->variants[$level]['grammems'])) {
        return false;
    }

    $foundKeys = array_keys($this->variants[$level]['grammems'], $gramm, true);
    foreach ($foundKeys as $key) {
        unset($this->variants[$level]['grammems'][$key]);
    }

    return count($foundKeys) > 0;
}
177
|
|
|
|
178
|
|
|
/**
 * Search for a grammeme primitive in the word's variants.
 *
 * Variants are inspected in order. Without a $level every variant is
 * inspected; with a $level the search gives up once that many variants have
 * been inspected without a hit (the first variant is always inspected).
 *
 * @param string $gramm - grammar primitive from MystemConst
 * @param integer $level - variants maximum depth, null for no limit
 * @return boolean true as soon as a variant lists the grammeme, false otherwise
 */
public function checkGrammeme($gramm, $level = null)
{
    $inspected = 0;

    foreach ($this->variants as $variant) {
        if (in_array($gramm, $variant['grammems'])) {
            return true;
        }

        if ($level === null) {
            continue;
        }

        $inspected++;
        if ($inspected >= $level) {
            return false;
        }
    }

    return false;
}
196
|
|
|
|
197
|
|
|
/**
 * Get verb tense: present, future or past.
 *
 * The candidates are tried in that order against the chosen variant and the
 * first one present is returned.
 *
 * @param int $variant index of the morphological variant to inspect
 * @return null|string MystemConst::PRESENT, MystemConst::FUTURE or MystemConst::PAST;
 *                     null when none is present or the variant does not exist
 */
public function getVerbTime($variant = 0)
{
    $tenses = array(MystemConst::PRESENT, MystemConst::FUTURE, MystemConst::PAST);

    return $this->searchGrammemeInList($tenses, $variant);
}
208
|
|
|
|
209
|
|
|
/**
 * Get grammatical number: singular or plural.
 *
 * @param int $variant index of the morphological variant to inspect
 * @return null|string MystemConst::SINGULAR or MystemConst::PLURAL (tried in that order);
 *                     null when neither is present or the variant does not exist
 */
public function getCount($variant = 0)
{
    $numbers = array(MystemConst::SINGULAR, MystemConst::PLURAL);

    return $this->searchGrammemeInList($numbers, $variant);
}
220
|
|
|
|
221
|
|
|
/**
 * Get grammatical gender.
 *
 * @param int $variant index of the morphological variant to inspect
 * @return null|string MystemConst::FEMININE, MystemConst::MASCULINE or MystemConst::NEUTER
 *                     (tried in that order); null when none is present or the variant
 *                     does not exist
 */
public function getGender($variant = 0)
{
    $genders = array(MystemConst::FEMININE, MystemConst::MASCULINE, MystemConst::NEUTER);

    return $this->searchGrammemeInList($genders, $variant);
}
232
|
|
|
|
233
|
|
|
/**
 * Get animacy: animate or inanimate.
 *
 * @param int $variant index of the morphological variant to inspect
 * @return null|string MystemConst::ANIMATE or MystemConst::INANIMATE (tried in that order);
 *                     null when neither is present or the variant does not exist
 */
public function getAnimate($variant = 0)
{
    $animacy = array(MystemConst::ANIMATE, MystemConst::INANIMATE);

    return $this->searchGrammemeInList($animacy, $variant);
}
244
|
|
|
|
245
|
|
|
/**
 * Get noun case.
 *
 * The case constants are tried in the order they are listed below and the
 * first one present in the chosen variant is returned.
 *
 * @param int $variant index of the morphological variant to inspect
 * @return null|string one of the MystemConst case constants; null when none is
 *                     present or the variant does not exist
 */
public function getNounCase($variant = 0)
{
    $cases = array(
        MystemConst::NOMINATIVE,
        MystemConst::GENITIVE,
        MystemConst::DATIVE,
        MystemConst::ACCUSATIVE,
        MystemConst::INSTRUMENTAL,
        MystemConst::PREPOSITIONAL,
        MystemConst::PARTITIVE,
        MystemConst::LOCATIVE,
        MystemConst::VOCATIVE,
    );

    return $this->searchGrammemeInList($cases, $variant);
}
264
|
|
|
|
265
|
|
|
/**
 * Returns the first candidate grammeme that a given variant contains.
 *
 * Candidates are tested in the order supplied, so when several are present the
 * earliest one in $constants wins.
 *
 * @param array $constants candidate grammemes, in priority order
 * @param int $variant index of the morphological variant to inspect
 * @return null|string the matching candidate; null when the variant does not
 *                     exist or contains none of the candidates
 */
protected function searchGrammemeInList(array $constants, $variant = 0)
{
    if (!isset($this->variants[$variant])) {
        return null;
    }

    $found = null;
    foreach ($constants as $candidate) {
        if (in_array($candidate, $this->variants[$variant]['grammems'])) {
            $found = $candidate;
            break;
        }
    }

    return $found;
}
284
|
|
|
|
285
|
|
|
/**
 * Decides whether the word counts as vulgar, applying the exception lists.
 *
 * The starting point is whether any variant carries MystemConst::OTHER_VULGARISM.
 *
 * Flagged word: it is excused when its lower-cased original is in
 * $falsePositiveList, or when its normalized form is in
 * $falsePositiveNormalizedList and the lower-cased original is not in
 * $falseNegativeList. An excused word has the grammeme removed from its variants.
 *
 * Unflagged word: it is forced to "bad" when its lower-cased original is in
 * $falseNegativeList, or when its normalized form is in
 * $falseNegativeNormalizedList and the lower-cased original is not in
 * $falsePositiveList. A forced word has the grammeme added to its variants.
 *
 * Note that this predicate therefore mutates $variants.
 *
 * @return bool true when the word is considered vulgar after the exceptions
 */
public function isBadWord()
{
    $lowered = mb_strtolower($this->original, 'UTF-8');

    if (!$this->checkGrammeme(MystemConst::OTHER_VULGARISM)) {
        // Not flagged: only the false-negative lists can turn it into a bad word.
        if (in_array($lowered, self::$falseNegativeList)) {
            $forced = true;
        } else {
            $forced = in_array($this->normalized(), self::$falseNegativeNormalizedList)
                && !in_array($lowered, self::$falsePositiveList);
        }

        if ($forced) {
            $this->addGrammeme(MystemConst::OTHER_VULGARISM);
        }

        return $forced;
    }

    // Flagged: the false-positive lists may excuse it.
    if (in_array($lowered, self::$falsePositiveList)) {
        $excused = true;
    } else {
        $excused = in_array($this->normalized(), self::$falsePositiveNormalizedList)
            && !in_array($lowered, self::$falseNegativeList);
    }

    if ($excused) {
        $this->removeGrammeme(MystemConst::OTHER_VULGARISM);
    }

    return !$excused;
}
309
|
|
|
} |
310
|
|
|
|