1 | <?php |
||
2 | namespace Mystem; |
||
3 | |||
/**
 * Class Word
 *
 * One word of analysed text together with the lexical interpretations
 * ("variants") that were parsed for it from Mystem output.
 *
 * @property array[] $variants lexical interpretation variants:
 *  - string $normalized - normalized word representation
 *  - boolean $strict - dictionary or predictable normalized representation
 *  - array $grammems - lexical information (constants from MystemConst)
 */
class Word
{
    /**
     * @var string|null $grammemeRegexp cached constants regular expression from MystemConst
     */
    private static $grammemeRegexp = null;

    /**
     * @var string $constructorClass class name instantiated by the newFrom* factory methods;
     *                               subclasses may redeclare it to have the factories build them instead
     */
    protected static $constructorClass = '\Mystem\Word';

    /**
     * @var string $original original string
     */
    public $original;

    /**
     * @var array[] $variants parsed interpretation variants (see the class description)
     */
    public $variants = array();

    /**
     * @var string[] $falsePositiveList lower-cased original words that must NOT be reported as bad words
     */
    public static $falsePositiveList = array();

    /**
     * @var string[] $falsePositiveNormalizedList normalized forms that must NOT be reported as bad words
     */
    public static $falsePositiveNormalizedList = array();

    /**
     * @var string[] $falseNegativeList lower-cased original words that must be reported as bad words
     */
    public static $falseNegativeList = array();

    /**
     * @var string[] $falseNegativeNormalizedList normalized forms that must be reported as bad words
     */
    public static $falseNegativeNormalizedList = array();

    /**
     * Makes sure the shared grammeme regular expression is built.
     */
    public function __construct()
    {
        self::getGrammemeRegexp();
    }

    /**
     * Returns the regular expression matching any grammeme constant, building and
     * caching it on first use. Kept separate from the constructor so that parsing
     * still works when a subclass constructor does not call the parent one.
     *
     * NOTE(review): the grammeme list is joined as-is (no preg_quote) and in the order
     * MystemConst returns it; confirm the constants contain no regex metacharacters.
     *
     * @return string
     */
    private static function getGrammemeRegexp()
    {
        if (self::$grammemeRegexp === null) {
            self::$grammemeRegexp = '#(' . implode('|', MystemConst::grammemeList()) . ')#u';
        }

        return self::$grammemeRegexp;
    }

    /**
     * Builds a word either from a prepared Mystem structure (array) or from a plain string.
     * A plain string only fills $original and leaves the variants empty.
     *
     * @param array|string $lexicalString - prepared structure from Mystem
     * @param int $maxVariants
     * @return Word
     */
    public static function newFromLexicalString($lexicalString, $maxVariants = null)
    {
        /* @var Word $word */
        $word = new static::$constructorClass();
        if (is_array($lexicalString)) {
            $word->parse($lexicalString, $maxVariants);
        } else {
            $word->original = $lexicalString;
        }

        return $word;
    }

    /**
     * Runs the word through Mystem and wraps the first returned entry.
     * Falls back to the raw input string when Mystem returns no entry.
     *
     * @param string $word
     * @param int $maxVariants
     * @return Word
     */
    public static function stemm($word, $maxVariants = null)
    {
        $lexicalString = Mystem::stemm($word);

        return self::newFromLexicalString(isset($lexicalString[0]) ? $lexicalString[0] : $word, $maxVariants);
    }

    /**
     * Normalized word
     *
     * @return string normalized form of the first variant, or an empty string when there is none
     */
    public function normalized()
    {
        if (isset($this->variants[0], $this->variants[0]['normalized'])) {
            return $this->variants[0]['normalized'];
        }

        return '';
    }

    /**
     * @return string same value as normalized()
     */
    public function __toString()
    {
        return $this->normalized();
    }

    /**
     * Parse raw morphological data from mystem and fill Word object data
     *
     * Missing "text", "analysis", "lex" or "gr" keys are tolerated: the word then simply
     * gets no original text, no variants, no normalized form or no grammemes respectively.
     *
     * @param array $lexicalString - prepared string from Mystem
     * @param int $maxVariants maximum number of variants to keep (null keeps all of them)
     */
    protected function parse($lexicalString, $maxVariants = null)
    {
        $this->original = isset($lexicalString['text']) ? $lexicalString['text'] : null;

        $analysis = array();
        if (isset($lexicalString['analysis']) && is_array($lexicalString['analysis'])) {
            $analysis = $lexicalString['analysis'];
        }

        $regexp = self::getGrammemeRegexp();
        $counter = 0;
        foreach ($analysis as $aVariant) {
            $variant = array(
                'normalized' => isset($aVariant['lex']) ? $aVariant['lex'] : null,
                // NOTE(review): the flag is true when mystem marks the variant as "bastard", while the
                // class description calls it "dictionary or predictable" - confirm the intended polarity.
                'strict' => isset($aVariant['qual']) && $aVariant['qual'] === 'bastard',
                'grammems' => array(),
            );

            $match = array();
            if (isset($aVariant['gr']) && preg_match_all($regexp, $aVariant['gr'], $match) && !empty($match[0])) {
                $variant['grammems'] = $match[0];
            }

            $this->variants[$counter++] = $variant;
            if ($maxVariants !== null && $counter >= $maxVariants) {
                break;
            }
        }
    }

    /**
     * Adds a grammeme to every variant that does not have it yet.
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @return int number of variants that were changed
     */
    public function addGrammeme($gramm)
    {
        $changed = 0;
        foreach (array_keys($this->variants) as $level) {
            if ($this->addGrammemeInVariant($gramm, $level)) {
                $changed++;
            }
        }

        return $changed;
    }

    /**
     * Adds a grammeme to a single variant.
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @param int $level variant index
     * @return bool true when the grammeme was added, false when the variant is missing or already has it
     */
    protected function addGrammemeInVariant($gramm, $level = null)
    {
        if (!isset($this->variants[$level])) {
            return false;
        }

        if (!isset($this->variants[$level]['grammems']) || !is_array($this->variants[$level]['grammems'])) {
            $this->variants[$level]['grammems'] = array();
        }

        if (in_array($gramm, $this->variants[$level]['grammems'], true)) {
            return false;
        }

        $this->variants[$level]['grammems'][] = $gramm;

        return true;
    }

    /**
     * Removes a grammeme from every variant that has it.
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @return int number of variants that were changed
     */
    public function removeGrammeme($gramm)
    {
        $changed = 0;
        foreach (array_keys($this->variants) as $level) {
            if ($this->removeGrammemeInVariant($gramm, $level)) {
                $changed++;
            }
        }

        return $changed;
    }

    /**
     * Removes every occurrence of a grammeme from a single variant.
     *
     * A variant that does not contain the grammeme is left untouched: array_search()
     * reports "not found" as false, and using that as an array key would address
     * element 0, so the list is filtered explicitly instead.
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @param int $level variant index
     * @return bool true when at least one occurrence was removed
     */
    protected function removeGrammemeInVariant($gramm, $level)
    {
        if (!isset($this->variants[$level]['grammems']) || !is_array($this->variants[$level]['grammems'])) {
            return false;
        }

        $remaining = array();
        $removed = false;
        foreach ($this->variants[$level]['grammems'] as $grammeme) {
            if ($grammeme === $gramm) {
                $removed = true;
            } else {
                $remaining[] = $grammeme;
            }
        }

        if ($removed) {
            // Stored re-indexed so the grammeme list stays a plain 0..n-1 list.
            $this->variants[$level]['grammems'] = $remaining;
        }

        return $removed;
    }

    /**
     * Search grammeme primitive in word variants
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @param integer $level - variants maximum depth (null searches all variants)
     * @return boolean
     */
    public function checkGrammeme($gramm, $level = null)
    {
        $counter = 0;
        foreach ($this->variants as $variant) {
            if (!empty($variant['grammems']) && in_array($gramm, $variant['grammems'], true)) {
                return true;
            }
            if ($level !== null && ++$counter >= $level) {
                return false;
            }
        }

        return false;
    }

    /**
     * Get verb time: present, past or future
     *
     * @param int $variant find in which morphological variant
     * @return null|string MystemConst::PRESENT, MystemConst::PAST or MystemConst::FUTURE
     */
    public function getVerbTime($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::PRESENT,
            MystemConst::FUTURE,
            MystemConst::PAST,
        ), $variant);
    }

    /**
     * Get count: single or plural
     *
     * @param int $variant find in which morphological variant
     * @return null|string - MystemConst
     */
    public function getCount($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::SINGULAR,
            MystemConst::PLURAL,
        ), $variant);
    }

    /**
     * Get gender
     *
     * @param int $variant find in which morphological variant
     * @return null|string - MystemConst
     */
    public function getGender($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::FEMININE,
            MystemConst::MASCULINE,
            MystemConst::NEUTER,
        ), $variant);
    }

    /**
     * Get animate
     *
     * @param int $variant find in which morphological variant
     * @return null|string - MystemConst
     */
    public function getAnimate($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::ANIMATE,
            MystemConst::INANIMATE,
        ), $variant);
    }

    /**
     * Get noun case
     *
     * @param int $variant find in which morphological variant
     * @return null|string - MystemConst
     */
    public function getNounCase($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::NOMINATIVE,
            MystemConst::GENITIVE,
            MystemConst::DATIVE,
            MystemConst::ACCUSATIVE,
            MystemConst::INSTRUMENTAL,
            MystemConst::PREPOSITIONAL,
            MystemConst::PARTITIVE,
            MystemConst::LOCATIVE,
            MystemConst::VOCATIVE,
        ), $variant);
    }

    /**
     * Returns the first of the given constants that is present in the chosen variant.
     *
     * @param array $constants candidates, checked in the given order
     * @param int $variant variant index
     * @return null|string the matching constant, or null when the variant is missing or none matches
     */
    protected function searchGrammemeInList(array $constants, $variant = 0)
    {
        if (!isset($this->variants[$variant]['grammems']) || !is_array($this->variants[$variant]['grammems'])) {
            return null;
        }

        $grammems = $this->variants[$variant]['grammems'];
        foreach ($constants as $grammeme) {
            if (in_array($grammeme, $grammems, true)) {
                return $grammeme;
            }
        }

        return null;
    }

    /**
     * Tells whether the word is a vulgarism, taking the static exception lists into account.
     *
     * As a side effect the OTHER_VULGARISM grammeme of the variants is brought in line with
     * the verdict: it is removed for a listed false positive and added for a listed false negative.
     *
     * @return bool
     */
    public function isBadWord()
    {
        // $original stays null for a word that was never filled; cast so mb_strtolower gets a string.
        $original = mb_strtolower((string) $this->original, 'UTF-8');
        $normalized = $this->normalized();

        if ($this->checkGrammeme(MystemConst::OTHER_VULGARISM)) {
            // Flagged by Mystem: cleared when the exact word is whitelisted, or when its normalized
            // form is whitelisted and the exact word is not explicitly blacklisted.
            $inExceptions = in_array($original, self::$falsePositiveList, true)
                || (in_array($normalized, self::$falsePositiveNormalizedList, true)
                    && !in_array($original, self::$falseNegativeList, true));
            if ($inExceptions) {
                $this->removeGrammeme(MystemConst::OTHER_VULGARISM);
            }

            return !$inExceptions;
        }

        // Not flagged by Mystem: flagged anyway when the exact word is blacklisted, or when its
        // normalized form is blacklisted and the exact word is not explicitly whitelisted.
        $inExceptions = in_array($original, self::$falseNegativeList, true)
            || (in_array($normalized, self::$falseNegativeNormalizedList, true)
                && !in_array($original, self::$falsePositiveList, true));
        if ($inExceptions) {
            $this->addGrammeme(MystemConst::OTHER_VULGARISM);
        }

        return $inExceptions;
    }
}
||
310 |