aotd1 /
mystem
| 1 | <?php |
||
| 2 | namespace Mystem; |
||
| 3 | |||
/**
 * Class Word
 *
 * A single word plus the lexical interpretation variants parsed from mystem output.
 * Instances are normally created through newFromLexicalString() or stemm().
 */
class Word
{
    /**
     * @var string|null $grammemeRegexp cached regular expression built from MystemConst::grammemeList()
     */
    private static $grammemeRegexp = null;

    /**
     * @var string $constructorClass class name instantiated by the newFrom* factory methods
     */
    protected static $constructorClass = '\Mystem\Word';

    /**
     * @var string $original original string
     */
    public $original;

    /**
     * @var array[] $variants lexical interpretation variants, each one an array with keys:
     *  - string   'normalized' - normalized word representation
     *  - boolean  'strict'     - true when mystem reported qual === 'bastard' for the variant
     *  - string[] 'grammems'   - lexical information (constants from MystemConst)
     */
    public $variants = array();

    /**
     * @var string[] $falsePositiveList original (lower-cased) words that must not be treated as bad words
     */
    public static $falsePositiveList = array();

    /**
     * @var string[] $falsePositiveNormalizedList normalized forms that must not be treated as bad words
     */
    public static $falsePositiveNormalizedList = array();

    /**
     * @var string[] $falseNegativeList original (lower-cased) words that must be treated as bad words
     */
    public static $falseNegativeList = array();

    /**
     * @var string[] $falseNegativeNormalizedList normalized forms that must be treated as bad words
     */
    public static $falseNegativeNormalizedList = array();

    /**
     * Builds the shared grammeme regular expression on first instantiation.
     */
    public function __construct()
    {
        if (self::$grammemeRegexp === null) {
            // NOTE(review): list items are joined as-is (no preg_quote, no explicit ordering);
            // presumably MystemConst::grammemeList() returns regexp-safe items with longer
            // alternatives first - confirm against MystemConst.
            self::$grammemeRegexp = '#(' . implode('|', MystemConst::grammemeList()) . ')#u';
        }
    }

    /**
     * Creates a word either from a prepared mystem structure (array) or from a plain string.
     * A plain string only fills $original and leaves $variants empty.
     *
     * @param array|string $lexicalString - prepared structure from Mystem
     * @param int $maxVariants maximum number of variants to keep, null for no limit
     * @return Word
     */
    public static function newFromLexicalString($lexicalString, $maxVariants = null)
    {
        /* @var Word $word */
        $word = new static::$constructorClass();
        if (is_array($lexicalString)) {
            $word->parse($lexicalString, $maxVariants);
        } else {
            $word->original = $lexicalString;
        }
        return $word;
    }

    /**
     * Runs Mystem::stemm() on a word and wraps the first returned item.
     * Falls back to a plain-string word when Mystem returned nothing at index 0.
     *
     * @param string $word
     * @param int $maxVariants maximum number of variants to keep, null for no limit
     * @return Word
     */
    public static function stemm($word, $maxVariants = null)
    {
        $lexicalString = Mystem::stemm($word);
        return self::newFromLexicalString(isset($lexicalString[0]) ? $lexicalString[0] : $word, $maxVariants);
    }

    /**
     * Normalized word taken from the first variant.
     *
     * @return string normalized form, or an empty string when there is no first variant
     */
    public function normalized()
    {
        if (isset($this->variants[0], $this->variants[0]['normalized'])) {
            return $this->variants[0]['normalized'];
        }
        return '';
    }

    /**
     * @return string same value as normalized()
     */
    public function __toString()
    {
        return $this->normalized();
    }

    /**
     * Parse raw morphological data from mystem and fill Word object data
     *
     * @param array $lexicalString - prepared structure from Mystem ('text' and 'analysis' keys are read)
     * @param int $maxVariants maximum number of variants to keep, null for no limit
     */
    protected function parse($lexicalString, $maxVariants = null)
    {
        $counter = 0;
        $this->original = $lexicalString['text'];

        // Tokens without morphological analysis simply produce no variants.
        $analysis = isset($lexicalString['analysis']) && is_array($lexicalString['analysis'])
            ? $lexicalString['analysis']
            : array();

        foreach ($analysis as $aVariant) {
            $variant = array(
                'normalized' => $aVariant['lex'],
                // NOTE(review): the flag is true for qual === 'bastard'; the key name suggests the
                // opposite meaning - kept as is because callers may rely on it, confirm intent.
                'strict' => isset($aVariant['qual']) && $aVariant['qual'] === 'bastard',
                'grammems' => array(),
            );

            // Collect every known grammeme mentioned in the raw grammar string.
            if (isset($aVariant['gr'])) {
                preg_match_all(self::$grammemeRegexp, $aVariant['gr'], $match);
                if (!empty($match[0])) {
                    $variant['grammems'] = $match[0];
                }
            }

            $this->variants[$counter++] = $variant;
            if ($maxVariants !== null && $counter >= $maxVariants) {
                break;
            }
        }
    }

    /**
     * Adds a grammeme to every variant that does not have it yet.
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @return int number of variants that were changed
     */
    public function addGrammeme($gramm)
    {
        $counter = 0;
        $count = count($this->variants);
        for ($i = 0; $i < $count; $i++) {
            if ($this->addGrammemeInVariant($gramm, $i)) {
                $counter++;
            }
        }
        return $counter;
    }

    /**
     * Adds a grammeme to a single variant.
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @param int $level variant index
     * @return bool true when the grammeme was added, false when the variant is missing or already has it
     */
    protected function addGrammemeInVariant($gramm, $level = null)
    {
        if (!isset($this->variants[$level]) || in_array($gramm, $this->variants[$level]['grammems'], true)) {
            return false;
        }
        $this->variants[$level]['grammems'][] = $gramm;
        return true;
    }

    /**
     * Removes a grammeme from every variant that has it.
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @return int number of variants that were changed
     */
    public function removeGrammeme($gramm)
    {
        $counter = 0;
        $count = count($this->variants);
        for ($i = 0; $i < $count; $i++) {
            if ($this->removeGrammemeInVariant($gramm, $i)) {
                $counter++;
            }
        }
        return $counter;
    }

    /**
     * Removes a grammeme from a single variant. Remaining grammemes keep their keys.
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @param int $level variant index
     * @return bool true when the grammeme was found and removed
     */
    protected function removeGrammemeInVariant($gramm, $level)
    {
        if (!isset($this->variants[$level]['grammems'])) {
            return false;
        }
        $key = array_search($gramm, $this->variants[$level]['grammems'], true);
        if ($key === false) {
            // Not present: bail out before unset(), because a false key is cast to 0
            // and would silently drop the first grammeme of this variant.
            return false;
        }
        unset($this->variants[$level]['grammems'][$key]);
        return true;
    }

    /**
     * Search grammeme primitive in word variants
     *
     * @param string $gramm - grammar primitive from MystemConst
     * @param integer $level - variants maximum depth, null to look through all variants
     * @return boolean
     */
    public function checkGrammeme($gramm, $level = null)
    {
        $counter = 0;
        foreach ($this->variants as $variant) {
            if (in_array($gramm, $variant['grammems'], true)) {
                return true;
            } elseif ($level !== null && ++$counter >= $level) {
                // Depth limit reached without a match.
                return false;
            }
        }
        return false;
    }

    /**
     * Get verb time: present, past or future
     *
     * @param int $variant find in which morphological variant
     * @return null|string MystemConst::PRESENT, MystemConst::PAST or MystemConst::FUTURE
     */
    public function getVerbTime($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::PRESENT, MystemConst::FUTURE, MystemConst::PAST
        ), $variant);
    }

    /**
     * Get count: single or plural
     *
     * @param int $variant find in which morphological variant
     * @return null|string - MystemConst
     */
    public function getCount($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::SINGULAR, MystemConst::PLURAL
        ), $variant);
    }

    /**
     * Get gender
     *
     * @param int $variant find in which morphological variant
     * @return null|string - MystemConst
     */
    public function getGender($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::FEMININE, MystemConst::MASCULINE, MystemConst::NEUTER
        ), $variant);
    }

    /**
     * Get animate
     *
     * @param int $variant find in which morphological variant
     * @return null|string - MystemConst
     */
    public function getAnimate($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::ANIMATE, MystemConst::INANIMATE
        ), $variant);
    }

    /**
     * Get noun case
     *
     * @param int $variant find in which morphological variant
     * @return null|string - MystemConst
     */
    public function getNounCase($variant = 0)
    {
        return $this->searchGrammemeInList(array(
            MystemConst::NOMINATIVE,
            MystemConst::GENITIVE,
            MystemConst::DATIVE,
            MystemConst::ACCUSATIVE,
            MystemConst::INSTRUMENTAL,
            MystemConst::PREPOSITIONAL,
            MystemConst::PARTITIVE,
            MystemConst::LOCATIVE,
            MystemConst::VOCATIVE,
        ), $variant);
    }

    /**
     * Returns the first constant from the list that is present in the given variant.
     *
     * @param array $constants candidate grammemes, checked in the given order
     * @param int $variant variant index
     * @return null|string matched grammeme, or null when the variant is missing or nothing matched
     */
    protected function searchGrammemeInList(array $constants, $variant = 0)
    {
        if (!isset($this->variants[$variant])) {
            return null;
        }

        foreach ($constants as $grammeme) {
            if (in_array($grammeme, $this->variants[$variant]['grammems'], true)) {
                return $grammeme;
            }
        }

        return null;
    }

    /**
     * Tells whether the word is a vulgarism, taking the static exception lists into account.
     *
     * Side effect: when an exception list overrides the mystem verdict, the
     * MystemConst::OTHER_VULGARISM grammeme is removed from / added to the variants accordingly.
     *
     * @return bool
     */
    public function isBadWord()
    {
        $original = mb_strtolower((string)$this->original, 'UTF-8');

        if ($this->checkGrammeme(MystemConst::OTHER_VULGARISM)) {
            // Marked as vulgar: look for a false-positive exception. An exact match on the
            // original wins; a normalized match is ignored when the original is listed as
            // a false negative.
            $inExceptions = in_array($original, self::$falsePositiveList, true) ||
                (in_array($this->normalized(), self::$falsePositiveNormalizedList, true) &&
                    !in_array($original, self::$falseNegativeList, true));
            if ($inExceptions) {
                $this->removeGrammeme(MystemConst::OTHER_VULGARISM);
            }
            return !$inExceptions;
        }

        // Not marked as vulgar: look for a false-negative exception, mirrored logic.
        $inExceptions = in_array($original, self::$falseNegativeList, true) ||
            (in_array($this->normalized(), self::$falseNegativeNormalizedList, true) &&
                !in_array($original, self::$falsePositiveList, true));
        if ($inExceptions) {
            $this->addGrammeme(MystemConst::OTHER_VULGARISM);
        }
        return $inExceptions;
    }
}
||
| 310 |