scotteh /
php-goose
| 1 | <?php declare(strict_types=1); |
||
| 2 | |||
| 3 | namespace Goose\Text; |
||
| 4 | |||
| 5 | use Goose\Configuration; |
||
| 6 | |||
/**
 * Stop Words
 *
 * Resolves the configured language, loads the matching stop word list from
 * the bundled text resources and counts stop words in a piece of text.
 *
 * @package Goose\Text
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class StopWords
{
    /** @var Configuration */
    private $config;

    /** @var array<string, string[]> Loaded stop word lists, keyed by language code. */
    private $cached = [];

    /** @var array<string, array<string|int, int>> Flipped stop word lists (word => index), keyed by language code. */
    private $lookup = [];

    /** @var string[] Language codes accepted by getLanguage(); anything else falls back to 'en'. */
    private $languages = [
        'ar', 'cs', 'da', 'de', 'en', 'es',
        'fi', 'fr', 'hu', 'id', 'it', 'ja',
        'ko', 'nb', 'nl', 'no', 'pl', 'pt',
        'ru', 'sv', 'zh', 'vi'
    ];

    /**
     * @param Configuration $config
     */
    public function __construct(Configuration $config) {
        $this->config = $config;
    }

    /**
     * @return Configuration
     */
    public function config(): Configuration {
        return $this->config;
    }

    /**
     * Strip every run of ASCII punctuation characters from a string.
     *
     * @param string $str
     *
     * @return string The input without punctuation; the input unchanged if the regex engine fails.
     */
    public function removePunctuation(string $str): string {
        // preg_replace() returns null on a PCRE error, which would violate the
        // declared string return type, so fall back to the untouched input.
        return preg_replace("/[[:punct:]]+/", '', $str) ?? $str;
    }

    /**
     * Resolve the two-letter language code to use for stop word lookups.
     *
     * Only the part of the configured 'language' value before the first '-'
     * is considered ("pt-BR" -> "pt"). Unknown codes fall back to 'en'.
     *
     * @return string Lower-case language code, always one of $this->languages.
     */
    public function getLanguage(): string {
        // Cast defensively: if the setting is absent or not a string,
        // explode() would throw under strict_types.
        $configured = (string)$this->config()->get('language');

        // Normalise case *before* the whitelist check so that "DE" or
        // "DE-at" resolves to "de" instead of silently becoming "en".
        $language = mb_strtolower(explode('-', $configured)[0]);

        return in_array($language, $this->languages, true) ? $language : 'en';
    }

    /**
     * Get the stop word list for the currently configured language.
     *
     * Lists are loaded once per language and then served from memory.
     *
     * @return string[]
     *
     * @throws \RuntimeException If the stop word file cannot be read.
     */
    public function getWordList(): array {
        $language = $this->getLanguage();

        // isset() rather than empty(): an empty list is still a loaded list
        // and must not trigger another file read on every call.
        if (!isset($this->cached[$language])) {
            $this->cached[$language] = $this->loadWordList($language);
        }

        return $this->cached[$language];
    }

    /**
     * Count how many whitespace-separated words of $content are stop words.
     *
     * @param string $content
     *
     * @return WordStats Holds 'wordCount', 'stopWordCount' and the matched
     *                   'stopWords' (lower-cased); a default-constructed
     *                   instance when $content is empty.
     */
    public function getStopwordCount(string $content): WordStats {
        // Note: empty() also treats the string '0' as empty.
        if (empty($content)) {
            return new WordStats();
        }

        $candidateWords = $this->getCandidateWords($this->removePunctuation($content));
        $stopWords = $this->getWordLookup();

        $overlappingStopWords = [];
        foreach ($candidateWords as $candidate) {
            $word = mb_strtolower($candidate);

            // Hash lookup instead of scanning the whole list for every word.
            if (isset($stopWords[$word])) {
                $overlappingStopWords[] = $word;
            }
        }

        return new WordStats([
            'wordCount' => count($candidateWords),
            'stopWordCount' => count($overlappingStopWords),
            'stopWords' => $overlappingStopWords,
        ]);
    }

    /**
     * Split punctuation-free text into candidate words on single spaces.
     *
     * Consecutive spaces produce empty-string entries; they are kept, so they
     * are included in the word count computed by getStopwordCount().
     *
     * @param string $strippedInput
     *
     * @return string[]
     */
    public function getCandidateWords(string $strippedInput): array {
        // Crude word separation for Japanese: surround each stop word
        // occurrence with spaces so the explode() below isolates it.
        if ($this->getLanguage() === 'ja') {
            $words = $this->getWordList();

            // An empty alternation would yield '/()/' which matches at every
            // byte offset and would tear multibyte characters apart.
            if ($words !== []) {
                $alternatives = array_map(static function (string $word): string {
                    // Escape the delimiter too, otherwise a word containing '/' breaks the pattern.
                    return preg_quote($word, '/');
                }, $words);

                $separated = preg_replace('/(' . implode('|', $alternatives) . ')/', ' $1 ', $strippedInput);

                // Keep the unseparated text if the regex engine reports an error (null).
                if ($separated !== null) {
                    $strippedInput = $separated;
                }
            }
        }

        return explode(' ', $strippedInput);
    }

    /**
     * Read and parse the stop word resource file for one language.
     *
     * Blank lines and lines starting with '#' are skipped; remaining lines
     * are trimmed and returned as a sequential list.
     *
     * @param string $language Language code as returned by getLanguage().
     *
     * @return string[]
     *
     * @throws \RuntimeException If the file is missing or unreadable.
     */
    private function loadWordList(string $language): array {
        // Plain concatenation: __DIR__ must not be used as a sprintf() format
        // because a '%' in the install path would be read as a directive.
        $file = __DIR__ . '/../../resources/text/stopwords-' . $language . '.txt';

        $contents = is_readable($file) ? file_get_contents($file) : false;
        if ($contents === false) {
            throw new \RuntimeException(sprintf('Unable to read stop word file "%s".', $file));
        }

        // Normalise Windows / old Mac line endings before splitting.
        $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $contents));

        $words = [];
        foreach ($lines as $line) {
            $word = trim($line);

            // Ignore empty lines and lines starting with '#'.
            if ($word === '' || mb_substr($word, 0, 1) === '#') {
                continue;
            }

            $words[] = $word;
        }

        return $words;
    }

    /**
     * Get the current language's stop words as a map for constant-time lookups.
     *
     * @return array<string|int, int> Stop word => position in the list.
     *
     * @throws \RuntimeException If the stop word file cannot be read.
     */
    private function getWordLookup(): array {
        $language = $this->getLanguage();

        if (!isset($this->lookup[$language])) {
            $this->lookup[$language] = array_flip($this->getWordList());
        }

        return $this->lookup[$language];
    }
}