1 | <?php declare(strict_types=1); |
||
2 | |||
3 | namespace Goose\Text; |
||
4 | |||
5 | use Goose\Configuration; |
||
6 | |||
/**
 * Stop Words
 *
 * Loads the stop word list for the configured language and counts how many
 * stop words occur in a piece of text.
 *
 * @package Goose\Text
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class StopWords
{
    /** @var Configuration */
    private $config;

    /** @var array<string, string[]> Loaded stop word lists, keyed by language code. */
    private $cached = [];

    /** @var string[] Language codes for which a stop word list may be requested. */
    private $languages = [
        'ar', 'cs', 'da', 'de', 'en', 'es',
        'fi', 'fr', 'hu', 'id', 'it', 'ja',
        'ko', 'nb', 'nl', 'no', 'pl', 'pt',
        'ru', 'sv', 'zh', 'vi'
    ];

    /**
     * @param Configuration $config
     */
    public function __construct(Configuration $config) {
        $this->config = $config;
    }

    /**
     * @return Configuration
     */
    public function config(): Configuration {
        return $this->config;
    }

    /**
     * Strip every run of characters matched by the POSIX [[:punct:]] class.
     *
     * @param string $str
     *
     * @return string
     */
    public function removePunctuation(string $str): string {
        // preg_replace() returns null on a PCRE error; fall back to the
        // untouched input rather than violating the string return type.
        return preg_replace("/[[:punct:]]+/", '', $str) ?? $str;
    }

    /**
     * Resolve the two-letter language code used to pick a stop word list.
     *
     * Only the part of the configured 'language' value before the first '-'
     * is considered. It is lower-cased before being checked against the
     * supported languages, so values such as 'DE-de' resolve to 'de'.
     * Anything unsupported (or a non-string value) falls back to 'en'.
     *
     * @return string
     */
    public function getLanguage(): string {
        $configured = $this->config()->get('language');

        // Under strict_types a non-string value would make explode() throw.
        if (!is_string($configured)) {
            return 'en';
        }

        $language = mb_strtolower(explode('-', $configured)[0]);

        if (!in_array($language, $this->languages, true)) {
            $language = 'en';
        }

        return $language;
    }

    /**
     * Return the stop words for the current language.
     *
     * The list is read from resources/text/stopwords-<language>.txt once per
     * language and cached. Blank lines and lines starting with '#' are
     * skipped; surrounding whitespace is removed from every word.
     *
     * @return string[]
     *
     * @throws \RuntimeException When the stop word file cannot be read.
     */
    public function getWordList(): array {
        $language = $this->getLanguage();

        if (!isset($this->cached[$language])) {
            // Plain concatenation: __DIR__ must not be used as part of a
            // sprintf() format, because a '%' in the path would be parsed.
            $file = __DIR__ . '/../../resources/text/stopwords-' . $language . '.txt';

            $contents = is_readable($file) ? file_get_contents($file) : false;
            if ($contents === false) {
                throw new \RuntimeException(sprintf('Unable to read stop word list "%s".', $file));
            }

            $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $contents));

            $words = [];
            foreach ($lines as $line) {
                $word = trim($line);

                // Ignore empty lines and lines starting with '#'.
                if ($word === '' || mb_substr($word, 0, 1) === '#') {
                    continue;
                }

                $words[] = $word;
            }

            $this->cached[$language] = $words;
        }

        return $this->cached[$language];
    }

    /**
     * Count the candidate words and the stop words found in $content.
     *
     * Punctuation is removed first, then the text is split into candidate
     * words; each candidate is lower-cased and looked up in the stop word
     * list of the current language.
     *
     * @param string $content
     *
     * @return WordStats
     */
    public function getStopwordCount(string $content): WordStats {
        // Compare against '' explicitly: empty() would also discard "0".
        if ($content === '') {
            return new WordStats();
        }

        $strippedInput = $this->removePunctuation($content);
        $candidateWords = $this->getCandidateWords($strippedInput);

        // Flip once so each membership test is a hash lookup instead of a
        // linear (and loosely compared) scan of the whole list.
        $lookup = array_flip($this->getWordList());

        $overlappingStopWords = [];
        foreach ($candidateWords as $w) {
            $lowered = mb_strtolower($w);

            if (isset($lookup[$lowered])) {
                $overlappingStopWords[] = $lowered;
            }
        }

        return new WordStats([
            'wordCount' => count($candidateWords),
            'stopWordCount' => count($overlappingStopWords),
            'stopWords' => $overlappingStopWords,
        ]);
    }

    /**
     * Split punctuation-free text into candidate words on single spaces.
     *
     * For Japanese ('ja') every occurrence of a stop word is first surrounded
     * by spaces, so that stop words become separate candidates.
     *
     * @param string $strippedInput
     *
     * @return array
     */
    public function getCandidateWords(string $strippedInput): array {
        // Simple separating words in Japanese.
        if ($this->getLanguage() === 'ja') {
            $wordList = $this->getWordList();

            // An empty alternation would match at every position and insert
            // spaces everywhere, so only build the pattern when there are words.
            if (count($wordList) > 0) {
                $quoted = array_map(function (string $word): string {
                    // Escape the '/' delimiter as well as regex metacharacters.
                    return preg_quote($word, '/');
                }, $wordList);

                $regexp = '/(' . implode('|', $quoted) . ')/';
                $separated = preg_replace($regexp, ' $1 ', $strippedInput);

                // Keep the original text if PCRE reported an error (null).
                if ($separated !== null) {
                    $strippedInput = $separated;
                }
            }
        }

        return explode(' ', $strippedInput);
    }
}