StopWords::removePunctuation()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 2
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 2
c 0
b 0
f 0
rs 10
cc 1
eloc 1
nc 1
nop 1
1
<?php declare(strict_types=1);
2
3
namespace Goose\Text;
4
5
use Goose\Configuration;
6
7
/**
 * Stop Words
 *
 * Loads the stop-word list for the configured language and counts how many
 * words of a text appear in that list.
 *
 * @package Goose\Text
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class StopWords
{
    /** @var Configuration */
    private $config;

    /** @var array<string, string[]> Stop-word lists already read from disk, keyed by language code. */
    private $cached = [];

    /** @var string[] Language codes accepted by getLanguage(); anything else falls back to 'en'. */
    private $languages = [
        'ar', 'cs', 'da', 'de', 'en', 'es',
        'fi', 'fr', 'hu', 'id', 'it', 'ja',
        'ko', 'nb', 'nl', 'no', 'pl', 'pt',
        'ru', 'sv', 'zh', 'vi'
    ];

    /**
     * @param Configuration $config
     */
    public function __construct(Configuration $config) {
        $this->config = $config;
    }

    /**
     * @return Configuration
     */
    public function config(): Configuration {
        return $this->config;
    }

    /**
     * Strip every run of POSIX punctuation characters from a string.
     *
     * @param string $str
     *
     * @return string The input without punctuation, or the unchanged input if the regex engine reports an error.
     */
    public function removePunctuation(string $str): string {
        // preg_replace() returns null on a PCRE error, which would violate the
        // declared string return type under strict_types.
        return preg_replace('/[[:punct:]]+/', '', $str) ?? $str;
    }

    /**
     * Resolve the two-letter language code used to pick a stop-word list.
     *
     * Reads the 'language' configuration value, keeps the part before the first
     * '-', lowercases it, and falls back to 'en' when the value is missing or
     * not one of the supported languages.
     *
     * @return string
     */
    public function getLanguage(): string {
        $configured = $this->config()->get('language');

        // The setting may be absent (null); explode() would reject that under strict_types.
        if (!is_string($configured)) {
            return 'en';
        }

        // Normalise case *before* the lookup so that "EN-us" or "FR" are recognised
        // instead of silently falling back to English.
        $language = mb_strtolower(explode('-', $configured)[0]);

        return in_array($language, $this->languages, true) ? $language : 'en';
    }

    /**
     * Return the stop words for the current language.
     *
     * The list is read from the bundled resource file once per language and
     * then served from memory.
     *
     * @return string[] Zero-indexed list of stop words.
     *
     * @throws \RuntimeException If the stop-word file cannot be read.
     */
    public function getWordList(): array {
        $language = $this->getLanguage();

        // isset() rather than empty(): a legitimately empty list must not be re-read on every call.
        if (!isset($this->cached[$language])) {
            $this->cached[$language] = $this->readWordList($language);
        }

        return $this->cached[$language];
    }

    /**
     * Count the words of a text and how many of them are stop words.
     *
     * @param string $content
     *
     * @return WordStats
     *
     * @throws \RuntimeException If the stop-word file cannot be read.
     */
    public function getStopwordCount(string $content): WordStats {
        // Compare against '' instead of using empty(): the text "0" is a word, not empty input.
        if ($content === '') {
            return new WordStats();
        }

        $candidateWords = $this->getCandidateWords($this->removePunctuation($content));

        // Flip once so every membership test is a hash lookup rather than a scan of the whole list.
        $stopWords = array_flip($this->getWordList());

        $overlappingStopWords = [];
        foreach ($candidateWords as $word) {
            $normalized = mb_strtolower($word);

            if (isset($stopWords[$normalized])) {
                $overlappingStopWords[] = $normalized;
            }
        }

        return new WordStats([
            'wordCount' => count($candidateWords),
            'stopWordCount' => count($overlappingStopWords),
            'stopWords' => $overlappingStopWords,
        ]);
    }

    /**
     * Split punctuation-free text into candidate words on single spaces.
     *
     * Consecutive spaces produce empty-string entries, which are kept in the result.
     *
     * @param string $strippedInput
     *
     * @return string[]
     *
     * @throws \RuntimeException If the language is 'ja' and the stop-word file cannot be read.
     */
    public function getCandidateWords(string $strippedInput): array {
        // Simple word separation for Japanese: surround every stop word with spaces
        // so the explode() below can split on them.
        if ($this->getLanguage() === 'ja') {
            $wordList = $this->getWordList();

            // An empty list would build the pattern '/()/', which matches everywhere.
            if ($wordList !== []) {
                // Quote with the '/' delimiter so a stop word containing '/' cannot end the pattern early.
                $quoted = array_map(function (string $word): string {
                    return preg_quote($word, '/');
                }, $wordList);

                $regexp = '/(' . implode('|', $quoted) . ')/';

                // Keep the unsegmented text if the regex engine reports an error (null result).
                $strippedInput = preg_replace($regexp, ' $1 ', $strippedInput) ?? $strippedInput;
            }
        }

        return explode(' ', $strippedInput);
    }

    /**
     * Read and parse the stop-word resource file for one language.
     *
     * Blank lines and lines starting with '#' are skipped; surrounding
     * whitespace is removed from every entry.
     *
     * @param string $language
     *
     * @return string[]
     *
     * @throws \RuntimeException If the file cannot be read.
     */
    private function readWordList(string $language): array {
        // Plain concatenation: __DIR__ must not be part of a sprintf() format
        // string, because a '%' in the install path would be parsed as a directive.
        $file = __DIR__ . '/../../resources/text/stopwords-' . $language . '.txt';

        $contents = is_readable($file) ? file_get_contents($file) : false;
        if ($contents === false) {
            throw new \RuntimeException(sprintf('Unable to read stop word list "%s".', $file));
        }

        $words = [];
        foreach (explode("\n", str_replace(["\r\n", "\r"], "\n", $contents)) as $line) {
            $word = trim($line);

            // Ignore empty lines and lines starting with '#'.
            if ($word === '' || $word[0] === '#') {
                continue;
            }

            $words[] = $word;
        }

        return $words;
    }
}