Completed
Push — master ( 7c1730...75be4d )
by Andrew
05:57
created

StopWords   A

Complexity

Total Complexity 13

Size/Duplication

Total Lines 106
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 2

Importance

Changes 4
Bugs 0 Features 0
Metric Value
wmc 13
c 4
b 0
f 0
lcom 1
cbo 2
dl 0
loc 106
rs 10

7 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 3 1
A config() 0 3 1
A removePunctuation() 0 3 1
A getLanguage() 0 8 2
A getWordList() 0 9 2
A getStopwordCount() 0 21 4
A getCandidateWords() 0 9 2
1
<?php declare(strict_types=1);
2
3
namespace Goose\Text;
4
5
use Goose\Configuration;
6
7
/**
 * Stop Words
 *
 * Reads per-language stop-word lists from
 * ../../resources/text/stopwords-<language>.txt (relative to this file) and
 * reports how many words of a given text appear in the list for the language
 * selected in the configuration.
 *
 * @package Goose\Text
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class StopWords
{
    /** @var Configuration */
    private $config;

    /** @var array<string, string[]> Stop-word lists already read from disk, keyed by language code */
    private $cached = [];

    /** @var string[] Language codes accepted by getLanguage(); anything else falls back to 'en' */
    private $languages = [
        'ar', 'da', 'de', 'en', 'es', 'fi',
        'fr', 'hu', 'id', 'it', 'ja', 'ko',
        'nb', 'nl', 'no', 'pl', 'pt', 'ru',
        'sv', 'zh'
    ];

    /**
     * @param Configuration $config Source of the 'language' setting read by getLanguage()
     */
    public function __construct(Configuration $config) {
        $this->config = $config;
    }

    /**
     * @return Configuration The configuration object passed to the constructor
     */
    public function config(): Configuration {
        return $this->config;
    }

    /**
     * Remove every run of characters matched by the POSIX class [[:punct:]].
     *
     * NOTE(review): the pattern carries no `u` modifier, so presumably only
     * ASCII punctuation is stripped -- confirm before relying on it for
     * non-ASCII punctuation.
     *
     * @param string $str
     *
     * @return string
     */
    public function removePunctuation(string $str): string {
        return preg_replace("/[[:punct:]]+/", '', $str);
    }

    /**
     * Resolve the configured language to one of the supported two-letter codes.
     *
     * Only the part before the first '-' is used ("en-US" becomes "en").
     * Unsupported or missing values fall back to 'en'.
     *
     * @return string
     */
    public function getLanguage(): string {
        // Cast defensively: explode() rejects non-strings under strict_types.
        $parts = explode('-', (string)$this->config()->get('language'));

        // Lowercase BEFORE the whitelist lookup so that "EN" or "De-AT" resolve
        // to their own list instead of silently falling back to English.
        $language = mb_strtolower($parts[0]);

        if (!in_array($language, $this->languages, true)) {
            $language = 'en';
        }

        return $language;
    }

    /**
     * Return the stop-word list for the current language.
     *
     * The list is read from disk once per language and cached on this
     * instance. Line endings are normalised and blank lines are dropped, so
     * the result never contains an empty string.
     *
     * @return string[]
     *
     * @throws \RuntimeException When the stop-word file cannot be read
     */
    public function getWordList(): array {
        $language = $this->getLanguage();

        // Keyed by language so a later change of the configured language does
        // not keep returning the list that was loaded first.
        if (!isset($this->cached[$language])) {
            $file = sprintf(__DIR__ . '/../../resources/text/stopwords-%s.txt', $language);

            $contents = is_readable($file) ? file_get_contents($file) : false;
            if ($contents === false) {
                throw new \RuntimeException(sprintf('Unable to read stop word list "%s"', $file));
            }

            $lines = explode("\n", str_replace(["\r\n", "\r"], "\n", $contents));

            // A trailing newline would otherwise yield an '' entry, which makes
            // empty tokens count as stop words and adds an empty alternative
            // to the regular expression built in getCandidateWords().
            $this->cached[$language] = array_values(array_filter($lines, function (string $line): bool {
                return $line !== '';
            }));
        }

        return $this->cached[$language];
    }

    /**
     * Count the words of $content and how many of them are stop words.
     *
     * Punctuation is removed first, the remainder is split into candidate
     * words, and each candidate is lowercased and looked up in the stop-word
     * list of the current language.
     *
     * @param string $content
     *
     * @return WordStats Built from 'wordCount', 'stopWordCount' and 'stopWords';
     *                   constructed without arguments when $content is ''
     */
    public function getStopwordCount(string $content): WordStats {
        // Compare against '' explicitly: empty() would also discard the text "0".
        if ($content === '') {
            return new WordStats();
        }

        $strippedInput = $this->removePunctuation($content);
        $candidateWords = $this->getCandidateWords($strippedInput);

        // Flip once for exact-match O(1) lookups. Loose in_array() would treat
        // numeric-looking strings such as "1e3" and "1000" as equal.
        $stopWordMap = array_flip($this->getWordList());

        $overlappingStopWords = [];
        foreach ($candidateWords as $w) {
            $word = mb_strtolower($w);

            if (isset($stopWordMap[$word])) {
                $overlappingStopWords[] = $word;
            }
        }

        return new WordStats([
            'wordCount' => count($candidateWords),
            'stopWordCount' => count($overlappingStopWords),
            'stopWords' => $overlappingStopWords,
        ]);
    }

    /**
     * Split punctuation-free text into candidate words on single spaces.
     *
     * For Japanese, every occurrence of a stop word is first surrounded with
     * spaces so that it becomes a token of its own. Consecutive spaces produce
     * empty-string entries in the result.
     *
     * @param string $strippedInput
     *
     * @return array
     */
    public function getCandidateWords(string $strippedInput): array {
        // Simple separating words in Japanese.
        if ($this->getLanguage() === 'ja') {
            // Quote with the delimiter so a '/' inside a stop word cannot
            // terminate the pattern early.
            $quoted = array_map(function (string $word): string {
                return preg_quote($word, '/');
            }, $this->getWordList());

            // An empty alternation would match at every position; skip it.
            if ($quoted !== []) {
                $regexp = '/(' . implode('|', $quoted) . ')/';

                // preg_replace() yields null on a PCRE error; keep the input then.
                $strippedInput = preg_replace($regexp, ' $1 ', $strippedInput) ?? $strippedInput;
            }
        }

        return explode(' ', $strippedInput);
    }
}