Completed
Push — develop ( a2aa27...cce689 )
by Arkadiusz
02:48
created

TokenCountVectorizer::resetBeyondMinimum()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
c 1
b 0
f 1
dl 0
loc 6
rs 9.4285
cc 2
eloc 3
nc 2
nop 2
1
<?php
2
3
declare (strict_types = 1);
4
5
namespace Phpml\FeatureExtraction;
6
7
use Phpml\Tokenization\Tokenizer;
8
use Phpml\Transformer;
9
10
class TokenCountVectorizer implements Transformer
11
{
12
    /**
13
     * @var Tokenizer
14
     */
15
    private $tokenizer;
16
17
    /**
18
     * @var StopWords
19
     */
20
    private $stopWords;
21
22
    /**
23
     * @var float
24
     */
25
    private $minDF;
26
27
    /**
28
     * @var array
29
     */
30
    private $vocabulary;
31
32
    /**
33
     * @var array
34
     */
35
    private $frequencies;
36
37
    /**
38
     * @param Tokenizer $tokenizer
39
     * @param StopWords $stopWords
40
     * @param float     $minDF
41
     */
42
    public function __construct(Tokenizer $tokenizer, StopWords $stopWords = null, float $minDF = 0)
43
    {
44
        $this->tokenizer = $tokenizer;
45
        $this->stopWords = $stopWords;
46
        $this->minDF = $minDF;
0 ignored issues
show
Documentation Bug introduced by
It seems like $minDF can also be of type integer. However, the property $minDF is declared as type double. Maybe add an additional type check?

Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account_id instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
47
48
        $this->vocabulary = [];
49
        $this->frequencies = [];
50
    }
51
52
    /**
     * Learns the vocabulary from the given samples.
     *
     * Each sample is handed to the tokenizer and every resulting token that
     * is not a stop word gets registered in the vocabulary. The samples
     * themselves are left untouched.
     *
     * @param array $samples samples to tokenize (presumably raw text strings)
     */
    public function fit(array $samples)
    {
        $this->buildVocabulary($samples);
    }
59
60
    /**
     * Replaces every sample, in place, with its token-count vector.
     *
     * Each sample is first converted by transformSample(); afterwards the
     * minimum document frequency filter is applied to the whole set.
     *
     * @param array $samples samples to vectorize; modified by reference
     */
    public function transform(array &$samples)
    {
        foreach ($samples as &$sample) {
            $this->transformSample($sample);
        }
        // Break the reference so $sample no longer aliases the last element.
        unset($sample);

        $this->checkDocumentFrequency($samples);
    }
71
72
    /**
     * Returns the learned vocabulary keyed by feature index.
     *
     * Internally tokens map to their index; this accessor flips that map so
     * each index points at its token.
     *
     * @return array map of feature index => token
     */
    public function getVocabulary()
    {
        $tokensByIndex = array_flip($this->vocabulary);

        return $tokensByIndex;
    }
79
80
    /**
     * Tokenizes every sample and registers each token in the vocabulary.
     *
     * Stop words and already-known tokens are ignored by
     * addTokenToVocabulary(), so calling this repeatedly only appends tokens
     * that have not been seen before.
     *
     * @param array $samples samples to tokenize; taken by reference but not modified
     */
    private function buildVocabulary(array &$samples)
    {
        foreach ($samples as $sample) {
            foreach ($this->tokenizer->tokenize($sample) as $token) {
                $this->addTokenToVocabulary($token);
            }
        }
    }
92
93
    /**
     * Replaces a single sample, in place, with its token-count vector.
     *
     * The resulting array has one entry per vocabulary index (0 for tokens
     * that do not occur in the sample) and is ordered by index, so every
     * transformed sample exposes its features in the same order.
     *
     * NOTE(review): the frequency counter is bumped once per token
     * occurrence, not once per sample; confirm this is the intended meaning
     * of "document frequency" for the minDF filter.
     *
     * @param string $sample text to vectorize; overwritten with the count array
     */
    private function transformSample(string &$sample)
    {
        $counts = [];
        $tokens = $this->tokenizer->tokenize($sample);

        foreach ($tokens as $token) {
            $index = $this->getTokenIndex($token);
            if (false === $index) {
                // Stop word or token outside the vocabulary: not a feature.
                continue;
            }

            $this->updateFrequency($token);
            $counts[$index] = ($counts[$index] ?? 0) + 1;
        }

        // Make sure every known feature is present, even when absent from the sample.
        foreach ($this->vocabulary as $index) {
            if (!isset($counts[$index])) {
                $counts[$index] = 0;
            }
        }

        // Keys were inserted in encounter order; sort them so the feature
        // order is identical across samples.
        ksort($counts);

        $sample = $counts;
    }
121
122
    /**
     * Looks up the feature index assigned to a token.
     *
     * @param string $token token to resolve
     *
     * @return int|bool the vocabulary index, or false when the token is a
     *                  stop word or is not part of the vocabulary
     */
    private function getTokenIndex(string $token)
    {
        $isKnown = !$this->isStopWord($token) && isset($this->vocabulary[$token]);

        return $isKnown ? $this->vocabulary[$token] : false;
    }
135
136
    /**
     * Registers a token in the vocabulary unless it is a stop word or is
     * already known.
     *
     * A new token receives the current vocabulary size as its index, so
     * indexes are assigned sequentially starting at 0.
     *
     * @param string $token token to register
     */
    private function addTokenToVocabulary(string $token)
    {
        if ($this->isStopWord($token) || isset($this->vocabulary[$token])) {
            return;
        }

        $this->vocabulary[$token] = count($this->vocabulary);
    }
149
150
    /**
     * Tells whether the token should be ignored as a stop word.
     *
     * @param string $token token to check
     *
     * @return bool false when no stop-word list was configured, otherwise
     *              the answer of the configured list
     */
    private function isStopWord(string $token): bool
    {
        if (!$this->stopWords) {
            return false;
        }

        return (bool) $this->stopWords->isStopWord($token);
    }
159
160
    /**
     * Increments the occurrence counter kept for a token, starting it at
     * zero the first time the token is seen.
     *
     * @param string $token token whose counter is incremented
     */
    private function updateFrequency(string $token)
    {
        $this->frequencies[$token] = ($this->frequencies[$token] ?? 0) + 1;
    }
171
172
    /**
     * Applies the minimum document frequency filter to transformed samples.
     *
     * When minDF is greater than zero, the count of every feature whose
     * relative frequency falls below minDF is reset to 0 in each sample.
     * Nothing is returned; the samples are modified in place.
     *
     * @param array $samples transformed samples; modified by reference
     */
    private function checkDocumentFrequency(array &$samples)
    {
        if (!($this->minDF > 0)) {
            return;
        }

        $beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));
        foreach ($samples as &$sample) {
            $this->resetBeyondMinimum($sample, $beyondMinimum);
        }
        // Break the reference so $sample no longer aliases the last element.
        unset($sample);
    }
186
187
    /**
     * Zeroes the given feature indexes in a single transformed sample.
     *
     * @param array $sample        count vector to adjust; modified by reference
     * @param array $beyondMinimum feature indexes whose count must become 0
     */
    private function resetBeyondMinimum(array &$sample, array $beyondMinimum)
    {
        foreach ($beyondMinimum as $featureIndex) {
            $sample[$featureIndex] = 0;
        }
    }
197
198
    /**
     * Collects the feature indexes whose relative frequency is below minDF.
     *
     * The relative frequency of a token is its recorded frequency divided by
     * the number of samples.
     *
     * @param int $samplesCount number of samples the frequencies are compared against
     *
     * @return array list of vocabulary indexes to reset; empty when there are no samples
     */
    private function getBeyondMinimumIndexes(int $samplesCount)
    {
        // Without samples there is nothing to filter, and dividing by zero must be avoided.
        if (0 === $samplesCount) {
            return [];
        }

        $indexes = [];
        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) < $this->minDF) {
                // PHP turns numeric-string array keys into integers; cast back
                // so the strictly typed string parameter does not raise a TypeError.
                $index = $this->getTokenIndex((string) $token);
                if (false !== $index) {
                    $indexes[] = $index;
                }
            }
        }

        return $indexes;
    }
214
}
215