Completed
Push — develop ( 23eff0...2f5171 )
by Arkadiusz
03:32
created

TokenCountVectorizer::transformSample()   B

Complexity

Conditions 5
Paths 9

Size

Total Lines 23
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 23
rs 8.5906
c 0
b 0
f 0
cc 5
eloc 13
nc 9
nop 1
1
<?php
2
3
declare (strict_types = 1);
4
5
namespace Phpml\FeatureExtraction;
6
7
use Phpml\Tokenization\Tokenizer;
8
9
class TokenCountVectorizer implements Vectorizer
10
{
11
    /**
12
     * @var Tokenizer
13
     */
14
    private $tokenizer;
15
16
    /**
17
     * @var float
18
     */
19
    private $minDF;
20
21
    /**
22
     * @var array
23
     */
24
    private $vocabulary;
25
26
    /**
27
     * @var array
28
     */
29
    private $tokens;
30
31
    /**
32
     * @var array
33
     */
34
    private $frequencies;
35
36
    /**
37
     * @param Tokenizer $tokenizer
38
     * @param float     $minDF
39
     */
40
    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
41
    {
42
        $this->tokenizer = $tokenizer;
43
        $this->minDF = $minDF;
0 ignored issues
show
Documentation Bug introduced by
It seems like $minDF can also be of type integer. However, the property $minDF is declared as type double. Maybe add an additional type check?

Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account_id instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
44
        $this->vocabulary = [];
45
        $this->frequencies = [];
46
    }
47
48
    /**
     * Converts every sample into an array of token counts keyed by vocabulary index.
     *
     * All samples are tokenized first so the vocabulary is complete before any
     * sample is counted; each sample is then replaced by its count array, and
     * finally the minimum-frequency filter is applied.
     *
     * The cached token lists and the frequency counters are reset on every call,
     * so the frequency ratio used by the filter is computed from the current
     * batch only. The vocabulary itself is kept between calls.
     *
     * @param array $samples
     *
     * @return array
     */
    public function transform(array $samples): array
    {
        // Per-call state: without this reset, counters from an earlier batch
        // would be divided by the size of the current batch.
        $this->tokens = [];
        $this->frequencies = [];

        $this->buildVocabulary($samples);

        $vectors = [];
        foreach ($samples as $index => $unused) {
            $vectors[$index] = $this->transformSample($index);
        }

        return $this->checkDocumentFrequency($vectors);
    }
65
66
    /**
     * Returns the vocabulary flipped, i.e. mapping each index to its token.
     *
     * @return array
     */
    public function getVocabulary()
    {
        $tokensByIndex = array_flip($this->vocabulary);

        return $tokensByIndex;
    }
73
74
    /**
     * Tokenizes each sample, registers every token in the vocabulary and
     * caches the token list under the sample's key for later counting.
     *
     * @param array $samples
     */
    private function buildVocabulary(array &$samples)
    {
        foreach ($samples as $sampleKey => $text) {
            $sampleTokens = $this->tokenizer->tokenize($text);

            foreach ($sampleTokens as $word) {
                $this->addTokenToVocabulary($word);
            }

            // Cached so transformSample() does not have to tokenize again.
            $this->tokens[$sampleKey] = $sampleTokens;
        }
    }
87
88
    /**
     * Builds the token-count array for the sample cached under $index.
     *
     * Every occurrence of a token increments both the sample's count for that
     * token and the global frequency counter, so a token repeated within one
     * sample is counted once per occurrence. Vocabulary entries that do not
     * occur in the sample are filled with 0, and the result is sorted by
     * vocabulary index so that all samples share the same column order.
     *
     * @param int $index key of the sample in the cached token lists
     *
     * @return array token counts keyed by vocabulary index, in ascending key order
     */
    private function transformSample(int $index)
    {
        $counts = [];

        foreach ($this->tokens[$index] as $token) {
            // A separate name keeps the $index parameter intact.
            $tokenIndex = $this->getTokenIndex($token);
            $this->updateFrequency($token);
            $counts[$tokenIndex] = ($counts[$tokenIndex] ?? 0) + 1;
        }

        // Zero-fill the vocabulary entries this sample never mentions.
        foreach ($this->vocabulary as $vocabularyIndex) {
            if (!isset($counts[$vocabularyIndex])) {
                $counts[$vocabularyIndex] = 0;
            }
        }

        // Counts were inserted in order of first appearance; normalise the
        // key order so every returned array lists the columns identically.
        ksort($counts);

        return $counts;
    }
116
117
    /**
     * Looks up the vocabulary index stored for the given token.
     *
     * @param string $token
     *
     * @return int
     */
    private function getTokenIndex(string $token): int
    {
        $position = $this->vocabulary[$token];

        return $position;
    }
126
127
    /**
     * Registers a token in the vocabulary if it is not there yet, assigning it
     * the current vocabulary size as its index.
     *
     * @param string $token
     */
    private function addTokenToVocabulary(string $token)
    {
        if (isset($this->vocabulary[$token])) {
            return;
        }

        $nextIndex = count($this->vocabulary);
        $this->vocabulary[$token] = $nextIndex;
    }
136
137
    /**
     * Increments the frequency counter of a token, starting from 0 the first
     * time the token is seen.
     *
     * @param string $token
     */
    private function updateFrequency(string $token)
    {
        $seenSoFar = $this->frequencies[$token] ?? 0;

        $this->frequencies[$token] = $seenSoFar + 1;
    }
148
149
    /**
     * Applies the minimum-frequency filter to already counted samples.
     *
     * When $minDF is not greater than 0 the samples are returned untouched;
     * otherwise the entries of all tokens below the threshold are set to 0 in
     * every sample.
     *
     * @param array $samples
     *
     * @return array
     */
    private function checkDocumentFrequency(array $samples)
    {
        // Written as a negated "> 0" so the branch taken matches the original
        // comparison for every float value.
        if (!($this->minDF > 0)) {
            return $samples;
        }

        $rareIndexes = $this->getBeyondMinimumIndexes(count($samples));

        $filtered = [];
        foreach ($samples as $key => $vector) {
            $filtered[$key] = $this->resetBeyondMinimum($vector, $rareIndexes);
        }

        return $filtered;
    }
165
166
    /**
     * Returns a copy of the sample in which every listed index is set to 0.
     *
     * @param array $sample        token counts keyed by vocabulary index
     * @param array $beyondMinimum vocabulary indexes to zero out
     *
     * @return array
     */
    private function resetBeyondMinimum(array $sample, array $beyondMinimum)
    {
        // Existing keys keep their position; keys not yet present are appended.
        $zeroes = array_fill_keys($beyondMinimum, 0);

        return array_replace($sample, $zeroes);
    }
180
181
    /**
     * Collects the vocabulary indexes of all tokens whose frequency, divided
     * by the number of samples, is lower than $minDF.
     *
     * @param int $samplesCount number of samples in the current batch
     *
     * @return array list of vocabulary indexes
     */
    private function getBeyondMinimumIndexes(int $samplesCount)
    {
        // No samples means no ratio can be computed; avoid dividing by zero.
        if ($samplesCount === 0) {
            return [];
        }

        $indexes = [];
        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) < $this->minDF) {
                // PHP stores numeric-string array keys (e.g. "1550") as
                // integers, so the key must be cast back before it is passed
                // to the string-typed getTokenIndex() under strict_types.
                $indexes[] = $this->getTokenIndex((string) $token);
            }
        }

        return $indexes;
    }
197
}
198