Completed
Push — develop ( 7f4a0b...601ff8 )
by Arkadiusz
03:21
created

TokenCountVectorizer   A

Complexity

Total Complexity 28

Size/Duplication

Total Lines 179
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 1

Importance

Changes 1
Bugs 0 Features 1
Metric Value
wmc 28
c 1
b 0
f 1
lcom 1
cbo 1
dl 0
loc 179
rs 10

12 Methods

Rating   Name   Duplication   Size   Complexity  
A updateFrequency() 0 8 2
A __construct() 0 7 1
A fit() 0 4 1
A transform() 0 8 2
A getVocabulary() 0 4 1
A buildVocabulary() 0 9 3
B transformSample() 0 25 6
A getTokenIndex() 0 4 2
A addTokenToVocabulary() 0 6 2
A checkDocumentFrequency() 0 9 3
A resetBeyondMinimum() 0 6 2
A getBeyondMinimumIndexes() 0 11 3
1
<?php
2
3
declare (strict_types = 1);
4
5
namespace Phpml\FeatureExtraction;
6
7
use Phpml\Tokenization\Tokenizer;
8
use Phpml\Transformer;
9
10
class TokenCountVectorizer implements Transformer
11
{
12
    /**
13
     * @var Tokenizer
14
     */
15
    private $tokenizer;
16
17
    /**
18
     * @var float
19
     */
20
    private $minDF;
21
22
    /**
23
     * @var array
24
     */
25
    private $vocabulary;
26
27
    /**
28
     * @var array
29
     */
30
    private $frequencies;
31
32
    /**
     * Stores the tokenizer and the minimum document-frequency threshold and
     * starts with an empty vocabulary and an empty frequency table.
     *
33
     * @param Tokenizer $tokenizer splits each sample into tokens in fit() and transform()
34
     * @param float     $minDF     threshold compared against frequency / sample count;
     *                             filtering only happens when it is greater than 0
35
     */
36
    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
37
    {
38
        $this->tokenizer = $tokenizer;
39
        $this->minDF = $minDF;
0 ignored issues
show
Documentation Bug introduced by
It seems like $minDF can also be of type integer. However, the property $minDF is declared as type double. Maybe add an additional type check?

Our type inference engine has found a suspicous assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
40
        $this->vocabulary = [];
41
        $this->frequencies = [];
42
    }
43
44
    /**
     * Learns the vocabulary from the given samples.
     *
     * Every token the tokenizer produces for a sample is registered in the
     * vocabulary. The samples are received by value, so the caller's array
     * is not modified.
     *
     * @param array $samples samples handed one by one to the tokenizer
     */
    public function fit(array $samples)
    {
        $this->buildVocabulary($samples);
    }
51
52
    /**
     * Replaces every sample, in place, with its token-count vector and then
     * applies the minimum document-frequency filter.
     *
     * @param array $samples samples to convert; modified by reference
     */
    public function transform(array &$samples)
    {
        foreach ($samples as &$sample) {
            $this->transformSample($sample);
        }
        // Break the reference left behind by the loop so that no later use of
        // $sample can silently overwrite the last element of $samples.
        unset($sample);

        $this->checkDocumentFrequency($samples);
    }
63
64
    /**
     * Returns the learned vocabulary keyed by token index.
     *
     * Internally the vocabulary maps token => index; this accessor exposes
     * the inverse mapping.
     *
     * @return array index => token
     */
    public function getVocabulary()
    {
        $tokensByIndex = [];
        foreach ($this->vocabulary as $token => $index) {
            $tokensByIndex[$index] = $token;
        }

        return $tokensByIndex;
    }
71
72
    /**
     * Tokenizes each sample and registers every token in the vocabulary.
     *
     * @param array $samples samples to tokenize; received by reference but not modified here
     */
    private function buildVocabulary(array &$samples)
    {
        foreach ($samples as $document) {
            foreach ($this->tokenizer->tokenize($document) as $term) {
                $this->addTokenToVocabulary($term);
            }
        }
    }
84
85
    /**
     * Converts one sample, in place, into its token-count vector.
     *
     * The resulting array has one entry per vocabulary index holding the
     * number of times the corresponding token occurs in the sample (0 when it
     * does not occur). Tokens missing from the vocabulary are ignored. Every
     * counted occurrence also increments the token's running frequency.
     *
     * @param string $sample text on input; array of counts keyed by token index on output
     */
    private function transformSample(string &$sample)
    {
        $counts = [];
        $tokens = $this->tokenizer->tokenize($sample);

        foreach ($tokens as $token) {
            $index = $this->getTokenIndex($token);
            if (false === $index) {
                // Token was never seen while fitting: not part of the feature space.
                continue;
            }

            $this->updateFrequency($token);
            $counts[$index] = ($counts[$index] ?? 0) + 1;
        }

        // Give every vocabulary entry a column, even when the token is absent.
        foreach ($this->vocabulary as $index) {
            if (!isset($counts[$index])) {
                $counts[$index] = 0;
            }
        }

        // Without sorting, keys appear in token-occurrence order, so two
        // samples would list the same features in different positions.
        // Ordering by vocabulary index keeps all vectors aligned.
        ksort($counts);

        $sample = $counts;
    }
113
114
    /**
     * Looks up the vocabulary index assigned to a token.
     *
     * @param string $token
     *
     * @return int|bool the token's index, or false when the token is not in the vocabulary
     */
    private function getTokenIndex(string $token)
    {
        return $this->vocabulary[$token] ?? false;
    }
123
124
    /**
     * Registers a token in the vocabulary unless it is already known.
     *
     * A new token receives the current vocabulary size as its index, so
     * indexes are handed out consecutively starting at 0.
     *
     * @param string $token
     */
    private function addTokenToVocabulary(string $token)
    {
        if (isset($this->vocabulary[$token])) {
            return;
        }

        $nextIndex = count($this->vocabulary);
        $this->vocabulary[$token] = $nextIndex;
    }
133
134
    /**
     * Increments the running occurrence counter of a token, creating the
     * counter on first use.
     *
     * @param string $token
     */
    private function updateFrequency(string $token)
    {
        $seenSoFar = $this->frequencies[$token] ?? 0;
        $this->frequencies[$token] = $seenSoFar + 1;
    }
145
146
    /**
     * Zeroes, in every sample, the counts of tokens whose frequency relative
     * to the number of samples is below minDF.
     *
     * Does nothing when minDF is not greater than 0 or when there are no
     * samples (nothing to reset, and the ratio would divide by zero).
     *
     * @param array $samples count vectors; modified by reference
     */
    private function checkDocumentFrequency(array &$samples)
    {
        $samplesCount = count($samples);

        if ($this->minDF > 0 && $samplesCount > 0) {
            $beyondMinimum = $this->getBeyondMinimumIndexes($samplesCount);
            foreach ($samples as &$sample) {
                $this->resetBeyondMinimum($sample, $beyondMinimum);
            }
            // Drop the loop reference so $sample no longer aliases the last element.
            unset($sample);
        }
    }
160
161
    /**
     * Sets the given feature columns of one sample to zero.
     *
     * @param array $sample        count vector; modified by reference
     * @param array $beyondMinimum indexes of the columns to reset
     */
    private function resetBeyondMinimum(array &$sample, array $beyondMinimum)
    {
        foreach ($beyondMinimum as $featureIndex) {
            $sample[$featureIndex] = 0;
        }
    }
171
172
    /**
     * Collects the vocabulary indexes of all tokens whose recorded frequency
     * divided by the number of samples is below minDF.
     *
     * @param int $samplesCount number of samples the frequencies are compared against
     *
     * @return array list of vocabulary indexes to reset; empty when there are no samples
     */
    private function getBeyondMinimumIndexes(int $samplesCount)
    {
        $indexes = [];

        // No samples means no ratio can be computed; avoid dividing by zero.
        if (0 === $samplesCount) {
            return $indexes;
        }

        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) < $this->minDF) {
                // PHP stores numeric-string keys such as "2016" as integers.
                // Cast back so the string-typed lookup does not raise a
                // TypeError under strict_types.
                $indexes[] = $this->getTokenIndex((string) $token);
            }
        }

        return $indexes;
    }
188
}
189