|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
declare (strict_types = 1); |
|
4
|
|
|
|
|
5
|
|
|
namespace Phpml\FeatureExtraction; |
|
6
|
|
|
|
|
7
|
|
|
use Phpml\Tokenization\Tokenizer; |
|
8
|
|
|
|
|
9
|
|
|
class TokenCountVectorizer implements Vectorizer |
|
10
|
|
|
{ |
|
11
|
|
|
/** |
|
12
|
|
|
* @var Tokenizer |
|
13
|
|
|
*/ |
|
14
|
|
|
private $tokenizer; |
|
15
|
|
|
|
|
16
|
|
|
/** |
|
17
|
|
|
* @var float |
|
18
|
|
|
*/ |
|
19
|
|
|
private $minDF; |
|
20
|
|
|
|
|
21
|
|
|
/** |
|
22
|
|
|
* @var array |
|
23
|
|
|
*/ |
|
24
|
|
|
private $vocabulary; |
|
25
|
|
|
|
|
26
|
|
|
/** |
|
27
|
|
|
* @var array |
|
28
|
|
|
*/ |
|
29
|
|
|
private $tokens; |
|
30
|
|
|
|
|
31
|
|
|
/** |
|
32
|
|
|
* @var array |
|
33
|
|
|
*/ |
|
34
|
|
|
private $frequencies; |
|
35
|
|
|
|
|
36
|
|
|
/**
 * Creates a vectorizer that turns samples into per-token count vectors.
 *
 * @param Tokenizer $tokenizer used to split every sample into tokens
 * @param float     $minDF     minimum frequency ratio a token must reach to keep
 *                             its counts; any value that is not greater than 0
 *                             disables the filtering step entirely
 */
public function __construct(Tokenizer $tokenizer, float $minDF = 0.0)
{
    $this->tokenizer = $tokenizer;
    // The default is written as a float literal so the value stored here is
    // unambiguously a float, matching the property's documented type.
    $this->minDF = $minDF;

    // Every internal store starts as an empty array so each property holds
    // the array type its docblock declares before transform() is ever called.
    $this->vocabulary = [];
    $this->tokens = [];
    $this->frequencies = [];
}
|
47
|
|
|
|
|
48
|
|
|
/**
 * Converts samples into token-count vectors.
 *
 * Tokens seen in the samples are added to the vocabulary, each sample is
 * replaced by an array mapping token index to occurrence count, and finally
 * the minimum-frequency filter is applied.
 *
 * @param array $samples samples to vectorize; keys must be integers because
 *                       they are forwarded to transformSample(int $index)
 *
 * @return array the count vectors, stored under the same keys as the input
 */
public function transform(array $samples): array
{
    // Token lists and frequency counters describe one batch only. Without
    // this reset, counts left over from an earlier call would be divided by
    // the size of the current batch and distort the minimum-frequency check.
    $this->tokens = [];
    $this->frequencies = [];

    $this->buildVocabulary($samples);

    foreach ($samples as $index => $sample) {
        $samples[$index] = $this->transformSample($index);
    }

    return $this->checkDocumentFrequency($samples);
}
|
65
|
|
|
|
|
66
|
|
|
/**
 * Returns the vocabulary with keys and values swapped.
 *
 * Internally the vocabulary maps token => index; the flipped result maps
 * index => token.
 *
 * @return array
 */
public function getVocabulary()
{
    $tokensByIndex = array_flip($this->vocabulary);

    return $tokensByIndex;
}
|
73
|
|
|
|
|
74
|
|
|
/**
 * Tokenizes every sample, registers each token in the vocabulary and keeps
 * the token list of each sample under that sample's key.
 *
 * @param array $samples samples handed to the tokenizer one by one
 */
private function buildVocabulary(array &$samples)
{
    foreach ($samples as $sampleKey => $text) {
        $words = $this->tokenizer->tokenize($text);

        foreach ($words as $word) {
            $this->addTokenToVocabulary($word);
        }

        // Stored so transformSample() can count without tokenizing again.
        $this->tokens[$sampleKey] = $words;
    }
}
|
87
|
|
|
|
|
88
|
|
|
/**
 * Builds the count vector for one previously tokenized sample.
 *
 * The result has one entry per vocabulary index (0 for tokens the sample
 * does not contain) and is sorted by index, so every sample exposes its
 * features in the same order.
 *
 * @param int $index key of the sample inside the stored token lists
 *
 * @return array token index => number of occurrences in the sample
 */
private function transformSample(int $index)
{
    $counts = [];

    foreach ($this->tokens[$index] as $token) {
        $tokenIndex = $this->getTokenIndex($token);

        if (!isset($counts[$tokenIndex])) {
            // First time this token shows up in the sample. The frequency
            // is bumped here only, so it counts samples containing the token
            // and repeated occurrences inside one sample cannot inflate it.
            $this->updateFrequency($token);
            $counts[$tokenIndex] = 0;
        }

        ++$counts[$tokenIndex];
    }

    // Pad with zeros for every vocabulary entry absent from this sample.
    foreach ($this->vocabulary as $vocabularyIndex) {
        if (!isset($counts[$vocabularyIndex])) {
            $counts[$vocabularyIndex] = 0;
        }
    }

    // Entries were inserted in encounter order; sort by index so positional
    // consumers (iteration, array_values, serialization) see aligned columns.
    ksort($counts);

    return $counts;
}
|
116
|
|
|
|
|
117
|
|
|
/**
 * Looks up the vocabulary index assigned to a token.
 *
 * @param string $token token used as key into the vocabulary
 *
 * @return int the value stored in the vocabulary for that token
 */
private function getTokenIndex(string $token): int
{
    $position = $this->vocabulary[$token];

    return $position;
}
|
126
|
|
|
|
|
127
|
|
|
/**
 * Registers a token in the vocabulary unless it is already known.
 *
 * A new token receives the current vocabulary size as its index.
 *
 * @param string $token
 */
private function addTokenToVocabulary(string $token)
{
    if (isset($this->vocabulary[$token])) {
        return;
    }

    $nextIndex = count($this->vocabulary);
    $this->vocabulary[$token] = $nextIndex;
}
|
136
|
|
|
|
|
137
|
|
|
/**
 * Adds one to the frequency counter of a token, starting from zero when the
 * token has no counter yet.
 *
 * @param string $token
 */
private function updateFrequency(string $token)
{
    $seenSoFar = $this->frequencies[$token] ?? 0;

    $this->frequencies[$token] = $seenSoFar + 1;
}
|
148
|
|
|
|
|
149
|
|
|
/**
 * Applies the minimum-frequency filter to already vectorized samples.
 *
 * When the configured minimum is not greater than 0 the samples are returned
 * untouched; otherwise the counts of every token below the minimum are set
 * to 0 in each sample.
 *
 * @param array $samples count vectors, one per sample
 *
 * @return array the filtered count vectors under the same keys
 */
private function checkDocumentFrequency(array $samples)
{
    if (!($this->minDF > 0)) {
        return $samples;
    }

    $rareIndexes = $this->getBeyondMinimumIndexes(count($samples));

    foreach (array_keys($samples) as $sampleKey) {
        $samples[$sampleKey] = $this->resetBeyondMinimum($samples[$sampleKey], $rareIndexes);
    }

    return $samples;
}
|
165
|
|
|
|
|
166
|
|
|
/**
 * Sets the given indexes of a count vector to 0.
 *
 * Indexes already present keep their position; indexes missing from the
 * vector are appended with the value 0.
 *
 * @param array $sample        count vector to adjust
 * @param array $beyondMinimum indexes whose counts must be zeroed
 *
 * @return array the adjusted copy of the count vector
 */
private function resetBeyondMinimum(array $sample, array $beyondMinimum)
{
    $zeroedEntries = array_fill_keys($beyondMinimum, 0);

    return array_replace($sample, $zeroedEntries);
}
|
180
|
|
|
|
|
181
|
|
|
/**
 * Collects the vocabulary indexes of all tokens whose frequency, divided by
 * the number of samples, is lower than the configured minimum.
 *
 * @param int $samplesCount number of samples the frequencies were counted over
 *
 * @return array list of token indexes that fall below the minimum
 */
private function getBeyondMinimumIndexes(int $samplesCount)
{
    // With no samples there is no ratio to compute and nothing to reset;
    // returning early also avoids a division by zero.
    if ($samplesCount === 0) {
        return [];
    }

    $indexes = [];

    foreach ($this->frequencies as $token => $frequency) {
        if (($frequency / $samplesCount) < $this->minDF) {
            // PHP turns numeric-string array keys such as "2017" into ints,
            // so the key is cast back before it reaches the string-typed
            // getTokenIndex(); under strict_types an int would be rejected.
            $indexes[] = $this->getTokenIndex((string) $token);
        }
    }

    return $indexes;
}
|
197
|
|
|
} |
|
198
|
|
|
|
Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.
For example, imagine you have a variable
$accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false. Either this assignment is in error or a type check should be added for that assignment.