1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare (strict_types = 1); |
4
|
|
|
|
5
|
|
|
namespace Phpml\FeatureExtraction; |
6
|
|
|
|
7
|
|
|
use Phpml\Tokenization\Tokenizer; |
8
|
|
|
use Phpml\Transformer; |
9
|
|
|
|
10
|
|
|
/**
 * Turns text samples into token-count vectors.
 *
 * fit() tokenizes the given samples and assigns every distinct token a
 * zero-based index. transform() then replaces each sample (in place) with an
 * array keyed by those indexes whose values are the number of times the
 * corresponding token occurs in the sample. When a minimum document frequency
 * is configured, counts of tokens found in too small a share of the
 * transformed samples are reset to 0.
 */
class TokenCountVectorizer implements Transformer
{
    /**
     * @var Tokenizer
     */
    private $tokenizer;

    /**
     * Minimum share of samples (0..1) a token must occur in to keep its counts.
     *
     * @var float
     */
    private $minDF;

    /**
     * Map of token => zero-based index, in order of first appearance in fit().
     *
     * @var array
     */
    private $vocabulary;

    /**
     * Map of token => number of samples of the current transform() call that
     * contain the token.
     *
     * @var array
     */
    private $frequencies;

    /**
     * @param Tokenizer $tokenizer splits a sample into tokens
     * @param float     $minDF     minimum document frequency; 0 disables the filter
     */
    public function __construct(Tokenizer $tokenizer, float $minDF = 0.0)
    {
        $this->tokenizer = $tokenizer;
        $this->minDF = $minDF;

        $this->vocabulary = [];
        $this->frequencies = [];
    }

    /**
     * Adds every token found in the samples to the vocabulary.
     *
     * @param array $samples
     */
    public function fit(array $samples)
    {
        $this->buildVocabulary($samples);
    }

    /**
     * Replaces each sample, in place, with its token-count vector.
     *
     * @param array $samples
     */
    public function transform(array &$samples)
    {
        // Document frequencies are compared against count($samples) of this
        // call, so they must not carry over from a previous call.
        $this->frequencies = [];

        foreach ($samples as &$sample) {
            $this->transformSample($sample);
        }
        // Break the reference so later writes to $sample cannot alter the last element.
        unset($sample);

        $this->checkDocumentFrequency($samples);
    }

    /**
     * Returns the vocabulary as a map of index => token.
     *
     * @return array
     */
    public function getVocabulary()
    {
        return array_flip($this->vocabulary);
    }

    /**
     * Tokenizes every sample and registers each token in the vocabulary.
     *
     * @param array $samples
     */
    private function buildVocabulary(array &$samples)
    {
        foreach ($samples as $sample) {
            foreach ($this->tokenizer->tokenize($sample) as $token) {
                $this->addTokenToVocabulary($token);
            }
        }
    }

    /**
     * Replaces a single sample with its token-count vector.
     *
     * Tokens that are not part of the vocabulary are ignored. The resulting
     * array has one entry per vocabulary index, in ascending index order, so
     * every transformed sample iterates in the same order.
     *
     * @param string $sample
     */
    private function transformSample(string &$sample)
    {
        // Vocabulary values are the indexes 0..n-1 in insertion order, so this
        // yields a zero-filled vector that is already sorted by index.
        $counts = array_fill_keys($this->vocabulary, 0);
        $seenTokens = [];

        foreach ($this->tokenizer->tokenize($sample) as $token) {
            $index = $this->getTokenIndex($token);
            if (false === $index) {
                continue;
            }

            ++$counts[$index];

            // Document frequency counts a token once per sample, no matter how
            // often it is repeated inside that sample.
            if (!isset($seenTokens[$token])) {
                $seenTokens[$token] = true;
                $this->updateFrequency($token);
            }
        }

        $sample = $counts;
    }

    /**
     * Looks up the vocabulary index of a token.
     *
     * @param string $token
     *
     * @return int|bool the index, or false when the token is unknown
     */
    private function getTokenIndex(string $token)
    {
        return $this->vocabulary[$token] ?? false;
    }

    /**
     * Assigns the next free index to a token that is not yet in the vocabulary.
     *
     * @param string $token
     */
    private function addTokenToVocabulary(string $token)
    {
        if (!isset($this->vocabulary[$token])) {
            $this->vocabulary[$token] = count($this->vocabulary);
        }
    }

    /**
     * Increments the frequency counter of a token, creating it when missing.
     *
     * @param string $token
     */
    private function updateFrequency(string $token)
    {
        if (!isset($this->frequencies[$token])) {
            $this->frequencies[$token] = 0;
        }

        ++$this->frequencies[$token];
    }

    /**
     * Zeroes the counts of tokens whose document frequency is below minDF.
     *
     * Does nothing when the filter is disabled (minDF <= 0) or when there are
     * no samples.
     *
     * @param array $samples transformed samples, modified in place
     */
    private function checkDocumentFrequency(array &$samples)
    {
        if ($this->minDF <= 0 || [] === $samples) {
            return;
        }

        $beyondMinimum = $this->getBeyondMinimumIndexes(count($samples));
        foreach ($samples as &$sample) {
            $this->resetBeyondMinimum($sample, $beyondMinimum);
        }
        unset($sample);
    }

    /**
     * Sets the count at each of the given indexes to 0.
     *
     * @param array $sample
     * @param array $beyondMinimum vocabulary indexes to reset
     */
    private function resetBeyondMinimum(array &$sample, array $beyondMinimum)
    {
        foreach ($beyondMinimum as $index) {
            $sample[$index] = 0;
        }
    }

    /**
     * Collects the vocabulary indexes of tokens whose frequency divided by the
     * number of samples is lower than minDF.
     *
     * @param int $samplesCount
     *
     * @return array
     */
    private function getBeyondMinimumIndexes(int $samplesCount)
    {
        $indexes = [];
        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) < $this->minDF) {
                // PHP converts numeric-string array keys (e.g. "1550") to int;
                // cast back so the strictly typed lookup does not throw.
                $indexes[] = $this->getTokenIndex((string) $token);
            }
        }

        return $indexes;
    }
}
189
|
|
|
|
Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.
For example, imagine you have a variable
$accountId
that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id
property of an instance of the Account
class. This class holds a proper account, so the id value must no longer be false. Either this assignment is in error or a type check should be added for that assignment.