Completed
Push — develop ( 00178c...c05ce8 )
by Arkadiusz
03:10
created

TokenCountVectorizer::checkDocumentFrequency()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 11
Code Lines 6

Duplication

Lines 0
Ratio 0 %
Metric Value
dl 0
loc 11
rs 9.4285
cc 3
eloc 6
nc 2
nop 1
1
<?php
2
3
declare (strict_types = 1);
4
5
namespace Phpml\FeatureExtraction;
6
7
use Phpml\Tokenization\Tokenizer;
8
9
class TokenCountVectorizer implements Vectorizer
10
{
11
    /**
12
     * @var Tokenizer
13
     */
14
    private $tokenizer;
15
16
    /**
17
     * @var float
18
     */
19
    private $minDF;
20
21
    /**
22
     * @var array
23
     */
24
    private $vocabulary;
25
26
    /**
27
     * @var array
28
     */
29
    private $frequencies;
30
31
    /**
32
     * @param Tokenizer $tokenizer
33
     * @param float     $minDF
34
     */
35
    public function __construct(Tokenizer $tokenizer, float $minDF = 0)
36
    {
37
        $this->tokenizer = $tokenizer;
38
        $this->minDF = $minDF;
0 ignored issues
show
Documentation Bug introduced by
It seems like $minDF can also be of type integer. However, the property $minDF is declared as type double. Maybe add an additional type check?

Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account_id instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
39
        $this->vocabulary = [];
40
        $this->frequencies = [];
41
    }
42
43
    /**
     * Vectorizes every sample and then applies the minimum-document-frequency filter.
     *
     * Each sample is replaced by the result of transformSample(); the original
     * array keys are kept.
     *
     * @param array $samples
     *
     * @return array
     */
    public function transform(array $samples): array
    {
        $vectors = [];
        foreach ($samples as $key => $text) {
            $vectors[$key] = $this->transformSample($text);
        }

        return $this->checkDocumentFrequency($vectors);
    }
58
59
    /**
     * Returns the vocabulary flipped so that it maps index => token.
     *
     * Internally the vocabulary is stored as token => index.
     *
     * @return array
     */
    public function getVocabulary()
    {
        $tokenToIndex = $this->vocabulary;

        return array_flip($tokenToIndex);
    }
66
67
    /**
     * Converts one sample into a map of vocabulary index => occurrence count.
     *
     * Tokens not seen before are registered in the vocabulary. The frequency
     * counter used by the minDF filter is incremented once per sample that
     * contains the token (not once per occurrence), so that dividing it by the
     * number of samples yields a document frequency.
     *
     * @param string $sample
     *
     * @return array
     */
    private function transformSample(string $sample)
    {
        $counts = [];
        $tokens = $this->tokenizer->tokenize($sample);
        foreach ($tokens as $token) {
            $index = $this->getTokenIndex($token);
            if (!isset($counts[$index])) {
                // First time this token shows up in the current sample:
                // this is the only moment the document frequency may grow.
                $this->updateFrequency($token);
                $counts[$index] = 0;
            }

            ++$counts[$index];
        }

        return $counts;
    }
88
89
    /**
     * Looks up the vocabulary index of a token, registering the token first
     * when it is not known yet.
     *
     * A new token receives the current vocabulary size as its index.
     *
     * @param string $token
     *
     * @return int
     */
    private function getTokenIndex(string $token)
    {
        if (isset($this->vocabulary[$token])) {
            return $this->vocabulary[$token];
        }

        $nextIndex = count($this->vocabulary);
        $this->vocabulary[$token] = $nextIndex;

        return $nextIndex;
    }
102
103
    /**
     * Increments the frequency counter of a token, starting from zero for a
     * token that has no counter yet.
     *
     * @param string $token
     */
    private function updateFrequency(string $token)
    {
        $this->frequencies[$token] = ($this->frequencies[$token] ?? 0) + 1;
    }
114
115
    /**
     * Drops from every sample the entries of tokens whose frequency ratio is
     * below the configured minimum.
     *
     * When minDF is not greater than zero the samples are returned untouched.
     *
     * @param array $samples
     *
     * @return array
     */
    private function checkDocumentFrequency(array $samples)
    {
        if (!($this->minDF > 0)) {
            return $samples;
        }

        $rareIndexes = $this->getBeyondMinimumIndexes(count($samples));

        $filtered = [];
        foreach ($samples as $key => $vector) {
            $filtered[$key] = $this->unsetBeyondMinimum($vector, $rareIndexes);
        }

        return $filtered;
    }
131
132
    /**
     * Removes the given indexes from a single sample vector.
     *
     * Indexes that are not present in the sample are ignored; the remaining
     * entries keep their keys.
     *
     * @param array $sample
     * @param array $beyondMinimum indexes to remove
     *
     * @return array
     */
    private function unsetBeyondMinimum(array $sample, array $beyondMinimum)
    {
        foreach ($beyondMinimum as $rareTokenIndex) {
            unset($sample[$rareTokenIndex]);
        }

        return $sample;
    }
146
147
    /**
     * Collects the vocabulary indexes of all tokens whose frequency divided by
     * the number of samples is below minDF.
     *
     * Returns an empty list when there are no samples, since no ratio can be
     * computed (and nothing could be filtered) in that case.
     *
     * @param int $samplesCount
     *
     * @return array
     */
    private function getBeyondMinimumIndexes(int $samplesCount)
    {
        // Guard against division by zero when transform() received no samples
        // but frequencies were already collected by an earlier call.
        if ($samplesCount <= 0) {
            return [];
        }

        $indexes = [];
        foreach ($this->frequencies as $token => $frequency) {
            if (($frequency / $samplesCount) < $this->minDF) {
                // PHP stores numeric-string array keys as integers; cast back so
                // the string-typed getTokenIndex() accepts it under strict_types.
                $indexes[] = $this->getTokenIndex((string) $token);
            }
        }

        return $indexes;
    }
163
}
164