Test Failed
Pull Request — master (#54)
by
unknown
02:33
created

DecisionStump::calculateErrorRate()   C

Complexity

Conditions 8
Paths 36

Size

Total Lines 38
Code Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 38
rs 5.3846
c 0
b 0
f 0
cc 8
eloc 23
nc 36
nop 3
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\Classification\Linear;
6
7
use Phpml\Helper\Predictable;
8
use Phpml\Helper\Trainable;
9
use Phpml\Helper\OneVsRest;
10
use Phpml\Classification\WeightedClassifier;
11
use Phpml\Classification\DecisionTree;
12
13
class DecisionStump extends WeightedClassifier
14
{
15
    use Trainable, Predictable, OneVsRest;
16
17
    const AUTO_SELECT = -1;
18
19
    /**
20
     * @var int
21
     */
22
    protected $givenColumnIndex;
23
24
    /**
25
     * @var array
26
     */
27
    protected $binaryLabels;
28
29
    /**
30
     * Sample weights : If used the optimization on the decision value
31
     * will take these weights into account. If not given, all samples
32
     * will be weighed with the same value of 1
33
     *
34
     * @var array
35
     */
36
    protected $weights = null;
37
38
    /**
39
     * Lowest error rate obtained while training/optimizing the model
40
     *
41
     * @var float
42
     */
43
    protected $trainingErrorRate;
44
45
    /**
46
     * @var int
47
     */
48
    protected $column;
49
50
    /**
51
     * @var mixed
52
     */
53
    protected $value;
54
55
    /**
56
     * @var string
57
     */
58
    protected $operator;
59
60
    /**
61
     * @var array
62
     */
63
    protected $columnTypes;
64
65
    /**
66
     * @var int
67
     */
68
    protected $featureCount;
69
70
    /**
71
     * @var float
72
     */
73
    protected $numSplitCount = 100.0;
74
75
    /**
76
     * Distribution of samples in the leaves
77
     *
78
     * @var array
79
     */
80
    protected $prob;
81
82
    /**
     * Creates a decision stump, i.e. a decision tree that is only one level deep.
     * Stumps are generally used as the weak classifier inside ensemble algorithms.
     *
     * When a column index is supplied, the stump tries to build its decision
     * node on that column. When self::AUTO_SELECT (-1) is supplied, the stump
     * decides by itself which column to use while it is being trained.
     *
     * @param int $columnIndex Column to split on, or self::AUTO_SELECT
     */
    public function __construct(int $columnIndex = self::AUTO_SELECT)
    {
        $this->givenColumnIndex = $columnIndex;
    }
96
97
    /**
     * Trains the stump on a two-class problem.
     *
     * The given samples and targets are appended to the ones already stored on
     * the instance. Every candidate column is then probed for its best split
     * and the split with the lowest error rate is stored on the stump
     * (value, operator, prob, column and trainingErrorRate).
     *
     * @param array $samples
     * @param array $targets
     *
     * @throws \InvalidArgumentException If there is no sample to train on
     * @throws \Exception                If the number of sample weights differs
     *                                   from the number of samples
     */
    protected function trainBinary(array $samples, array $targets)
    {
        $this->samples = array_merge($this->samples, $samples);
        $this->targets = array_merge($this->targets, $targets);

        $sampleCount = count($this->samples);
        if ($sampleCount === 0) {
            // Neither the feature count nor any split point can be derived
            // from an empty data set, so fail early with a clear message
            throw new \InvalidArgumentException("Cannot train a DecisionStump without samples");
        }

        $this->binaryLabels = array_keys(array_count_values($this->targets));
        $this->featureCount = count($this->samples[0]);

        // A given column index must point at an existing column; any other
        // value (too large or negative) falls back to automatic selection
        if ($this->givenColumnIndex < 0 || $this->givenColumnIndex > $this->featureCount - 1) {
            $this->givenColumnIndex = self::AUTO_SELECT;
        }

        // Check the size of the weights given.
        // If none given, then assign 1 as a weight to each sample
        if (!empty($this->weights)) {
            if (count($this->weights) !== $sampleCount) {
                throw new \Exception("Number of sample weights does not match with number of samples");
            }
        } else {
            $this->weights = array_fill(0, $sampleCount, 1);
        }

        // Determine type of each column as either "continuous" or "nominal"
        $this->columnTypes = DecisionTree::getColumnTypes($this->samples);

        // Probe either the single requested column or all columns
        $columns = $this->givenColumnIndex === self::AUTO_SELECT
            ? range(0, $this->featureCount - 1)
            : [$this->givenColumnIndex];

        // Baseline that any real split has to beat (error rate below 1.0)
        $bestSplit = [
            'value' => 0,
            'operator' => '',
            'prob' => [],
            'column' => 0,
            'trainingErrorRate' => 1.0,
        ];

        foreach ($columns as $col) {
            if ($this->columnTypes[$col] == DecisionTree::CONTINUOS) {
                $split = $this->getBestNumericalSplit($col);
            } else {
                $split = $this->getBestNominalSplit($col);
            }

            if ($split['trainingErrorRate'] < $bestSplit['trainingErrorRate']) {
                $bestSplit = $split;
            }
        }

        // Assign determined best values to the stump; the keys of the split
        // array match the property names of this class
        foreach ($bestSplit as $name => $value) {
            $this->{$name} = $value;
        }
    }
155
156
    /**
     * Sets the number of split points probed for numerical columns.
     *
     * While finding the best split point for a numerical valued column,
     * DecisionStump looks for equally distanced values between the minimum and
     * maximum values in the column. The given <i>$count</i> determines how many
     * split points are probed. The more split points, the better the split may
     * become, at the cost of a longer processing time (default value is 100.0).
     *
     * @param float $count Number of split points, must be greater than zero
     *
     * @throws \InvalidArgumentException If $count is not greater than zero
     */
    public function setNumericalSplitCount(float $count)
    {
        // The count is used as the divisor of the step size: zero would divide
        // by zero and a negative value would yield a step that never reaches
        // the maximum. The negated form also rejects NAN.
        if (!($count > 0)) {
            throw new \InvalidArgumentException("Numerical split count must be greater than zero");
        }

        $this->numSplitCount = $count;
    }
169
170
    /**
     * Determines the best split point for the given numerical column.
     *
     * Candidate thresholds are the mean of the column followed by equally
     * distanced values between the column's minimum and maximum. Each candidate
     * is tried with the operators '<=' and '>' and the combination with the
     * lowest error rate wins (the first one found wins on ties).
     *
     * @param int $col Index of the column to probe
     *
     * @return array Split description with the keys 'value', 'operator',
     *               'prob', 'column' and 'trainingErrorRate'
     */
    protected function getBestNumericalSplit(int $col)
    {
        $values = array_column($this->samples, $col);

        // Trying all possible points may be accomplished in two general ways:
        // 1- Try all values in the $samples array ($values)
        // 2- Artificially split the range of values into several parts and try them
        // We choose the second one because it is faster in larger datasets
        $minValue = min($values);
        $maxValue = max($values);
        $valueRange = $maxValue - $minValue;

        // A constant column has no range to walk through; a zero step would
        // keep the probing loop below from ever terminating
        $stepSize = ($valueRange > 0 && $this->numSplitCount > 0)
            ? $valueRange / $this->numSplitCount
            : 0.0;

        // The candidates do not depend on the operator, so build them once.
        // The average value is tried first, then the equally distanced points.
        $thresholds = [array_sum($values) / (float) count($values)];
        if ($stepSize > 0) {
            for ($step = $minValue; $step <= $maxValue; $step += $stepSize) {
                $thresholds[] = (float) $step;
            }
        } else {
            $thresholds[] = (float) $minValue;
        }

        $split = null;

        foreach (['<=', '>'] as $operator) {
            foreach ($thresholds as $threshold) {
                list($errorRate, $prob) = $this->calculateErrorRate($threshold, $operator, $values);

                if ($split === null || $errorRate < $split['trainingErrorRate']) {
                    $split = [
                        'value' => $threshold,
                        'operator' => $operator,
                        'prob' => $prob,
                        'column' => $col,
                        'trainingErrorRate' => $errorRate,
                    ];
                }
            }
        }

        return $split;
    }
215
216
    /**
     * Determines the best split for the given nominal (categorical) column.
     *
     * Every distinct value of the column is tried as the decision value with
     * the operators '=' and '!='. The combination with the lowest error rate
     * wins (the first one found wins on ties).
     *
     * @param int $col Index of the column to probe
     *
     * @return array Split description with the keys 'value', 'operator',
     *               'prob', 'column' and 'trainingErrorRate'
     */
    protected function getBestNominalSplit(int $col)
    {
        $values = array_column($this->samples, $col);
        $distinctVals = array_keys(array_count_values($values));

        $split = null;

        foreach (['=', '!='] as $operator) {
            foreach ($distinctVals as $val) {
                list($errorRate, $prob) = $this->calculateErrorRate($val, $operator, $values);

                // Keep the candidate only when it lowers the error rate;
                // the best split is the one with the fewest wrong predictions
                if ($split === null || $errorRate < $split['trainingErrorRate']) {
                    $split = [
                        'value' => $val,
                        'operator' => $operator,
                        'prob' => $prob,
                        'column' => $col,
                        'trainingErrorRate' => $errorRate,
                    ];
                }
            }
        }

        return $split;
    }
244
245
246
    /**
     * Applies a comparison operator, given as a string, to two operands.
     *
     * Supported operators are '>', '>=', '<', '<=', '=' and '!=' (alias '<>').
     * Equality and inequality are checked strictly, i.e. both value and type
     * have to match. Any other operator evaluates to false.
     *
     * @param mixed  $leftValue
     * @param string $operator
     * @param mixed  $rightValue
     *
     * @return bool
     */
    protected function evaluate($leftValue, $operator, $rightValue)
    {
        switch ($operator) {
            case '>':
                return $leftValue > $rightValue;

            case '>=':
                return $leftValue >= $rightValue;

            case '<':
                return $leftValue < $rightValue;

            case '<=':
                return $leftValue <= $rightValue;

            case '=':
                return $leftValue === $rightValue;

            case '!=':
            case '<>':
                return $leftValue !== $rightValue;

            default:
                // Unknown operators never match
                return false;
        }
    }
268
269
    /**
     * Calculates the weighted ratio of wrong predictions obtained when the
     * given threshold and operator are used as the decision rule.
     *
     * A sample for which the rule holds is predicted as the first binary
     * label, every other sample as the second one.
     *
     * @param mixed  $threshold Cut point for numerical columns or the category
     *                          value for nominal columns
     * @param string $operator  Comparison operator understood by evaluate()
     * @param array  $values    Column values, keyed like the stored targets
     *                          and weights
     *
     * @return array Two elements: the error rate (sum of the weights of the
     *               wrongly predicted samples divided by the sum of all
     *               weights) and, per label, the proportion of samples in the
     *               leaf predicting that label which actually carry it
     */
    protected function calculateErrorRate($threshold, string $operator, array $values)
    {
        // $threshold is intentionally not declared as float: under strict
        // types a float declaration rejects string categories and turns
        // integer categories into floats, which then never strictly equal
        // the integer sample values they are compared with
        $wrong = 0.0;
        $prob = [];
        $leftLabel = $this->binaryLabels[0];
        $rightLabel = $this->binaryLabels[1];

        foreach ($values as $index => $value) {
            $predicted = $this->evaluate($value, $operator, $threshold)
                ? $leftLabel
                : $rightLabel;

            $target = $this->targets[$index];
            if (strval($predicted) != strval($target)) {
                $wrong += $this->weights[$index];
            }

            // Count how often each actual label ends up in each leaf
            if (!isset($prob[$predicted][$target])) {
                $prob[$predicted][$target] = 0;
            }
            $prob[$predicted][$target]++;
        }

        // Calculate probabilities: Proportion of labels in each leaf
        $dist = array_combine($this->binaryLabels, array_fill(0, 2, 0.0));
        foreach ($prob as $leaf => $counts) {
            $leafTotal = (float) array_sum($counts);
            foreach ($counts as $label => $count) {
                if (strval($leaf) == strval($label)) {
                    $dist[$leaf] = $count / $leafTotal;
                }
            }
        }

        return [$wrong / (float) array_sum($this->weights), $dist];
    }
317
318
    /**
     * Returns the probability that the sample belongs to the given label.
     *
     * When the stump predicts the given label for the sample, the result is
     * the value stored for that label in the leaf distribution determined
     * during training; for any other label the probability is 0.0.
     *
     * @param array $sample
     * @param mixed $label
     *
     * @return float
     */
    protected function predictProbability(array $sample, $label)
    {
        $predictedLabel = $this->predictSampleBinary($sample);

        // Labels are compared by their string representation
        if (strval($predictedLabel) != strval($label)) {
            return 0.0;
        }

        return $this->prob[$label];
    }
338
339
    /**
     * Predicts the label of a single sample with the trained decision rule.
     *
     * The value of the sample in the decision column is compared with the
     * decision value using the trained operator. When the comparison holds
     * the first binary label is returned, otherwise the second one.
     *
     * @param array $sample
     *
     * @return mixed
     */
    protected function predictSampleBinary(array $sample)
    {
        $ruleHolds = $this->evaluate($sample[$this->column], $this->operator, $this->value);

        return $ruleHolds ? $this->binaryLabels[0] : $this->binaryLabels[1];
    }
352
353
    /**
     * Renders the decision rule of the stump as a readable
     * "IF column operator value THEN label ELSE label" sentence.
     *
     * @return string
     */
    public function __toString()
    {
        $condition = "IF {$this->column} {$this->operator} {$this->value} ";
        $whenTrue = "THEN " . $this->binaryLabels[0] . " ";
        $whenFalse = "ELSE " . $this->binaryLabels[1];

        return $condition . $whenTrue . $whenFalse;
    }
362
}
363