Completed
Push — master ( 95fc13...87396e )
by Arkadiusz
02:45
created

DecisionTree::preprocess()   B

Complexity

Conditions 5
Paths 3

Size

Total Lines 23
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 23
rs 8.5906
c 0
b 0
f 0
cc 5
eloc 13
nc 3
nop 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\Classification;
6
7
use Phpml\Helper\Predictable;
8
use Phpml\Helper\Trainable;
9
use Phpml\Math\Statistic\Mean;
10
use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
11
12
class DecisionTree implements Classifier
{
    use Trainable, Predictable;

    /**
     * Column type marker for features that are discretised around their median.
     * The historical misspelling is kept so existing references keep working.
     */
    const CONTINUOS = 1;

    /**
     * Column type marker for features whose raw values are used as categories.
     */
    const NOMINAL = 2;

    /**
     * Training samples, as passed to train().
     *
     * @var array
     */
    private $samples = [];

    /**
     * Type of each feature column (self::CONTINUOS or self::NOMINAL).
     *
     * @var array
     */
    private $columnTypes;

    /**
     * Distinct target values seen during training.
     *
     * @var array
     */
    private $labels = [];

    /**
     * Number of features, taken from the first training sample.
     *
     * @var int
     */
    private $featureCount = 0;

    /**
     * Root node of the trained tree, or null before training.
     *
     * @var DecisionTreeLeaf|null
     */
    private $tree = null;

    /**
     * Depth at which a node is forced to become terminal.
     *
     * @var int
     */
    private $maxDepth;

    /**
     * Deepest level reached while building the tree.
     *
     * @var int
     */
    public $actualDepth = 0;

    /**
     * @param int $maxDepth Depth at which splitting stops and nodes become terminal
     */
    public function __construct($maxDepth = 10)
    {
        $this->maxDepth = $maxDepth;
    }

    /**
     * Builds the tree from the given samples.
     *
     * Assumes $samples is a non-empty list keyed 0..n-1 (the record numbers are
     * generated with range()) and that $targets uses the same keys.
     *
     * @param array $samples
     * @param array $targets
     */
    public function train(array $samples, array $targets)
    {
        $this->featureCount = count($samples[0]);
        $this->columnTypes = $this->getColumnTypes($samples);
        $this->samples = $samples;
        $this->targets = $targets;
        $this->labels = array_keys(array_count_values($targets));
        $this->tree = $this->getSplitLeaf(range(0, count($samples) - 1));
    }

    /**
     * Classifies every feature column as nominal or continuous.
     *
     * @param array $samples
     *
     * @return array One self::NOMINAL / self::CONTINUOS entry per feature column
     */
    protected function getColumnTypes(array $samples)
    {
        $types = [];
        for ($i = 0; $i < $this->featureCount; $i++) {
            $values = array_column($samples, $i);
            $types[] = $this->isCategoricalColumn($values) ? self::NOMINAL : self::CONTINUOS;
        }

        return $types;
    }

    /**
     * Recursively builds the (sub)tree for the given record numbers.
     *
     * @param array $records Keys into the training samples handled by this node
     * @param int   $depth   Level of the node being built (0 for the root)
     *
     * @return DecisionTreeLeaf
     */
    protected function getSplitLeaf($records, $depth = 0)
    {
        $split = $this->getBestSplit($records);
        $split->level = $depth;
        if ($this->actualDepth < $depth) {
            $this->actualDepth = $depth;
        }

        $leftRecords = [];
        $rightRecords = [];
        $nodeTargets = [];
        $prevRecord = null;
        $allSame = true;
        foreach ($records as $recordNo) {
            $record = $this->samples[$recordNo];
            if ($prevRecord && $prevRecord != $record) {
                $allSame = false;
            }
            $prevRecord = $record;

            if ($split->evaluate($record)) {
                $leftRecords[] = $recordNo;
            } else {
                $rightRecords[] = $recordNo;
            }

            // Keep every target (not just the distinct ones) so that the
            // majority class of this node can be determined below.
            $nodeTargets[] = $this->targets[$recordNo];
        }

        $classCounts = array_count_values($nodeTargets);
        if (count($classCounts) === 1 || $allSame || $depth >= $this->maxDepth) {
            $split->isTerminal = true;
            // Most frequent class among the node's records wins.
            arsort($classCounts);
            $split->classValue = key($classCounts);
        } else {
            if (!empty($leftRecords)) {
                $split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
            }
            if (!empty($rightRecords)) {
                $split->rightLeaf = $this->getSplitLeaf($rightRecords, $depth + 1);
            }
        }

        return $split;
    }

    /**
     * Finds the column whose split yields the lowest Gini index for the records.
     *
     * For every column the value of the first record is used as the split value.
     *
     * @param array $records Keys into the training samples
     *
     * @return DecisionTreeLeaf|null Null only when there are no feature columns
     */
    protected function getBestSplit($records)
    {
        $recordKeys = array_flip($records);
        $targets = array_intersect_key($this->targets, $recordKeys);
        $samples = array_intersect_key($this->samples, $recordKeys);
        $samples = array_combine($records, $this->preprocess($samples));

        $bestGiniVal = 1;
        $bestSplit = null;
        for ($i = 0; $i < $this->featureCount; $i++) {
            $colValues = [];
            $baseValue = null;
            foreach ($samples as $index => $row) {
                $colValues[$index] = $row[$i];
                if ($baseValue === null) {
                    $baseValue = $row[$i];
                }
            }

            $gini = $this->getGiniIndex($baseValue, $colValues, $targets);
            if ($bestSplit === null || $bestGiniVal > $gini) {
                $split = new DecisionTreeLeaf();
                $split->value = $baseValue;
                $split->giniIndex = $gini;
                $split->columnIndex = $i;
                $split->records = $records;
                $bestSplit = $split;
                $bestGiniVal = $gini;
            }
        }

        return $bestSplit;
    }

    /**
     * Computes the weighted Gini index of splitting a column on "== $baseValue".
     *
     * @param mixed $baseValue Value that separates the two partitions
     * @param array $colValues Column values keyed by record number
     * @param array $targets   Targets keyed by the same record numbers
     *
     * @return float 0.0 for an empty column
     */
    public function getGiniIndex($baseValue, $colValues, $targets)
    {
        $total = count($colValues);
        if ($total === 0) {
            // Nothing to split: avoid dividing by zero below.
            return 0.0;
        }

        $countMatrix = [];
        foreach ($this->labels as $label) {
            $countMatrix[$label] = [0, 0];
        }
        foreach ($colValues as $index => $value) {
            $label = $targets[$index];
            // Loose comparison is kept on purpose; changing it would alter
            // which values fall on which side of the split.
            $rowIndex = $value == $baseValue ? 0 : 1;
            $countMatrix[$label][$rowIndex]++;
        }

        $giniParts = [0, 0];
        for ($i = 0; $i <= 1; $i++) {
            $part = 0;
            $sum = array_sum(array_column($countMatrix, $i));
            if ($sum > 0) {
                foreach ($this->labels as $label) {
                    $part += pow($countMatrix[$label][$i] / floatval($sum), 2);
                }
            }
            $giniParts[$i] = (1 - $part) * $sum;
        }

        return (float) (array_sum($giniParts) / $total);
    }

    /**
     * Converts continuous columns into two discrete values using the column
     * median as threshold and returns the samples as a re-indexed list of rows.
     *
     * @param array $samples
     *
     * @return array One row per sample, in the order of $samples
     */
    protected function preprocess(array $samples)
    {
        $columns = [];
        for ($i = 0; $i < $this->featureCount; $i++) {
            $values = array_column($samples, $i);
            if ($this->columnTypes[$i] == self::CONTINUOS) {
                $median = Mean::median($values);
                foreach ($values as $key => $value) {
                    $values[$key] = $value <= $median ? "<= $median" : "> $median";
                }
            }
            $columns[] = $values;
        }

        // Transpose the columns back into rows. This is done explicitly because
        // array_map(null, ...$columns) returns the column itself (not rows)
        // when there is only one column and fails when there is none.
        $rows = [];
        $rowCount = count($samples);
        for ($rowIndex = 0; $rowIndex < $rowCount; $rowIndex++) {
            $row = [];
            foreach ($columns as $column) {
                $row[] = $column[$rowIndex] ?? null;
            }
            $rows[] = $row;
        }

        return $rows;
    }

    /**
     * Guesses whether a column holds a discrete set of values.
     *
     * @param array $columnValues
     *
     * @return bool
     */
    protected function isCategoricalColumn(array $columnValues)
    {
        $count = count($columnValues);

        // There are two main indicators that *may* show whether a
        // column is composed of a discrete set of values:
        // 1- The column contains non-numeric values
        // 2- The number of unique values in the column is only a small fraction
        //    of all values in that column (lower than or equal to 20%)
        $numericValues = array_filter($columnValues, 'is_numeric');
        if (count($numericValues) != $count) {
            return true;
        }

        // array_unique() is used instead of array_count_values() because the
        // latter only counts integers and strings and would ignore floats.
        $distinctValues = array_unique($columnValues);

        return count($distinctValues) <= $count / 5;
    }

    /**
     * Returns the string representation of the trained tree.
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->tree->__toString();
    }

    /**
     * Walks the tree for a single sample and returns the class of the leaf reached.
     *
     * @param array $sample
     *
     * @return mixed Null when no terminal node can be reached for the sample
     */
    protected function predictSample(array $sample)
    {
        $node = $this->tree;
        while ($node && !$node->isTerminal) {
            $node = $node->evaluate($sample) ? $node->leftLeaf : $node->rightLeaf;
        }

        return $node ? $node->classValue : null;
    }
}
275