DecisionTree - Code Metrics - Inspection of "Update easy coding standard to ^5.1" - php-ai/php-ml - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#317)

by Marcin

created 2018-10-15 19:47 UTC

DecisionTree F

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	466
Duplicated Lines	0 %

Coupling/Cohesion

Components	1
Dependencies	5

Importance

Changes

Metric	Value
wmc	68
lcom	1
cbo	5
dl	0
loc	466
rs	2.96
c	0
b	0
f	0

16 Methods

Rating	Name	Size	Complexity
C	getSplitLeaf()	59	12
B	getBestSplit()	43	6
A	getSelectedFeatures()	22	5
A	preprocess()	25	5
A	isCategoricalColumn()	23	3
A	setSelectedFeatures()	4	1
A	getSplitNodesByColumn()	23	5
A	predictSample()	17	5
A	__construct()	4	1
A	train()	27	4
A	getColumnTypes()	12	3
B	getGiniIndex()	28	7
A	setNumFeatures()	10	2
A	setColumnNames()	10	3
A	getHtml()	4	1
A	getFeatureImportances()	30	5

How to fix Complexity

<?php

declare(strict_types=1);

namespace Phpml\Classification;

use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
use Phpml\Exception\InvalidArgumentException;
use Phpml\Helper\Predictable;
use Phpml\Helper\Trainable;
use Phpml\Math\Statistic\Mean;

class DecisionTree implements Classifier
{
    use Trainable;
    use Predictable;

    /**
     * Column type marker: getColumnTypes() assigns it to every column that
     * isCategoricalColumn() does not report as categorical.
     */
    public const CONTINUOUS = 1;

    /**
     * Column type marker: getColumnTypes() assigns it to every column that
     * isCategoricalColumn() reports as categorical.
     */
    public const NOMINAL = 2;

    /**
     * Deepest level reached by getSplitLeaf() so far. It is only ever raised;
     * train() does not reset it.
     *
     * @var int
     */
    public $actualDepth = 0;

    /**
     * One entry per column (self::CONTINUOUS or self::NOMINAL), filled by
     * train() through getColumnTypes().
     *
     * @var array
     */
    protected $columnTypes = [];

    /**
     * Root node of the tree; assigned by train() and unset before that.
     *
     * @var DecisionTreeLeaf
     */
    protected $tree;

    /**
     * Depth at which getSplitLeaf() stops splitting and makes the node terminal.
     *
     * @var int
     */
    protected $maxDepth;

    /**
     * Distinct target values of the training data. predictSample() falls back
     * to the first entry when it walks off the tree.
     *
     * @var array
     */
    private $labels = [];

    /**
     * Number of columns, taken from the first training sample.
     *
     * @var int
     */
    private $featureCount = 0;

    /**
     * Upper bound on the number of randomly picked columns per split, set by
     * setNumFeatures(). Zero means every column is considered.
     *
     * @var int
     */
    private $numUsableFeatures = 0;

    /**
     * Explicit column indexes to consider for splits. When non-empty it takes
     * precedence over $numUsableFeatures in getSelectedFeatures().
     *
     * @var array
     */
    private $selectedFeatures = [];

    /**
     * Cache for getFeatureImportances(); null means it has to be recomputed
     * (train() resets it to null).
     *
     * @var array|null
     */
    private $featureImportances;

    /**
     * Display names of the columns. train() fills it with numeric indexes when
     * empty, and truncates or pads it to match the feature count otherwise.
     *
     * @var array
     */
    private $columnNames = [];

    /**
     * Creates an untrained classifier.
     *
     * @param int $maxDepth Depth at which getSplitLeaf() stops splitting and marks the node as terminal
     */
    public function __construct(int $maxDepth = 10)
    {
        $this->maxDepth = $maxDepth;
    }

    /**
     * Adds the given data to the data already held and rebuilds the whole tree.
     *
     * Samples and targets are appended to those from earlier calls, so every
     * call trains on the accumulated data set. Column types, labels and the
     * tree are recomputed, and cached feature importances are discarded.
     *
     * @param array $samples Rows of feature values; the first row defines the feature count
     * @param array $targets Class value of each row, in the same order as $samples
     */
    public function train(array $samples, array $targets): void
    {
        $this->samples = array_merge($this->samples, $samples);
        $this->targets = array_merge($this->targets, $targets);

        $firstSample = $this->samples[0];
        $this->featureCount = count($firstSample);
        $this->columnTypes = self::getColumnTypes($this->samples);

        $targetFrequencies = array_count_values($this->targets);
        $this->labels = array_keys($targetFrequencies);

        $lastRecordNo = count($this->samples) - 1;
        $this->tree = $this->getSplitLeaf(range(0, $lastRecordNo));

        // The importances depend on the tree that was just rebuilt, so the
        // cached values are dropped and recomputed on demand
        $this->featureImportances = null;

        // Names supplied earlier are kept; they are only generated, trimmed
        // or padded so that there is exactly one name per feature
        if ($this->columnNames === []) {
            $this->columnNames = range(0, $this->featureCount - 1);

            return;
        }

        $nameCount = count($this->columnNames);
        if ($nameCount > $this->featureCount) {
            $this->columnNames = array_slice($this->columnNames, 0, $this->featureCount);

            return;
        }

        if ($nameCount < $this->featureCount) {
            $this->columnNames = array_merge(
                $this->columnNames,
                range($nameCount, $this->featureCount - 1)
            );
        }
    }

    /**
     * Classifies every column of the sample matrix as NOMINAL or CONTINUOUS.
     *
     * The number of columns is taken from the first sample; each column is
     * extracted by its numeric index and handed to isCategoricalColumn().
     *
     * @param array $samples Rows of feature values
     *
     * @return array List with one type constant per column
     */
    public static function getColumnTypes(array $samples): array
    {
        $columnCount = count($samples[0]);
        if ($columnCount === 0) {
            return [];
        }

        return array_map(
            static function (int $column) use ($samples): int {
                $columnValues = array_column($samples, $column);

                return self::isCategoricalColumn($columnValues) ? self::NOMINAL : self::CONTINUOUS;
            },
            range(0, $columnCount - 1)
        );
    }

    /**
     * Computes the weighted Gini impurity of splitting a column into
     * "identical to $baseValue" and "everything else".
     *
     * For each side the impurity is 1 minus the sum of squared label shares;
     * the two sides are weighted by their sizes and the total is divided by
     * the number of values.
     *
     * @param mixed $baseValue Value compared (with ===) against each column value
     * @param array $colValues Column values, keyed like $targets
     * @param array $targets   Label of each value, looked up by the key of $colValues
     */
    public function getGiniIndex($baseValue, array $colValues, array $targets): float
    {
        // Per label: [count on the matching side, count on the other side]
        $tally = array_fill_keys($this->labels, [0, 0]);

        foreach ($colValues as $key => $value) {
            $side = $value === $baseValue ? 0 : 1;
            ++$tally[$targets[$key]][$side];
        }

        $weightedImpurity = 0;
        foreach ([0, 1] as $side) {
            $sideTotal = array_sum(array_column($tally, $side));

            $squaredShares = 0;
            if ($sideTotal > 0) {
                foreach ($this->labels as $label) {
                    $squaredShares += ($tally[$label][$side] / (float) $sideTotal) ** 2;
                }
            }

            $weightedImpurity += (1 - $squaredShares) * $sideTotal;
        }

        return $weightedImpurity / count($colValues);
    }

    /**
     * Limits how many columns are considered when a split is chosen.
     *
     * Zero (the default) means all columns are used. Any positive value is
     * the maximum number of columns drawn at random for each split.
     *
     * @return $this
     *
     * @throws InvalidArgumentException when the given number is negative
     */
    public function setNumFeatures(int $numFeatures)
    {
        if ($numFeatures >= 0) {
            $this->numUsableFeatures = $numFeatures;

            return $this;
        }

        throw new InvalidArgumentException('Selected column count should be greater or equal to zero');
    }

    /**
     * Sets the names used for the columns in the HTML output and as keys of
     * the feature importances.
     *
     * Once the feature count is known (non-zero), the number of names has to
     * match it exactly; before that any length is accepted.
     *
     * @return $this
     *
     * @throws InvalidArgumentException when the number of names differs from a known feature count
     */
    public function setColumnNames(array $names)
    {
        $expectedCount = $this->featureCount;
        $lengthMismatch = $expectedCount !== 0 && count($names) !== $expectedCount;

        if ($lengthMismatch) {
            throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $expectedCount));
        }

        $this->columnNames = $names;

        return $this;
    }

    /**
     * Returns the HTML representation of the tree by delegating to the root
     * node's getHTML() with the current column names.
     *
     * NOTE(review): $this->tree is only assigned in train(), so this
     * presumably fails on an untrained instance - confirm.
     */
    public function getHtml(): string
    {
        return $this->tree->getHTML($this->columnNames);
    }

    /**
     * Returns one importance value per column, keyed by column name.
     *
     * A column's importance is the sum of the impurity decreases of all
     * internal nodes that split on it. When the grand total is positive the
     * values are scaled so that they add up to 1 and sorted in descending
     * order; otherwise the raw values are returned. The result is cached
     * until the next call to train().
     */
    public function getFeatureImportances(): array
    {
        if ($this->featureImportances !== null) {
            return $this->featureImportances;
        }

        $totalSamples = count($this->samples);
        $this->featureImportances = [];
        foreach ($this->columnNames as $columnIndex => $label) {
            $score = 0;
            foreach ($this->getSplitNodesByColumn($columnIndex, $this->tree) as $splitNode) {
                $score += $splitNode->getNodeImpurityDecrease($totalSamples);
            }

            $this->featureImportances[$label] = $score;
        }

        // Nothing to normalise or sort unless the total is positive
        $scoreSum = array_sum($this->featureImportances);
        if (!($scoreSum > 0)) {
            return $this->featureImportances;
        }

        foreach ($this->featureImportances as $label => $score) {
            $this->featureImportances[$label] = $score / $scoreSum;
        }

        arsort($this->featureImportances);

        return $this->featureImportances;
    }

    /**
     * Builds the node for the given records and, unless a stop condition is
     * met, recursively builds its children.
     *
     * The node becomes terminal when all records are equal, when the maximum
     * depth is reached or when only one class is left; its class value is
     * then the most frequent class among the records.
     *
     * @param array $records Record numbers (indexes into the training samples) that reach this node
     * @param int   $depth   Level of the node, zero for the root
     */
    protected function getSplitLeaf(array $records, int $depth = 0): DecisionTreeLeaf
    {
        $leaf = $this->getBestSplit($records);
        $leaf->level = $depth;
        $this->actualDepth = max($this->actualDepth, $depth);

        // A single pass distributes the records over both sides of the split,
        // counts the classes and detects whether all records are equal
        $leftSide = [];
        $rightSide = [];
        $classCounts = [];
        $previous = null;
        $identical = true;

        foreach ($records as $recordNo) {
            $sample = $this->samples[$recordNo];

            // Comparing neighbours is sufficient to find out whether every
            // record is the same
            if ($previous && $previous != $sample) {
                $identical = false;
            }

            $previous = $sample;

            // The split criterion decides on which side the record continues
            if ($leaf->evaluate($sample)) {
                $leftSide[] = $recordNo;
            } else {
                $rightSide[] = $recordNo;
            }

            $class = $this->targets[$recordNo];
            $classCounts[$class] = isset($classCounts[$class]) ? $classCounts[$class] + 1 : 1;
        }

        $mustStop = $identical || $depth >= $this->maxDepth || count($classCounts) === 1;
        if (!$mustStop) {
            if ($leftSide !== []) {
                $leaf->leftLeaf = $this->getSplitLeaf($leftSide, $depth + 1);
            }

            if ($rightSide !== []) {
                $leaf->rightLeaf = $this->getSplitLeaf($rightSide, $depth + 1);
            }

            return $leaf;
        }

        // Terminal node: classify it with the majority class
        $leaf->isTerminal = true;
        arsort($classCounts);
        $leaf->classValue = key($classCounts);

        return $leaf;
    }

    /**
     * Picks the column whose most frequent (preprocessed) value yields the
     * lowest Gini index for the given records and returns it as a new node.
     *
     * Only the columns returned by getSelectedFeatures() are examined. For
     * continuous columns the operator and the numeric threshold are parsed
     * out of the preprocessed value and stored on the node as well.
     *
     * @param array $records Record numbers (indexes into the training samples)
     */
    protected function getBestSplit(array $records): DecisionTreeLeaf
    {
        $recordLookup = array_flip($records);
        $targets = array_intersect_key($this->targets, $recordLookup);
        $rawSamples = array_intersect_key($this->samples, $recordLookup);
        $rows = array_combine($records, $this->preprocess($rawSamples));

        $lowestGini = 1;
        $best = null;
        foreach ($this->getSelectedFeatures() as $column) {
            $columnValues = [];
            foreach ($rows as $recordNo => $row) {
                $columnValues[$recordNo] = $row[$column];
            }

            // The most frequent value of the column is the split candidate
            $frequencies = array_count_values($columnValues);
            arsort($frequencies);
            $mostCommon = key($frequencies);
            $gini = $this->getGiniIndex($mostCommon, $columnValues, $targets);

            // Keep the current best unless this column is strictly better
            if ($best !== null && !($lowestGini > $gini)) {
                continue;
            }

            $isContinuous = $this->columnTypes[$column] === self::CONTINUOUS;

            $candidate = new DecisionTreeLeaf();
            $candidate->value = $mostCommon;
            $candidate->giniIndex = $gini;
            $candidate->columnIndex = $column;
            $candidate->isContinuous = $isContinuous;
            $candidate->records = $records;

            // Continuous values look like "<= 1.5" after preprocessing; the
            // operator and the number are kept separately for later use
            if ($isContinuous) {
                $matches = [];
                preg_match("/^([<>=]{1,2})\s*(.*)/", (string) $candidate->value, $matches);
                $candidate->operator = $matches[1];
                $candidate->numericValue = (float) $matches[2];
            }

            $best = $candidate;
            $lowestGini = $gini;
        }

        return $best;
    }

    /**
     * Returns the column indexes that may be used for the next split.
     *
     * Columns fixed with setSelectedFeatures() are returned as they are.
     * Otherwise all columns are returned when no limit was set with
     * setNumFeatures(), or a random selection of at most that many columns,
     * sorted in ascending order.
     */
    protected function getSelectedFeatures(): array
    {
        if (!empty($this->selectedFeatures)) {
            return $this->selectedFeatures;
        }

        $everyFeature = range(0, $this->featureCount - 1);
        if ($this->numUsableFeatures === 0) {
            return $everyFeature;
        }

        // Never ask for more columns than there are
        $limit = min($this->numUsableFeatures, $this->featureCount);

        shuffle($everyFeature);
        $picked = array_slice($everyFeature, 0, $limit);
        sort($picked);

        return $picked;
    }

    /**
     * Discretises the continuous columns and returns the samples row by row.
     *
     * Every column marked as CONTINUOUS in $this->columnTypes is replaced by
     * the label "<= {median}" or "> {median}", where the median is taken over
     * the given samples; all other columns are passed through unchanged.
     * The returned rows are indexed from zero in the order of the input.
     *
     * @param array $samples Rows to convert; columns are read by numeric index
     *
     * @return array List of rows, each a list of (possibly discretised) column values
     */
    protected function preprocess(array $samples): array
    {
        $columns = [];
        for ($i = 0; $i < $this->featureCount; ++$i) {
            $values = array_column($samples, $i);
            if ($this->columnTypes[$i] === self::CONTINUOUS) {
                $median = Mean::median($values);
                foreach ($values as &$value) {
                    $value = $value <= $median ? "<= {$median}" : "> {$median}";
                }

                // Break the reference so that it cannot outlive the loop and
                // alias the last element of this column
                unset($value);
            }

            $columns[] = $values;
        }

        // array_map(null, $array) with a single array returns that array
        // unchanged instead of zipping it, so a lone column has to be wrapped
        // into one-element rows explicitly
        if (count($columns) === 1) {
            return array_map(
                static function ($value): array {
                    return [$value];
                },
                $columns[0]
            );
        }

        // Zipping the columns with a null callback transposes them into rows
        return array_map(null, ...$columns);
    }

    /**
     * Guesses whether a column holds categorical (discrete) values.
     *
     * The decision follows three rules, in this order:
     * 1- Any float value makes the column continuous
     * 2- Any non-numeric value makes the column categorical
     * 3- An all-numeric column is categorical only when the number of
     *    distinct values is at most one fifth of the number of values
     */
    protected static function isCategoricalColumn(array $columnValues): bool
    {
        $total = count($columnValues);

        $numericTotal = 0;
        foreach ($columnValues as $value) {
            if (is_float($value)) {
                return false;
            }

            if (is_numeric($value)) {
                ++$numericTotal;
            }
        }

        if ($numericTotal !== $total) {
            return true;
        }

        $distinctTotal = count(array_count_values($columnValues));

        return $distinctTotal <= $total / 5;
    }

    /**
     * Fixes the set of columns that are considered when a split is chosen.
     *
     * A non-empty list is returned as is by getSelectedFeatures() and takes
     * precedence over the limit set with setNumFeatures(); an empty list
     * removes the restriction again.
     *
     * @param array $selectedFeatures Column indexes to consider
     */
    protected function setSelectedFeatures(array $selectedFeatures): void
    {
        $this->selectedFeatures = $selectedFeatures;
    }

    /**
     * Gathers all internal (non-terminal) nodes below and including the given
     * node that split on the given column.
     *
     * The node itself comes first, followed by the matches of the left and
     * then of the right subtree.
     *
     * @param int              $column Column index to look for
     * @param DecisionTreeLeaf $node   Root of the subtree to search
     */
    protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node): array
    {
        if ($node->isTerminal) {
            return [];
        }

        $found = $node->columnIndex === $column ? [$node] : [];

        foreach ([$node->leftLeaf, $node->rightLeaf] as $child) {
            if ($child !== null) {
                $found = array_merge($found, $this->getSplitNodesByColumn($column, $child));
            }
        }

        return $found;
    }

    /**
     * Walks the tree from the root and returns the class value of the
     * terminal node the sample ends up in.
     *
     * At every internal node the sample follows the left child when the
     * node's evaluate() accepts it and the right child otherwise. If that
     * child does not exist, the first known label is returned instead.
     *
     * @return mixed
     */
    protected function predictSample(array $sample)
    {
        $current = $this->tree;
        while (true) {
            if ($current->isTerminal) {
                break;
            }

            $current = $current->evaluate($sample) ? $current->leftLeaf : $current->rightLeaf;

            // Walked off the tree: there is no child on the chosen side
            if (!$current) {
                break;
            }
        }

        return $current === null ? $this->labels[0] : $current->classValue;
    }
}


1			<?php
2
3			declare(strict_types=1);
4
5			namespace Phpml\Classification;
6
7			use Phpml\Classification\DecisionTree\DecisionTreeLeaf;
8			use Phpml\Exception\InvalidArgumentException;
9			use Phpml\Helper\Predictable;
10			use Phpml\Helper\Trainable;
11			use Phpml\Math\Statistic\Mean;
12
13			class DecisionTree implements Classifier
14			{
15			use Trainable;
16			use Predictable;
17
18			public const CONTINUOUS = 1;
19
20			public const NOMINAL = 2;
21
22			/**
23			* @var int
24			*/
25			public $actualDepth = 0;
26
27			/**
28			* @var array
29			*/
30			protected $columnTypes = [];
31
32			/**
33			* @var DecisionTreeLeaf
34			*/
35			protected $tree;
36
37			/**
38			* @var int
39			*/
40			protected $maxDepth;
41
42			/**
43			* @var array
44			*/
45			private $labels = [];
46
47			/**
48			* @var int
49			*/
50			private $featureCount = 0;
51
52			/**
53			* @var int
54			*/
55			private $numUsableFeatures = 0;
56
57			/**
58			* @var array
59			*/
60			private $selectedFeatures = [];
61
62			/**
63			* @var array|null
64			*/
65			private $featureImportances;
66
67			/**
68			* @var array
69			*/
70			private $columnNames = [];
71
72			public function __construct(int $maxDepth = 10)
73			{
74			$this->maxDepth = $maxDepth;
75			}
76
77			public function train(array $samples, array $targets): void
78			{
79			$this->samples = array_merge($this->samples, $samples);
80			$this->targets = array_merge($this->targets, $targets);
81
82			$this->featureCount = count($this->samples[0]);
83			$this->columnTypes = self::getColumnTypes($this->samples);
84			$this->labels = array_keys(array_count_values($this->targets));
85			$this->tree = $this->getSplitLeaf(range(0, count($this->samples) - 1));
86
87			// Each time the tree is trained, feature importances are reset so that
88			// we will have to compute it again depending on the new data
89			$this->featureImportances = null;
90
91			// If column names are given or computed before, then there is no
92			// need to init it and accidentally remove the previous given names
93			if ($this->columnNames === []) {
94			$this->columnNames = range(0, $this->featureCount - 1);
95			} elseif (count($this->columnNames) > $this->featureCount) {
96			$this->columnNames = array_slice($this->columnNames, 0, $this->featureCount);
97			} elseif (count($this->columnNames) < $this->featureCount) {
98			$this->columnNames = array_merge(
99			$this->columnNames,
100			range(count($this->columnNames), $this->featureCount - 1)
101			);
102			}
103			}
104
105			public static function getColumnTypes(array $samples): array
106			{
107			$types = [];
108			$featureCount = count($samples[0]);
109			for ($i = 0; $i < $featureCount; ++$i) {
110			$values = array_column($samples, $i);
111			$isCategorical = self::isCategoricalColumn($values);
112			$types[] = $isCategorical ? self::NOMINAL : self::CONTINUOUS;
113			}
114
115			return $types;
116			}
117
118			/**
119			* @param mixed $baseValue
120			*/
121			public function getGiniIndex($baseValue, array $colValues, array $targets): float
122			{
123			$countMatrix = [];
124			foreach ($this->labels as $label) {
125			$countMatrix[$label] = [0, 0];
126			}
127
128			foreach ($colValues as $index => $value) {
129			$label = $targets[$index];
130			$rowIndex = $value === $baseValue ? 0 : 1;
131			++$countMatrix[$label][$rowIndex];
132			}
133
134			$giniParts = [0, 0];
135			for ($i = 0; $i <= 1; ++$i) {
136			$part = 0;
137			$sum = array_sum(array_column($countMatrix, $i));
138			if ($sum > 0) {
139			foreach ($this->labels as $label) {
140			$part += pow($countMatrix[$label][$i] / (float) $sum, 2);
141			}
142			}
143
144			$giniParts[$i] = (1 - $part) * $sum;
145			}
146
147			return array_sum($giniParts) / count($colValues);
148			}
149
150			/**
151			* This method is used to set number of columns to be used
152			* when deciding a split at an internal node of the tree. <br>
153			* If the value is given 0, then all features are used (default behaviour),
154			* otherwise the given value will be used as a maximum for number of columns
155			* randomly selected for each split operation.
156			*
157			* @return $this
158			*
159			* @throws InvalidArgumentException
160			*/
161			public function setNumFeatures(int $numFeatures)
162			{
163			if ($numFeatures < 0) {
164			throw new InvalidArgumentException('Selected column count should be greater or equal to zero');
165			}
166
167			$this->numUsableFeatures = $numFeatures;
168
169			return $this;
170			}
171
172			/**
173			* A string array to represent columns. Useful when HTML output or
174			* column importances are desired to be inspected.
175			*
176			* @return $this
177			*
178			* @throws InvalidArgumentException
179			*/
180			public function setColumnNames(array $names)
181			{
182			if ($this->featureCount !== 0 && count($names) !== $this->featureCount) {
183			throw new InvalidArgumentException(sprintf('Length of the given array should be equal to feature count %s', $this->featureCount));
184			}
185
186			$this->columnNames = $names;
187
188			return $this;
189			}
190
191			public function getHtml(): string
192			{
193			return $this->tree->getHTML($this->columnNames);
194			}
195
196			/**
197			* This will return an array including an importance value for
198			* each column in the given dataset. The importance values are
199			* normalized and their total makes 1.<br/>
200			*/
201			public function getFeatureImportances(): array
202			{
203			if ($this->featureImportances !== null) {
204			return $this->featureImportances;
205			}
206
207			$sampleCount = count($this->samples);
208			$this->featureImportances = [];
209			foreach ($this->columnNames as $column => $columnName) {
210			$nodes = $this->getSplitNodesByColumn($column, $this->tree);
211
212			$importance = 0;
213			foreach ($nodes as $node) {
214			$importance += $node->getNodeImpurityDecrease($sampleCount);
215			}
216
217			$this->featureImportances[$columnName] = $importance;
218			}
219
220			// Normalize & sort the importances
221			$total = array_sum($this->featureImportances);
222			if ($total > 0) {
223			array_walk($this->featureImportances, function (&$importance) use ($total): void {
224			$importance /= $total;
225			});
226			arsort($this->featureImportances);
227			}
228
229			return $this->featureImportances;
230			}
231
232			protected function getSplitLeaf(array $records, int $depth = 0): DecisionTreeLeaf
233			{
234			$split = $this->getBestSplit($records);
235			$split->level = $depth;
236			if ($this->actualDepth < $depth) {
237			$this->actualDepth = $depth;
238			}
239
240			// Traverse all records to see if all records belong to the same class,
241			// otherwise group the records so that we can classify the leaf
242			// in case maximum depth is reached
243			$leftRecords = [];
244			$rightRecords = [];
245			$remainingTargets = [];
246			$prevRecord = null;
247			$allSame = true;
248
249			foreach ($records as $recordNo) {
250			// Check if the previous record is the same with the current one
251			$record = $this->samples[$recordNo];
252			if ($prevRecord && $prevRecord != $record) {
253			$allSame = false;
254			}
255
256			$prevRecord = $record;
257
258			// According to the split criteron, this record will
259			// belong to either left or the right side in the next split
260			if ($split->evaluate($record)) {
261			$leftRecords[] = $recordNo;
262			} else {
263			$rightRecords[] = $recordNo;
264			}
265
266			// Group remaining targets
267			$target = $this->targets[$recordNo];
268			if (!array_key_exists($target, $remainingTargets)) {
269			$remainingTargets[$target] = 1;
270			} else {
271			++$remainingTargets[$target];
272			}
273			}
274
275			if ($allSame || $depth >= $this->maxDepth || count($remainingTargets) === 1) {
276			$split->isTerminal = true;
277			arsort($remainingTargets);
278			$split->classValue = key($remainingTargets);
279			} else {
280			if (!empty($leftRecords)) {
281			$split->leftLeaf = $this->getSplitLeaf($leftRecords, $depth + 1);
282			}
283
284			if (!empty($rightRecords)) {
285			$split->rightLeaf = $this->getSplitLeaf($rightRecords, $depth + 1);
286			}
287			}
288
289			return $split;
290			}
291
292			protected function getBestSplit(array $records): DecisionTreeLeaf
293			{
294			$targets = array_intersect_key($this->targets, array_flip($records));
295			$samples = array_intersect_key($this->samples, array_flip($records));
296			$samples = array_combine($records, $this->preprocess($samples));
297			$bestGiniVal = 1;
298			$bestSplit = null;
299			$features = $this->getSelectedFeatures();
300			foreach ($features as $i) {
301			$colValues = [];
302			foreach ($samples as $index => $row) {
303			$colValues[$index] = $row[$i];
304			}
305
306			$counts = array_count_values($colValues);
307			arsort($counts);
308			$baseValue = key($counts);
309			$gini = $this->getGiniIndex($baseValue, $colValues, $targets);
310			if ($bestSplit === null || $bestGiniVal > $gini) {
311			$split = new DecisionTreeLeaf();
312			$split->value = $baseValue;
313			$split->giniIndex = $gini;
314			$split->columnIndex = $i;
315			$split->isContinuous = $this->columnTypes[$i] === self::CONTINUOUS;
316			$split->records = $records;
317
318			// If a numeric column is to be selected, then
319			// the original numeric value and the selected operator
320			// will also be saved into the leaf for future access
321			if ($this->columnTypes[$i] === self::CONTINUOUS) {
322			$matches = [];
323			preg_match("/^([<>=]{1,2})\s*(.*)/", (string) $split->value, $matches);
324			$split->operator = $matches[1];
325			$split->numericValue = (float) $matches[2];
326			}
327
328			$bestSplit = $split;
329			$bestGiniVal = $gini;
330			}
331			}
332
333			return $bestSplit;
334			}
335
336			/**
337			* Returns available features/columns to the tree for the decision making
338			* process. <br>
339			*
340			* If a number is given with setNumFeatures() method, then a random selection
341			* of features up to this number is returned. <br>
342			*
343			* If some features are manually selected by use of setSelectedFeatures(),
344			* then only these features are returned <br>
345			*
346			* If any of above methods were not called beforehand, then all features
347			* are returned by default.
348			*/
349			protected function getSelectedFeatures(): array
350			{
351			$allFeatures = range(0, $this->featureCount - 1);
352			if ($this->numUsableFeatures === 0 && empty($this->selectedFeatures)) {
353			return $allFeatures;
354			}
355
356			if (!empty($this->selectedFeatures)) {
357			return $this->selectedFeatures;
358			}
359
360			$numFeatures = $this->numUsableFeatures;
361			if ($numFeatures > $this->featureCount) {
362			$numFeatures = $this->featureCount;
363			}
364
365			shuffle($allFeatures);
366			$selectedFeatures = array_slice($allFeatures, 0, $numFeatures);
367			sort($selectedFeatures);
368
369			return $selectedFeatures;
370			}
371
372			protected function preprocess(array $samples): array
373			{
374			// Detect and convert continuous data column values into
375			// discrete values by using the median as a threshold value
376			$columns = [];
377			for ($i = 0; $i < $this->featureCount; ++$i) {
378			$values = array_column($samples, $i);
379			if ($this->columnTypes[$i] == self::CONTINUOUS) {
380			$median = Mean::median($values);
381			foreach ($values as &$value) {
382			if ($value <= $median) {
383			$value = "<= ${median}";
384			} else {
385			$value = "> ${median}";
386			}
387			}
388			}
389
390			$columns[] = $values;
391			}
392
393			// Below method is a strange yet very simple & efficient method
394			// to get the transpose of a 2D array
395			return array_map(null, ...$columns);
396			}
397
398			protected static function isCategoricalColumn(array $columnValues): bool
399			{
400			$count = count($columnValues);
401
402			// There are two main indicators that may show whether a
403			// column is composed of discrete set of values:
404			// 1- Column may contain string values and non-float values
405			// 2- Number of unique values in the column is only a small fraction of
406			// all values in that column (Lower than or equal to %20 of all values)
407			$numericValues = array_filter($columnValues, 'is_numeric');
408			$floatValues = array_filter($columnValues, 'is_float');
409			if (!empty($floatValues)) {
410			return false;
411			}
412
413			if (count($numericValues) !== $count) {
414			return true;
415			}
416
417			$distinctValues = array_count_values($columnValues);
418
419			return count($distinctValues) <= $count / 5;
420			}
421
422			/**
423			* Used to set predefined features to consider while deciding which column to use for a split
424			*/
425			protected function setSelectedFeatures(array $selectedFeatures): void
426			{
427			$this->selectedFeatures = $selectedFeatures;
428			}
429
430			/**
431			* Collects and returns an array of internal nodes that use the given
432			* column as a split criterion
433			*/
434			protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node): array
435			{
436			if ($node->isTerminal) {
437			return [];
438			}
439
440			$nodes = [];
441			if ($node->columnIndex === $column) {
442			$nodes[] = $node;
443			}
444
445			$lNodes = [];
446			$rNodes = [];
447			if ($node->leftLeaf !== null) {
448			$lNodes = $this->getSplitNodesByColumn($column, $node->leftLeaf);
449			}
450
451			if ($node->rightLeaf !== null) {
452			$rNodes = $this->getSplitNodesByColumn($column, $node->rightLeaf);
453			}
454
455			return array_merge($nodes, $lNodes, $rNodes);
456			}
457
458			/**
459			* @return mixed
460			*/
461			protected function predictSample(array $sample)
462			{
463			$node = $this->tree;
464			do {
465			if ($node->isTerminal) {
466			break;
467			}
468
469			if ($node->evaluate($sample)) {
470			$node = $node->leftLeaf;
471			} else {
472			$node = $node->rightLeaf;
473			}
474			} while ($node);
475
476			return $node !== null ? $node->classValue : $this->labels[0];
477			}
478			}
479

php-ai / php-ml

Pull Request — master (#317)

DecisionTree F

Complexity

Size/Duplication

Coupling/Cohesion

Importance

16 Methods

How to fix Complexity

Complex Class

Duplication Side-by-Side

Filter issues like