Complex classes like DecisionTree often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster to apply.
While breaking up the class, it is a good idea to analyze how other classes use DecisionTree, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 13 | class DecisionTree implements Classifier |
||
| 14 | { |
||
| 15 | use Trainable, Predictable; |
||
| 16 | |||
| 17 | const CONTINUOUS = 1; |
||
| 18 | const NOMINAL = 2; |
||
| 19 | |||
| 20 | /** |
||
| 21 | * @var array |
||
| 22 | */ |
||
| 23 | protected $columnTypes = []; |
||
| 24 | |||
| 25 | /** |
||
| 26 | * @var array |
||
| 27 | */ |
||
| 28 | private $labels = []; |
||
| 29 | |||
| 30 | /** |
||
| 31 | * @var int |
||
| 32 | */ |
||
| 33 | private $featureCount = 0; |
||
| 34 | |||
| 35 | /** |
||
| 36 | * @var DecisionTreeLeaf |
||
| 37 | */ |
||
| 38 | protected $tree = null; |
||
| 39 | |||
| 40 | /** |
||
| 41 | * @var int |
||
| 42 | */ |
||
| 43 | protected $maxDepth; |
||
| 44 | |||
| 45 | public static $categoricalColumnMinimumUniqueValueCount = 0.2; |
||
| 46 | |||
| 47 | /** |
||
| 48 | * @var int |
||
| 49 | */ |
||
| 50 | public $actualDepth = 0; |
||
| 51 | |||
| 52 | /** |
||
| 53 | * @var int |
||
| 54 | */ |
||
| 55 | private $numUsableFeatures = 0; |
||
| 56 | |||
| 57 | /** |
||
| 58 | * @var array |
||
| 59 | */ |
||
| 60 | private $selectedFeatures; |
||
| 61 | |||
| 62 | /** |
||
| 63 | * @var array |
||
| 64 | */ |
||
| 65 | private $featureImportances = null; |
||
| 66 | |||
| 67 | /** |
||
| 68 | * |
||
| 69 | * @var array |
||
| 70 | */ |
||
| 71 | private $columnNames = null; |
||
| 72 | |||
| 73 | /** |
||
| 74 | * @param int $maxDepth |
||
| 75 | */ |
||
| 76 | public function __construct(int $maxDepth = 10) |
||
| 80 | |||
| 81 | /** |
||
| 82 | * @param array $samples |
||
| 83 | * @param array $targets |
||
| 84 | */ |
||
| 85 | public function train(array $samples, array $targets) |
||
| 120 | |||
| 121 | /** |
||
| 122 | * @param array $samples |
||
| 123 | * |
||
| 124 | * @return array |
||
| 125 | */ |
||
| 126 | public static function getColumnTypes(array $samples) : array |
||
| 138 | |||
| 139 | /** |
||
| 140 | * @param array $records |
||
| 141 | * @param int $depth |
||
| 142 | * |
||
| 143 | * @return DecisionTreeLeaf |
||
| 144 | */ |
||
| 145 | protected function getSplitLeaf(array $records, int $depth = 0) : DecisionTreeLeaf |
||
| 202 | |||
| 203 | /** |
||
| 204 | * @param array $records |
||
| 205 | * |
||
| 206 | * @return DecisionTreeLeaf |
||
| 207 | */ |
||
| 208 | protected function getBestSplit(array $records) : DecisionTreeLeaf |
||
| 252 | |||
| 253 | /** |
||
| 254 | * Returns available features/columns to the tree for the decision making |
||
| 255 | * process. <br> |
||
| 256 | * |
||
| 257 | * If a number is given with setNumFeatures() method, then a random selection |
||
| 258 | * of features up to this number is returned. <br> |
||
| 259 | * |
||
| 260 | * If some features are manually selected by use of setSelectedFeatures(), |
||
| 261 | * then only these features are returned <br> |
||
| 262 | * |
||
| 263 | * If neither of the above methods was called beforehand, then all features |
||
| 264 | * are returned by default. |
||
| 265 | * |
||
| 266 | * @return array |
||
| 267 | */ |
||
| 268 | protected function getSelectedFeatures() : array |
||
| 289 | |||
| 290 | /** |
||
| 291 | * @param mixed $baseValue |
||
| 292 | * @param array $colValues |
||
| 293 | * @param array $targets |
||
| 294 | * |
||
| 295 | * @return float |
||
| 296 | */ |
||
| 297 | public function getGiniIndex($baseValue, array $colValues, array $targets) : float |
||
| 328 | |||
| 329 | /** |
||
| 330 | * @param array $samples |
||
| 331 | * |
||
| 332 | * @return array |
||
| 333 | */ |
||
| 334 | protected function preprocess(array $samples) : array |
||
| 357 | |||
| 358 | /** |
||
| 359 | * @param array $columnValues |
||
| 360 | * |
||
| 361 | * @return bool |
||
| 362 | */ |
||
| 363 | protected static function isCategoricalColumn(array $columnValues) : bool |
||
| 387 | |||
| 388 | /** |
||
| 389 | * This method is used to set number of columns to be used |
||
| 390 | * when deciding a split at an internal node of the tree. <br> |
||
| 391 | * If the given value is 0, then all features are used (default behaviour), |
||
| 392 | * otherwise the given value will be used as a maximum for number of columns |
||
| 393 | * randomly selected for each split operation. |
||
| 394 | * |
||
| 395 | * @param int $numFeatures |
||
| 396 | * |
||
| 397 | * @return $this |
||
| 398 | * |
||
| 399 | * @throws InvalidArgumentException |
||
| 400 | */ |
||
| 401 | public function setNumFeatures(int $numFeatures) |
||
| 411 | |||
| 412 | /** |
||
| 413 | * Used to set predefined features to consider while deciding which column to use for a split |
||
| 414 | * |
||
| 415 | * @param array $selectedFeatures |
||
| 416 | */ |
||
| 417 | protected function setSelectedFeatures(array $selectedFeatures) |
||
| 421 | |||
| 422 | /** |
||
| 423 | * A string array to represent columns. Useful when HTML output or |
||
| 424 | * column importances are desired to be inspected. |
||
| 425 | * |
||
| 426 | * @param array $names |
||
| 427 | * |
||
| 428 | * @return $this |
||
| 429 | * |
||
| 430 | * @throws InvalidArgumentException |
||
| 431 | */ |
||
| 432 | public function setColumnNames(array $names) |
||
| 442 | |||
| 443 | /** |
||
| 444 | * @return string |
||
| 445 | */ |
||
| 446 | public function getHtml() |
||
| 450 | |||
| 451 | /** |
||
| 452 | * This will return an array including an importance value for |
||
| 453 | * each column in the given dataset. The importance values are |
||
| 454 | * normalized and their total makes 1.<br/> |
||
| 455 | * |
||
| 456 | * @return array |
||
| 457 | */ |
||
| 458 | public function getFeatureImportances() |
||
| 488 | |||
| 489 | /** |
||
| 490 | * Collects and returns an array of internal nodes that use the given |
||
| 491 | * column as a split criterion |
||
| 492 | * |
||
| 493 | * @param int $column |
||
| 494 | * @param DecisionTreeLeaf $node |
||
| 495 | * |
||
| 496 | * @return array |
||
| 497 | */ |
||
| 498 | protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node) : array |
||
| 523 | |||
| 524 | /** |
||
| 525 | * @param array $sample |
||
| 526 | * |
||
| 527 | * @return mixed |
||
| 528 | */ |
||
| 529 | protected function predictSample(array $sample) |
||
| 546 | |||
| 547 | /** |
||
| 548 | * @return integer[]|null[] |
||
| 549 | */ |
||
| 550 | public function getInstanceColumnTypes() { |
||
| 553 | |||
| 554 | /** |
||
| 555 | * @param integer[]|null[] $columnTypes |
||
| 556 | */ |
||
| 557 | public function setInstanceColumnTypes(array $columnTypes) { |
||
| 560 | |||
| 561 | /** |
||
| 562 | * @param array $values |
||
| 563 | * |
||
| 564 | * @return array |
||
| 565 | */ |
||
| 566 | protected static function arrayCountValues(array $values) { |
||
| 576 | } |
||
| 577 |
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.