Complex classes like DecisionTree often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use DecisionTree and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 13 | class DecisionTree implements Classifier |
||
| 14 | { |
||
| 15 | use Trainable, Predictable; |
||
| 16 | |||
| 17 | const CONTINUOUS = 1; |
||
| 18 | const NOMINAL = 2; |
||
| 19 | |||
| 20 | /** |
||
| 21 | * @var array |
||
| 22 | */ |
||
| 23 | protected $columnTypes; |
||
| 24 | |||
| 25 | /** |
||
| 26 | * @var array |
||
| 27 | */ |
||
| 28 | private $labels = []; |
||
| 29 | |||
| 30 | /** |
||
| 31 | * @var int |
||
| 32 | */ |
||
| 33 | private $featureCount = 0; |
||
| 34 | |||
| 35 | /** |
||
| 36 | * @var DecisionTreeLeaf |
||
| 37 | */ |
||
| 38 | protected $tree = null; |
||
| 39 | |||
| 40 | /** |
||
| 41 | * @var int |
||
| 42 | */ |
||
| 43 | protected $maxDepth; |
||
| 44 | |||
| 45 | /** |
||
| 46 | * @var int |
||
| 47 | */ |
||
| 48 | public $actualDepth = 0; |
||
| 49 | |||
| 50 | /** |
||
| 51 | * @var int |
||
| 52 | */ |
||
| 53 | private $numUsableFeatures = 0; |
||
| 54 | |||
| 55 | /** |
||
| 56 | * @var array |
||
| 57 | */ |
||
| 58 | private $selectedFeatures; |
||
| 59 | |||
| 60 | /** |
||
| 61 | * @var array |
||
| 62 | */ |
||
| 63 | private $featureImportances = null; |
||
| 64 | |||
| 65 | /** |
||
| 66 | * |
||
| 67 | * @var array |
||
| 68 | */ |
||
| 69 | private $columnNames = null; |
||
| 70 | |||
| 71 | /** |
||
| 72 | * @param int $maxDepth |
||
| 73 | */ |
||
| 74 | public function __construct(int $maxDepth = 10) |
||
| 78 | |||
| 79 | /** |
||
| 80 | * @param array $samples |
||
| 81 | * @param array $targets |
||
| 82 | */ |
||
| 83 | public function train(array $samples, array $targets) |
||
| 108 | |||
| 109 | /** |
||
| 110 | * @param array $samples |
||
| 111 | * @return array |
||
| 112 | */ |
||
| 113 | public static function getColumnTypes(array $samples) : array |
||
| 125 | |||
| 126 | /** |
||
| 127 | * @param array $records |
||
| 128 | * @param int $depth |
||
| 129 | * @return DecisionTreeLeaf |
||
| 130 | */ |
||
| 131 | protected function getSplitLeaf(array $records, int $depth = 0) : DecisionTreeLeaf |
||
| 188 | |||
| 189 | /** |
||
| 190 | * @param array $records |
||
| 191 | * @return DecisionTreeLeaf |
||
| 192 | */ |
||
| 193 | protected function getBestSplit(array $records) : DecisionTreeLeaf |
||
| 235 | |||
| 236 | /** |
||
| 237 | * Returns available features/columns to the tree for the decision making |
||
| 238 | * process. <br> |
||
| 239 | * |
||
| 240 | * If a number is given with setNumFeatures() method, then a random selection |
||
| 241 | * of features up to this number is returned. <br> |
||
| 242 | * |
||
| 243 | * If some features are manually selected by use of setSelectedFeatures(), |
||
| 244 | * then only these features are returned <br> |
||
| 245 | * |
||
| 246 | * If any of above methods were not called beforehand, then all features |
||
| 247 | * are returned by default. |
||
| 248 | * |
||
| 249 | * @return array |
||
| 250 | */ |
||
| 251 | protected function getSelectedFeatures() : array |
||
| 272 | |||
| 273 | /** |
||
| 274 | * @param $baseValue |
||
| 275 | * @param array $colValues |
||
| 276 | * @param array $targets |
||
| 277 | * @return float |
||
| 278 | */ |
||
| 279 | public function getGiniIndex($baseValue, array $colValues, array $targets) : float |
||
| 304 | |||
| 305 | /** |
||
| 306 | * @param array $samples |
||
| 307 | * @return array |
||
| 308 | */ |
||
| 309 | protected function preprocess(array $samples) : array |
||
| 332 | |||
| 333 | /** |
||
| 334 | * @param array $columnValues |
||
| 335 | * @return bool |
||
| 336 | */ |
||
| 337 | protected static function isCategoricalColumn(array $columnValues) : bool |
||
| 359 | |||
| 360 | /** |
||
| 361 | * This method is used to set number of columns to be used |
||
| 362 | * when deciding a split at an internal node of the tree. <br> |
||
| 363 | * If the value is given 0, then all features are used (default behaviour), |
||
| 364 | * otherwise the given value will be used as a maximum for number of columns |
||
| 365 | * randomly selected for each split operation. |
||
| 366 | * |
||
| 367 | * @param int $numFeatures |
||
| 368 | * @return $this |
||
| 369 | * @throws InvalidArgumentException |
||
| 370 | */ |
||
| 371 | public function setNumFeatures(int $numFeatures) |
||
| 381 | |||
| 382 | /** |
||
| 383 | * Used to set predefined features to consider while deciding which column to use for a split |
||
| 384 | * |
||
| 385 | * @param array $selectedFeatures |
||
| 386 | */ |
||
| 387 | protected function setSelectedFeatures(array $selectedFeatures) |
||
| 391 | |||
| 392 | /** |
||
| 393 | * A string array to represent columns. Useful when HTML output or |
||
| 394 | * column importances are desired to be inspected. |
||
| 395 | * |
||
| 396 | * @param array $names |
||
| 397 | * @return $this |
||
| 398 | * @throws InvalidArgumentException |
||
| 399 | */ |
||
| 400 | public function setColumnNames(array $names) |
||
| 410 | |||
| 411 | /** |
||
| 412 | * @return string |
||
| 413 | */ |
||
| 414 | public function getHtml() |
||
| 418 | |||
| 419 | /** |
||
| 420 | * This will return an array including an importance value for |
||
| 421 | * each column in the given dataset. The importance values are |
||
| 422 | * normalized and their total makes 1.<br/> |
||
| 423 | * |
||
| 424 | * @return array |
||
| 425 | */ |
||
| 426 | public function getFeatureImportances() |
||
| 456 | |||
| 457 | /** |
||
| 458 | * Collects and returns an array of internal nodes that use the given |
||
| 459 | * column as a split criterion |
||
| 460 | * |
||
| 461 | * @param int $column |
||
| 462 | * @param DecisionTreeLeaf $node |
||
| 463 | * @return array |
||
| 464 | */ |
||
| 465 | protected function getSplitNodesByColumn(int $column, DecisionTreeLeaf $node) : array |
||
| 488 | |||
| 489 | /** |
||
| 490 | * @param array $sample |
||
| 491 | * @return mixed |
||
| 492 | */ |
||
| 493 | protected function predictSample(array $sample) |
||
| 509 | } |
||
| 510 |
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error, or the assigned type should be added to the documentation/type hint for that property.