1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace Phpml\Math\Statistic; |
6
|
|
|
|
7
|
|
|
use Phpml\Exception\InvalidArgumentException; |
8
|
|
|
|
9
|
|
|
/**
 * Analysis of variance
 * https://en.wikipedia.org/wiki/Analysis_of_variance
 *
 * Stateless helper: every method is static and works only on its arguments.
 */
final class ANOVA
{
    /**
     * The one-way ANOVA tests the null hypothesis that 2 or more groups have
     * the same population mean. The test is applied to samples from two or
     * more groups, possibly with differing sizes.
     *
     * One F statistic is computed per feature (column):
     *
     *   SSbn = sum over classes (classTotal^2 / classSize) - grandTotal^2 / N
     *   SSwn = (sum of all squared values - grandTotal^2 / N) - SSbn
     *   F    = (SSbn / (classes - 1)) / (SSwn / (N - classes))
     *
     * Class keys and sample keys are ignored (both levels are re-indexed), so
     * the classes may be keyed by label. Feature values inside a sample are
     * expected to be keyed 0..n-1, and the number of features is taken from
     * the first sample of the first class.
     *
     * NOTE: there is no guard for a zero denominator. When every class holds a
     * single sample (N - classes = 0) or the within-group sum of squares of a
     * feature is 0, the division fails the way PHP's "/" operator does
     * (DivisionByZeroError on PHP 8).
     *
     * @param array[] $samples - each row is class samples
     *
     * @return float[] F value per feature, indexed by feature position
     *
     * @throws InvalidArgumentException when fewer than 2 classes are given, when a class
     *                                  is empty or not an array, or when a sample is not an array
     */
    public static function oneWayF(array $samples): array
    {
        $classes = count($samples);
        if ($classes < 2) {
            throw new InvalidArgumentException('The array must have at least 2 elements');
        }

        // Re-index classes and samples so the helpers can safely read index 0.
        $samples = self::normalizeSamples($samples);

        $samplesPerClass = array_map(static function (array $class): int {
            return count($class);
        }, $samples);
        $allSamples = (int) array_sum($samplesPerClass);
        $ssAllSamples = self::sumOfSquaresPerFeature($samples);
        $sumSamples = self::sumOfFeaturesPerClass($samples);
        $squareSumSamples = self::sumOfSquares($sumSamples);
        $sumSamplesSquare = self::squaresSum($sumSamples);
        $ssbn = self::calculateSsbn($sumSamplesSquare, $samplesPerClass, $squareSumSamples, $allSamples);
        $sswn = self::calculateSswn($ssbn, $ssAllSamples, $squareSumSamples, $allSamples);
        $dfbn = $classes - 1;
        $dfwn = $allSamples - $classes;

        $f = [];
        foreach ($ssbn as $index => $between) {
            // Mean square between groups divided by mean square within groups.
            $f[$index] = ($between / $dfbn) / ($sswn[$index] / $dfwn);
        }

        return $f;
    }

    /**
     * Validates the input shape and re-indexes classes and samples from 0.
     *
     * @param array $samples raw input of oneWayF()
     *
     * @return array[] list of classes, each a non-empty list of sample arrays
     *
     * @throws InvalidArgumentException when a class is empty or not an array,
     *                                  or when a sample is not an array
     */
    private static function normalizeSamples(array $samples): array
    {
        $normalized = [];
        foreach ($samples as $class) {
            if (!is_array($class) || $class === []) {
                throw new InvalidArgumentException('Each class must be a non-empty array of samples');
            }

            foreach ($class as $sample) {
                if (!is_array($sample)) {
                    throw new InvalidArgumentException('Each sample must be an array of feature values');
                }
            }

            $normalized[] = array_values($class);
        }

        return $normalized;
    }

    /**
     * Sums the squared values of every feature over all samples of all classes.
     *
     * @param array[] $samples normalized classes (see normalizeSamples())
     *
     * @return array sum of squares per feature
     */
    private static function sumOfSquaresPerFeature(array $samples): array
    {
        $sum = array_fill(0, count($samples[0][0]), 0);
        foreach ($samples as $class) {
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $sum[$index] += $feature ** 2;
                }
            }
        }

        return $sum;
    }

    /**
     * Sums every feature separately inside each class.
     *
     * @param array[] $samples normalized classes (see normalizeSamples())
     *
     * @return array[] one row of per-feature totals for each class
     */
    private static function sumOfFeaturesPerClass(array $samples): array
    {
        return array_map(static function (array $class): array {
            $sum = array_fill(0, count($class[0]), 0);
            foreach ($class as $sample) {
                foreach ($sample as $index => $feature) {
                    $sum[$index] += $feature;
                }
            }

            return $sum;
        }, $samples);
    }

    /**
     * Adds the class totals of each feature together and squares the result,
     * i.e. returns the squared grand total per feature (despite the name).
     *
     * @param array[] $sums per-class feature totals from sumOfFeaturesPerClass()
     *
     * @return array squared grand total per feature
     */
    private static function sumOfSquares(array $sums): array
    {
        $totals = array_fill(0, count($sums[0]), 0);
        foreach ($sums as $row) {
            foreach ($row as $index => $sum) {
                $totals[$index] += $sum;
            }
        }

        return array_map(static function ($total) {
            return $total ** 2;
        }, $totals);
    }

    /**
     * Squares every per-class feature total individually.
     *
     * Implemented with array_map instead of by-reference foreach loops so no
     * reference variables are left bound to array elements.
     *
     * @param array[] $sums per-class feature totals from sumOfFeaturesPerClass()
     *
     * @return array[] same layout as $sums with each value squared
     */
    private static function squaresSum(array $sums): array
    {
        return array_map(static function (array $row): array {
            return array_map(static function ($sum) {
                return $sum ** 2;
            }, $row);
        }, $sums);
    }

    /**
     * Between-groups sum of squares per feature:
     * sum over classes (classTotal^2 / classSize) - grandTotal^2 / N.
     *
     * @param array[] $sumSamplesSquare squared per-class feature totals
     * @param int[]   $samplesPerClass  sample count of every class, same keys as $sumSamplesSquare
     * @param array   $squareSumSamples squared grand total per feature
     * @param int     $allSamples       total number of samples (N)
     *
     * @return array between-groups sum of squares per feature
     */
    private static function calculateSsbn(array $sumSamplesSquare, array $samplesPerClass, array $squareSumSamples, int $allSamples): array
    {
        $ssbn = array_fill(0, count($squareSumSamples), 0);
        foreach ($sumSamplesSquare as $classIndex => $class) {
            foreach ($class as $index => $feature) {
                $ssbn[$index] += $feature / $samplesPerClass[$classIndex];
            }
        }

        foreach ($squareSumSamples as $index => $sum) {
            $ssbn[$index] -= $sum / $allSamples;
        }

        return $ssbn;
    }

    /**
     * Within-groups sum of squares per feature: the total sum of squares
     * (sum of squared values - grandTotal^2 / N) minus the between-groups part.
     *
     * @param array $ssbn             between-groups sum of squares per feature
     * @param array $ssAllSamples     sum of squared values per feature
     * @param array $squareSumSamples squared grand total per feature
     * @param int   $allSamples       total number of samples (N)
     *
     * @return array within-groups sum of squares per feature
     */
    private static function calculateSswn(array $ssbn, array $ssAllSamples, array $squareSumSamples, int $allSamples): array
    {
        $sswn = [];
        foreach ($ssAllSamples as $index => $ss) {
            $sswn[$index] = ($ss - $squareSumSamples[$index] / $allSamples) - $ssbn[$index];
        }

        return $sswn;
    }
}
138
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.