1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace Phpml\Math\Statistic; |
6
|
|
|
|
7
|
|
|
use Phpml\Exception\InvalidArgumentException; |
8
|
|
|
|
9
|
|
|
class Covariance
{
    /**
     * Calculates the covariance of two arrays, x and y.
     *
     * The values are paired by key: for every key of $x the element with the
     * same key is read from $y, so $y is assumed to contain every key of $x.
     *
     * @param array $x      First series of numeric values
     * @param array $y      Second series of numeric values
     * @param bool  $sample When true the sum is divided by (n - 1) (sample covariance),
     *                      otherwise by n (population covariance); n is count($x)
     * @param float $meanX  Mean of $x; computed with Mean::arithmetic() when null
     * @param float $meanY  Mean of $y; computed with Mean::arithmetic() when null
     *
     * @return float
     *
     * @throws InvalidArgumentException When either array is empty, or when $sample
     *                                  is true and $x holds a single element
     */
    public static function fromXYArrays(array $x, array $y, $sample = true, float $meanX = null, float $meanY = null)
    {
        if (empty($x) || empty($y)) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $n = count($x);
        if ($sample && $n === 1) {
            // (n - 1) would be zero, so a sample covariance needs at least two points.
            throw InvalidArgumentException::arraySizeToSmall(2);
        }

        if ($meanX === null) {
            $meanX = Mean::arithmetic($x);
        }

        if ($meanY === null) {
            $meanY = Mean::arithmetic($y);
        }

        $sum = 0.0;
        foreach ($x as $index => $xi) {
            $yi = $y[$index];
            $sum += ($xi - $meanX) * ($yi - $meanY);
        }

        if ($sample) {
            --$n;
        }

        return $sum / $n;
    }

    /**
     * Calculates the covariance of two dimensions (columns), i and k, of the given data.
     *
     * $data is read as a list of rows; $i and $k are column offsets inside each row.
     * Rows are assumed to have the same width as the first row.
     *
     * @param array $data   Rows of numeric values
     * @param int   $i      Index of the first column
     * @param int   $k      Index of the second column
     * @param bool  $sample When true the sum is divided by (n - 1) (sample covariance),
     *                      otherwise by n (population covariance); n is count($data)
     * @param float $meanX  Mean of column $i; computed with Mean::arithmetic() when null
     * @param float $meanY  Mean of column $k; computed with Mean::arithmetic() when null
     *
     * @return float
     *
     * @throws InvalidArgumentException When $data is empty, or when $sample is true
     *                                  and $data holds a single row
     * @throws \Exception               When $i or $k is not a valid column index
     */
    public static function fromDataset(array $data, int $i, int $k, $sample = true, float $meanX = null, float $meanY = null)
    {
        if (empty($data)) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $n = count($data);
        if ($sample && $n === 1) {
            throw InvalidArgumentException::arraySizeToSmall(2);
        }

        // $i and $k select columns, so they are validated against the width of a
        // row (taken from the first row), not against the number of rows.
        $dimensions = count(reset($data));
        if ($i < 0 || $k < 0 || $i >= $dimensions || $k >= $dimensions) {
            throw new \Exception("Given indices i and k do not match with the dimensionality of data");
        }

        // Only a mean the caller did not supply is computed; a supplied mean is kept.
        if ($meanX === null) {
            $meanX = Mean::arithmetic(array_column($data, $i));
        }

        if ($meanY === null) {
            $meanY = Mean::arithmetic(array_column($data, $k));
        }

        // The two needed cells are read straight from each row, so no extra copy
        // of the column data is required for the summation.
        $sum = 0.0;
        foreach ($data as $row) {
            $sum += ($row[$i] - $meanX) * ($row[$k] - $meanY);
        }

        if ($sample) {
            --$n;
        }

        return $sum / $n;
    }

    /**
     * Returns the covariance matrix of n-dimensional data.
     *
     * Every entry is obtained from fromDataset() with $sample = true. Only the
     * entries with $i <= $k are calculated; the remaining ones are copied from
     * their mirrored position, since the matrix is symmetric.
     *
     * @param array      $data  Rows of numeric values; the width of the first row
     *                          defines the number of dimensions
     * @param array|null $means Mean of each column, indexed by column; computed
     *                          with Mean::arithmetic() when null
     *
     * @return array Square matrix where element [i][k] is the covariance of columns i and k
     *
     * @throws InvalidArgumentException When $data is empty
     */
    public static function covarianceMatrix(array $data, array $means = null)
    {
        if (empty($data)) {
            throw InvalidArgumentException::arrayCantBeEmpty();
        }

        $n = count(reset($data));

        if ($means === null) {
            $means = [];
            for ($i = 0; $i < $n; ++$i) {
                $means[] = Mean::arithmetic(array_column($data, $i));
            }
        }

        $cov = [];
        for ($i = 0; $i < $n; ++$i) {
            for ($k = 0; $k < $n; ++$k) {
                if ($i > $k) {
                    // Already calculated as [k][i] in an earlier outer iteration.
                    $cov[$i][$k] = $cov[$k][$i];
                } else {
                    $cov[$i][$k] = self::fromDataset($data, $i, $k, true, $means[$i], $means[$k]);
                }
            }
        }

        return $cov;
    }
}
156
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.