PCA::eigenDecomposition() - Code Metrics - Inspection of "Linear algebra operations, Dimensionality reductio..." - php-ai/php-ml - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 6296e4...a87859 )

by Arkadiusz

created 2017-04-23 07:03 UTC

PCA::eigenDecomposition() B

↳ Parent: PCA

Complexity

Conditions	5
Paths	5

Size

Total Lines	31
Code Lines	20

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
dl	0
loc	31
rs	8.439
c	0
b	0
f	0
cc	5
eloc	20
nc	5
nop	2

<?php

declare(strict_types=1);

namespace Phpml\DimensionReduction;

use Phpml\Math\LinearAlgebra\EigenvalueDecomposition;
use Phpml\Math\Statistic\Covariance;
use Phpml\Math\Statistic\Mean;
use Phpml\Math\Matrix;

class PCA
{
    /**
     * Total variance to be conserved after the reduction
     *
     * @var float
     */
    public $totalVariance = 0.9;

    /**
     * Number of features to be preserved after the reduction.
     * When null, the number of components is driven by $totalVariance instead.
     *
     * @var int|null
     */
    public $numFeatures = null;

    /**
     * Mean value of each column of the data passed to the last fit().
     * Reused by transform() so new samples are centered the same way.
     *
     * @var array
     */
    protected $means = [];

    /**
     * Eigenvectors of the covariance matrix
     *
     * @var array
     */
    protected $eigVectors = [];

    /**
     * Top eigenValues of the covariance matrix
     *
     * @var array
     */
    protected $eigValues = [];

    /**
     * Whether fit() has completed at least once
     *
     * @var bool
     */
    protected $fit = false;

    /**
     * PCA (Principal Component Analysis) used to explain given
     * data with lower number of dimensions. This analysis transforms the
     * data to a lower dimensional version of it by conserving a proportion of total variance
     * within the data. It is a lossy data compression technique.<br>
     *
     * Only one of the two arguments may be given; when both are omitted the
     * default $totalVariance of 0.9 is used.
     *
     * @param float $totalVariance Total explained variance to be preserved (0.1 .. 0.99)
     * @param int $numFeatures Number of features to be preserved (greater than 0)
     *
     * @throws \Exception when an argument is out of range or both arguments are given
     */
    public function __construct($totalVariance = null, $numFeatures = null)
    {
        if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
            throw new \Exception("Total variance can be a value between 0.1 and 0.99");
        }
        if ($numFeatures !== null && $numFeatures <= 0) {
            throw new \Exception("Number of features to be preserved should be greater than 0");
        }
        if ($totalVariance !== null && $numFeatures !== null) {
            throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
        }

        if ($numFeatures !== null) {
            $this->numFeatures = $numFeatures;
        }
        if ($totalVariance !== null) {
            $this->totalVariance = $totalVariance;
        }
    }

    /**
     * Takes a data and returns a lower dimensional version
     * of this data while preserving $totalVariance or $numFeatures. <br>
     * $data is an n-by-m matrix and returned array is
     * n-by-k matrix where k <= m
     *
     * @param array $data
     *
     * @return array
     */
    public function fit(array $data)
    {
        // Number of columns (features) taken from the first row
        $n = count($data[0]);

        // normalize() only computes the means while the cache is empty, so
        // clear whatever an earlier fit() left behind; otherwise a re-fit on
        // different data would be centered with stale means.
        $this->means = [];

        $data = $this->normalize($data, $n);

        // The data has just been centered, so zeros are passed as the second
        // argument (presumably the per-column means - confirm against Covariance).
        $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));

        list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($covMatrix, $n);

        $this->fit = true;

        return $this->reduce($data);
    }

    /**
     * Computes the arithmetic mean of each of the first $n columns of
     * $data and stores them in $this->means
     *
     * @param array $data
     * @param int $n Number of columns to process
     */
    protected function calculateMeans(array $data, int $n)
    {
        // Calculate means for each dimension
        $this->means = [];
        for ($i = 0; $i < $n; $i++) {
            $column = array_column($data, $i);
            $this->means[] = Mean::arithmetic($column);
        }
    }

    /**
     * Normalization of the data includes subtracting mean from
     * each dimension therefore dimensions will be centered to zero.
     * The means are computed from $data only when none are stored yet;
     * otherwise the stored ones are reused.
     *
     * @param array $data
     * @param int $n Number of columns to center in every row
     *
     * @return array The centered copy of $data
     */
    protected function normalize(array $data, int $n)
    {
        if (empty($this->means)) {
            $this->calculateMeans($data, $n);
        }

        // Normalize data
        foreach ($data as $i => $row) {
            for ($k = 0; $k < $n; $k++) {
                $data[$i][$k] -= $this->means[$k];
            }
        }

        return $data;
    }

    /**
     * Calculates eigenValues and eigenVectors of the given matrix. Returns
     * top eigenVectors along with the largest eigenValues. The total explained variance
     * of these eigenVectors will be no less than desired $totalVariance value,
     * unless $numFeatures is set, in which case at most that many are returned.
     *
     * @param array $matrix
     * @param int $n Not used by this implementation; kept so the signature stays unchanged
     *
     * @return array Two-element array: [eigenvalues, eigenvectors], in matching order
     */
    protected function eigenDecomposition(array $matrix, int $n)
    {
        $eig = new EigenvalueDecomposition($matrix);
        $eigVals = $eig->getRealEigenvalues();
        $eigVects = $eig->getEigenvectors();

        $totalEigVal = array_sum($eigVals);
        // Sort eigenvalues in descending order; arsort() preserves the keys,
        // so $i below still indexes the entry of $eigVects for that value.
        arsort($eigVals);

        $explainedVar = 0.0;
        $vectors = [];
        $values = [];
        foreach ($eigVals as $i => $eigVal) {
            // With a zero total the ratio would be a division by zero, so it is
            // skipped; the variance threshold is then never reached and every
            // component is kept.
            if ((float) $totalEigVal !== 0.0) {
                $explainedVar += $eigVal / $totalEigVal;
            }
            $vectors[] = $eigVects[$i];
            $values[] = $eigVal;

            if ($this->numFeatures !== null) {
                if (count($vectors) == $this->numFeatures) {
                    break;
                }
            } elseif ($explainedVar >= $this->totalVariance) {
                break;
            }
        }

        return [$values, $vectors];
    }

    /**
     * Returns the reduced data: $data multiplied by the transpose of the
     * stored eigenvector matrix
     *
     * @param array $data
     *
     * @return array
     */
    protected function reduce(array $data)
    {
        $m1 = new Matrix($data);
        $m2 = new Matrix($this->eigVectors);

        return $m1->multiply($m2->transpose())->toArray();
    }

    /**
     * Transforms the given sample to a lower dimensional vector by using
     * the eigenVectors obtained in the last run of <code>fit</code>.
     * A single sample (flat array) is wrapped into a one-row matrix first.
     *
     * @param array $sample
     *
     * @return array
     *
     * @throws \Exception when fit() has not been run yet
     */
    public function transform(array $sample)
    {
        if (!$this->fit) {
            throw new \Exception("PCA has not been fitted with respect to original dataset, please run PCA::fit() first");
        }

        if (! is_array($sample[0])) {
            $sample = [$sample];
        }

        $sample = $this->normalize($sample, count($sample[0]));

        return $this->reduce($sample);
    }
}


1		<?php
2
3		declare(strict_types=1);
4
5		namespace Phpml\DimensionReduction;
6
7		use Phpml\Math\LinearAlgebra\EigenvalueDecomposition;
8		use Phpml\Math\Statistic\Covariance;
9		use Phpml\Math\Statistic\Mean;
10		use Phpml\Math\Matrix;
11
12		class PCA
13		{
14		/**
15		* Total variance to be conserved after the reduction
16		*
17		* @var float
18		*/
19		public $totalVariance = 0.9;
20
21		/**
22		* Number of features to be preserved after the reduction
23		*
24		* @var int
25		*/
26		public $numFeatures = null;
27
28		/**
29		* Temporary storage for mean values for each dimension in given data
30		*
31		* @var array
32		*/
33		protected $means = [];
34
35		/**
36		* Eigenvectors of the covariance matrix
37		*
38		* @var array
39		*/
40		protected $eigVectors = [];
41
42		/**
43		* Top eigenValues of the covariance matrix
44		*
45		* @var type
46		*/
47		protected $eigValues = [];
48
49		/**
50		* @var bool
51		*/
52		protected $fit = false;
53
54		/**
55		* PCA (Principal Component Analysis) used to explain given
56		* data with lower number of dimensions. This analysis transforms the
57		* data to a lower dimensional version of it by conserving a proportion of total variance
58		* within the data. It is a lossy data compression technique.<br>
59		*
60		* @param float $totalVariance Total explained variance to be preserved
61		* @param int $numFeatures Number of features to be preserved
62		*
63		* @throws \Exception
64		*/
65		public function __construct($totalVariance = null, $numFeatures = null)
66		{
67		if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
68		throw new \Exception("Total variance can be a value between 0.1 and 0.99");
69		}
70		if ($numFeatures !== null && $numFeatures <= 0) {
71		throw new \Exception("Number of features to be preserved should be greater than 0");
72		}
73		if ($totalVariance !== null && $numFeatures !== null) {
74		throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
75		}
76
77		if ($numFeatures !== null) {
78		$this->numFeatures = $numFeatures;
79		}
80		if ($totalVariance !== null) {
81		$this->totalVariance = $totalVariance;
82		}
83		}
84
85		/**
86		* Takes a data and returns a lower dimensional version
87		* of this data while preserving $totalVariance or $numFeatures. <br>
88		* $data is an n-by-m matrix and returned array is
89		* n-by-k matrix where k <= m
90		*
91		* @param array $data
92		*
93		* @return array
94		*/
95		public function fit(array $data)
96		{
97		$n = count($data[0]);
98
99		$data = $this->normalize($data, $n);
100
101		$covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));
102
103		list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($covMatrix, $n);
104
105		$this->fit = true;
106
107		return $this->reduce($data);
108		}
109
110		/**
111		* @param array $data
112		* @param int $n
113		*/
114		protected function calculateMeans(array $data, int $n)
115		{
116		// Calculate means for each dimension
117		$this->means = [];
118	View Code Duplication	for ($i=0; $i < $n; $i++) {
		0 ignored issues – show Duplication introduced 2017-04-19 11:50 UTC by Report Bug Copy Issue Report This code seems to be duplicated across your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository. Loading history...
119		$column = array_column($data, $i);
120		$this->means[] = Mean::arithmetic($column);
121		}
122		}
123
124		/**
125		* Normalization of the data includes subtracting mean from
126		* each dimension therefore dimensions will be centered to zero
127		*
128		* @param array $data
129		* @param int $n
130		*
131		* @return array
132		*/
133		protected function normalize(array $data, int $n)
134		{
135		if (empty($this->means)) {
136		$this->calculateMeans($data, $n);
137		}
138
139		// Normalize data
140		foreach ($data as $i => $row) {
141		for ($k=0; $k < $n; $k++) {
142		$data[$i][$k] -= $this->means[$k];
143		}
144		}
145
146		return $data;
147		}
148
149		/**
150		* Calculates eigenValues and eigenVectors of the given matrix. Returns
151		* top eigenVectors along with the largest eigenValues. The total explained variance
152		* of these eigenVectors will be no less than desired $totalVariance value
153		*
154		* @param array $matrix
155		* @param int $n
156		*
157		* @return array
158		*/
159		protected function eigenDecomposition(array $matrix, int $n)
		0 ignored issues – show Unused Code introduced 2017-04-20 21:25 UTC by Report Bug Copy Issue Report The parameter `$n` is not used and could be removed. This check looks for parameters that have been defined for a function or method, but which are not used in the method body. Loading history...
160		{
161		$eig = new EigenvalueDecomposition($matrix);
162		$eigVals = $eig->getRealEigenvalues();
163		$eigVects= $eig->getEigenvectors();
164
165		$totalEigVal = array_sum($eigVals);
166		// Sort eigenvalues in descending order
167		arsort($eigVals);
168
169		$explainedVar = 0.0;
170		$vectors = [];
171		$values = [];
172		foreach ($eigVals as $i => $eigVal) {
173		$explainedVar += $eigVal / $totalEigVal;
174		$vectors[] = $eigVects[$i];
175		$values[] = $eigVal;
176
177		if ($this->numFeatures !== null) {
178		if (count($vectors) == $this->numFeatures) {
179		break;
180		}
181		} else {
182		if ($explainedVar >= $this->totalVariance) {
183		break;
184		}
185		}
186		}
187
188		return [$values, $vectors];
189		}
190
191		/**
192		* Returns the reduced data
193		*
194		* @param array $data
195		*
196		* @return array
197		*/
198		protected function reduce(array $data)
199		{
200		$m1 = new Matrix($data);
201		$m2 = new Matrix($this->eigVectors);
202
203		return $m1->multiply($m2->transpose())->toArray();
204		}
205
206		/**
207		* Transforms the given sample to a lower dimensional vector by using
208		* the eigenVectors obtained in the last run of <code>fit</code>.
209		*
210		* @param array $sample
211		*
212		* @return array
213		*/
214		public function transform(array $sample)
215		{
216		if (!$this->fit) {
217		throw new \Exception("PCA has not been fitted with respect to original dataset, please run PCA::fit() first");
218		}
219
220		if (! is_array($sample[0])) {
221		$sample = [$sample];
222		}
223
224		$sample = $this->normalize($sample, count($sample[0]));
225
226		return $this->reduce($sample);
227		}
228		}
229

php-ai / php-ml

Push — master ( 6296e4...a87859 )

PCA::eigenDecomposition() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like