Completed
Pull Request — master (#392)
by
unknown
05:07
created

TfIdfTransformer   A

Complexity

Total Complexity 11

Size/Duplication

Total Lines 72
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 11
eloc 27
dl 0
loc 72
rs 10
c 0
b 0
f 0

3 Methods

Rating   Name   Duplication   Size   Complexity  
A fit() 0 20 5
A __construct() 0 7 2
A transform() 0 12 4
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\FeatureExtraction;
6
7
use Phpml\Transformer;
8
9
class TfIdfTransformer implements Transformer
{
    /**
     * Inverse document frequency (base-10 logarithm) per feature index,
     * populated by fit().
     *
     * @var array
     */
    private $idf = [];

    /**
     * The term counts: for each feature index, the number of fitted samples
     * in which that feature had a count greater than zero.
     *
     * @var array
     */
    private $termCounts = [];

    /**
     * The minimum accepted term frequency. transform() compares this against
     * the per-feature value stored in $termCounts.
     *
     * @var int
     */
    private $minTf;

    /**
     * The minimum accepted IDF value. transform() drops features whose IDF
     * is below this threshold.
     *
     * @var float
     */
    private $minIdf;

    /**
     * @param array $samples Optional samples to fit immediately; each sample maps feature index => count
     * @param int   $minTf   Features whose term count is below this value are removed by transform()
     * @param float $minIdf  Features whose IDF is below this value are removed by transform()
     */
    public function __construct(array $samples = [], int $minTf = 0, float $minIdf = 0.0)
    {
        // Assign the thresholds first so the object is fully configured before any fitting happens.
        $this->minTf = $minTf;
        $this->minIdf = $minIdf;

        if (count($samples) > 0) {
            $this->fit($samples);
        }
    }

    /**
     * Computes the per-feature term counts and IDF values from the given samples.
     *
     * Any previously fitted state is discarded. An empty sample list leaves the
     * transformer in an unfitted (empty) state instead of failing.
     *
     * @param array      $samples List of samples, each mapping feature index => count
     * @param array|null $targets Unused
     */
    public function fit(array $samples, ?array $targets = null): void
    {
        $this->termCounts = [];
        $this->idf = [];

        foreach ($samples as $sample) {
            foreach ($sample as $index => $count) {
                // Register every index that occurs in any sample, not only those of
                // the first sample, so later increments never hit an undefined key.
                if (!isset($this->termCounts[$index])) {
                    $this->termCounts[$index] = 0;
                }

                if ($count > 0) {
                    $this->termCounts[$index]++;
                }
            }
        }

        $sampleCount = count($samples);
        foreach ($this->termCounts as $index => $termCount) {
            // A feature that never occurs gets an IDF of 0 rather than dividing by zero.
            $this->idf[$index] = $termCount > 0
                ? log((float) $sampleCount / $termCount, 10.0)
                : 0.0;
        }
    }

    /**
     * Multiplies every feature by its IDF in place, removing the features that
     * fall below the configured thresholds. Each sample is re-indexed from zero.
     *
     * Feature indices that were not seen by fit() are treated as having a term
     * count and an IDF of zero.
     *
     * @param array      $samples Samples to transform, modified in place
     * @param array|null $targets Unused
     */
    public function transform(array &$samples, ?array &$targets = null): void
    {
        array_walk($samples, function (array &$sample): void {
            // Build a fresh list instead of unsetting elements while iterating
            // by reference, which left a dangling reference behind.
            $weighted = [];

            foreach ($sample as $index => $feature) {
                $termCount = $this->termCounts[$index] ?? 0;
                $idf = $this->idf[$index] ?? 0;

                if ($termCount < $this->minTf || $idf < $this->minIdf) {
                    continue;
                }

                $weighted[] = $feature * $idf;
            }

            $sample = $weighted;
        });
    }
}
84