Passed
Push — master ( 7e03c5...6dfa53 )
by Dev
34:30 queued 19:18
created

SimplePageRankCalculator::initLinksIndex()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 13
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 3
eloc 8
c 1
b 0
f 1
nc 3
nop 0
dl 0
loc 13
rs 10
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use League\Csv\Reader;
6
7
/**
 * Page Rank Calculator.
 *
 * Builds an inbound-link index from the crawl's links `Index.csv` and
 * iteratively computes a damped rank for every page that has at least one
 * inbound link. The whole computation runs from the constructor; call
 * record() afterwards to merge the scores into the previously harvested data.
 */
class SimplePageRankCalculator
{
    /**
     * @var CrawlerConfig
     */
    protected $config;

    /**
     * Number of ranked pages, lazily computed (and cached) by getPagesNbr().
     *
     * @var int|null
     */
    protected $pagesNbr;

    /**
     * @var array [id => pagerank]; a value stays null until the first sweep
     */
    protected $results;

    /**
     * Upper bound on the number of sweeps performed by computePageRank().
     *
     * @var int
     */
    protected $maxIteration = 10000;

    /**
     * Target page id => list of the ids of the pages linking to it.
     *
     * @var array
     */
    protected $linksTo = [];

    /**
     * Source page id => number of `Index.csv` rows having that id as `From`.
     *
     * @var array
     */
    protected $nbrLinksFrom = [];

    /**
     * @var float
     */
    protected $dampingFactor = 0.85;

    /**
     * Loads the crawl configuration, indexes its links and computes the ranks.
     *
     * @param string      $id            crawl identifier forwarded to CrawlerConfig::loadFrom()
     * @param string|null $dataDirectory data directory forwarded to CrawlerConfig::loadFrom()
     */
    public function __construct(string $id, ?string $dataDirectory = null)
    {
        $this->config = CrawlerConfig::loadFrom($id, $dataDirectory);

        $this->initLinksIndex();

        // One (still unknown) score per page having inbound links, in reverse index order.
        $this->results = array_fill_keys(array_reverse(array_keys($this->linksTo)), null);

        $this->computePageRank();
    }

    /**
     * Merges the computed ranks into the data of the previous crawl and records it.
     *
     * @return string path of the data file (`data.csv` inside the resolved data folder)
     */
    public function record()
    {
        // merge it with previous data harvested
        $data = $this->config->getDataFromPreviousCrawl();
        $urls = $data['urls'];

        foreach ($urls as $k => $url) {
            // Pages without a computed rank are left untouched.
            if (isset($this->results[$url->id])) {
                $urls[$k]->pagerank = $this->results[$url->id];
            }
        }

        (new Recorder($this->config->getDataFolder(), $this->config->getCacheMethod()))->record($urls);

        // return data filepath
        return realpath($this->config->getDataFolder()).'/data.csv';
    }

    /**
     * Runs calcul() up to $maxIteration times.
     *
     * Stops early as soon as a sweep leaves every rank strictly unchanged: a
     * sweep is a deterministic function of the current ranks, so any further
     * sweep would produce exactly the same values.
     */
    protected function computePageRank()
    {
        for ($iteration = 0; $iteration < $this->maxIteration; ++$iteration) {
            // Strict comparison: an overriding calcul() that returns nothing
            // (null) keeps the historical "always run every iteration" behaviour.
            if (false === $this->calcul()) {
                break;
            }
        }
    }

    /**
     * Performs one in-place sweep over every ranked page.
     *
     * Each page receives dampingFactor * (sum of rank / outbound links of the
     * pages linking to it) + (1 - dampingFactor) / number of ranked pages.
     * Ranks updated earlier in the sweep are used by the pages processed later.
     *
     * @return bool true when at least one rank changed during the sweep
     */
    protected function calcul()
    {
        $changed = false;

        foreach ($this->results as $id => $pageRank) {
            $sumPR = 0;
            $sources = $this->getLinksTo($id) ?? [];

            foreach ($sources as $link) {
                $outbound = $this->getNbrLinksFrom($link);
                if (!$outbound) {
                    // Unknown source: nothing to distribute (and nothing to divide by).
                    continue;
                }

                // A source without inbound links has no entry in $results; it contributes 0.
                $sumPR += ($this->results[$link] ?? 0) / $outbound;
            }

            $newRank = $this->dampingFactor * $sumPR + (1 - $this->dampingFactor) / $this->getPagesNbr();

            if ($newRank !== $pageRank) {
                $changed = true;
            }

            $this->results[$id] = $newRank;
        }

        return $changed;
    }

    /**
     * @return int number of pages having at least one inbound link (cached after the first call)
     */
    protected function getPagesNbr()
    {
        if (null !== $this->pagesNbr) {
            return $this->pagesNbr;
        }

        return $this->pagesNbr = count($this->linksTo);
    }

    /**
     * @return array|null ids of the pages linking to $id, null when $id has no inbound link
     */
    protected function getLinksTo(int $id): ?array
    {
        return $this->linksTo[$id] ?? null;
    }

    /**
     * @return int|null number of links indexed from page $id, null when there is none
     */
    protected function getNbrLinksFrom(int $id): ?int
    {
        return $this->nbrLinksFrom[$id] ?? null;
    }

    /**
     * Reads the links `Index.csv` (first row used as header, `From` and `To`
     * columns) and fills $linksTo and $nbrLinksFrom.
     */
    protected function initLinksIndex()
    {
        $csv = Reader::createFromPath($this->config->getDataFolder().Recorder::LINKS_DIR.'/Index.csv', 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        foreach ($records as $r) {
            if (!isset($this->linksTo[$r['To']])) {
                $this->linksTo[$r['To']] = [];
            }
            $this->linksTo[$r['To']][] = $r['From'];

            $this->nbrLinksFrom[$r['From']] = ($this->nbrLinksFrom[$r['From']] ?? 0) + 1;
        }
    }
}
115