SimplePageRankCalculator::record()   A
last analyzed

Complexity

Conditions 3
Paths 3

Size

Total Lines 16
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 3
eloc 7
c 1
b 0
f 1
nc 3
nop 0
dl 0
loc 16
rs 10
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use League\Csv\Reader;
6
7
/**
 * Page Rank Calculator.
 *
 * Loads the link index recorded for a crawl, iterates a simplified PageRank
 * formula over every page that is the target of at least one link, and can
 * merge the resulting scores into the data harvested by the previous crawl.
 *
 * NOTE(review): pages that only emit links (never appear in the `To` column)
 * are neither ranked nor counted in the page total; their contribution to the
 * pages they link to is 0. Confirm this is the intended simplification.
 */
class SimplePageRankCalculator
{
    /**
     * @var CrawlerConfig
     */
    protected $config;

    /**
     * @var int|null Number of ranked pages, lazily computed by getPagesNbr()
     */
    protected $pagesNbr;

    /**
     * @var array [id => pagerank]
     */
    protected $results;

    /**
     * @var int Upper bound on the number of passes run by the constructor
     */
    protected $maxIteration = 10000;

    /**
     * @var array [target id => list of source ids linking to it]
     */
    protected $linksTo = [];

    /**
     * @var array [source id => number of index rows having that id as `From`]
     */
    protected $nbrLinksFrom = [];

    /**
     * @var float
     */
    protected $dampingFactor = 0.85;

    /**
     * Load the crawl configuration, build the link index and compute the scores.
     *
     * @param string      $id            Crawl identifier forwarded to CrawlerConfig::loadFrom()
     * @param string|null $dataDirectory Data directory forwarded to CrawlerConfig::loadFrom()
     */
    public function __construct(string $id, ?string $dataDirectory = null)
    {
        $this->config = CrawlerConfig::loadFrom($id, $dataDirectory);

        $this->initLinksIndex();

        // Ids are visited in reverse order of their first appearance as a link target.
        $this->results = array_fill_keys(array_reverse(array_keys($this->linksTo)), null);

        for ($iteration = 0; $iteration < $this->maxIteration; ++$iteration) {
            $previous = $this->results;
            $this->calcul();

            // A pass that leaves every score strictly unchanged is a fixed point:
            // all remaining passes would reproduce the same values, so stop here.
            if ($previous === $this->results) {
                break;
            }
        }
    }

    /**
     * Merge the computed scores into the previous crawl data and record it.
     *
     * Only urls whose id has a non-null computed score get a `pagerank` value.
     *
     * @return string Path of the data file: resolved data folder followed by `/data.csv`
     */
    public function record()
    {
        // merge it with previous data harvested
        $data = $this->config->getDataFromPreviousCrawl();
        $urls = $data['urls'];

        foreach ($urls as $k => $url) {
            if (isset($this->results[$url->id])) {
                $urls[$k]->pagerank = $this->results[$url->id];
            }
        }

        (new Recorder($this->config->getDataFolder(), $this->config->getCacheMethod()))->record($urls);

        // return data filepath
        return realpath($this->config->getDataFolder()).'/data.csv';
    }

    /**
     * Run one pass of the PageRank formula over every ranked page.
     *
     * For each page p: PR(p) = d * sum(PR(q) / L(q)) + (1 - d) / N, where q
     * ranges over the pages linking to p, L(q) is the number of links emitted
     * by q, d is the damping factor and N the number of ranked pages.
     * Scores are updated in place, so a value computed earlier in the pass is
     * already used by the pages processed after it.
     */
    protected function calcul()
    {
        $ids = array_keys($this->results);
        if ([] === $ids) {
            // Nothing to rank; also avoids dividing by a page count of zero below.
            return;
        }

        // Loop-invariant part of the formula.
        $baseRank = (1 - $this->dampingFactor) / $this->getPagesNbr();

        foreach ($ids as $id) {
            $sumPR = 0;
            foreach ($this->getLinksTo($id) ?? [] as $link) {
                // A source page without inbound links has no score of its own:
                // it contributes 0 instead of triggering an undefined-key warning.
                $sumPR += ($this->results[$link] ?? 0) / $this->getNbrLinksFrom($link);
            }

            $this->results[$id] = $this->dampingFactor * $sumPR + $baseRank;
        }
    }

    /**
     * Number of ranked pages (distinct link targets), computed once then cached.
     *
     * @return int
     */
    protected function getPagesNbr()
    {
        if (null !== $this->pagesNbr) {
            return $this->pagesNbr;
        }

        return $this->pagesNbr = count($this->linksTo);
    }

    /**
     * Ids of the pages linking to the given page.
     *
     * @return array|null Null when the page is not the target of any indexed link
     */
    protected function getLinksTo(int $id): ?array
    {
        return $this->linksTo[$id] ?? null;
    }

    /**
     * Number of indexed links emitted by the given page.
     *
     * @return int|null Null when the page is not the source of any indexed link
     */
    protected function getNbrLinksFrom(int $id): ?int
    {
        return $this->nbrLinksFrom[$id] ?? null;
    }

    /**
     * Build the inbound-link lists and outbound-link counts from the links index CSV.
     *
     * Each record is read through its `From` and `To` columns; the first CSV
     * row is used as the header.
     */
    protected function initLinksIndex()
    {
        $csv = Reader::createFromPath($this->config->getDataFolder().Recorder::LINKS_DIR.'/Index.csv', 'r');
        $csv->setHeaderOffset(0);

        foreach ($csv->getRecords() as $r) {
            // Appending creates the target's list on first use.
            $this->linksTo[$r['To']][] = $r['From'];

            $this->nbrLinksFrom[$r['From']] = ($this->nbrLinksFrom[$r['From']] ?? 0) + 1;
        }
    }
}
116