Passed
Push — master ( 7e03c5...6dfa53 )
by Dev
34:30 queued 19:18
created

Recorder::recordLinksIndex()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 17
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
cc 4
eloc 11
c 0
b 0
f 0
nc 4
nop 4
dl 0
loc 17
rs 9.9
ccs 0
cts 0
cp 0
crap 20
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
7
/**
 * Persists crawl results on disk inside a single working folder.
 *
 * Layout written by this class (all paths relative to the folder given to the constructor):
 *  - data.csv          one row per url, columns are the url object's properties (see record())
 *  - index.csv         id,uri pairs (see record())
 *  - links/Index.csv   From,To id pairs (see recordLinksIndex())
 *  - links/To_{id}_{type}   inbound links, one source uri per line
 *  - links/From_{id}        outbound links, one url per line
 *  - cache/...         raw responses, named by id or by uri depending on the cache method
 */
class Recorder
{
    const LINKS_DIR = '/links';
    const CACHE_DIR = '/cache';

    /** Do not cache responses at all. */
    const CACHE_NONE = 0;
    /** Cache responses under cache/{url id} (default; also used for any value other than CACHE_NONE/CACHE_URI). */
    const CACHE_ID = 2;
    /** Cache responses under a directory tree mirroring the uri path. */
    const CACHE_URI = 1;

    /** @var string Root folder every file is written under. */
    protected $folder;

    /** @var int One of the CACHE_* constants. */
    protected $cacheMethod;

    /**
     * @param string $folder      Root folder; created (with its links/ and cache/ sub-folders
     *                            and the links index header) only when it does not exist yet
     * @param int    $cacheMethod One of the CACHE_* constants
     */
    public function __construct($folder, $cacheMethod = self::CACHE_ID)
    {
        $this->folder = $folder;
        $this->cacheMethod = $cacheMethod;

        if (!file_exists($folder)) {
            // Recursive so that a nested target path (e.g. "var/crawl/42") can be created in one go.
            mkdir($folder, 0777, true);
            mkdir($folder.self::LINKS_DIR);
            mkdir($folder.self::CACHE_DIR);
            $this->initLinksIndex();
        }
    }

    /**
     * Store the response (headers, blank line, body) and a JSON "---info" side file.
     *
     * Nothing is written when caching is disabled, when the response is not HTML,
     * or when a cache file already exists for this url.
     *
     * @return int|false|null Result of writing the "---info" file, or null when nothing was written
     */
    public function cache(Harvest $harvest, Url $url)
    {
        if (self::CACHE_NONE === $this->cacheMethod || !$this->mustWeCache($harvest)) {
            return;
        }

        $filePath = $this->getCacheFilePath($url);
        if (file_exists($filePath)) {
            return;
        }

        $response = $harvest->getResponse();
        file_put_contents(
            $filePath,
            $response->getHeaders(false).PHP_EOL.PHP_EOL.$response->getContent()
        );

        return file_put_contents($filePath.'---info', json_encode($response->getInfo()));
    }

    /**
     * Resolve the cache file path for a url according to the configured cache method.
     *
     * @return string
     */
    public function getCacheFilePath(Url $url)
    {
        if (self::CACHE_URI === $this->cacheMethod) {
            return $this->getCacheFilePathWithUrlAsFilename($url);
        }

        return $this->getCacheFilePathWithIdAsFilename($url);
    }

    /**
     * Build cache/{uri segments}/index.html, creating the intermediate directories.
     *
     * NOTE(review): uri segments are used as directory names without filtering; presumably
     * uris are normalised upstream (no ".." segments) — confirm against the caller.
     *
     * @return string
     */
    protected function getCacheFilePathWithUrlAsFilename(Url $url)
    {
        $segments = explode('/', trim($url->uri, '/').'/');
        // The trailing "/" appended above makes the last segment empty, hence "index.html".
        $filename = array_pop($segments);

        $folder = $this->folder.self::CACHE_DIR;
        foreach ($segments as $segment) {
            $folder .= '/'.$segment;
            if (!is_dir($folder)) {
                mkdir($folder);
            }
        }

        return $folder.'/'.(empty($filename) ? 'index.html' : $filename);
    }

    /**
     * Build cache/{url id}.
     *
     * @return string
     */
    protected function getCacheFilePathWithIdAsFilename(Url $url)
    {
        return $this->folder.self::CACHE_DIR.'/'.(string) $url->id;
    }

    /**
     * Only responses whose content type contains "text/html" are cached.
     *
     * @return bool
     */
    protected function mustWeCache(Harvest $harvest)
    {
        // Cast: a missing content type must not reach strpos() as null.
        return false !== strpos((string) $harvest->getResponse()->getContentType(), 'text/html');
    }

    /**
     * (Re)write data.csv and index.csv from the given url objects.
     *
     * data.csv gets the property names of the first url as header (no header when $urls is
     * empty) and one row of property values per url; index.csv gets "id,uri" rows.
     *
     * @param array $urls Objects exposing at least public `id` and `uri` properties
     *
     * @return bool false when either file could not be opened, true otherwise
     */
    public function record(array $urls)
    {
        $dataCsv = fopen($this->folder.'/data.csv', 'w');
        $indexCsv = fopen($this->folder.'/index.csv', 'w');

        if (false === $dataCsv || false === $indexCsv) {
            // Release whichever handle did open before reporting the failure.
            if (false !== $dataCsv) {
                fclose($dataCsv);
            }
            if (false !== $indexCsv) {
                fclose($indexCsv);
            }

            return false;
        }

        $first = reset($urls);
        if (false !== $first) {
            fputcsv($dataCsv, array_keys(get_object_vars($first)));
        }
        fputcsv($indexCsv, ['id', 'uri']);

        foreach ($urls as $url) {
            fputcsv($dataCsv, get_object_vars($url));
            fputcsv($indexCsv, [$url->id, $url->uri]);
        }

        fclose($dataCsv);
        fclose($indexCsv);

        return true;
    }

    /**
     * Append the source uri to links/To_{target id}_{type}.
     */
    public function recordInboundLink(Url $from, Url $to, int $type)
    {
        $target = $this->folder.self::LINKS_DIR.'/To_'.(string) $to->id.'_'.$type;

        file_put_contents($target, $from->uri.PHP_EOL, FILE_APPEND);
    }

    /**
     * Overwrite links/From_{source id} with one link url per line.
     *
     * @param array $links Objects exposing getUrl()
     */
    public function recordOutboundLink(Url $from, array $links)
    {
        $urls = array_map(static function ($link) {
            return $link->getUrl();
        }, $links);

        file_put_contents($this->folder.self::LINKS_DIR.'/From_'.(string) $from->id, implode(PHP_EOL, $urls));
    }

    /**
     * Create links/Index.csv with its "From,To" header when it does not exist yet.
     */
    protected function initLinksIndex()
    {
        $indexPath = $this->folder.self::LINKS_DIR.'/Index.csv';

        if (!file_exists($indexPath)) {
            file_put_contents($indexPath, 'From,To'.PHP_EOL);
        }
    }

    /**
     * Strip $base from the beginning of $url.
     *
     * @return string|null The remainder of $url, or null when $url does not start with $base
     */
    public static function removeBase(string $base, string $url)
    {
        return (0 === strpos($url, $base)) ? substr_replace($url, '', 0, strlen($base)) : null;
    }

    /**
     * Append one "fromId,toId" row per link to links/Index.csv.
     *
     * toId is:
     *  - -1 when the same link url already appeared earlier in $links (a duplicate link is
     *    counted as lost rather than added twice),
     *  - the id of the matching entry of $urls when the link's page url, made relative to
     *    $base, is a key of $urls,
     *  - 0 otherwise (external or unknown page).
     *
     * @param string $base  Prefix removed from each link's page url before looking it up
     * @param Url    $from  Page the links were found on
     * @param mixed  $urls  Array-like collection keyed by relative uri, values exposing getId()
     * @param array  $links Objects exposing getUrl() and getPageUrl()
     */
    public function recordLinksIndex(string $base, Url $from, $urls, array $links)
    {
        $seen = [];
        $content = '';
        $fromId = $from->getId();

        foreach ($links as $link) {
            $linkUrl = (string) $link->getUrl();

            if (isset($seen[$linkUrl])) {
                $content .= $fromId.',-1'.PHP_EOL;

                continue;
            }
            $seen[$linkUrl] = true;

            $relative = self::removeBase($base, $link->getPageUrl());
            // Explicit null check: a null offset would otherwise silently probe $urls[''].
            $toId = (null !== $relative && isset($urls[$relative])) ? $urls[$relative]->getId() : 0;
            $content .= $fromId.','.$toId.PHP_EOL;
        }

        file_put_contents($this->folder.self::LINKS_DIR.'/Index.csv', $content, FILE_APPEND);
    }
}
158