Passed
Push — master ( 6700a9...62f4d8 )
by Dev
12:41
created

Recorder::inboundLinkToStr()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
ccs 0
cts 0
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Link;
7
8
/**
 * Persists crawl results on disk under a single storage folder.
 *
 * Layout written by this class (paths relative to the storage folder):
 *  - data.csv / index.csv : written by record()
 *  - /links/Index.csv     : "From,To" id pairs appended by recordLinksIndex()
 *  - /links/From_<id>     : outbound links of a page (recordOutboundLink())
 *  - /links/To_<id>_<0|1> : inbound links of a page (recordInboundLink())
 *  - /cache/...           : cached responses (cache())
 */
class Recorder
{
    /** Sub-directory of the storage folder that receives the link files. */
    const LINKS_DIR = '/links';
    /** Sub-directory of the storage folder that receives the cached responses. */
    const CACHE_DIR = '/cache';

    /** Cache method: never cache a response. */
    const CACHE_NONE = 0;
    /** Cache method: one cache file per URL, named after the URL id. */
    const CACHE_ID = 2;
    /** Cache method: cache files stored in directories mirroring the URI path. */
    const CACHE_URI = 1;

    /** @var string storage folder every path written by this class is built from */
    protected $folder;
    /** @var int one of the CACHE_* constants */
    protected $cacheMethod;

    /**
     * Stores the settings and makes sure the storage layout exists on disk.
     *
     * @param string $folder      storage folder; created (with missing parents) when absent
     * @param int    $cacheMethod one of the CACHE_* constants
     */
    public function __construct($folder, $cacheMethod = self::CACHE_ID)
    {
        $this->folder = $folder;
        $this->cacheMethod = $cacheMethod;

        $this->ensureDirectory($folder);
        $this->ensureDirectory($folder.self::LINKS_DIR);
        // initLinksIndex() only writes the header when Index.csv is missing,
        // so it is safe to call on every construction.
        $this->initLinksIndex();
        $this->ensureDirectory($folder.self::CACHE_DIR);
    }

    /**
     * Creates a directory, including missing parents, unless it already exists.
     *
     * Failures are left to PHP's own warning, as in the rest of this class.
     *
     * @param string $path
     */
    private function ensureDirectory($path)
    {
        if (!is_dir($path)) {
            mkdir($path, 0777, true);
        }
    }

    /**
     * Writes the response of a harvest to the cache, once per cache file.
     *
     * Two files are written: the cache file itself (headers, a blank line, then
     * the content) and a sibling "<cache file>---info" holding the JSON-encoded
     * response info. Nothing is written when caching is disabled, when
     * mustWeCache() rejects the response or when the cache file already exists.
     *
     * @param Harvest $harvest
     * @param Url     $url
     *
     * @return int|false|null result of writing the "---info" file, null when nothing was written
     */
    public function cache(Harvest $harvest, Url $url)
    {
        if (self::CACHE_NONE === $this->cacheMethod || !$this->mustWeCache($harvest)) {
            return;
        }

        $filePath = $this->getCacheFilePath($url);
        if (file_exists($filePath)) {
            return;
        }

        $response = $harvest->getResponse();
        file_put_contents(
            $filePath,
            $response->getHeaders(false).PHP_EOL.PHP_EOL.$response->getContent()
        );

        return file_put_contents($filePath.'---info', json_encode($response->getInfo()));
    }

    /**
     * Returns the cache file path of a URL for the configured cache method.
     *
     * Every method other than CACHE_URI resolves to the id-based path.
     *
     * @param Url $url
     *
     * @return string
     */
    public function getCacheFilePath(Url $url)
    {
        return self::CACHE_URI === $this->cacheMethod
            ? $this->getCacheFilePathWithUrlAsFilename($url)
            : $this->getCacheFilePathWithIdAsFilename($url);
    }

    /**
     * Builds a cache path mirroring the URI: "<folder>/cache/<uri path>/index.html".
     *
     * The directory part is created on the fly. The root URI resolves to
     * "<folder>/cache//index.html" (double slash kept for backward compatibility).
     *
     * NOTE(review): URI segments are used as directory names as-is; this assumes
     * the caller hands over normalised URIs (no ".." segments) - confirm upstream.
     *
     * @param Url $url
     *
     * @return string
     */
    protected function getCacheFilePathWithUrlAsFilename(Url $url)
    {
        $directory = $this->folder.self::CACHE_DIR.'/'.trim($url->uri, '/');
        $this->ensureDirectory($directory);

        return $directory.'/index.html';
    }

    /**
     * Builds a flat cache path named after the URL id: "<folder>/cache/<id>".
     *
     * @param Url $url
     *
     * @return string
     */
    protected function getCacheFilePathWithIdAsFilename(Url $url)
    {
        return $this->folder.self::CACHE_DIR.'/'.(string) $url->id;
    }

    /**
     * Tells whether a response is worth caching: only "text/html" content types are.
     *
     * The match is case-insensitive (media types are) and tolerates a missing
     * content type.
     *
     * @param Harvest $harvest
     *
     * @return bool
     */
    protected function mustWeCache(Harvest $harvest)
    {
        return false !== stripos((string) $harvest->getResponse()->getContentType(), 'text/html');
    }

    /**
     * Dumps every URL to data.csv (all object properties) and index.csv (id, uri).
     *
     * Both files are truncated first. The data.csv header is taken from the
     * properties of the first URL; with an empty list data.csv stays empty and
     * index.csv only holds its header.
     *
     * @param array $urls objects exposing at least `id` and `uri` properties
     *
     * @return bool false when one of the two files could not be opened
     */
    public function record(array $urls)
    {
        $dataCsv = fopen($this->folder.'/data.csv', 'w');
        $indexCsv = fopen($this->folder.'/index.csv', 'w');

        if (false === $dataCsv || false === $indexCsv) {
            // Do not leak the handle that did open.
            if (false !== $dataCsv) {
                fclose($dataCsv);
            }
            if (false !== $indexCsv) {
                fclose($indexCsv);
            }

            return false;
        }

        try {
            if ([] !== $urls) {
                $this->putCsvRow($dataCsv, array_keys(get_object_vars(reset($urls))));
            }
            $this->putCsvRow($indexCsv, ['id', 'uri']);

            foreach ($urls as $url) {
                $this->putCsvRow($dataCsv, get_object_vars($url));
                $this->putCsvRow($indexCsv, [$url->id, $url->uri]);
            }
        } finally {
            fclose($dataCsv);
            fclose($indexCsv);
        }

        return true;
    }

    /**
     * Writes one CSV row with PHP's historical defaults spelled out, so the
     * output stays identical while not relying on the default escape character.
     *
     * @param resource $handle
     * @param array    $fields
     */
    private function putCsvRow($handle, array $fields)
    {
        fputcsv($handle, $fields, ',', '"', '\\');
    }

    /**
     * Appends one inbound link to the "To_<target id>_<follow flag>" file.
     *
     * @param Link $link
     * @param Url  $from not used here; kept for interface compatibility
     * @param Url  $to   page receiving the link
     */
    public function recordInboundLink(Link $link, Url $from, Url $to)
    {
        file_put_contents(
            $this->folder.self::LINKS_DIR.'/To_'.(string) $to->id.'_'.((int) $link->mayFollow()),
            $this->inboundLinkToStr($link).PHP_EOL, // can use ->relativize to get only /uri
            FILE_APPEND
        );
    }

    /**
     * Serialises an inbound link as "parentUrl;anchor;follow(0|1);type".
     *
     * @param Link $link
     *
     * @return string
     */
    protected function inboundLinkToStr(Link $link)
    {
        return $this->linkToStr($link->getParentUrl(), $link);
    }

    /**
     * Overwrites the "From_<id>" file with the outbound links of a page,
     * one "url;anchor;follow(0|1);type" line per link.
     *
     * @param Url   $from
     * @param array $links list of Link
     */
    public function recordOutboundLink(Url $from, array $links)
    {
        $lines = array_map(function (Link $link) {
            return $this->linkToStr($link->getUrl(), $link);
        }, $links);

        file_put_contents($this->folder.self::LINKS_DIR.'/From_'.(string) $from->id, implode(PHP_EOL, $lines));
    }

    /**
     * Shared line format of the link files: "<url>;anchor;follow(0|1);type".
     *
     * Values are written as-is (no escaping of ';' or line breaks).
     *
     * @param mixed $url  string-convertible URL put in the first column
     * @param Link  $link
     *
     * @return string
     */
    private function linkToStr($url, Link $link)
    {
        return $url.';'.$link->getAnchor().';'.((int) $link->mayFollow()).';'.$link->getType();
    }

    /**
     * Creates links/Index.csv with its "From,To" header when the file is missing.
     */
    protected function initLinksIndex()
    {
        $indexPath = $this->folder.self::LINKS_DIR.'/Index.csv';

        if (!file_exists($indexPath)) {
            file_put_contents($indexPath, 'From,To'.PHP_EOL);
        }
    }

    /**
     * Strips `$base` from the beginning of `$url`.
     *
     * @param string $base
     * @param string $url
     *
     * @return string|null the remainder of `$url`, or null when `$url` does not start with `$base`
     */
    public static function removeBase(string $base, string $url)
    {
        $baseLength = strlen($base);

        // strncmp() gives the same answer on every PHP version, including for an
        // empty base (strpos() rejected an empty needle before PHP 8).
        return 0 === strncmp($url, $base, $baseLength) ? substr($url, $baseLength) : null;
    }

    /**
     * Appends one "fromId,toId" line per link to links/Index.csv.
     *
     * The `toId` column is:
     *  - the id found in `$urls` for an internal link seen for the first time,
     *  - 0 for a link that is external or unknown to `$urls`,
     *  - -1 for a link whose URL was already listed for this page.
     *
     * @param string             $base  prefix removed from link URLs to get the keys of `$urls`
     * @param Url                $from  page holding the links
     * @param array|\ArrayAccess $urls  known URLs indexed by their URL relative to `$base`
     * @param array              $links links found on the page
     */
    public function recordLinksIndex(string $base, Url $from, $urls, array $links)
    {
        $fromId = $from->getId();
        $everAdded = []; // set of link URLs already written for this page
        $content = '';

        foreach ($links as $link) {
            $linkUrl = (string) $link->getUrl();

            if (isset($everAdded[$linkUrl])) {
                // like Google, we should not count a duplicate link,
                // so we say the juice is lost: -1
                $content .= $fromId.',-1'.PHP_EOL;
                continue;
            }
            $everAdded[$linkUrl] = true;

            $relative = self::removeBase($base, $link->getPageUrl());
            // A null $relative means the link is outside $base: never use it as an offset.
            $toId = (null !== $relative && isset($urls[$relative])) ? $urls[$relative]->getId() : 0; // 0 = external

            $content .= $fromId.','.$toId.PHP_EOL;
        }

        file_put_contents($this->folder.self::LINKS_DIR.'/Index.csv', $content, FILE_APPEND);
    }
}
170