Recorder::inboundLinkToStr()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
ccs 0
cts 0
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use PiedWeb\UrlHarvester\Harvest;
6
use PiedWeb\UrlHarvester\Link;
7
8
/**
 * Persists crawl results on disk under a single record folder.
 *
 * Layout written by this class (all paths relative to the folder passed to the constructor):
 *  - data.csv / index.csv : written by {@see Recorder::record()};
 *  - links/Index.csv, links/To_*, links/From_* : written by the recordXxxLink() methods;
 *  - cache/* : raw responses written by {@see Recorder::cache()}.
 */
class Recorder
{
    /** Sub-directory (appended to the record folder) that holds the link files. */
    const LINKS_DIR = '/links';
    /** Sub-directory (appended to the record folder) that holds the cached responses. */
    const CACHE_DIR = '/cache';

    /** Cache method: cache() writes nothing. */
    const CACHE_NONE = 0;
    /** Cache method: the cache file is named after the Url id. */
    const CACHE_ID = 2;
    /** Cache method: the cache file path mirrors the Url uri (one directory per segment). */
    const CACHE_URI = 1;

    /** @var string record folder every path is built from */
    protected $folder;
    /** @var int one of the CACHE_* constants */
    protected $cacheMethod;

    /**
     * Stores the settings and creates the record folder with its links/ and cache/ sub-directories.
     *
     * @param string $folder      record folder; created (with missing parents) when absent
     * @param int    $cacheMethod one of the CACHE_* constants
     */
    public function __construct($folder, $cacheMethod = self::CACHE_ID)
    {
        $this->folder = $folder;
        $this->cacheMethod = $cacheMethod;

        if (! file_exists($folder)) {
            // Recursive so a nested target folder works even when its parents do not exist yet.
            mkdir($folder, 0777, true);
        }

        $linksDir = $folder.self::LINKS_DIR;
        if (! file_exists($linksDir)) {
            mkdir($linksDir);
            // The links index is only initialised together with a freshly created links directory.
            $this->initLinksIndex();
        }

        $cacheDir = $folder.self::CACHE_DIR;
        if (! file_exists($cacheDir)) {
            mkdir($cacheDir);
        }
    }

    /**
     * Writes the response (or the error code) for $url in the cache, unless it is already cached.
     *
     * @param Harvest|mixed $harvest a Harvest, or any other value, which is stored as
     *                               'curl_error_code:<value>'
     * @param Url           $url     the Url the cache file path is derived from
     *
     * @return int|false|null result of the last file_put_contents() call, or null when nothing
     *                        was written (caching disabled or file already present)
     */
    public function cache($harvest, Url $url)
    {
        if (self::CACHE_NONE === $this->cacheMethod || ! $this->mustWeCache($harvest)) {
            return;
        }

        $filePath = $this->getCacheFilePath($url);
        if (file_exists($filePath)) {
            // An existing cache file is never overwritten.
            return;
        }

        if (! $harvest instanceof Harvest) {
            return file_put_contents($filePath, 'curl_error_code:'.$harvest); // cache curl error code
        }

        // Headers, a blank line, then the body.
        file_put_contents(
            $filePath,
            $harvest->getResponse()->getHeaders(false).PHP_EOL.PHP_EOL.$harvest->getResponse()->getContent()
        );

        // Transfer info is stored next to the response in a '---info' side file.
        return file_put_contents($filePath.'---info', json_encode($harvest->getResponse()->getInfo()));
    }

    /**
     * Returns the cache file path for $url according to the configured cache method.
     *
     * @param Url $url
     *
     * @return string uri-based path for CACHE_URI, id-based path for any other method
     */
    public function getCacheFilePath(Url $url)
    {
        return self::CACHE_URI === $this->cacheMethod
            ? $this->getCacheFilePathWithUrlAsFilename($url)
            : $this->getCacheFilePathWithIdAsFilename($url);
    }

    /**
     * Builds a cache path mirroring the uri, creating one directory per uri segment on the way.
     *
     * The uri is trimmed of slashes and a trailing slash is appended before splitting, so the last
     * segment is always empty and the returned file name is always 'index.html'.
     *
     * NOTE(review): segments are used as directory names verbatim; a uri containing '..' would
     * point outside the cache directory. Confirm uris are normalised by the caller.
     *
     * @param Url $url
     *
     * @return string always a string (the path is returned unconditionally)
     */
    protected function getCacheFilePathWithUrlAsFilename(Url $url)
    {
        $segments = explode('/', trim($url->uri, '/').'/');
        $fileName = array_pop($segments);
        $folder = $this->folder.self::CACHE_DIR;

        foreach ($segments as $segment) {
            $folder .= '/'.$segment;
            // is_dir() is false for a missing path too, so a separate file_exists() is not needed.
            if (! is_dir($folder)) {
                mkdir($folder);
            }
        }

        return $folder.'/'.(empty($fileName) ? 'index.html' : $fileName);
    }

    /**
     * Builds the cache path named after the Url id.
     *
     * @param Url $url
     *
     * @return string
     */
    protected function getCacheFilePathWithIdAsFilename(Url $url)
    {
        return $this->folder.self::CACHE_DIR.'/'.(string) $url->id;
    }

    /**
     * Tells whether $harvest must be cached. Currently everything is cached; $harvest is not
     * inspected (a former content-type filter on 'text/html' is disabled).
     *
     * @param mixed $harvest
     *
     * @return bool always true
     */
    protected function mustWeCache($harvest)
    {
        return true;
    }

    /**
     * Rewrites data.csv (every property of each url) and index.csv (id, uri) from scratch.
     *
     * The data.csv header is taken from the properties of the first url; with an empty list
     * data.csv is left empty and index.csv only contains its header.
     *
     * @param array $urls objects exposing at least the `id` and `uri` properties
     *
     * @return bool false when one of the two files could not be opened, true otherwise
     */
    public function record(array $urls)
    {
        $dataCsv = fopen($this->folder.'/data.csv', 'w');
        $indexCsv = fopen($this->folder.'/index.csv', 'w');

        if (false === $dataCsv || false === $indexCsv) {
            // Do not leak the handle that did open when the other one failed.
            if (false !== $dataCsv) {
                fclose($dataCsv);
            }
            if (false !== $indexCsv) {
                fclose($indexCsv);
            }

            return false;
        }

        if ([] !== $urls) {
            fputcsv($dataCsv, array_keys(get_object_vars(reset($urls))));
        }
        fputcsv($indexCsv, ['id', 'uri']);

        foreach ($urls as $url) {
            fputcsv($dataCsv, get_object_vars($url));
            fputcsv($indexCsv, [$url->id, $url->uri]);
        }

        fclose($dataCsv);
        fclose($indexCsv);

        return true;
    }

    /**
     * Appends one line describing $link to the inbound-links file of $to.
     *
     * The file is named 'To_<to id>_<0|1>', the suffix being the link's mayFollow() flag.
     *
     * @param Link $link
     * @param Url  $from not used by this method
     * @param Url  $to
     */
    public function recordInboundLink(Link $link, Url $from, Url $to)
    {
        $file = $this->folder.self::LINKS_DIR.'/To_'.(string) $to->id.'_'.((int) $link->mayFollow());

        file_put_contents(
            $file,
            $this->inboundLinkToStr($link).PHP_EOL, // can use ->relativize to get only /uri
            FILE_APPEND
        );
    }

    /**
     * Serialises an inbound link as 'parentUrl;anchor;mayFollow;type'.
     *
     * @param Link $link
     *
     * @return string
     */
    protected function inboundLinkToStr(Link $link)
    {
        return implode(';', [
            $link->getParentUrl(),
            $link->getAnchor(),
            (int) $link->mayFollow(),
            $link->getType(),
        ]);
    }

    /**
     * Overwrites the outbound-links file of $from ('From_<from id>') with one line per link,
     * each formatted as 'url;anchor;mayFollow;type'.
     *
     * @param Url    $from
     * @param Link[] $links
     */
    public function recordOutboundLink(Url $from, array $links)
    {
        $lines = array_map(function (Link $link) {
            return $link->getUrl().';'.$link->getAnchor().';'.((int) $link->mayFollow()).';'.$link->getType();
        }, $links);

        file_put_contents($this->folder.self::LINKS_DIR.'/From_'.(string) $from->id, implode(PHP_EOL, $lines));
    }

    /**
     * Creates links/Index.csv with its 'From,To' header when the file does not exist yet.
     */
    protected function initLinksIndex()
    {
        $indexFile = $this->folder.self::LINKS_DIR.'/Index.csv';

        if (! file_exists($indexFile)) {
            file_put_contents($indexFile, 'From,To'.PHP_EOL);
        }
    }

    /**
     * Strips $base from the beginning of $url.
     *
     * @param string $base
     * @param string $url
     *
     * @return string|null the remainder of $url, or null when $url does not start with $base
     */
    public static function removeBase(string $base, string $url)
    {
        if (0 !== strpos($url, $base)) {
            return null;
        }

        return substr_replace($url, '', 0, strlen($base));
    }

    /**
     * Appends one 'From,To' line per link to links/Index.csv.
     *
     * 'To' is the id of the target found in $urls, 0 when the target is not found (external),
     * or -1 when the same link url was already seen in this call.
     *
     * @param string $base  prefix removed from each link's page url to get its key in $urls
     * @param Url    $from  source of every link
     * @param mixed  $urls  known urls, keyed by the page url without $base
     * @param array  $links links found on $from
     */
    public function recordLinksIndex(string $base, Url $from, $urls, array $links)
    {
        $everAdded = [];
        $content = '';

        foreach ($links as $link) {
            $content .= $from->getId();

            // NOTE(review): loose comparison kept on purpose, getUrl()'s return type is not
            // visible from here; switch to strict once it is confirmed to be a string.
            if (in_array($link->getUrl(), $everAdded)) {
                // Like Google, a duplicate link is not counted again: its juice is lost (-1).
                $content .= ',-1'.PHP_EOL;
                continue;
            }

            $everAdded[] = $link->getUrl();
            $relative = self::removeBase($base, $link->getPageUrl());

            // A null $relative means the link is outside $base: never use it as an array key
            // (it would be read as the '' key), record it as external instead.
            $targetId = (null !== $relative && isset($urls[$relative])) ? $urls[$relative]->getId() : 0;
            $content .= ','.$targetId.PHP_EOL; // 0 = external
        }

        file_put_contents($this->folder.self::LINKS_DIR.'/Index.csv', $content, FILE_APPEND);
    }
}
174