Passed
Push — master ( 7e03c5...6dfa53 )
by Dev
34:30 queued 19:18
created

CrawlerConfig::getDataFromPreviousCrawl()   B

Complexity

Conditions 7
Paths 8

Size

Total Lines 33
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 1
Metric Value
cc 7
eloc 21
c 1
b 0
f 1
nc 8
nop 0
dl 0
loc 33
rs 8.6506
1
<?php
2
3
namespace PiedWeb\SeoPocketCrawler;
4
5
use League\Csv\Reader;
6
use Spatie\Robots\RobotsTxt;
7
8
/**
 * Holds the settings of one crawl (start url, ignore rules, depth limit, user agent,
 * cache method, wait time, data directory) and knows how to persist them to / reload
 * them from the crawl's data folder (`<dataDirectory>/<id>/config.json`).
 *
 * It also gives access to the CSV files (`data.csv`, `index.csv`) found in that folder.
 */
class CrawlerConfig
{
    /**
     * @var string contain the user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id (also the name of the crawl's folder inside the data directory)
     */
    protected $id;

    /**
     * @var string robots.txt-like rules describing the pages to ignore during the crawl
     *             (parsed on demand by getVirtualRobots())
     */
    protected $ignore;

    /**
     * @var int depth max where to crawl
     */
    protected $limit;

    /**
     * @var string contain https://domain.tld from start url
     */
    protected $base;

    /**
     * @var int wait duration in microseconds (constructor argument $waitInMicroSeconds)
     */
    protected $wait;

    /**
     * @var int cache method; defaults to Recorder::CACHE_ID
     */
    protected $cacheMethod;

    /**
     * @var string directory containing one folder per crawl, stored without trailing slash
     */
    protected $dataDirectory;

    /**
     * @var string start url relative to $base, always starting with a slash
     */
    protected $startUrl;

    /**
     * @var RobotsTxt|null lazily built from $ignore
     */
    protected $virtualRobots;

    /**
     * @var Url[]|null urls from index.csv keyed by their id, lazily loaded
     */
    // could be add in an other class..
    protected $index;

    /**
     * @param string      $startUrl           absolute url where the crawl starts
     * @param string      $ignore             robots.txt-like rules
     * @param int         $limit              depth max where to crawl
     * @param string      $userAgent          user agent used during the crawl
     * @param int         $cacheMethod        cache method
     * @param int         $waitInMicroSeconds wait duration in microseconds
     * @param string|null $dataDirectory      where crawl folders live; null means `<this dir>/../data`
     *
     * @throws \Exception when $startUrl is not a valid url
     */
    public function __construct(
        string $startUrl,
        string $ignore,
        int $limit,
        string $userAgent,
        int $cacheMethod = Recorder::CACHE_ID,
        int $waitInMicroSeconds = 100000,
        ?string $dataDirectory = null
    ) {
        $this->setBaseAndStartUrl($startUrl);
        // Default id: creation time (minute precision) followed by the crawled host.
        $this->id = date('ymdHi').'-'.parse_url($this->base, PHP_URL_HOST);
        $this->ignore = $ignore;
        $this->userAgent = $userAgent;
        $this->limit = $limit;
        $this->cacheMethod = $cacheMethod;
        $this->wait = $waitInMicroSeconds;
        $this->dataDirectory = self::normalizeDataDirectory($dataDirectory);
    }

    /**
     * Rebuild the config of an existing crawl from its recorded `config.json`.
     *
     * @param string      $crawlId       id of the crawl to load
     * @param string|null $dataDirectory directory containing the crawl folders; null means the default one
     *
     * @throws \Exception when the config file does not exist, cannot be read or is not valid JSON
     */
    public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
    {
        $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
        if (!file_exists($configFilePath)) {
            throw new \Exception('Crawl `'.$crawlId.'` not found.');
        }

        $rawConfig = file_get_contents($configFilePath);
        $config = false === $rawConfig ? null : json_decode($rawConfig, true);
        if (!is_array($config)) {
            // Without this guard a corrupted file surfaces later as a misleading "not a valid URL" error.
            throw new \Exception('Crawl `'.$crawlId.'` has an unreadable or invalid config.json.');
        }

        return (new self(
            ($config['base'] ?? '').($config['startUrl'] ?? ''),
            (string) ($config['ignore'] ?? ''),
            intval($config['limit'] ?? 0),
            (string) ($config['userAgent'] ?? ''),
            intval($config['cacheMethod'] ?? 0),
            intval($config['wait'] ?? 0),
            $dataDirectory
        ))->setId($crawlId);
    }

    /**
     * Write the current settings as JSON in `config.json` inside the crawl's data folder.
     */
    public function recordConfig()
    {
        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $this->startUrl,
            'base' => $this->base,
            'ignore' => $this->ignore,
            'limit' => $this->limit,
            'userAgent' => $this->userAgent,
            'cacheMethod' => $this->cacheMethod,
            'wait' => $this->wait,
        ]));
    }

    /**
     * Split an absolute url into $base (scheme + authority) and $startUrl (the rest, with a leading slash).
     *
     * @throws \Exception when $url is not a valid url
     */
    protected function setBaseAndStartUrl(string $url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        // Any scheme is accepted, case-insensitively, so `HTTPS://host/x` or `ftp://host/x`
        // keep their host in the base instead of stopping right after the scheme.
        $this->base = preg_match('@^([a-z][a-z0-9+.-]*://)?[^/\?#]+@i', $url, $match) ? $match[0] : $url;

        // The cast covers PHP < 8 where substr() returns false when nothing is left after the base.
        $path = (string) substr($url, strlen($this->base));

        $this->startUrl = ('' === $path || '/' !== $path[0] ? '/' : '').$path;
    }

    /**
     * Path of the folder storing the data of crawl $id.
     *
     * @param string      $id   crawl id
     * @param string|null $path directory containing the crawl folders; null means the default one
     *
     * @return string
     */
    public static function getDataFolderFrom($id, ?string $path)
    {
        // Same normalization as the constructor, so this path always equals getDataFolder().
        return self::normalizeDataDirectory($path).'/'.$id;
    }

    /**
     * @return string path of the folder storing the data of this crawl
     */
    public function getDataFolder()
    {
        return $this->dataDirectory.'/'.$this->id;
    }

    /**
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * @return string
     */
    public function getBase()
    {
        return $this->base;
    }

    /**
     * @return string
     */
    public function getStartUrl()
    {
        return $this->startUrl;
    }

    /**
     * @return int
     */
    public function getWait()
    {
        return $this->wait;
    }

    /**
     * @return string
     */
    public function getUserAgent()
    {
        return $this->userAgent;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @return int
     */
    public function getCacheMethod()
    {
        return $this->cacheMethod;
    }

    /**
     * @return string directory containing the crawl folders, without trailing slash
     */
    public function getDataDirectory()
    {
        return $this->dataDirectory;
    }

    /**
     * Robots rules built from the ignore string; the instance is created on first call and reused.
     *
     * @return RobotsTxt
     */
    public function getVirtualRobots()
    {
        if (null === $this->virtualRobots) {
            $this->virtualRobots = new RobotsTxt($this->ignore);
        }

        return $this->virtualRobots;
    }

    /**
     * Replace the crawl id (and therefore the data folder used by this config).
     */
    public function setId(string $id): self
    {
        $this->id = $id;

        return $this;
    }

    /**
     * Read `data.csv` from the crawl's data folder and rebuild the crawl state from it.
     *
     * @return array with keys `urls` (Url objects keyed by uri, every csv column copied on the object),
     *               `counter` (number of rows with a non-empty `can_be_crawled`) and
     *               `currentClick` (`click` value of the last row, 0 when there is none)
     *
     * @throws \Exception when data.csv does not exist
     */
    public function getDataFromPreviousCrawl()
    {
        $dataFilePath = $this->getDataFolder().'/data.csv';
        if (!file_exists($dataFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (data.csv)');
        }

        $csv = Reader::createFromPath($dataFilePath, 'r');
        $csv->setHeaderOffset(0);

        $urls = [];
        $counter = 0;
        $lastRecord = null;

        foreach ($csv->getRecords() as $record) {
            $url = new Url($this->base.$record['uri'], 0);
            foreach ($record as $column => $value) {
                // A non-empty flag becomes a real boolean; an empty one is copied untouched.
                if ('can_be_crawled' === $column && !empty($value)) {
                    $value = (bool) $value;
                }
                $url->$column = $value;
            }
            $urls[$record['uri']] = $url;

            if (!empty($record['can_be_crawled'])) {
                ++$counter;
            }

            $lastRecord = $record;
        }

        return [
            'urls' => $urls,
            'counter' => $counter,
            'currentClick' => $lastRecord['click'] ?? 0,
        ];
    }

    /**
     * Load (once) `index.csv` from the crawl's data folder as Url objects keyed by their id.
     *
     * @return Url[]
     *
     * @throws \Exception when index.csv does not exist
     */
    protected function getIndexFromPreviousCrawl()
    {
        if (null !== $this->index) {
            return $this->index;
        }

        $indexFilePath = $this->getDataFolder().'/index.csv';
        if (!file_exists($indexFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (index.csv)');
        }

        $csv = Reader::createFromPath($indexFilePath, 'r');
        $csv->setHeaderOffset(0);

        // Built locally and stored only once complete, so a failure is never cached as an empty index.
        $index = [];
        foreach ($csv->getRecords() as $record) {
            $url = new Url($this->base.$record['uri'], 0);
            $url->id = $record['id'];
            $index[$record['id']] = $url;
        }

        $this->index = $index;

        return $this->index;
    }

    /**
     * Find the url recorded under $id in the previous crawl's index.
     *
     * @param mixed $id   id as found in the `id` column of index.csv
     * @param bool  $base whether to prefix the result with the base (scheme + host)
     *
     * @return string|null null when the id is not in the index
     *
     * @throws \Exception when index.csv does not exist
     */
    public function getUrlFromId($id, $base = true)
    {
        $index = $this->getIndexFromPreviousCrawl();

        return isset($index[$id]) ? ($base ? $this->base : '').$index[$id]->uri : null;
    }

    /**
     * Resolve the data directory: the default `<this dir>/../data` when none is given,
     * always without trailing slash.
     */
    private static function normalizeDataDirectory(?string $dataDirectory): string
    {
        return rtrim($dataDirectory ?? __DIR__.'/../data', '/');
    }
}
259