1 | <?php |
||
2 | |||
3 | namespace Famdirksen\LaravelJobHandler\Http\Controllers; |
||
4 | |||
5 | use Carbon\Carbon; |
||
6 | use Famdirksen\LaravelJobHandler\Exceptions\CrawlerException; |
||
7 | use Famdirksen\LaravelJobHandler\Exceptions\CrawlerNotReachedTimeBetweenJobsException; |
||
8 | use Famdirksen\LaravelJobHandler\Exceptions\CrawlerSaveException; |
||
9 | use Famdirksen\LaravelJobHandler\Models\Crawlers; |
||
10 | use Famdirksen\LaravelJobHandler\Models\CrawlerStatus; |
||
11 | use Famdirksen\LaravelJobHandler\Models\CrawlerStatusLogs; |
||
12 | use Illuminate\Support\Facades\Log; |
||
13 | |||
14 | class CrawlController |
||
15 | { |
||
/** @var mixed Crawler model loaded by getCrawler(); null until the first load. */
protected $crawler = null;

/** @var mixed Identifier of the crawler this controller works on; null until set. */
protected $crawler_id = null;

/** @var bool When true, a previously failed run does not block a new run. */
protected $override_fail_status = false;

/** @var bool Whether log() currently records messages. */
protected $logging = false;

/** @var array Messages collected by log() while logging is enabled. */
protected $logs = [];
||
21 | |||
22 | |||
/**
 * Create the controller with message logging switched on.
 */
public function __construct()
{
    // Enable logging first so every later call on this instance is recorded.
    $this->startLogging();
}
||
/**
 * Switch logging off when the controller is destroyed.
 *
 * No status id is handed over, so the collected messages are not
 * persisted from here.
 */
public function __destruct()
{
    $this->stopLogging(null);
}
||
31 | |||
/**
 * Choose whether a previously failed run may be ignored by setupCrawler().
 *
 * @param bool $state True to allow a new run even though the last one failed.
 */
public function overrideFailStatus(bool $state)
{
    // A bare bool concatenates as '1' or '' — spell it out so the log line
    // is readable for both values.
    $this->log('Setup overrideFailStatus to: ' . ($state ? 'true' : 'false'));

    $this->override_fail_status = $state;
}
||
38 | |||
/**
 * Load or refresh the crawler model for the current crawler id.
 *
 * A new model is fetched when none is cached or when the cached one belongs
 * to a different id; otherwise the cached model is re-read from the database.
 * Either path ends with a usable model in $this->crawler or with the
 * exception raised by Crawlers::findOrFail().
 */
protected function getCrawler()
{
    $hasMatchingCrawler = !empty($this->crawler) && $this->crawler->id == $this->crawler_id;

    if (!$hasMatchingCrawler) {
        $this->log('Loading new crawler data');
        $this->crawler = Crawlers::findOrFail($this->crawler_id);
        $this->log('Loaded new crawler data');

        return;
    }

    $this->log('Refreshing crawler data');

    $refreshed = $this->crawler->fresh();

    if (is_null($refreshed)) {
        // fresh() yields null when the row no longer exists. Fail the same
        // way a first load would instead of leaving a null crawler behind
        // for callers that dereference it.
        $refreshed = Crawlers::findOrFail($this->crawler_id);
    }

    $this->crawler = $refreshed;
    $this->log('Refreshed crawler data');
}
||
54 | |||
/**
 * Store the id of the crawler this controller should work with.
 *
 * A message is logged before and after the assignment.
 *
 * @param mixed $crawler_id Identifier of the crawler.
 */
public function setCrawlerId($crawler_id)
{
    // Logged before the assignment, so this entry still carries the old id.
    $this->log('Setting crawler_id');

    $this->crawler_id = $crawler_id;

    $this->log('Set crawler_id');
}
||
66 | |||
/**
 * Give back the currently configured crawler id.
 *
 * @return mixed The stored id, or null when none has been set.
 */
public function getCrawlerId()
{
    $this->log('Getting crawler_id');

    $currentId = $this->crawler_id;

    return $currentId;
}
||
78 | |||
/**
 * Tell whether a crawler id has been configured.
 *
 * @return bool True when crawler_id is not null.
 */
protected function controllerIsSetup()
{
    $this->log('Check if controllerIsSetup');

    return $this->crawler_id !== null;
}
||
94 | |||
/**
 * Wait until the crawler is allowed to run, then register it as running.
 *
 * The crawler is re-read from the database on every pass. The loop ends
 * (and the crawler is started) when the crawler never ran, when its last
 * run finished successfully, when its last run failed but the fail status
 * is overridden, or when it is flagged to allow multiple crawlers. At most
 * `laravel-job-handler.run_times` + 1 passes are made.
 *
 * @param mixed $crawler_id Optional crawler id; when given it replaces the stored one.
 *
 * @throws CrawlerException When no crawler id is set, the crawler is disabled,
 *                          the last run failed without override, or the
 *                          last pass is reached without override (raised
 *                          through failCrawler()).
 * @throws CrawlerNotReachedTimeBetweenJobsException When the configured time
 *                          between runs has not passed yet.
 */
public function setupCrawler($crawler_id = null)
{
    $this->log('Setup crawler');

    if (!is_null($crawler_id)) {
        // This branch runs when an id WAS passed in; the message used to
        // claim the opposite.
        $this->log('Setup crawler, crawler_id is provided');
        $this->setCrawlerId($crawler_id);
    }

    if (!$this->controllerIsSetup()) {
        throw new CrawlerException('CrawlController is not setup correctly.');
    }

    $times = config('laravel-job-handler.run_times', 10);

    for ($x = 0; $x <= $times; $x++) {
        // Fetch the latest data on every pass.
        $this->getCrawler();

        $this->log('Checking if crawler is enabled');
        if (!$this->crawler->enabled) {
            $this->log('Crawler is not enabled');
            throw new CrawlerException('Crawler (#' . $this->crawler_id . ') - crawler isnt enabled in database');
        }

        $this->log('Checking if crawler can be runned');
        $checkIfCrawlerCanBeRunned = $this->canCrawlerRunAfterPeriod();

        if (!$checkIfCrawlerCanBeRunned['status']) {
            $this->log('Crawler needs to wait ('.$checkIfCrawlerCanBeRunned['retry_in'].' seconds) before running again');
            throw new CrawlerNotReachedTimeBetweenJobsException('Has to wait ' . $checkIfCrawlerCanBeRunned['retry_in'] . ' more seconds to run');
        }

        $this->log('Checked if crawler can runned');

        if (is_null($this->crawler->latest_status)) {
            $this->log('Crawler can be runned, it the first time');

            // First time it runs.
            break;
        }

        if ($this->crawler->latest_status == 2) {
            $this->log('Crawler can be runned, last crawler runned successfully');

            // Previous run is done.
            break;
        }

        if ($this->crawler->latest_status == 3) {
            if ($this->override_fail_status) {
                $this->log('Last crawler failed, but it is forced to run');

                // Override the failed state; this forces a rerun.
                break;
            }

            $this->log('Last crawler failed, force run is not enabled');
            throw new CrawlerException('Crawler (#' . $this->crawler_id . ') - last run had an error and override_fail_status is not enabled');
        }

        // Last pass reached without a runnable state: record the failure.
        // failCrawler() always throws, so the loop ends here.
        if ($x == $times && !$this->override_fail_status) {
            $this->log('Crawler exceeded the max execution time');
            $this->failCrawler('Crawler (#' . $this->crawler_id . ') - max execution time');
        }

        // NOTE(review): this reads `status` while the checks above read
        // `latest_status` — confirm the model really exposes both.
        if ($this->crawler->status == 1) {
            if ($this->crawler->multiple_crawlers) {
                $this->log('Crawler can run multiple crawlers at the same time');
                break;
            }

            $wait = config('laravel-job-handler.retry_in_seconds', 3);

            $this->log('Waiting for rechecking ('.$wait.' seconds) if crawler can be runned');

            sleep($wait);
        }
    }

    $this->log('All setup, starting crawler');
    $this->startCrawler();
}
||
/**
 * Register the crawler as running (status 1) in the database.
 *
 * @param string $output Optional output stored alongside the status.
 * @return bool Result of addStatus().
 */
public function startCrawler($output = '')
{
    $this->log('Starting crawler');

    // 1 = start running
    $registered = $this->addStatus(1, $output);

    return $registered;
}
||
/**
 * Register the crawler as done (status 2) so other scripts can run.
 *
 * @param string $output Optional output stored alongside the status.
 * @return bool Result of addStatus().
 */
public function doneCrawler($output = '')
{
    $this->log('Crawler done');

    // 2 = done running
    $registered = $this->addStatus(2, $output);

    return $registered;
}
||
205 | |||
/**
 * Finish the crawler; delegates to doneCrawler().
 *
 * @param string $output Optional output stored alongside the status.
 * @return bool Result of doneCrawler().
 */
public function finish($output = '')
{
    $this->log('Finishing crawler');

    $finished = $this->doneCrawler($output);

    return $finished;
}
||
/**
 * Register the crawler as failed (status 3) and raise a CrawlerException.
 *
 * This method never returns normally: after the status is stored it always
 * throws.
 *
 * @param string $output Failure description; stored and used in the exception message.
 *
 * @throws CrawlerException Always, with " - status 3" appended to $output.
 */
public function failCrawler($output = '')
{
    $this->log('Crawler failed');

    // 3 = failed
    $this->addStatus(3, $output);

    $message = $output.' - status 3';

    throw new CrawlerException($message);
}
||
/**
 * Store a new crawler status and mirror it on the cached crawler model.
 *
 * Steps, in order: insert a CrawlerStatus row; when a crawler model is
 * loaded, write the status to its latest_status attribute and save it; when
 * $output is non-empty, insert it as a CrawlerStatusLogs row; when the
 * status is 2, stop logging and persist the collected log messages; finally
 * reload the crawler model.
 *
 * @param mixed  $status Status code to register (callers in this class use 1, 2 and 3).
 * @param string $output Optional output to store for this status.
 * @return bool Always true when the status row was saved.
 *
 * @throws CrawlerSaveException When the status row cannot be saved.
 */
protected function addStatus($status, $output = '')
{
    $this->log('Registering status ('.$status.')');

    $crawlerstatus = new CrawlerStatus();

    $crawlerstatus->crawler_id = $this->crawler_id;
    $crawlerstatus->status = $status;

    if (!$crawlerstatus->save()) {
        throw new CrawlerSaveException('Cannot save crawlerstatus to database...');
    }

    $this->log('Registered status ('.$status.')');

    if ($this->crawler) {
        $this->log('Setting crawler latest status (' . $status . ') attribute');

        $this->crawler->latest_status = $status;

        $this->crawler->save();
        $this->log('Set crawler latest status (' . $status . ') attribute');
    }

    if (!empty($output)) {
        // One timestamp for both columns; the row list is built explicitly
        // instead of appending to a variable that was never initialised.
        $now = Carbon::now();

        CrawlerStatusLogs::insert([
            [
                'status_id' => $crawlerstatus->id,
                'output' => $output,
                'created_at' => $now,
                'updated_at' => $now,
            ],
        ]);
    }

    if ($status == 2) {
        // A finished run flushes the collected log messages to the database.
        $this->stopLogging($crawlerstatus->id);
    }

    $this->getCrawler();

    return true;
}
||
/**
 * Persist every collected log message as a CrawlerStatusLogs row.
 *
 * Nothing is inserted when no messages were collected.
 *
 * @param mixed $crawlerstatus_id Id of the status row the messages belong to.
 */
protected function saveLog($crawlerstatus_id)
{
    $rows = [];

    foreach ($this->logs as $message) {
        $rows[] = [
            'status_id' => $crawlerstatus_id,
            'output' => $message,
            'created_at' => Carbon::now(),
            'updated_at' => Carbon::now(),
        ];
    }

    if (count($rows) === 0) {
        $this->log('Log output is not set, skipping inserting');

        return;
    }

    $this->log('Registering crawler logs');

    CrawlerStatusLogs::insert($rows);

    $this->log('Registered crawler logs (count: ' . count($rows) . ')');
}
||
303 | |||
/**
 * Decide whether enough time has passed since the last run.
 *
 * The crawler is reloaded first. It may run when no time_between is
 * configured, when it never ran before, or when its last run is at least
 * time_between seconds ago. Otherwise the remaining wait is reported.
 *
 * @return array Result of canCrawlerRunAfterPeriodStatus(): 'status' (bool)
 *               and 'retry_in' (seconds still to wait, 0 when it may run).
 */
public function canCrawlerRunAfterPeriod()
{
    $this->getCrawler();

    $seconds = $this->crawler->time_between;

    if (is_null($seconds)) {
        $this->log('Not a time_between specified');

        return $this->canCrawlerRunAfterPeriodStatus(true);
    }

    if (is_null($this->crawler->last_runned_at)) {
        // Crawler never ran, so it can run now.
        return $this->canCrawlerRunAfterPeriodStatus(true);
    }

    // Normalise to Carbon so the comparison is date-based whether the
    // attribute arrives as a string or as a date instance.
    $lastRun = Carbon::parse($this->crawler->last_runned_at);

    // Evaluate "now" once so the comparison and the remaining-time
    // calculation use the same threshold.
    $threshold = Carbon::now()->subSeconds($seconds);

    if ($lastRun->lte($threshold)) {
        return $this->canCrawlerRunAfterPeriodStatus(true);
    }

    // $lastRun lies after $threshold here. Measuring from the threshold and
    // taking abs()/ceil() keeps the result a non-negative whole number on
    // Carbon versions where diffInSeconds() is signed or fractional.
    $retry_in = (int) ceil(abs($threshold->diffInSeconds($lastRun)));

    return $this->canCrawlerRunAfterPeriodStatus(false, $retry_in);
}
||
332 | |||
/**
 * Build the result array used by canCrawlerRunAfterPeriod().
 *
 * @param mixed $status   Whether the crawler may run.
 * @param int   $retry_in Seconds to wait before retrying; defaults to 0.
 * @return array Array with the keys 'status' and 'retry_in', in that order.
 */
public function canCrawlerRunAfterPeriodStatus($status, $retry_in = 0)
{
    // compact() maps each variable name to its value, preserving this order.
    return compact('status', 'retry_in');
}
||
347 | |||
348 | |||
349 | |||
/**
 * Enable message collection and record that logging has started.
 */
protected function startLogging()
{
    // The flag must be set before log() is called, otherwise the first
    // message would be dropped.
    $this->logging = true;

    $this->log('Started logging');
}
||
/**
 * Disable message collection and optionally persist what was collected.
 *
 * @param mixed $crawlerstatus_id Status id to store the messages under;
 *                                when null nothing is persisted.
 */
protected function stopLogging($crawlerstatus_id = null)
{
    // Record the final message while logging is still enabled.
    $this->log('Stop logging');
    $this->logging = false;

    if (is_null($crawlerstatus_id)) {
        return;
    }

    $this->saveLog($crawlerstatus_id);
}
||
/**
 * Record a message when logging is enabled.
 *
 * The message is suffixed with the current crawler id, appended to the
 * in-memory list and forwarded to the Log facade at debug level.
 *
 * @param string $item Message text.
 */
protected function log($item = '')
{
    if (!$this->logging) {
        return;
    }

    $entry = sprintf('%s (crawler_id: %s)', $item, $this->crawler_id);

    $this->logs[] = $entry;
    Log::debug($entry);
}
||
373 | } |
||
374 |