Completed
Push — issue/323 ( f4202b...dded2a )
by Tomas Norre
05:22
created

CrawlerApi   C

Complexity

Total Complexity 56

Size/Duplication

Total Lines 548
Duplicated Lines 0 %

Coupling/Cohesion

Components 2
Dependencies 5

Test Coverage

Coverage 41.64%

Importance

Changes 0
Metric Value
dl 0
loc 548
ccs 117
cts 281
cp 0.4164
rs 5.5199
c 0
b 0
f 0
wmc 56
lcom 2
cbo 5

24 Methods

Rating   Name   Duplication   Size   Complexity  
A overwriteSetId() 0 4 1
A setAllowedConfigurations() 0 4 1
A getAllowedConfigurations() 0 4 1
A getSetId() 0 4 1
A findCrawler() 0 13 3
A addPageToQueue() 0 6 1
A filterUnallowedConfigurations() 0 13 4
A addPageToQueueTimed() 0 35 3
A countEntriesInQueueForPageByScheduleTime() 0 24 2
B isPageInQueue() 0 34 7
A getLatestCrawlTimestampForPage() 0 23 4
A getCrawlHistoryForPage() 0 13 2
A getUnprocessedItems() 0 12 1
A countUnprocessedItems() 0 9 1
A isPageInQueueTimed() 0 6 1
A getCrawlerProcInstructions() 0 8 2
A removeQueueEntrie() 0 7 1
A getQueueStatistics() 0 7 1
A getQueueRepository() 0 8 2
A getQueueStatisticsByConfiguration() 0 15 2
A getActiveProcessesCount() 0 6 1
A getLastProcessedQueueEntries() 0 4 1
A getCurrentCrawlingSpeed() 0 35 5
B getPerformanceData() 0 49 8

How to fix   Complexity   

Complex Class

Complex classes like CrawlerApi often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use CrawlerApi, and based on these observations, apply Extract Interface, too.

1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <dev@aoe.com>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Utility\GeneralUtility;
32
use TYPO3\CMS\Core\Utility\MathUtility;
33
use TYPO3\CMS\Frontend\Page\PageRepository;
34
35
/**
36
 * Class CrawlerApi
37
 *
38
 * @package AOE\Crawler\Api
39
 */
40
class CrawlerApi
41
{
42
    /**
43
     * @var CrawlerController
44
     */
45
    private $crawlerController;
46
47
    /**
48
     * @var QueueRepository
49
     */
50
    protected $queueRepository;
51
52
    /**
53
     * @var array
54
     */
55
    protected $allowedConfigurations = [];
56
57
    /**
     * Replaces the set id of the underlying crawler.
     *
     * Every crawler run carries a set id; this facade method casts the given
     * value to an integer and stores it on the crawler object.
     *
     * @param int $id new set id
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int)$id;
    }
67
68
    /**
     * Restricts the configuration selection to the given set.
     *
     * The list is stored as-is and consulted later by
     * filterUnallowedConfigurations().
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
78
79
    /**
     * Returns the list previously stored via setAllowedConfigurations().
     *
     * @return array allowed configuration keys (empty array when unrestricted)
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
86
87
    /**
     * Reads the set id currently stored on the crawler object.
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
96
97
    /**
     * Returns the crawler controller used by this API, creating it on first use.
     *
     * A freshly created crawler receives a set id derived from the current
     * microtime. The instance is validated BEFORE it is written to, so the
     * exception below is actually reachable (previously the set id was
     * assigned first, which would fail before the check was evaluated).
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object could be created
     */
    protected function findCrawler()
    {
        if (is_object($this->crawlerController)) {
            return $this->crawlerController;
        }

        $crawler = GeneralUtility::makeInstance(CrawlerController::class);
        if (!is_object($crawler)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        $crawler->setID = GeneralUtility::md5int(microtime());
        $this->crawlerController = $crawler;

        return $this->crawlerController;
    }
117
118
    /**
     * Queues a page for crawling without a schedule time.
     *
     * @param int $uid uid of the page
     */
    public function addPageToQueue($uid)
    {
        // Untimed entries are stored with a schedule timestamp of 0.
        $this->addPageToQueueTimed((int)$uid, 0);
    }
129
130
    /**
     * Removes every configuration whose key is not in the allowed list.
     *
     * When no allowed configurations have been set, the input is returned
     * unchanged. Non-array input is also returned unchanged: the caller
     * checks is_array() on the result, so it must pass through without being
     * iterated.
     *
     * @param array|mixed $configurations configurations indexed by configuration key
     *
     * @return array|mixed the filtered configurations (or the untouched non-array input)
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        foreach (array_keys($configurations) as $confKey) {
            // Loose comparison is kept on purpose: PHP turns numeric string
            // keys into integers, which a strict in_array() would not match.
            if (!in_array($confKey, $this->allowedConfigurations)) {
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
149
150
    /**
     * Adds a page to the crawler queue by uid and sets the timestamp at which
     * the page should be crawled.
     *
     * The page record is loaded through the PageRepository, the crawler
     * resolves the URL configurations for it, and every configuration that
     * passes filterUnallowedConfigurations() is handed to the crawler's
     * urlListFromUrlArray(). Nothing is queued when no configurations are found.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     *
     * @throws \Exception
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $uid = intval($uid);
        $time = intval($time);

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));

        if (!is_array($configurations)) {
            // no configuration found
            return;
        }

        $downloadUrls = [];
        $duplicateTrack = [];
        // The registered processing instructions do not change while queueing,
        // so their keys are resolved once instead of once per configuration.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $configuration) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            // NOTE(review): 300 is presumably the requests-per-minute value used
            // to spread entries — confirm against CrawlerController::urlListFromUrlArray().
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
194
195
    /**
     * Counts the queue records of a page that carry the given schedule timestamp.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int
     *
     * @deprecated since crawler v6.2.0, will be removed in crawler v7.0.0.
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $pageId = (int)$page_uid;
        $scheduled = (int)$schedule_timestamp;

        $where = 'page_id=' . $pageId;
        if ($scheduled == 0) {
            // Un-timed elements may occur multiple times, so only entries that
            // have not been executed yet (exec_time = 0) are counted.
            $where .= ' AND exec_time = 0';
        }
        // Timed elements have a fixed schedule time: a record with that time is
        // either still queued or has already been processed, so exec_time is
        // not restricted for them.
        $where .= ' AND scheduled=' . $scheduled;

        $database = $GLOBALS['TYPO3_DB'];
        $result = $database->exec_SELECTquery('count(*) as cnt', 'tx_crawler_queue', $where);
        $row = $database->sql_fetch_assoc($result);

        return (int)$row['cnt'];
    }
229
230
    /**
     * Determines if a page is queued.
     *
     * Each optional filter is applied unless its argument is exactly false.
     *
     * @param int|string $uid uid of the page, must be interpretable as integer
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     * @param bool $timed_only only consider entries with scheduled != 0
     * @param int|bool $timestamp schedule timestamp to match, false to ignore
     *
     * @return bool
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $conditions = ['page_id = ' . (int)$uid];

        if ($unprocessed_only !== false) {
            $conditions[] = 'exec_time = 0';
        }
        if ($timed_only !== false) {
            $conditions[] = 'scheduled != 0';
        }
        if ($timestamp !== false) {
            $conditions[] = 'scheduled = ' . (int)$timestamp;
        }

        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
            '*',
            'tx_crawler_queue',
            implode(' AND ', $conditions)
        );

        // exec_SELECTcountRows() is checked against false before comparing the count
        return $count !== false && $count > 0;
    }
274
275
    /**
     * Returns the latest crawl timestamp (highest "scheduled" value) for a page.
     *
     * The result is always an integer: max() over an empty result set yields
     * a row containing NULL, which is mapped to 0 here instead of being
     * returned as null.
     *
     * @param int $uid uid id of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int latest schedule timestamp, 0 when no matching entry exists
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $where = ' page_id = ' . intval($uid);

        if ($future_crawldates_only) {
            $where .= ' AND scheduled > ' . time();
        }

        if ($unprocessed_only) {
            $where .= ' AND exec_time = 0';
        }

        $database = $GLOBALS['TYPO3_DB'];
        $result = $database->exec_SELECTquery('max(scheduled) as latest', 'tx_crawler_queue', $where);
        $row = $database->sql_fetch_assoc($result);

        if (!is_array($row) || $row['latest'] === null) {
            return 0;
        }

        return intval($row['latest']);
    }
307
308
    /**
     * Returns the crawl history of a page: when each crawl was scheduled and
     * when it was executed. Entries that are scheduled but have not been
     * crawled yet are included as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows, 0 for no limit
     *
     * @return array rows with the keys scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $pageId = (int)$uid;
        $maxRows = (int)$limit;

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'scheduled, exec_time, set_id',
            'tx_crawler_queue',
            ' page_id = ' . $pageId,
            null,
            null,
            // a limit of 0 means "no limit" and is passed on as null
            $maxRows ?: null
        );
    }
331
332
    /**
     * Fetches all queue records that have not been executed yet
     * (exec_time = 0), ordered by page id and schedule time.
     *
     * @return array
     */
    public function getUnprocessedItems()
    {
        $table = 'tx_crawler_queue';
        $where = 'exec_time = 0';
        $orderBy = 'page_id, scheduled';

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $table, $where, '', $orderBy);
    }
349
350
    /**
     * Counts the queue records that have not been executed yet (exec_time = 0).
     *
     * The database value is cast so that callers always receive an integer,
     * even when no row could be fetched.
     *
     * @return int number of unprocessed items in the queue
     */
    public function countUnprocessedItems()
    {
        $database = $GLOBALS['TYPO3_DB'];
        $result = $database->exec_SELECTquery('count(page_id) as anz', 'tx_crawler_queue', 'exec_time = 0');
        $row = $database->sql_fetch_assoc($result);

        return is_array($row) ? intval($row['anz']) : 0;
    }
364
365
    /**
     * Checks whether a page is in the queue with a schedule time, i.e. whether
     * it is timed for a date when it should be crawled.
     *
     * Delegates to isPageInQueue() with $timed_only enabled so that only
     * entries with scheduled != 0 are considered. Previously the flag was not
     * passed, which made this method identical to isPageInQueue().
     *
     * @param int $uid uid of the page
     * @param boolean $show_unprocessed only respect unprocessed pages
     *
     * @return boolean
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        $uid = intval($uid);

        return $this->isPageInQueue($uid, $show_unprocessed, true);
    }
380
381
    /**
     * Returns the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array registered processing instructions, empty array if none are set
     */
    private function getCrawlerProcInstructions()
    {
        return isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']
            : [];
    }
394
395
    /**
     * Deletes the queue entry with the given queue id from tx_crawler_queue.
     *
     * @param int $qid queue id, cast to integer before it is used in the query
     */
    public function removeQueueEntrie($qid)
    {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', ' qid=' . (int)$qid);
    }
407
408
    /**
     * Collects the pending-item counters of the queue.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $assignedPending = $this->getQueueRepository()->countAllAssignedPendingItems();
        $allPending = $this->getQueueRepository()->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $allPending
        ];
    }
422
423
    /**
     * Returns the queue repository, creating it on first access.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new QueueRepository();

        return $this->queueRepository;
    }
436
437
    /**
     * Get queue statistics by configuration.
     *
     * Takes the pending-item statistics grouped by configuration key and adds
     * a 'total' entry to each row, taken from the total queue entries of all
     * sets that still have unprocessed entries. A configuration without a
     * matching total gets 0 instead of triggering an undefined-index notice.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->getQueueRepository()->countPendingItemsGroupedByConfigurationKey();

        $setIds = $this->getQueueRepository()->getSetIdWithUnprocessedEntries();

        $totals = $this->getQueueRepository()->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; rows are addressed by key rather than by reference so
        // that no dangling reference to the last row survives the loop
        foreach (array_keys($statistics) as $index) {
            $configuration = $statistics[$index]['configuration'];
            $statistics[$index]['total'] = isset($totals[$configuration]) ? $totals[$configuration] : 0;
        }

        return $statistics;
    }
457
458
    /**
     * Returns the number of active processes as reported by a newly created
     * ProcessRepository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
471
472
    /**
     * Returns the most recently processed queue entries with all their fields.
     *
     * @param int $limit maximum number of entries, passed on to the repository
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $queueRepository = $this->getQueueRepository();

        return $queueRepository->getLastProcessedEntries('*', $limit);
    }
483
484
    /**
     * Get current crawling speed.
     *
     * Walks the timestamps of the last processed entries, starting with the
     * first one, and counts entries until the gap between two consecutive
     * timestamps (or between now and the first one) exceeds 60. The speed is
     * the number of counted pages divided by the covered time span in minutes.
     *
     * NOTE(review): assumes the repository returns a zero-indexed list of
     * timestamps in seconds, newest first — confirm against
     * QueueRepository::getLastProcessedEntriesTimestamps().
     *
     * @return float|int|false page speed in pages per minute, false if there is
     *                         not enough information to compute it
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp (or are not in descending
            // order): no time span to divide by, so no speed can be derived.
            return false;
        }

        return $pages / ($time / 60);
    }
526
527
    /**
     * Get some performance data.
     *
     * Splits the interval [$start, $end] into slots of $resolution length,
     * fetches the per-process data of every slot from the queue repository and
     * derives URL counts and speeds for each process, each slot and the whole
     * interval. A speed is only calculated when more than 5 URLs were counted;
     * otherwise the slot/overall speed is 0 and a process gets no 'speed' key.
     *
     * NOTE(review): speeds are 60 / (duration per URL), i.e. URLs per minute if
     * the timestamps are in seconds — confirm the unit with the repository.
     *
     * @param integer $start
     * @param integer $end
     * @param integer $resolution slot length, must be greater than zero
     *
     * @return array data with the keys urlcount, start, end, duration, slots and speed
     *
     * @throws \Exception if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is not greater than zero
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [
            'urlcount' => 0,
            'start' => $start,
            'end' => $end,
            'duration' => $end - $start,
            'slots' => [],
        ];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // a non-positive step would never advance the slot loop below
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1512659946);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // rows are updated by key instead of by reference to avoid leaving a
            // dangling reference to the last process row
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // The slot duration has to be parenthesised: without the parentheses
            // only $slotStart was divided by the URL count (operator precedence).
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
587
}
588