Completed
Push — typo3v9 ( e7a195...f05983 )
by Tomas Norre
26:19 queued 11:20
created

CrawlerApi::getQueueStatisticsByConfiguration()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 15

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 0
dl 0
loc 15
ccs 0
cts 6
cp 0
crap 6
rs 9.7666
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Database\ConnectionPool;
32
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
33
use TYPO3\CMS\Core\Utility\GeneralUtility;
34
use TYPO3\CMS\Core\Utility\MathUtility;
35
use TYPO3\CMS\Extbase\Object\ObjectManager;
36
use TYPO3\CMS\Frontend\Page\PageRepository;
37
38
/**
39
 * Class CrawlerApi
40
 *
41
 * @package AOE\Crawler\Api
42
 */
43
class CrawlerApi
44
{
45
    /**
46
     * @var CrawlerController
47
     */
48
    private $crawlerController;
49
50
    /**
51
     * @var QueueRepository
52
     */
53
    protected $queueRepository;
54
55
    /**
56
     * @var array
57
     */
58
    protected $allowedConfigurations = [];
59
60
    /**
61
     * @var QueryBuilder
62
     */
63
    protected $queryBuilder;
64
65
    /**
66
     * @var string
67
     */
68
    protected $tableName = 'tx_crawler_queue';
69
70 9
    /**
     * Resolves the crawler controller through the Extbase object manager
     * and keeps it for later use by findCrawler().
     */
    public function __construct()
    {
        $this->crawlerController = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(CrawlerController::class);
    }
75 9
76 9
    /**
     * Overrides the set id of the current crawler run.
     *
     * Facade method: the given value is cast to an integer and written to the
     * `setID` property of the crawler object returned by findCrawler().
     *
     * @param int $id New set id
     *
     * @throws \Exception when findCrawler() cannot provide a crawler object
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int) $id;
    }
86 1
87 1
    /**
     * Restricts the configuration selection to the given set.
     *
     * The list is consulted by filterUnallowedConfigurations(); an empty
     * list means that no filtering takes place.
     *
     * @param array $allowedConfigurations Configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
97 1
98 1
    /**
     * Returns the configuration keys set via setAllowedConfigurations().
     *
     * @return array Empty array when no restriction has been set
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
105 1
106
    /**
     * Returns the set id of the crawler object provided by findCrawler().
     *
     * @return int
     *
     * @throws \Exception when findCrawler() cannot provide a crawler object
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
115 1
116
    /**
     * Returns the crawler controller held by this API object.
     *
     * When no controller object is present yet, one is created through
     * GeneralUtility::makeInstance() and gets a set id derived from the
     * current microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception when no crawler object is available afterwards
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        // Guard clause: fail loudly instead of handing out a non-object.
        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
136
137
    /**
     * Adds a page to the crawler queue by its uid, without a schedule time.
     *
     * Delegates to addPageToQueueTimed() with a timestamp of 0.
     *
     * @param int $uid Uid of the page
     */
    public function addPageToQueue($uid)
    {
        // Entries that are not timed are stored with the timestamp 0.
        $this->addPageToQueueTimed((int) $uid, 0);
    }
148
149
    /**
     * Removes every configuration whose key is not listed in
     * $this->allowedConfigurations.
     *
     * Nothing is filtered when no allowed configurations have been set.
     * A value that is not an array is handed back untouched, because the
     * caller (addPageToQueueTimed) performs its own is_array() check on the
     * result.
     *
     * @param array|mixed $configurations Configurations indexed by configuration key
     *
     * @return array|mixed The filtered configurations, or the input when it is not an array
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        foreach (array_keys($configurations) as $confKey) {
            // Loose comparison is kept on purpose so existing matches between
            // numeric and string keys keep working.
            if (!in_array($confKey, $this->allowedConfigurations)) {
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
168
169
    /**
     * Adds a page to the crawler queue by uid and sets the timestamp at which
     * the page should be crawled.
     *
     * The page row is loaded through PageRepository::getPage(), the crawler
     * supplies the configurations for that row, and every configuration that
     * passes filterUnallowedConfigurations() is handed to
     * CrawlerController::urlListFromUrlArray(). Nothing happens when no
     * configuration array is available.
     *
     * @param int $uid Uid of the page
     * @param int $time Timestamp at which the page should be crawled, 0 for "not timed"
     *
     * @throws \Exception when findCrawler() cannot provide a crawler object
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $uid = (int) $uid;
        $time = (int) $time;

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));

        if (!is_array($configurations)) {
            // No configuration found, nothing to queue.
            return;
        }

        $downloadUrls = [];
        $duplicateTrack = [];
        // The registered processing instructions do not change while looping,
        // so their keys are resolved once.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $cv) {
            // Enable inserting of entries.
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // Reset the queue because the entries have been written to the db.
            unset($crawler->queueEntries);
        }
    }
211
212
    /**
     * Counts all queue entries that are scheduled for a given page id and
     * schedule timestamp.
     *
     * For a schedule timestamp of 0 only entries that have not been executed
     * (exec_time = 0) are counted, because un-timed elements can occur
     * multiple times. Timed elements have a fixed schedule time; an entry
     * with that time is counted regardless of whether it was already
     * processed.
     *
     * The result is read from the COUNT(*) column. rowCount() is not used
     * because it reports the number of rows in the result set, not the
     * aggregated count.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $page_uid = (int) $page_uid;
        $schedule_timestamp = (int) $schedule_timestamp;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($page_uid, \PDO::PARAM_INT)),
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($schedule_timestamp, \PDO::PARAM_INT))
            );

        if ($schedule_timestamp === 0) {
            // Un-timed elements: only consider entries that were not executed yet.
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
        }

        return (int) $queryBuilder->execute()->fetchColumn(0);
    }
248
249
    /**
     * Determines if a page is queued.
     *
     * @param int|string $uid Uid of the page; must be interpretable as an integer
     * @param bool $unprocessed_only Unless false, only entries with exec_time = 0 are considered
     * @param bool $timed_only Unless false, only entries with scheduled != 0 are considered
     * @param int|bool $timestamp Unless false, only entries scheduled for exactly this timestamp are considered
     *
     * @return bool
     *
     * @throws \InvalidArgumentException when $uid cannot be interpreted as an integer
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if (false !== $unprocessed_only) {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
        }

        if (false !== $timed_only) {
            $queryBuilder->andWhere($queryBuilder->expr()->neq('scheduled', 0));
        }

        if (false !== $timestamp) {
            // Match the requested schedule time. Using "not equal" here would
            // report the page as queued for every other timestamp instead.
            $queryBuilder->andWhere(
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($timestamp, \PDO::PARAM_INT))
            );
        }

        $count = $queryBuilder->execute()->fetchColumn(0);

        return false !== $count && $count > 0;
    }
297
298
    /**
     * Returns the latest crawl timestamp (maximum of `scheduled`) for a page.
     *
     * @param int $uid Uid of the page
     * @param bool $future_crawldates_only Only consider entries scheduled after the current time
     * @param bool $unprocessed_only Only consider entries with exec_time = 0
     *
     * @return int The latest schedule timestamp, or 0 when there is none
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = (int) $uid;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query = $queryBuilder
            ->from($this->tableName)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        // Use the default fetch mode, as the other queries in this class do.
        $row = $query->execute()->fetch();

        // max() yields NULL when nothing matches; the cast delivers the
        // documented integer instead of the raw column string.
        return (int) ($row['latest'] ?? 0);
    }
340
341
    /**
     * Returns the crawl history of a page: for every queue entry of the page
     * the time it was scheduled for, the time it was executed and its set id.
     * Entries that are scheduled but have not been crawled yet are included,
     * since no condition on exec_time is applied.
     *
     * @param int $uid Uid of the page
     * @param int $limit Maximum number of rows; 0 means no limit
     *
     * @return array Rows with the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $pageUid = intval($uid);
        $maxRows = intval($limit);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $pageCondition = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
        );

        $statement = $queryBuilder
            ->from($this->tableName)
            ->select('scheduled', 'exec_time', 'set_id')
            ->where($pageCondition);

        if ($maxRows !== 0) {
            $statement->setMaxResults($maxRows);
        }

        return $statement->execute()->fetchAll();
    }
369
370
    /**
     * Returns all queue entries that have not been executed yet
     * (exec_time = 0), ordered by page id and then by schedule time.
     *
     * @return array Complete rows of the queue table
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function getUnprocessedItems()
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $notExecutedYet = $queryBuilder->expr()->eq('exec_time', 0);

        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where($notExecutedYet)
            ->orderBy('page_id')
            ->addOrderBy('scheduled');

        $result = $queryBuilder->execute();

        return $result->fetchAll();
    }
391
392
    /**
     * Returns the number of queue entries that have not been executed yet
     * (exec_time = 0).
     *
     * @return int|string|false Value of the COUNT(page_id) column exactly as
     *                          delivered by fetchColumn()
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function countUnprocessedItems()
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $notExecutedYet = $queryBuilder->expr()->eq('exec_time', 0);

        $queryBuilder
            ->count('page_id')
            ->from($this->tableName)
            ->where($notExecutedYet);

        $result = $queryBuilder->execute();

        return $result->fetchColumn(0);
    }
411
412
    /**
     * Checks whether a page is in the queue; thin wrapper around
     * isPageInQueue().
     *
     * NOTE(review): only $uid and $show_unprocessed are forwarded, so
     * $timed_only keeps its default (false) and entries without a schedule
     * time are matched as well, despite the method name. Confirm whether
     * that is intended before relying on it.
     *
     * @param int $uid Uid of the page
     * @param bool $show_unprocessed Only respect unprocessed pages
     *
     * @return bool
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        $pageUid = (int) $uid;

        return $this->isPageInQueue($pageUid, $show_unprocessed);
    }
429
430
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array Map of each registration's 'key' to its 'value'; empty
     *               when nothing is registered
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructionsByKey = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registration) {
            $instructionsByKey[$registration['key']] = $registration['value'];
        }

        return $instructionsByKey;
    }
447
448
    /**
     * Removes the queue entry with the given queue id.
     *
     * The table name is passed to delete() directly, because TYPO3's
     * QueryBuilder::delete() requires it as its first argument.
     *
     * @param int $qid Queue id (column `qid`) of the entry to remove
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function removeQueueEntrie($qid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->delete($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($qid, \PDO::PARAM_INT))
            )
            ->execute();
    }
466
467 1
    /**
     * Returns overall queue statistics as reported by the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>)
     */
    public function getQueueStatistics()
    {
        $assignedButUnprocessed = $this->getQueueRepository()->countAllAssignedPendingItems();
        $unprocessed = $this->getQueueRepository()->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedButUnprocessed,
            'unprocessed' => $unprocessed
        ];
    }
481
482
    /**
     * Returns the queue repository, creating it on first use.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new QueueRepository();

        return $this->queueRepository;
    }
495
496
    /**
     * Returns queue statistics grouped by configuration.
     *
     * The pending-item statistics of the queue repository are extended by a
     * 'total' entry taken from the total queue entries of all sets that still
     * have unprocessed entries. A configuration without a total gets 0.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $queueRepository = $this->getQueueRepository();

        $statistics = $queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "Merge" the totals into the statistics. Writing through the key
        // avoids a by-reference loop variable that would outlive the loop.
        foreach ($statistics as $index => $statistic) {
            $statistics[$index]['total'] = $totals[$statistic['configuration']] ?? 0;
        }

        return $statistics;
    }
516
517
    /**
     * Returns the number of active processes as counted by a freshly created
     * ProcessRepository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
530
531
    /**
     * Returns the queue entries that were processed last.
     *
     * NOTE(review): static analysis reports that
     * QueueRepository::getLastProcessedEntries() accepts fewer arguments than
     * the former call ('*', $limit) passed, so '*' ended up in the first
     * parameter and $limit was dropped. The limit is now passed as the only
     * argument. Confirm against the repository signature.
     *
     * @param int $limit Maximum number of entries
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        return $this->getQueueRepository()->getLastProcessedEntries($limit);
    }
542
543
    /**
     * Returns the current crawling speed in pages per minute.
     *
     * Walks the timestamps of the last processed entries, starting with the
     * first one, and counts entries as long as the gap to the previous
     * timestamp (initially the current time) is at most 60 seconds. The speed
     * is the number of counted pages divided by the minutes between the first
     * and the last counted timestamp.
     *
     * NOTE(review): this presumably expects the repository to deliver the
     * timestamps newest first. Verify against QueueRepository.
     *
     * @return float|int|false Pages per minute, or false when there is not
     *                         enough information (fewer than 10 usable
     *                         entries, or no measurable time span)
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp, so no speed can be
            // derived without dividing by zero.
            return false;
        }

        return $pages / ($time / 60);
    }
585
586
    /**
     * Collects performance data for the period between $start and $end,
     * split into slots of $resolution seconds.
     *
     * For every slot the per-process data of the queue repository is read.
     * A process gets a 'speed' (urls per minute) when it handled more than 5
     * urls over a positive duration. Each slot and the overall result get a
     * 'speed' as well, which is 0 when not more than 5 urls were counted.
     *
     * @param int $start Start timestamp
     * @param int $end End timestamp, must be after $start
     * @param int $resolution Slot length in seconds, must be greater than zero
     *
     * @return array Keys: urlcount, start, end, duration, slots (indexed by
     *               slot end timestamp), speed
     *
     * @throws \Exception when $end is not after $start
     * @throws \InvalidArgumentException when $resolution is not greater than zero
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // A non-positive step would never let the slot loop terminate.
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1512659946);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            foreach ($slotData as &$processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $processData['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }
            // Drop the reference so later writes cannot hit the last element.
            unset($processData);

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // The slot duration is computed first; without the grouping, the
            // division would bind tighter than the subtraction.
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
646
}
647