Completed
Push — typo3v9 ( 5153b7...aa6d82 )
by Tomas Norre
20:31
created

CrawlerApi::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1.008

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 0
dl 0
loc 5
ccs 4
cts 5
cp 0.8
crap 1.008
rs 10
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Database\ConnectionPool;
32
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
33
use TYPO3\CMS\Core\Utility\GeneralUtility;
34
use TYPO3\CMS\Core\Utility\MathUtility;
35
use TYPO3\CMS\Extbase\Object\ObjectManager;
36
use TYPO3\CMS\Frontend\Page\PageRepository;
37
38
/**
39
 * Class CrawlerApi
40
 *
41
 * @package AOE\Crawler\Api
42
 */
43
class CrawlerApi
44
{
45
    /**
46
     * @var CrawlerController
47
     */
48
    private $crawlerController;
49
50
    /**
51
     * @var QueueRepository
52
     */
53
    protected $queueRepository;
54
55
    /**
56
     * @var array
57
     */
58
    protected $allowedConfigurations = [];
59
60
    /**
61
     * @var QueryBuilder
62
     */
63
    protected $queryBuilder;
64
65
    /**
66
     * @var string
67
     */
68
    protected $tableName = 'tx_crawler_queue';
69
70 11
    /**
     * Fetches the CrawlerController from the Extbase ObjectManager and keeps
     * it on this facade for later calls.
     */
    public function __construct()
    {
        /** @var ObjectManager $manager */
        $manager = GeneralUtility::makeInstance(ObjectManager::class);

        $this->crawlerController = $manager->get(CrawlerController::class);
    }
75
76
    /**
     * Each crawler run has a set id; this facade method writes the given id
     * to the crawler object.
     *
     * @param int $id new set id (cast to integer before it is stored)
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int) $id;
    }
86
87
    /**
     * Restricts the configuration selection to the given set of
     * configurations; the list is consulted by filterUnallowedConfigurations().
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
97
98
    /**
     * Returns the allow-list set through setAllowedConfigurations().
     *
     * @return array empty when no restriction has been set
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
105
106
    /**
     * Returns the set id currently stored on the crawler object.
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
115
116
    /**
     * Returns the crawler controller held by this facade. If none is held
     * yet, one is created and given a set id derived from the current
     * microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object is available after the lazy creation
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
136
137
    /**
     * Adds a page to the crawler queue by uid, without a schedule time.
     *
     * @param int $uid uid of the page
     */
    public function addPageToQueue($uid)
    {
        // Non-timed elements are queued with timestamp 0.
        $this->addPageToQueueTimed((int) $uid, 0);
    }
148
149
    /**
     * Limits the given configurations to those whose key is on the
     * allow-list set through setAllowedConfigurations().
     *
     * The input is returned untouched when no allow-list is set, or when it
     * is not an array (the caller performs its own is_array() check
     * afterwards, so a non-array must pass through without being iterated).
     *
     * @param array|mixed $configurations configurations indexed by configuration key
     *
     * @return array|mixed the filtered configurations, original keys preserved
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        // Keep only entries whose key matches the current selection.
        return array_filter(
            $configurations,
            function ($confKey) {
                return in_array($confKey, $this->allowedConfigurations);
            },
            ARRAY_FILTER_USE_KEY
        );
    }
168
169
    /**
     * Adds a page to the crawler queue by uid and sets a timestamp for when
     * the page should be crawled.
     *
     * The page row is loaded, its crawler configurations are resolved and
     * filtered against the allow-list, and each remaining configuration is
     * passed to the crawler's urlListFromUrlArray().
     *
     * @param int $uid page id
     * @param int $time timestamp at which the page should be crawled
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $pageUid = intval($uid);
        $scheduleTime = intval($time);

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($pageUid);
        $configurations = $this->filterUnallowedConfigurations(
            $crawler->getUrlsForPageRow($pageData)
        );
        $downloadUrls = [];
        $duplicateTrack = [];

        if (!is_array($configurations)) {
            // No configuration found, nothing to queue.
            return;
        }

        foreach ($configurations as $configuration) {
            // Enable inserting of entries.
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageData,
                $scheduleTime,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                array_keys($this->getCrawlerProcInstructions())
            );

            // Reset the queue because the entries have been written to the db.
            unset($crawler->queueEntries);
        }
    }
211
212
    /**
     * Counts all entries in the database which are scheduled for a given
     * page id and a schedule timestamp.
     *
     * Un-timed elements (schedule timestamp 0) can occur multiple times, so
     * for them only entries with exec_time 0 are counted. Timed elements
     * have a fixed schedule time and are counted whatever their exec_time is.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int number of matching queue entries
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $pageUid = (int) $page_uid;
        $scheduledAt = (int) $schedule_timestamp;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                // Bind the values instead of placing them into the SQL text.
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)),
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($scheduledAt, \PDO::PARAM_INT))
            );

        if ($scheduledAt === 0) {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
        }

        // A COUNT(*) query yields one row holding the number; rowCount()
        // would report the number of result rows, not the counted value.
        return (int) $queryBuilder->execute()->fetchColumn(0);
    }
247
248
    /**
     * Determines if a page is queued.
     *
     * @param int|string $uid page uid; must be interpretable as an integer
     * @param bool $unprocessed_only when not false, only entries with exec_time 0 are considered
     * @param bool $timed_only when not false, only entries with a non-zero schedule time are considered
     * @param int|bool $timestamp when not false, only entries scheduled for exactly this timestamp are considered
     *
     * @return bool true if at least one matching queue entry exists
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as an integer
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if (false !== $unprocessed_only) {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
        }

        if (false !== $timed_only) {
            $queryBuilder->andWhere($queryBuilder->expr()->neq('scheduled', 0));
        }

        if (false !== $timestamp) {
            // Match the requested schedule time. The previous "not equal"
            // comparison selected every entry except the one asked for.
            $queryBuilder->andWhere(
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($timestamp, \PDO::PARAM_INT))
            );
        }

        $count = $queryBuilder->execute()->fetchColumn(0);

        return false !== $count && $count > 0;
    }
296
297
    /**
     * Returns the latest crawl timestamp (the highest "scheduled" value)
     * stored in the queue for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time 0
     *
     * @return int the latest schedule timestamp, or 0 when none was found
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $pageUid = intval($uid);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expr = $queryBuilder->expr();

        $query = $queryBuilder
            ->from($this->tableName)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                $expr->eq('page_id', $queryBuilder->createNamedParameter($pageUid))
            );

        if ($future_crawldates_only) {
            $query->andWhere($expr->gt('scheduled', time()));
        }

        if ($unprocessed_only) {
            $query->andWhere($expr->eq('exec_time', 0));
        }

        $row = $query->execute()->fetch(0);

        // An empty aggregate result is reported as 0.
        return $row['latest'] ?: 0;
    }
339
340
    /**
     * Returns the queue rows of a page with the time each crawl was
     * scheduled and the time it was executed. Entries that are scheduled
     * but have not been crawled yet are included as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows; 0 means no limit
     *
     * @return array rows holding the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $pageUid = intval($uid);
        $maxRows = intval($limit);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->from($this->tableName)
            ->select('scheduled', 'exec_time', 'set_id')
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT))
            );

        if ($maxRows !== 0) {
            $queryBuilder->setMaxResults($maxRows);
        }

        return $queryBuilder->execute()->fetchAll();
    }
368
369
    /**
     * Returns all queue rows that have not been executed yet (exec_time 0),
     * ordered by page id and then by schedule time.
     *
     * @return array
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function getUnprocessedItems()
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $onlyUnprocessed = $queryBuilder->expr()->eq('exec_time', 0);

        $statement = $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where($onlyUnprocessed)
            ->orderBy('page_id')
            ->addOrderBy('scheduled')
            ->execute();

        return $statement->fetchAll();
    }
390
391
    /**
     * Returns the number of unprocessed items (exec_time 0) in the crawler
     * queue.
     *
     * @return mixed the COUNT value exactly as the database layer delivers it
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function countUnprocessedItems()
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $onlyUnprocessed = $queryBuilder->expr()->eq('exec_time', 0);

        $statement = $queryBuilder
            ->count('page_id')
            ->from($this->tableName)
            ->where($onlyUnprocessed)
            ->execute();

        return $statement->fetchColumn(0);
    }
410
411
    /**
     * Method to check if a page is in the queue which is timed for a
     * date when it should be crawled.
     *
     * NOTE(review): this delegates to isPageInQueue() and leaves its
     * $timed_only argument at the default (false), so un-timed entries are
     * matched as well. Confirm whether that is intended.
     *
     * @param int $uid uid of the page
     * @param boolean $show_unprocessed only respect unprocessed pages
     *
     * @return boolean
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        return $this->isPageInQueue(intval($uid), $show_unprocessed);
    }
428
429
    /**
     * Reads the processing instructions registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']
     * and returns them as a map of each entry's 'key' to its 'value'.
     *
     * @return array empty when nothing is registered
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructions = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registration) {
            $instructions[$registration['key']] = $registration['value'];
        }

        return $instructions;
    }
445
446
    /**
     * Deletes the queue entry with the given queue id.
     *
     * @param int $qid
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function removeQueueEntrie($qid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $matchesQid = $queryBuilder->expr()->eq(
            'qid',
            $queryBuilder->createNamedParameter($qid, \PDO::PARAM_INT)
        );

        $queryBuilder
            ->delete()
            ->from($this->tableName)
            ->where($matchesQid)
            ->execute();
    }
464
465
    /**
     * Get queue statistics.
     *
     * Both numbers come from the queue repository:
     * countAllAssignedPendingItems() and countAllPendingItems().
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $assignedButUnprocessed = $this->getQueueRepository()->countAllAssignedPendingItems();
        $unprocessed = $this->getQueueRepository()->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedButUnprocessed,
            'unprocessed' => $unprocessed
        ];
    }
479
480
    /**
     * Returns the queue repository, creating it on first use and reusing
     * the same instance afterwards.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new QueueRepository();

        return $this->queueRepository;
    }
493
494
    /**
     * Get queue statistics by configuration.
     *
     * The pending-item statistics grouped by configuration key are extended
     * with a 'total' entry. The totals are looked up for the set ids that
     * still have unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $repository = $this->getQueueRepository();

        $statistics = $repository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $repository->getSetIdWithUnprocessedEntries();
        $totals = $repository->getTotalQueueEntriesByConfiguration($setIds);

        // "Merge" arrays. Writing through the key avoids a by-reference loop
        // variable that would stay bound to the last element afterwards.
        foreach ($statistics as $key => $value) {
            // A configuration without a totals entry is reported with 0.
            $statistics[$key]['total'] = $totals[$value['configuration']] ?? 0;
        }

        return $statistics;
    }
514
515
    /**
     * Get active processes count, as reported by
     * ProcessRepository::countActive().
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
528
529
    /**
     * Get last processed entries from the queue repository.
     *
     * NOTE(review): static analysis of this file reports that
     * QueueRepository::getLastProcessedEntries() is called with too many
     * arguments starting with $limit, so the limit may be ignored. Confirm
     * against the repository's signature.
     *
     * @param int $limit
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $repository = $this->getQueueRepository();

        return $repository->getLastProcessedEntries('*', $limit);
    }
540
541
    /**
     * Get current crawling speed.
     *
     * Walks the last processed timestamps starting at the first entry and
     * counts entries as long as the gap to the previous one (for the first
     * entry: to the current time) is at most 60 seconds. The speed is the
     * counted number of pages divided by the minutes between the first
     * entry and the last counted one.
     *
     * @return float|false page speed in pages per minute, or false when
     *                     there is not enough information to compute it
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;
        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp (or are not in
            // descending order): no time span to divide by.
            return false;
        }

        return $pages / ($time / 60);
    }
583
584
    /**
     * Get some performance data.
     *
     * The range from $start to $end is cut into slots of $resolution
     * seconds. For each slot the per-process data is read from the queue
     * repository, a speed (URLs per minute) is added to every process with
     * more than 5 URLs and a positive duration, and slot totals are built.
     * Speeds on slot and overall level are 0 unless more than 5 URLs were
     * processed.
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp, must be after $start
     * @param integer $resolution slot length in seconds, must be positive
     *
     * @return array data with the keys urlcount, start, end, duration, slots and speed
     *
     * @throws \Exception if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is not positive
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [
            'urlcount' => 0,
            'start' => $start,
            'end' => $end,
            'duration' => $end - $start,
            'slots' => [],
        ];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // A non-positive step would never advance the slot loop below.
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1512659946);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Write through the key instead of iterating by reference, so no
            // reference to the last element survives the loop.
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            // Seconds per URL is (slot length / URL count). The parentheses
            // matter: without them only $slotStart was divided.
            $slotDuration = $slotEnd - $slotStart;
            $slotSpeed = 0;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $slotSpeed = 60 * 1 / ($slotDuration / $slotUrlCount);
            }

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
                'speed' => $slotSpeed,
            ];
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
644
}
645