Completed
Push — Testing/fix ( f80687...a7e6fd )
by Tomas Norre
16:42 queued 15:05
created

CrawlerApi::getUnprocessedItems()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 0
dl 0
loc 14
ccs 0
cts 14
cp 0
crap 2
rs 9.7998
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Database\ConnectionPool;
32
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
33
use TYPO3\CMS\Core\Utility\GeneralUtility;
34
use TYPO3\CMS\Core\Utility\MathUtility;
35
use TYPO3\CMS\Extbase\Object\ObjectManager;
36
use TYPO3\CMS\Frontend\Page\PageRepository;
37
38
/**
39
 * Class CrawlerApi
40
 *
41
 * @package AOE\Crawler\Api
42
 */
43
class CrawlerApi
44
{
45
    /**
46
     * @var CrawlerController
47
     */
48
    private $crawlerController;
49
50
    /**
51
     * @var QueueRepository
52
     */
53
    protected $queueRepository;
54
55
    /**
56
     * @var array
57
     */
58
    protected $allowedConfigurations = [];
59
60
    /**
61
     * @var QueryBuilder
62
     */
63
    protected $queryBuilder;
64
65
    /**
66
     * @var string
67
     */
68
    protected $tableName = 'tx_crawler_queue';
69
70 11
    /**
     * Fetches the CrawlerController from the Extbase object manager and keeps
     * it as the crawler instance used by this API.
     */
    public function __construct()
    {
        $this->crawlerController = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(CrawlerController::class);
    }
75
76
    /**
     * Overwrites the set id of the underlying crawler.
     *
     * Each crawler run has a set id; this facade method casts the given value
     * to an integer and assigns it to the crawler object.
     *
     * @param int $id set id to assign to the crawler
     *
     * @throws \Exception if no crawler object is available
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int) $id;
    }
86
87
    /**
     * Restricts the configuration selection to the given set of configurations.
     *
     * The values are compared against configuration keys when configurations
     * are filtered; an empty array disables the restriction.
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
97
98
    /**
     * Returns the configurations the selection is currently limited to.
     *
     * @return array the allowed configuration keys, empty if nothing was set
     */
    public function getAllowedConfigurations()
    {
        $allowed = $this->allowedConfigurations;

        return $allowed;
    }
105
106
    /**
     * Reads the set id from the crawler object.
     *
     * @return int set id of the crawler
     *
     * @throws \Exception if no crawler object is available
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
115
116
    /**
     * Returns the crawler instance held by this API, creating one on demand.
     *
     * When no crawler object is stored yet, a new CrawlerController is
     * instantiated and given a set id derived from the current microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object is available afterwards
     */
    protected function findCrawler()
    {
        $hasCrawler = is_object($this->crawlerController);

        if (!$hasCrawler) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
136
137
    /**
     * Queues a page for crawling without a schedule time.
     *
     * @param int $uid uid of the page to add to the crawler queue
     */
    public function addPageToQueue($uid)
    {
        // Entries that are not timed are added with timestamp 0.
        $this->addPageToQueueTimed((int) $uid, 0);
    }
148
149
    /**
     * Drops every configuration whose key is not in the allowed list.
     *
     * If no allowed configurations have been set, the input is returned
     * untouched.
     *
     * @param array $configurations configurations indexed by configuration key
     *
     * @return array the configurations that are allowed
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (count($this->allowedConfigurations) <= 0) {
            return $configurations;
        }

        foreach ($configurations as $configurationKey => $unused) {
            // Keys are matched with in_array()'s default (loose) comparison.
            if (in_array($configurationKey, $this->allowedConfigurations)) {
                continue;
            }

            // Configuration is not part of the current selection.
            unset($configurations[$configurationKey]);
        }

        return $configurations;
    }
168
169
    /**
     * Queues a page for crawling at a given point in time.
     *
     * The page row is loaded, its crawler configurations are resolved and
     * reduced to the allowed ones, and every remaining configuration is handed
     * to the crawler so that it writes the queue entries.
     *
     * @param int $uid pageid
     * @param int $time timestamp at which the page should be crawled
     *
     * @throws \Exception if no crawler object is available
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $pageUid = (int) $uid;
        $scheduleTime = (int) $time;

        $crawler = $this->findCrawler();
        $pageRow = GeneralUtility::makeInstance(PageRepository::class)->getPage($pageUid);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageRow));
        $downloadUrls = [];
        $duplicateTrack = [];

        if (!is_array($configurations)) {
            // no configuration found
            return;
        }

        foreach ($configurations as $configuration) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageRow,
                $scheduleTime,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                array_keys($this->getCrawlerProcInstructions())
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
211
212
    /**
     * Counts all entries in the database which are scheduled for a given page id and a schedule timestamp.
     *
     * Entries that are not timed (schedule timestamp 0) can occur multiple
     * times, so only those that have not been executed yet (exec_time = 0)
     * are counted. Timed entries have a fixed schedule time; a record with
     * that time is counted whether it is still pending or already processed.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int number of matching queue entries
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $pageUid = (int) $page_uid;
        $scheduledAt = (int) $schedule_timestamp;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                // Bind the values instead of placing them into the SQL expression directly.
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)),
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($scheduledAt, \PDO::PARAM_INT))
            );

        if ($scheduledAt === 0) {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
        }

        // A COUNT(*) query yields a single row holding the count, so the value
        // has to be read from the first column rather than from rowCount().
        return (int) $queryBuilder->execute()->fetchColumn(0);
    }
248
249
    /**
     * Determines if a page is queued
     *
     * @param int|string $uid uid of the page, must be interpretable as integer
     * @param bool $unprocessed_only only consider entries that have not been executed (exec_time = 0)
     * @param bool $timed_only only consider entries with a schedule time (scheduled != 0)
     * @param int|false $timestamp only consider entries scheduled for exactly this time, false to skip the check
     *
     * @return bool true if at least one matching entry exists
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as an integer
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if (false !== $unprocessed_only) {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
        }

        if (false !== $timed_only) {
            $queryBuilder->andWhere($queryBuilder->expr()->neq('scheduled', 0));
        }

        if (false !== $timestamp) {
            // The page counts as queued for $timestamp only when an entry is
            // scheduled at that time, hence equality and not inequality.
            $queryBuilder->andWhere(
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($timestamp, \PDO::PARAM_INT))
            );
        }

        $count = $queryBuilder->execute()->fetchColumn(0);

        return false !== $count && $count > 0;
    }
297
298
    /**
     * Returns the latest crawl timestamp (highest "scheduled" value) of a page.
     *
     * @param int $uid uid id of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries that have not been executed (exec_time = 0)
     *
     * @return int the latest schedule time, or 0 if there is none; the value
     *             is passed on as delivered by the database layer
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $pageUid = (int) $uid;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $latestQuery = $queryBuilder
            ->from($this->tableName)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($pageUid))
            );

        if ($future_crawldates_only) {
            $latestQuery->andWhere($queryBuilder->expr()->gt('scheduled', time()));
        }

        if ($unprocessed_only) {
            $latestQuery->andWhere($queryBuilder->expr()->eq('exec_time', 0));
        }

        $row = $latestQuery->execute()->fetch(0);

        // An empty aggregate is reported as 0.
        return $row['latest'] ?: 0;
    }
340
341
    /**
     * Returns the crawl history of a page.
     *
     * Each row holds the time the page was scheduled for crawling, the time
     * the crawl was executed and the set id. Rows of entries that are
     * scheduled but have not been crawled yet are included as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows, 0 for no limit
     *
     * @return array rows with the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $pageUid = (int) $uid;
        $maxRows = (int) $limit;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $historyQuery = $queryBuilder
            ->from($this->tableName)
            ->select('scheduled', 'exec_time', 'set_id')
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT))
            );

        if ($maxRows !== 0) {
            $historyQuery->setMaxResults($maxRows);
        }

        return $historyQuery->execute()->fetchAll();
    }
369
370
    /**
     * Fetches all queue entries that have not been executed yet.
     *
     * @return array all rows with exec_time = 0, ordered by page_id and then by scheduled
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function getUnprocessedItems()
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $notExecuted = $queryBuilder->expr()->eq('exec_time', 0);

        $statement = $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where($notExecuted)
            ->orderBy('page_id')
            ->addOrderBy('scheduled')
            ->execute();

        return $statement->fetchAll();
    }
391
392
    /**
     * Counts the queue entries that have not been executed yet.
     *
     * @return mixed number of entries with exec_time = 0, as delivered by
     *               fetchColumn() of the executed statement
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function countUnprocessedItems()
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $notExecuted = $queryBuilder->expr()->eq('exec_time', 0);

        $statement = $queryBuilder
            ->count('page_id')
            ->from($this->tableName)
            ->where($notExecuted)
            ->execute();

        return $statement->fetchColumn(0);
    }
411
412
    /**
     * Method to check if a page is in the queue which is timed for a
     * date when it should be crawled
     *
     * @param int $uid uid of the page
     * @param boolean $show_unprocessed only respect unprocessed pages
     *
     * @return boolean
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        // NOTE(review): isPageInQueue() is called without its $timed_only
        // argument, so entries without a schedule time match as well —
        // confirm whether that is intended before relying on "timed".
        return $this->isPageInQueue((int) $uid, $show_unprocessed);
    }
429
430
    /**
     * Returns the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the registered instructions, or an empty array if none are set
     */
    private function getCrawlerProcInstructions()
    {
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        return $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'];
    }
443
444
    /**
     * Deletes the queue entry with the given queue id.
     *
     * @param int $qid id of the queue entry to delete
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     */
    public function removeQueueEntrie($qid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $matchesQueueId = $queryBuilder->expr()->eq(
            'qid',
            $queryBuilder->createNamedParameter($qid, \PDO::PARAM_INT)
        );

        $queryBuilder
            ->delete()
            ->from($this->tableName)
            ->where($matchesQueueId)
            ->execute();
    }
462
463
    /**
     * Collects the pending-item counters of the queue.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $assignedPending = $this->getQueueRepository()->countAllAssignedPendingItems();
        $allPending = $this->getQueueRepository()->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $allPending,
        ];
    }
477
478
    /**
     * Returns the queue repository, creating it on first use.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new QueueRepository();

        return $this->queueRepository;
    }
491
492
    /**
     * Get queue statistics by configuration
     *
     * The pending-item statistics grouped by configuration key are extended
     * with a 'total' entry, looked up by each row's 'configuration' value in
     * the totals of the sets that still have unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>)
     *               with an additional 'total' entry per row
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->getQueueRepository()->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->getQueueRepository()->getSetIdWithUnprocessedEntries();
        $totals = $this->getQueueRepository()->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays
        foreach ($statistics as $index => $row) {
            $statistics[$index]['total'] = $totals[$row['configuration']];
        }

        return $statistics;
    }
512
513
    /**
     * Returns the number of active processes as reported by a new
     * ProcessRepository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
526
527
    /**
     * Returns the queue entries that were processed last.
     *
     * @param int $limit number of entries requested
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        // NOTE(review): static analysis reported that getLastProcessedEntries()
        // is called with more arguments than it declares, starting with $limit,
        // so the limit may not be applied — verify against QueueRepository.
        $queueRepository = $this->getQueueRepository();

        return $queueRepository->getLastProcessedEntries('*', $limit);
    }
538
539
    /**
     * Get current crawling speed
     *
     * Walks the timestamps of the last processed entries, starting at the
     * first one, and counts entries as long as the gap to the previous
     * timestamp (initially the current time) is at most 60 seconds. The speed
     * is the number of counted pages divided by the time span they cover.
     *
     * The timestamps are presumably ordered newest first — confirm against
     * QueueRepository::getLastProcessedEntriesTimestamps().
     *
     * @return float|false page speed in pages per minute, or false if there
     *                     is not enough information to calculate it
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp: no time span to derive
            // a rate from, and dividing by it would be a division by zero.
            return false;
        }

        return $pages / ($time / 60);
    }
581
582
    /**
     * Get some performance data
     *
     * The range from $start to $end is split into slots of $resolution. For
     * every slot the per-process data is read from the queue repository, a
     * speed is added to each process that handled more than 5 urls in a
     * positive duration, and the url counts are summed up. Speeds are
     * expressed as 60 divided by the time spent per url; they are 0 when
     * 5 or fewer urls were counted.
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp, must be greater than $start
     * @param integer $resolution length of one slot, must be greater than zero
     *
     * @return array data with the keys urlcount, start, end, duration, slots
     *               (indexed by slot end, each holding amountProcesses,
     *               urlcount, processes and speed) and speed
     *
     * @throws \Exception if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is not greater than zero
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // A step of zero or less never moves $slotStart towards $end,
            // which would make the slot loop below run forever.
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1547539200);
        }

        $data['slots'] = [];

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Write back through the key instead of iterating by reference, so
            // no reference into $slotData outlives the loop.
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            // The slot length has to be computed before dividing by the url
            // count; a zero-length slot yields no meaningful speed.
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $slotSpeed = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $slotSpeed = 0;
            }

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
                'speed' => $slotSpeed,
            ];
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
642
}
643