Passed
Push — typo3v9 ( 2404ee...b9b5fa )
by Tomas Norre
05:51
created

CrawlerApi::getQueueStatistics()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1.064

Importance

Changes 0
Metric Value
cc 1
eloc 3
c 0
b 0
f 0
nc 1
nop 0
dl 0
loc 5
ccs 3
cts 5
cp 0.6
crap 1.064
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Api;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2018 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Controller\CrawlerController;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use TYPO3\CMS\Core\Database\ConnectionPool;
35
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
36
use TYPO3\CMS\Core\Utility\GeneralUtility;
37
use TYPO3\CMS\Extbase\Object\ObjectManager;
38
use TYPO3\CMS\Frontend\Page\PageRepository;
39
40
/**
41
 * Class CrawlerApi
42
 *
43
 * @package AOE\Crawler\Api
44
 */
45
class CrawlerApi
46
{
47
    /**
     * Repository used for the queue statistics and the queue look-ups.
     *
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * Crawler instance that is created lazily by findCrawler().
     * Declared explicitly so the class does not rely on a dynamic property.
     *
     * @var CrawlerController|null
     */
    protected $crawlerController;

    /**
     * Configuration keys the API is restricted to; an empty array means
     * "no restriction" (see filterUnallowedConfigurations()).
     *
     * @var array
     */
    protected $allowedConfigurations = [];

    /**
     * NOTE(review): not assigned anywhere in the visible part of this class.
     *
     * @var QueryBuilder
     */
    protected $queryBuilder;

    /**
     * Name of the database table that holds the crawler queue.
     *
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';
66
67 9
    /**
     * Fetches the queue repository through the Extbase object manager.
     */
    public function __construct()
    {
        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(QueueRepository::class);
    }
73
74
    /**
     * Each crawler run has a set id; this facade method writes the given
     * id to the crawler object.
     *
     * @param int $id set id to assign to the crawler
     * @throws \Exception if no crawler object is available
     */
    public function overwriteSetId(int $id): void
    {
        $crawler = $this->findCrawler();
        $crawler->setID = $id;
    }
85
86
    /**
     * Restricts the configuration selection to the given set of
     * configurations. Passing an empty array removes the restriction,
     * because filtering only happens for a non-empty list.
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations): void
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
96
97
    /**
     * Returns the configurations the selection is currently limited to.
     *
     * @return array the list set via setAllowedConfigurations(), empty by default
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
104
105
    /**
     * Reads the set id from the crawler object.
     *
     * @return int
     *
     * @throws \Exception if no crawler object is available
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
114
115
    /**
     * Returns the crawler instance used by this API. The instance is created
     * on first use and receives a set id derived from the current microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object could be provided
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        // Guard clause: bail out first, the regular path follows unindented.
        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
135
136
    /**
     * Queues a page for crawling, identified by its uid, without a
     * schedule time.
     *
     * @param int $uid uid of the page
     *
     * @throws \Exception passed on from addPageToQueueTimed()
     */
    public function addPageToQueue($uid): void
    {
        // Entries without a schedule time are stored with timestamp 0.
        $this->addPageToQueueTimed((int) $uid, 0);
    }
147
148
    /**
     * Limits the given configurations to those whose key is listed in the
     * allowed configurations. If no allowed configurations are set, the
     * input is returned unchanged.
     *
     * @param array $configurations configurations indexed by their configuration key
     *
     * @return array the configurations that passed the filter, keys preserved
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        // Compare as strings and strictly. A loose in_array() would let an
        // integer key such as 0 match every non-numeric string on PHP 7.
        $allowedKeys = array_map('strval', $this->allowedConfigurations);

        foreach ($configurations as $confKey => $unusedConfiguration) {
            if (!in_array((string) $confKey, $allowedKeys, true)) {
                // remove configuration that does not match the current selection
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
167
168
    /**
     * Adds a page to the crawler queue by uid and sets the timestamp at
     * which the page should be crawled.
     *
     * The URLs of the page are determined per configuration, reduced to the
     * allowed configurations and then handed to the crawler for queueing.
     *
     * @param int $uid uid of the page
     * @param int $time schedule timestamp, 0 for entries without a schedule time
     *
     * @throws \Exception if no crawler object is available
     */
    public function addPageToQueueTimed($uid, $time): void
    {
        $uid = (int) $uid;
        $time = (int) $time;

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));

        if (!is_array($configurations)) {
            return;
        }

        // The registered processing instructions do not change while looping,
        // so they are collected once instead of once per configuration.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());
        $downloadUrls = [];
        $duplicateTrack = [];

        foreach ($configurations as $configuration) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            // NOTE(review): the literals 300, true and false are passed through
            // unchanged; their meaning is defined by urlListFromUrlArray().
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
211
212
    /**
     * Counts all queue entries that are scheduled for the given page id and
     * schedule timestamp.
     *
     * For a schedule timestamp of 0 (entries without a schedule time) only
     * entries that have not been executed yet (exec_time = 0) are counted,
     * because such entries can occur multiple times. Timed entries are
     * counted regardless of their execution state.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int number of matching queue entries
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $page_uid = (int) $page_uid;
        $schedule_timestamp = (int) $schedule_timestamp;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                // Values are bound as parameters instead of being concatenated into the SQL.
                $queryBuilder->expr()->eq(
                    'page_id',
                    $queryBuilder->createNamedParameter($page_uid, \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->eq(
                    'scheduled',
                    $queryBuilder->createNamedParameter($schedule_timestamp, \PDO::PARAM_INT)
                )
            );

        if ($schedule_timestamp === 0) {
            // un-timed elements need an exec_time of 0 because they can occur multiple times
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        // A COUNT(*) query yields a single row that holds the number; rowCount()
        // would report the number of result rows instead of the counted value.
        return (int) $query->execute()->fetchColumn(0);
    }
247
248
    /**
     * Returns the latest crawl timestamp (highest "scheduled" value) of a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries that have not been executed (exec_time = 0)
     *
     * @return int the latest schedule timestamp, 0 if there is no matching entry
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query = $queryBuilder
            ->selectLiteral('max(scheduled) as latest')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq(
                    'page_id',
                    $queryBuilder->createNamedParameter((int) $uid, \PDO::PARAM_INT)
                )
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        // Use the connection's default fetch mode; 0 is not a valid fetch mode.
        $row = $query->execute()->fetch();

        // Covers both "no row" (false) and "no matching entry" (NULL aggregate).
        return (int) ($row['latest'] ?? 0);
    }
290
291
    /**
     * Returns the crawl history of a page: for every queue entry of the page
     * the time it was scheduled for, the time it was executed and its set id.
     * Entries that are scheduled but have not been crawled yet are included.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of entries to return, 0 for no limit
     *
     * @return array one row per queue entry with the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $pageId = (int) $uid;
        $maxResults = (int) $limit;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $historyQuery = $queryBuilder
            ->select('scheduled', 'exec_time', 'set_id')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($pageId, \PDO::PARAM_INT))
            );

        if ($maxResults !== 0) {
            $historyQuery->setMaxResults($maxResults);
        }

        return $historyQuery->execute()->fetchAll();
    }
319
320
    /**
     * Collects the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array map of each registration's 'key' to its 'value'; empty if nothing is registered
     */
    private function getCrawlerProcInstructions(): array
    {
        $registered = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] ?? [];
        if (empty($registered)) {
            return [];
        }

        $instructions = [];
        foreach ($registered as $registration) {
            $instructions[$registration['key']] = $registration['value'];
        }

        return $instructions;
    }
336
337
    /**
     * Returns the overall queue statistics as reported by the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>)
     */
    public function getQueueStatistics()
    {
        $assignedPendingItems = $this->queueRepository->countAllAssignedPendingItems();
        $pendingItems = $this->queueRepository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPendingItems,
            'unprocessed' => $pendingItems,
        ];
    }
351
352
    /**
     * Returns the queue statistics grouped by configuration.
     *
     * The pending-item statistics of the queue repository are extended by a
     * 'total' entry: the total number of queue entries of that configuration
     * within the sets that still have unprocessed entries.
     *
     * @return array list of statistic rows; each row holds at least 'configuration' and 'total'
     *               (presumably also 'assignedButUnprocessed' and 'unprocessed' - confirm against
     *               QueueRepository::countPendingItemsGroupedByConfigurationKey())
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; index access avoids a dangling foreach reference, and
        // a configuration without a totals entry counts as 0 instead of raising a notice
        foreach ($statistics as $index => $statistic) {
            $statistics[$index]['total'] = $totals[$statistic['configuration']] ?? 0;
        }

        return $statistics;
    }
370
371
    /**
     * Returns the number of active processes as reported by a freshly
     * created process repository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
384
385
    /**
     * Returns the queue entries that were processed last.
     *
     * @param int $limit maximum number of entries to return
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        // The repository method takes the limit as its only argument. The former
        // call passed '*' in that position and the real limit as a surplus argument.
        return $this->queueRepository->getLastProcessedEntries((int) $limit);
    }
396
397
    /**
     * Calculates the current crawling speed from the timestamps of the last
     * processed queue entries.
     *
     * Starting at the first timestamp, entries are counted as long as the gap
     * to the previously counted one (initially: the current time) is at most
     * 60 seconds. The speed is the number of counted entries divided by the
     * time they span.
     * NOTE(review): this assumes the repository returns the timestamps newest first - confirm.
     *
     * @return float|int|bool speed in pages per minute, or false if there is not
     *                        enough information (fewer than 10 usable entries, or
     *                        the entries span no measurable time)
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        // time between two entries that is considered "too old"
        $tooOldDelta = 60;

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];
        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp (or are not ordered as
            // expected): no speed can be derived and a division by zero is avoided.
            return false;
        }

        return $pages / ($time / 60);
    }
438
439
    /**
     * Collects performance data for the time span from $start to $end, split
     * into slots of $resolution seconds.
     *
     * The result contains the keys 'urlcount', 'start', 'end', 'duration',
     * 'speed' (URLs per minute, 0 if no more than 5 URLs were processed) and
     * 'slots'. 'slots' is indexed by the end timestamp of each slot and holds
     * 'amountProcesses', 'urlcount', 'processes' and 'speed' per slot. Each
     * process entry gets a 'speed' value if it handled more than 5 URLs and
     * ran for a measurable time.
     *
     * @param int $start start timestamp
     * @param int $end end timestamp, must be greater than $start
     * @param int $resolution length of one slot in seconds, must be positive
     *
     * @return array data
     *
     * @throws \Exception if $end is not after $start or $resolution is not positive
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [
            'urlcount' => 0,
            'start' => $start,
            'end' => $end,
            'duration' => $end - $start,
        ];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // A non-positive step would never let the slot loop reach $end.
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1512659946);
        }

        $data['slots'] = [];

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Index access instead of a by-reference loop, so no reference
            // outlives the loop and leaks into the next slot.
            foreach ($slotData as $processKey => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processKey]['speed'] = 60 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            // The slot duration has to be computed before dividing by the URL
            // count; without parentheses only $slotStart was divided.
            $slotDuration = $slotEnd - $slotStart;
            $slotSpeed = 0;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $slotSpeed = 60 / ($slotDuration / $slotUrlCount);
            }

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
                'speed' => $slotSpeed,
            ];
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
499
}
500