Completed
Push — typo3v9 ( aea555...37a7d2 )
by Tomas Norre
06:20
created

CrawlerApi::addPageToQueueTimed()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 33

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 0
Metric Value
cc 3
nc 3
nop 2
dl 0
loc 33
ccs 0
cts 28
cp 0
crap 12
rs 9.392
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Api;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2018 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Controller\CrawlerController;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use TYPO3\CMS\Core\Database\ConnectionPool;
35
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
36
use TYPO3\CMS\Core\Utility\GeneralUtility;
37
use TYPO3\CMS\Extbase\Object\ObjectManager;
38
use TYPO3\CMS\Frontend\Page\PageRepository;
39
40
/**
 * Facade for the crawler extension.
 *
 * Offers methods to put pages into the crawler queue and to read queue,
 * process and performance figures, delegating the actual work to
 * CrawlerController, QueueRepository and ProcessRepository.
 *
 * @package AOE\Crawler\Api
 */
class CrawlerApi
{
    /**
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * Configuration keys that may be used when queueing pages.
     * An empty list disables the filtering (see filterUnallowedConfigurations()).
     *
     * @var array
     */
    protected $allowedConfigurations = [];

    /**
     * Not referenced by the methods of this class; kept because subclasses
     * may rely on it.
     *
     * @var QueryBuilder
     */
    protected $queryBuilder;

    /**
     * Name of the queue table all queries of this class run against.
     *
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';

    /**
     * Crawler instance created on first use by findCrawler().
     *
     * Declared explicitly: findCrawler() reads and writes this property, and
     * relying on an undeclared (dynamic) property hides typos and is
     * deprecated as of PHP 8.2.
     *
     * @var CrawlerController|null
     */
    protected $crawlerController;

    /**
     * Fetches the QueueRepository through the Extbase ObjectManager.
     */
    public function __construct()
    {
        /** @var ObjectManager $objectManager */
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
    }

    /**
     * Each crawler run has a setID; this facade method delegates
     * the given id to the crawler object.
     *
     * @param int $id
     * @throws \Exception
     */
    public function overwriteSetId(int $id): void
    {
        $this->findCrawler()->setID = $id;
    }

    /**
     * Limits the configuration selection to the given set of
     * configuration keys.
     *
     * @param array $allowedConfigurations
     */
    public function setAllowedConfigurations(array $allowedConfigurations): void
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }

    /**
     * Returns the configuration keys set via setAllowedConfigurations().
     *
     * @return array
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }

    /**
     * Returns the setID of the crawler.
     *
     * @return int
     * @throws \Exception
     */
    public function getSetId()
    {
        return $this->findCrawler()->setID;
    }

    /**
     * Returns the crawler instance used by this facade, creating it on
     * first use.
     *
     * A freshly created crawler gets a setID derived from microtime().
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object is available
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }

    /**
     * Adds a page to the crawler queue by uid.
     *
     * @param int $uid uid of the page
     * @throws \Exception
     */
    public function addPageToQueue($uid): void
    {
        // Non-timed elements are added with timestamp 0.
        $this->addPageToQueueTimed(intval($uid), 0);
    }

    /**
     * Removes every configuration whose key is not part of the allowed
     * configurations. Nothing is removed while no allowed configurations
     * are set.
     *
     * @param array $configurations configurations indexed by configuration key
     *
     * @return array the remaining configurations, keys preserved
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        foreach ($configurations as $confKey => $confArray) {
            // Deliberately a loose comparison: PHP turns numeric array keys
            // into integers, so a strict check could reject keys that the
            // original comparison accepted.
            if (!in_array($confKey, $this->allowedConfigurations)) {
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }

    /**
     * Adds a page to the crawler queue by uid and sets a
     * timestamp when the page should be crawled.
     *
     * @param int $uid page id
     * @param int $time timestamp
     *
     * @throws \Exception
     * @return void
     */
    public function addPageToQueueTimed($uid, $time): void
    {
        $uid = intval($uid);
        $time = intval($time);

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $crawler->getUrlsForPageRow($pageData);
        $configurations = $this->filterUnallowedConfigurations($configurations);

        if (!is_array($configurations)) {
            return;
        }

        $downloadUrls = [];
        $duplicateTrack = [];
        // The registered processing instructions are the same for every
        // configuration, so they are read once instead of once per iteration.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $configuration) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            // 300, true and false are handed over unchanged; their meaning is
            // defined by CrawlerController::urlListFromUrlArray().
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }

    /**
     * Counts all entries in the database which are scheduled for a given page id and a schedule timestamp.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $pageId = (int) $page_uid;
        $scheduled = (int) $schedule_timestamp;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq(
                    'page_id',
                    $queryBuilder->createNamedParameter($pageId, \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->eq(
                    'scheduled',
                    $queryBuilder->createNamedParameter($scheduled, \PDO::PARAM_INT)
                )
            );

        // Un-timed elements (schedule time 0) can occur multiple times, so only
        // those that have not been executed yet (exec_time 0) are counted.
        // Timed elements have a fixed schedule time: a record with that time
        // is either still queued or has already been processed.
        if ($scheduled === 0) {
            $queryBuilder->andWhere(
                $queryBuilder->expr()->eq(
                    'exec_time',
                    $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
                )
            );
        }

        // A COUNT(*) query yields a single row holding the count; rowCount()
        // would report the number of result rows instead of that value.
        return (int) $queryBuilder->execute()->fetchColumn(0);
    }

    /**
     * Returns the latest crawl timestamp (highest "scheduled" value) for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time 0
     *
     * @return int the timestamp, or 0 if there is no matching entry
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = intval($uid);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query = $queryBuilder
            ->from($this->tableName)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        $row = $query->execute()->fetch(0);

        // empty() also covers a missing row and a NULL aggregate.
        return empty($row['latest']) ? 0 : intval($row['latest']);
    }

    /**
     * Returns the crawl history of a page: when it has been scheduled for
     * crawling and at what time the scheduled crawl has been executed. The
     * result also contains items that are scheduled but have not been
     * crawled yet.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of entries, 0 for no limit
     *
     * @return array one row per queue entry with the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $uid = intval($uid);
        $limit = intval($limit);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $statement = $queryBuilder
            ->from($this->tableName)
            ->select('scheduled', 'exec_time', 'set_id')
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($limit) {
            $statement->setMaxResults($limit);
        }

        return $statement->execute()->fetchAll();
    }

    /**
     * Reads the registered processing instructions of the crawler from
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array map of each registration's 'key' to its 'value'
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $crawlerProcInstructions = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $configuration) {
            $crawlerProcInstructions[$configuration['key']] = $configuration['value'];
        }

        return $crawlerProcInstructions;
    }

    /**
     * Get queue statistics.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        return [
            'assignedButUnprocessed' => $this->queueRepository->countAllAssignedPendingItems(),
            'unprocessed' => $this->queueRepository->countAllPendingItems(),
        ];
    }

    /**
     * Get queue statistics by configuration.
     *
     * Each entry delivered by
     * QueueRepository::countPendingItemsGroupedByConfigurationKey() is
     * extended by a 'total' key, looked up by the entry's 'configuration'
     * value in the totals of all sets that still have unprocessed entries.
     *
     * @return array the pending-item rows, each extended by 'total' (0 if no total is known)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; written by key so that no reference outlives the loop
        foreach ($statistics as $index => $row) {
            $statistics[$index]['total'] = $totals[$row['configuration']] ?? 0;
        }

        return $statistics;
    }

    /**
     * Get active processes count.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        $processRepository = new ProcessRepository();

        return $processRepository->countActive();
    }

    /**
     * Get last processed entries.
     *
     * @param int $limit
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        return $this->queueRepository->getLastProcessedEntries('*', $limit);
    }

    /**
     * Get current crawling speed.
     *
     * Walks the timestamps of the last processed entries, starting at the
     * current time, and counts entries until two neighbours are more than
     * 60 seconds apart. The speed is the number of counted entries divided
     * by the time span they cover.
     *
     * NOTE(review): this presumably expects the repository to deliver the
     * timestamps newest first as a 0-indexed list - verify against
     * QueueRepository::getLastProcessedEntriesTimestamps().
     *
     * @return float|false page speed in pages per minute, or false if there
     *                     is not enough information (fewer than 10 usable
     *                     entries, or no measurable time span between them)
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp: no time span to
            // relate the page count to, and dividing by it would fail.
            return false;
        }

        return $pages / ($time / 60);
    }

    /**
     * Get some performance data.
     *
     * The range from $start to $end is cut into slots of $resolution
     * seconds. For each slot the process data is read from the queue
     * repository; speeds are given in URLs per minute and are only
     * calculated when more than 5 URLs were processed.
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp
     * @param integer $resolution slot length in seconds, must be greater than 0
     *
     * @return array with the keys 'urlcount', 'start', 'end', 'duration',
     *               'slots' (indexed by slot end timestamp) and 'speed'
     *
     * @throws \Exception if $end is not after $start or $resolution is not positive
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // The slot loop advances by $resolution and would never terminate.
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1512659946);
        }

        $data['slots'] = [];

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            foreach ($slotData as $processKey => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processKey]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // The slot duration has to be divided by the URL count as a whole;
            // without the inner parentheses only $slotStart was divided.
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
}
500