Completed
Push — typo3v9 ( 6762b6...ea38b1 )
by Tomas Norre
05:52
created

CrawlerApi::getQueueRepository()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 2.3149

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 0
dl 0
loc 8
rs 10
c 0
b 0
f 0
ccs 4
cts 7
cp 0.5714
crap 2.3149
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Database\ConnectionPool;
32
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
33
use TYPO3\CMS\Core\Utility\GeneralUtility;
34
use TYPO3\CMS\Core\Utility\MathUtility;
35
use TYPO3\CMS\Extbase\Object\ObjectManager;
36
use TYPO3\CMS\Frontend\Page\PageRepository;
37
38
/**
39
 * Class CrawlerApi
40
 *
41
 * @package AOE\Crawler\Api
42
 */
43
class CrawlerApi
44
{
45
    /**
46
     * @var CrawlerController|Object
47
     */
48
    private $crawlerController;
49
50
    /**
51
     * @var QueueRepository
52
     */
53
    protected $queueRepository;
54
55
    /**
56
     * @var array
57
     */
58
    protected $allowedConfigurations = [];
59
60
    /**
61
     * @var QueryBuilder
62
     */
63
    protected $queryBuilder;
64
65
    /**
66
     * @var string
67
     */
68
    protected $tableName = 'tx_crawler_queue';
69
70 10
    /**
     * Obtains the crawler controller and the queue repository from the
     * Extbase object manager and stores them on the instance.
     */
    public function __construct()
    {
        $manager = GeneralUtility::makeInstance(ObjectManager::class);

        $this->crawlerController = $manager->get(CrawlerController::class);
        $this->queueRepository = $manager->get(QueueRepository::class);
    }
77
78
    /**
     * Each crawler run has a set id; this facade method writes the given
     * id to the crawler object.
     *
     * @param int $id set id to assign to the crawler
     * @throws \Exception when no crawler object is available
     */
    public function overwriteSetId(int $id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = $id;
    }
89
90
    /**
     * Restricts the configuration selection to the given set of
     * configurations. The list is consulted by
     * filterUnallowedConfigurations(); an empty list disables filtering.
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
100
101
    /**
     * Returns the list previously stored via setAllowedConfigurations()
     * (an empty array by default).
     *
     * @return array
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
108
109
    /**
     * Returns the set id currently stored on the crawler object.
     *
     * @return int
     * @throws \Exception when no crawler object is available
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
118
119
    /**
     * Returns the crawler controller held by this API object.
     *
     * If no controller object is stored yet, a new one is created and
     * given a set id derived from the current microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception when there is still no crawler object afterwards
     */
    protected function findCrawler()
    {
        $hasCrawler = is_object($this->crawlerController);
        if (!$hasCrawler) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
139
140
    /**
     * Adds a page to the crawler queue by uid, without a schedule time.
     *
     * @param int $uid uid of the page
     */
    public function addPageToQueue($uid)
    {
        // Entries without a schedule time are queued with timestamp 0.
        $this->addPageToQueueTimed((int)$uid, 0);
    }
151
152
    /**
     * Limits the given configurations to the allowed configurations.
     *
     * When no allowed configurations have been set, the input is returned
     * unchanged. A non-array input is also returned unchanged, because the
     * caller checks is_array() only after filtering.
     *
     * @param array|mixed $configurations configurations indexed by configuration key
     *
     * @return array|mixed the configurations whose key is listed in the allowed configurations
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        // Drop every configuration whose key is not part of the current selection.
        // The comparison stays non-strict so numeric keys keep matching numeric strings.
        foreach (array_keys($configurations) as $confKey) {
            if (!in_array($confKey, $this->allowedConfigurations)) {
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
171
172
    /**
     * Adds a page to the crawler queue by uid and sets the timestamp at
     * which the page should be crawled.
     *
     * The page row is loaded, the crawler is asked for the URL
     * configurations of that row, disallowed configurations are filtered
     * out and each remaining configuration is handed to the crawler.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     *
     * @throws \Exception
     * @return void
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $uid = (int)$uid;
        $time = (int)$time;

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $this->filterUnallowedConfigurations(
            $crawler->getUrlsForPageRow($pageData)
        );
        $downloadUrls = [];
        $duplicateTrack = [];

        if (!is_array($configurations)) {
            return;
        }

        foreach ($configurations as $configuration) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                array_keys($this->getCrawlerProcInstructions())
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
215
216
    /**
     * Counts all entries in the database which are scheduled for a given page id and a schedule timestamp.
     *
     * Un-timed entries (schedule timestamp 0) can occur multiple times, so only
     * those that have not been executed yet (exec_time = 0) are counted. Timed
     * entries are counted regardless of exec_time: a record with that schedule
     * time is either still queued or has already been processed.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int number of matching queue entries
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $page_uid = (int)$page_uid;
        $schedule_timestamp = (int)$schedule_timestamp;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq(
                    'page_id',
                    $queryBuilder->createNamedParameter($page_uid, \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->eq(
                    'scheduled',
                    $queryBuilder->createNamedParameter($schedule_timestamp, \PDO::PARAM_INT)
                )
            );

        if ($schedule_timestamp === 0) {
            $queryBuilder->andWhere(
                $queryBuilder->expr()->eq(
                    'exec_time',
                    $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
                )
            );
        }

        // A COUNT(*) query yields a single row; the counted value is its first column.
        // rowCount() would report the number of result rows, not the count itself.
        return (int)$queryBuilder->execute()->fetchColumn(0);
    }
251
252
    /**
     * Returns the latest crawl timestamp (highest "scheduled" value) for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int the latest scheduled timestamp, or 0 when there is none
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = (int)$uid;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->selectLiteral('max(scheduled) as latest')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $queryBuilder->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $queryBuilder->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        // The select list holds exactly one column, so read it directly. The
        // value is NULL/false when nothing matched and is cast so that the
        // documented int return type actually holds.
        $latest = $queryBuilder->execute()->fetchColumn(0);

        return $latest ? (int)$latest : 0;
    }
294
295
    /**
     * Returns the queue rows of a page: when it has been scheduled for crawling, when the
     * scheduled crawl has been executed and with which set id. The result also contains
     * items that are scheduled but have not been crawled yet.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows; 0 means no limit
     *
     * @return array rows with the keys scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $uid = (int)$uid;
        $limit = (int)$limit;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $pageConstraint = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT)
        );

        $queryBuilder
            ->select('scheduled', 'exec_time', 'set_id')
            ->from($this->tableName)
            ->where($pageConstraint);

        if ($limit !== 0) {
            $queryBuilder->setMaxResults($limit);
        }

        return $queryBuilder->execute()->fetchAll();
    }
323
324
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array map of each registration's 'key' entry to its 'value' entry; empty when nothing is registered
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructions = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registration) {
            $instructions[$registration['key']] = $registration['value'];
        }

        return $instructions;
    }
340
341
    /**
     * Get queue statistics as reported by the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $assignedPending = $this->queueRepository->countAllAssignedPendingItems();
        $pending = $this->queueRepository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $pending
        ];
    }
355
356
    /**
     * Get queue statistics by configuration.
     *
     * The pending-item statistics grouped by configuration key are enriched
     * with the total number of queue entries per configuration, taken from
     * the sets that still have unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        // NOTE(review): assumes the repository returns plain arrays — confirm.
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; written by key so no foreach reference is left dangling,
        // and a configuration without a total falls back to 0 instead of
        // raising an undefined-index notice.
        foreach ($statistics as $key => $row) {
            $statistics[$key]['total'] = $totals[$row['configuration']] ?? 0;
        }

        return $statistics;
    }
374
375
    /**
     * Get the active processes count as reported by a freshly created
     * process repository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
388
389
    /**
     * Get the last processed entries (all fields) from the queue repository.
     *
     * @param int $limit passed through to the repository as the row limit
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $entries = $this->queueRepository->getLastProcessedEntries('*', $limit);

        return $entries;
    }
400
401
    /**
     * Get the current crawling speed.
     *
     * Walks the timestamps of the last processed entries, starting with the
     * first one, and counts entries until the gap between two consecutive
     * timestamps exceeds 60 seconds. The speed is computed from the counted
     * pages and the time span they cover.
     * NOTE(review): presumably the repository returns the timestamps newest
     * first — confirm against QueueRepository.
     *
     * @return float|int|false speed in pages per minute, or false when there is
     *                         not enough information (fewer than 10 usable
     *                         entries, or no measurable time span)
     */
    public function getCurrentCrawlingSpeed()
    {
        $minimumEntries = 10;
        // time between two entries that is considered "too old"
        $tooOldDelta = 60;

        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();
        if (count($lastProcessedEntries) < $minimumEntries) {
            // not enough information
            return false;
        }

        $startTime = $lastProcessedEntries[0];
        $compareValue = time();
        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < $minimumEntries) {
            // not enough information
            return false;
        }

        // $compareValue now holds the oldest timestamp that is not too old.
        $time = $startTime - $compareValue;
        if ($time <= 0) {
            // All counted entries share one timestamp: no span to divide by.
            return false;
        }

        return $pages / ($time / 60);
    }
443
444
    /**
     * Get some performance data.
     *
     * The range from $start to $end is split into slots of $resolution
     * seconds. For every slot the per-process data of the queue repository
     * is collected, and speeds (urls per minute) are derived per process,
     * per slot and for the whole range. A speed is only computed when more
     * than 5 urls were processed; otherwise the slot and overall speed are
     * 0 and a process gets no 'speed' key.
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp
     * @param integer $resolution slot length in seconds, must be greater than 0
     *
     * @return array data with the keys urlcount, start, end, duration, slots and speed
     *
     * @throws \Exception when $end is not after $start or $resolution is not positive
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [
            'urlcount' => 0,
            'start' => $start,
            'end' => $end,
            'duration' => $end - $start,
        ];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // A non-positive step would never advance the slot loop below.
            throw new \Exception('Resolution must be greater than zero', 1577836800);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Written by key so no foreach reference is left dangling.
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            // The slot length must be computed before dividing by the url count
            // (division binds tighter than subtraction); a zero-length slot
            // has no meaningful speed.
            $slotDuration = $slotEnd - $slotStart;
            $slotSpeed = 0;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $slotSpeed = 60 / ($slotDuration / $slotUrlCount);
            }

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
                'speed' => $slotSpeed,
            ];
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
504
}
505