Passed
Push — features/addFlushedPagesToCraw... ( 44f5f4...d700c8 )
by Tomas Norre
08:39
created

CrawlerApi::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1.0156

Importance

Changes 0
Metric Value
cc 1
eloc 2
c 0
b 0
f 0
nc 1
nop 0
dl 0
loc 5
ccs 3
cts 4
cp 0.75
crap 1.0156
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Api;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2018 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Controller\CrawlerController;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use AOE\Crawler\Exception\CrawlerObjectException;
35
use AOE\Crawler\Exception\TimeStampException;
36
use TYPO3\CMS\Core\Database\ConnectionPool;
37
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
38
use TYPO3\CMS\Core\Utility\GeneralUtility;
39
use TYPO3\CMS\Extbase\Object\ObjectManager;
40
use TYPO3\CMS\Frontend\Page\PageRepository;
41
42
/**
43
 * Class CrawlerApi
44
 *
45
 * @package AOE\Crawler\Api
46
 */
47
class CrawlerApi
48
{
49
    /**
50
     * @var QueueRepository
51
     */
52
    protected $queueRepository;
53
54
    /**
55
     * @var array
56
     */
57
    protected $allowedConfigurations = [];
58
59
    /**
60
     * @var QueryBuilder
61
     */
62
    protected $queryBuilder;
63
64
    /**
65
     * @var string
66
     */
67
    protected $tableName = 'tx_crawler_queue';
68
69
    /**
70
     * @var CrawlerController
71
     */
72
    protected $crawlerController;
73
74 10
    /**
     * Resolves the queue repository through the Extbase object manager
     * and stores it for the statistics and history methods of this API.
     */
    public function __construct()
    {
        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(QueueRepository::class);
    }
80
81
    /**
     * Overrides the set id of the crawler.
     *
     * The given id is written directly to the `setID` property of the
     * crawler instance returned by findCrawler().
     *
     * @throws \Exception
     */
    public function overwriteSetId(int $id): void
    {
        $crawler = $this->findCrawler();
        $crawler->setID = $id;
    }
91
92
    /**
     * Restricts the configuration selection to the given set of
     * configuration keys.
     *
     * The list is consulted by filterUnallowedConfigurations(); an empty
     * array means that no restriction is applied.
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations): void
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
100
101
    /**
     * Returns the configuration keys the selection is currently limited to.
     *
     * @return array the allowed configuration keys; empty when no restriction was set
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
108
109
    /**
     * Returns the set id of the crawler.
     *
     * The value is read from the `setID` property of the crawler instance
     * returned by findCrawler().
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
118
119
    /**
     * Adds a page to the crawler queue by its uid, without a schedule.
     *
     * @param int $uid uid of the page
     */
    public function addPageToQueue($uid): void
    {
        // Entries without a schedule are queued with the timestamp 0.
        $this->addPageToQueueTimed((int) $uid, 0);
    }
130
131
    /**
     * Adds a page to the crawler queue by uid and sets the timestamp
     * at which the page should be crawled.
     *
     * The page row is loaded, the URL configurations for it are fetched from
     * the crawler, reduced to the allowed configurations and then handed to
     * the crawler one by one so that it writes the queue entries.
     *
     * @param int $uid uid of the page
     * @param int $time timestamp at which the page should be crawled
     *
     * @throws \Exception
     */
    public function addPageToQueueTimed($uid, $time): void
    {
        $uid = (int) $uid;
        $time = (int) $time;

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);

        // filterUnallowedConfigurations() accepts and returns an array, so no
        // additional is_array() check is needed before iterating.
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));

        // The registered processing instructions do not change per configuration,
        // so their keys are resolved once instead of on every iteration.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        $downloadUrls = [];
        $duplicateTrack = [];

        foreach ($configurations as $configuration) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
173
174
    /**
     * Returns the latest scheduled crawl timestamp for a page.
     *
     * Selects max(scheduled) from the queue table for the given page id,
     * optionally restricted to entries scheduled in the future and/or to
     * entries whose exec_time is still 0.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int the latest scheduled timestamp, or 0 when no entry matches
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = (int) $uid;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query = $queryBuilder
            ->from($this->tableName)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                // Bind the page id as integer, consistent with getCrawlHistoryForPage().
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        $row = $query->execute()->fetch(0);

        // fetch() returns false when there is no row, and max() is NULL when
        // no entry matched; both cases are reported as 0.
        if (! is_array($row) || empty($row['latest'])) {
            return 0;
        }

        return (int) $row['latest'];
    }
216
217
    /**
     * Returns the crawl history of a page: when it was scheduled for crawling,
     * when the scheduled crawl was executed and to which set it belongs.
     * Entries that are scheduled but have not been crawled yet are included.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows to return; 0 means no limit
     *
     * @return array rows with the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage(int $uid, int $limit = 0)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $pageConstraint = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT)
        );

        $historyQuery = $queryBuilder
            ->select('scheduled', 'exec_time', 'set_id')
            ->from($this->tableName)
            ->where($pageConstraint);

        if ($limit !== 0) {
            $historyQuery->setMaxResults($limit);
        }

        return $historyQuery->execute()->fetchAll();
    }
242
243
    /**
     * Returns the overall queue statistics.
     *
     * @return array with the keys 'assignedButUnprocessed' and 'unprocessed',
     *               both taken from the queue repository counters
     */
    public function getQueueStatistics(): array
    {
        $assignedPendingCount = $this->queueRepository->countAllAssignedPendingItems();
        $pendingCount = $this->queueRepository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPendingCount,
            'unprocessed' => $pendingCount,
        ];
    }
253
254
    /**
     * Returns the queue statistics grouped by configuration.
     *
     * The pending-item statistics from the queue repository are enriched with
     * a 'total' entry, looked up by configuration key in the totals of all
     * sets that still have unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; written by key so that no reference to the last
        // element is left behind after the loop.
        foreach ($statistics as $index => $row) {
            // A configuration without a totals entry is reported with a total of 0.
            $statistics[$index]['total'] = $totals[$row['configuration']] ?? 0;
        }

        return $statistics;
    }
272
273
    /**
     * Returns the number of active processes as counted by a freshly
     * created process repository.
     */
    public function getActiveProcessesCount(): int
    {
        return (new ProcessRepository())->countActive();
    }
282
283
    /**
     * Returns the last processed queue entries from the queue repository.
     *
     * @param int $limit passed through to the repository as the number of entries to fetch
     */
    public function getLastProcessedQueueEntries(int $limit): array
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntries($limit);

        return $lastProcessedEntries;
    }
287
288
    /**
     * Returns the current crawling speed in pages per minute.
     *
     * The timestamps of the last processed entries are walked starting at the
     * first one; the walk stops at the first gap of more than 60 seconds
     * between two consecutive timestamps (or between the first timestamp and
     * now). The speed is the number of pages in that run divided by the
     * minutes the run spans.
     *
     * @return int|float|bool the speed, or false when there is not enough information
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        // NOTE(review): presumably the repository returns the newest timestamp first — confirm.
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries carry the same timestamp (or are not in
            // descending order): no time span to derive a speed from, and
            // dividing by it would raise a division-by-zero error.
            return false;
        }

        return $pages / ($time / 60);
    }
327
328
    /**
     * Collects performance data for the time range between $start and $end.
     *
     * The range is cut into slots of $resolution seconds. For every slot the
     * per-process data is fetched from the queue repository; each process with
     * more than 5 URLs and a positive duration gets a 'speed' (URLs per
     * minute). The same speed is calculated per slot and for the whole range;
     * it is 0 when 5 or fewer URLs were processed.
     *
     * @param int $start start timestamp
     * @param int $end end timestamp
     * @param int $resolution slot length in seconds, must be at least 1
     *
     * @return array with the keys urlcount, start, end, duration, slots and speed;
     *               slots is keyed by slot end and holds amountProcesses, urlcount,
     *               processes and speed
     *
     * @throws TimeStampException if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is smaller than 1
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new TimeStampException('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // A non-positive resolution would never advance the slot loop below.
            throw new \InvalidArgumentException('Resolution must be at least 1 second');
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Written by key so that no reference to the last element survives the loop.
            foreach ($slotData as $processKey => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processKey]['speed'] = 60 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // The slot duration has to be divided by the URL count as a whole;
            // without the parentheses only $slotStart was divided.
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 / ($slotDuration / $slotUrlCount);
            } else {
                // Too few URLs, or a zero-length slot (e.g. a resolution of 1).
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
388
389
    /**
     * Returns the crawler controller used by this API, creating it on first use.
     *
     * A newly created controller gets a set id derived from the current
     * microtime; afterwards the same instance is returned on every call.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws CrawlerObjectException if no crawler object is available after the creation attempt
     */
    protected function findCrawler()
    {
        if (! is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (! is_object($this->crawlerController)) {
            throw new CrawlerObjectException('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
408
409
    /**
     * Reduces the given configurations to those whose key is contained in
     * the allowed configurations. When no allowed configurations are set,
     * the input is returned unchanged.
     *
     * @param array $configurations configurations indexed by configuration key
     *
     * @return array the remaining configurations, with their keys preserved
     */
    protected function filterUnallowedConfigurations(array $configurations): array
    {
        if (count($this->allowedConfigurations) === 0) {
            // no restriction configured
            return $configurations;
        }

        $allowedKeys = $this->allowedConfigurations;

        // keep only the configurations that match the current selection
        return array_filter(
            $configurations,
            static function ($confKey) use ($allowedKeys): bool {
                return in_array($confKey, $allowedKeys, true);
            },
            ARRAY_FILTER_USE_KEY
        );
    }
426
427
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the 'value' of every registration, indexed by its 'key';
     *               empty when nothing is registered
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructionsByKey = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registration) {
            $instructionsByKey[$registration['key']] = $registration['value'];
        }

        return $instructionsByKey;
    }
441
}
442