Passed
Push — testing/dev-master ( cc645c...2d88f0 )
by Tomas Norre
60:30 queued 57:02
created

CrawlerApi::getPageRepository()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1.037

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
c 1
b 0
f 0
nc 1
nop 0
dl 0
loc 3
ccs 2
cts 3
cp 0.6667
crap 1.037
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Api;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2018 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Controller\CrawlerController;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use AOE\Crawler\Exception\CrawlerObjectException;
35
use AOE\Crawler\Exception\TimeStampException;
36
use TYPO3\CMS\Core\Database\ConnectionPool;
37
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
38
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
39
use TYPO3\CMS\Core\Utility\GeneralUtility;
40
use TYPO3\CMS\Extbase\Object\ObjectManager;
41
42
/**
43
 * Class CrawlerApi
44
 *
45
 * @package AOE\Crawler\Api
46
 * @deprecated Since v9.1.6 - This class will be removed when dropping support for TYPO3 9LTS and 10LTS
47
 */
48
class CrawlerApi
49
{
50
    /**
     * Repository resolved in the constructor; the queue statistic,
     * speed and performance methods of this class read from it.
     *
51
     * @var QueueRepository
52
     */
53
    protected $queueRepository;
54
55
    /**
     * Configuration keys to keep. While this list is empty,
     * filterUnallowedConfigurations() leaves the configurations untouched.
     *
56
     * @var array
57
     */
58
    protected $allowedConfigurations = [];
59
60
    /**
     * NOTE(review): not assigned or read by the methods visible in this
     * class (they create local query builders) - confirm whether subclasses use it.
     *
61
     * @var QueryBuilder
62
     */
63
    protected $queryBuilder;
64
65
    /**
     * Table queried by getLatestCrawlTimestampForPage() and getCrawlHistoryForPage().
     *
66
     * @var string
67
     */
68
    protected $tableName = 'tx_crawler_queue';
69
70
    /**
     * Crawler instance, created on first use by findCrawler().
     *
71
     * @var CrawlerController
72
     */
73
    protected $crawlerController;
74
75 10
    /**
     * Resolves the QueueRepository through the Extbase ObjectManager and
     * keeps it in $this->queueRepository.
     */
    public function __construct()
76
    {
77
        /** @var ObjectManager $objectManager */
78 10
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
79 10
        $this->queueRepository = $objectManager->get(QueueRepository::class);
0 ignored issues
show
Deprecated Code introduced by
The function TYPO3\CMS\Extbase\Object\ObjectManager::get() has been deprecated: since TYPO3 10.4, will be removed in version 12.0 ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

79
        $this->queueRepository = /** @scrutinizer ignore-deprecated */ $objectManager->get(QueueRepository::class);

This function has been deprecated. The supplier of the function has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the function will be removed and what other function to use instead.

Loading history...
80 10
    }
81
82
    /**
     * Each crawler run has a set id; this facade method writes the
     * given id onto the crawler object.
     *
     * @throws \Exception
     */
    public function overwriteSetId(int $id): void
    {
        $crawler = $this->findCrawler();
        $crawler->setID = $id;
    }
92
93
    /**
     * Restricts the configuration selection to the given set of
     * configuration keys (see filterUnallowedConfigurations()).
     */
    public function setAllowedConfigurations(array $allowedConfigurations): void
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
101
102
    /**
     * Returns the configuration keys the selection is currently limited to.
     *
     * @return array
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
109
110
    /**
     * Reads the set id from the crawler object.
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
119
120
    /**
     * Queues a page by uid without a scheduled time.
     *
     * @param int $uid uid
     * @codeCoverageIgnore
     */
    public function addPageToQueue($uid): void
    {
        // entries that are not timed are queued with timestamp 0
        $this->addPageToQueueTimed((int) $uid, 0);
    }
132
133
    /**
     * Adds a page to the crawler queue by uid and sets the timestamp at
     * which the page should be crawled.
     *
     * The page row is resolved to its crawler configurations, the
     * configurations are reduced to the allowed ones and every remaining
     * configuration is handed to the crawler for queueing.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     *
     * @throws \Exception
     */
    public function addPageToQueueTimed($uid, $time): void
    {
        $uid = (int) $uid;
        $time = (int) $time;

        $crawler = $this->findCrawler();
        $pageData = $this->getPageRepository()->getPage($uid, true);
        // filterUnallowedConfigurations() is declared to return an array,
        // so no additional is_array() guard is required before iterating
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));
        $downloadUrls = [];
        $duplicateTrack = [];
        // identical for every configuration, therefore resolved once
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $cv) {
            //enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            //reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
175
176
    /**
     * Returns the latest crawl timestamp ("scheduled" value) queued for a page.
     *
     * @param int $uid uid id of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries whose exec_time is 0
     *
     * @return int the highest matching "scheduled" value, 0 when there is none
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = (int) $uid;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query = $queryBuilder
            ->from($this->tableName)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                // bind the uid as integer, in line with getCrawlHistoryForPage()
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        $row = $query->execute()->fetch(0);

        // max() yields NULL when no entry matches; a missing row is treated the same way
        return is_array($row) ? (int) ($row['latest'] ?? 0) : 0;
    }
218
219
    /**
     * Returns the queue rows of a page: when the page has been scheduled for crawling
     * and at what time the scheduled crawl has been executed. Rows that are scheduled
     * but have not been crawled yet are part of the result as well.
     *
     * @return array array with the crawl-history of a page => 0 : scheduled time , 1 : executed_time, 2 : set_id
     */
    public function getCrawlHistoryForPage(int $uid, int $limit = 0)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $pageConstraint = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT)
        );

        $query = $queryBuilder
            ->from($this->tableName)
            ->select('scheduled', 'exec_time', 'set_id')
            ->where($pageConstraint);

        // a limit of 0 means "no limit"
        if ($limit !== 0) {
            $query->setMaxResults($limit);
        }

        return $query->execute()->fetchAll();
    }
244
245
    /**
     * Collects the pending-item counters of the queue.
     */
    public function getQueueStatistics(): array
    {
        $assignedPending = $this->queueRepository->countAllAssignedPendingItems();
        $allPending = $this->queueRepository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $allPending,
        ];
    }
255
256
    /**
     * Get queue statistics by configuration
     *
     * The pending-item statistics are enriched with a 'total' entry taken from
     * the total queue entries of all sets that still have unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>)
     * @codeCoverageIgnore
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays
        foreach ($statistics as &$value) {
            // a configuration without an entry in $totals has no counted entries
            $value['total'] = $totals[$value['configuration']] ?? 0;
        }
        // break the reference so later writes to $value cannot alter the last row
        unset($value);

        return $statistics;
    }
275
276
    /**
     * Counts the processes the ProcessRepository reports as active.
     *
     * @codeCoverageIgnore
     */
    public function getActiveProcessesCount(): int
    {
        return GeneralUtility::makeInstance(ProcessRepository::class)
            ->findAllActive()
            ->count();
    }
285
286
    /**
     * Delegates to the queue repository to fetch the last processed entries,
     * passing the given limit through.
     *
     * @codeCoverageIgnore
     */
    public function getLastProcessedQueueEntries(int $limit): array
    {
        $entries = $this->queueRepository->getLastProcessedEntries($limit);

        return $entries;
    }
293
294
    /**
     * Get current crawling speed
     *
     * Walks the timestamps of the last processed entries, starting at the
     * first one, and counts entries as long as the gap to the previous
     * timestamp (initially the current time) does not exceed 60 seconds.
     * The speed is the number of counted entries per minute of the covered
     * time span.
     *
     * @return int|float|bool the speed, or false when there is not enough information
     * @codeCoverageIgnore
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        // time between two entries is "too old"
        $tooOldDelta = 60;

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // all counted entries share one timestamp (or are not ordered as expected):
            // no time span to relate the page count to, and dividing would fail
            return false;
        }

        return $pages / ($time / 60);
    }
335
336
    /**
     * Get some performance data
     *
     * The range from $start to $end is split into slots of $resolution
     * length. For every slot the process data is read from the queue
     * repository and a speed (urls per 60 time units) is derived per process,
     * per slot and for the whole range. Speeds are only calculated when more
     * than 5 urls have been counted, otherwise they are reported as 0
     * (per process the 'speed' key is then left out).
     *
     * @param int $start
     * @param int $end
     * @param int $resolution length of one slot, must be at least 1
     *
     * @return array data
     *
     * @throws TimeStampException when $end is not after $start
     * @throws \InvalidArgumentException when $resolution is smaller than 1
     * @codeCoverageIgnore
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new TimeStampException('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // a non-positive step would never advance $slotStart and loop forever
            throw new \InvalidArgumentException('Resolution must be at least 1', 1611161612);
        }

        $data['slots'] = [];

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            foreach ($slotData as &$processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $processData['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }
            // break the reference before $slotData is stored and reassigned
            unset($processData);

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // the slot length has to be taken as a whole before it is divided by the url count;
            // a zero-length slot (resolution of 1) cannot provide a speed
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
397
398
    /**
     * Returns the crawler instance of this facade. On first use the
     * instance is created and receives a set id derived from microtime().
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws CrawlerObjectException when no crawler object is available
     */
    protected function findCrawler()
    {
        $crawlerMissing = ! is_object($this->crawlerController);
        if ($crawlerMissing) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (! is_object($this->crawlerController)) {
            throw new CrawlerObjectException('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
417
418
    /**
     * Reduces the given configurations to those whose key is listed in
     * $this->allowedConfigurations. Without any allowed configuration set,
     * the input is returned unchanged.
     */
    protected function filterUnallowedConfigurations(array $configurations): array
    {
        if (count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        // keep only the configurations that match the current selection
        return array_filter(
            $configurations,
            function ($confKey): bool {
                return in_array($confKey, $this->allowedConfigurations, true);
            },
            ARRAY_FILTER_USE_KEY
        );
    }
435
436
    /**
     * Builds a key => value map from the processing instructions registered
     * in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructionsByKey = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registered) {
            $instructionsByKey[$registered['key']] = $registered['value'];
        }

        return $instructionsByKey;
    }
450
451 4
    /**
     * Resolves the PageRepository through the Extbase ObjectManager.
     */
    private function getPageRepository(): PageRepository
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);

        return $objectManager->get(PageRepository::class);
    }
455
}
456