Passed
Push — deprecate/crawlerapi ( 17ac42 )
by Tomas Norre
05:43
created

CrawlerApi   B

Complexity

Total Complexity 43

Size/Duplication

Total Lines 408
Duplicated Lines 0 %

Test Coverage

Coverage 56.49%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 142
c 1
b 0
f 0
dl 0
loc 408
ccs 74
cts 131
cp 0.5649
rs 8.96
wmc 43

18 Methods

Rating   Name   Duplication   Size   Complexity  
A setAllowedConfigurations() 0 3 1
A getAllowedConfigurations() 0 3 1
A getSetId() 0 3 1
A overwriteSetId() 0 3 1
A addPageToQueue() 0 5 1
A filterUnallowedConfigurations() 0 12 4
A addPageToQueueTimed() 0 35 3
A getCurrentCrawlingSpeed() 0 34 5
A getQueueStatistics() 0 5 1
A getLastProcessedQueueEntries() 0 3 1
A getLatestCrawlTimestampForPage() 0 32 4
A getCrawlHistoryForPage() 0 17 2
A __construct() 0 5 1
B getPerformanceData() 0 48 8
A findCrawler() 0 11 3
A getQueueStatisticsByConfiguration() 0 12 2
A getCrawlerProcInstructions() 0 10 3
A getActiveProcessesCount() 0 4 1

How to fix   Complexity   

Complex Class

Complex classes like CrawlerApi often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use CrawlerApi, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Api;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2018 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Controller\CrawlerController;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use AOE\Crawler\Exception\CrawlerObjectException;
35
use AOE\Crawler\Exception\TimeStampException;
36
use TYPO3\CMS\Core\Database\ConnectionPool;
37
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
38
use TYPO3\CMS\Core\Utility\GeneralUtility;
39
use TYPO3\CMS\Extbase\Object\ObjectManager;
40
use TYPO3\CMS\Frontend\Page\PageRepository;
41
42
/**
43
 * Class CrawlerApi
44
 *
45
 * @package AOE\Crawler\Api
46
 * @deprecated This class will be removed when dropping support for TYPO3 9LTS and 10LTS
47
 */
48
class CrawlerApi
49
{
50
    /**
51
     * @var QueueRepository
52
     */
53
    protected $queueRepository;
54
55
    /**
56
     * @var array
57
     */
58
    protected $allowedConfigurations = [];
59
60
    /**
61
     * @var QueryBuilder
62
     */
63
    protected $queryBuilder;
64
65
    /**
66
     * @var string
67
     */
68
    protected $tableName = 'tx_crawler_queue';
69
70
    /**
71
     * @var CrawlerController
72
     */
73
    protected $crawlerController;
74
75 10
    /**
     * Obtains the queue repository through the Extbase object manager.
     */
    public function __construct()
    {
        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(QueueRepository::class);
    }
81
82
    /**
     * Each crawler run has a set id; this facade method forwards the
     * given id to the crawler object.
     *
     * @throws \Exception
     */
    public function overwriteSetId(int $id): void
    {
        $crawler = $this->findCrawler();
        $crawler->setID = $id;
    }
92
93
    /**
     * Restricts the configuration selection to the given set of
     * configurations. The list is consulted by filterUnallowedConfigurations().
     */
    public function setAllowedConfigurations(array $allowedConfigurations): void
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
101
102
    /**
     * Returns the list previously stored via setAllowedConfigurations()
     * (an empty array by default).
     *
     * @return array
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
109
110
    /**
     * Reads the setID from the crawler object.
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
119
120
    /**
     * Queues a page, identified by its uid, without a crawl time.
     *
     * @param int $uid uid
     * @codeCoverageIgnore
     */
    public function addPageToQueue($uid): void
    {
        // Untimed entries are stored with timestamp 0
        $this->addPageToQueueTimed((int) $uid, 0);
    }
132
133
    /**
     * Adds a page to the crawler queue by uid and sets a
     * timestamp when the page should be crawled.
     *
     * The page row is loaded, the URL configurations for that row are computed
     * by the crawler, reduced to the allowed configurations, and every remaining
     * configuration is handed to the crawler for queueing.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     *
     * @throws \Exception
     */
    public function addPageToQueueTimed($uid, $time): void
    {
        $uid = intval($uid);
        $time = intval($time);

        $crawler = $this->findCrawler();
        /**
         * Todo: Switch back to getPage(); when dropping support for TYPO3 9 LTS - TNM
         * This switch to getPage_noCheck() is needed as TYPO3 9 LTS doesn't return dokType < 200, therefore automatically
         * adding pages to crawler queue when editing page-titles from the page tree directly was not working.
         */
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage_noCheck($uid, true);
        $configurations = $crawler->getUrlsForPageRow($pageData);
        // filterUnallowedConfigurations() is declared to return an array, so no is_array() guard is needed
        $configurations = $this->filterUnallowedConfigurations($configurations);
        $downloadUrls = [];
        $duplicateTrack = [];

        // The registered processing instructions do not change while iterating; read them once
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $cv) {
            //enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            //reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
180
181
    /**
     * Returns the latest crawl timestamp for a page, i.e. the maximum
     * "scheduled" value of the queue entries of that page.
     *
     * @param int $uid uid id of the page
     * @param bool $future_crawldates_only only consider entries whose "scheduled" value is greater than time()
     * @param bool $unprocessed_only only consider entries whose "exec_time" is 0
     *
     * @return int the latest scheduled timestamp, or 0 when no matching entry exists
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = intval($uid);

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query = $queryBuilder
            ->from($this->tableName)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                // Bind as integer, consistent with getCrawlHistoryForPage()
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        $row = $query->execute()->fetch(0);

        // "latest" is empty when nothing matched, and $row itself may not be an array
        // if nothing could be fetched; both cases are reported as 0.
        return (int) ($row['latest'] ?? 0);
    }
223
224
    /**
     * Returns the queue entries of a page with the time each crawl was scheduled for
     * and the time it was executed. Entries that are scheduled but have not been
     * crawled yet are included as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of entries; 0 means no limit
     *
     * @return array crawl history of the page; each row holds scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage(int $uid, int $limit = 0)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $pageIdParameter = $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT);

        $historyQuery = $queryBuilder
            ->from($this->tableName)
            ->select('scheduled', 'exec_time', 'set_id')
            ->where($queryBuilder->expr()->eq('page_id', $pageIdParameter));

        if ($limit !== 0) {
            $historyQuery->setMaxResults($limit);
        }

        return $historyQuery->execute()->fetchAll();
    }
249
250
    /**
     * Returns the queue counters reported by the queue repository:
     * 'assignedButUnprocessed' and 'unprocessed'.
     */
    public function getQueueStatistics(): array
    {
        $assignedPending = $this->queueRepository->countAllAssignedPendingItems();
        $pending = $this->queueRepository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $pending,
        ];
    }
260
261
    /**
     * Get queue statistics by configuration
     *
     * The pending-item counts grouped by configuration are enriched with a 'total'
     * entry taken from the totals of all sets that still have unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     * @codeCoverageIgnore
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; written by key so that no reference variable outlives the loop
        foreach ($statistics as $index => $row) {
            // A configuration without an entry in $totals is reported with a total of 0
            $statistics[$index]['total'] = $totals[$row['configuration']] ?? 0;
        }

        return $statistics;
    }
280
281
    /**
     * Counts the processes the process repository reports as active.
     *
     * @codeCoverageIgnore
     */
    public function getActiveProcessesCount(): int
    {
        return GeneralUtility::makeInstance(ProcessRepository::class)
            ->findAllActive()
            ->count();
    }
290
291
    /**
     * Delegates to the queue repository to fetch the last processed entries.
     *
     * @param int $limit passed through to the repository
     * @return array
     * @codeCoverageIgnore
     */
    public function getLastProcessedQueueEntries(int $limit): array
    {
        $entries = $this->queueRepository->getLastProcessedEntries($limit);

        return $entries;
    }
300
301
    /**
     * Get current crawling speed
     *
     * Walks the timestamps of the last processed entries, starting from the first
     * one, and counts entries until the gap between two consecutive timestamps
     * (or between now and the first one) exceeds 60. The speed is the number of
     * counted entries divided by the covered time span expressed in units of 60
     * (presumably pages per minute with timestamps in seconds - confirm against
     * QueueRepository::getLastProcessedEntriesTimestamps()).
     *
     * @return int|float|bool the speed, or false when there is not enough information
     * @codeCoverageIgnore
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        // time between two entries is "too old"
        $tooOldDelta = 60;

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }
        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // all counted entries share one timestamp: no measurable time span,
            // and dividing by it would be a division by zero
            return false;
        }

        return $pages / ($time / 60);
    }
342
343
    /**
     * Get some performance data
     *
     * The range from $start to $end is walked in steps of $resolution. For every
     * slot the per-process data is read from the queue repository and aggregated.
     * All speeds are computed as 60 / (duration / urlcount), i.e. URLs per 60 time
     * units (presumably URLs per minute with timestamps in seconds - confirm), and
     * only when more than 5 URLs were processed.
     *
     * @param int $start start timestamp
     * @param int $end end timestamp
     * @param int $resolution width of a slot; must be at least 1
     *
     * @return array data with the keys 'urlcount', 'start', 'end', 'duration', 'slots' and 'speed'
     *
     * @throws TimeStampException if the end timestamp is not after the start timestamp
     * @throws \InvalidArgumentException if $resolution is smaller than 1
     * @codeCoverageIgnore
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new TimeStampException('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // a non-positive step would never advance $slotStart and loop forever
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1604922571);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            foreach ($slotData as &$processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $processData['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }
            // break the reference so later writes to $processData cannot alter $slotData
            unset($processData);

            $data['urlcount'] += $slotUrlCount;

            // Parenthesised so the slot duration (not only $slotStart) is divided by the
            // URL count; a zero-length slot yields speed 0 instead of a division by zero.
            $slotDuration = $slotEnd - $slotStart;
            $slotSpeed = 0;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $slotSpeed = 60 * 1 / ($slotDuration / $slotUrlCount);
            }

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
                'speed' => $slotSpeed,
            ];
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
404
405
    /**
     * Returns the crawler controller held by this API object, creating it on
     * first use. A freshly created controller gets a setID derived from
     * microtime().
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws CrawlerObjectException if no crawler object is available
     */
    protected function findCrawler()
    {
        $hasCrawler = is_object($this->crawlerController);
        if (! $hasCrawler) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (! is_object($this->crawlerController)) {
            throw new CrawlerObjectException('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
424
425
    /**
     * Keeps only those configurations whose key is contained in the list of
     * allowed configurations. When no allowed configurations are set, the input
     * is returned unchanged.
     */
    protected function filterUnallowedConfigurations(array $configurations): array
    {
        $allowed = $this->allowedConfigurations;
        if (count($allowed) === 0) {
            return $configurations;
        }

        // drop every configuration whose key is not part of the current selection
        return array_filter(
            $configurations,
            static function ($confKey) use ($allowed): bool {
                return in_array($confKey, $allowed, true);
            },
            ARRAY_FILTER_USE_KEY
        );
    }
442
443
    /**
     * Builds a key => value map from the processing instructions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     * Returns an empty array when nothing is registered.
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructionMap = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registered) {
            $instructionMap[$registered['key']] = $registered['value'];
        }

        return $instructionMap;
    }
457
}
458