Passed
Pull Request — master (#674)
by Tomas Norre
10:43 queued 07:11
created

CrawlerApi::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
c 0
b 0
f 0
nc 1
nop 0
dl 0
loc 5
ccs 3
cts 3
cp 1
crap 1
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Api;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2018 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Controller\CrawlerController;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use AOE\Crawler\Exception\CrawlerObjectException;
35
use AOE\Crawler\Exception\TimeStampException;
36
use TYPO3\CMS\Core\Database\ConnectionPool;
37
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
38
use TYPO3\CMS\Core\Utility\GeneralUtility;
39
use TYPO3\CMS\Extbase\Object\ObjectManager;
40
use TYPO3\CMS\Frontend\Page\PageRepository;
41
42
/**
43
 * Class CrawlerApi
44
 *
45
 * @package AOE\Crawler\Api
46
 */
47
class CrawlerApi
48
{
49
    /**
50
     * Repository the queue statistics and processed-entry lookups of this API are read from.
     * It is resolved through the object manager in the constructor.
     *
     * @var QueueRepository
51
     */
52
    protected $queueRepository;
53
54
    /**
55
     * Configuration keys the configuration selection is limited to.
     * filterUnallowedConfigurations() only filters when this list is not empty.
     *
     * @var array
56
     */
57
    protected $allowedConfigurations = [];
58
59
    /**
60
     * NOTE(review): not referenced by any method visible in this class - confirm
     * whether subclasses still rely on it.
     *
     * @var QueryBuilder
61
     */
62
    protected $queryBuilder;
63
64
    /**
65
     * Name of the queue table the timestamp and history queries of this class run against.
     *
     * @var string
66
     */
67
    protected $tableName = 'tx_crawler_queue';
68
69
    /**
70
     * Crawler instance created on first use by findCrawler() and reused afterwards.
     *
     * @var CrawlerController
71
     */
72
    protected $crawlerController;
73
74 10
    /**
     * Resolves the queue repository through the Extbase object manager.
     */
    public function __construct()
    {
        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(QueueRepository::class);
    }
80
81
    /**
     * Each crawler run has a set id; this facade method hands the given
     * id over to the crawler object.
     *
     * @param int $id set id to store on the crawler
     *
     * @throws \Exception
     */
    public function overwriteSetId(int $id): void
    {
        $crawler = $this->findCrawler();
        $crawler->setID = $id;
    }
91
92
    /**
     * Limits the configuration selection to the given set of configurations.
     *
     * The list is compared against configuration keys when pages are added
     * to the queue; an empty list leaves the selection unrestricted.
     *
     * @param array $allowedConfigurations configuration keys that stay selectable
     */
    public function setAllowedConfigurations(array $allowedConfigurations): void
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
100
101
    /**
     * Returns the configurations the selection is currently limited to.
     *
     * @return array the stored list; empty when no limit has been set
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
108
109
    /**
     * Reads the set id from the crawler object.
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
118
119
    /**
     * Adds a page to the crawler queue by its uid, without a scheduled time.
     *
     * @param int $uid uid of the page
     */
    public function addPageToQueue($uid): void
    {
        // Elements that are not timed are queued with timestamp 0.
        $this->addPageToQueueTimed((int) $uid, 0);
    }
130
131
    /**
     * Adds a page to the crawler queue by uid and sets a
     * timestamp when the page should be crawled.
     *
     * The page record is loaded, its crawler configurations are reduced to
     * the allowed ones, and every remaining configuration is handed to the
     * crawler so that the queue entries get written.
     *
     * @param int $uid uid of the page
     * @param int $time timestamp at which the page should be crawled
     *
     * @throws \Exception
     */
    public function addPageToQueueTimed($uid, $time): void
    {
        $uid = (int) $uid;
        $time = (int) $time;

        $crawler = $this->findCrawler();

        // Todo: Switch back to getPage(); when dropping support for TYPO3 9 LTS - TNM
        // This switch to getPage_noCheck() is needed as TYPO3 9 LTS doesn't return dokType < 200,
        // therefore automatically adding pages to crawler queue when editing page-titles from the
        // page tree directly was not working.
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage_noCheck($uid, true);

        // filterUnallowedConfigurations() is declared to return an array, so no is_array() guard is needed.
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));

        $downloadUrls = [];
        $duplicateTrack = [];
        // The registered processing instructions do not change while iterating, so read them once.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $cv) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            // NOTE(review): 300, true and false are positional arguments whose meaning is
            // defined by CrawlerController::urlListFromUrlArray() - confirm there.
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
178
179
    /**
     * Returns the latest scheduled crawl timestamp of a page.
     *
     * The value is the maximum of the "scheduled" column over all queue
     * entries of the page, optionally narrowed by the two flags.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries whose exec_time is 0
     *
     * @return int the latest timestamp, or 0 when no entry matches
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = (int) $uid;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $query = $queryBuilder
            ->from($this->tableName)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                // Bind the uid as an integer, consistent with getCrawlHistoryForPage().
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        // NOTE(review): 0 is passed as fetch mode exactly as before - presumably the driver default; confirm.
        $row = $query->execute()->fetch(0);

        // "??" also covers a false fetch result and a NULL aggregate, both of which yield 0.
        return (int) ($row['latest'] ?? 0);
    }
221
222
    /**
     * Returns the queue entries of a page with the time each crawl was scheduled for and
     * the time it was executed. Entries that are scheduled but have not been crawled yet
     * are part of the result as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows; 0 returns all rows
     *
     * @return array rows holding the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage(int $uid, int $limit = 0)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $pageConstraint = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT)
        );

        $statement = $queryBuilder
            ->from($this->tableName)
            ->select('scheduled', 'exec_time', 'set_id')
            ->where($pageConstraint);

        if ($limit !== 0) {
            $statement->setMaxResults($limit);
        }

        return $statement->execute()->fetchAll();
    }
247
248
    /**
     * Collects the pending-item counters of the queue.
     *
     * @return array with the keys 'assignedButUnprocessed' and 'unprocessed'
     */
    public function getQueueStatistics(): array
    {
        $assignedPending = $this->queueRepository->countAllAssignedPendingItems();
        $pending = $this->queueRepository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $pending,
        ];
    }
258
259
    /**
     * Get queue statistics by configuration
     *
     * Every entry returned by the repository's pending-item statistics is
     * extended by a 'total' key taken from the per-configuration totals of
     * the sets that still have unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; iterate by key so no reference is left dangling afterwards
        foreach ($statistics as $index => $entry) {
            // A configuration without a totals row counts as 0 instead of raising an undefined-index notice.
            $statistics[$index]['total'] = $totals[$entry['configuration']] ?? 0;
        }

        return $statistics;
    }
277
278
    /**
     * Counts the processes the process repository reports as active.
     */
    public function getActiveProcessesCount(): int
    {
        return GeneralUtility::makeInstance(ProcessRepository::class)
            ->findAllActive()
            ->count();
    }
286
287
    /**
     * Returns the last processed queue entries as delivered by the queue repository.
     *
     * @param int $limit limit forwarded to the repository
     */
    public function getLastProcessedQueueEntries(int $limit): array
    {
        $entries = $this->queueRepository->getLastProcessedEntries($limit);

        return $entries;
    }
291
292
    /**
     * Get current crawling speed
     *
     * Walks the timestamps of the last processed entries, starting at the
     * first one, and counts entries as long as the gap to the previous
     * entry (initially: to the current time) is at most 60 seconds. The
     * speed is the number of counted entries per minute over the time
     * span they cover.
     *
     * @return int|float|bool entries per minute, or false when there is not enough information
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        // time between two entries is "too old"
        $tooOldDelta = 60;

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp (or are not ordered newest first):
            // no time span to divide by, so no speed can be derived.
            return false;
        }

        return $pages / ($time / 60);
    }
332
333
    /**
     * Get some performance data
     *
     * The range from $start to $end is cut into slots of $resolution
     * seconds. For every slot the process data is read from the queue
     * repository, and url counts and speeds (urls per minute) are
     * calculated per process, per slot and for the whole range. A speed
     * is only calculated for more than 5 urls; otherwise it is 0 (slot,
     * total) or left unset (process).
     *
     * @param int $start start timestamp
     * @param int $end end timestamp
     * @param int $resolution slot length in seconds, must be positive
     *
     * @return array data with the keys urlcount, start, end, duration, slots and speed
     *
     * @throws TimeStampException when $end is not after $start
     * @throws \InvalidArgumentException when $resolution is not positive
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new TimeStampException('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // A non-positive step would never advance $slotStart and loop forever.
            throw new \InvalidArgumentException('Resolution must be a positive number of seconds');
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Iterate by key instead of by reference so nothing stays bound after the loop.
            foreach ($slotData as $processKey => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processKey]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // The slot length has to be divided by the url count as a whole; without the
            // parentheses only $slotStart was divided. A zero-length slot yields speed 0.
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
393
394
    /**
     * Returns the crawler instance used by this API, creating it on first use.
     *
     * A newly created instance gets a set id derived from the current microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws CrawlerObjectException when no crawler object is available after creation
     */
    protected function findCrawler()
    {
        if (is_object($this->crawlerController)) {
            return $this->crawlerController;
        }

        $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
        $this->crawlerController->setID = GeneralUtility::md5int(microtime());

        if (! is_object($this->crawlerController)) {
            throw new CrawlerObjectException('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
413
414
    /**
     * Reduces the given configurations to those whose key is part of the
     * allowed configurations. Without any allowed configuration set, the
     * input is returned unchanged.
     *
     * @param array $configurations configurations indexed by configuration key
     *
     * @return array the remaining configurations, keys preserved
     */
    protected function filterUnallowedConfigurations(array $configurations): array
    {
        $allowedKeys = $this->allowedConfigurations;
        if (count($allowedKeys) === 0) {
            return $configurations;
        }

        // keep only the configurations that match the current selection
        return array_filter(
            $configurations,
            static function ($confKey) use ($allowedKeys): bool {
                return in_array($confKey, $allowedKeys, true);
            },
            ARRAY_FILTER_USE_KEY
        );
    }
431
432
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array map of each registration's 'key' to its 'value'; empty when nothing is registered
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructions = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registration) {
            $instructions[$registration['key']] = $registration['value'];
        }

        return $instructions;
    }
446
}
447