Issues (138)

Classes/Api/CrawlerApi.php (2 issues)

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Api;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2018 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Controller\CrawlerController;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use AOE\Crawler\Exception\CrawlerObjectException;
35
use AOE\Crawler\Exception\TimeStampException;
36
use TYPO3\CMS\Core\Database\ConnectionPool;
37
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
38
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
39
use TYPO3\CMS\Core\Utility\GeneralUtility;
40
use TYPO3\CMS\Extbase\Object\ObjectManager;
41
42
/**
43
 * Class CrawlerApi
44
 *
45
 * @package AOE\Crawler\Api
46
 * @deprecated Since v9.2.0 - This class will be removed when dropping support for TYPO3 9LTS and 10LTS
47
 */
48
class CrawlerApi
49
{
50
    /**
51
     * Queue repository resolved in the constructor; the statistics and
     * last-processed lookups of this class read from it.
     *
     * @var QueueRepository
52
     */
53
    protected $queueRepository;
54
55
    /**
56
     * Configuration keys the API is restricted to. An empty array means that
     * filterUnallowedConfigurations() leaves the configurations untouched.
     *
     * @var array
57
     */
58
    protected $allowedConfigurations = [];
59
60
    /**
61
     * NOTE(review): not assigned or read by any method of this class as shown;
     * the query methods build their own QueryBuilder instances. Possibly kept
     * for subclasses - confirm before removing.
     *
     * @var QueryBuilder
62
     */
63
    protected $queryBuilder;
64
65
    /**
66
     * NOTE(review): not read by any method of this class as shown; the query
     * methods use QueueRepository::TABLE_NAME instead. Confirm before removing.
     *
     * @var string
67
     */
68
    protected $tableName = 'tx_crawler_queue';
69
70
    /**
71
     * Crawler instance created on first use by findCrawler().
     *
     * @var CrawlerController
72
     */
73
    protected $crawlerController;
74
75 10
    /**
     * Resolves the queue repository through the Extbase object manager.
     *
     * ObjectManager::get() has been deprecated since TYPO3 10.4 and will be
     * removed in version 12.0. It is still used here; this class itself is
     * marked as deprecated (see the class docblock).
     */
    public function __construct()
    {
        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(QueueRepository::class);
    }
81
82
    /**
     * Each crawler run has a set id; this facade method hands the given id
     * over to the crawler object.
     *
     * @throws \Exception
     */
    public function overwriteSetId(int $id): void
    {
        $crawler = $this->findCrawler();
        $crawler->setID = $id;
    }
92
93
    /**
     * Restricts the configuration selection to the given set of
     * configuration keys. Passing an empty array lifts the restriction.
     */
    public function setAllowedConfigurations(array $allowedConfigurations): void
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
101
102
    /**
     * Returns the configuration keys the selection is currently limited to.
     *
     * @return array empty when no restriction has been set
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
109
110
    /**
     * Returns the set id of the crawler, creating the crawler on first use.
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
119
120
    /**
     * Adds a page to the crawler queue by uid, without a scheduled time.
     *
     * @param int $uid uid of the page
     * @codeCoverageIgnore
     */
    public function addPageToQueue($uid): void
    {
        // Elements without a schedule are queued with timestamp 0.
        $this->addPageToQueueTimed((int) $uid, 0);
    }
132
133
    /**
     * Adds a page to the crawler queue by uid and sets a
     * timestamp when the page should be crawled.
     *
     * The URL configurations of the page are reduced to the allowed
     * configurations before the queue entries are written.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     *
     * @throws \Exception
     */
    public function addPageToQueueTimed($uid, $time): void
    {
        $uid = (int) $uid;
        $time = (int) $time;

        $crawler = $this->findCrawler();
        $pageData = $this->getPageRepository()->getPage($uid, true);
        // filterUnallowedConfigurations() is declared to return an array, so
        // no additional is_array() guard is needed before iterating.
        $configurations = $this->filterUnallowedConfigurations(
            $crawler->getUrlsForPageRow($pageData)
        );
        // The registered processing instructions do not change while the
        // configurations are processed, so their keys are collected once.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());
        $downloadUrls = [];
        $duplicateTrack = [];

        foreach ($configurations as $cv) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
175
176
    /**
     * Returns the latest scheduled crawl timestamp of a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries whose exec_time is 0
     *
     * @return int the highest "scheduled" value of the matching queue entries, 0 if there is none
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = (int) $uid;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $query = $queryBuilder
            ->from(QueueRepository::TABLE_NAME)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                // Bound as integer, matching the page_id binding in getCrawlHistoryForPage().
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        $row = $query->execute()->fetch(0);

        // The null coalescing operator covers a NULL aggregate (no matching
        // entries) as well as a fetch() that yields no row at all.
        return (int) ($row['latest'] ?? 0);
    }
218
219
    /**
     * Returns the queue entries of a page: when each crawl was scheduled, when
     * it was executed and to which set it belongs. Entries that are scheduled
     * but have not been crawled yet are included as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of entries, 0 for no limit
     *
     * @return array one row per queue entry with the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage(int $uid, int $limit = 0)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $pageIdParameter = $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT);

        $queryBuilder
            ->select('scheduled', 'exec_time', 'set_id')
            ->from(QueueRepository::TABLE_NAME)
            ->where($queryBuilder->expr()->eq('page_id', $pageIdParameter));

        if ($limit !== 0) {
            $queryBuilder->setMaxResults($limit);
        }

        return $queryBuilder->execute()->fetchAll();
    }
244
245 1
    /**
     * Returns the counts of pending queue items as reported by the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>)
     */
    public function getQueueStatistics(): array
    {
        $assignedPending = $this->queueRepository->countAllAssignedPendingItems();
        $pending = $this->queueRepository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $pending,
        ];
    }
252
253
    /**
     * Get queue statistics by configuration
     *
     * Each row of pending-item statistics is extended by a 'total' entry taken
     * from the total queue entries of the sets that still have unprocessed items.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     * @codeCoverageIgnore
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; written through the key instead of a reference so
        // that no dangling reference into the result is left behind.
        foreach ($statistics as $index => $row) {
            // A configuration without an entry in $totals is reported as 0
            // instead of triggering an undefined index access.
            $statistics[$index]['total'] = $totals[$row['configuration']] ?? 0;
        }

        return $statistics;
    }
272
273
    /**
     * Returns the number of processes the process repository reports as active.
     *
     * @codeCoverageIgnore
     */
    public function getActiveProcessesCount(): int
    {
        return GeneralUtility::makeInstance(ProcessRepository::class)
            ->findAllActive()
            ->count();
    }
282
283
    /**
     * Returns the last processed queue entries as delivered by the queue repository.
     *
     * @param int $limit passed through to the repository
     * @codeCoverageIgnore
     */
    public function getLastProcessedQueueEntries(int $limit): array
    {
        return $this->queueRepository->getLastProcessedEntries($limit);
    }
290
291
    /**
     * Get current crawling speed
     *
     * Walks the timestamps of the last processed entries, starting at the first
     * one, and counts entries until the gap to the previous timestamp (or to
     * the current time for the first entry) exceeds 60 seconds. The speed is
     * the number of counted entries per minute of the covered time span.
     *
     * @return int|float|bool pages per minute, or false when there is not enough information
     * @codeCoverageIgnore
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        // time between two entries is "too old"
        $tooOldDelta = 60;

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp (or are not ordered as
            // expected): no time span to relate the pages to, and dividing by
            // it would be a division by zero.
            return false;
        }

        return $pages / ($time / 60);
    }
332
333
    /**
     * Get some performance data
     *
     * The range from $start to $end is split into slots of $resolution length.
     * For every slot the process data of the queue repository is collected and
     * a speed (URLs per minute) is calculated per process, per slot and for the
     * whole range. Speeds are only calculated for more than 5 URLs.
     *
     * @param int $start
     * @param int $end
     * @param int $resolution length of a slot, must be greater than 0
     *
     * @return array data
     *
     * @throws TimeStampException if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is not positive
     * @codeCoverageIgnore
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new TimeStampException('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // A non-positive step would never advance the slot loop below.
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1609459200);
        }

        $data['slots'] = [];

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Rows are updated through their key, so no reference outlives the loop.
            foreach ($slotData as $processKey => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processKey]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // The slot length has to be divided by the URL count as a whole;
            // without the parentheses only $slotStart was divided.
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                // Too few URLs, or a slot without any length (e.g. resolution 1).
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
394
395
    /**
     * Returns the crawler instance used by this API. On first use the crawler
     * is created and receives a set id derived from the current microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws CrawlerObjectException if no crawler object is available
     */
    protected function findCrawler()
    {
        if (! is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (! is_object($this->crawlerController)) {
            throw new CrawlerObjectException('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
414
415
    /**
     * Reduces the given configurations to those whose key is contained in the
     * allowed configurations. Without any allowed configuration set, the input
     * is returned unchanged.
     */
    protected function filterUnallowedConfigurations(array $configurations): array
    {
        $allowedKeys = $this->allowedConfigurations;
        if (count($allowedKeys) === 0) {
            return $configurations;
        }

        // Keep only the entries whose key matches the current selection.
        return array_filter(
            $configurations,
            static function ($configurationKey) use ($allowedKeys): bool {
                return in_array($configurationKey, $allowedKeys, true);
            },
            ARRAY_FILTER_USE_KEY
        );
    }
432
433
    /**
     * Collects the processing instructions registered for the crawler in
     * TYPO3_CONF_VARS as a map of instruction key to instruction value.
     */
    private function getCrawlerProcInstructions(): array
    {
        if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return [];
        }

        $instructionsByKey = [];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $registration) {
            $instructionsByKey[$registration['key']] = $registration['value'];
        }

        return $instructionsByKey;
    }
447
448 4
    /**
     * Fetches the page repository through the Extbase object manager.
     */
    private function getPageRepository(): PageRepository
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);

        return $objectManager->get(PageRepository::class);
    }
452
}
453