Completed
Push — master ( 6a3171...e461ad )
by Tomas Norre
07:14
created

CrawlerApi::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1.008

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 0
dl 0
loc 5
ccs 4
cts 5
cp 0.8
crap 1.008
rs 10
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Utility\GeneralUtility;
32
use TYPO3\CMS\Core\Utility\MathUtility;
33
use TYPO3\CMS\Extbase\Object\ObjectManager;
34
use TYPO3\CMS\Frontend\Page\PageRepository;
35
36
/**
37
 * Class CrawlerApi
38
 *
39
 * @package AOE\Crawler\Api
40
 */
41
class CrawlerApi
42
{
43
    /**
44
     * @var CrawlerController
45
     */
46
    private $crawlerController;
47
48
    /**
49
     * @var QueueRepository
50
     */
51
    protected $queueRepository;
52
53
    /**
54
     * @var array
55
     */
56
    protected $allowedConfigurations = [];
57
58
    /**
     * Overrides the set id of the underlying crawler.
     *
     * Each crawler run has a set id; this facade method hands the given
     * value over to the crawler object.
     *
     * @param int $id set id to assign; it is cast to an integer before being stored
     *
     * @throws \Exception if no crawler object is available
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int)$id;
    }
68
69
    /**
     * Restricts the configuration selection to the given set of configurations.
     *
     * An empty array means that no restriction is applied when configurations
     * are filtered.
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
79
80
    /**
     * Returns the configurations the selection is currently limited to.
     *
     * @return array the stored list; empty when no limitation has been set
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
87
88
    /**
     * Returns the set id of the crawler.
     *
     * @return int value of the crawler's setID property
     *
     * @throws \Exception if no crawler object is available
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
97
98 19
    /**
     * Fetches the queue repository through the Extbase object manager.
     */
    public function __construct()
    {
        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(QueueRepository::class);
    }
103
104
    /**
     * Returns the crawler controller used by this facade.
     *
     * The controller is created on first use and receives a set id derived
     * from the current microtime; later calls return the same instance.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object could be obtained
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
124
125
    /**
     * Adds a page to the crawler queue by its uid, without a schedule time.
     *
     * @param int $uid uid of the page
     *
     * @throws \Exception if no crawler object is available
     */
    public function addPageToQueue($uid)
    {
        // elements that are not timed are queued with timestamp 0
        $this->addPageToQueueTimed((int)$uid, 0);
    }
136
137
    /**
     * Limits the given configurations to the ones that are allowed.
     *
     * When no allowed configurations have been set, the input is returned
     * untouched. A value that is not an array is also returned untouched, so
     * that callers can still decide how to treat it.
     *
     * @param array $configurations configurations indexed by configuration key
     *
     * @return array the configurations whose key is listed in the allowed configurations
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        // Nothing to do without a restriction, and nothing to iterate over
        // when the caller did not pass a list of configurations.
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        foreach (array_keys($configurations) as $confKey) {
            // Loose comparison is kept on purpose: PHP stores numeric array
            // keys as integers while the allowed list may contain strings.
            if (!in_array($confKey, $this->allowedConfigurations)) {
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
156
157
    /**
     * Adds a page to the crawler queue by uid and sets a
     * timestamp when the page should be crawled.
     *
     * The URL configurations of the page are fetched from the crawler, limited
     * to the allowed configurations and handed to the crawler one by one.
     * Nothing is queued when the crawler does not return an array.
     *
     * @param int $uid pageid
     * @param int $time timestamp; 0 is used for elements that are not timed
     *
     * @throws \Exception if no crawler object is available
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $uid = intval($uid);
        $time = intval($time);

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $crawler->getUrlsForPageRow($pageData);

        if (!is_array($configurations)) {
            // no configuration found
            return;
        }

        $configurations = $this->filterUnallowedConfigurations($configurations);

        // The registered processing instructions do not depend on the
        // configuration, so they are read once instead of once per iteration.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());
        $downloadUrls = [];
        $duplicateTrack = [];

        foreach ($configurations as $cv) {
            //enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            //reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
201
202
    /**
     * Counts the rows of tx_crawler_queue that belong to a page id and a schedule timestamp.
     *
     * For a schedule timestamp of 0 only rows with exec_time = 0 are counted;
     * for any other timestamp every row with that schedule time is counted.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int
     *
     * @deprecated since crawler v6.2.0, will be removed in crawler v7.0.0.
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $pageId = (int)$page_uid;
        $scheduled = (int)$schedule_timestamp;

        $conditions = ['page_id=' . $pageId];
        if ($scheduled === 0) {
            // un-timed elements can occur multiple times, so only the ones
            // that have not been executed yet (exec_time 0) are of interest
            $conditions[] = 'exec_time = 0';
        }
        // timed elements have a fixed schedule time: a row with this time is
        // either still queued or has already been processed
        $conditions[] = 'scheduled=' . $scheduled;

        $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery(
            'count(*) as cnt',
            'tx_crawler_queue',
            implode(' AND ', $conditions)
        );
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

        return (int)$row['cnt'];
    }
236
237
    /**
     * Determines if a page is queued.
     *
     * @param int|string $uid uid of the page; must be interpretable as an integer
     * @param bool $unprocessed_only unless false, only rows with exec_time = 0 are considered
     * @param bool $timed_only unless false, only rows with scheduled != 0 are considered
     * @param int|false $timestamp unless false, only rows scheduled for exactly this timestamp are considered
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     *
     * @return bool true if at least one matching row exists in tx_crawler_queue
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as an integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $whereClause = 'page_id = ' . (int)$uid;

        if ($unprocessed_only !== false) {
            $whereClause .= ' AND exec_time = 0';
        }

        if ($timed_only !== false) {
            $whereClause .= ' AND scheduled != 0';
        }

        if ($timestamp !== false) {
            $whereClause .= ' AND scheduled = ' . (int)$timestamp;
        }

        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'tx_crawler_queue', $whereClause);

        // a count of false is treated the same as "no rows"
        return $count !== false && $count > 0;
    }
283
284
    /**
     * Returns the highest schedule timestamp stored in the queue for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider rows scheduled later than the current time
     * @param bool $unprocessed_only only consider rows with exec_time = 0
     *
     * @return int the latest schedule timestamp, 0 if no row could be fetched
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $where = ' page_id = ' . (int)$uid;

        if ($future_crawldates_only) {
            $where .= ' AND scheduled > ' . time();
        }

        if ($unprocessed_only) {
            $where .= ' AND exec_time = 0';
        }

        $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('max(scheduled) as latest', 'tx_crawler_queue', $where);
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

        return $row ? (int)$row['latest'] : 0;
    }
316
317
    /**
     * Returns the queue rows of a page with the time the crawl was scheduled for
     * and the time it was executed. Rows that are scheduled but have not been
     * crawled yet are included as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows; 0 applies no limit
     *
     * @return array rows with the columns scheduled, exec_time and set_id, as returned by the database layer
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $pageId = (int)$uid;
        $maxRows = (int)$limit;

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'scheduled, exec_time, set_id',
            'tx_crawler_queue',
            ' page_id = ' . $pageId,
            null,
            null,
            // a limit of 0 is passed on as "no limit"
            $maxRows ?: null
        );
    }
340
341
    /**
     * Returns all rows of tx_crawler_queue with exec_time = 0,
     * ordered by page_id and scheduled.
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     *
     * @return array
     */
    public function getUnprocessedItems()
    {
        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'exec_time = 0',
            '',
            'page_id, scheduled'
        );
    }
360
361
    /**
     * Returns the number of unprocessed items (exec_time = 0) in the crawler queue.
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     *
     * @return mixed the count exactly as delivered by the database layer; it is not cast here
     */
    public function countUnprocessedItems()
    {
        $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery(
            'count(page_id) as anz',
            'tx_crawler_queue',
            'exec_time = 0'
        );
        $countRow = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

        return $countRow['anz'];
    }
377
378
    /**
     * Checks through the queue repository whether a page is in the queue.
     *
     * NOTE(review): despite the method name no "timed only" restriction is
     * handed to the repository here - confirm against QueueRepository::isPageInQueue().
     *
     * @param int $uid uid of the page
     * @param boolean $show_unprocessed only respect unprocessed pages
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     *
     * @return boolean
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        return $this->queueRepository->isPageInQueue((int)$uid, $show_unprocessed);
    }
394
395
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the registered value, or an empty array if none is set
     */
    private function getCrawlerProcInstructions()
    {
        $procInstructions = [];

        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            $procInstructions = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'];
        }

        return $procInstructions;
    }
408
409
    /**
     * Deletes the queue entry with the given queue id from tx_crawler_queue.
     *
     * @param int $qid queue id; cast to an integer before it is used in the query
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0. Please use the QueueRepository instead
     */
    public function removeQueueEntrie($qid)
    {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', ' qid=' . (int)$qid);
    }
422
423
    /**
     * Get queue statistics.
     *
     * Both values are delivered by the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $assignedButUnprocessed = $this->getQueueRepository()->countAllAssignedPendingItems();
        $unprocessed = $this->getQueueRepository()->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedButUnprocessed,
            'unprocessed' => $unprocessed,
        ];
    }
437
438
    /**
     * Returns the queue repository, creating a new one if the property
     * does not hold a QueueRepository instance.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new QueueRepository();

        return $this->queueRepository;
    }
451
452
    /**
     * Get queue statistics by configuration.
     *
     * The pending-item statistics of the queue repository are extended by a
     * 'total' entry per configuration. The totals are looked up for the set
     * ids that still have unprocessed entries; a configuration without a
     * total gets 0.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $queueRepository = $this->getQueueRepository();

        $statistics = $queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; writing through the key avoids a by-reference loop
        // variable that would stay bound to the last element afterwards
        foreach ($statistics as $key => $statistic) {
            $configuration = $statistic['configuration'];
            $statistics[$key]['total'] = isset($totals[$configuration]) ? $totals[$configuration] : 0;
        }

        return $statistics;
    }
472
473
    /**
     * Get active processes count.
     *
     * The value is delivered by a newly created ProcessRepository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
486
487
    /**
     * Get last processed entries.
     *
     * All columns ('*') are requested from the queue repository.
     *
     * @param int $limit value handed to the repository as limit
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $queueRepository = $this->getQueueRepository();

        return $queueRepository->getLastProcessedEntries('*', $limit);
    }
498
499
    /**
     * Get current crawling speed.
     *
     * The speed is derived from the timestamps of the last processed queue
     * entries. Starting at the first timestamp, entries are counted as long
     * as the gap to the previously accepted one (initially the current time)
     * does not exceed 60 seconds.
     * NOTE(review): this presumes the repository delivers the timestamps
     * newest first - confirm against QueueRepository.
     *
     * @return float|false page speed in pages per minute, false if there is not
     *                     enough information (fewer than 10 usable entries or no
     *                     measurable time span between them)
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries carry the same timestamp: there is no time
            // span to relate the page count to, and dividing by it would fail.
            return false;
        }

        return $pages / ($time / 60);
    }
541
542
    /**
     * Get some performance data.
     *
     * The range from $start to $end is cut into slots of $resolution length.
     * For every slot the performance data of the queue repository is fetched
     * and summed up. A speed is only calculated where more than 5 URLs were
     * counted; for slots and for the overall result it is 0 otherwise.
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp, must be after $start
     * @param integer $resolution length of one slot, must be greater than zero
     *
     * @return array data with the keys urlcount, start, end, duration, slots and speed;
     *               slots is indexed by the end of each slot and holds amountProcesses,
     *               urlcount, processes and speed
     *
     * @throws \Exception if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is not greater than zero
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // the slot loop below would never advance
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1512659946);
        }

        $data['slots'] = [];

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // Same rule as for a single process: a speed needs more than 5
            // URLs and a slot length above zero (a resolution of 1 yields 0).
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
602
}
603