Completed
Push — TYPO3_7 ( 85059e...d37b91 )
by Tomas Norre
10:37
created

CrawlerApi::getLatestCrawlTimestampForPage()   A

Complexity

Conditions 4
Paths 8

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
cc 4
nc 8
nop 3
dl 0
loc 23
ccs 0
cts 19
cp 0
crap 20
rs 9.552
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Utility\GeneralUtility;
32
use TYPO3\CMS\Core\Utility\MathUtility;
33
use TYPO3\CMS\Extbase\Object\ObjectManager;
34
use TYPO3\CMS\Frontend\Page\PageRepository;
35
36
/**
37
 * Class CrawlerApi
38
 *
39
 * @package AOE\Crawler\Api
40
 */
41
class CrawlerApi
42
{
43
    /**
44
     * @var CrawlerController
45
     */
46
    private $crawlerController;
47
48
    /**
49
     * @var QueueRepository
50
     */
51
    protected $queueRepository;
52
53
    /**
54
     * @var ProcessRepository
55
     */
56
    protected $processRepository;
57
58
    /**
59
     * @var array
60
     */
61
    protected $allowedConfigurations = [];
62
63
    /**
     * Overrides the set id of the crawler instance used by this facade.
     *
     * The given value is cast to an integer and written to the public
     * `setID` property of the controller returned by findCrawler().
     *
     * @param int $id set id to assign to the crawler
     *
     * @return void
     *
     * @throws \Exception if no crawler instance could be obtained
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int)$id;
    }
73
74
    /**
     * Restricts the configuration selection to the given set.
     *
     * The stored list is consulted by filterUnallowedConfigurations();
     * an empty list means that no filtering takes place.
     *
     * @param array $allowedConfigurations configuration keys that may be used
     *
     * @return void
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
84
85
    /**
     * Returns the configuration keys the selection is currently limited to.
     *
     * @return array list previously passed to setAllowedConfigurations(), empty by default
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
92
93
    /**
     * Reads the set id from the crawler instance used by this facade.
     *
     * @return int value of the crawler's `setID` property
     *
     * @throws \Exception if no crawler instance could be obtained
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
102
103 5
    /**
     * Resolves the queue and process repositories through the Extbase object manager.
     */
    public function __construct()
    {
        /** @var ObjectManager $manager */
        $manager = GeneralUtility::makeInstance(ObjectManager::class);

        $this->queueRepository = $manager->get(QueueRepository::class);
        $this->processRepository = $manager->get(ProcessRepository::class);
    }
109
110
    /**
     * Returns the crawler controller used by this facade, creating it on first use.
     *
     * A freshly created controller gets a set id derived from the current microtime.
     *
     * @return CrawlerController instance of the crawler lib
     *
     * @throws \Exception if no controller object is available after the lazy creation
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        // Guard: bail out if the lazy creation above did not yield an object.
        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
130
131
    /**
     * Adds a page to the crawler queue without a schedule time.
     *
     * Delegates to addPageToQueueTimed() with a timestamp of 0.
     *
     * @param int $uid uid of the page
     *
     * @return void
     *
     * @throws \Exception
     */
    public function addPageToQueue($uid)
    {
        // Non-timed elements are added with timestamp 0.
        $this->addPageToQueueTimed((int)$uid, 0);
    }
142
143
    /**
     * Removes every configuration whose key is not in the allowed list.
     *
     * If no allowed configurations have been set, the input is returned
     * unchanged. A non-array input is also returned unchanged, so that the
     * caller's own is_array() check keeps working without a PHP warning.
     *
     * @param array|mixed $configurations configurations indexed by configuration key
     *
     * @return array|mixed the filtered configurations (or the untouched non-array input)
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        // Nothing to filter: either there is no restriction or nothing iterable was passed.
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        foreach (array_keys($configurations) as $confKey) {
            // Loose comparison is kept on purpose: PHP turns numeric array keys into
            // integers, which must still match string entries in the allowed list.
            if (!in_array($confKey, $this->allowedConfigurations)) {
                // Remove configurations that do not match the current selection.
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
162
163
    /**
     * Adds a page to the crawler queue and sets the time at which it should be crawled.
     *
     * The page row is loaded, the crawler's configurations for that row are
     * reduced to the allowed ones, and every remaining configuration is handed
     * to the crawler's urlListFromUrlArray(). Nothing happens when the filtered
     * configurations are not an array.
     *
     * @param int $uid uid of the page
     * @param int $time timestamp at which the page should be crawled (0 = not timed)
     *
     * @return void
     *
     * @throws \Exception
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $uid = (int)$uid;
        $time = (int)$time;

        $crawler = $this->findCrawler();
        $pageRow = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageRow));
        $downloadUrls = [];
        $duplicateTrack = [];

        if (!is_array($configurations)) {
            // No configuration found.
            return;
        }

        foreach ($configurations as $configuration) {
            // Enable inserting of entries.
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageRow,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                array_keys($this->getCrawlerProcInstructions())
            );

            // Reset the queue because the entries have been written to the db.
            unset($crawler->queueEntries);
        }
    }
207
208
    /**
     * Counts the queue entries that exist for a page id and a schedule timestamp.
     *
     * @param int $page_uid uid of the page
     * @param int $schedule_timestamp schedule time to look for (0 = not timed)
     *
     * @return int number of matching rows in tx_crawler_queue
     *
     * @deprecated since crawler v6.2.0, will be removed in crawler v7.0.0.
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $pageId = (int)$page_uid;
        $scheduled = (int)$schedule_timestamp;

        $conditions = ['page_id=' . $pageId];
        if ($scheduled === 0) {
            // Un-timed elements can occur multiple times, so only those that
            // have not been executed yet (exec_time = 0) are counted.
            $conditions[] = 'exec_time = 0';
        }
        // Timed elements have a fixed schedule time; a record with this time is
        // either still queued for the future or has already been processed.
        $conditions[] = 'scheduled=' . $scheduled;

        $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery(
            'count(*) as cnt',
            'tx_crawler_queue',
            implode(' AND ', $conditions)
        );
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

        return (int)$row['cnt'];
    }
242
243
    /**
     * Determines whether a page has entries in the crawler queue.
     *
     * Each optional filter is applied unless it is strictly `false`.
     *
     * @param int|string $uid uid of the page, must be interpretable as an integer
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     * @param bool $timed_only only consider entries with a schedule time other than 0
     * @param int|bool $timestamp only consider entries scheduled for exactly this time, false to skip
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     *
     * @return bool true if at least one matching queue entry exists
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as an integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $conditions = ['page_id = ' . (int)$uid];

        if ($unprocessed_only !== false) {
            $conditions[] = 'exec_time = 0';
        }

        if ($timed_only !== false) {
            $conditions[] = 'scheduled != 0';
        }

        if ($timestamp !== false) {
            $conditions[] = 'scheduled = ' . (int)$timestamp;
        }

        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
            '*',
            'tx_crawler_queue',
            implode(' AND ', $conditions)
        );

        // A count of `false` is treated like "no entries".
        return $count !== false && $count > 0;
    }
289
290
    /**
     * Returns the latest schedule timestamp stored in the queue for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int the maximum `scheduled` value, or 0 if no row could be fetched
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $where = ' page_id = ' . (int)$uid;

        if ($future_crawldates_only) {
            $where .= ' AND scheduled > ' . time();
        }

        if ($unprocessed_only) {
            $where .= ' AND exec_time = 0';
        }

        $database = $GLOBALS['TYPO3_DB'];
        $result = $database->exec_SELECTquery('max(scheduled) as latest', 'tx_crawler_queue', $where);
        $row = $database->sql_fetch_assoc($result);

        return $row ? (int)$row['latest'] : 0;
    }
322
323
    /**
     * Returns the crawl history of a page: when it was scheduled for crawling and
     * when the scheduled crawl was executed. Entries that are scheduled but have
     * not been crawled yet are included as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows, 0 for no limit
     *
     * @return array queue rows with the columns `scheduled`, `exec_time` and `set_id`
     *               (presumably associative, as returned by exec_SELECTgetRows - TODO confirm)
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $pageId = (int)$uid;
        $maxRows = (int)$limit;

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'scheduled, exec_time, set_id',
            'tx_crawler_queue',
            ' page_id = ' . $pageId,
            null,
            null,
            // A limit of 0 means "unlimited" and is passed on as null.
            $maxRows ? $maxRows : null
        );
    }
346
347
    /**
     * Fetches all queue entries that have not been executed yet (exec_time = 0),
     * ordered by page id and schedule time.
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     *
     * @return array rows of tx_crawler_queue
     */
    public function getUnprocessedItems()
    {
        $table = 'tx_crawler_queue';
        $onlyUnprocessed = 'exec_time = 0';
        $orderBy = 'page_id, scheduled';

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $table, $onlyUnprocessed, '', $orderBy);
    }
366
367
    /**
     * Counts the queue entries that have not been executed yet (exec_time = 0).
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     *
     * @return int|string number of unprocessed items, returned exactly as delivered by the database layer
     */
    public function countUnprocessedItems()
    {
        $database = $GLOBALS['TYPO3_DB'];
        $result = $database->exec_SELECTquery('count(page_id) as anz', 'tx_crawler_queue', 'exec_time = 0');
        $row = $database->sql_fetch_assoc($result);

        return $row['anz'];
    }
383
384
    /**
     * Checks whether a page is in the queue, delegating to the queue repository.
     *
     * NOTE(review): despite the method name, only the uid and the unprocessed flag
     * are forwarded to QueueRepository::isPageInQueue(); no "timed only" argument
     * is passed here - confirm against the repository whether that is intended.
     *
     * @param int $uid uid of the page
     * @param bool $show_unprocessed only respect unprocessed pages
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     *
     * @return bool
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        return $this->queueRepository->isPageInQueue((int)$uid, $show_unprocessed);
    }
400
401
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the registered value, or an empty array if nothing is registered
     */
    private function getCrawlerProcInstructions()
    {
        return isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']
            : [];
    }
414
415
    /**
     * Deletes the queue entry with the given queue id from tx_crawler_queue.
     *
     * @param int $qid queue id of the entry to remove
     *
     * @return void
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0. Please use the QueueRepository instead
     */
    public function removeQueueEntrie($qid)
    {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', ' qid=' . (int)$qid);
    }
428
429
    /**
     * Collects the pending-item counters of the queue.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>), both values
     *               taken from the queue repository
     */
    public function getQueueStatistics()
    {
        $repository = $this->getQueueRepository();

        $assignedButUnprocessed = $repository->countAllAssignedPendingItems();
        $unprocessed = $repository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedButUnprocessed,
            'unprocessed' => $unprocessed
        ];
    }
443
444
    /**
     * Returns the queue repository, resolving it through the object manager
     * if the property does not hold a QueueRepository yet.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)
            ->get(QueueRepository::class);

        return $this->queueRepository;
    }
458
459
    /**
     * Returns the pending-item statistics grouped by configuration, each entry
     * extended by the total number of queue entries for that configuration.
     *
     * The totals are looked up for the set ids that still have unprocessed entries.
     *
     * @return array list of statistic rows; each row carries at least `configuration`
     *               and the added `total` (the remaining keys come from the queue repository)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $repository = $this->getQueueRepository();

        $statistics = $repository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $repository->getSetIdWithUnprocessedEntries();
        $totals = $repository->getTotalQueueEntriesByConfiguration($setIds);

        // "Merge" the totals into the per-configuration statistics.
        foreach ($statistics as $index => $entry) {
            $statistics[$index]['total'] = $totals[$entry['configuration']];
        }

        return $statistics;
    }
479
480
    /**
     * Returns the number of active crawler processes as reported by the process repository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        // Plain `$this`: the former `$$this` was a variable-variable lookup that
        // tried to use the object itself as a variable name and failed at runtime.
        return $this->processRepository->countActive();
    }
491
492
    /**
     * Returns the most recently processed queue entries with all their columns.
     *
     * @param int $limit maximum number of entries to fetch
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $repository = $this->getQueueRepository();

        return $repository->getLastProcessedEntries('*', $limit);
    }
503
504
    /**
     * Estimates the current crawling speed from the timestamps of the last processed entries.
     *
     * Starting at the current time, the timestamps are walked as long as the gap
     * to the previous one is at most 60 seconds. The speed is the number of
     * entries walked divided by the time they span.
     *
     * @return float|int|false speed in pages per minute, or false if there is not enough
     *                         information (fewer than 10 usable entries, or the usable
     *                         entries span no measurable time)
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // Not enough information.
            return false;
        }

        // Maximum gap in seconds between two entries before the older one counts as "too old".
        $tooOldDelta = 60;

        $compareValue = time();
        // Presumably the newest timestamp comes first - TODO confirm against the repository.
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // Not enough information.
            return false;
        }

        // $compareValue now holds the oldest timestamp that is not too old.
        $time = $startTime - $compareValue;
        if ($time <= 0) {
            // All counted entries share one timestamp: no time span to derive a
            // rate from, and dividing by it would be a division by zero.
            return false;
        }

        return $pages / ($time / 60);
    }
546
547
    /**
     * Collects performance data for a time range, split into slots of $resolution length.
     *
     * For every slot the per-process data is fetched from the queue repository.
     * A `speed` (URLs per minute) is calculated per process, per slot and for the
     * whole range. A speed is only calculated when more than 5 URLs were
     * processed and the duration is greater than zero; slot and overall speed
     * are 0 otherwise, and a process gets no `speed` key at all.
     *
     * @param integer $start start timestamp of the range
     * @param integer $end end timestamp of the range, must be after $start
     * @param integer $resolution slot length, in the same unit as the timestamps; must be at least 1
     *
     * @return array data with the keys `urlcount`, `start`, `end`, `duration`, `slots`
     *               (indexed by slot end timestamp) and `speed`
     *
     * @throws \Exception if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is smaller than 1
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // A non-positive step would never advance $slotStart and loop forever.
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1512659946);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Write back by key instead of iterating by reference, so that no
            // dangling reference into $slotData survives the loop.
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // Parenthesised duration: the division must apply to the whole slot
            // length, not just to $slotStart. Zero-length slots (resolution 1)
            // get speed 0 instead of dividing by zero.
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
607
}
608