Completed
Push — issue/252 ( b288df...34f654 )
by Tomas Norre
10:26 queued 02:05
created

CrawlerApi::countUnprocessedItems()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 9

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 0
dl 0
loc 9
ccs 0
cts 8
cp 0
crap 2
rs 9.9666
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Utility\GeneralUtility;
32
use TYPO3\CMS\Core\Utility\MathUtility;
33
use TYPO3\CMS\Frontend\Page\PageRepository;
34
35
/**
36
 * Class CrawlerApi
37
 *
38
 * @package AOE\Crawler\Api
39
 */
40
class CrawlerApi
41
{
42
    /**
43
     * @var CrawlerController
44
     */
45
    private $crawlerController;
46
47
    /**
48
     * @var QueueRepository
49
     */
50
    protected $queueRepository;
51
52
    /**
53
     * @var array
54
     */
55
    protected $allowedConfigurations = [];
56
57
    /**
     * Overrides the set id of the crawler instance used by this facade.
     *
     * The given value is cast to an integer before it is stored on the
     * crawler's public setID property.
     *
     * @param int $id the set id to use for this crawler run
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int)$id;
    }
67
68
    /**
     * Restricts the configuration selection to the given set of configurations.
     *
     * The list is consulted by filterUnallowedConfigurations(); an empty list
     * means that no restriction is applied there.
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
78
79
    /**
     * Returns the configurations this facade is currently limited to.
     *
     * @return array the list previously passed to setAllowedConfigurations(), empty by default
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
86
87
    /**
     * Returns the set id of the crawler instance used by this facade.
     *
     * @return int the crawler's current setID
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
96
97
    /**
     * Returns the crawler controller used by this facade, creating it on first use.
     *
     * A freshly created controller gets a set id derived from the current
     * microtime via GeneralUtility::md5int().
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object is available after the creation attempt
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        // Guard against a failed instantiation before handing the controller out.
        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
117
118
    /**
     * Adds a page to the crawler queue by its uid without a schedule time.
     *
     * @param int $uid uid of the page
     */
    public function addPageToQueue($uid)
    {
        // Entries without a schedule time are queued with timestamp 0.
        $this->addPageToQueueTimed((int)$uid, 0);
    }
129
130
    /**
     * Drops every configuration whose key is not in the list of allowed configurations.
     *
     * When no allowed configurations have been set, the input is returned unchanged.
     *
     * @param array $configurations configurations indexed by their configuration key
     *
     * @return array the configurations that remain after filtering
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (count($this->allowedConfigurations) <= 0) {
            // Nothing to restrict against.
            return $configurations;
        }

        foreach ($configurations as $confKey => $confArray) {
            if (in_array($confKey, $this->allowedConfigurations)) {
                continue;
            }

            // Key does not match the current selection.
            unset($configurations[$confKey]);
        }

        return $configurations;
    }
149
150
    /**
     * Adds a page to the crawler queue by uid and sets the
     * timestamp at which the page should be crawled.
     *
     * The page row is loaded through the PageRepository, the crawler resolves
     * the configurations for that row, and every configuration that passes
     * filterUnallowedConfigurations() is handed to
     * CrawlerController::urlListFromUrlArray(). Nothing is queued when no
     * configuration array is found.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $pageId = (int)$uid;
        $scheduledTime = (int)$time;

        $crawler = $this->findCrawler();
        $pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $pageData = $pageRepository->getPage($pageId);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));
        $downloadUrls = [];
        $duplicateTrack = [];

        if (!is_array($configurations)) {
            // No configuration found.
            return;
        }

        foreach ($configurations as $configuration) {
            // Enable inserting of entries.
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageData,
                $scheduledTime,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                array_keys($this->getCrawlerProcInstructions())
            );

            // Reset the queue because the entries have been written to the db.
            unset($crawler->queueEntries);
        }
    }
192
193
    /**
     * Counts the queue entries that are scheduled for a given page id and schedule timestamp.
     *
     * For un-timed entries (timestamp 0) only rows with exec_time = 0 are
     * counted, because such entries can occur multiple times. Timed entries
     * are counted regardless of exec_time: a row with that schedule time is
     * either still pending or has already been processed.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int
     *
     * @deprecated since crawler v6.2.0, will be removed in crawler v7.0.0.
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        $pageId = (int)$page_uid;
        $scheduled = (int)$schedule_timestamp;

        $where = 'page_id=' . $pageId;
        if ($scheduled === 0) {
            // Un-timed elements must additionally be unprocessed.
            $where .= ' AND exec_time = 0';
        }
        $where .= ' AND scheduled=' . $scheduled;

        $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('count(*) as cnt', 'tx_crawler_queue', $where);
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

        return (int)$row['cnt'];
    }
227
228
    /**
     * Determines whether at least one queue entry exists for a page.
     *
     * Each optional flag narrows the lookup when it is anything other than
     * the boolean false.
     *
     * @param int|string $uid page uid; must be interpretable as an integer
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     * @param bool $timed_only only consider entries with scheduled != 0
     * @param int|bool $timestamp only consider entries scheduled for exactly this timestamp, false to ignore
     *
     * @return bool
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as an integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $conditions = ['page_id = ' . (int)$uid];

        if (false !== $unprocessed_only) {
            $conditions[] = 'exec_time = 0';
        }

        if (false !== $timed_only) {
            $conditions[] = 'scheduled != 0';
        }

        if (false !== $timestamp) {
            $conditions[] = 'scheduled = ' . (int)$timestamp;
        }

        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
            '*',
            'tx_crawler_queue',
            implode(' AND ', $conditions)
        );

        // A false count is treated as "not in queue".
        return false !== $count && $count > 0;
    }
272
273
    /**
     * Returns the latest crawl timestamp (highest "scheduled" value) for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int|mixed the "latest" column of the result row as delivered by the database layer,
     *                   or 0 when no row could be fetched
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $where = ' page_id = ' . (int)$uid;

        if ($future_crawldates_only) {
            $where .= ' AND scheduled > ' . time();
        }

        if ($unprocessed_only) {
            $where .= ' AND exec_time = 0';
        }

        $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('max(scheduled) as latest', 'tx_crawler_queue', $where);
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

        return $row ? $row['latest'] : 0;
    }
305
306
    /**
     * Returns the crawl history of a page: when it has been scheduled for crawling
     * and at what time the scheduled crawl has been executed. Entries that are
     * scheduled but have not been crawled yet are included as well.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows, 0 for no limit
     *
     * @return array rows with the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $pageId = (int)$uid;
        $maxRows = (int)$limit;

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'scheduled, exec_time, set_id',
            'tx_crawler_queue',
            ' page_id = ' . $pageId,
            null,
            null,
            // A limit of 0 means "no limit" and is passed on as null.
            $maxRows ?: null
        );
    }
329
330
    /**
     * Fetches all unprocessed items (exec_time = 0) from the crawler queue,
     * ordered by page id and schedule time.
     *
     * @return array
     */
    public function getUnprocessedItems()
    {
        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'exec_time = 0',
            '',
            'page_id, scheduled'
        );
    }
343
344
    /**
     * Returns the number of unprocessed items (exec_time = 0) in the crawler queue.
     *
     * @return int|mixed number of unprocessed items, as delivered by the database layer
     */
    public function countUnprocessedItems()
    {
        $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery(
            'count(page_id) as anz',
            'tx_crawler_queue',
            'exec_time = 0'
        );
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

        return $row['anz'];
    }
358
359
    /**
     * Checks whether a page is in the queue with a schedule time, i.e. whether
     * an entry with scheduled != 0 exists for it.
     *
     * Unlike isPageInQueue(), the uid is cast to an integer first, so this
     * method does not throw for non-integer input.
     *
     * @param int $uid uid of the page
     * @param boolean $show_unprocessed only respect unprocessed pages
     *
     * @return boolean
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        // Pass $timed_only = true; without it the lookup would also match un-timed entries.
        return $this->isPageInQueue(intval($uid), $show_unprocessed, true);
    }
374
375
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the registered instructions, or an empty array if none are set
     */
    private function getCrawlerProcInstructions()
    {
        $isRegistered = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']);

        return $isRegistered
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']
            : [];
    }
388
389
    /**
     * Deletes the queue entry with the given queue id from tx_crawler_queue.
     *
     * @param int $qid
     */
    public function removeQueueEntrie($qid)
    {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', ' qid=' . (int)$qid);
    }
401
402
    /**
     * Collects overall queue statistics from the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $assignedPending = $this->getQueueRepository()->countAllAssignedPendingItems();
        $allPending = $this->getQueueRepository()->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $allPending
        ];
    }
416
417
    /**
     * Returns the queue repository, creating it the first time it is requested.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new QueueRepository();

        return $this->queueRepository;
    }
430
431
    /**
     * Get queue statistics by configuration
     *
     * The pending-item statistics grouped by configuration key are enriched
     * with a 'total' entry taken from the total queue entries of all sets that
     * still have unprocessed entries. A configuration without a matching total
     * gets a 'total' of 0.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->getQueueRepository()->countPendingItemsGroupedByConfigurationKey();

        $setIds = $this->getQueueRepository()->getSetIdWithUnprocessedEntries();

        $totals = $this->getQueueRepository()->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; written by key so no reference is left dangling in the returned array
        foreach ($statistics as $key => $value) {
            $configuration = $value['configuration'];
            $statistics[$key]['total'] = isset($totals[$configuration]) ? $totals[$configuration] : 0;
        }

        return $statistics;
    }
451
452
    /**
     * Returns the number of active processes as reported by a new ProcessRepository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
465
466
    /**
     * Returns the most recently processed queue entries with all their columns.
     *
     * @param int $limit maximum number of entries, passed on to the queue repository
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $queueRepository = $this->getQueueRepository();

        return $queueRepository->getLastProcessedEntries('*', $limit);
    }
477
478
    /**
     * Get current crawling speed
     *
     * Walks the timestamps of the last processed entries, starting with the
     * first one, and counts entries until the gap to the previous entry (or to
     * the current time for the first entry) exceeds 60. The speed is the number
     * of counted entries divided by the time they span, scaled by 60.
     *
     * @return float|int|bool page speed in pages per minute (presumably the timestamps are
     *                        in seconds — confirm against QueueRepository), or false when
     *                        fewer than 10 usable entries exist or they span no time at all
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // all counted entries share one timestamp: no elapsed time to divide by
            return false;
        }

        return $pages / ($time / 60);
    }
520
521
    /**
     * Get some performance data
     *
     * The range from $start to $end is split into slots of $resolution length.
     * For every slot the per-process data is fetched from the queue repository
     * and summed up. Speeds are computed as 60 / (duration / urlcount) and are
     * only calculated when more than 5 URLs were processed and the duration is
     * positive; the slot speed and the overall speed are 0 otherwise, and a
     * process gets no 'speed' key at all.
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp, must be after $start
     * @param integer $resolution slot length, must be greater than zero
     *
     * @return array data with the keys 'urlcount', 'start', 'end', 'duration', 'speed' and
     *               'slots'; 'slots' is indexed by slot end and holds 'amountProcesses',
     *               'urlcount', 'processes' and 'speed' per slot
     *
     * @throws \Exception if $end is not after $start or $resolution is not positive
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // a non-positive step would never let the slot loop below terminate
            throw new \Exception('Resolution must be greater than zero', 1545134400);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // the slot length has to be divided by the URL count as a whole
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
581
}
582