Completed
Push — master ( d63a29...5c0477 )
by Stefan
29:08 queued 02:04
created

CrawlerApi::addPageToQueueTimed()   B

Complexity

Conditions 3
Paths 3

Size

Total Lines 35
Code Lines 24

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 24
CRAP Score 3

Importance

Changes 0
Metric Value
cc 3
eloc 24
nc 3
nop 2
dl 0
loc 35
ccs 24
cts 24
cp 1
crap 3
rs 8.8571
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2016 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Utility\GeneralUtility;
32
use TYPO3\CMS\Core\Utility\MathUtility;
33
use TYPO3\CMS\Frontend\Page\PageRepository;
34
35
/**
36
 * Class CrawlerApi
37
 *
38
 * @package AOE\Crawler\Api
39
 */
40
class CrawlerApi
41
{
42
    /**
43
     * @var CrawlerController crawler instance, created on first use by findCrawler()
44
     */
45
    private $crawlerController;
46
47
    /**
48
     * @var QueueRepository queue repository, created on first use by getQueueRepository()
49
     */
50
    protected $queueRepository;
51
52
    /**
53
     * @var array configuration keys kept by filterUnallowedConfigurations(); empty means no filtering
54
     */
55
    protected $allowedConfigurations = [];
56
57
    /**
     * Overrides the set id of the crawler.
     *
     * Each crawler run has a set id; this facade method casts the given
     * value to an integer and delegates it to the crawler object.
     *
     * @param int $id set id to assign to the crawler
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int)$id;
    }
67
68
    /**
     * Limits the configuration selection to a set of configurations.
     *
     * The stored list is consulted by filterUnallowedConfigurations(); an
     * empty list means that no filtering takes place.
     *
     * @param array $allowedConfigurations keys of the configurations that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
78
79
    /**
     * Returns the set id currently stored on the crawler object.
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
88
89
    /**
     * Returns the internal crawler instance, creating it on first use.
     *
     * A freshly created crawler receives a set id derived from the current
     * microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object is available after the lookup
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        // Guard clause: bail out when the instance could not be provided.
        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
109
110
    /**
     * Adds a page to the crawler queue by its uid, without a schedule time.
     *
     * @param int $uid uid of the page
     */
    public function addPageToQueue($uid)
    {
        // Non-timed elements are added with timestamp 0.
        $this->addPageToQueueTimed((int)$uid, 0);
    }
121
122
    /**
     * Limits the given configurations to the ones whose key is listed in
     * $this->allowedConfigurations.
     *
     * When no allowed configurations are set, or when the input is not an
     * array, the input is returned untouched. Keys are compared as strings
     * with strict equality, so an integer key such as 0 can no longer match
     * an arbitrary non-numeric name through loose comparison.
     *
     * @param array $configurations configurations indexed by configuration key
     *
     * @return array the configurations whose key is allowed (the unchanged
     *               input if it is not an array)
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        // PHP turns numeric string keys into integers, so normalise both
        // sides to strings before comparing strictly.
        $allowedKeys = array_map('strval', $this->allowedConfigurations);

        foreach (array_keys($configurations) as $confKey) {
            // remove configuration that does not match the current selection
            if (!in_array((string)$confKey, $allowedKeys, true)) {
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
141
142
    /**
     * Adds a page to the crawler queue by uid together with the timestamp
     * at which the page should be crawled.
     *
     * The page row is loaded, the crawler's URL configurations for that row
     * are reduced to the allowed ones, and each remaining configuration is
     * handed to the crawler's urlListFromUrlArray().
     *
     * @param int $uid pageid
     * @param int $time timestamp
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $pageId = intval($uid);
        $scheduledTime = intval($time);

        $crawlerController = $this->findCrawler();
        $pageRow = GeneralUtility::makeInstance(PageRepository::class)->getPage($pageId);
        $urlConfigurations = $this->filterUnallowedConfigurations(
            $crawlerController->getUrlsForPageRow($pageRow)
        );
        $downloadUrls = [];
        $duplicateTrack = [];

        if (!is_array($urlConfigurations)) {
            // no configuration found
            return;
        }

        foreach ($urlConfigurations as $urlConfiguration) {
            // enable inserting of entries
            $crawlerController->registerQueueEntriesInternallyOnly = false;
            $crawlerController->urlListFromUrlArray(
                $urlConfiguration,
                $pageRow,
                $scheduledTime,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                array_keys($this->getCrawlerProcInstructions())
            );

            // reset the queue because the entries have been written to the db
            unset($crawlerController->queueEntries);
        }
    }
184
185
    /**
     * Counts all entries in the database which are scheduled for a given
     * page id and a schedule timestamp.
     *
     * Both arguments are cast to integers before they become part of the
     * WHERE clause, so no unescaped value reaches the query.
     *
     * @param int $page_uid uid of the page
     * @param int $schedule_timestamp schedule timestamp, 0 for untimed entries
     *
     * @return int number of matching queue entries (0 if the query fails)
     */
    protected function countEntriesInQueueForPageByScheduletime($page_uid, $schedule_timestamp)
    {
        $page_uid = (int)$page_uid;
        $schedule_timestamp = (int)$schedule_timestamp;

        // Timed elements have a fixed schedule time; a record with this time
        // may be queued for the future or may already have been processed.
        $where = 'page_id=' . $page_uid . ' AND scheduled=' . $schedule_timestamp;

        if ($schedule_timestamp === 0) {
            // Untimed elements need an exec_time of 0 because they can
            // occur multiple times.
            $where = 'page_id=' . $page_uid . ' AND exec_time = 0 AND scheduled=0';
        }

        // Same counting helper as isPageInQueue(); it yields false on failure.
        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'tx_crawler_queue', $where);

        return (int)$count;
    }
214
215
    /**
     * Determines if a page is queued.
     *
     * @param int $uid uid of the page
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     * @param bool $timed_only only consider entries with scheduled != 0
     * @param int|bool $timestamp only consider entries scheduled for exactly
     *                            this timestamp; false disables the condition
     *
     * @return bool true if at least one matching queue entry exists
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as an integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        // Reject everything that is NOT an integer; the check used to be
        // inverted and threw for every valid uid.
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $whereClause = 'page_id = ' . (int)$uid;

        if (false !== $unprocessed_only) {
            $whereClause .= ' AND exec_time = 0';
        }

        if (false !== $timed_only) {
            $whereClause .= ' AND scheduled != 0';
        }

        if (false !== $timestamp) {
            $whereClause .= ' AND scheduled = ' . (int)$timestamp;
        }

        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
            '*',
            'tx_crawler_queue',
            $whereClause
        );

        // A count of false signals a failed query and is treated as "not queued".
        return false !== $count && $count > 0;
    }
259
260
    /**
     * Returns the latest scheduled crawl timestamp for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int the latest schedule timestamp, 0 if no entry matches or the query fails
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = intval($uid);
        $query = 'max(scheduled) as latest';
        $where = ' page_id = ' . $uid;

        if ($future_crawldates_only) {
            $where .= ' AND scheduled > ' . time();
        }

        if ($unprocessed_only) {
            $where .= ' AND exec_time = 0';
        }

        $rs = $GLOBALS['TYPO3_DB']->exec_SELECTquery($query, 'tx_crawler_queue', $where);
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($rs);

        // max() yields NULL when no row matches and the driver hands values
        // back as strings; the cast keeps the documented integer contract.
        return is_array($row) ? (int)$row['latest'] : 0;
    }
292
293
    /**
     * Returns the crawl history of a page: when it has been scheduled for
     * crawling and at what time the scheduled crawl has been executed. The
     * result also contains items that are scheduled but have not been
     * crawled yet.
     *
     * @param int $uid uid of the page
     * @param int|bool $limit maximum number of rows; false or a value below 1 means no limit
     *
     * @return array rows holding the selected columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = false)
    {
        $uid = (int)$uid;

        // The limit must reach the query as a bare integer. Quoting it as a
        // string produced "LIMIT ''" for the default value.
        $limit = (int)$limit;
        $limitClause = $limit > 0 ? (string)$limit : '';

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'scheduled, exec_time, set_id',
            'tx_crawler_queue',
            'page_id = ' . $uid,
            '',
            '',
            $limitClause
        );
    }
317
318
    /**
     * Fetches all queue entries that have not been processed yet
     * (exec_time = 0), ordered by page_id and scheduled.
     *
     * @return array
     */
    public function getUnprocessedItems()
    {
        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'exec_time = 0',
            '',
            'page_id, scheduled'
        );
    }
331
332
    /**
     * Returns the number of unprocessed items (exec_time = 0) in the
     * crawler queue.
     *
     * @return int number of unprocessed items in the queue, 0 if the query fails
     */
    public function countUnprocessedItems()
    {
        $query = 'count(page_id) as anz';
        $where = 'exec_time = 0';
        $rs = $GLOBALS['TYPO3_DB']->exec_SELECTquery($query, 'tx_crawler_queue', $where);
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($rs);

        // The driver returns the count as a string, and no row at all when
        // the query failed; always hand back an integer.
        return is_array($row) ? (int)$row['anz'] : 0;
    }
346
347
    /**
     * Checks if a page is in the queue and timed, i.e. scheduled for a date
     * at which it should be crawled.
     *
     * @param int $uid uid of the page
     * @param boolean $show_unprocessed only respect unprocessed pages
     *
     * @return boolean
     *
     * @throws \InvalidArgumentException passed through from isPageInQueue()
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        $uid = intval($uid);

        // Pass $timed_only = true; without it this method was an exact alias
        // of isPageInQueue() and also matched untimed entries.
        return $this->isPageInQueue($uid, $show_unprocessed, true);
    }
362
363
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the registered instructions, or an empty array if none are set
     */
    private function getCrawlerProcInstructions()
    {
        $isRegistered = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']);

        return $isRegistered
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']
            : [];
    }
376
377
    /**
     * Deletes the queue entry with the given queue id from tx_crawler_queue.
     *
     * @param int $qid queue id of the entry to remove
     */
    public function removeQueueEntrie($qid)
    {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', ' qid=' . intval($qid));
    }
389
390
    /**
     * Collects overall queue statistics from the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $repository = $this->getQueueRepository();
        $assignedPending = $repository->countAllAssignedPendingItems();
        $pending = $repository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $pending
        ];
    }
404
405
    /**
     * Returns the queue repository, instantiating it on first access.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new QueueRepository();

        return $this->queueRepository;
    }
418
419
    /**
     * Get queue statistics by configuration.
     *
     * The pending-item statistics grouped by configuration key are merged
     * with the total number of queue entries per configuration, taken from
     * the sets that still have unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $repository = $this->getQueueRepository();

        $statistics = $repository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $repository->getSetIdWithUnprocessedEntries();
        $totals = $repository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; written by index so that no reference variable
        // outlives the loop
        foreach ($statistics as $index => $entry) {
            $configuration = $entry['configuration'];
            // A configuration without a total counts as 0 instead of
            // raising an undefined-index notice.
            $statistics[$index]['total'] = isset($totals[$configuration]) ? $totals[$configuration] : 0;
        }

        return $statistics;
    }
439
440
    /**
     * Returns the number of active processes as reported by a new
     * ProcessRepository instance.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
453
454
    /**
     * Returns the last processed queue entries with all their columns.
     *
     * @param int $limit limit handed to the queue repository
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $repository = $this->getQueueRepository();

        return $repository->getLastProcessedEntries('*', $limit);
    }
465
466
    /**
     * Get current crawling speed.
     *
     * Walks the timestamps of the last processed entries and counts them
     * until the gap between two consecutive entries (or between now and the
     * first entry) exceeds 60 seconds. The speed is the number of counted
     * pages divided by the time span they cover.
     *
     * NOTE(review): the calculation assumes the repository returns the
     * timestamps ordered newest first - confirm against QueueRepository.
     *
     * @return float|false page speed in pages per minute, or false if there
     *                     is not enough information
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        // foreach replaces the each()/list() loop; each() no longer exists
        // in PHP 8 and the key was never used.
        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp: no measurable time
            // span, and dividing by it would be a division by zero.
            return false;
        }

        return $pages / ($time / 60);
    }
508
509
    /**
     * Get some performance data.
     *
     * The range from $start to $end is cut into slots of $resolution. For
     * every slot the per-process data is fetched from the queue repository,
     * a speed is added to each process with more than 5 URLs and a positive
     * duration, and a slot entry (keyed by the slot end) is stored. Speeds
     * are computed as 60 / (duration / urlcount).
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp
     * @param integer $resolution slot size, must be at least 1
     *
     * @return array data with the keys urlcount, start, end, duration, slots and speed
     *
     * @throws \Exception if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is smaller than 1
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // A resolution of 0 or less would never advance the slot loop.
            throw new \InvalidArgumentException('Resolution must be at least 1', 1512659946);
        }

        $data['slots'] = [];

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Written by key instead of by reference so that no reference
            // leaks into the next slot iteration.
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            // The slot duration has to be divided by the URL count as a
            // whole; the former expression divided only $slotStart.
            $slotDuration = $slotEnd - $slotStart;
            $slotSpeed = 0;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $slotSpeed = 60 / ($slotDuration / $slotUrlCount);
            }

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
                'speed' => $slotSpeed,
            ];
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
569
}
570