Completed
Push — testing/CrawlerApi ( 8f76cc )
by Tomas Norre
06:43
created

CrawlerApi::findCrawler()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 13
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 4.125

Importance

Changes 0
Metric Value
cc 3
eloc 8
nc 4
nop 0
dl 0
loc 13
ccs 6
cts 12
cp 0.5
crap 4.125
rs 9.4285
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2016 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Utility\GeneralUtility;
32
use TYPO3\CMS\Core\Utility\MathUtility;
33
use TYPO3\CMS\Frontend\Page\PageRepository;
34
35
/**
36
 * Class CrawlerApi
37
 *
38
 * @package AOE\Crawler\Api
39
 */
40
class CrawlerApi
41
{
42
    /**
43
     * Crawler controller instance; created lazily by findCrawler().
     *
     * @var CrawlerController
44
     */
45
    private $crawlerController;
46
47
    /**
48
     * Queue repository instance; created lazily by getQueueRepository().
     *
     * @var QueueRepository
49
     */
50
    protected $queueRepository;
51
52
    /**
53
     * Configuration keys that filterUnallowedConfigurations() lets through.
     * An empty array disables the filtering.
     *
     * @var array
54
     */
55
    protected $allowedConfigurations = [];
56
57
    /**
     * Overwrites the set id of the crawler.
     *
     * Facade method: the given id is cast to an integer and written to the
     * setID property of the crawler returned by findCrawler().
     *
     * @param int $id the set id to use
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = intval($id);
    }
67
68
    /**
     * Restricts the configuration selection to the given set of
     * configurations (see filterUnallowedConfigurations()).
     *
     * @param array $allowedConfigurations configuration keys that may be used
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }
78
79
    /**
     * Returns the configurations the selection is currently limited to.
     *
     * @return array the allowed configurations, empty when nothing was set
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }
86
87
    /**
     * Returns the setID of the crawler returned by findCrawler().
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
96
97
    /**
     * Returns the crawler controller used by this facade.
     *
     * On first use the controller is created through
     * GeneralUtility::makeInstance() and its setID is initialised from
     * GeneralUtility::md5int(microtime()); later calls reuse that object.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object is available after creation
     */
    protected function findCrawler()
    {
        $hasCrawler = is_object($this->crawlerController);

        if (!$hasCrawler) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        // Guard: bail out if instantiation did not yield an object.
        if (!is_object($this->crawlerController)) {
            throw new \Exception('no crawler object', 1512659759);
        }

        return $this->crawlerController;
    }
117
118
    /**
     * Adds a page to the crawler queue by uid, without a schedule time.
     *
     * @param int $uid uid of the page
     */
    public function addPageToQueue($uid)
    {
        // Untimed elements are queued with timestamp 0.
        $this->addPageToQueueTimed(intval($uid), 0);
    }
129
130
    /**
     * Limits the given configurations to those whose key is listed in
     * $this->allowedConfigurations.
     *
     * The input is returned untouched when no allowed configurations are set
     * or when it is not an array (the caller checks is_array() itself).
     *
     * @param array|mixed $configurations configurations indexed by configuration key
     *
     * @return array|mixed the filtered configurations
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        foreach (array_keys($configurations) as $confKey) {
            // Loose comparison is kept on purpose so existing key matching does not change.
            if (!in_array($confKey, $this->allowedConfigurations)) {
                // Remove configuration that does not match the current selection.
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
149
150
    /**
     * Adds a page to the crawler queue by uid and sets a
     * timestamp when the page should be crawled.
     *
     * The page row is loaded through PageRepository::getPage(), the crawler
     * resolves the configurations for it, and every configuration that passes
     * filterUnallowedConfigurations() is handed to
     * CrawlerController::urlListFromUrlArray(). Nothing is queued when the
     * crawler does not return an array of configurations.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $uid = intval($uid);
        $time = intval($time);

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));

        if (!is_array($configurations)) {
            // No configuration found.
            return;
        }

        $downloadUrls = [];
        $duplicateTrack = [];
        // Loop-invariant, so resolved once instead of once per configuration.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $cv) {
            // Enable inserting of entries.
            $crawler->registerQueueEntriesInternallyOnly = false;
            // NOTE(review): 300 / true / false are passed through unchanged; their meaning
            // is defined by CrawlerController::urlListFromUrlArray() - confirm there.
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // Reset the queue because the entries have been written to the db.
            unset($crawler->queueEntries);
        }
    }
192
193
    /**
     * Counts all entries in tx_crawler_queue which are scheduled for a given
     * page id and a schedule timestamp.
     *
     * Both arguments are cast to integers before they are placed in the
     * WHERE clause, so no raw input reaches the SQL statement.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int number of matching queue entries (0 if the query fails)
     */
    protected function countEntriesInQueueForPageByScheduletime($page_uid, $schedule_timestamp)
    {
        $page_uid = intval($page_uid);
        $schedule_timestamp = intval($schedule_timestamp);

        $where = 'page_id=' . $page_uid;
        if ($schedule_timestamp === 0) {
            // Untimed elements need an exec_time of 0 because they can occur multiple times.
            $where .= ' AND exec_time = 0';
        }
        // Timed elements have a fixed schedule time: a record with this time is either
        // still queued for the future or was queued in the past and already processed.
        $where .= ' AND scheduled=' . $schedule_timestamp;

        // Same counting helper as isPageInQueue(); it yields false on failure.
        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows('*', 'tx_crawler_queue', $where);

        return intval($count);
    }
222
223
    /**
     * Determines if a page is queued.
     *
     * Counts the tx_crawler_queue rows of the page; every optional filter is
     * applied unless its argument is exactly false.
     *
     * @param int|string $uid uid of the page, must be interpretable as integer
     * @param bool $unprocessed_only only count entries with exec_time = 0
     * @param bool $timed_only only count entries with scheduled != 0
     * @param int|bool $timestamp only count entries scheduled for exactly this time, false to skip
     *
     * @return bool true if at least one matching entry exists
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $conditions = ['page_id = ' . (int)$uid];

        if ($unprocessed_only !== false) {
            $conditions[] = 'exec_time = 0';
        }

        if ($timed_only !== false) {
            $conditions[] = 'scheduled != 0';
        }

        if ($timestamp !== false) {
            $conditions[] = 'scheduled = ' . (int)$timestamp;
        }

        $matches = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
            '*',
            'tx_crawler_queue',
            implode(' AND ', $conditions)
        );

        return $matches !== false && $matches > 0;
    }
267
268
    /**
     * Returns the latest crawl timestamp (max(scheduled)) for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int the latest scheduled timestamp, 0 if no matching entry exists
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = intval($uid);
        $query = 'max(scheduled) as latest';
        $where = ' page_id = ' . $uid;

        if ($future_crawldates_only) {
            $where .= ' AND scheduled > ' . time();
        }

        if ($unprocessed_only) {
            $where .= ' AND exec_time = 0';
        }

        $rs = $GLOBALS['TYPO3_DB']->exec_SELECTquery($query, 'tx_crawler_queue', $where);
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($rs);

        // An aggregate without matching rows yields a row whose value is NULL;
        // report that as 0 so the documented integer contract holds.
        if (!is_array($row) || $row['latest'] === null) {
            return 0;
        }

        return (int)$row['latest'];
    }
300
301
    /**
     * Returns the queue rows of a page with the time a crawl was scheduled and
     * the time it was executed. The result also contains items that are
     * scheduled but have not been crawled yet.
     *
     * @param int $uid uid of the page
     * @param int|bool $limit maximum number of rows; false or 0 means no limit
     *
     * @return array rows with the columns scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = false)
    {
        $uid = intval($uid);
        // The limit must reach the query as a plain integer: quoting it as a string
        // (as was done before) puts a quoted value into the LIMIT clause.
        $limit = intval($limit);

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'scheduled, exec_time, set_id',
            'tx_crawler_queue',
            ' page_id = ' . $uid,
            '',
            '',
            $limit > 0 ? (string)$limit : ''
        );
    }
325
326
    /**
     * Returns all unprocessed items (exec_time = 0) of the crawler queue,
     * ordered by page_id and scheduled.
     *
     * @return array
     */
    public function getUnprocessedItems()
    {
        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'exec_time = 0',
            '',
            'page_id, scheduled'
        );
    }
339
340
    /**
     * Returns the number of unprocessed items (exec_time = 0) in the crawler queue.
     *
     * @return int|string number of unprocessed items in the queue; the value is returned
     *                    as fetched from the database (presumably a numeric string - TODO confirm)
     */
    public function countUnprocessedItems()
    {
        $database = $GLOBALS['TYPO3_DB'];
        $result = $database->exec_SELECTquery('count(page_id) as anz', 'tx_crawler_queue', 'exec_time = 0');
        $row = $database->sql_fetch_assoc($result);

        return $row['anz'];
    }
354
355
    /**
     * Checks whether a page is in the queue by delegating to isPageInQueue().
     *
     * NOTE(review): despite the method name, $timed_only is not passed on, so
     * untimed queue entries match as well - confirm whether that is intended.
     *
     * @param int $uid uid of the page
     * @param bool $show_unprocessed only respect unprocessed pages
     *
     * @return bool
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        $pageUid = intval($uid);

        return $this->isPageInQueue($pageUid, $show_unprocessed);
    }
370
371
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the registered instructions, an empty array if none are set
     */
    private function getCrawlerProcInstructions()
    {
        $registered = [];

        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            $registered = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'];
        }

        return $registered;
    }
384
385
    /**
     * Deletes the entry with the given queue id from tx_crawler_queue.
     *
     * @param int $qid queue id of the entry
     */
    public function removeQueueEntrie($qid)
    {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', ' qid=' . intval($qid));
    }
397
398
    /**
     * Get queue statistics
     *
     * Both values are read from the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $repository = $this->getQueueRepository();
        $assignedPending = $repository->countAllAssignedPendingItems();
        $pending = $repository->countAllPendingItems();

        return [
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed' => $pending
        ];
    }
412
413
    /**
     * Returns the queue repository, creating it on first access.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof QueueRepository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new QueueRepository();

        return $this->queueRepository;
    }
426
427
    /**
     * Get queue statistics by configuration
     *
     * Takes the pending item counts grouped by configuration key from the
     * queue repository and adds a 'total' entry to each of them, taken from
     * the totals the repository reports for the set ids with unprocessed entries.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $repository = $this->getQueueRepository();

        $statistics = $repository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $repository->getSetIdWithUnprocessedEntries();
        $totals = $repository->getTotalQueueEntriesByConfiguration($setIds);

        // "Merge" arrays. Written by index instead of by reference so that no
        // dangling reference to the last element outlives the loop.
        foreach ($statistics as $key => $value) {
            $configuration = $value['configuration'];
            // A configuration without a reported total counts as 0 instead of
            // raising an undefined index notice.
            $statistics[$key]['total'] = isset($totals[$configuration]) ? $totals[$configuration] : 0;
        }

        return $statistics;
    }
447
448
    /**
     * Get active processes count
     *
     * Delegates to countActive() of a freshly created ProcessRepository.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        return (new ProcessRepository())->countActive();
    }
461
462
    /**
     * Get last processed entries
     *
     * Delegates to QueueRepository::getLastProcessedEntries() selecting all fields ('*').
     *
     * @param int $limit passed through to the repository as limit
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $repository = $this->getQueueRepository();

        return $repository->getLastProcessedEntries('*', $limit);
    }
473
474
    /**
     * Get current crawling speed
     *
     * Walks the timestamps from QueueRepository::getLastProcessedEntriesTimestamps()
     * from the first element on and counts entries until the gap to the previous
     * one (initially to the current time) exceeds 60 seconds. The speed is the
     * number of counted entries divided by the minutes between the first
     * timestamp and the last counted one.
     *
     * NOTE(review): this presumably expects the timestamps newest first - confirm
     * against the repository.
     *
     * @return float|int|false page speed in pages per minute, false if there is not
     *                         enough information (fewer than 10 usable entries or
     *                         no measurable time span)
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        // foreach replaces the reset()/each() loop: each() no longer exists in PHP 8
        // and the key it returned was never used.
        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // All counted entries share one timestamp: no time span to divide by.
            return false;
        }

        return $pages / ($time / 60);
    }
516
517
    /**
     * Get some performance data
     *
     * Splits the range from $start to $end into slots of $resolution, asks the
     * queue repository for the process data of every slot and derives speeds
     * (60 / (duration / urlcount)) per process, per slot and for the whole range.
     * A speed is only calculated when more than 5 urls were counted and the
     * duration is positive; slot and overall speed are 0 otherwise.
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp, must be after $start
     * @param integer $resolution length of one slot, must be at least 1
     *
     * @return array data with the keys urlcount, start, end, duration, slots and speed
     *
     * @throws \Exception if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is smaller than 1
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution < 1) {
            // A resolution below 1 would keep the slot loop from ever advancing properly.
            throw new \InvalidArgumentException('Resolution must be at least 1', 1512659946);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Written by index instead of by reference so that no dangling
            // reference to the last process entry outlives the loop.
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // The slot duration has to be divided by the url count as a whole; without
            // the parentheses only $slotStart was divided (operator precedence).
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
577
}
578