Completed
Push — master ( 1b04e6...82e336 )
by Tomas Norre
08:07
created

Classes/Api/CrawlerApi.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2018 AOE GmbH <dev@aoe.com>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Controller\CrawlerController;
29
use AOE\Crawler\Domain\Repository\ProcessRepository;
30
use AOE\Crawler\Domain\Repository\QueueRepository;
31
use TYPO3\CMS\Core\Utility\GeneralUtility;
32
use TYPO3\CMS\Core\Utility\MathUtility;
33
use TYPO3\CMS\Frontend\Page\PageRepository;
34
35
/**
 * Class CrawlerApi
 *
 * Facade that other code can use to talk to the crawler: it lazily creates the
 * crawler controller, adds pages to the crawler queue, and reads queue state and
 * statistics from the tx_crawler_queue table and the queue/process repositories.
 *
 * @package AOE\Crawler\Api
 */
class CrawlerApi
{
    /**
     * Lazily created crawler instance, see findCrawler().
     *
     * @var CrawlerController
     */
    private $crawlerController;

    /**
     * Lazily created queue repository, see getQueueRepository().
     *
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * Configuration keys that may be used when queueing pages.
     * An empty array means "no restriction".
     *
     * @var array
     */
    protected $allowedConfigurations = [];

    /**
     * Each crawler run has a set id; this facade method passes the given
     * id on to the crawler object.
     *
     * @param int $id the set id to use (cast to int)
     *
     * @throws \Exception if no crawler object could be created
     */
    public function overwriteSetId($id)
    {
        $this->findCrawler()->setID = (int)$id;
    }

    /**
     * Limits the configuration selection to the given set of configuration keys.
     *
     * @param array $allowedConfigurations
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }

    /**
     * Returns the configuration keys the selection is currently limited to.
     *
     * @return array
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }

    /**
     * Returns the setID of the crawler.
     *
     * @return int
     *
     * @throws \Exception if no crawler object could be created
     */
    public function getSetId()
    {
        return $this->findCrawler()->setID;
    }

    /**
     * Returns the internal crawler instance, creating it on first use.
     * A freshly created crawler gets a set id derived from the current microtime.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws \Exception if no crawler object could be created
     */
    protected function findCrawler()
    {
        if (!is_object($this->crawlerController)) {
            $crawler = GeneralUtility::makeInstance(CrawlerController::class);

            // Validate before touching the instance, otherwise the property
            // write below would fail first and this exception would never fire.
            if (!is_object($crawler)) {
                throw new \Exception('no crawler object', 1512659759);
            }

            $crawler->setID = GeneralUtility::md5int(microtime());
            $this->crawlerController = $crawler;
        }

        return $this->crawlerController;
    }

    /**
     * Adds a page to the crawler queue by uid (not timed).
     *
     * @param int $uid uid of the page
     *
     * @throws \Exception if no crawler object could be created
     */
    public function addPageToQueue($uid)
    {
        // non timed elements will be added with timestamp 0
        $this->addPageToQueueTimed((int)$uid, 0);
    }

    /**
     * Removes every configuration whose key is not part of the allowed
     * configurations. Without any allowed configurations set, the input is
     * returned untouched.
     *
     * @param array|mixed $configurations configurations indexed by configuration key
     *
     * @return array|mixed the filtered configurations; non-array input is returned as is
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        // Callers check is_array() only after filtering, so do not iterate blindly.
        if (!is_array($configurations) || count($this->allowedConfigurations) === 0) {
            return $configurations;
        }

        // remove configurations that do not match the current selection
        foreach (array_keys($configurations) as $confKey) {
            if (!in_array($confKey, $this->allowedConfigurations)) {
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }

    /**
     * Adds a page to the crawler queue by uid and sets a
     * timestamp when the page should be crawled.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     *
     * @throws \Exception if no crawler object could be created
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $uid = (int)$uid;
        $time = (int)$time;

        $crawler = $this->findCrawler();
        $pageData = GeneralUtility::makeInstance(PageRepository::class)->getPage($uid);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));

        if (!is_array($configurations)) {
            // no configuration found
            return;
        }

        $downloadUrls = [];
        $duplicateTrack = [];
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());

        foreach ($configurations as $cv) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }

    /**
     * Counts all entries in the database which are scheduled for a given page id and a schedule timestamp.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int
     */
    protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
    {
        // Both values end up in a hand-built WHERE clause, so force them to integers.
        $page_uid = (int)$page_uid;
        $schedule_timestamp = (int)$schedule_timestamp;

        if ($schedule_timestamp === 0) {
            // un-timed elements need an exec_time with 0 because they can occur multiple times
            $where = 'page_id=' . $page_uid . ' AND exec_time = 0 AND scheduled=' . $schedule_timestamp;
        } else {
            // timed elements have got a fixed schedule time; if a record with this time
            // exists it is either queued for the future, or it has been queued in the past
            // and therefore has already been processed.
            $where = 'page_id=' . $page_uid . ' AND scheduled=' . $schedule_timestamp;
        }

        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($GLOBALS['TYPO3_DB']->exec_SELECTquery(
            'count(*) as cnt',
            'tx_crawler_queue',
            $where
        ));

        return is_array($row) ? (int)$row['cnt'] : 0;
    }

    /**
     * Determines if a page is queued.
     *
     * Every optional filter is applied unless it is exactly false.
     *
     * @param int|string $uid uid of the page, must be interpretable as integer
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     * @param bool $timed_only only consider entries with scheduled != 0
     * @param bool|int $timestamp false, or the exact scheduled timestamp to match
     *
     * @return bool
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        if (!MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $whereClause = 'page_id = ' . (int)$uid;

        if (false !== $unprocessed_only) {
            $whereClause .= ' AND exec_time = 0';
        }

        if (false !== $timed_only) {
            $whereClause .= ' AND scheduled != 0';
        }

        if (false !== $timestamp) {
            $whereClause .= ' AND scheduled = ' . (int)$timestamp;
        }

        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
            '*',
            'tx_crawler_queue',
            $whereClause
        );

        // a failed count is reported as false and treated as "not queued"
        return false !== $count && $count > 0;
    }

    /**
     * Returns the latest scheduled crawl timestamp for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after now
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int the latest timestamp, 0 if there is none
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $where = ' page_id = ' . (int)$uid;

        if ($future_crawldates_only) {
            $where .= ' AND scheduled > ' . time();
        }

        if ($unprocessed_only) {
            $where .= ' AND exec_time = 0';
        }

        $rs = $GLOBALS['TYPO3_DB']->exec_SELECTquery('max(scheduled) as latest', 'tx_crawler_queue', $where);
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($rs);

        // max() over zero rows yields NULL; the cast turns that into the documented int 0
        return is_array($row) ? (int)$row['latest'] : 0;
    }

    /**
     * Returns the rows of the crawler queue for a page: when the page has been scheduled
     * for crawling and at what time the scheduled crawl has been executed. The result
     * also contains items that are scheduled but have not been crawled yet.
     *
     * @param int $uid uid of the page
     * @param int $limit maximum number of rows, 0 (or less) for no limit
     *
     * @return array|null crawl history of the page; each row holds the selected columns
     *                    'scheduled', 'exec_time' and 'set_id' (result is passed through
     *                    from exec_SELECTgetRows)
     */
    public function getCrawlHistoryForPage($uid, $limit = 0)
    {
        $uid = (int)$uid;
        $limit = (int)$limit;

        // a non-positive limit means "unlimited"; a negative LIMIT would be invalid SQL
        $limitQuery = $limit > 0 ? $limit : null;

        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'scheduled, exec_time, set_id',
            'tx_crawler_queue',
            ' page_id = ' . $uid,
            null,
            null,
            $limitQuery
        );
    }

    /**
     * Returns all unprocessed items (exec_time = 0) of the crawler queue,
     * ordered by page id and schedule time.
     *
     * @return array
     */
    public function getUnprocessedItems()
    {
        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'exec_time = 0',
            '',
            'page_id, scheduled'
        );
    }

    /**
     * Returns the number of unprocessed items (exec_time = 0) in the crawler queue.
     *
     * @return int number of unprocessed items in the queue
     */
    public function countUnprocessedItems()
    {
        $rs = $GLOBALS['TYPO3_DB']->exec_SELECTquery('count(page_id) as anz', 'tx_crawler_queue', 'exec_time = 0');
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($rs);

        return is_array($row) ? (int)$row['anz'] : 0;
    }

    /**
     * Method to check if a page is in the queue which is timed for a
     * date when it should be crawled.
     *
     * NOTE(review): this delegates to isPageInQueue() without enabling its
     * $timed_only filter, so un-timed entries match as well. That contradicts
     * the method name; behaviour is kept as is because callers may rely on it.
     *
     * @param int $uid uid of the page
     * @param bool $show_unprocessed only respect unprocessed pages
     *
     * @return bool
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        return $this->isPageInQueue((int)$uid, $show_unprocessed);
    }

    /**
     * Reads the registered processing instructions of the crawler from
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the registered instructions, an empty array if none are set
     */
    private function getCrawlerProcInstructions()
    {
        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            return $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'];
        }

        return [];
    }

    /**
     * Removes a queue entry with a given queue id.
     *
     * @param int $qid
     */
    public function removeQueueEntrie($qid)
    {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', ' qid=' . (int)$qid);
    }

    /**
     * Get queue statistics.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $queueRepository = $this->getQueueRepository();

        return [
            'assignedButUnprocessed' => $queueRepository->countAllAssignedPendingItems(),
            'unprocessed' => $queueRepository->countAllPendingItems()
        ];
    }

    /**
     * Returns the queue repository, creating it on first use.
     *
     * @return QueueRepository
     */
    protected function getQueueRepository()
    {
        if (!$this->queueRepository instanceof QueueRepository) {
            $this->queueRepository = new QueueRepository();
        }

        return $this->queueRepository;
    }

    /**
     * Get queue statistics by configuration.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $queueRepository = $this->getQueueRepository();

        $statistics = $queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; written by key instead of by reference so that no
        // dangling reference to the last element survives the loop
        foreach ($statistics as $index => $entry) {
            $configuration = $entry['configuration'];
            $statistics[$index]['total'] = isset($totals[$configuration]) ? $totals[$configuration] : 0;
        }

        return $statistics;
    }

    /**
     * Get active processes count.
     *
     * @return int
     */
    public function getActiveProcessesCount()
    {
        $processRepository = new ProcessRepository();

        return $processRepository->countActive();
    }

    /**
     * Get last processed entries.
     *
     * @param int $limit
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        return $this->getQueueRepository()->getLastProcessedEntries('*', $limit);
    }

    /**
     * Get current crawling speed.
     *
     * Walks the last processed timestamps in the order the repository returns them
     * and counts entries until the gap between two consecutive entries (or between
     * now and the first entry) exceeds 60 seconds.
     *
     * @return float|false speed in pages per minute, false if there is not enough information
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (!is_array($lastProcessedEntries) || count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime = reset($lastProcessedEntries);
        $pages = 0;

        // foreach replaces the former each()/list() loop: each() is gone in PHP 8
        // and the key it assigned was never used
        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time <= 0) {
            // all counted entries share one timestamp: no time span to derive a speed from
            return false;
        }

        return $pages / ($time / 60);
    }

    /**
     * Get some performance data.
     *
     * Splits the range from $start to $end into slots of $resolution seconds, fetches
     * the per-process data of every slot from the queue repository and derives
     * speeds (urls per minute) per process, per slot and for the whole range.
     * A speed is only calculated from more than 5 urls.
     *
     * @param int $start start timestamp
     * @param int $end end timestamp
     * @param int $resolution slot length in seconds, must be positive
     *
     * @return array data with the keys 'urlcount', 'start', 'end', 'duration', 'slots' and 'speed'
     *
     * @throws \Exception if $end is not after $start
     * @throws \InvalidArgumentException if $resolution is not positive
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp', 1512659945);
        }

        if ($resolution <= 0) {
            // the slot loop below would never advance
            throw new \InvalidArgumentException('Resolution must be greater than zero', 1537254600);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            // The slot length has to be parenthesised: without it only $slotStart
            // was divided by the url count, which produced meaningless speeds.
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
}
577