Completed
Branch issue/163 (2fdfdc)
by Tomas Norre
03:50
created

CrawlerApi::addPageToQueueTimed()   B

Complexity

Conditions 3
Paths 3

Size

Total Lines 36
Code Lines 24

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 24
nc 3
nop 2
dl 0
loc 36
rs 8.8571
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Api;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2016 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use TYPO3\CMS\Core\Utility\GeneralUtility;
29
use TYPO3\CMS\Core\Utility\MathUtility;
30
31
/**
32
 * Class CrawlerApi
33
 *
34
 * @package AOE\Crawler\Api
35
 */
36
class CrawlerApi
37
{
38
    /**
39
     * @var \tx_crawler_lib
40
     */
41
    private $crawlerObj;
42
43
    /**
44
     * @var \tx_crawler_domain_queue_repository queue repository
45
     */
46
    protected $queueRepository;
47
48
    /**
49
     * @var array configuration keys the queue operations are restricted to (empty array = no restriction)
50
     */
51
    protected $allowedConfigrations = array();
52
53
    /**
     * Overrides the set id of the current crawler run.
     *
     * Every crawler run carries a set id; this facade method forwards the
     * given value, cast to integer, to the underlying crawler object.
     *
     * @param int $id set id to assign to the crawler
     *
     * @return void
     */
    public function overwriteSetId($id)
    {
        $crawler = $this->findCrawler();
        $crawler->setID = (int)$id;
    }
63
64
    /**
     * Restricts the configuration selection to the given set of configurations.
     *
     * The list is consulted when configurations are filtered before pages are queued.
     *
     * @param array $allowedConfigurations configuration keys that may be used
     *
     * @return void
     */
    public function setAllowedConfigurations(array $allowedConfigurations)
    {
        // The property name carries a historical typo ("Configrations"); it is kept
        // because the property is protected and may be used by subclasses.
        $this->allowedConfigrations = $allowedConfigurations;
    }
74
75
    /**
     * Returns the set id currently held by the crawler object.
     *
     * @return int
     */
    public function getSetId()
    {
        $crawler = $this->findCrawler();

        return $crawler->setID;
    }
84
85
    /**
     * Returns the crawler lib instance used by this API, creating it on first use.
     *
     * A freshly created instance gets a set id derived from the current microtime.
     *
     * @return \tx_crawler_lib Instance of the crawler lib
     *
     * @throws \Exception if no crawler object is available after instantiation
     */
    protected function findCrawler()
    {
        // Fast path: the instance has already been created
        if (is_object($this->crawlerObj)) {
            return $this->crawlerObj;
        }

        $this->crawlerObj = GeneralUtility::makeInstance('tx_crawler_lib');
        $this->crawlerObj->setID = GeneralUtility::md5int(microtime());

        if ( ! is_object($this->crawlerObj)) {
            throw new \Exception("no crawler object");
        }

        return $this->crawlerObj;
    }
105
106
    /**
     * Adds a page to the crawler queue by uid, without a schedule time.
     *
     * @param int $uid uid of the page
     *
     * @return void
     */
    public function addPageToQueue($uid)
    {
        // Untimed elements are queued with timestamp 0
        $this->addPageToQueueTimed((int)$uid, 0);
    }
117
118
    /**
     * Limits the given configurations to those whose key is in the list of
     * allowed configurations.
     *
     * When no allowed configurations have been set, or when the input is not an
     * array, the input is returned untouched.
     *
     * @param array|mixed $configurations configurations indexed by configuration key
     *
     * @return array|mixed the filtered configurations (non-array input is passed through)
     */
    protected function filterUnallowedConfigurations($configurations)
    {
        // Nothing to filter: either no restriction is active or there is no
        // configuration array at all (the caller checks for that case itself).
        if (count($this->allowedConfigrations) === 0 || ! is_array($configurations)) {
            return $configurations;
        }

        foreach (array_keys($configurations) as $confKey) {
            // Loose comparison is kept on purpose: numeric array keys are integers
            // in PHP while the allowed list may contain the same keys as strings.
            if ( ! in_array($confKey, $this->allowedConfigrations)) {
                unset($configurations[$confKey]);
            }
        }

        return $configurations;
    }
137
138
    /**
     * Adds a page to the crawler queue by uid and sets the timestamp at which
     * the page should be crawled.
     *
     * The page record is loaded, the crawler configurations for it are resolved
     * and filtered by the allowed configurations, and each remaining
     * configuration is handed to the crawler for queueing.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     *
     * @return void
     */
    public function addPageToQueueTimed($uid, $time)
    {
        $pageId      = (int)$uid;
        $scheduledAt = (int)$time;

        $crawler        = $this->findCrawler();
        $pageRepository = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
        $pageRow        = $pageRepository->getPage($pageId);
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageRow));

        // Passed to the crawler on every call; kept as variables because they are shared across iterations
        $downloadUrls   = array();
        $duplicateTrack = array();

        if ( ! is_array($configurations)) {
            // no configuration found
            return;
        }

        foreach ($configurations as $configuration) {
            // enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;

            $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());
            $crawler->urlListFromUrlArray(
                $configuration,
                $pageRow,
                $scheduledAt,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                $procInstructionKeys
            );

            // reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }
181
182
    /**
     * Counts all entries in the database which are scheduled for a given page id
     * and a schedule timestamp.
     *
     * Both arguments are cast to integers before they are placed into the SQL
     * condition, so non-numeric input can neither break nor extend the query.
     *
     * @param int $page_uid
     * @param int $schedule_timestamp
     *
     * @return int
     */
    protected function countEntriesInQueueForPageByScheduletime($page_uid, $schedule_timestamp)
    {
        $pageId    = (int)$page_uid;
        $scheduled = (int)$schedule_timestamp;

        if ($scheduled === 0) {
            // untimed elements need an exec_time of 0 because they can occur multiple times
            $where = 'page_id=' . $pageId . ' AND exec_time = 0 AND scheduled=' . $scheduled;
        } else {
            // timed elements have a fixed schedule time; if a record with this time exists
            // it is either queued for the future or was queued in the past and has
            // therefore already been processed
            $where = 'page_id=' . $pageId . ' AND scheduled=' . $scheduled;
        }

        $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('count(*) as cnt', 'tx_crawler_queue', $where);
        $row    = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

        return intval($row['cnt']);
    }
208
209
    /**
     * Determines if a page is queued.
     *
     * @param int|string $uid uid of the page; must be interpretable as an integer
     * @param bool $unprocessed_only unless false, only entries with exec_time = 0 are counted
     * @param bool $timed_only unless false, only entries with scheduled != 0 are counted
     * @param int|bool $timestamp unless false, only entries scheduled for exactly this timestamp are counted
     *
     * @return bool true if at least one matching queue entry exists
     *
     * @throws \InvalidArgumentException if $uid cannot be interpreted as an integer
     */
    public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
    {
        // Reject values that are NOT integer-like; valid uids must pass
        if ( ! MathUtility::canBeInterpretedAsInteger($uid)) {
            throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
        }

        $whereClause = 'page_id = ' . (integer)$uid;

        if (false !== $unprocessed_only) {
            $whereClause .= ' AND exec_time = 0';
        }

        if (false !== $timed_only) {
            $whereClause .= ' AND scheduled != 0';
        }

        if (false !== $timestamp) {
            $whereClause .= ' AND scheduled = ' . (integer)$timestamp;
        }

        $count = $GLOBALS['TYPO3_DB']->exec_SELECTcountRows(
            '*',
            'tx_crawler_queue',
            $whereClause
        );

        // exec_SELECTcountRows yields false on failure, which counts as "not queued"
        return false !== $count && $count > 0;
    }
253
254
    /**
     * Returns the latest crawl timestamp (highest "scheduled" value) for a page.
     *
     * @param int $uid uid of the page
     * @param bool $future_crawldates_only only consider entries scheduled after the current time
     * @param bool $unprocessed_only only consider entries with exec_time = 0
     *
     * @return int the latest scheduled timestamp, or 0 if no matching entry exists
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid   = intval($uid);
        $query = 'max(scheduled) as latest';
        $where = ' page_id = ' . $uid;

        if ($future_crawldates_only) {
            $where .= ' AND scheduled > ' . time();
        }

        if ($unprocessed_only) {
            $where .= ' AND exec_time = 0';
        }

        $rs  = $GLOBALS['TYPO3_DB']->exec_SELECTquery($query, 'tx_crawler_queue', $where);
        $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($rs);

        // An aggregate over zero rows still yields one row whose value is NULL, and
        // database values arrive as strings; normalise both to the documented int.
        if ($row && isset($row['latest'])) {
            return (int)$row['latest'];
        }

        return 0;
    }
286
287
    /**
     * Returns the crawl history of a page: when it has been scheduled for
     * crawling and at what time the scheduled crawl has been executed. The
     * result also contains items that are scheduled but have not been crawled yet.
     *
     * @param int $uid uid of the page
     * @param int|bool $limit maximum number of rows; false, 0 or a non-positive value means "no limit"
     *
     * @return array rows of the crawl history, each holding the selected columns
     *               scheduled, exec_time and set_id
     */
    public function getCrawlHistoryForPage($uid, $limit = false)
    {
        $uid = intval($uid);

        // A LIMIT value has to be a plain integer. Quoting it as a string would
        // turn the default "false" into the non-empty string '' and any number
        // into a quoted literal, neither of which is a valid LIMIT clause.
        $limit       = (int)$limit;
        $limit_query = ($limit > 0) ? (string)$limit : '';

        $query = 'scheduled, exec_time, set_id';
        $where = ' page_id = ' . $uid;

        // Empty strings (not null) mean "no GROUP BY / ORDER BY"
        $rows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows($query, 'tx_crawler_queue', $where, '', '', $limit_query);

        return $rows;
    }
311
312
    /**
     * Fetches all queue entries that have not been processed yet
     * (exec_time = 0), ordered by page id and schedule time.
     *
     * @return array
     */
    public function getUnprocessedItems()
    {
        return $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'exec_time = 0',
            '',
            'page_id, scheduled'
        );
    }
325
326
    /**
     * Returns the number of unprocessed items (exec_time = 0) in the crawler queue.
     *
     * @return int number of unprocessed items in the queue
     */
    public function countUnprocessedItems()
    {
        $query = 'count(page_id) as anz';
        $where = 'exec_time = 0';
        $rs    = $GLOBALS['TYPO3_DB']->exec_SELECTquery($query, 'tx_crawler_queue', $where);
        $row   = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($rs);

        // Database values arrive as strings, and a failed fetch gives no row at all;
        // always hand back an integer.
        return ($row && isset($row['anz'])) ? (int)$row['anz'] : 0;
    }
340
341
    /**
     * Checks whether a page is in the queue, intended for entries that are
     * timed for a date when they should be crawled.
     *
     * NOTE(review): the call below does not set the $timed_only flag of
     * isPageInQueue(), so untimed entries are matched as well; confirm whether
     * that is intended.
     *
     * @param int $uid uid of the page
     * @param boolean $show_unprocessed only respect unprocessed pages
     *
     * @return boolean
     */
    public function isPageInQueueTimed($uid, $show_unprocessed = true)
    {
        $pageId = (int)$uid;

        return $this->isPageInQueue($pageId, $show_unprocessed);
    }
356
357
    /**
     * Reads the processing instructions registered for the crawler in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
     *
     * @return array the registered processing instructions, or an empty array if none are set
     */
    private function getCrawlerProcInstructions()
    {
        $registered = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions']
            : array();

        return $registered;
    }
371
372
    /**
     * Deletes the queue entry with the given queue id.
     *
     * @param int $qid queue id of the entry to remove
     *
     * @return void
     */
    public function removeQueueEntrie($qid)
    {
        // The id is cast to integer before it is placed into the condition
        $condition = ' qid=' . (int)$qid;

        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', $condition);
    }
384
385
    /**
     * Returns overall queue statistics as reported by the queue repository.
     *
     * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>);
     */
    public function getQueueStatistics()
    {
        $repository = $this->getQueueRepository();

        $assignedPending = $repository->countAllAssignedPendingItems();
        $pending         = $repository->countAllPendingItems();

        return array(
            'assignedButUnprocessed' => $assignedPending,
            'unprocessed'            => $pending
        );
    }
399
400
    /**
     * Returns the queue repository, creating it on first access.
     *
     * @return \tx_crawler_domain_queue_repository queue repository
     */
    protected function getQueueRepository()
    {
        if ($this->queueRepository instanceof \tx_crawler_domain_queue_repository) {
            return $this->queueRepository;
        }

        $this->queueRepository = new \tx_crawler_domain_queue_repository();

        return $this->queueRepository;
    }
415
416
    /**
     * Returns queue statistics grouped by configuration, with the total number
     * of queue entries per configuration merged in under the key 'total'.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
     */
    public function getQueueStatisticsByConfiguration()
    {
        $repository = $this->getQueueRepository();

        $statistics = $repository->countPendingItemsGroupedByConfigurationKey();
        $setIds     = $repository->getSetIdWithUnprocessedEntries();
        $totals     = $repository->getTotalQueueEntriesByConfiguration($setIds);

        if ( ! is_array($statistics)) {
            // Nothing to merge into; hand the repository result back unchanged
            return $statistics;
        }

        // "merge" arrays; written by index so that no reference to the last
        // element survives the loop
        foreach (array_keys($statistics) as $index) {
            $configuration = $statistics[$index]['configuration'];

            // A configuration without a reported total gets 0 instead of
            // triggering an undefined-index notice
            $statistics[$index]['total'] = isset($totals[$configuration]) ? $totals[$configuration] : 0;
        }

        return $statistics;
    }
438
439
    /**
     * Returns the number of active processes as reported by the process repository.
     *
     * @return int
     * @author Fabrizio Branca <[email protected]>
     * @since 2009-09-03
     */
    public function getActiveProcessesCount()
    {
        $repository  = new \tx_crawler_domain_process_repository();
        $activeCount = $repository->countActive();

        return $activeCount;
    }
454
455
    /**
     * Returns the last processed queue entries with all columns.
     *
     * @param int $limit passed to the queue repository as the limit of the selection
     *
     * @return array
     */
    public function getLastProcessedQueueEntries($limit)
    {
        $repository = $this->getQueueRepository();

        return $repository->getLastProcessedEntries('*', $limit);
    }
466
467
    /**
     * Returns the current crawling speed in pages per minute.
     *
     * The speed is derived from the timestamps of the last processed entries:
     * starting at the first entry, timestamps are followed as long as the gap
     * to the previous one (initially the current time) is at most 60 seconds.
     * NOTE(review): this presumably expects the timestamps newest first -
     * confirm against the queue repository.
     *
     * @return float|false pages per minute, or false if there is not enough information
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // not enough information
            return false;
        }

        $tooOldDelta = 60; // time between two entries is "too old"

        $compareValue = time();
        $startTime    = $lastProcessedEntries[0];

        $pages = 0;

        // foreach replaces the each()/list() construct, which is deprecated as of
        // PHP 7.2 and removed in PHP 8; the key was never used
        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // not enough information
            return false;
        }

        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time                           = $startTime - $oldestTimestampThatIsNotTooOld;

        if ($time == 0) {
            // all considered entries share one timestamp: no time span to derive
            // a speed from (and dividing by it would fail)
            return false;
        }

        return $pages / ($time / 60);
    }
509
510
    /**
     * Collects performance data for the time span between $start and $end,
     * split into slots of $resolution seconds.
     *
     * For each slot the per-process data is fetched from the queue repository.
     * Speeds are expressed in urls per minute and are only calculated when more
     * than 5 urls were processed; otherwise the speed is reported as 0 (for
     * slots and the total) or left unset (for single processes).
     *
     * @param integer $start start timestamp
     * @param integer $end end timestamp
     * @param integer $resolution slot length; must be greater than zero
     *
     * @return array data with the keys urlcount, start, end, duration, slots and speed
     *
     * @throws \Exception if $end is not after $start or $resolution is not positive
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = array();

        $data['urlcount'] = 0;
        $data['start']    = $start;
        $data['end']      = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new \Exception('End timestamp must be after start timestamp');
        }

        if ($resolution <= 0) {
            // a non-positive step would never advance the slot loop below
            throw new \Exception('Resolution must be greater than zero');
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd  = min($slotStart + $resolution - 1, $end);
            $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            // Written by key so that no reference to the last element outlives the loop
            foreach ($slotData as $processId => $processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    $slotData[$processId]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = array(
                'amountProcesses' => count($slotData),
                'urlcount'        => $slotUrlCount,
                'processes'       => $slotData,
            );

            // The slot duration has to be computed before dividing by the url count
            // (division binds tighter than subtraction); a zero-length slot gets speed 0
            $slotDuration = $slotEnd - $slotStart;
            if ($slotUrlCount > 5 && $slotDuration > 0) {
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }
571
572
    /**
     * Thin wrapper around MathUtility::forceIntegerInRange(), kept so that
     * callers have a single entry point for forcing a value into an integer range.
     *
     * @param integer $value
     * @param integer $min
     * @param integer $max
     * @param integer $default
     *
     * @return integer
     */
    public static function forceIntegerInRange($value, $min, $max = 2000000, $default = 0)
    {
        return MathUtility::forceIntegerInRange($value, $min, $max, $default);
    }
588
589
    /**
     * Thin wrapper around MathUtility::canBeInterpretedAsInteger(), kept so
     * that callers have a single entry point for testing integer values.
     *
     * @param mixed $value value to test
     *
     * @return bool
     */
    public static function canBeInterpretedAsInteger($value)
    {
        return MathUtility::canBeInterpretedAsInteger($value);
    }
602
}
603