1
|
|
|
<?php |
2
|
|
|
namespace AOE\Crawler\Api; |
3
|
|
|
|
4
|
|
|
/*************************************************************** |
5
|
|
|
* Copyright notice |
6
|
|
|
* |
7
|
|
|
* (c) 2018 AOE GmbH <[email protected]> |
8
|
|
|
* |
9
|
|
|
* All rights reserved |
10
|
|
|
* |
11
|
|
|
* This script is part of the TYPO3 project. The TYPO3 project is |
12
|
|
|
* free software; you can redistribute it and/or modify |
13
|
|
|
* it under the terms of the GNU General Public License as published by |
14
|
|
|
* the Free Software Foundation; either version 3 of the License, or |
15
|
|
|
* (at your option) any later version. |
16
|
|
|
* |
17
|
|
|
* The GNU General Public License can be found at |
18
|
|
|
* http://www.gnu.org/copyleft/gpl.html. |
19
|
|
|
* |
20
|
|
|
* This script is distributed in the hope that it will be useful, |
21
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
22
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23
|
|
|
* GNU General Public License for more details. |
24
|
|
|
* |
25
|
|
|
* This copyright notice MUST APPEAR in all copies of the script! |
26
|
|
|
***************************************************************/ |
27
|
|
|
|
28
|
|
|
use AOE\Crawler\Controller\CrawlerController; |
29
|
|
|
use AOE\Crawler\Domain\Repository\ProcessRepository; |
30
|
|
|
use AOE\Crawler\Domain\Repository\QueueRepository; |
31
|
|
|
use TYPO3\CMS\Core\Database\ConnectionPool; |
32
|
|
|
use TYPO3\CMS\Core\Database\Query\QueryBuilder; |
33
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
34
|
|
|
use TYPO3\CMS\Core\Utility\MathUtility; |
35
|
|
|
use TYPO3\CMS\Extbase\Object\ObjectManager; |
36
|
|
|
use TYPO3\CMS\Frontend\Page\PageRepository; |
37
|
|
|
|
38
|
|
|
/** |
39
|
|
|
* Class CrawlerApi |
40
|
|
|
* |
41
|
|
|
* @package AOE\Crawler\Api |
42
|
|
|
*/ |
43
|
|
|
class CrawlerApi |
44
|
|
|
{ |
45
|
|
|
/** |
46
|
|
|
* @var CrawlerController |
47
|
|
|
*/ |
48
|
|
|
private $crawlerController; |
49
|
|
|
|
50
|
|
|
/** |
51
|
|
|
* @var QueueRepository |
52
|
|
|
*/ |
53
|
|
|
protected $queueRepository; |
54
|
|
|
|
55
|
|
|
/** |
56
|
|
|
* @var array configuration keys the queueing is limited to; an empty array means no restriction
57
|
|
|
*/ |
58
|
|
|
protected $allowedConfigurations = []; |
59
|
|
|
|
60
|
|
|
/** |
61
|
|
|
* @var QueryBuilder |
62
|
|
|
*/ |
63
|
|
|
protected $queryBuilder; |
64
|
|
|
|
65
|
|
|
/** |
66
|
|
|
* @var string |
67
|
|
|
*/ |
68
|
|
|
protected $tableName = 'tx_crawler_queue'; |
69
|
|
|
|
70
|
11 |
|
/**
 * Resolves the crawler controller through the Extbase object manager
 * and stores it for later use by this facade.
 */
public function __construct()
{
    /** @var ObjectManager $extbaseObjectManager */
    $extbaseObjectManager = GeneralUtility::makeInstance(ObjectManager::class);
    $controller = $extbaseObjectManager->get(CrawlerController::class);

    $this->crawlerController = $controller;
}
75
|
|
|
|
76
|
|
|
/**
 * Overwrites the set id of the wrapped crawler.
 *
 * Each crawler run has a set id; this facade method casts the given
 * value to an integer and assigns it to the crawler object.
 *
 * @param int $id the set id to use
 */
public function overwriteSetId($id)
{
    $crawler = $this->findCrawler();
    $crawler->setID = (int)$id;
}
86
|
|
|
|
87
|
|
|
/**
 * Restricts the configuration selection to the given set of
 * configuration keys.
 *
 * @param array $allowedConfigurations keys of the configurations that may be used
 */
public function setAllowedConfigurations(array $allowedConfigurations)
{
    // Replaces any previously stored restriction.
    $this->allowedConfigurations = $allowedConfigurations;
}
97
|
|
|
|
98
|
|
|
/**
 * Returns the configuration keys the selection is currently limited to.
 *
 * @return array empty when no restriction has been set
 */
public function getAllowedConfigurations()
{
    $allowed = $this->allowedConfigurations;

    return $allowed;
}
105
|
|
|
|
106
|
|
|
/**
 * Returns the set id of the wrapped crawler.
 *
 * @return int
 */
public function getSetId()
{
    $crawler = $this->findCrawler();

    return $crawler->setID;
}
115
|
|
|
|
116
|
|
|
/**
 * Returns the crawler controller used by this facade.
 *
 * When no controller object is stored yet, a new one is created and
 * receives a set id derived from the current microtime.
 *
 * @return CrawlerController Instance of the crawler lib
 *
 * @throws \Exception when no crawler object is available
 */
protected function findCrawler()
{
    if (!is_object($this->crawlerController)) {
        $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
        $this->crawlerController->setID = GeneralUtility::md5int(microtime());
    }

    if (!is_object($this->crawlerController)) {
        throw new \Exception('no crawler object', 1512659759);
    }

    return $this->crawlerController;
}
136
|
|
|
|
137
|
|
|
/**
 * Adds a page to the crawler queue by its uid, without a schedule time.
 *
 * @param int $uid uid of the page
 */
public function addPageToQueue($uid)
{
    // Non-timed elements are added with timestamp 0.
    $this->addPageToQueueTimed((int)$uid, 0);
}
148
|
|
|
|
149
|
|
|
/**
 * Removes every configuration whose key is not part of the allowed
 * configurations. Without any allowed configurations set, the input is
 * returned unchanged.
 *
 * @param array $configurations configurations indexed by configuration key
 *
 * @return array the configurations that passed the filter
 */
protected function filterUnallowedConfigurations($configurations)
{
    if (count($this->allowedConfigurations) === 0) {
        // No restriction configured: everything is allowed.
        return $configurations;
    }

    foreach ($configurations as $configurationKey => $unusedConfiguration) {
        $isAllowed = in_array($configurationKey, $this->allowedConfigurations);
        if (!$isAllowed) {
            // Drop the configuration that does not match the current selection.
            unset($configurations[$configurationKey]);
        }
    }

    return $configurations;
}
168
|
|
|
|
169
|
|
|
/**
 * Adds a page to the crawler queue by uid and sets the timestamp at
 * which the page should be crawled.
 *
 * The page row is loaded, the crawler resolves the URL configurations
 * for it, and every configuration that passes the allowed-configuration
 * filter is handed to the crawler for queueing.
 *
 * @param int $uid page id
 * @param int $time schedule timestamp (0 for non-timed entries)
 */
public function addPageToQueueTimed($uid, $time)
{
    $pageUid = (int)$uid;
    $scheduleTime = (int)$time;

    $crawler = $this->findCrawler();
    $pageRow = GeneralUtility::makeInstance(PageRepository::class)->getPage($pageUid);
    $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageRow));

    $downloadUrls = [];
    $duplicateTrack = [];

    if (!is_array($configurations)) {
        // No configuration found, nothing to queue.
        return;
    }

    foreach ($configurations as $configuration) {
        // Enable inserting of entries.
        $crawler->registerQueueEntriesInternallyOnly = false;

        // NOTE(review): the meaning of the literals 300, true and false is
        // defined by CrawlerController::urlListFromUrlArray - confirm there.
        $procInstructionKeys = array_keys($this->getCrawlerProcInstructions());
        $crawler->urlListFromUrlArray(
            $configuration,
            $pageRow,
            $scheduleTime,
            300,
            true,
            false,
            $duplicateTrack,
            $downloadUrls,
            $procInstructionKeys
        );

        // Reset the queue because the entries have been written to the db.
        unset($crawler->queueEntries);
    }
}
211
|
|
|
|
212
|
|
|
/**
 * Counts all entries in the database which are scheduled for a given
 * page id and a schedule timestamp.
 *
 * Un-timed entries (schedule timestamp 0) can occur multiple times, so
 * only those that have not been executed yet (exec_time = 0) are counted.
 * Timed entries have a fixed schedule time; a record with this time is
 * either queued for the future or was queued in the past and already
 * processed, so it is counted regardless of exec_time.
 *
 * @param int $page_uid
 * @param int $schedule_timestamp
 *
 * @return int number of matching queue entries
 */
protected function countEntriesInQueueForPageByScheduleTime($page_uid, $schedule_timestamp)
{
    $page_uid = (int)$page_uid;
    $schedule_timestamp = (int)$schedule_timestamp;

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $queryBuilder
        ->count('*')
        ->from($this->tableName)
        ->where(
            $queryBuilder->expr()->eq(
                'page_id',
                $queryBuilder->createNamedParameter($page_uid, \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->eq(
                'scheduled',
                $queryBuilder->createNamedParameter($schedule_timestamp, \PDO::PARAM_INT)
            )
        );

    if ($schedule_timestamp === 0) {
        $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
    }

    // A COUNT(*) query yields a single row holding the count; rowCount()
    // would report the number of result rows instead of the counted value.
    return (int)$queryBuilder->execute()->fetchColumn(0);
}
248
|
|
|
|
249
|
|
|
/**
 * Determines if a page is queued.
 *
 * @param int $uid uid of the page
 * @param bool $unprocessed_only only consider entries that have not been executed yet (exec_time = 0)
 * @param bool $timed_only only consider entries with a schedule time (scheduled != 0)
 * @param int|bool $timestamp when not false, only consider entries scheduled for exactly this timestamp
 *
 * @return bool true when at least one matching queue entry exists
 *
 * @throws \InvalidArgumentException when $uid cannot be interpreted as an integer
 *
 * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
 */
public function isPageInQueue($uid, $unprocessed_only = true, $timed_only = false, $timestamp = false)
{
    if (!MathUtility::canBeInterpretedAsInteger($uid)) {
        throw new \InvalidArgumentException('Invalid parameter type', 1468931945);
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $queryBuilder
        ->count('*')
        ->from($this->tableName)
        ->where(
            $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
        );

    if (false !== $unprocessed_only) {
        $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
    }

    if (false !== $timed_only) {
        $queryBuilder->andWhere($queryBuilder->expr()->neq('scheduled', 0));
    }

    if (false !== $timestamp) {
        // Restrict to the requested schedule time. This must be an equality
        // check; "not equal" would select every other schedule time instead.
        $queryBuilder->andWhere(
            $queryBuilder->expr()->eq(
                'scheduled',
                $queryBuilder->createNamedParameter($timestamp, \PDO::PARAM_INT)
            )
        );
    }

    $count = $queryBuilder->execute()->fetchColumn(0);

    return false !== $count && $count > 0;
}
297
|
|
|
|
298
|
|
|
/**
 * Returns the latest crawl timestamp (highest "scheduled" value) of a page.
 *
 * @param int $uid uid of the page
 * @param bool $future_crawldates_only only consider entries scheduled after the current time
 * @param bool $unprocessed_only only consider entries that have not been executed yet (exec_time = 0)
 *
 * @return int the latest schedule timestamp, or 0 when none was found
 */
public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
{
    $pageUid = (int)$uid;

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $queryBuilder
        ->from($this->tableName)
        ->selectLiteral('max(scheduled) as latest')
        ->where(
            $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($pageUid))
        );

    if ($future_crawldates_only) {
        $queryBuilder->andWhere($queryBuilder->expr()->gt('scheduled', time()));
    }

    if ($unprocessed_only) {
        $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
    }

    $row = $queryBuilder->execute()->fetch(0);

    // An empty aggregate result is reported as 0.
    return $row['latest'] ?: 0;
}
340
|
|
|
|
341
|
|
|
/**
 * Returns the crawl history of a page: when the page has been scheduled
 * for crawling and at what time the scheduled crawl has been executed.
 * The result also contains items that are scheduled but have not been
 * crawled yet.
 *
 * @param int $uid uid of the page
 * @param int $limit maximum number of rows to return, 0 for no limit
 *
 * @return array queue rows with the columns scheduled, exec_time and set_id
 */
public function getCrawlHistoryForPage($uid, $limit = 0)
{
    $pageUid = (int)$uid;
    $maxResults = (int)$limit;

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $queryBuilder
        ->from($this->tableName)
        ->select('scheduled', 'exec_time', 'set_id')
        ->where(
            $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT))
        );

    if ($maxResults) {
        $queryBuilder->setMaxResults($maxResults);
    }

    $result = $queryBuilder->execute();

    return $result->fetchAll();
}
369
|
|
|
|
370
|
|
|
/**
 * Returns all unprocessed items (exec_time = 0) of the crawler queue,
 * ordered by page id and schedule time.
 *
 * @return array the complete queue rows
 *
 * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
 */
public function getUnprocessedItems()
{
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $isUnprocessed = $queryBuilder->expr()->eq('exec_time', 0);

    $queryBuilder
        ->select('*')
        ->from($this->tableName)
        ->where($isUnprocessed)
        ->orderBy('page_id')
        ->addOrderBy('scheduled');

    $result = $queryBuilder->execute();

    return $result->fetchAll();
}
391
|
|
|
|
392
|
|
|
/**
 * Returns the number of unprocessed items (exec_time = 0) in the
 * crawler queue.
 *
 * @return mixed first column of the count query, i.e. the number of unprocessed items
 *
 * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
 */
public function countUnprocessedItems()
{
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $isUnprocessed = $queryBuilder->expr()->eq('exec_time', 0);

    $queryBuilder
        ->count('page_id')
        ->from($this->tableName)
        ->where($isUnprocessed);

    $result = $queryBuilder->execute();

    return $result->fetchColumn(0);
}
411
|
|
|
|
412
|
|
|
/**
 * Checks whether a page is in the queue, delegating to isPageInQueue().
 *
 * NOTE(review): the method name suggests that only timed entries should
 * match, but the $timed_only argument of isPageInQueue() is not passed,
 * so un-timed entries match as well - confirm whether this is intended.
 *
 * @param int $uid uid of the page
 * @param bool $show_unprocessed only respect unprocessed pages
 *
 * @return bool
 *
 * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
 */
public function isPageInQueueTimed($uid, $show_unprocessed = true)
{
    $pageUid = (int)$uid;
    $isQueued = $this->isPageInQueue($pageUid, $show_unprocessed);

    return $isQueued;
}
429
|
|
|
|
430
|
|
|
/**
 * Reads the processing instructions registered for the crawler in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'].
 *
 * @return array the registered processing instructions, or an empty array when none are set
 */
private function getCrawlerProcInstructions()
{
    if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
        return [];
    }

    return $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'];
}
443
|
|
|
|
444
|
|
|
/**
 * Removes the queue entry with the given queue id.
 *
 * @param int $qid id of the queue entry to delete
 *
 * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
 */
public function removeQueueEntrie($qid)
{
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
    $queryBuilder
        // The table is passed to delete() directly; a DELETE statement is
        // not built from a separate from() call.
        ->delete($this->tableName)
        ->where(
            $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($qid, \PDO::PARAM_INT))
        )
        ->execute();
}
462
|
|
|
|
463
|
|
|
/**
 * Returns the overall queue statistics as reported by the queue repository.
 *
 * @return array array('assignedButUnprocessed' => <>, 'unprocessed' => <>)
 */
public function getQueueStatistics()
{
    $assignedButUnprocessed = $this->getQueueRepository()->countAllAssignedPendingItems();
    $unprocessed = $this->getQueueRepository()->countAllPendingItems();

    return [
        'assignedButUnprocessed' => $assignedButUnprocessed,
        'unprocessed' => $unprocessed
    ];
}
477
|
|
|
|
478
|
|
|
/**
 * Returns the queue repository, creating it on first use.
 *
 * @return QueueRepository
 */
protected function getQueueRepository()
{
    if ($this->queueRepository instanceof QueueRepository) {
        return $this->queueRepository;
    }

    $this->queueRepository = new QueueRepository();

    return $this->queueRepository;
}
491
|
|
|
|
492
|
|
|
/**
 * Returns the pending-item statistics grouped by configuration, with
 * each entry extended by a 'total' value taken from the totals of the
 * sets that still have unprocessed entries.
 *
 * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>, 'total' => <>)
 */
public function getQueueStatisticsByConfiguration()
{
    $statistics = $this->getQueueRepository()->countPendingItemsGroupedByConfigurationKey();
    $setIds = $this->getQueueRepository()->getSetIdWithUnprocessedEntries();
    $totals = $this->getQueueRepository()->getTotalQueueEntriesByConfiguration($setIds);

    // "Merge" the totals into the statistics, matched by configuration.
    foreach ($statistics as $index => $entry) {
        $statistics[$index]['total'] = $totals[$entry['configuration']];
    }

    return $statistics;
}
512
|
|
|
|
513
|
|
|
/**
 * Returns the number of active processes as reported by a newly
 * created process repository.
 *
 * @return int
 */
public function getActiveProcessesCount()
{
    return (new ProcessRepository())->countActive();
}
526
|
|
|
|
527
|
|
|
/**
 * Returns the last processed queue entries, with all columns selected.
 *
 * @param int $limit passed on to the queue repository as the limit
 *
 * @return array
 */
public function getLastProcessedQueueEntries($limit)
{
    $queueRepository = $this->getQueueRepository();
    $entries = $queueRepository->getLastProcessedEntries('*', $limit);

    return $entries;
}
538
|
|
|
|
539
|
|
|
/**
 * Returns the current crawling speed.
 *
 * Starting at the first of the last processed timestamps, entries are
 * counted as long as the gap to the previous value (initially the
 * current time) does not exceed 60 seconds. The speed is the number of
 * counted pages divided by the covered time span in minutes.
 *
 * NOTE(review): this assumes the repository returns the timestamps
 * ordered from newest to oldest - confirm against QueueRepository.
 *
 * @return float|int|false speed in pages per minute, or false when there
 *                         is not enough information (fewer than 10 recent
 *                         entries, or no measurable time span)
 */
public function getCurrentCrawlingSpeed()
{
    $lastProcessedEntries = $this->getQueueRepository()->getLastProcessedEntriesTimestamps();

    if (count($lastProcessedEntries) < 10) {
        // not enough information
        return false;
    }

    $tooOldDelta = 60; // time between two entries is "too old"

    $compareValue = time();
    $startTime = $lastProcessedEntries[0];
    $pages = 0;

    foreach ($lastProcessedEntries as $timestamp) {
        if ($compareValue - $timestamp > $tooOldDelta) {
            break;
        }
        $compareValue = $timestamp;
        $pages++;
    }

    if ($pages < 10) {
        // not enough information
        return false;
    }

    $oldestTimestampThatIsNotTooOld = $compareValue;
    $time = $startTime - $oldestTimestampThatIsNotTooOld;

    if ($time <= 0) {
        // All counted entries share one timestamp: no time span to divide
        // by, so the speed cannot be determined.
        return false;
    }

    return $pages / ($time / 60);
}
581
|
|
|
|
582
|
|
|
/**
 * Collects performance data for the period between $start and $end,
 * split into slots of $resolution seconds.
 *
 * For every slot the per-process data is read from the queue repository.
 * Each process with more than 5 urls and a positive duration gets a
 * 'speed' value (urls per minute). The result holds 'urlcount', 'start',
 * 'end', 'duration', 'speed' and 'slots', the latter indexed by slot end
 * timestamp with 'amountProcesses', 'urlcount', 'processes' and 'speed'.
 *
 * @param int $start start timestamp
 * @param int $end end timestamp
 * @param int $resolution slot length in seconds, must be positive
 *
 * @return array data
 *
 * @throws \Exception when $end is not after $start or $resolution is not positive
 */
public function getPerformanceData($start, $end, $resolution)
{
    $data = [];

    $data['urlcount'] = 0;
    $data['start'] = $start;
    $data['end'] = $end;
    $data['duration'] = $data['end'] - $data['start'];

    if ($data['duration'] < 1) {
        throw new \Exception('End timestamp must be after start timestamp', 1512659945);
    }

    if ($resolution <= 0) {
        // A non-positive step would never advance the slot loop below.
        throw new \InvalidArgumentException('Resolution must be greater than zero', 1548892800);
    }

    for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
        $slotEnd = min($slotStart + $resolution - 1, $end);
        $slotData = $this->getQueueRepository()->getPerformanceData($slotStart, $slotEnd);

        $slotUrlCount = 0;
        // Write back by key rather than by reference, so no reference
        // survives into the next slot iteration.
        foreach ($slotData as $processId => $processData) {
            $duration = $processData['end'] - $processData['start'];
            if ($processData['urlcount'] > 5 && $duration > 0) {
                $slotData[$processId]['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
            }
            $slotUrlCount += $processData['urlcount'];
        }

        $data['urlcount'] += $slotUrlCount;

        $data['slots'][$slotEnd] = [
            'amountProcesses' => count($slotData),
            'urlcount' => $slotUrlCount,
            'processes' => $slotData,
        ];

        // Seconds per url is (slot duration / url count); the parentheses
        // are required because "/" binds tighter than "-".
        $slotDuration = $slotEnd - $slotStart;
        if ($slotUrlCount > 5 && $slotDuration > 0) {
            $data['slots'][$slotEnd]['speed'] = 60 * 1 / ($slotDuration / $slotUrlCount);
        } else {
            $data['slots'][$slotEnd]['speed'] = 0;
        }
    }

    if ($data['urlcount'] > 5) {
        $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
    } else {
        $data['speed'] = 0;
    }

    return $data;
}
642
|
|
|
} |
643
|
|
|
|
This method has been deprecated. The supplier of the class has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the method will be removed from the class and what other method or class to use instead.