1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace AOE\Crawler\Api; |
||
6 | |||
7 | /*************************************************************** |
||
8 | * Copyright notice |
||
9 | * |
||
10 | * (c) 2018 AOE GmbH <[email protected]> |
||
11 | * |
||
12 | * All rights reserved |
||
13 | * |
||
14 | * This script is part of the TYPO3 project. The TYPO3 project is |
||
15 | * free software; you can redistribute it and/or modify |
||
16 | * it under the terms of the GNU General Public License as published by |
||
17 | * the Free Software Foundation; either version 3 of the License, or |
||
18 | * (at your option) any later version. |
||
19 | * |
||
20 | * The GNU General Public License can be found at |
||
21 | * http://www.gnu.org/copyleft/gpl.html. |
||
22 | * |
||
23 | * This script is distributed in the hope that it will be useful, |
||
24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
26 | * GNU General Public License for more details. |
||
27 | * |
||
28 | * This copyright notice MUST APPEAR in all copies of the script! |
||
29 | ***************************************************************/ |
||
30 | |||
31 | use AOE\Crawler\Controller\CrawlerController; |
||
32 | use AOE\Crawler\Domain\Repository\ProcessRepository; |
||
33 | use AOE\Crawler\Domain\Repository\QueueRepository; |
||
34 | use AOE\Crawler\Exception\CrawlerObjectException; |
||
35 | use AOE\Crawler\Exception\TimeStampException; |
||
36 | use TYPO3\CMS\Core\Database\ConnectionPool; |
||
37 | use TYPO3\CMS\Core\Database\Query\QueryBuilder; |
||
38 | use TYPO3\CMS\Core\Domain\Repository\PageRepository; |
||
39 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
40 | use TYPO3\CMS\Extbase\Object\ObjectManager; |
||
41 | |||
42 | /** |
||
43 | * Class CrawlerApi |
||
44 | * |
||
45 | * @package AOE\Crawler\Api |
||
46 | * @deprecated Since v9.2.0 - This class will be removed when dropping support for TYPO3 9LTS and 10LTS |
||
47 | */ |
||
class CrawlerApi
{
    /**
     * Repository used for all queue-related read operations.
     *
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * Configuration keys the selection is limited to.
     * An empty array means "all configurations are allowed".
     *
     * @var array
     */
    protected $allowedConfigurations = [];

    /**
     * @var QueryBuilder
     */
    protected $queryBuilder;

    /**
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';

    /**
     * Lazily created crawler instance, see findCrawler().
     *
     * @var CrawlerController
     */
    protected $crawlerController;

    public function __construct()
    {
        /** @var ObjectManager $objectManager */
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
    }

    /**
     * Each crawler run has a setid, this facade method delegates
     * it to the crawler object.
     *
     * @throws \Exception
     */
    public function overwriteSetId(int $id): void
    {
        $this->findCrawler()->setID = $id;
    }

    /**
     * This method is used to limit the configuration selection to
     * a set of configurations.
     */
    public function setAllowedConfigurations(array $allowedConfigurations): void
    {
        $this->allowedConfigurations = $allowedConfigurations;
    }

    /**
     * @return array
     */
    public function getAllowedConfigurations()
    {
        return $this->allowedConfigurations;
    }

    /**
     * Returns the setID of the crawler.
     *
     * @return int
     */
    public function getSetId()
    {
        return $this->findCrawler()->setID;
    }

    /**
     * Adds a page to the crawler queue by uid.
     *
     * @param int $uid uid
     * @codeCoverageIgnore
     */
    public function addPageToQueue($uid): void
    {
        // Non-timed elements are added with timestamp 0 ("crawl as soon as possible").
        $this->addPageToQueueTimed((int) $uid, 0);
    }

    /**
     * Adds a page to the crawler queue by uid and sets a
     * timestamp when the page should be crawled.
     *
     * @param int $uid pageid
     * @param int $time timestamp
     *
     * @throws \Exception
     */
    public function addPageToQueueTimed($uid, $time): void
    {
        $uid = (int) $uid;
        $time = (int) $time;

        $crawler = $this->findCrawler();
        $pageData = $this->getPageRepository()->getPage($uid, true);
        // filterUnallowedConfigurations() is declared ": array", so no is_array() guard is needed.
        $configurations = $this->filterUnallowedConfigurations($crawler->getUrlsForPageRow($pageData));
        $downloadUrls = [];
        $duplicateTrack = [];

        foreach ($configurations as $cv) {
            // Enable inserting of entries
            $crawler->registerQueueEntriesInternallyOnly = false;
            $crawler->urlListFromUrlArray(
                $cv,
                $pageData,
                $time,
                300,
                true,
                false,
                $duplicateTrack,
                $downloadUrls,
                array_keys($this->getCrawlerProcInstructions())
            );

            // Reset the queue because the entries have been written to the db
            unset($crawler->queueEntries);
        }
    }

    /**
     * Method to return the latest crawl timestamp for a page.
     *
     * @param int $uid uid id of the page
     * @param bool $future_crawldates_only consider only crawl dates scheduled in the future
     * @param bool $unprocessed_only consider only entries that have not been executed yet
     *
     * @return int latest "scheduled" timestamp, or 0 when no matching entry exists
     */
    public function getLatestCrawlTimestampForPage($uid, $future_crawldates_only = false, $unprocessed_only = false)
    {
        $uid = (int) $uid;

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $query = $queryBuilder
            ->from(QueueRepository::TABLE_NAME)
            ->selectLiteral('max(scheduled) as latest')
            ->where(
                // PARAM_INT added for consistency with getCrawlHistoryForPage()
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );

        if ($future_crawldates_only) {
            $query->andWhere(
                $queryBuilder->expr()->gt('scheduled', time())
            );
        }

        if ($unprocessed_only) {
            $query->andWhere(
                $queryBuilder->expr()->eq('exec_time', 0)
            );
        }

        $row = $query->execute()->fetch(0);

        // max() aggregate always yields one row; "latest" is NULL when no entries matched.
        return (int) ($row['latest'] ?? 0);
    }

    /**
     * Returns an array with timestamps when the page has been scheduled for crawling and
     * at what time the scheduled crawl has been executed. The array also contains items that are
     * scheduled but have not been crawled yet.
     *
     * @return array array with the crawl-history of a page => 0 : scheduled time , 1 : executed_time, 2 : set_id
     */
    public function getCrawlHistoryForPage(int $uid, int $limit = 0)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $statement = $queryBuilder
            ->from(QueueRepository::TABLE_NAME)
            ->select('scheduled', 'exec_time', 'set_id')
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($uid, \PDO::PARAM_INT))
            );
        if ($limit) {
            $statement->setMaxResults($limit);
        }

        return $statement->execute()->fetchAll();
    }

    public function getQueueStatistics(): array
    {
        return [
            'assignedButUnprocessed' => $this->queueRepository->countAllAssignedPendingItems(),
            'unprocessed' => $this->queueRepository->countAllPendingItems(),
        ];
    }

    /**
     * Get queue statistics by configuration.
     *
     * @return array array of array('configuration' => <>, 'assignedButUnprocessed' => <>, 'unprocessed' => <>)
     * @codeCoverageIgnore
     */
    public function getQueueStatisticsByConfiguration()
    {
        $statistics = $this->queueRepository->countPendingItemsGroupedByConfigurationKey();
        $setIds = $this->queueRepository->getSetIdWithUnprocessedEntries();
        $totals = $this->queueRepository->getTotalQueueEntriesByConfiguration($setIds);

        // "merge" arrays; default to 0 when no total is recorded for a configuration
        foreach ($statistics as &$value) {
            $value['total'] = $totals[$value['configuration']] ?? 0;
        }
        unset($value);

        return $statistics;
    }

    /**
     * Get active processes count.
     * @codeCoverageIgnore
     */
    public function getActiveProcessesCount(): int
    {
        $processRepository = GeneralUtility::makeInstance(ProcessRepository::class);
        return $processRepository->findAllActive()->count();
    }

    /**
     * @codeCoverageIgnore
     */
    public function getLastProcessedQueueEntries(int $limit): array
    {
        return $this->queueRepository->getLastProcessedEntries($limit);
    }

    /**
     * Get current crawling speed in pages per minute.
     *
     * Derived from the timestamps of the last processed queue entries; returns
     * false when fewer than 10 recent enough entries are available.
     *
     * @return int|float|bool
     * @codeCoverageIgnore
     */
    public function getCurrentCrawlingSpeed()
    {
        $lastProcessedEntries = $this->queueRepository->getLastProcessedEntriesTimestamps();

        if (count($lastProcessedEntries) < 10) {
            // Not enough information
            return false;
        }

        // Gap between two entries considered "too old" to belong to the current run
        $tooOldDelta = 60;

        $compareValue = time();
        $startTime = $lastProcessedEntries[0];

        $pages = 0;

        reset($lastProcessedEntries);
        foreach ($lastProcessedEntries as $timestamp) {
            if ($compareValue - $timestamp > $tooOldDelta) {
                break;
            }
            $compareValue = $timestamp;
            $pages++;
        }

        if ($pages < 10) {
            // Not enough information
            return false;
        }
        $oldestTimestampThatIsNotTooOld = $compareValue;
        $time = $startTime - $oldestTimestampThatIsNotTooOld;

        return $pages / ($time / 60);
    }

    /**
     * Get some performance data.
     *
     * @param integer $start
     * @param integer $end
     * @param integer $resolution
     *
     * @return array data
     *
     * @throws TimeStampException when $end is not after $start
     * @codeCoverageIgnore
     */
    public function getPerformanceData($start, $end, $resolution)
    {
        $data = [];

        $data['urlcount'] = 0;
        $data['start'] = $start;
        $data['end'] = $end;
        $data['duration'] = $data['end'] - $data['start'];

        if ($data['duration'] < 1) {
            throw new TimeStampException('End timestamp must be after start timestamp', 1512659945);
        }

        for ($slotStart = $start; $slotStart < $end; $slotStart += $resolution) {
            $slotEnd = min($slotStart + $resolution - 1, $end);
            $slotData = $this->queueRepository->getPerformanceData($slotStart, $slotEnd);

            $slotUrlCount = 0;
            foreach ($slotData as &$processData) {
                $duration = $processData['end'] - $processData['start'];
                if ($processData['urlcount'] > 5 && $duration > 0) {
                    // Speed in urls per minute: 60 / (seconds per url)
                    $processData['speed'] = 60 * 1 / ($duration / $processData['urlcount']);
                }
                $slotUrlCount += $processData['urlcount'];
            }
            unset($processData);

            $data['urlcount'] += $slotUrlCount;

            $data['slots'][$slotEnd] = [
                'amountProcesses' => count($slotData),
                'urlcount' => $slotUrlCount,
                'processes' => $slotData,
            ];

            if ($slotUrlCount > 5) {
                // Bugfix: the slot duration must be parenthesized before dividing by the
                // url count. The previous "$slotEnd - $slotStart / $slotUrlCount" divided
                // only $slotStart due to operator precedence, yielding a wrong speed.
                $data['slots'][$slotEnd]['speed'] = 60 * 1 / (($slotEnd - $slotStart) / $slotUrlCount);
            } else {
                $data['slots'][$slotEnd]['speed'] = 0;
            }
        }

        if ($data['urlcount'] > 5) {
            $data['speed'] = 60 * 1 / ($data['duration'] / $data['urlcount']);
        } else {
            $data['speed'] = 0;
        }

        return $data;
    }

    /**
     * Method to get an instance of the internal crawler singleton.
     *
     * The instance is created lazily on first use and cached on the object.
     *
     * @return CrawlerController Instance of the crawler lib
     *
     * @throws CrawlerObjectException when no crawler object could be obtained
     */
    protected function findCrawler()
    {
        if (! is_object($this->crawlerController)) {
            $this->crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
            // Each run gets a (pseudo-)unique setID derived from the current time
            $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        }

        if (is_object($this->crawlerController)) {
            return $this->crawlerController;
        }
        throw new CrawlerObjectException('no crawler object', 1512659759);
    }

    /**
     * This method is used to limit the processing instructions to the processing instructions
     * that are allowed.
     */
    protected function filterUnallowedConfigurations(array $configurations): array
    {
        if (count($this->allowedConfigurations) > 0) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->allowedConfigurations, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        return $configurations;
    }

    /**
     * Reads the registered processingInstructions of the crawler.
     */
    private function getCrawlerProcInstructions(): array
    {
        $crawlerProcInstructions = [];
        if (! empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['procInstructions'] as $configuration) {
                $crawlerProcInstructions[$configuration['key']] = $configuration['value'];
            }
        }

        return $crawlerProcInstructions;
    }

    private function getPageRepository(): PageRepository
    {
        return GeneralUtility::makeInstance(ObjectManager::class)->get(PageRepository::class);
    }
}
||
453 |
This function has been deprecated. The supplier of the function has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the function will be removed and what other function to use instead.