1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace AOE\Crawler\Command; |
6
|
|
|
|
7
|
|
|
/*************************************************************** |
8
|
|
|
* Copyright notice |
9
|
|
|
* |
10
|
|
|
* (c) 2019 AOE GmbH <[email protected]> |
11
|
|
|
* |
12
|
|
|
* All rights reserved |
13
|
|
|
* |
14
|
|
|
* This script is part of the TYPO3 project. The TYPO3 project is |
15
|
|
|
* free software; you can redistribute it and/or modify |
16
|
|
|
* it under the terms of the GNU General Public License as published by |
17
|
|
|
* the Free Software Foundation; either version 3 of the License, or |
18
|
|
|
* (at your option) any later version. |
19
|
|
|
* |
20
|
|
|
* The GNU General Public License can be found at |
21
|
|
|
* http://www.gnu.org/copyleft/gpl.html. |
22
|
|
|
* |
23
|
|
|
* This script is distributed in the hope that it will be useful, |
24
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
25
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
26
|
|
|
* GNU General Public License for more details. |
27
|
|
|
* |
28
|
|
|
* This copyright notice MUST APPEAR in all copies of the script! |
29
|
|
|
***************************************************************/ |
30
|
|
|
|
31
|
|
|
use AOE\Crawler\Configuration\ExtensionConfigurationProvider; |
32
|
|
|
use AOE\Crawler\Controller\CrawlerController; |
33
|
|
|
use AOE\Crawler\Crawler; |
34
|
|
|
use AOE\Crawler\Domain\Model\Process; |
35
|
|
|
use AOE\Crawler\Domain\Repository\ProcessRepository; |
36
|
|
|
use AOE\Crawler\Domain\Repository\QueueRepository; |
37
|
|
|
use Symfony\Component\Console\Command\Command; |
38
|
|
|
use Symfony\Component\Console\Input\InputInterface; |
39
|
|
|
use Symfony\Component\Console\Input\InputOption; |
40
|
|
|
use Symfony\Component\Console\Output\OutputInterface; |
41
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
42
|
|
|
use TYPO3\CMS\Extbase\Object\ObjectManager; |
43
|
|
|
|
44
|
|
|
class ProcessQueueCommand extends Command
{
    /**
     * Neutral status: no bit set, nothing was processed.
     *
     * @deprecated since 9.2.5 will be made private in v11.x
     */
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;

    /**
     * Status bit: the queue is not empty after the run.
     *
     * @deprecated since 9.2.5 will be made private in v11.x
     */
    public const CLI_STATUS_REMAIN = 1;

    /**
     * Status bit: (some) queue items were processed.
     *
     * @deprecated since 9.2.5 will be made private in v11.x
     */
    public const CLI_STATUS_PROCESSED = 2;

    /**
     * Status bit: the instance did not finish.
     *
     * @deprecated since 9.2.5 will be made private in v11.x
     */
    public const CLI_STATUS_ABORTED = 4;

    /**
     * Status bit 8. Not referenced anywhere inside this class.
     *
     * @deprecated since 9.2.5 will be made private in v11.x
     */
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;

    /**
     * Never assigned inside this class; getCrawler() falls back to a new instance.
     *
     * @var Crawler|null
     */
    private $crawler;

    /**
     * Never assigned inside this class; getCrawlerController() falls back to
     * GeneralUtility::makeInstance().
     *
     * @var CrawlerController|null
     */
    private $crawlerController;

    /**
     * Never assigned inside this class; getProcessRepository() falls back to
     * GeneralUtility::makeInstance().
     *
     * @var ProcessRepository|null
     */
    private $processRepository;

    /**
     * Never assigned inside this class; getQueueRepository() falls back to
     * GeneralUtility::makeInstance().
     *
     * @var QueueRepository|null
     */
    private $queueRepository;

    /**
     * Id of the current crawler process, created lazily by getProcessId().
     *
     * @var string|null
     */
    private $processId;

    /**
     * Extension configuration, loaded at the start of execute().
     * Keys read by this class: countInARun, sleepAfterFinish, sleepTime, processLimit.
     *
     * @var array
     */
    private $extensionSettings;

    /**
     * Crawler Command - Crawling the URLs from the queue
     *
     * Examples:
     *
     * --- Will trigger the crawler which starts to process the queue entries
     * $ typo3 crawler:processqueue --amount 15 --sleepafter 5 --sleeptime 2
     *
     * The full status bitmask (CLI_STATUS_* constants) is written to the output;
     * the exit code only carries the CLI_STATUS_ABORTED bit.
     *
     * @return int CLI_STATUS_ABORTED (4) when the run was aborted or could not start, 0 otherwise
     */
    public function execute(InputInterface $input, OutputInterface $output)
    {
        $amount = $input->getOption('amount');
        $sleeptime = $input->getOption('sleeptime');
        $sleepafter = $input->getOption('sleepafter');

        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->extensionSettings = $this->getExtensionSettings();

        $result = self::CLI_STATUS_NOTHING_PROCCESSED;

        /** @var QueueRepository $queueRepository */
        $queueRepository = $objectManager->get(QueueRepository::class);
        /** @var ProcessRepository $processRepository */
        $processRepository = $objectManager->get(ProcessRepository::class);

        /** @var Crawler $crawler */
        $crawler = GeneralUtility::makeInstance(Crawler::class);

        if (! $crawler->isDisabled() && $this->checkAndAcquireNewProcess($this->getProcessId())) {
            // getProcessId() is memoised, so this is the id that was just registered.
            $processId = $this->getProcessId();

            // A falsy option value (not given, '' or '0') falls back to the extension configuration.
            $countInARun = $amount ? (int) $amount : (int) $this->extensionSettings['countInARun'];
            $sleepAfterFinish = $sleepafter ? (int) $sleepafter : (int) $this->extensionSettings['sleepAfterFinish'];
            $sleepTime = $sleeptime ? (int) $sleeptime : (int) $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->runProcess($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Throwable $e) {
                // Report the failure but continue, so the cleanup below always runs.
                // NOTE(review): "warning" is not one of Symfony's built-in style tags
                // (info, comment, question, error); unless a custom style is registered
                // the tag is probably printed literally - confirm.
                $output->writeln('<warning>' . get_class($e) . ': ' . $e->getMessage() . '</warning>');
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $processRepository->deleteProcessesWithoutItemsAssigned();
            $processRepository->markRequestedProcessesAsNotActive([$processId]);
            $queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries([$processId]);

            // Count once so the message and the status bit are based on the same number.
            $remainingItems = count($queueRepository->getUnprocessedItems());
            $output->writeln('<info>Unprocessed Items remaining:' . $remainingItems . ' (' . $processId . ')</info>');
            if ($remainingItems > 0) {
                $result |= self::CLI_STATUS_REMAIN;
            }
        } else {
            // Crawler is disabled or no process slot could be acquired.
            $result |= self::CLI_STATUS_ABORTED;
        }

        // This file declares strict_types=1, so the integer bitmask must be cast
        // before it is handed to writeln().
        $output->writeln((string) $result);

        return $result & self::CLI_STATUS_ABORTED;
    }

    /**
     * Registers description, help text and the options amount, sleepafter and sleeptime.
     */
    protected function configure(): void
    {
        $this->setDescription('Trigger the crawler to process the queue entries');

        $this->setHelp(
            'Crawler Command - Crawling the URLs from the queue' . chr(10) . chr(10) .
            '
Examples:
--- Will trigger the crawler which starts to process the queue entries
$ typo3 crawler:processqueue --amount 15 --sleepafter 5 --sleeptime 2
'
        );
        $this->addOption(
            'amount',
            '',
            InputOption::VALUE_OPTIONAL,
            'How many pages should be crawled during that run',
            '0'
        );

        // "sleepafter" ends up in sleep() after the whole batch, hence seconds.
        $this->addOption(
            'sleepafter',
            '',
            InputOption::VALUE_OPTIONAL,
            'Amount of seconds which the system should use to relax after all crawls are done.',
            '0'
        );

        // "sleeptime" ends up in usleep() after every single crawl, hence microseconds.
        $this->addOption(
            'sleeptime',
            '',
            InputOption::VALUE_OPTIONAL,
            'Amount of microseconds which the system should use to relax between crawls'
        );
    }

    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Passed to QueueRepository::fetchRecordsToBeCrawled()
     * @param int $sleepTime Pause after each crawled entry, passed to usleep() (microseconds)
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep() (seconds)
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    private function runProcess(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        // NOTE(review): getCrawlerController() resolves the controller through
        // GeneralUtility::makeInstance() on every call. If CrawlerController is not a
        // singleton, the instance initialised by a hook is not the one used for
        // readUrl() below - confirm.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [] as $objRef) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hookObj = GeneralUtility::makeInstance($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this->getCrawlerController());
            }
        }

        // Clean up the queue
        $this->getQueueRepository()->cleanupQueue();

        // Select entries:
        $records = $this->getQueueRepository()->fetchRecordsToBeCrawled($countInARun);

        if (! empty($records)) {
            $quidList = [];

            foreach ($records as $record) {
                $quidList[] = $record['qid'];
            }

            $processId = $this->getProcessId();

            // Save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->getQueueRepository()->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
            $this->getProcessRepository()->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

            // Abort unless every selected entry could be assigned to this process.
            if ($numberOfAffectedRows !== count($quidList)) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($records as $record) {
                $result |= $this->getCrawlerController()->readUrl($record['qid']);

                $counter++;
                // Just to relax the system
                usleep($sleepTime);

                // If the crawler has been disabled between the start and the current
                // read url, return from the function right away and do NOT mark the
                // process as ended.
                if ($this->getCrawler()->isDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (! $this->getProcessRepository()->isProcessActive($this->getProcessId())) {
                    $result |= self::CLI_STATUS_ABORTED;
                    // possible timeout
                    break;
                }
            }

            sleep($sleepAfterFinish);
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }

    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose TTL lies in the past are treated as orphans; only the
     * remaining ones count against the configured "processLimit".
     *
     * @param string $id Id under which the new process is registered
     * @return bool true when the process was registered, false when the system
     *              process id is unavailable or the process limit is reached
     */
    private function checkAndAcquireNewProcess(string $id): bool
    {
        $returnValue = true;

        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $processCount = 0;
        $orphanProcesses = [];

        $activeProcesses = $this->getProcessRepository()->findAllActive();

        // Judge every process against the same timestamp.
        $now = time();

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $now) {
                $orphanProcesses[] = $process->getProcessId();
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        if ($processCount < (int) $this->extensionSettings['processLimit']) {
            $this->getProcessRepository()->addProcess($id, $systemProcessId);
        } else {
            $returnValue = false;
        }

        // The cleanup runs whether or not a new process was acquired.
        $this->getProcessRepository()->deleteProcessesMarkedAsDeleted();
        $this->getProcessRepository()->markRequestedProcessesAsNotActive($orphanProcesses);
        $this->getQueueRepository()->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcesses);

        return $returnValue;
    }

    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on first use and then reused
     * for the lifetime of this command instance.
     */
    private function getProcessId(): string
    {
        if (! $this->processId) {
            $this->processId = GeneralUtility::shortMD5(microtime(true));
        }
        return $this->processId;
    }

    /**
     * Returns the crawler set on this command, or a new Crawler when none is set.
     */
    private function getCrawler(): Crawler
    {
        return $this->crawler ?? new Crawler();
    }

    /**
     * Returns the controller set on this command, or resolves one via GeneralUtility::makeInstance().
     */
    private function getCrawlerController(): CrawlerController
    {
        return $this->crawlerController ?? GeneralUtility::makeInstance(CrawlerController::class);
    }

    /**
     * Returns the process repository set on this command, or resolves one via GeneralUtility::makeInstance().
     */
    private function getProcessRepository(): ProcessRepository
    {
        return $this->processRepository ?? GeneralUtility::makeInstance(ProcessRepository::class);
    }

    /**
     * Returns the queue repository set on this command, or resolves one via GeneralUtility::makeInstance().
     */
    private function getQueueRepository(): QueueRepository
    {
        return $this->queueRepository ?? GeneralUtility::makeInstance(QueueRepository::class);
    }

    /**
     * Loads the extension configuration through the ExtensionConfigurationProvider.
     */
    private function getExtensionSettings(): array
    {
        return GeneralUtility::makeInstance(ExtensionConfigurationProvider::class)->getExtensionConfiguration();
    }
}
347
|
|
|
|
This class constant has been deprecated. The supplier of the class has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the constant will be removed from the class and what other constant to use instead.