1
|
|
|
<?php |
2
|
|
|
namespace AOE\Crawler\Service; |
3
|
|
|
|
4
|
|
|
/*************************************************************** |
5
|
|
|
* Copyright notice |
6
|
|
|
* |
7
|
|
|
* (c) 2017 AOE GmbH <[email protected]> |
8
|
|
|
* |
9
|
|
|
* All rights reserved |
10
|
|
|
* |
11
|
|
|
* This script is part of the TYPO3 project. The TYPO3 project is |
12
|
|
|
* free software; you can redistribute it and/or modify |
13
|
|
|
* it under the terms of the GNU General Public License as published by |
14
|
|
|
* the Free Software Foundation; either version 3 of the License, or |
15
|
|
|
* (at your option) any later version. |
16
|
|
|
* |
17
|
|
|
* The GNU General Public License can be found at |
18
|
|
|
* http://www.gnu.org/copyleft/gpl.html. |
19
|
|
|
* |
20
|
|
|
* This script is distributed in the hope that it will be useful, |
21
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
22
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23
|
|
|
* GNU General Public License for more details. |
24
|
|
|
* |
25
|
|
|
* This copyright notice MUST APPEAR in all copies of the script! |
26
|
|
|
***************************************************************/ |
27
|
|
|
|
28
|
|
|
use AOE\Crawler\Controller\CrawlerController; |
29
|
|
|
use AOE\Crawler\Domain\Repository\ProcessRepository; |
30
|
|
|
use AOE\Crawler\Domain\Repository\QueueRepository; |
31
|
|
|
use TYPO3\CMS\Core\Utility\GeneralUtility; |
32
|
|
|
|
33
|
|
|
/**
 * Class ProcessService
 *
 * Starts crawler command line processes and supervises them while the
 * crawler queue still contains pending items.
 *
 * @package AOE\Crawler\Service
 */
class ProcessService
{
    /**
     * Value of the "processMaxRunTime" extension setting; added to time()
     * whenever a process deadline is computed.
     *
     * @var int
     */
    private $timeToLive;

    /**
     * Value of the "countInARun" extension setting; the number of unassigned
     * pending items is divided by it to determine how many processes are required.
     *
     * @var int
     */
    private $countInARun;

    /**
     * Value of the "processLimit" extension setting; upper bound for the
     * number of simultaneously active processes.
     *
     * @var int
     */
    private $processLimit;

    /**
     * @var CrawlerController
     */
    private $crawlerObj;

    /**
     * @var QueueRepository
     */
    private $queueRepository;

    /**
     * @var ProcessRepository
     */
    private $processRepository;

    /**
     * Value of the "processVerbose" extension setting; when true, progress
     * information is echoed to the output.
     *
     * @var bool
     */
    private $verbose;

    /**
     * Creates the repositories and the crawler controller and reads the
     * process related extension settings from the controller.
     */
    public function __construct()
    {
        $this->processRepository = new ProcessRepository();
        $this->queueRepository = new QueueRepository();
        $this->crawlerObj = GeneralUtility::makeInstance(CrawlerController::class);

        $settings = $this->crawlerObj->extensionSettings;
        $this->timeToLive = (int) $settings['processMaxRunTime'];
        $this->countInARun = (int) $settings['countInARun'];
        $this->processLimit = (int) $settings['processLimit'];
        // Integer conversion first, so that non-numeric strings are treated as "off".
        $this->verbose = (bool) (int) $settings['processVerbose'];
    }

    /**
     * Starts multiple processes and supervises them.
     *
     * The supervision loop runs at most $timeout times and sleeps for one second
     * per iteration. It ends early as soon as no pending items are left. Whenever
     * the current deadline has passed, processes selected from the process
     * repository are released through the crawler controller.
     *
     * @param integer $timeout maximum number of loop iterations
     *
     * @return void
     *
     * @throws \RuntimeException if the configured processLimit is not greater than 1
     * @throws \Exception if a crawler process could not be started
     */
    public function multiProcess($timeout)
    {
        if ($this->processLimit <= 1) {
            throw new \RuntimeException('To run crawler in multi process mode you have to configure the processLimit > 1.' . PHP_EOL);
        }

        $pendingItemsStart = $this->queueRepository->countAllPendingItems();
        // Defined before the loop so the final check works even if the loop body never runs.
        $currentPendingItems = $pendingItemsStart;
        // A status line is printed each time the pending count dropped by more than this amount.
        $itemReportLimit = 20;
        $reportItemCount = $pendingItemsStart - $itemReportLimit;
        if ($this->verbose) {
            $this->reportItemStatus();
        }
        $this->startRequiredProcesses();
        $nextTimeOut = time() + $this->timeToLive;
        for ($i = 0; $i < $timeout; $i++) {
            $currentPendingItems = $this->queueRepository->countAllPendingItems();
            if ($this->startRequiredProcesses()) {
                // New processes were started: push the deadline forward.
                $nextTimeOut = time() + $this->timeToLive;
            }
            if ($currentPendingItems == 0) {
                if ($this->verbose) {
                    echo 'Finished...' . chr(10);
                }
                break;
            }
            if ($currentPendingItems < $reportItemCount) {
                if ($this->verbose) {
                    $this->reportItemStatus();
                }
                $reportItemCount = $currentPendingItems - $itemReportLimit;
            }
            sleep(1);
            if ($nextTimeOut < time()) {
                $timedOutProcesses = $this->processRepository->findAll('', 'DESC', null, 0, 'ttl >' . $nextTimeOut);
                $nextTimeOut = time() + $this->timeToLive;
                if ($this->verbose) {
                    echo 'Cleanup' . implode(',', $timedOutProcesses->getProcessIds()) . chr(10);
                }
                $this->crawlerObj->CLI_releaseProcesses($timedOutProcesses->getProcessIds(), true);
            }
        }
        if ($currentPendingItems > 0 && $this->verbose) {
            echo 'Stop with timeout' . chr(10);
        }
    }

    /**
     * Reports the current status of the queue by echoing the number of
     * pending and of assigned pending items.
     *
     * @return void
     */
    protected function reportItemStatus()
    {
        echo 'Pending:' . $this->queueRepository->countAllPendingItems() . ' / Assigned:' . $this->queueRepository->countAllAssignedPendingItems() . chr(10);
    }

    /**
     * According to the count of unassigned pending items and the countInARun
     * setting this method starts more crawling processes, without exceeding
     * the configured processLimit.
     *
     * @throws \Exception if a crawler process could not be started
     *
     * @return boolean true if at least one process was started
     */
    private function startRequiredProcesses()
    {
        $ret = false;
        $currentProcesses = $this->processRepository->countActive();
        $availableProcessesCount = $this->processLimit - $currentProcesses;
        $requiredProcessesCount = ceil($this->queueRepository->countAllUnassignedPendingItems() / $this->countInARun);
        $startProcessCount = min([$availableProcessesCount, $requiredProcessesCount]);
        if ($startProcessCount <= 0) {
            return $ret;
        }
        if ($this->verbose) {
            echo 'Start ' . $startProcessCount . ' new processes (Running:' . $currentProcesses . ')';
        }
        for ($i = 0; $i < $startProcessCount; $i++) {
            usleep(100);
            if ($this->startProcess()) {
                // The result must not depend on the verbose flag; only the progress dot does.
                $ret = true;
                if ($this->verbose) {
                    echo '.';
                }
            }
        }
        if ($this->verbose) {
            echo chr(10);
        }
        return $ret;
    }

    /**
     * Starts a new crawler process through the system shell and waits up to
     * ten seconds (polling once per second) until the process repository
     * reports more not timed out processes than before the start.
     *
     * @return boolean true as soon as the new process is visible in the repository
     *
     * @throws \Exception if no crawler process was started
     */
    public function startProcess()
    {
        $ttl = (time() + $this->timeToLive - 1);
        $current = $this->processRepository->countNotTimeouted($ttl);

        // Check whether OS is Windows
        if (TYPO3_OS === 'WIN') {
            $completePath = escapeshellcmd('start ' . $this->getCrawlerCliPath());
        } else {
            // Run in a background subshell and discard its output.
            $completePath = '(' . escapeshellcmd($this->getCrawlerCliPath()) . ' &) > /dev/null';
        }

        $fileHandler = system($completePath);
        if ($fileHandler === false) {
            throw new \Exception('could not start process!');
        }

        for ($i = 0; $i < 10; $i++) {
            if ($this->processRepository->countNotTimeouted($ttl) > $current) {
                return true;
            }
            sleep(1);
        }
        throw new \Exception('Something went wrong: process did not appear within 10 seconds.');
    }

    /**
     * Returns the path to start the crawler from the command line.
     *
     * The result consists of the configured "phpPath" setting, a space, the
     * TYPO3 document root joined with the site path (trailing slashes removed)
     * and the cli_dispatch call for the crawler. On Windows every forward
     * slash of the result is replaced by a backslash.
     *
     * @return string
     */
    public function getCrawlerCliPath()
    {
        $phpPath = $this->crawlerObj->extensionSettings['phpPath'] . ' ';
        $pathToTypo3 = rtrim(GeneralUtility::getIndpEnv('TYPO3_DOCUMENT_ROOT'), '/');
        $pathToTypo3 .= rtrim(GeneralUtility::getIndpEnv('TYPO3_SITE_PATH'), '/');
        $cliPart = '/typo3/cli_dispatch.phpsh crawler';
        $scriptPath = $phpPath . $pathToTypo3 . $cliPart;

        if (TYPO3_OS === 'WIN') {
            $scriptPath = str_replace('/', '\\', $scriptPath);
        }

        return $scriptPath;
    }
}
236
|
|
|
|
This check compares calls to functions or methods with their respective definitions. If the call has more arguments than are defined, it raises an issue.
If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this has been known to happen is WordPress.
In this case you can add the `@ignore` PhpDoc annotation to the duplicate definition and it will be ignored.