These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | namespace AOE\Crawler\Controller; |
||
3 | |||
4 | /*************************************************************** |
||
5 | * Copyright notice |
||
6 | * |
||
7 | * (c) 2017 AOE GmbH <[email protected]> |
||
8 | * |
||
9 | * All rights reserved |
||
10 | * |
||
11 | * This script is part of the TYPO3 project. The TYPO3 project is |
||
12 | * free software; you can redistribute it and/or modify |
||
13 | * it under the terms of the GNU General Public License as published by |
||
14 | * the Free Software Foundation; either version 3 of the License, or |
||
15 | * (at your option) any later version. |
||
16 | * |
||
17 | * The GNU General Public License can be found at |
||
18 | * http://www.gnu.org/copyleft/gpl.html. |
||
19 | * |
||
20 | * This script is distributed in the hope that it will be useful, |
||
21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
23 | * GNU General Public License for more details. |
||
24 | * |
||
25 | * This copyright notice MUST APPEAR in all copies of the script! |
||
26 | ***************************************************************/ |
||
27 | |||
28 | use AOE\Crawler\Command\CrawlerCommandLineController; |
||
29 | use AOE\Crawler\Command\FlushCommandLineController; |
||
30 | use AOE\Crawler\Command\QueueCommandLineController; |
||
31 | use AOE\Crawler\Domain\Model\Reason; |
||
32 | use AOE\Crawler\Domain\Repository\QueueRepository; |
||
33 | use AOE\Crawler\Event\EventDispatcher; |
||
34 | use AOE\Crawler\Utility\IconUtility; |
||
35 | use AOE\Crawler\Utility\SignalSlotUtility; |
||
36 | use TYPO3\CMS\Backend\Utility\BackendUtility; |
||
37 | use TYPO3\CMS\Backend\Tree\View\PageTreeView; |
||
38 | use TYPO3\CMS\Core\Authentication\BackendUserAuthentication; |
||
39 | use TYPO3\CMS\Core\Database\DatabaseConnection; |
||
40 | use TYPO3\CMS\Core\Log\LogLevel; |
||
41 | use TYPO3\CMS\Core\TimeTracker\NullTimeTracker; |
||
42 | use TYPO3\CMS\Core\Utility\DebugUtility; |
||
43 | use TYPO3\CMS\Core\Utility\ExtensionManagementUtility; |
||
44 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
45 | use TYPO3\CMS\Core\Utility\MathUtility; |
||
46 | use TYPO3\CMS\Extbase\Object\ObjectManager; |
||
47 | use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController; |
||
48 | use TYPO3\CMS\Frontend\Page\PageGenerator; |
||
49 | use TYPO3\CMS\Frontend\Page\PageRepository; |
||
50 | use TYPO3\CMS\Frontend\Utility\EidUtility; |
||
51 | use TYPO3\CMS\Lang\LanguageService; |
||
52 | |||
53 | /** |
||
54 | * Class CrawlerController |
||
55 | * |
||
56 | * @package AOE\Crawler\Controller |
||
57 | */ |
||
58 | class CrawlerController |
||
59 | { |
||
60 | const CLI_STATUS_NOTHING_PROCCESSED = 0; |
||
61 | const CLI_STATUS_REMAIN = 1; //queue not empty |
||
62 | const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed |
||
63 | const CLI_STATUS_ABORTED = 4; //instance didn't finish |
||
64 | const CLI_STATUS_POLLABLE_PROCESSED = 8; |
||
65 | |||
66 | /** |
||
67 | * @var integer |
||
68 | */ |
||
69 | public $setID = 0; |
||
70 | |||
71 | /** |
||
72 | * @var string |
||
73 | */ |
||
74 | public $processID = ''; |
||
75 | |||
76 | /** |
||
77 | * One hour is max stalled time for the CLI |
||
78 | * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started |
||
79 | * |
||
80 | * @var integer |
||
81 | */ |
||
82 | public $max_CLI_exec_time = 3600; |
||
83 | |||
84 | /** |
||
85 | * @var array |
||
86 | */ |
||
87 | public $duplicateTrack = []; |
||
88 | |||
89 | /** |
||
90 | * @var array |
||
91 | */ |
||
92 | public $downloadUrls = []; |
||
93 | |||
94 | /** |
||
95 | * @var array |
||
96 | */ |
||
97 | public $incomingProcInstructions = []; |
||
98 | |||
99 | /** |
||
100 | * @var array |
||
101 | */ |
||
102 | public $incomingConfigurationSelection = []; |
||
103 | |||
104 | /** |
||
105 | * @var bool |
||
106 | */ |
||
107 | public $registerQueueEntriesInternallyOnly = false; |
||
108 | |||
109 | /** |
||
110 | * @var array |
||
111 | */ |
||
112 | public $queueEntries = []; |
||
113 | |||
114 | /** |
||
115 | * @var array |
||
116 | */ |
||
117 | public $urlList = []; |
||
118 | |||
119 | /** |
||
120 | * @var boolean |
||
121 | */ |
||
122 | public $debugMode = false; |
||
123 | |||
124 | /** |
||
125 | * @var array |
||
126 | */ |
||
127 | public $extensionSettings = []; |
||
128 | |||
129 | /** |
||
130 | * Mount Point |
||
131 | * |
||
132 | * @var boolean |
||
133 | */ |
||
134 | public $MP = false; |
||
135 | |||
136 | /** |
||
137 | * @var string |
||
138 | */ |
||
139 | protected $processFilename; |
||
140 | |||
141 | /** |
||
142 | * Holds the internal access mode can be 'gui','cli' or 'cli_im' |
||
143 | * |
||
144 | * @var string |
||
145 | */ |
||
146 | protected $accessMode; |
||
147 | |||
148 | /** |
||
149 | * @var DatabaseConnection |
||
150 | */ |
||
151 | private $db; |
||
152 | |||
153 | /** |
||
154 | * @var BackendUserAuthentication |
||
155 | */ |
||
156 | private $backendUser; |
||
157 | |||
158 | /** |
||
159 | * @var integer |
||
160 | */ |
||
161 | private $scheduledTime = 0; |
||
162 | |||
163 | /** |
||
164 | * @var integer |
||
165 | */ |
||
166 | private $reqMinute = 0; |
||
167 | |||
168 | /** |
||
169 | * @var bool |
||
170 | */ |
||
171 | private $submitCrawlUrls = false; |
||
172 | |||
173 | /** |
||
174 | * @var bool |
||
175 | */ |
||
176 | private $downloadCrawlUrls = false; |
||
177 | |||
178 | /** |
||
179 | * @var QueueRepository |
||
180 | */ |
||
181 | protected $queueRepository; |
||
182 | |||
/**
 * Returns the internal access mode of the crawler.
 *
 * @return string Access mode ('gui', 'cli' or 'cli_im'); the property has no default, so the value is whatever setAccessMode() stored last
 */
public function getAccessMode()
{
    return $this->accessMode;
}
||
192 | |||
/**
 * Stores the internal access mode of the crawler.
 *
 * @param string $accessMode Access mode, expected to be 'gui', 'cli' or 'cli_im' (not validated here)
 * @return void
 */
public function setAccessMode($accessMode)
{
    $this->accessMode = $accessMode;
}
|
200 | |||
/**
 * Switches the "disabled" marker on or off.
 *
 * The marker is the file at $this->processFilename: writing it (empty) disables
 * processing, removing it enables processing again.
 *
 * @param bool $disabled (optional, defaults to true)
 * @return void
 */
public function setDisabled($disabled = true)
{
    if (!$disabled) {
        // Enabling: remove the marker file, but only if it is actually there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }

        return;
    }

    // Disabling: create (or truncate) the marker file.
    GeneralUtility::writeFile($this->processFilename, '');
}
|
217 | |||
/**
 * Tells whether processing is currently disabled.
 *
 * @return bool true if the marker file at $this->processFilename exists, false otherwise
 */
public function getDisabled()
{
    return is_file($this->processFilename);
}
||
231 | |||
/**
 * Overrides the location of the marker file used by setDisabled() / getDisabled().
 *
 * @param string $filenameWithPath File name including its path
 *
 * @return void
 */
public function setProcessFilename($filenameWithPath)
{
    $this->processFilename = $filenameWithPath;
}
|
241 | |||
/**
 * Returns the location of the marker file used by setDisabled() / getDisabled().
 *
 * @return string File name including its path
 */
public function getProcessFilename()
{
    return $this->processFilename;
}
||
249 | |||
250 | /************************************ |
||
251 | * |
||
252 | * Getting URLs based on Page TSconfig |
||
253 | * |
||
254 | ************************************/ |
||
255 | |||
/**
 * Wires up the queue repository, database connection and backend user, and
 * loads the extension settings with their defaults.
 *
 * Defaults applied here:
 * - 'countInARun' becomes 100 when it is missing or not a positive integer
 * - 'processLimit' is forced into the range 1..99 (1 when missing)
 */
public function __construct()
{
    $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
    $this->queueRepository = $objectManager->get(QueueRepository::class);

    $this->db = $GLOBALS['TYPO3_DB'];
    $this->backendUser = $GLOBALS['BE_USER'];
    $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

    // The extension configuration is stored serialized; it may be absent
    // altogether (e.g. extension never configured), which must not raise notices.
    $serializedSettings = isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'])
        ? $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']
        : '';
    $settings = (is_string($serializedSettings) && $serializedSettings !== '')
        ? unserialize($serializedSettings)
        : [];

    // read ext_em_conf_template settings and set
    $this->setExtensionSettings(is_array($settings) ? $settings : []);

    // set defaults:
    $countInARun = isset($this->extensionSettings['countInARun'])
        ? $this->extensionSettings['countInARun']
        : 0;
    if (MathUtility::convertToPositiveInteger($countInARun) == 0) {
        $this->extensionSettings['countInARun'] = 100;
    }

    $processLimit = isset($this->extensionSettings['processLimit'])
        ? $this->extensionSettings['processLimit']
        : 1;
    $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange($processLimit, 1, 99, 1);
}
|
278 | |||
/**
 * Replaces the extension settings held by this controller.
 *
 * The array is the unserialized counterpart of
 * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']; it is stored as given,
 * without merging or applying defaults.
 *
 * @param array $extensionSettings
 * @return void
 */
public function setExtensionSettings(array $extensionSettings)
{
    $this->extensionSettings = $extensionSettings;
}
|
289 | |||
/**
 * Decides whether the given page must be left out of crawling.
 *
 * The checks run in this order and the first one that matches wins:
 * 1. hidden page (unless the 'crawlHiddenPages' extension setting is on)
 * 2. doktype 3 or 4, or any doktype >= 199
 * 3. doktype lists registered in $TYPO3_CONF_VARS['EXTCONF']['crawler']['excludeDoktype']
 * 4. veto hooks registered in $TYPO3_CONF_VARS['EXTCONF']['crawler']['pageVeto']
 *
 * @param array $pageRow Page record; 'hidden' and 'doktype' are read
 * @return false|string false if the page should be crawled, otherwise the reason why it is skipped
 */
public function checkIfPageShouldBeSkipped(array $pageRow)
{
    // 1. hidden pages
    if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
        return 'Because page is hidden';
    }

    // 2. doktypes that are never crawled
    $doktype = $pageRow['doktype'];
    if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
        return 'Because doktype is not allowed';
    }

    // 3. doktypes excluded through configuration
    $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'];
    if (is_array($excludedDoktypes)) {
        foreach ($excludedDoktypes as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }
    }

    // 4. veto hooks
    $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'];
    if (is_array($vetoHooks)) {
        foreach ($vetoHooks as $key => $func) {
            $params = [
                'pageRow' => $pageRow
            ];
            // a hook answers "false" if the page is ok and "true" or a skip message if it should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // the first veto wins, remaining hooks are not executed
            return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
        }
    }

    return false;
}
||
353 | |||
/**
 * Wrapper method for getUrlsForPageId()
 * It returns an array of configurations and no urls!
 *
 * The page is first run through checkIfPageShouldBeSkipped(); when it is
 * skipped, an empty array is returned and the reason is written to $skipMessage.
 *
 * @param array $pageRow Page record with at least dok-type and uid columns.
 * @param string $skipMessage Receives the skip reason, or an empty string when the page is not skipped
 * @return array
 * @see getUrlsForPageId()
 */
public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
{
    $message = $this->checkIfPageShouldBeSkipped($pageRow);

    if ($message !== false) {
        $skipMessage = $message;

        return [];
    }

    // url_scheme value 2 forces SSL. The value is cast before comparing because
    // a strict comparison against the integer 2 never matches a string '2'
    // (presumably what rows fetched from the database carry - verify against callers).
    $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;

    $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
    $skipMessage = '';

    return $res;
}
||
378 | |||
/**
 * Tells whether there are NO unprocessed queue entries (exec_time = 0) for
 * the given page and configuration hash.
 * If there are none, callers can skip the more expensive inner detail check.
 *
 * When the count cannot be determined (no row could be fetched), false is
 * returned so that callers do not skip their detail check by mistake.
 *
 * @param int $uid Page id
 * @param string $configurationHash
 * @return bool
 */
protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
{
    $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
    $res = $this->db->exec_SELECTquery(
        'count(*) as anz',
        'tx_crawler_queue',
        'page_id=' . intval($uid) . ' AND configuration_hash=' . $quotedHash . ' AND exec_time=0'
    );
    $row = $this->db->sql_fetch_assoc($res);

    // Release the result set, as getConfigurationsForBranch() does for its query.
    if ($res) {
        $this->db->sql_free_result($res);
    }

    if (!is_array($row)) {
        return false;
    }

    return ($row['anz'] == 0);
}
||
396 | |||
/**
 * Creates a list of URLs from input array (and submits them to queue if asked for)
 * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
 *
 * NOTE(review): inside the loop the returned string is reassigned (not appended)
 * for every URL, so it only describes the last URL that was handled - confirm
 * whether that is intended before changing it. The plain-text entries collected
 * in $this->urlList are appended for every new URL.
 *
 * @param array $vv Information about URLs from pageRow to crawl (the 'URLs' and 'subCfg' entries are read)
 * @param array $pageRow Page row (its 'uid' is used)
 * @param int $scheduledTime Unix time to schedule indexing to, typically time()
 * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
 * @param bool $submitCrawlUrls If set, submits the URLs to queue
 * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
 * @param array $duplicateTrack Passed by reference; contains an id per url to secure we will not crawl duplicates
 * @param array $downloadUrls Passed by reference; will be filled with URLs for download if flag is set
 * @param array $incomingProcInstructions Array of processing instructions
 * @return string HTML snippet (meant for display in backend module), or 'ERROR - no URL generated' when $vv['URLs'] is not an array
 */
public function urlListFromUrlArray(
    array $vv,
    array $pageRow,
    $scheduledTime,
    $reqMinute,
    $submitCrawlUrls,
    $downloadCrawlUrls,
    array &$duplicateTrack,
    array &$downloadUrls,
    array $incomingProcInstructions
) {
    $urlList = '';

    // realurl support (thanks to Ingo Renner); decided once and reused for every URL below
    $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
    if ($useRealUrl) {

        /** @var tx_realurl $urlObj */
        $urlObj = GeneralUtility::makeInstance('tx_realurl');

        if (!empty($vv['subCfg']['baseUrl'])) {
            $urlParts = parse_url($vv['subCfg']['baseUrl']);
            $host = strtolower($urlParts['host']);
            $urlObj->host = $host;

            // First pass, finding configuration OR pointer string:
            $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

            // If it turned out to be a string pointer, then look up the real config:
            if (is_string($urlObj->extConf)) {
                $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
            }
        }

        if (!$GLOBALS['TSFE']->sys_page) {
            $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
        }
        if (!$GLOBALS['TSFE']->csConvObj) {
            $GLOBALS['TSFE']->csConvObj = GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
        }
        if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
            $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
        }
    }

    if (!is_array($vv['URLs'])) {
        return 'ERROR - no URL generated';
    }

    // Seconds between two scheduled requests. A request rate of zero (or less)
    // would otherwise cause a division by zero; it is treated as "no interleave".
    $secondsPerRequest = (is_numeric($reqMinute) && $reqMinute > 0) ? 60 / $reqMinute : 0;

    $configurationHash = $this->getConfigurationHash($vv);
    $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

    foreach ($vv['URLs'] as $urlQuery) {
        if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
            continue;
        }

        // Calculate cHash:
        if ($vv['subCfg']['cHash']) {
            /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
            $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
            $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
        }

        // Create key by which to determine unique-ness:
        $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

        // realurl support (thanks to Ingo Renner)
        $urlQuery = 'index.php' . $urlQuery;
        if ($useRealUrl) {
            $params = [
                'LD' => [
                    'totalURL' => $urlQuery
                ],
                'TCEmainHook' => true
            ];
            $urlObj->encodeSpURL($params);
            $urlQuery = $params['LD']['totalURL'];
        }

        // Scheduled time, rounded down to the full minute:
        $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
        $schTime = (int)(floor($schTime / 60) * 60);

        if (isset($duplicateTrack[$uKey])) {

            // if the url key is registered just display it and do not resubmit it
            $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
        } else {
            $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
            $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

            $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

            // Submit for crawling!
            if ($submitCrawlUrls) {
                $added = $this->addUrl(
                    $pageRow['uid'],
                    $theUrl,
                    $vv['subCfg'],
                    $scheduledTime,
                    $configurationHash,
                    $skipInnerCheck
                );
                if ($added === false) {
                    $urlList .= ' (Url already existed)';
                }
            } elseif ($downloadCrawlUrls) {
                $downloadUrls[$theUrl] = $theUrl;
            }

            $urlList .= '<br />';
        }
        $duplicateTrack[$uKey] = true;
    }

    return $urlList;
}
||
528 | |||
/**
 * Returns true if input processing instruction is among registered ones.
 *
 * An empty list of incoming processing instructions means "no filtering",
 * in which case true is returned as well.
 *
 * @param string $piString Comma-separated list of processing instructions to test against
 * @param array $incomingProcInstructions Processing instructions
 * @return bool
 */
public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
{
    if (empty($incomingProcInstructions)) {
        return true;
    }

    foreach ($incomingProcInstructions as $pi) {
        if (GeneralUtility::inList($piString, $pi)) {
            return true;
        }
    }

    // Nothing matched: return an explicit boolean instead of falling off the
    // end of the method (which yielded null).
    return false;
}
|
548 | |||
/**
 * Fetches the Page TSconfig that applies to the given page and lets
 * registered hooks alter it.
 *
 * When $this->MP is set, it is split at '-' and the second segment is used
 * as the page id for the TSconfig lookup instead of $id.
 *
 * Hooks registered in $TYPO3_CONF_VARS['EXTCONF']['crawler']['getPageTSconfigForId']
 * receive 'pageId' (always the original $id) and 'pageTSConfig' (by reference).
 *
 * @param int $id Page id
 * @return array|mixed Page TSconfig as returned by BackendUtility::getPagesTSconfig(), possibly modified by hooks
 */
public function getPageTSconfigForId($id)
{
    $lookupPageId = $id;
    if ($this->MP) {
        list(, $mountPointId) = explode('-', $this->MP);
        $lookupPageId = $mountPointId;
    }
    $pageTSconfig = BackendUtility::getPagesTSconfig($lookupPageId);

    // Call a hook to alter configuration
    $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
    if (is_array($hooks)) {
        $params = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig
        ];
        foreach ($hooks as $userFunc) {
            GeneralUtility::callUserFunction($userFunc, $params, $this);
        }
    }

    return $pageTSconfig;
}
||
571 | |||
/**
 * Collects the crawler configurations that apply to a page.
 * The result is an array of configurations keyed by configuration name, not a list of urls!
 *
 * Two sources are evaluated, in this order:
 * 1. Page TSconfig (tx_crawler.crawlerCfg.paramSets.*), entries get 'origin' => 'pagets'
 * 2. tx_crawler_configuration records found along the rootline of the page;
 *    a record never replaces a configuration of the same name collected earlier
 *
 * Finally the hooks in $TYPO3_CONF_VARS['EXTCONF']['crawler']['processUrls']
 * may modify the result (passed by reference as 'res').
 *
 * @param int $id Page ID
 * @param bool $forceSsl Use https (only evaluated for tx_crawler_configuration records)
 * @return array
 *
 * TODO: Should be switched back to protected - TNM 2018-11-16
 */
public function getUrlsForPageId($id, $forceSsl = false)
{
    $res = [];

    /**
     * Get configuration from tsConfig
     */
    $pageTSconfig = $this->getPageTSconfigForId($id);

    if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];

        if (is_array($crawlerCfg['paramSets.'])) {
            foreach ($crawlerCfg['paramSets.'] as $key => $values) {
                if (!is_array($values)) {
                    continue;
                }

                $key = str_replace('.', '', $key);
                // Sub configuration for a single configuration string:
                $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
                $subCfg['key'] = $key;

                if (strcmp($subCfg['procInstrFilter'], '')) {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                }
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                // only process the configuration if it is not page-specific or if the specific page is the current page:
                $appliesToPage = !strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id);
                if (!$appliesToPage) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $paramParsed = $this->parseParams($crawlerCfg['paramSets.'][$key]);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                // recognize MP value
                $baseQuery = !$this->MP ? '?id=' . $id : '?id=' . $id . '&MP=' . $this->MP;

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'origin' => 'pagets',
                    'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
                ];
            }
        }
    }

    /**
     * Get configuration from tx_crawler_configuration records
     */

    // get records along the rootline
    $rootLine = BackendUtility::BEgetRootLine($id);

    foreach ($rootLine as $page) {
        $configurationRecordsForCurrentPage = BackendUtility::getRecordsByField(
            'tx_crawler_configuration',
            'pid',
            intval($page['uid']),
            BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
        );

        if (!is_array($configurationRecordsForCurrentPage)) {
            continue;
        }

        foreach ($configurationRecordsForCurrentPage as $configurationRecord) {

            // check access to the configuration record
            $mayUseRecord = empty($configurationRecord['begroups'])
                || $GLOBALS['BE_USER']->isAdmin()
                || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (!$mayUseRecord) {
                continue;
            }

            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

            // only process the configuration if it is not page-specific or if the specific page is the current page:
            $appliesToPage = !strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id);
            if (!$appliesToPage) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                    $configurationRecord['base_url'],
                    $configurationRecord['sys_domain_base_url'],
                    $isCrawlingProtocolHttps
                ),
                'realurl' => $configurationRecord['realurl'],
                'cHash' => $configurationRecord['chash'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                'key' => $key
            ];

            // add trailing slash if not present
            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                $subCfg['baseUrl'] .= '/';
            }

            // pages listed in the exclude string of the record do not get this configuration
            if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $paramParsed = $this->parseParams($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $id);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }
    }

    $processUrlsHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'];
    if (is_array($processUrlsHooks)) {
        foreach ($processUrlsHooks as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }
    }

    return $res;
}
||
718 | |||
/**
 * Checks if a domain record exists and returns the base-url based on the record.
 * If no usable record is found, the given $baseUrl string is returned unchanged.
 *
 * @param string $baseUrl Fallback base url
 * @param int $sysDomainUid Uid of the sys_domain record; values <= 0 skip the lookup
 * @param bool $ssl The scheme is 'http' only when this is exactly false, otherwise 'https'
 * @return string
 */
protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
{
    $sysDomainUid = intval($sysDomainUid);
    if ($sysDomainUid <= 0) {
        return $baseUrl;
    }

    $res = $this->db->exec_SELECTquery(
        '*',
        'sys_domain',
        'uid = ' . $sysDomainUid .
        BackendUtility::BEenableFields('sys_domain') .
        BackendUtility::deleteClause('sys_domain')
    );
    $row = $this->db->sql_fetch_assoc($res);

    // Release the result set, as getConfigurationsForBranch() does for its query.
    if ($res) {
        $this->db->sql_free_result($res);
    }

    // No row (record missing, hidden or deleted) or an empty domain name: keep the fallback.
    if (is_array($row) && isset($row['domainName']) && $row['domainName'] != '') {
        $urlScheme = ($ssl === false) ? 'http' : 'https';

        return $urlScheme . '://' . $row['domainName'];
    }

    return $baseUrl;
}
||
747 | |||
/**
 * Returns the names of all crawler configurations available for a page branch.
 *
 * The names come from two sources:
 * - the Page TSconfig paramSets of $rootid (a trailing dot is stripped from the key)
 * - tx_crawler_configuration records stored on the rootline of $rootid or on
 *   pages of the subtree below it (down to $depth levels, restricted by the
 *   page permissions of the current backend user)
 *
 * @param int $rootid Page id of the branch root
 * @param int $depth Number of tree levels to descend
 * @return array List of configuration names (may contain duplicates)
 */
public function getConfigurationsForBranch($rootid, $depth)
{
    $configurationsForBranch = [];

    $pageTSconfig = $this->getPageTSconfigForId($rootid);
    if (isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
    ) {
        foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
            if (!is_array($value)) {
                continue;
            }
            $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
        }
    }

    $pids = [];
    $rootLine = BackendUtility::BEgetRootLine($rootid);
    foreach ($rootLine as $node) {
        $pids[] = $node['uid'];
    }
    /* @var PageTreeView $tree */
    $tree = GeneralUtility::makeInstance(PageTreeView::class);
    $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
    $tree->init('AND ' . $perms_clause);
    $tree->getTree($rootid, $depth, '');
    foreach ($tree->tree as $node) {
        $pids[] = $node['row']['uid'];
    }

    // The ids end up verbatim in the SQL statement: force them to integers and
    // avoid an invalid "IN ()" clause when no page could be collected.
    $pids = array_unique(array_map('intval', $pids));
    if (empty($pids)) {
        return $configurationsForBranch;
    }

    $res = $this->db->exec_SELECTquery(
        '*',
        'tx_crawler_configuration',
        'pid IN (' . implode(',', $pids) . ') ' .
        BackendUtility::BEenableFields('tx_crawler_configuration') .
        BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
        BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
    );

    // A failed query yields no result set; there is nothing to read or free then.
    if ($res) {
        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);
    }

    return $configurationsForBranch;
}
||
793 | |||
/**
 * Check if a user has access to an item
 * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
 *
 * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
 * @param string $groupList Comma-separated list of group UIDs from a user
 * @param string $accessList Comma-separated list of group UIDs of the item to access
 * @return bool true if the access list is empty or at least one of the user's group UIDs is in the access list
 */
public function hasGroupAccess($groupList, $accessList)
{
    // An item without access restriction is available to everybody.
    if (empty($accessList)) {
        return true;
    }

    $accessGranted = false;
    $userGroupUids = GeneralUtility::intExplode(',', $groupList);
    foreach ($userGroupUids as $userGroupUid) {
        if (GeneralUtility::inList($accessList, $userGroupUid)) {
            $accessGranted = true;
            break;
        }
    }

    return $accessGranted;
}
||
815 | |||
/**
 * Parses the GET variables of a query string into an array of key => value pairs.
 *
 * Keys and values are raw-URL-decoded. Segments with an empty parameter name are
 * skipped; a parameter without "=" (e.g. "&no_cache") is returned with an empty
 * string as its value.
 *
 * @param string $inputQuery Input query string, e.g. "&L=1&type=[1-3]"
 * @return array Decoded parameter names mapped to their decoded values
 */
public function parseParams($inputQuery)
{
    $paramKeyValues = [];

    foreach (explode('&', $inputQuery) as $paramAndValue) {
        // Pad to two elements so that a segment without "=" does not read a
        // missing offset and does not hand NULL to rawurldecode().
        list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');

        if ($name !== '') {
            $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
        }
    }

    return $paramKeyValues;
}
||
837 | |||
/**
 * Expands the parameter configuration to individual values. This follows a certain syntax of the value of each parameter.
 * Syntax of values:
 * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
 * - Configuration is split by "|" and the parts are processed individually and finally added together
 * - For each configuration part:
 *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
 *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
 *           _ENABLELANG:1 picks only original records without their language overlays.
 *           The sub-parameters _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE and _ADDTABLE are read as well.
 *         - Default: Literal value
 *
 * After each part, the functions registered in
 * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
 * are called and may modify the parameter array by reference.
 *
 * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
 * @param integer $pid Current page ID
 * @return array Same keys as the input, each mapped to an array of expanded values
 */
public function expandParameters($paramArray, $pid)
{
    global $TCA;

    // Traverse parameter names:
    foreach ($paramArray as $p => $v) {
        $v = trim($v);

        // Only values encapsulated in square brackets are expanded, everything else is a literal value.
        if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
            $paramArray[$p] = [$v];
            continue;
        }

        // Reset the paramArray value as an array and process the "|"-separated parts inside the brackets.
        $paramArray[$p] = [];
        $parts = explode('|', substr($v, 1, -1));

        foreach ($parts as $pV) {
            // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
            if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                // Swap if first is larger than last:
                if ($reg[1] > $reg[2]) {
                    $temp = $reg[2];
                    $reg[2] = $reg[1];
                    $reg[1] = $temp;
                }

                // Traverse range, add values:
                $runAwayBrake = 1000; // Limit to size of range!
                for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                    $paramArray[$p][] = $a;
                    $runAwayBrake--;
                    if ($runAwayBrake <= 0) {
                        break;
                    }
                }
            } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                // Parse the ";"-separated "KEY:value" sub-parameters:
                $subpartParams = [];
                foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                    $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                    // A sub-part without ":" carries no value; store NULL instead of reading a missing offset.
                    $subpartParams[$keyAndValue[0]] = isset($keyAndValue[1]) ? $keyAndValue[1] : null;
                }

                $table = isset($subpartParams['_TABLE']) ? $subpartParams['_TABLE'] : '';

                // Table exists:
                if (isset($TCA[$table])) {
                    $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                    $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                    $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                    $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                    $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                    // Only "uid" or a field configured in TCA may be selected.
                    if ($fieldName === 'uid' || !empty($TCA[$table]['columns'][$fieldName])) {
                        $andWhereLanguage = '';
                        $transOrigPointerField = isset($TCA[$table]['ctrl']['transOrigPointerField'])
                            ? $TCA[$table]['ctrl']['transOrigPointerField']
                            : '';

                        if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                            $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
                        }

                        // NOTE(review): _WHERE and _ADDTABLE are appended to the query as given in the configuration.
                        $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
                            $andWhereLanguage . $where;

                        // The last argument indexes the result by $fieldName, so the array keys are the values we want.
                        $rows = $this->db->exec_SELECTgetRows(
                            $fieldName,
                            $table . $addTable,
                            $where . BackendUtility::deleteClause($table),
                            '',
                            '',
                            '',
                            $fieldName
                        );

                        if (is_array($rows)) {
                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                }
            } else {
                // Just add value:
                $paramArray[$p][] = $pV;
            }

            // Hook for processing own expandParameters place holder
            if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
            ) {
                $_params = [
                    'pObj' => &$this,
                    'paramArray' => &$paramArray,
                    'currentKey' => $p,
                    'currentValue' => $pV,
                    'pid' => $pid
                ];
                foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
                    GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                }
            }
        }

        // Make unique set of values and sort array by key:
        $paramArray[$p] = array_unique($paramArray[$p]);
        ksort($paramArray);
    }

    return $paramArray;
}
||
963 | |||
/**
 * Compiles URLs from a parameter array (output of expandParameters()).
 * The number of URLs will be the multiplication of the number of parameter values for each key,
 * subject to the "maxCompileUrls" extension setting (1 to 1000000000, default 10000).
 *
 * The method consumes one parameter per call and recurses for the remaining ones.
 *
 * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
 * @param array $urls URLs accumulated in this array (for recursion)
 * @return array The compiled URLs
 */
public function compileUrls($paramArray, $urls = [])
{
    if (!count($paramArray) || !is_array($urls)) {
        return $urls;
    }

    // Shift the first parameter off the stack:
    reset($paramArray);
    $varName = key($paramArray);
    $valueSet = array_shift($paramArray);

    // The limit does not change while compiling, so it is resolved once instead of once per generated URL.
    $maxCompileUrls = MathUtility::forceIntegerInRange(
        isset($this->extensionSettings['maxCompileUrls']) ? $this->extensionSettings['maxCompileUrls'] : null,
        1,
        1000000000,
        10000
    );

    // Combine every URL collected so far with every value of the current parameter:
    $newUrls = [];
    foreach ($urls as $url) {
        foreach ($valueSet as $val) {
            // Empty values do not add a GET parameter at all.
            $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

            // NOTE(review): this only leaves the inner loop, so every remaining URL still
            // contributes one more entry after the limit is exceeded - confirm whether intended.
            if (count($newUrls) > $maxCompileUrls) {
                break;
            }
        }
    }

    return $this->compileUrls($paramArray, $newUrls);
}
||
997 | |||
998 | /************************************ |
||
999 | * |
||
1000 | * Crawler log |
||
1001 | * |
||
1002 | ************************************/ |
||
1003 | |||
/**
 * Returns (or flushes) the crawler queue records of a page.
 *
 * @param integer $id Page ID for which to look up log entries.
 * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
 * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
 * @param boolean $doFullFlush If TRUE (together with $doFlush), the page restriction is dropped; the filter still applies
 * @param integer $itemsPerPage Maximum number of entries to select, default is 10; values <= 0 disable the limit
 * @return array Queue rows ordered by "scheduled" descending, or an empty array after a flush
 */
public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
{
    $pageId = intval($id);

    // Loose comparison on purpose: it mirrors the semantics of a switch statement.
    if ($filter == 'pending') {
        $execTimeCondition = ' AND exec_time=0';
    } elseif ($filter == 'finished') {
        $execTimeCondition = ' AND exec_time>0';
    } else {
        $execTimeCondition = '';
    }

    // FIXME: Write unit test that ensures that the right records are deleted.
    if ($doFlush) {
        $scope = $doFullFlush ? '1=1' : ('page_id=' . $pageId);
        $this->flushQueue($scope . $execTimeCondition);

        return [];
    }

    $limit = intval($itemsPerPage);

    return $this->db->exec_SELECTgetRows(
        '*',
        'tx_crawler_queue',
        'page_id=' . $pageId . $execTimeCondition,
        '',
        'scheduled DESC',
        ($limit > 0 ? $limit : '')
    );
}
||
1043 | |||
/**
 * Returns (or flushes) the crawler queue records of a set.
 *
 * @param integer $set_id Set ID for which to look up log entries.
 * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
 * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
 * @param boolean $doFullFlush If TRUE (together with $doFlush), the whole queue is flushed; set ID and filter are ignored
 * @param integer $itemsPerPage Maximum number of entries to select, default is 10; values <= 0 disable the limit
 * @return array Queue rows ordered by "scheduled" descending, or an empty array after a flush
 */
public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
{
    $setId = intval($set_id);

    // FIXME: Write Unit tests for Filters
    // Loose comparison on purpose: it mirrors the semantics of a switch statement.
    if ($filter == 'pending') {
        $execTimeCondition = ' AND exec_time=0';
    } elseif ($filter == 'finished') {
        $execTimeCondition = ' AND exec_time>0';
    } else {
        $execTimeCondition = '';
    }

    // FIXME: Write unit test that ensures that the right records are deleted.
    if ($doFlush) {
        // An empty condition makes flushQueue() remove every queue entry.
        $flushCondition = $doFullFlush ? '' : ('set_id=' . $setId . $execTimeCondition);
        $this->flushQueue($flushCondition);

        return [];
    }

    $limit = intval($itemsPerPage);

    return $this->db->exec_SELECTgetRows(
        '*',
        'tx_crawler_queue',
        'set_id=' . $setId . $execTimeCondition,
        '',
        'scheduled DESC',
        ($limit > 0 ? $limit : '')
    );
}
||
1082 | |||
/**
 * Removes queue entries.
 *
 * If an observer is registered for the "queueEntryFlush" event, the event is posted
 * once per affected set ID (together with the entries of that set) before the
 * entries are deleted.
 *
 * @param string $where SQL related filter for the entries which should be removed; an empty string removes all entries
 * @return void
 */
protected function flushQueue($where = '')
{
    $realWhere = strlen($where) > 0 ? $where : '1=1';

    if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
        $groups = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
        if (is_array($groups)) {
            foreach ($groups as $group) {
                // Parenthesise the caller's filter so the added set restriction applies to all of it,
                // and compare the set ID as an integer instead of a double-quoted string.
                $entries = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
                    'uid, set_id',
                    'tx_crawler_queue',
                    '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                );
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $entries);
            }
        }
    }

    $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', $realWhere);
}
|
1104 | |||
/**
 * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
 *
 * @param integer $setId Set ID
 * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
 * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
 * @param integer $page_id Page ID to attach it to
 * @param integer $schedule Time at which to activate; 0 schedules the entry for the current time
 * @return void
 */
public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
{
    $callBackParameters = is_array($params) ? $params : [];
    // The call back reference travels inside the serialized parameters.
    $callBackParameters['_CALLBACKOBJ'] = $callBack;

    $scheduledAt = intval($schedule);
    if (!$scheduledAt) {
        $scheduledAt = $this->getCurrentTime();
    }

    $this->db->exec_INSERTquery(
        'tx_crawler_queue',
        [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ]
    );
}
||
1134 | |||
1135 | /************************************ |
||
1136 | * |
||
1137 | * URL setting |
||
1138 | * |
||
1139 | ************************************/ |
||
1140 | |||
/**
 * Registers a URL for crawling.
 *
 * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only
 * collected in $this->queueEntries or written to the queue table. In the latter case
 * the "urlAddedToQueue" or "duplicateUrlInQueue" event is posted.
 *
 * @param integer $id Page ID
 * @param string $url Complete URL
 * @param array $subCfg Sub configuration array (from TS config)
 * @param integer $tstamp Scheduled-time
 * @param string $configurationHash (optional) configuration hash
 * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
 * @return bool TRUE if a new entry was inserted into the queue table, FALSE otherwise
 */
public function addUrl(
    $id,
    $url,
    array $subCfg,
    $tstamp,
    $configurationHash = '',
    $skipInnerDuplicationCheck = false
) {
    // Creating parameters:
    $parameters = ['url' => $url];

    // fe user group simulation:
    $userGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
    if ($userGroupList) {
        $parameters['feUserGroupList'] = $userGroupList;
    }

    // Setting processing instructions
    $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
    if (is_array($subCfg['procInstrParams.'])) {
        $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
    }

    // Possible TypoScript Template Parents
    $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

    // Compile value array:
    $serializedParameters = serialize($parameters);
    $fieldArray = [
        'page_id' => intval($id),
        'parameters' => $serializedParameters,
        'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
        'configuration_hash' => $configurationHash,
        'scheduled' => $tstamp,
        'exec_time' => 0,
        'set_id' => intval($this->setID),
        'result_data' => '',
        'configuration' => $subCfg['key'],
    ];

    if ($this->registerQueueEntriesInternallyOnly) {
        // The entry is only registered in memory and not stored in the database.
        $this->queueEntries[] = $fieldArray;

        return false;
    }

    // Check if there is already an equal entry, unless the caller opted out.
    $duplicates = [];
    if (!$skipInnerDuplicationCheck) {
        $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
    }

    if (count($duplicates) != 0) {
        EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicates, 'fieldArray' => $fieldArray]);

        return false;
    }

    $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
    $uid = $this->db->sql_insert_id();
    EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

    return true;
}
||
1219 | |||
/**
 * Looks up unprocessed queue entries that duplicate the given entry (same page and parameters hash).
 *
 * If the timestamp is not in the future, any unprocessed entry scheduled up to now counts as a
 * duplicate (with the "enableTimeslot" setting, also entries scheduled within +/- 100 seconds of now).
 * If the timestamp is in the future, only entries with exactly the same schedule time count.
 *
 * @param int $tstamp Schedule time of the entry to be added
 * @param array $fieldArray Queue field array; "page_id" and "parameters_hash" are used
 *
 * @return array List of "qid" values of the duplicates found
 */
protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
{
    $now = $this->getCurrentTime();
    $scheduleCondition = '';

    if ($tstamp > $now) {
        // Entries with a timestamp in the future need to have the same schedule time.
        $scheduleCondition = 'scheduled = ' . $tstamp ;
    } elseif ($tstamp <= $now) {
        // The entry is scheduled with "now".
        if ($this->extensionSettings['enableTimeslot']) {
            $timeBegin = $now - 100;
            $timeEnd = $now + 100;
            $scheduleCondition = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $now . ') ';
        } else {
            $scheduleCondition = 'scheduled <= ' . $now;
        }
    }

    if (empty($scheduleCondition)) {
        return [];
    }

    // Only entries that were neither executed nor assigned to a process are considered.
    $matches = $this->db->exec_SELECTgetRows(
        'qid',
        'tx_crawler_queue',
        $scheduleCondition .
        ' AND NOT exec_time' .
        ' AND NOT process_id ' .
        ' AND page_id=' . intval($fieldArray['page_id']) .
        ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
    );

    $duplicateIds = [];
    if (is_array($matches)) {
        foreach ($matches as $match) {
            $duplicateIds[] = $match['qid'];
        }
    }

    return $duplicateIds;
}
||
1270 | |||
/**
 * Provides the current system time.
 *
 * @return int Current Unix timestamp as reported by time()
 */
public function getCurrentTime()
{
    $now = time();

    return $now;
}
||
1280 | |||
1281 | /************************************ |
||
1282 | * |
||
1283 | * URL reading |
||
1284 | * |
||
1285 | ************************************/ |
||
1286 | |||
/**
 * Reads the URL of a single queue entry and stores the result in the queue record.
 *
 * The record is locked by setting exec_time before the request is made; the
 * pre- and post-process signals are emitted around the request.
 *
 * @param integer $queueId Queue entry ID (qid)
 * @param boolean $force If set, will process even if exec_time has been set!
 * @return integer Bit mask; contains self::CLI_STATUS_POLLABLE_PROCESSED if a registered
 *                 "pollSuccess" instruction reported success. 0 if the entry was not found.
 */
public function readUrl($queueId, $force = false)
{
    $ret = 0;
    if ($this->debugMode) {
        GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
    }

    // Get entry:
    $rows = $this->db->exec_SELECTgetRows(
        '*',
        'tx_crawler_queue',
        'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
    );
    $queueRec = (is_array($rows) && isset($rows[0])) ? $rows[0] : null;

    if (!is_array($queueRec)) {
        // Nothing to process; return the documented integer status instead of NULL.
        return $ret;
    }

    // NOTE(review): unserialize() runs on data read from the queue table - keep that table trusted.
    $parameters = unserialize($queueRec['parameters']);
    if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
        $this->initTSFE((int)$parameters['rootTemplatePid']);
    } else {
        GeneralUtility::sysLog(
            'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
            'crawler',
            GeneralUtility::SYSLOG_SEVERITY_WARNING
        );
    }

    SignalSlotUtility::emitSignal(
        __CLASS__,
        SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
        [$queueId, &$queueRec]
    );

    // Set exec_time to lock record:
    $field_array = ['exec_time' => $this->getCurrentTime()];

    if (isset($this->processID)) {
        // If multiprocessing is used we need to store the id of the process which has handled this entry.
        $field_array['process_id_completed'] = $this->processID;
    }
    $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

    $result = $this->readUrl_exec($queueRec);

    // readUrl_exec() may hand back a non-array (e.g. the string 'ERROR'), so only
    // decode the content when there really is a result array carrying it.
    $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;

    // At the moment there is no need to point to specific pollable extensions.
    if (is_array($resultData)
        && isset($resultData['parameters']['procInstructions'])
        && is_array($resultData['parameters']['procInstructions'])
        && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
        && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
    ) {
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
            // Only check the success value if the instruction is running;
            // it is important to name the pollSuccess key same as the procInstructions key.
            if (in_array($pollable, $resultData['parameters']['procInstructions'])
                && !empty($resultData['success'][$pollable])
            ) {
                $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
            }
        }
    }

    // Set result in log which also denotes the end of the processing of this entry.
    $field_array = ['result_data' => serialize($result)];

    SignalSlotUtility::emitSignal(
        __CLASS__,
        SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
        [$queueId, &$field_array]
    );

    $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

    if ($this->debugMode) {
        GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
    }

    return $ret;
}
||
1374 | |||
/**
 * Reads the URL for a log entry that has not been inserted yet.
 *
 * The entry is inserted with exec_time set, executed right away, and then updated
 * with the serialized result (after the post-process signal has been emitted).
 *
 * @param array $field_array Queue field array
 *
 * @return array|string Result as returned by readUrl_exec()
 */
public function readUrlFromArray($field_array)
{
    // Set exec_time to lock record:
    $field_array['exec_time'] = $this->getCurrentTime();
    $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

    $queueId = $this->db->sql_insert_id();
    $field_array['qid'] = $queueId;

    $result = $this->readUrl_exec($field_array);

    // Storing the result in the log also denotes the end of the processing of this entry.
    $resultFields = ['result_data' => serialize($result)];

    // Passed by reference so that slots may adjust the fields that get stored.
    SignalSlotUtility::emitSignal(
        __CLASS__,
        SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
        [$queueId, &$resultFields]
    );

    $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $resultFields);

    return $result;
}
||
1405 | |||
/**
 * Executes a queue record: either calls the registered call back object or requests the URL.
 *
 * @param array $queueRec Queue record; "parameters" must hold the serialized parameter array
 * @return array|string|bool Result array with at least a "content" key for call backs, the return
 *                           value of requestUrl() for regular requests, or the string 'ERROR' if
 *                           the parameters could not be decoded
 */
public function readUrl_exec($queueRec)
{
    // Decode parameters:
    // NOTE(review): unserialize() runs on data read from the queue table - keep that table trusted.
    $parameters = unserialize($queueRec['parameters']);
    if (!is_array($parameters)) {
        return 'ERROR';
    }

    if (!empty($parameters['_CALLBACKOBJ'])) {
        // Calling object:
        $objRef = $parameters['_CALLBACKOBJ'];
        // Plain assignment: a function result cannot be bound by reference.
        $callBackObj = GeneralUtility::getUserObj($objRef);
        if (is_object($callBackObj)) {
            unset($parameters['_CALLBACKOBJ']);
            $result = ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        } else {
            $result = ['content' => 'No object: ' . $objRef];
        }

        return $result;
    }

    // Regular FE request. The crawler ID is the qid plus a hash which lets the frontend verify the request.
    $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

    // Get result:
    $result = $this->requestUrl($parameters['url'], $crawlerId);

    EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

    return $result;
}
||
1441 | |||
/**
 * Gets the content of a URL.
 *
 * With the "makeDirectRequests" extension setting the request is delegated to
 * sendDirectRequest(); otherwise it is sent over a socket, optionally through the
 * configured curl proxy server. With the "follow30x" setting, redirects are followed
 * and the redirecting response is kept under the "parentRequest" key.
 *
 * @param string $originalUrl URL to read
 * @param string $crawlerId Crawler ID string (qid + hash to verify)
 * @param integer $timeout Connection timeout in seconds
 * @param integer $recursion Recursion limiter for 302 redirects
 * @return array|bool Array with the keys "request", "headers" and "content" (plus "parentRequest"
 *                    after a redirect), the result of sendDirectRequest() for direct requests,
 *                    or FALSE on failure
 */
public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
{
    if (!$recursion) {
        return false;
    }

    // Parse URL, checking for scheme:
    $url = parse_url($originalUrl);

    if ($url === false) {
        if (TYPO3_DLOG) {
            // Log the URL string; the parse result is FALSE here.
            GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
        }
        return false;
    }

    // Relative URLs have no scheme component at all.
    $scheme = isset($url['scheme']) ? $url['scheme'] : '';

    if (!in_array($scheme, ['', 'http', 'https'], true)) {
        if (TYPO3_DLOG) {
            GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
        }
        return false;
    }

    // direct request
    if (!empty($this->extensionSettings['makeDirectRequests'])) {
        return $this->sendDirectRequest($originalUrl, $crawlerId);
    }

    $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

    // thanks to Pierrick Caillon for adding proxy support
    $rurl = $url;

    if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
        // Connect to the proxy and request the absolute URL from it.
        $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
        $url['path'] = $scheme . '://'
            . (isset($url['host']) ? $url['host'] : '')
            . (!empty($url['port']) ? ':' . $url['port'] : '')
            . (isset($url['path']) ? $url['path'] : '');
        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
    }

    $host = isset($rurl['host']) ? $rurl['host'] : '';
    $defaultPort = 80;

    if ($scheme === 'https') {
        $host = 'ssl://' . $host;
        $defaultPort = 443;
    }
    $port = !empty($rurl['port']) ? $rurl['port'] : $defaultPort;

    $startTime = microtime(true);
    $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

    if (!$fp) {
        if (TYPO3_DLOG) {
            GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
        }
        return false;
    }

    // Request message:
    $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
    fputs($fp, $msg);

    // Read response:
    $d = $this->getHttpResponseFromStream($fp);
    fclose($fp);

    $time = microtime(true) - $startTime;
    $this->log($originalUrl . ' ' . $time);

    // Implode content and headers:
    $result = [
        'request' => $msg,
        'headers' => implode('', $d['headers']),
        'content' => implode('', (array)$d['content'])
    ];

    if (!empty($this->extensionSettings['follow30x'])) {
        $newUrl = $this->getRequestUrlFrom302Header(
            $d['headers'],
            isset($url['user']) ? $url['user'] : null,
            isset($url['pass']) ? $url['pass'] : null
        );

        if ($newUrl) {
            // Follow the redirect exactly once, keeping the timeout and consuming one recursion level.
            $redirectResult = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

            if (!is_array($redirectResult)) {
                if (TYPO3_DLOG) {
                    GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
                }
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $redirectResult);
        }
    }

    return $result;
}
||
1544 | |||
/**
 * Determines the base path of the website frontend.
 * (e.g. if you call http://mydomain.com/cms/index.php in
 * the browser the base path is "/cms/")
 *
 * Sources, in order of precedence: the "frontendBasePath" extension setting,
 * $GLOBALS['TSFE']->absRefPrefix, and - unless running in CLI mode - the
 * TYPO3_SITE_PATH environment value. The fallback is "/".
 *
 * @return string Base path of the website frontend, normalised to '/<pathSegments>/'
 */
protected function getFrontendBasePath()
{
    if (!empty($this->extensionSettings['frontendBasePath'])) {
        // Explicitly configured in the extension settings.
        $basePath = $this->extensionSettings['frontendBasePath'];
    } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
        // Otherwise try to use config.absRefPrefix.
        $basePath = $GLOBALS['TSFE']->absRefPrefix;
    } elseif (!defined('TYPO3_REQUESTTYPE_CLI') || !TYPO3_REQUESTTYPE_CLI) {
        // If not in CLI mode the base path can be determined from the $_SERVER environment.
        $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
    } else {
        $basePath = '/';
    }

    // Base path must be '/<pathSegments>/':
    if ($basePath != '/') {
        $basePath = rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }

    return $basePath;
}
||
1575 | |||
/**
 * Runs a shell command via shell_exec() and hands back what it printed.
 *
 * @param string $command Shell command to be executed
 * @return string Output of the command as returned by shell_exec()
 */
protected function executeShellCommand($command)
{
    return shell_exec($command);
}
||
1587 | |||
/**
 * Reads an HTTP response from the given stream.
 *
 * Lines up to the first blank line are collected (trimmed) as headers;
 * every following line is collected unmodified as content.
 * If $streamPointer is not a resource, both lists stay empty.
 *
 * @param resource $streamPointer Pointer to connection stream.
 * @return array Associative array with the following items:
 *               headers <array> Response headers sent by server.
 *               content <array> Content, with each line as an array item.
 */
protected function getHttpResponseFromStream($streamPointer)
{
    $response = ['headers' => [], 'content' => []];

    if (!is_resource($streamPointer)) {
        return $response;
    }

    // Read headers. fgets() signals "no more data" with false only, so compare
    // strictly: a line consisting of just "0" is falsy and must not end the loop.
    while (($line = fgets($streamPointer, 2048)) !== false) {
        $line = trim($line);
        if ($line === '') {
            // Blank line separates headers from the body
            break;
        }
        $response['headers'][] = $line;
    }

    // Read content (line endings are kept as delivered)
    while (($line = fgets($streamPointer, 2048)) !== false) {
        $response['content'][] = $line;
    }

    return $response;
}
||
1619 | |||
/**
 * Appends a timestamped line to the log file named by the "logFileName"
 * extension setting.
 *
 * Does nothing when no log file is configured. When the write does not
 * succeed, a hint is sent to the TYPO3 devLog instead.
 *
 * @param string $message Text to log; a "Ymd His" timestamp is prepended and a line break appended
 * @return void
 */
protected function log($message)
{
    if (empty($this->extensionSettings['logFileName'])) {
        return;
    }

    $logFile = $this->extensionSettings['logFileName'];
    $entry = date('Ymd His') . ' ' . $message . PHP_EOL;

    // Warnings of file_put_contents() are suppressed; failure is reported below
    $written = @file_put_contents($logFile, $entry, FILE_APPEND);
    if ($written) {
        return;
    }

    GeneralUtility::devLog('File "' . $this->extensionSettings['logFileName'] . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
}
|
1632 | |||
/**
 * Builds the HTTP request lines (request line plus headers) for a crawler request.
 *
 * The keys read from $url are "path", "query", "host", "user" and "pass".
 * Any of them may be absent (parse_url(), for instance, omits components that
 * are not part of the URL), so each one is read defensively:
 *  - a missing or empty path becomes "/" so the request line stays valid,
 *  - a missing query, user or password is treated as an empty string.
 *
 * A preview cookie is added when the query contains "ADMCMD_previewWS" and a
 * Basic Authorization header when a user name is present.
 *
 * @param array $url URL components (see above)
 * @param string $crawlerId Value sent in the "X-T3crawler" header
 *
 * @return array List of request lines, without line terminators
 */
protected function buildRequestHeaderArray(array $url, $crawlerId)
{
    $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
    $query = isset($url['query']) ? (string)$url['query'] : '';
    $host = isset($url['host']) ? $url['host'] : '';
    $user = isset($url['user']) ? (string)$url['user'] : '';
    $pass = isset($url['pass']) ? $url['pass'] : '';

    $reqHeaders = [];
    // Compare against '' instead of relying on truthiness: a query of "0" is valid
    $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
    $reqHeaders[] = 'Host: ' . $host;
    if (stristr($query, 'ADMCMD_previewWS') !== false) {
        $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
    }
    $reqHeaders[] = 'Connection: close';
    if ($user !== '') {
        $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
    }
    $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
    $reqHeaders[] = 'User-Agent: TYPO3 crawler';

    return $reqHeaders;
}
||
1657 | |||
/**
 * Checks whether the given HTTP response headers describe a redirect and, if so,
 * builds the URL the crawler should request next.
 *
 * A redirect is recognised when the first header line contains "301 Moved",
 * "302 Found" or "302 Moved" (case-insensitive). The target is taken from the
 * first "Location" header; the header name is matched case-insensitively and
 * only the first colon separates name and value, so values containing ": "
 * are kept intact.
 *
 * When $user is given, the credentials are inserted into the target URL
 * (scheme://user:pass@host[:port][path][?query]). That is only possible for
 * absolute URLs; a Location without scheme or host yields false in that case.
 *
 * @param array $headers HTTP Header lines, status line first
 * @param string $user HTTP Auth. User
 * @param string $pass HTTP Auth. Password
 * @return bool|string Redirect URL, or false if there is no usable redirect
 */
protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
{
    if (!is_array($headers) || !isset($headers[0])) {
        return false;
    }

    $statusLine = $headers[0];
    $isRedirect = stristr($statusLine, '301 Moved') !== false
        || stristr($statusLine, '302 Found') !== false
        || stristr($statusLine, '302 Moved') !== false;
    if (!$isRedirect) {
        return false;
    }

    $location = null;
    foreach ($headers as $headerLine) {
        // Limit 2: everything after the first colon belongs to the value.
        // Lines without a colon (e.g. the status line) are skipped.
        $parts = explode(':', $headerLine, 2);
        if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
            $location = trim($parts[1]);
            break;
        }
    }
    if ($location === null) {
        return false;
    }

    if ($user == '') {
        return $location;
    }

    // parse_url() omits components that are not present, so check before use
    $target = parse_url($location);
    if (!$target || !isset($target['scheme'], $target['host'])) {
        return false;
    }

    $newUrl = $target['scheme'] . '://' . $user . ':' . $pass . '@' . $target['host'];
    if (isset($target['port'])) {
        // Keep a non-default port, otherwise the redirect would hit the wrong endpoint
        $newUrl .= ':' . $target['port'];
    }
    if (isset($target['path'])) {
        $newUrl .= $target['path'];
    }
    if (isset($target['query']) && $target['query'] != '') {
        $newUrl .= '?' . $target['query'];
    }

    return $newUrl;
}
||
1700 | |||
1701 | /************************** |
||
1702 | * |
||
1703 | * tslib_fe hooks: |
||
1704 | * |
||
1705 | **************************/ |
||
1706 | |||
/**
 * Initialization hook (called after database connection)
 * Takes the "HTTP_X_T3CRAWLER" header ("<queue id>:<hash>"), looks up the queue
 * record and verifies that the request comes from the system by comparing the
 * hash with md5(qid|set_id|encryptionKey).
 *
 * On success the crawler state is stored in $params['pObj']->applicationData['tx_crawler'];
 * on failure the request is terminated with die(). Requests without the header
 * are left untouched.
 *
 * @param array $params Parameters from frontend
 * @param object $ref TSFE object (reference under PHP5)
 * @return void
 *
 * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
 * FIXME: I think this can be removed. (TNM)
 */
public function fe_init(&$params, $ref)
{
    // Only crawler requests carry the header
    if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
        return;
    }

    // Pad the parts so a header without ":" yields an empty hash instead of an undefined offset
    list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

    // exec_SELECTgetSingleRow() returns the row itself as an associative array
    // (per the TYPO3 DatabaseConnection API), so it must not be unpacked with
    // list(): offset 0 does not exist in such a row and the record was never found.
    $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

    // If a crawler record was found and hash was matching, set it up:
    if (is_array($queueRec) && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])) {
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    } else {
        die('No crawler entry found!');
    }
}
||
1735 | |||
1736 | /***************************** |
||
1737 | * |
||
1738 | * Compiling URLs to crawl - tools |
||
1739 | * |
||
1740 | *****************************/ |
||
1741 | |||
/**
 * Walks the page tree below the given page and compiles the crawl URLs for
 * every page found.
 *
 * The arguments are stored on the object and used by drawURLs_addRowsForPage(),
 * which is invoked once per page. Mount point pages (doktype 7) additionally
 * get the tree of their mounted page processed with $this->MP set to
 * "<mount_pid>-<uid>".
 *
 * @param integer $id Root page id to start from.
 * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
 * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
 * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
 * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
 * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
 * @param array $incomingProcInstructions Array of processing instructions
 * @param array $configurationSelection Array of configuration keys
 * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
 */
public function getPageTreeAndUrls(
    $id,
    $depth,
    $scheduledTime,
    $reqMinute,
    $submitCrawlUrls,
    $downloadCrawlUrls,
    array $incomingProcInstructions,
    array $configurationSelection
) {
    global $LANG;
    if (!is_object($LANG)) {
        $LANG = GeneralUtility::makeInstance(LanguageService::class);
        $LANG->init(0);
    }
    $this->scheduledTime = $scheduledTime;
    $this->reqMinute = $reqMinute;
    $this->submitCrawlUrls = $submitCrawlUrls;
    $this->downloadCrawlUrls = $downloadCrawlUrls;
    $this->incomingProcInstructions = $incomingProcInstructions;
    $this->incomingConfigurationSelection = $configurationSelection;

    $this->duplicateTrack = [];
    $this->downloadUrls = [];

    // Drawing tree:
    /* @var PageTreeView $tree */
    $tree = GeneralUtility::makeInstance(PageTreeView::class);
    $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
    $tree->init('AND ' . $perms_clause);

    $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
    if (is_array($pageInfo)) {
        // Set root row:
        $tree->tree[] = [
            'row' => $pageInfo,
            'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
        ];
    }

    // Get branch beneath:
    if ($depth) {
        $tree->getTree($id, $depth, '');
    }

    // Traverse page tree:
    $code = '';

    foreach ($tree->tree as $data) {
        $this->MP = false;

        // recognize mount points
        if ($data['row']['doktype'] == 7) {
            // Cast the uid so nothing but an integer can end up in the WHERE clause
            $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . intval($data['row']['uid']));

            // fetch mounted pages
            $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

            $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
            $mountTree->init('AND ' . $perms_clause);
            $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');

            foreach ($mountTree->tree as $mountData) {
                $code .= $this->drawURLs_addRowsForPage(
                    $mountData['row'],
                    $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                );
            }

            // replace page when mount_pid_ol is enabled
            if ($mountpage[0]['mount_pid_ol']) {
                $data['row']['uid'] = $mountpage[0]['mount_pid'];
            } else {
                // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                $this->MP = false;
            }
        }

        $code .= $this->drawURLs_addRowsForPage(
            $data['row'],
            $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
        );
    }

    return $code;
}
||
1840 | |||
/**
 * Expands an exclude string into the list of page ids it covers.
 *
 * The string is a comma separated list of parts of the form "<pid>" or
 * "<pid>+<depth>". A part without depth covers only the page itself; "<pid>+"
 * (plus sign, no depth) is expanded with depth 99. For depths greater than
 * zero the uids of the pages found by PageTreeView::getTree() are added.
 *
 * Results are cached in static variables per exclude string, tree lookups per
 * pid and depth.
 *
 * @param string $excludeString Exclude string
 * @return array Unique list of page ids
 */
public function expandExcludeString($excludeString)
{
    // internal static caches;
    static $expandedExcludeStringCache = [];
    static $treeCache = [];

    // isset() instead of empty(): an empty result is a valid cache entry too
    $cacheKey = (string)$excludeString;
    if (isset($expandedExcludeStringCache[$cacheKey])) {
        return $expandedExcludeStringCache[$cacheKey];
    }

    $pidList = [];

    if (!empty($excludeString)) {
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

        $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

        foreach ($excludeParts as $excludePart) {
            // A part without "+" has no depth segment; pad so list() never reads a missing offset
            list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

            // default is "page only" = "depth=0"
            if (empty($depth)) {
                $depth = (stristr($excludePart, '+')) ? 99 : 0;
            }

            $pidList[] = $pid;

            if ($depth > 0) {
                if (!isset($treeCache[$pid][$depth])) {
                    $tree->reset();
                    $tree->getTree($pid, $depth);
                    $treeCache[$pid][$depth] = $tree->tree;
                }

                foreach ($treeCache[$pid][$depth] as $data) {
                    $pidList[] = $data['row']['uid'];
                }
            }
        }
    }

    $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

    return $expandedExcludeStringCache[$cacheKey];
}
||
1892 | |||
/**
 * Create the rows for display of the page tree
 * For each page a number of rows are shown displaying GET variable configuration
 *
 * Configurations that are not part of $this->incomingConfigurationSelection
 * (when that selection is non-empty) are dropped. For each remaining
 * configuration one table row is produced; pages listed in the configuration's
 * "exclude" setting get a "No entries" row instead. Configuration values are
 * HTML-escaped before being written into the table.
 *
 * @param array $pageRow Page row
 * @param string $pageTitleAndIcon Page icon and title for row (already HTML, inserted as-is)
 * @return string HTML <tr> content (one or more)
 */
public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
{
    // Handed to getUrlsForPageRow(), which presumably fills it by reference — it is only read below
    $skipMessage = '';

    // Get list of configurations
    $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

    if (count($this->incomingConfigurationSelection) > 0) {
        // remove configuration that does not match the current selection
        foreach ($configurations as $confKey => $confArray) {
            if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                unset($configurations[$confKey]);
            }
        }
    }

    if (!count($configurations)) {
        $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

        // Compile row:
        return '
            <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                <td>' . $pageTitleAndIcon . '</td>
                <td colspan="6"><em>No entries</em>' . $message . '</td>
            </tr>';
    }

    // Labels of the options shown in the "options" column, keyed by their subCfg key
    $optionLabels = [
        'userGroups' => 'User Groups: ',
        'baseUrl' => 'Base Url: ',
        'procInstrFilter' => 'ProcInstr: ',
    ];

    // Traverse parameter combinations:
    $c = 0;
    $content = '';
    foreach ($configurations as $confKey => $confArray) {
        $subCfg = isset($confArray['subCfg']) ? $confArray['subCfg'] : [];

        // Title column: only the first row carries it, spanning all configuration rows
        if (!$c) {
            $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
        } else {
            $titleClm = '';
        }

        $excludeString = isset($subCfg['exclude']) ? $subCfg['exclude'] : '';
        // Loose comparison is intended: uids may arrive as int or numeric string
        if (!in_array($pageRow['uid'], $this->expandExcludeString($excludeString))) {

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $paramExpanded .= '
                    <tr>
                        <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                        <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                    </tr>
                ';
                $calcRes *= count($gVal);
                $calcAccu[] = count($gVal);
            }
            $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options: escaped, since they come from configuration and end up in HTML
            $optionValues = '';
            foreach ($optionLabels as $optionKey => $label) {
                if (!empty($subCfg[$optionKey])) {
                    $optionValues .= $label . htmlspecialchars($subCfg[$optionKey]) . '<br/>';
                }
            }

            $procInstrParams = isset($subCfg['procInstrParams.']) ? $subCfg['procInstrParams.'] : null;

            // Compile row:
            $content .= '
                <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                    ' . $titleClm . '
                    <td>' . htmlspecialchars($confKey) . '</td>
                    <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                    <td>' . $paramExpanded . '</td>
                    <td nowrap="nowrap">' . $urlList . '</td>
                    <td nowrap="nowrap">' . $optionValues . '</td>
                    <td nowrap="nowrap">' . DebugUtility::viewArray($procInstrParams) . '</td>
                </tr>';
        } else {
            $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                    ' . $titleClm . '
                    <td>' . htmlspecialchars($confKey) . '</td>
                    <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                </tr>';
        }

        $c++;
    }

    return $content;
}
||
2010 | |||
2011 | /***************************** |
||
2012 | * |
||
2013 | * CLI functions |
||
2014 | * |
||
2015 | *****************************/ |
||
2016 | |||
/**
 * Main function for running from Command Line PHP script (cron job)
 * See ext/crawler/cli/crawler_cli.phpsh for details
 *
 * Unless the crawler is disabled or no process slot could be acquired, the
 * queue is processed via CLI_run(); afterwards process records without
 * assigned items are removed and the own process is released.
 *
 * @return int Combination (bitwise OR) of the self::CLI_STATUS_* flags
 */
public function CLI_main()
{
    $this->setAccessMode('cli');
    $result = self::CLI_STATUS_NOTHING_PROCCESSED;
    $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

    $helpRequested = isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help']);
    if ($helpRequested) {
        $cliObj->cli_validateArgs();
        $cliObj->cli_help();
        exit;
    }

    // Disabled crawler or no free process slot: nothing to do
    if ($this->getDisabled() || !$this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
        return $result | self::CLI_STATUS_ABORTED;
    }

    // Command line arguments take precedence over the extension settings
    $countArgument = $cliObj->cli_argValue('--countInARun');
    $countInARun = $countArgument ? intval($countArgument) : $this->extensionSettings['countInARun'];

    // Seconds
    $sleepAfterFinishArgument = $cliObj->cli_argValue('--sleepAfterFinish');
    $sleepAfterFinish = $sleepAfterFinishArgument ? intval($sleepAfterFinishArgument) : $this->extensionSettings['sleepAfterFinish'];

    // Milliseconds
    $sleepTimeArgument = $cliObj->cli_argValue('--sleepTime');
    $sleepTime = $sleepTimeArgument ? intval($sleepTimeArgument) : $this->extensionSettings['sleepTime'];

    try {
        // Run process:
        $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
    } catch (\Exception $exception) {
        $this->CLI_debug(get_class($exception) . ': ' . $exception->getMessage());
        $result = self::CLI_STATUS_ABORTED;
    }

    // Cleanup
    $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

    //TODO can't we do that in a clean way?
    $this->CLI_releaseProcesses($this->CLI_buildProcessId());

    $this->CLI_debug("Unprocessed Items remaining:" . $this->queueRepository->countUnprocessedItems() . " (" . $this->CLI_buildProcessId() . ")");
    if ($this->queueRepository->countUnprocessedItems() > 0) {
        $result |= self::CLI_STATUS_REMAIN;
    } else {
        $result |= self::CLI_STATUS_NOTHING_PROCCESSED;
    }

    return $result;
}
||
2064 | |||
/**
 * Function executed by crawler_im.php cli script.
 *
 * Compiles the URLs for a page tree and, depending on the "-o" option,
 * prints them ("url"), puts them into the queue ("queue"), executes them
 * right away ("exec") or just lists what was found (any other value).
 *
 * @return void
 */
public function CLI_main_im()
{
    $this->setAccessMode('cli_im');

    $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

    // Force user to admin state and set workspace to "Live":
    $this->backendUser->user['admin'] = 1;
    $this->backendUser->setWorkspace(0);

    // Print help
    if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
        $cliObj->cli_validateArgs();
        $cliObj->cli_help();
        exit;
    }

    $cliObj->cli_validateArgs();

    $outputMode = $cliObj->cli_argValue('-o');
    $submitToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

    if ($outputMode === 'exec') {
        $this->registerQueueEntriesInternallyOnly = true;
    }

    if (isset($cliObj->cli_args['_DEFAULT'][2])) {
        // Crawler is called over TYPO3 BE
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
    } else {
        // Crawler is called over cli
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
    }

    // getConfigurationKeys() trims its argument as a string, so it needs the
    // option value, not the CLI controller object (which made the selection
    // silently empty).
    // NOTE(review): '-conf' is assumed to be the option name — confirm against QueueCommandLineController.
    $configurationKeys = $this->getConfigurationKeys($cliObj->cli_argValue('-conf'));

    if (!is_array($configurationKeys)) {
        $configurations = $this->getUrlsForPageId($pageId);
        if (is_array($configurations)) {
            $configurationKeys = array_keys($configurations);
        } else {
            $configurationKeys = [];
        }
    }

    if ($submitToQueue) {
        $reason = new Reason();
        $reason->setReason(Reason::REASON_GUI_SUBMIT);
        $reason->setDetailText('The cli script of the crawler added to the queue');
        // NOTE(review): $this->setID is only assigned further below, so the event
        // carries the previous value — confirm whether that is intended.
        EventDispatcher::getInstance()->post(
            'invokeQueueChange',
            $this->setID,
            ['reason' => $reason]
        );
    }

    if ($this->extensionSettings['cleanUpOldQueueEntries']) {
        $this->cleanUpOldQueueEntries();
    }

    $this->setID = (int) GeneralUtility::md5int(microtime());
    $this->getPageTreeAndUrls(
        $pageId,
        MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
        $this->getCurrentTime(),
        MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
        $submitToQueue,
        $outputMode === 'url',
        GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
        $configurationKeys
    );

    if ($outputMode === 'url') {
        $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
    } elseif ($outputMode === 'exec') {
        $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
        $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        $cliObj->cli_echo("\nProcessing:\n");

        foreach ($this->queueEntries as $queueRec) {
            $p = unserialize($queueRec['parameters']);
            $cliObj->cli_echo($p['url'] . ' (' . implode(',', $p['procInstructions']) . ') => ');

            $result = $this->readUrlFromArray($queueRec);

            // The response content is expected to be a serialized array; anything else is reported as an error
            $requestResult = unserialize($result['content']);
            if (is_array($requestResult)) {
                $resLog = is_array($requestResult['log']) ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log']) : '';
                $cliObj->cli_echo('OK: ' . $resLog . chr(10));
            } else {
                $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
            }
        }
    } elseif ($outputMode === 'queue') {
        $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
        $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
    } else {
        $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
        $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
    }
}
||
2168 | |||
/**
 * CLI entry point for flushing log entries of a page.
 *
 * The first default argument is the page id (0 means a full flush); the "-o"
 * option selects what is flushed: "all", "finished" or "pending". Any other
 * mode prints the help text.
 *
 * @return bool false when the mode is unknown or getLogEntriesForPageId() returned false, true otherwise
 */
public function CLI_main_flush()
{
    $this->setAccessMode('cli_flush');
    $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

    // Force user to admin state and set workspace to "Live":
    $this->backendUser->user['admin'] = 1;
    $this->backendUser->setWorkspace(0);

    // Print help when no page id was passed
    $pageIdMissing = !isset($cliObj->cli_args['_DEFAULT'][1]);
    if ($pageIdMissing) {
        $cliObj->cli_validateArgs();
        $cliObj->cli_help();
        exit;
    }

    $cliObj->cli_validateArgs();
    $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
    $fullFlush = ($pageId == 0);

    $mode = $cliObj->cli_argValue('-o');

    // Translate the mode into the filter handed to getLogEntriesForPageId()
    if ($mode == 'all') {
        $filter = '';
    } elseif ($mode == 'finished' || $mode == 'pending') {
        $filter = $mode;
    } else {
        $cliObj->cli_validateArgs();
        $cliObj->cli_help();

        return false;
    }

    $result = $this->getLogEntriesForPageId($pageId, $filter, true, $fullFlush);

    return $result !== false;
}
||
2212 | |||
/**
 * Turns a comma separated list of configuration keys into an array.
 *
 * @param string $conf Comma separated configuration keys
 * @return array List of keys; empty when $conf is empty or whitespace only
 *
 * @deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
 */
protected function getConfigurationKeys($conf)
{
    $keyList = trim($conf);
    if ($keyList == '') {
        return [];
    }

    return GeneralUtility::trimExplode(',', $keyList);
}
||
2226 | |||
/**
 * Running the functionality of the CLI (crawling URLs from queue)
 *
 * Steps: run the CLI hooks, purge old queue entries (if "purgeQueueDays" is
 * set), select due queue entries, reserve them for this process inside a
 * transaction and read the URLs one by one.
 *
 * @param int $countInARun Maximum number of queue entries selected for this run
 * @param int $sleepTime Pause after each URL, passed to usleep()
 * @param int $sleepAfterFinish Pause after the run, passed to sleep()
 * @return int Combination (bitwise OR) of the readUrl() results and self::CLI_STATUS_* flags
 */
public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
{
    $result = 0;
    $counter = 0;

    // First, run hooks:
    $this->CLI_runHooks();

    // Clean up the queue
    if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
        $secondsPerDay = 24 * 60 * 60;
        $purgeDate = $this->getCurrentTime() - $secondsPerDay * intval($this->extensionSettings['purgeQueueDays']);
        $deleted = $this->db->exec_DELETEquery(
            'tx_crawler_queue',
            'exec_time!=0 AND exec_time<' . $purgeDate
        );
        if (false == $deleted) {
            GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
        }
    }

    // Select entries:
    //TODO Shouldn't this reside within the transaction?
    $rows = $this->db->exec_SELECTgetRows(
        'qid,scheduled',
        'tx_crawler_queue',
        'exec_time=0
            AND process_scheduled= 0
            AND scheduled<=' . $this->getCurrentTime(),
        '',
        'scheduled, qid',
        intval($countInARun)
    );

    // Nothing due: report and leave (no entry was processed, so no PROCESSED flag)
    if (!(count($rows) > 0)) {
        $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

        return $result;
    }

    $queueIds = array_map(
        function ($queueRow) {
            return $queueRow['qid'];
        },
        $rows
    );

    $processId = $this->CLI_buildProcessId();

    //reserve queue entries for process
    $this->db->sql_query('BEGIN');
    //TODO make sure we're not taking assigned queue-entires
    $this->db->exec_UPDATEquery(
        'tx_crawler_queue',
        'qid IN (' . implode(',', $queueIds) . ')',
        [
            'process_scheduled' => intval($this->getCurrentTime()),
            'process_id' => $processId
        ]
    );

    //save the number of assigned queue entries to determine how many have been processed later
    $numberOfAffectedRows = $this->db->sql_affected_rows();
    $this->db->exec_UPDATEquery(
        'tx_crawler_process',
        "process_id = '" . $processId . "'",
        [
            'assigned_items_count' => intval($numberOfAffectedRows)
        ]
    );

    // Not every selected entry could be reserved: treat it as a collision and back out
    if ($numberOfAffectedRows != count($queueIds)) {
        $this->db->sql_query('ROLLBACK');
        $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");

        return ($result | self::CLI_STATUS_ABORTED);
    }
    $this->db->sql_query('COMMIT');

    foreach ($rows as $queueRow) {
        $result |= $this->readUrl($queueRow['qid']);

        $counter++;
        usleep(intval($sleepTime)); // Just to relax the system

        // if during the start and the current read url the cli has been disable we need to return from the function
        // mark the process NOT as ended.
        if ($this->getDisabled()) {
            return ($result | self::CLI_STATUS_ABORTED);
        }

        if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
            $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

            //TODO might need an additional returncode
            $result |= self::CLI_STATUS_ABORTED;
            break; //possible timeout
        }
    }

    sleep(intval($sleepAfterFinish));

    $this->CLI_debug('Rows: ' . $counter . " (" . $this->CLI_buildProcessId() . ")");

    if ($counter > 0) {
        $result |= self::CLI_STATUS_PROCESSED;
    }

    return $result;
}
||
2342 | |||
/**
 * Activate hooks
 *
 * Calls crawler_init($this) on every hook object registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'].
 * Does nothing when no hooks are registered.
 *
 * @return void
 */
public function CLI_runHooks()
{
    // Checked with empty() first so a missing configuration key does not raise a notice
    if (empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
    ) {
        return;
    }

    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
        // Plain assignment: objects are handled by identifier, and assigning the
        // result of a function call by reference ("=&") raises a notice.
        $hookObj = GeneralUtility::getUserObj($objRef);
        if (is_object($hookObj)) {
            $hookObj->crawler_init($this);
        }
    }
}
||
2360 | |||
/**
 * Try to acquire a new process with the given id
 * also performs some auto-cleanup for orphan processes
 * @todo preemption might not be the most elegant way to clean up
 *
 * Active processes whose ttl lies in the past count as orphans and are
 * released; the others count against the "processLimit" extension setting.
 * A new process record is only inserted while the limit is not reached.
 *
 * @param string $id identification string for the process
 * @return boolean true if a process record was created, false otherwise
 */
public function CLI_checkAndAcquireNewProcess($id)
{
    $systemProcessId = getmypid();
    if ($systemProcessId < 1) {
        return false;
    }

    $this->db->sql_query('BEGIN');

    $res = $this->db->exec_SELECTquery(
        'process_id,ttl',
        'tx_crawler_process',
        'active=1 AND deleted=0'
    );

    $currentTime = $this->getCurrentTime();

    // Sort the active processes into expired (orphans) and still running ones
    $runningCount = 0;
    $expiredProcessIds = [];
    while ($row = $this->db->sql_fetch_assoc($res)) {
        if ($row['ttl'] < $currentTime) {
            $expiredProcessIds[] = $row['process_id'];
            continue;
        }
        $runningCount++;
    }

    // if there are less than allowed active processes then add a new one
    $acquired = $runningCount < intval($this->extensionSettings['processLimit']);
    if ($acquired) {
        $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningCount + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

        // create new process record
        $this->db->exec_INSERTquery(
            'tx_crawler_process',
            [
                'process_id' => $id,
                'active' => '1',
                'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
                'system_process_id' => $systemProcessId
            ]
        );
    } else {
        $this->CLI_debug("Processlimit reached (" . ($runningCount) . "/" . intval($this->extensionSettings['processLimit']) . ")");
    }

    $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock
    $this->CLI_deleteProcessesMarkedDeleted();

    $this->db->sql_query('COMMIT');

    return $acquired;
}
||
2425 | |||
/**
 * Release a process and the required resources
 *
 * Runs four UPDATE statements, wrapped in BEGIN/COMMIT unless $withinLock is set:
 *  1. queue entries assigned to inactive, non-deleted processes are unassigned
 *     (process_scheduled = 0, process_id = ''),
 *  2. inactive, non-deleted processes without any queue entry having exec_time = 0
 *     are marked as deleted,
 *  3. the requested processes are marked as non-active,
 *  4. unprocessed queue entries (exec_time = 0) of the requested processes are unassigned.
 *
 * Because steps 1 and 2 run before step 3, a process released by this call is only
 * marked as deleted by a later call ("2nd chance").
 *
 * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
 * @param boolean $withinLock show whether the DB-actions are included within an existing lock
 * @return boolean false when there was nothing to release, true otherwise
 */
public function CLI_releaseProcesses($releaseIds, $withinLock = false)
{
    if (!is_array($releaseIds)) {
        $releaseIds = [$releaseIds];
    }

    if (count($releaseIds) === 0) {
        return false; //nothing to release
    }

    // Quote every id through the database layer instead of concatenating raw values into
    // the statement; this also yields one consistent (single-quoted) literal style.
    $quotedIds = [];
    foreach ($releaseIds as $releaseId) {
        $quotedIds[] = $this->db->fullQuoteStr((string)$releaseId, 'tx_crawler_process');
    }
    $idList = implode(',', $quotedIds);

    if (!$withinLock) {
        $this->db->sql_query('BEGIN');
    }

    // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
    // this ensures that a single process can't mess up the entire process table

    // unassign all queue entries which still point to a non-active process
    $this->db->exec_UPDATEquery(
        'tx_crawler_queue',
        'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
        [
            'process_scheduled' => 0,
            'process_id' => ''
        ]
    );
    // mark all processes as deleted which have no "waiting" queue-entries and which are not active
    $this->db->exec_UPDATEquery(
        'tx_crawler_process',
        'active=0 AND deleted=0
        AND NOT EXISTS (
            SELECT * FROM tx_crawler_queue
            WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
            AND tx_crawler_queue.exec_time = 0
        )',
        [
            'deleted' => '1',
            'system_process_id' => 0
        ]
    );
    // mark all requested processes as non-active
    $this->db->exec_UPDATEquery(
        'tx_crawler_process',
        'process_id IN (' . $idList . ') AND deleted=0',
        [
            'active' => '0'
        ]
    );
    // hand the not yet processed queue entries of the requested processes back to the queue
    $this->db->exec_UPDATEquery(
        'tx_crawler_queue',
        'exec_time=0 AND process_id IN (' . $idList . ')',
        [
            'process_scheduled' => 0,
            'process_id' => ''
        ]
    );

    if (!$withinLock) {
        $this->db->sql_query('COMMIT');
    }

    return true;
}
||
2495 | |||
/**
 * Delete processes marked as deleted
 *
 * Issues a single DELETE on tx_crawler_process for all rows with deleted = 1.
 *
 * @return void
 */
public function CLI_deleteProcessesMarkedDeleted()
{
    $table = 'tx_crawler_process';
    $where = 'deleted = 1';

    $this->db->exec_DELETEquery($table, $where);
}
|
2505 | |||
/**
 * Check if there are still resources left for the process with the given id
 * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
 *
 * Looks up the first non-deleted tx_crawler_process row for the id (ordered by ttl)
 * and reports whether its "active" flag equals 1. A missing row counts as not active.
 *
 * @param string $pid identification string for the process
 * @return boolean determines if the process is still active / has resources
 *
 * FIXME: Please remove Transaction, not needed as only a select query.
 */
public function CLI_checkIfProcessIsActive($pid)
{
    $ret = false;

    // Quote the id through the database layer instead of concatenating the raw value
    $quotedPid = $this->db->fullQuoteStr((string)$pid, 'tx_crawler_process');

    $this->db->sql_query('BEGIN');
    $res = $this->db->exec_SELECTquery(
        'process_id,active,ttl',
        'tx_crawler_process',
        'process_id = ' . $quotedPid . ' AND deleted=0',
        '',
        'ttl',
        '0,1'
    );
    if ($row = $this->db->sql_fetch_assoc($res)) {
        $ret = intval($row['active']) === 1;
    }
    $this->db->sql_query('COMMIT');

    return $ret;
}
||
2534 | |||
/**
 * Create a unique Id for the current process
 *
 * The id is built once from a short MD5 of the current microtime and then
 * cached in $this->processID; later calls return the cached value.
 *
 * @return string the ID
 */
public function CLI_buildProcessId()
{
    if ($this->processID) {
        return $this->processID;
    }

    $this->processID = GeneralUtility::shortMD5($this->microtime(true));

    return $this->processID;
}
||
2547 | |||
/**
 * Thin wrapper around PHP's microtime() function.
 *
 * @param bool $get_as_float passed through unchanged to microtime()
 *
 * @return mixed whatever microtime() returns for the given flag
 */
protected function microtime($get_as_float = false)
{
    $currentMicrotime = microtime($get_as_float);

    return $currentMicrotime;
}
||
2557 | |||
/**
 * Prints a message to the stdout (only if debug-mode is enabled)
 *
 * Debug-mode is on when the "processDebug" extension setting evaluates to a
 * non-zero integer. The message is followed by a newline and the output
 * buffer is flushed right away.
 *
 * @param string $msg the message
 */
public function CLI_debug($msg)
{
    $debugEnabled = intval($this->extensionSettings['processDebug']) !== 0;
    if (!$debugEnabled) {
        return;
    }

    echo $msg . "\n";
    flush();
}
||
2570 | |||
/**
 * Get URL content by making direct request to TYPO3.
 *
 * Builds a shell command from the configured "phpPath", the extension's
 * cli/bootstrap.php, the frontend base path, the URL and the base64-encoded
 * serialized request headers, executes it and logs the URL together with
 * the elapsed time.
 *
 * @param string $url Page URL
 * @param int $crawlerId Crawler-ID
 * @return array empty when the URL cannot be parsed; otherwise the keys
 *               "request" (header lines joined by CRLF), "headers" (always an
 *               empty string) and "content" (output of the shell command)
 */
protected function sendDirectRequest($url, $crawlerId)
{
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        return [];
    }

    $headerLines = $this->buildRequestHeaderArray($urlParts, $crawlerId);

    // Every part is escaped on its own, then joined by single spaces
    $commandParts = [
        escapeshellcmd($this->extensionSettings['phpPath']),
        escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
        escapeshellarg($this->getFrontendBasePath()),
        escapeshellarg($url),
        escapeshellarg(base64_encode(serialize($headerLines))),
    ];
    $command = implode(' ', $commandParts);

    $startedAt = microtime(true);
    $output = $this->executeShellCommand($command);
    $this->log($url . ' ' . (microtime(true) - $startedAt));

    return [
        'request' => implode("\r\n", $headerLines) . "\r\n\r\n",
        'headers' => '',
        'content' => $output,
    ];
}
||
2609 | |||
/**
 * Cleans up entries that stayed for too long in the queue. These are:
 * - processed entries (exec_time set) older than "cleanUpProcessedAge" days
 * - entries scheduled "cleanUpScheduledAge" or more days ago
 *
 * Both ages are read from the extension settings and converted to seconds;
 * the resulting condition is handed to flushQueue().
 *
 * @return void
 *
 * TODO: Should be switched back to protected - TNM 2018-11-16
 */
public function cleanUpOldQueueEntries()
{
    $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

    $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
    $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

    $now = time();
    $processedBefore = $now - $maxProcessedAge;
    $scheduledUntil = $now - $maxScheduledAge;

    $this->flushQueue(
        '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledUntil
    );
}
||
2628 | |||
/**
 * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
 *
 * Installs a NullTimeTracker in $GLOBALS['TT'] when no tracker object is present,
 * creates a TypoScriptFrontendController in $GLOBALS['TSFE'] and runs its
 * initialisation steps in a fixed order. The order of the calls is significant
 * and must not be changed.
 *
 * @param int $id page id handed to the frontend controller and used for the root line
 * @param int $typeNum type number handed to the frontend controller
 *
 * @return void
 */
protected function initTSFE($id = 1, $typeNum = 0)
{
    EidUtility::initTCA();

    if (!is_object($GLOBALS['TT'])) {
        $GLOBALS['TT'] = new NullTimeTracker();
        $GLOBALS['TT']->start();
    }

    // Register the controller globally before anything else touches it
    $frontend = GeneralUtility::makeInstance(
        TypoScriptFrontendController::class,
        $GLOBALS['TYPO3_CONF_VARS'],
        $id,
        $typeNum
    );
    $GLOBALS['TSFE'] = $frontend;

    $frontend->sys_page = GeneralUtility::makeInstance(PageRepository::class);
    $frontend->sys_page->init(true);
    $frontend->connectToDB();
    $frontend->initFEuser();
    $frontend->determineId();
    $frontend->initTemplate();
    $frontend->rootLine = $frontend->sys_page->getRootLine($id, '');
    $frontend->getConfigArray();

    PageGenerator::pagegenInit();
}
||
2656 | |||
/**
 * Returns a md5 hash generated from a serialized configuration array.
 *
 * The keys "paramExpanded" and "URLs" are dropped before hashing, so they
 * do not influence the result. The caller's array is not modified.
 *
 * @param array $configuration
 *
 * @return string
 */
protected function getConfigurationHash(array $configuration)
{
    unset($configuration['paramExpanded'], $configuration['URLs']);

    $serialized = serialize($configuration);

    return md5($serialized);
}
||
2669 | |||
/**
 * Check whether the Crawling Protocol should be http or https
 *
 * The crawler configuration is compared loosely, in this order:
 *  - equal to -1: false
 *  - equal to 0: the page configuration value is returned unchanged
 *  - equal to 1: true
 *  - anything else: false
 *
 * @param $crawlerConfiguration
 * @param $pageConfiguration
 *
 * @return bool
 */
protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
{
    // Loose comparisons in the same order as before, to keep the type-juggling rules
    if ($crawlerConfiguration == -1) {
        return false;
    }

    if ($crawlerConfiguration == 0) {
        return $pageConfiguration;
    }

    if ($crawlerConfiguration == 1) {
        return true;
    }

    return false;
}
||
2690 | } |
||
2691 |
This method has been deprecated. The supplier of the class has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the method will be removed from the class and what other method or class to use instead.