1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace AOE\Crawler\Hooks; |
||
6 | |||
7 | /* |
||
8 | * (c) 2020 AOE GmbH <[email protected]> |
||
9 | * |
||
10 | * This file is part of the TYPO3 Crawler Extension. |
||
11 | * |
||
12 | * It is free software; you can redistribute it and/or modify it under |
||
13 | * the terms of the GNU General Public License, either version 2 |
||
14 | * of the License, or any later version. |
||
15 | * |
||
16 | * For the full copyright and license information, please read the |
||
17 | * LICENSE.txt file that was distributed with this source code. |
||
18 | * |
||
19 | * The TYPO3 project - inspiring people to share! |
||
20 | */ |
||
21 | |||
22 | use AOE\Crawler\Controller\CrawlerController; |
||
23 | use AOE\Crawler\Domain\Repository\QueueRepository; |
||
24 | use TYPO3\CMS\Backend\Utility\BackendUtility; |
||
25 | use TYPO3\CMS\Core\Core\Environment; |
||
26 | use TYPO3\CMS\Core\Database\Connection; |
||
27 | use TYPO3\CMS\Core\Database\ConnectionPool; |
||
28 | use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction; |
||
29 | use TYPO3\CMS\Core\DataHandling\DataHandler; |
||
30 | use TYPO3\CMS\Core\Exception\Page\RootLineException; |
||
31 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
32 | use TYPO3\CMS\Core\Utility\MathUtility; |
||
33 | use TYPO3\CMS\Core\Utility\RootlineUtility; |
||
34 | |||
35 | /** |
||
36 | * Crawler hook for indexed search. Works with the "crawler" extension |
||
37 | * @internal this is a TYPO3-internal hook implementation and not part of TYPO3's Core API. |
||
38 | * @deprecated This class is deprecated and will be removed when dropping support for TYPO3 9LTS and 10LTS |
||
39 | * @codeCoverageIgnore |
||
40 | */ |
||
41 | class IndexedSearchCrawlerHook |
||
42 | { |
||
/**
 * Number of seconds to use as interval between queued indexing operations.
 *
 * It is multiplied with $instanceCounter and added to $GLOBALS['EXEC_TIME'] to
 * compute the scheduled time of follow-up queue entries for files (type 2),
 * external URLs (type 3) and page tree entries (type 4).
 *
 * @var int
 */
public $secondsPerExternalUrl = 3;

/**
 * Running counter, incremented for every follow-up item (file, URL or subpage)
 * considered for queueing by this instance; used to stagger schedule times.
 *
 * @var int
 */
public $instanceCounter = 0;

/**
 * Class name handed to the crawler as call back for every queue entry created
 * by this hook.
 *
 * @var string
 */
public $callBack = self::class;
||
61 | |||
/**
 * Makes sure a language service object is available in $GLOBALS['LANG'].
 *
 * An already existing object is left untouched. Otherwise a new
 * LanguageService is created and initialised with the language stored in the
 * backend user's configuration. When no backend user or no stored language is
 * available, the key "default" is used instead of failing on a null access.
 */
public function __construct()
{
    // To make sure the backend charset is available:
    if (is_object($GLOBALS['LANG'] ?? null)) {
        return;
    }

    $GLOBALS['LANG'] = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Localization\LanguageService::class);
    // "??" also covers a missing/null $GLOBALS['BE_USER'], which previously ended in a fatal error.
    $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang'] ?? 'default');
}
||
70 | |||
/**
 * Initialization of crawler hook.
 *
 * Called for each instance of the crawler. Selects all indexing configurations
 * whose "timer_next_indexing" lies before the current execution time and which
 * are not running (set_id = 0), marks each of them as started (new set_id, next
 * indexing time, cleared session data) and puts an initial entry into the
 * crawler queue according to the configuration type:
 *
 *  - 1: records, 2: files, 3: external URL, 4: page tree
 *  - 5: meta configuration, nothing is queued
 *  - anything else: delegated to a hook class registered in
 *    $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][<type>];
 *    nothing is queued when no such class is registered.
 *
 * Finally, running configurations without pending queue entries are cleaned up.
 *
 * @param CrawlerController $pObj Parent object used to add the queue entries
 */
public function crawler_init(CrawlerController &$pObj): void
{
    // Select all indexing configuration which are waiting to be activated:
    $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
    $queryBuilder = $connection->createQueryBuilder();

    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->lt(
                'timer_next_indexing',
                $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->execute();

    // For each configuration, check if it should be executed and if so, start:
    while ($cfgRec = $result->fetch()) {
        // Generate a unique set-ID:
        $setId = GeneralUtility::md5int(microtime());
        // Get next time:
        $nextTime = $this->generateNextIndexingTime($cfgRec);
        // Start process by updating index-config record:
        $connection->update(
            'index_config',
            [
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => '',
            ],
            [
                'uid' => (int) $cfgRec['uid'],
            ]
        );

        // Parameters shared by all built-in types; "url" (and "depth") are added per type below.
        // The key order is kept as before: indexConfigUid, procInstructions, url, depth.
        $params = [
            'indexConfigUid' => $cfgRec['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
        ];

        // Based on configuration type:
        switch ($cfgRec['type']) {
            case 1:
                // RECORDS:
                $params['url'] = 'Records (start)';
                break;
            case 2:
                // FILES:
                $params['url'] = $cfgRec['filepath'];
                $params['depth'] = 0;
                break;
            case 3:
                // External URL:
                $params['url'] = $cfgRec['externalUrl'];
                $params['depth'] = 0;
                break;
            case 4:
                // Page tree (root):
                $params['url'] = (int) $cfgRec['alternative_source_pid'];
                $params['depth'] = 0;
                break;
            case 5:
                // Meta configuration, nothing to do:
                $params = null;
                break;
            default:
                // Custom type: only handled when a hook class is registered for it.
                $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                if ($hookClass) {
                    $hookObj = GeneralUtility::makeInstance($hookClass);
                    // Previously an undefined variable was handed to initMessage(); define it explicitly.
                    $message = null;
                    $params['procInstructions'] = ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'];
                    $params['url'] = $hookObj->initMessage($message);
                } else {
                    $params = null;
                }
        }

        if ($params !== null) {
            $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
        }
    }

    // Finally, look up all old index configurations which are finished and needs to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}
||
187 | |||
/**
 * Call back function for execution of a log element.
 *
 * Loads the indexing configuration referenced by $params['indexConfigUid'],
 * dispatches to the handler for its type (1-4 built in, 5 is a no-op, other
 * types go to a hook class registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler']) and
 * writes the possibly modified session data back to the configuration record.
 *
 * @param array $params Params from log element. Must contain $params['indexConfigUid']
 * @param object $pObj Parent object (tx_crawler lib)
 * @return array Result array; always ['log' => $params]
 */
public function crawler_execute($params, &$pObj)
{
    // Indexer configuration ID must exist:
    if (empty($params['indexConfigUid'])) {
        return ['log' => $params];
    }

    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $queryBuilder->getRestrictions()->removeAll();
    // Load the indexing configuration record:
    $cfgRec = $queryBuilder
        ->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq(
                'uid',
                $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
            )
        )
        ->execute()
        ->fetch();
    if (! is_array($cfgRec)) {
        return ['log' => $params];
    }

    // Unpack session data:
    // NOTE(review): unserialize() runs on database content; this is only safe as long as
    // index_config.session_data is written exclusively by this class / trusted hooks.
    $session_data = unserialize($cfgRec['session_data']);

    // Select which type:
    switch ($cfgRec['type']) {
        case 1:
            // Records:
            $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
            break;
        case 2:
            // Files
            $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
            break;
        case 3:
            // External URL:
            $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
            break;
        case 4:
            // Page tree:
            $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
            break;
        case 5:
            // Meta
            // NOOP (should never enter here!)
            break;
        default:
            // Custom type: only handled when a hook class is registered for it.
            $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
            if ($hookClass) {
                $hookObj = GeneralUtility::makeInstance($hookClass);
                // Kept for addQueueEntryForHook(), which the hook may call on $this.
                // NOTE(review): no declaration of $pObj is visible in this part of the class - confirm it exists.
                $this->pObj = $pObj;
                $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
            }
    }

    // Save process data which might be modified:
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_config')
        ->update(
            'index_config',
            ['session_data' => serialize($session_data)],
            ['uid' => (int) $cfgRec['uid']]
        );

    return ['log' => $params];
}
||
259 | |||
/**
 * Indexing records from a table.
 *
 * Processes one batch of records of the configured table, ordered by uid and
 * starting after the uid stored in the session data. When at least one record
 * was indexed, a follow-up queue entry is added so the next batch gets
 * processed; the chain ends with the first run that finds no record.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj): void
{
    if (! $cfgRec['table2index'] || ! isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
        return;
    }

    // Init session data array if not already:
    if (! is_array($session_data)) {
        $session_data = [
            'uid' => 0,
        ];
    }

    // Init:
    $pid = (int) $cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch']
        ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
        : 100;

    // Get root line:
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

    // Select the next batch:
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable($cfgRec['table2index']);
    $result = $queryBuilder->select('*')
        ->from($cfgRec['table2index'])
        ->where(
            $queryBuilder->expr()->eq(
                'pid',
                $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
            ),
            $queryBuilder->expr()->gt(
                'uid',
                $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
            )
        )
        ->orderBy('uid')
        ->setMaxResults($numberOfRecords)
        ->execute();

    // Traverse:
    $processedRecords = 0;
    while ($row = $result->fetch()) {
        // Index single record:
        $this->indexSingleRecord($row, $cfgRec, $rootLine);
        // Update the UID we last processed:
        $session_data['uid'] = $row['uid'];
        $processedRecords++;
    }

    // Finally, set entry for next indexing of batch of records.
    // The former COUNT(uid) query reused the builder including ORDER BY / LIMIT and the old
    // start uid, so it was non-zero exactly when this batch contained rows. Counting the
    // processed rows gives the same decision without a second (non-portable) query.
    if ($processedRecords > 0) {
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
    }
}
||
326 | |||
/**
 * Indexing files from fileadmin.
 *
 * $params['url'] is made absolute and checked to be an allowed path. A file is
 * indexed directly. For a directory, its files (filtered by the configured
 * extensions) and - while the configured depth is not reached - its
 * sub-directories are added to the crawler queue with staggered schedule times.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj): void
{
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (! GeneralUtility::isAbsPath($readpath)) {
        $readpath = GeneralUtility::getFileAbsFileName($readpath);
    }
    if (! GeneralUtility::isAllowedAbsPath($readpath)) {
        return;
    }

    if (@is_file($readpath)) {
        // If file, index it!
        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        // (Re)-Indexing file on page.
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        // Index document:
        $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
        return;
    }

    if (! @is_dir($readpath)) {
        return;
    }

    // If dir, read content and create new pending items for log:
    // Select files and directories in path:
    $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
    $fileArr = [];
    $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, false, 0);
    $directoryList = GeneralUtility::get_dirs($readpath);
    if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
        // Normalise the separator: without it a configured path lacking the trailing slash
        // produced broken entries such as ".../docssubdir/".
        $directoryPrefix = rtrim($readpath, '/') . '/';
        foreach ($directoryList as $subdir) {
            if ($subdir !== '') {
                $files[] = $directoryPrefix . $subdir . '/';
            }
        }
    }
    $files = GeneralUtility::removePrefixPathFromList($files, Environment::getPublicPath() . '/');
    if (! is_array($files)) {
        // NOTE(review): removePrefixPathFromList() is documented to return an error string when an
        // entry is outside the given prefix - nothing can be queued then. Confirm against the TYPO3 version in use.
        return;
    }

    // traverse the items and create log entries:
    foreach ($files as $path) {
        $this->instanceCounter++;
        if ($path !== $params['url']) {
            // Parameters:
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $path,
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                'depth' => $params['depth'] + 1,
            ];
            $pObj->addQueueEntry_callBack(
                $cfgRec['set_id'],
                $nparams,
                $this->callBack,
                $cfgRec['pid'],
                $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
            );
        }
    }
}
||
387 | |||
/**
 * Indexing External URLs.
 *
 * Indexes $params['url'] and, while the configured depth is not reached,
 * queues every link found on it that passes checkUrl() and is not matched by
 * the "url_deny" list. Queued URLs are remembered in $session_data['urlLog'].
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj): void
{
    // First call of the session: start the URL log with the entry URL.
    if (! is_array($session_data)) {
        $session_data = ['urlLog' => [$params['url']]];
    }

    // Index the URL itself and collect the links found on it.
    $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

    // Maximum depth reached: do not descend any further.
    if (! ($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    foreach ($foundUrls as $candidate) {
        $acceptedUrl = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (! $acceptedUrl) {
            continue;
        }
        if ($this->checkDeniedSuburls($acceptedUrl, $cfgRec['url_deny'])) {
            continue;
        }

        $this->instanceCounter++;
        $session_data['urlLog'][] = $acceptedUrl;

        $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
        $queueParams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $acceptedUrl,
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1,
        ];
        $pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParams, $this->callBack, $cfgRec['pid'], $scheduledTime);
    }
}
||
427 | |||
/**
 * Page tree indexing type.
 *
 * Submits the URLs of the page given in $params['url'] (a page uid) to the
 * crawler with the "tx_indexedsearch_reindex" processing instruction and,
 * while the configured depth is not reached, queues all non-deleted subpages.
 *
 * @param array $cfgRec Indexing Configuration Record
 * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
 * @param array $params Parameters from the log queue.
 * @param object $pObj Parent object (from "crawler" extension!)
 */
public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj): void
{
    // Base page uid:
    $pageUid = (int) $params['url'];

    // Get array of URLs from page. There is no page row for uid 0 (tree root) or a
    // vanished page; in that case only the subpages are handled.
    $pageRow = BackendUtility::getRecord('pages', $pageUid);
    if (is_array($pageRow)) {
        $res = $pObj->getUrlsForPageRow($pageRow);
        // Registry for duplicates
        $duplicateTrack = [];
        // Dummy.
        $downloadUrls = [];
        // Submit URLs:
        if (! empty($res)) {
            foreach ($res as $vv) {
                $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
            }
        }
    }

    // Add subpages to log now:
    if (! ($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    // Session data may still be non-array (nothing stored yet); make it an array before appending.
    if (! is_array($session_data)) {
        $session_data = [];
    }

    // Subpages selected
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
    $queryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
    $result = $queryBuilder->select('uid', 'title')
        ->from('pages')
        ->where(
            $queryBuilder->expr()->eq(
                'pid',
                $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
            )
        )
        ->execute();

    // Traverse subpages and add to queue:
    while ($row = $result->fetch()) {
        $this->instanceCounter++;
        $url = 'pages:' . $row['uid'] . ': ' . $row['title'];
        $session_data['urlLog'][] = $url;
        // Parameters:
        $nparams = [
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $row['uid'],
            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            'depth' => $params['depth'] + 1,
        ];
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $nparams,
            $this->callBack,
            $cfgRec['pid'],
            $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
        );
    }
}
||
491 | |||
/**
 * Look up all old index configurations which are finished and needs to be reset and done.
 *
 * A configuration counts as running while its set_id is not 0. For each running
 * configuration without queue entries of its set that have exec_time = 0, the
 * index rows written for this configuration by earlier sets are removed from
 * all phash related tables and the configuration is reset (set_id = 0, empty
 * session data).
 */
public function cleanUpOldRunningConfigurations(): void
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // List of tables that store information related to the phash value
    $tablesToClean = [
        'index_phash',
        'index_rel',
        'index_section',
        'index_grlist',
        'index_fulltext',
        'index_debug',
    ];

    $configQueryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
    $configQueryBuilder->getRestrictions()
        ->removeAll()
        ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

    // Lookup running index configurations:
    $runningIndexingConfigurations = $configQueryBuilder->select('*')
        ->from('index_config')
        ->where($configQueryBuilder->expr()->neq('set_id', $configQueryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
        ->execute()
        ->fetchAll();

    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means; Processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        // Look for ended processes:
        $queued_items = $connectionPool->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->count(
                '*',
                QueueRepository::TABLE_NAME,
                [
                    'set_id' => (int) $cfgRec['set_id'],
                    'exec_time' => 0,
                ]
            );
        if ($queued_items) {
            continue;
        }

        // Lookup old phash rows:
        $phashQueryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
        $oldPhashRows = $phashQueryBuilder
            ->select('phash')
            ->from('index_phash')
            ->where(
                $phashQueryBuilder->expr()->eq(
                    'freeIndexUid',
                    $phashQueryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                ),
                $phashQueryBuilder->expr()->neq(
                    'freeIndexSetId',
                    $phashQueryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchAll();

        // Removing old registrations for all tables. The phash list is identical for every
        // table, so build it once; with an empty list there is nothing to delete and the
        // six DELETE statements are skipped altogether.
        $oldPhashes = array_column($oldPhashRows, 'phash');
        if ($oldPhashes !== []) {
            foreach ($tablesToClean as $table) {
                $deleteQueryBuilder = $connectionPool->getQueryBuilderForTable($table);
                $deleteQueryBuilder->delete($table)
                    ->where(
                        $deleteQueryBuilder->expr()->in(
                            'phash',
                            $deleteQueryBuilder->createNamedParameter(
                                $oldPhashes,
                                Connection::PARAM_INT_ARRAY
                            )
                        )
                    )
                    ->execute();
            }
        }

        // End process by updating index-config record:
        $connectionPool->getConnectionForTable('index_config')
            ->update(
                'index_config',
                [
                    'set_id' => 0,
                    'session_data' => '',
                ],
                ['uid' => (int) $cfgRec['uid']]
            );
    }
}
||
580 | |||
581 | /***************************************** |
||
582 | * |
||
583 | * Helper functions |
||
584 | * |
||
585 | *****************************************/ |
||
586 | |||
/**
 * Checks whether an input URL is allowed to be indexed.
 *
 * The URL is normalised first (a trailing "//" becomes "/", a "#fragment" is
 * cut off). It is accepted when it contains no "../", starts with the base
 * URL and is not yet present in the URL log.
 *
 * @param string $url URL string to check
 * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
 * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
 * @return string|false Returns the normalised URL if OK, otherwise FALSE
 */
public function checkUrl($url, $urlLog, $baseUrl)
{
    $withoutDoubleSlash = preg_replace('/\\/\\/$/', '/', $url);
    $normalizedUrl = explode('#', $withoutDoubleSlash)[0];

    // Never follow links that try to climb up the path.
    if (strpos($normalizedUrl, '../') !== false) {
        return false;
    }
    // Only URLs below the base URL belong to this indexing process.
    if (! GeneralUtility::isFirstPartOfStr($normalizedUrl, $baseUrl)) {
        return false;
    }
    // Skip what has been logged already.
    if (in_array($normalizedUrl, $urlLog, true)) {
        return false;
    }

    return $normalizedUrl;
}
||
607 | |||
/**
 * Indexing External URL.
 *
 * Indexes the given URL and returns all hyperlinks found in its content as
 * absolute URLs. Links without a scheme are resolved against the host of $url
 * (when they start with "/") or against the base href of the document; if the
 * document declares no base href, the directory part of $url is used.
 *
 * @param string $url URL, http://....
 * @param int $pageId Page id to relate indexing to.
 * @param array $rl Rootline array to relate indexing to
 * @param int $cfgUid Configuration UID
 * @param int $setId Set ID value
 * @return array URLs found on this page
 */
public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Index external URL:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
    $indexerObj->hash['phash'] = -1;
    $indexerObj->indexExternalUrl($url);

    // parse_url() returns false for seriously malformed URLs and omits absent components.
    $urlParts = parse_url($url);
    if (! is_array($urlParts)) {
        $urlParts = [];
    }
    $baseAbsoluteHref = ($urlParts['scheme'] ?? '') . '://' . ($urlParts['host'] ?? '');
    if (isset($urlParts['port'])) {
        // Keep a non-default port, otherwise resolved links point to a different origin.
        $baseAbsoluteHref .= ':' . $urlParts['port'];
    }

    $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
    if (! $baseHref) {
        // Extract base href from current URL: everything up to the last "/" of the path.
        // A URL like "http://example.com" has no path at all.
        $path = (string) ($urlParts['path'] ?? '');
        $lastSlash = strrpos($path, '/');
        $baseHref = $baseAbsoluteHref . ($lastSlash === false ? '' : substr($path, 0, $lastSlash));
    }
    $baseHref = rtrim($baseHref, '/');

    // Get URLs on this page:
    $subUrls = [];
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
    // Traverse links:
    foreach ($list as $linkInfo) {
        // Decode entities:
        $subUrl = htmlspecialchars_decode((string) ($linkInfo['href'] ?? ''));
        $linkParts = parse_url($subUrl);
        // No scheme (or unparsable): treat as a link relative to the current document.
        if (! is_array($linkParts) || empty($linkParts['scheme'])) {
            $relativeUrl = (string) GeneralUtility::resolveBackPath($subUrl);
            if (strpos($relativeUrl, '/') === 0) {
                $subUrl = $baseAbsoluteHref . $relativeUrl;
            } else {
                $subUrl = $baseHref . '/' . $relativeUrl;
            }
        }
        $subUrls[] = $subUrl;
    }

    return $subUrls;
}
||
656 | |||
/**
 * Indexing Single Record.
 *
 * The first field of the configured field list is used as title, all further
 * fields are concatenated (separated by blanks) as content. Tags are stripped
 * from both before the record is indexed as a TYPO3 page.
 *
 * @param array $r Record to index
 * @param array $cfgRec Configuration Record
 * @param array|null $rl Rootline array to relate indexing to; looked up from the configuration's pid when not given
 */
public function indexSingleRecord($r, $cfgRec, $rl = null): void
{
    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);

    // Not every table defines languageField / tstamp / crdate in its TCA ctrl section.
    $ctrl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] ?? [];
    $languageField = $ctrl['languageField'] ?? '';
    $sys_language_uid = $languageField ? ($r[$languageField] ?? 0) : 0;
    $modificationTime = isset($ctrl['tstamp']) ? ($r[$ctrl['tstamp']] ?? 0) : 0;
    $creationTime = isset($ctrl['crdate']) ? ($r[$ctrl['crdate']] ?? 0) : 0;

    // (Re)-Indexing a row from a table:
    $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
    // The uid may arrive as integer from the database driver; str_replace() needs a string under strict_types.
    parse_str(str_replace('###UID###', (string) $r['uid'], (string) $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool) $cfgRec['chashcalc']);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = true;

    // Title stays empty when the configuration has no field list at all.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        $fieldValue = (string) ($r[$v] ?? '');
        if (! $k) {
            $theTitle = $fieldValue;
        } else {
            $theContent .= $fieldValue . ' ';
        }
    }

    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
    // A blank is inserted before every "<" so words separated only by tags do not get glued together.
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags(str_replace('<', ' <', $theTitle)),
        '',
        '',
        strip_tags(str_replace('<', ' <', $theContent)),
        'utf-8',
        $modificationTime,
        $creationTime,
        $r['uid']
    );
}
||
688 | |||
/**
 * Get rootline for closest TypoScript template root.
 * Algorithm same as used in Web > Template, Object browser
 *
 * The root line of the page is run through the template service; the uid of
 * every entry of the resulting root line is returned under its original key.
 * If the root line cannot be resolved, an empty array is returned.
 *
 * @param int $id The page id to traverse rootline back from
 * @return array Array where the root lines uid values are found.
 */
public function getUidRootLineForClosestTemplate($id)
{
    $uidsByLevel = [];

    try {
        $pageRootLine = GeneralUtility::makeInstance(RootlineUtility::class, $id)->get();

        // Let the template service work out the template hierarchy for this root line.
        $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
        $templateService->runThroughTemplates($pageRootLine);

        foreach ($templateService->rootLine as $level => $pageData) {
            $uidsByLevel[$level] = $pageData['uid'];
        }
    } catch (RootLineException $exception) {
        // Intentionally ignored: whatever has been collected so far (normally nothing) is returned.
    }

    return $uidsByLevel;
}
||
714 | |||
/**
 * Generate the unix time stamp for next visit.
 *
 * Starting from a midnight timestamp plus the configured offset (0..86400
 * seconds), the result is that start time advanced by as many whole frequency
 * intervals as are needed to reach or pass the current execution time.
 *
 * @param array $cfgRec Index configuration record
 *
 * @return float|int The next time stamp
 */
public function generateNextIndexingTime($cfgRec)
{
    $now = $GLOBALS['EXEC_TIME'];

    // Midnight used as anchor for the offset; which midnight is decided by getMidnightTimestamp().
    $anchor = $this->getMidnightTimestamp($cfgRec)
        + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);

    // Length of one interval in seconds, at least 1.
    $interval = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);

    // Number of whole intervals between the anchor and now, rounded up.
    $intervalCount = (int) ceil(($now - $anchor) / $interval);

    return $anchor + $intervalCount * $interval;
}
||
735 | |||
/**
 * Tells whether $url begins with one of the entries of the $url_deny list.
 *
 * @param string $url URL to test
 * @param string $url_deny Deny list with one URL prefix per line; if any of them is the first part of $url, the URL is denied (i.e. must not be descended into)
 * @return bool TRUE if a deny entry matches (hence, do not index!), FALSE otherwise or if the list is empty
 */
public function checkDeniedSuburls($url, $url_deny)
{
    // Nothing configured, nothing can be denied.
    if (! $url_deny) {
        return false;
    }

    $deniedPrefixes = GeneralUtility::trimExplode(LF, $url_deny, true);
    foreach ($deniedPrefixes as $deniedPrefix) {
        if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
            return true;
        }
    }

    return false;
}
||
755 | |||
/**
 * Registers a queue entry for this hook via the parent crawler object.
 *
 * @param array $cfgRec Configuration record; "uid", "set_id" and "pid" are read here
 * @param string $title Title/URL stored as "url" of the queue entry
 */
public function addQueueEntryForHook($cfgRec, $title): void
{
    $configurationUid = $cfgRec['uid'];

    $this->pObj->addQueueEntry_callBack(
        $cfgRec['set_id'],
        [
            // This must ALWAYS be the cfgRec uid!
            'indexConfigUid' => $configurationUid,
            'url' => $title,
            'procInstructions' => ['[Index Cfg UID#' . $configurationUid . ']'],
        ],
        $this->callBack,
        $cfgRec['pid']
    );
}
||
772 | |||
/**
 * Removes all indexed search data that belongs to the given page.
 *
 * The phash values are looked up in "index_section" by page id; every row carrying one of
 * these phash values is then deleted from the indexed search tables listed below.
 *
 * @param int $id Uid of the page whose phash rows are to be deleted
 */
public function deleteFromIndex($id): void
{
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Step 1: collect the phash values currently stored for the page.
    $lookupQuery = $connectionPool->getQueryBuilderForTable('index_section');
    $lookupQuery
        ->select('phash')
        ->from('index_section');
    $pageCondition = $lookupQuery->expr()->eq(
        'page_id',
        $lookupQuery->createNamedParameter($id, \PDO::PARAM_INT)
    );
    $phashRows = $lookupQuery
        ->where($pageCondition)
        ->execute()
        ->fetchAll();

    // Nothing indexed for this page, so there is nothing to clean up.
    if (empty($phashRows)) {
        return;
    }

    // Step 2: delete the rows with these phash values from every index table.
    $phashList = array_column($phashRows, 'phash');
    $indexTables = [
        'index_debug',
        'index_fulltext',
        'index_grlist',
        'index_phash',
        'index_rel',
        'index_section',
    ];
    foreach ($indexTables as $indexTable) {
        $deleteQuery = $connectionPool->getQueryBuilderForTable($indexTable);
        $deleteQuery->delete($indexTable);
        $phashParameter = $deleteQuery->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY);
        $deleteQuery
            ->where($deleteQuery->expr()->in('phash', $phashParameter))
            ->execute();
    }
}
||
823 | |||
824 | /************************* |
||
825 | * |
||
826 | * Hook functions for DataHandler (indexing of records) |
||
827 | * |
||
828 | *************************/ |
||
829 | |||
/**
 * DataHandler hook function called before a command is processed; cleans up the
 * search index when a page is about to be deleted.
 *
 * @param string $command DataHandler command
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 */
public function processCmdmap_preProcess($command, $table, $id): void
{
    // Only the deletion of pages is of interest here.
    if ($table !== 'pages' || $command !== 'delete') {
        return;
    }

    // Clean up the index
    $this->deleteFromIndex($id);
}
||
844 | |||
/**
 * DataHandler hook function for on-the-fly indexing of database records.
 *
 * If a page was updated so that it is hidden or excluded from search, the index data of that
 * page is deleted. Afterwards the written record is passed to indexSingleRecord() for every
 * matching indexing configuration of type "record" that has "records_indexonchange" enabled.
 *
 * @param string $status Status "new" or "update"
 * @param string $table Table name
 * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
 * @param array $fieldArray Field array of updated fields in the operation
 * @param DataHandler $pObj DataHandler calling object
 */
public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj): void
{
    // Check if any fields are actually updated:
    if (empty($fieldArray)) {
        return;
    }

    if ($status === 'new') {
        // Translate new ids. Without a substitution entry no real uid is known for the
        // placeholder (presumably the insert did not succeed), so there is nothing to index.
        if (! isset($pObj->substNEWwithIDs[$id])) {
            return;
        }
        $id = $pObj->substNEWwithIDs[$id];
    } elseif ($table === 'pages' && $status === 'update') {
        // The flag values are normalised to int before comparing, so that both 1 and "1"
        // count as enabled; a strict comparison against int 1 would miss string values.
        $isFlagEnabled = static function (string $field) use ($fieldArray): bool {
            return (int) ($fieldArray[$field] ?? 0) === 1;
        };
        if ($isFlagEnabled('hidden') || $isFlagEnabled('no_search')) {
            // If the page should be hidden or not indexed after update, delete index for this page
            $this->deleteFromIndex($id);
        }
    }

    // Get full record and if exists, search for indexing configurations:
    $currentRecord = BackendUtility::getRecord($table, $id);
    if (! is_array($currentRecord)) {
        return;
    }

    // Select all (not running) indexing configurations of type "record" (1) and
    // which points to this table and is located on the same page as the record
    // or pointing to the right source PID
    $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_config');
    $result = $queryBuilder->select('*')
        ->from('index_config')
        ->where(
            $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)),
            $queryBuilder->expr()->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)),
            $queryBuilder->expr()->eq(
                'table2index',
                $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR)
            ),
            $queryBuilder->expr()->orX(
                $queryBuilder->expr()->andX(
                    $queryBuilder->expr()->eq(
                        'alternative_source_pid',
                        $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
                    ),
                    $queryBuilder->expr()->eq(
                        'pid',
                        $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
                    )
                ),
                $queryBuilder->expr()->eq(
                    'alternative_source_pid',
                    $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
                )
            ),
            $queryBuilder->expr()->eq(
                'records_indexonchange',
                $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
            )
        )
        ->execute();

    while ($cfgRec = $result->fetch()) {
        $this->indexSingleRecord($currentRecord, $cfgRec);
    }
}
||
912 | |||
/**
 * Finds the midnight timestamp that serves as base for the indexing offset calculation.
 *
 * For frequencies of up to one day the result is today's midnight minus 24 hours. For longer
 * frequencies it is midnight of the day given by "timer_next_indexing", falling back to the
 * current execution time when that field is empty.
 *
 * @param array $cfgRec Index configuration record; "timer_frequency" and "timer_next_indexing" are read here
 *
 * @return false|float|int Midnight timestamp
 */
protected function getMidnightTimestamp(array $cfgRec)
{
    $secondsPerDay = 24 * 3600;

    if ($cfgRec['timer_frequency'] <= $secondsPerDay) {
        return mktime(0, 0, 0) - $secondsPerDay;
    }

    // The stored value may be a numeric string (depending on the database driver); date()
    // only accepts an int timestamp under strict_types, hence the explicit casts.
    $lastTime = (int) ($cfgRec['timer_next_indexing'] ?? 0) ?: (int) $GLOBALS['EXEC_TIME'];

    // Use the four-digit year: a two-digit year is mapped by mktime() to 1970-2069 only.
    return mktime(
        0,
        0,
        0,
        (int) date('n', $lastTime),
        (int) date('j', $lastTime),
        (int) date('Y', $lastTime)
    );
}
||
926 | } |
||
927 |