Passed
Push — typo3v9 ( 2404ee...b9b5fa )
by Tomas Norre
05:51
created

IndexedSearchCrawlerHook::checkDeniedSuburls()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 11
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
cc 4
eloc 6
c 0
b 0
f 0
nc 3
nop 2
dl 0
loc 11
ccs 0
cts 11
cp 0
crap 20
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Hooks;
6
7
/*
8
 * (c) 2020 AOE GmbH <[email protected]>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21
22
use TYPO3\CMS\Backend\Utility\BackendUtility;
23
use TYPO3\CMS\Core\Core\Environment;
24
use TYPO3\CMS\Core\Database\Connection;
25
use TYPO3\CMS\Core\Database\ConnectionPool;
26
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
27
use TYPO3\CMS\Core\DataHandling\DataHandler;
28
use TYPO3\CMS\Core\Exception\Page\RootLineException;
29
use TYPO3\CMS\Core\Utility\GeneralUtility;
30
use TYPO3\CMS\Core\Utility\MathUtility;
31
use TYPO3\CMS\Core\Utility\RootlineUtility;
32
33
/**
34
 * Crawler hook for indexed search. Works with the "crawler" extension
35
 * @internal this is a TYPO3-internal hook implementation and not part of TYPO3's Core API.
36
 */
37
class IndexedSearchCrawlerHook
38
{
39
    /**
     * Number of seconds between the scheduled execution times of consecutive
     * queue entries created by this hook (multiplied with $instanceCounter).
     *
     * @var int
     */
    public $secondsPerExternalUrl = 3;

    /**
     * Counts up for every sub-item (file, URL or sub page) considered for queueing;
     * used as multiplier for $secondsPerExternalUrl to stagger execution times.
     *
     * @var int
     */
    public $instanceCounter = 0;

    /**
     * Class name handed to the crawler as callback for every queue entry this hook creates.
     *
     * @var string
     */
    public $callBack = self::class;

    /**
     * Parent crawler object. Assigned in crawler_execute() right before a custom
     * hook's indexOperation() is called, so the hook can reach the crawler through
     * this instance. Declared explicitly to avoid creating a dynamic property.
     *
     * @var object|null
     */
    public $pObj;
57
58
    /**
     * Makes sure a language service object is available in $GLOBALS['LANG'].
     *
     * If none exists yet, one is created and initialised with the language stored
     * in the backend user's configuration. When no backend user (or no stored
     * language) is available an empty language key is used instead of failing.
     */
    public function __construct()
    {
        // Nothing to do when a language service has already been set up
        if (is_object($GLOBALS['LANG'] ?? null)) {
            return;
        }

        // To make sure the backend charset is available:
        $GLOBALS['LANG'] = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Localization\LanguageService::class);
        // "??" also covers a missing $GLOBALS['BE_USER'] (e.g. when no backend user is set up)
        $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang'] ?? '');
    }
69
70
    /**
     * Initialization of crawler hook.
     *
     * Called for each instance of the crawler. Selects every indexing configuration
     * whose "timer_next_indexing" lies before the current execution time and which is
     * not running yet (set_id = 0), marks it as running by storing a fresh set ID and
     * the next indexing time, and adds the initial queue entry for it. Afterwards
     * finished configurations are cleaned up.
     *
     * @param object $pObj Parent object (tx_crawler lib)
     */
    public function crawler_init(&$pObj): void
    {
        // Select all indexing configuration which are waiting to be activated:
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
        $queryBuilder = $connection->createQueryBuilder();

        $result = $queryBuilder->select('*')
            ->from('index_config')
            ->where(
                $queryBuilder->expr()->lt(
                    'timer_next_indexing',
                    $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
            )
            ->execute();

        // For each configuration, check if it should be executed and if so, start:
        while ($cfgRec = $result->fetch()) {
            // Generate a unique set-ID:
            $setId = GeneralUtility::md5int(microtime());
            // Get next time:
            $nextTime = $this->generateNextIndexingTime($cfgRec);
            // Start process by updating index-config record:
            $connection->update(
                'index_config',
                [
                    'set_id' => $setId,
                    'timer_next_indexing' => $nextTime,
                    'session_data' => '',
                ],
                [
                    'uid' => (int)$cfgRec['uid'],
                ]
            );

            $configUid = $cfgRec['uid'];
            $procInstructions = ['[Index Cfg UID#' . $configUid . ']'];
            // Stays null when the configuration type does not produce a queue entry
            $params = null;

            // Based on configuration type:
            switch ($cfgRec['type']) {
                case 1:
                    // RECORDS:
                    $params = [
                        'indexConfigUid' => $configUid,
                        'procInstructions' => $procInstructions,
                        'url' => 'Records (start)',
                    ];
                    break;
                case 2:
                    // FILES:
                    $params = [
                        'indexConfigUid' => $configUid,
                        'procInstructions' => $procInstructions,
                        'url' => $cfgRec['filepath'],
                        'depth' => 0,
                    ];
                    break;
                case 3:
                    // External URL:
                    $params = [
                        'indexConfigUid' => $configUid,
                        'procInstructions' => $procInstructions,
                        'url' => $cfgRec['externalUrl'],
                        'depth' => 0,
                    ];
                    break;
                case 4:
                    // Page tree (the "url" carries the uid of the root page):
                    $params = [
                        'indexConfigUid' => $configUid,
                        'procInstructions' => $procInstructions,
                        'url' => (int)$cfgRec['alternative_source_pid'],
                        'depth' => 0,
                    ];
                    break;
                case 5:
                    // Meta configuration, nothing to do:
                    // NOOP
                    break;
                default:
                    // Custom type: only handled when a hook class is registered for it
                    $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                    if ($hookClass) {
                        $hookObj = GeneralUtility::makeInstance($hookClass);
                        // Defined explicitly (and reset per configuration) so the hook never
                        // receives an undefined variable or a value left over from a previous record
                        $message = null;
                        $params = [
                            'indexConfigUid' => $configUid,
                            'procInstructions' => ['[Index Cfg UID#' . $configUid . '/CUSTOM]'],
                            'url' => $hookObj->initMessage($message),
                        ];
                    }
            }

            if ($params !== null) {
                $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
            }
        }
        // Finally, look up all old index configurations which are finished and needs to be reset and done.
        $this->cleanUpOldRunningConfigurations();
    }
189
190
    /**
     * Call back function for execution of a log element.
     *
     * Loads the indexing configuration referenced by $params['indexConfigUid'],
     * dispatches to the handler matching the configuration type and finally writes
     * the (possibly modified) session data back to the configuration record.
     * Nothing is executed when the uid is missing or the record cannot be found.
     *
     * @param array $params Params from log element. Must contain $params['indexConfigUid']
     * @param object $pObj Parent object (tx_crawler lib)
     * @return array Result array with the parameters under the key "log"
     */
    public function crawler_execute($params, &$pObj)
    {
        // Indexer configuration ID must exist:
        if (empty($params['indexConfigUid'])) {
            return ['log' => $params];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_config');
        $queryBuilder->getRestrictions()->removeAll();
        // Load the indexing configuration record:
        $cfgRec = $queryBuilder
            ->select('*')
            ->from('index_config')
            ->where(
                $queryBuilder->expr()->eq(
                    'uid',
                    $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetch();
        if (!is_array($cfgRec)) {
            return ['log' => $params];
        }

        // Unpack session data. The value is what this method stored via serialize() below,
        // or an empty string right after crawler_init(); in the latter case the result is
        // FALSE and the type handlers initialise it. The cast guards against a NULL column.
        // NOTE(review): consider restricting "allowed_classes" if hooks never store objects here.
        $session_data = unserialize((string)$cfgRec['session_data']);

        // Select which type:
        switch ($cfgRec['type']) {
            case 1:
                // Records:
                $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
                break;
            case 2:
                // Files
                $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
                break;
            case 3:
                // External URL:
                $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
                break;
            case 4:
                // Page tree:
                $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
                break;
            case 5:
                // Meta
                // NOOP (should never enter here!)
                break;
            default:
                // Custom type: only handled when a hook class is registered for it
                $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                if ($hookClass) {
                    $hookObj = GeneralUtility::makeInstance($hookClass);
                    // For addQueueEntryForHook()
                    $this->pObj = $pObj;
                    $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
                }
        }

        // Save process data which might be modified:
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_config')
            ->update(
                'index_config',
                ['session_data' => serialize($session_data)],
                ['uid' => (int)$cfgRec['uid']]
            );

        return ['log' => $params];
    }
261
262
    /**
     * Indexing records from a table.
     *
     * Indexes one batch of records of the configured table, ordered by uid and
     * starting after the uid remembered in the session data. When the batch
     * contained at least one record, a new queue entry is added so the next batch
     * gets processed; an empty batch ends the chain.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array|false $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue (not used by this type).
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj): void
    {
        // Only tables known to TCA can be indexed
        if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
            return;
        }

        // Init session data array if not already:
        if (!is_array($session_data)) {
            $session_data = [
                'uid' => 0,
            ];
        }

        // Init:
        $pid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
        $numberOfRecords = $cfgRec['recordsbatch']
            ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
            : 100;

        // Get root line:
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

        // Select the next batch of records:
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($cfgRec['table2index']);
        $result = $queryBuilder->select('*')
            ->from($cfgRec['table2index'])
            ->where(
                $queryBuilder->expr()->eq(
                    'pid',
                    $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->gt(
                    'uid',
                    $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
                )
            )
            ->setMaxResults($numberOfRecords)
            ->orderBy('uid')
            ->execute();

        // Traverse:
        $processedRecords = 0;
        while ($row = $result->fetch()) {
            // Index single record:
            $this->indexSingleRecord($row, $cfgRec, $rootLine);
            // Update the UID we last processed:
            $session_data['uid'] = $row['uid'];
            $processedRecords++;
        }

        // Finally, set entry for next indexing of batch of records.
        // The number of rows handled in this batch is used instead of a second COUNT
        // query on the same builder: that query kept the original uid boundary plus the
        // LIMIT / ORDER BY clauses, so it could only tell whether this batch had rows.
        if ($processedRecords > 0) {
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
            ];
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
        }
    }
328
329
    /**
     * Indexing files from fileadmin.
     *
     * $params['url'] is either a file, which is indexed right away, or a directory,
     * whose files (and, while the configured depth is not reached, sub directories)
     * are added to the crawler queue as new entries. Paths outside the allowed
     * absolute paths are ignored.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array|false $session_data Session data for the indexing session (not used by this type).
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj): void
    {
        // Prepare path, making it absolute and checking:
        $readpath = $params['url'];
        if (!GeneralUtility::isAbsPath($readpath)) {
            $readpath = GeneralUtility::getFileAbsFileName($readpath);
        }
        if (!GeneralUtility::isAllowedAbsPath($readpath)) {
            return;
        }

        // Error suppression kept from the original implementation (best-effort file system checks)
        if (@is_file($readpath)) {
            // If file, index it!
            // Get root line (need to provide this when indexing external files)
            $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
            // (Re)-Indexing file on page.
            $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
            $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
            $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
            // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
            $indexerObj->hash['phash'] = -1;
            // Index document:
            $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
            return;
        }

        if (!@is_dir($readpath)) {
            return;
        }

        // If dir, read content and create new pending items for log:
        // Select files and directories in path:
        $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
        $fileArr = [];
        $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
        $directoryList = GeneralUtility::get_dirs($readpath);
        if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
            // Make sure exactly one slash separates the directory and the sub directory name,
            // also when the configured path was given without a trailing slash
            $directoryPrefix = rtrim($readpath, '/') . '/';
            foreach ($directoryList as $subdir) {
                if ((string)$subdir !== '') {
                    $files[] = $directoryPrefix . $subdir . '/';
                }
            }
        }
        $files = GeneralUtility::removePrefixPathFromList($files, Environment::getPublicPath() . '/');

        // traverse the items and create log entries:
        foreach ($files as $path) {
            $this->instanceCounter++;
            // Never queue the path that is being processed right now again
            if ($path === $params['url']) {
                continue;
            }
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $path,
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                'depth' => $params['depth'] + 1,
            ];
            // Stagger the execution times of the new entries
            $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $scheduledTime);
        }
    }
389
390
    /**
     * Indexing External URLs.
     *
     * Indexes the URL given in $params['url'] and, as long as the configured depth
     * is not reached, queues every link found on that page which passes checkUrl()
     * and is not rejected by checkDeniedSuburls(). Queued URLs are remembered in
     * $session_data['urlLog'].
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array|false $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj): void
    {
        // First call of this indexing session: start the URL log with the current URL
        if (!is_array($session_data)) {
            $session_data = ['urlLog' => [$params['url']]];
        }

        // Index the URL and collect the links found on it:
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

        // Do not go deeper than configured:
        if (!($params['depth'] < $cfgRec['depth'])) {
            return;
        }

        // Add more elements to log now:
        foreach ($foundUrls as $candidate) {
            $url = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
            if (!$url) {
                continue;
            }
            if ($this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
                continue;
            }

            $this->instanceCounter++;
            $session_data['urlLog'][] = $url;

            $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
            $pObj->addQueueEntry_callBack(
                $cfgRec['set_id'],
                [
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $url,
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'depth' => $params['depth'] + 1,
                ],
                $this->callBack,
                $cfgRec['pid'],
                $scheduledTime
            );
        }
    }
429
430
    /**
     * Page tree indexing type.
     *
     * Submits the URLs of the page whose uid is given in $params['url'] to the
     * crawler and, as long as the configured depth is not reached, adds a queue
     * entry for every non-deleted sub page.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array|false $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj): void
    {
        // Init session data array if not already (it is FALSE on the first call of a session),
        // so appending to the URL log below does not rely on implicit conversion to an array
        if (!is_array($session_data)) {
            $session_data = ['urlLog' => []];
        }

        // Base page uid:
        $pageUid = (int)$params['url'];
        // Get array of URLs from page; the record is not available when the page is gone
        $pageRow = BackendUtility::getRecord('pages', $pageUid);
        if (is_array($pageRow)) {
            $res = $pObj->getUrlsForPageRow($pageRow);
            // Registry for duplicates
            $duplicateTrack = [];
            // Dummy.
            $downloadUrls = [];
            // Submit URLs:
            if (!empty($res)) {
                foreach ($res as $vv) {
                    $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
                }
            }
        }

        // Add subpages to log now, unless the configured depth is reached:
        if (!($params['depth'] < $cfgRec['depth'])) {
            return;
        }

        // Subpages selected
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
        $result = $queryBuilder->select('uid', 'title')
            ->from('pages')
            ->where(
                $queryBuilder->expr()->eq(
                    'pid',
                    $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
                )
            )
            ->execute();

        // Traverse subpages and add to queue:
        while ($row = $result->fetch()) {
            $this->instanceCounter++;
            $session_data['urlLog'][] = 'pages:' . $row['uid'] . ': ' . $row['title'];
            // Parameters (the "url" carries the uid of the sub page):
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => $row['uid'],
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                'depth' => $params['depth'] + 1,
            ];
            $pObj->addQueueEntry_callBack(
                $cfgRec['set_id'],
                $nparams,
                $this->callBack,
                $cfgRec['pid'],
                $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
            );
        }
    }
493
494
    /**
     * Look up all old index configurations which are finished and needs to be reset and done.
     *
     * A running configuration (set_id != 0) counts as finished when the crawler queue
     * holds no entry of its set with exec_time = 0. For such a configuration the index
     * rows that belong to it but stem from another set are removed from all index
     * tables, and the configuration is reset (set_id = 0, empty session data).
     */
    public function cleanUpOldRunningConfigurations(): void
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        // List of tables that store information related to the phash value
        $tablesToClean = [
            'index_phash',
            'index_rel',
            'index_section',
            'index_grlist',
            'index_fulltext',
            'index_debug',
        ];

        $configQueryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
        $configQueryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        // Lookup running index configurations:
        $runningIndexingConfigurations = $configQueryBuilder->select('*')
            ->from('index_config')
            ->where($configQueryBuilder->expr()->neq('set_id', $configQueryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
            ->execute()
            ->fetchAll();

        // For each running configuration, look up how many log entries there are which are scheduled
        // for execution and if none, clear the "set_id" (means; Processing was DONE)
        foreach ($runningIndexingConfigurations as $cfgRec) {
            // Look for ended processes:
            $queuedItems = $connectionPool->getConnectionForTable('tx_crawler_queue')
                ->count(
                    '*',
                    'tx_crawler_queue',
                    [
                        'set_id' => (int)$cfgRec['set_id'],
                        'exec_time' => 0,
                    ]
                );
            if ($queuedItems) {
                // Still pending entries, the configuration is not finished yet
                continue;
            }

            // Lookup old phash rows:
            $phashQueryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
            $oldPhashRows = $phashQueryBuilder
                ->select('phash')
                ->from('index_phash')
                ->where(
                    $phashQueryBuilder->expr()->eq(
                        'freeIndexUid',
                        $phashQueryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                    ),
                    $phashQueryBuilder->expr()->neq(
                        'freeIndexSetId',
                        $phashQueryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();
            $oldPhashValues = array_column($oldPhashRows, 'phash');

            // Removing old registrations for all tables; skipped entirely when there is
            // nothing to remove, which saves one DELETE with an empty list per table
            if ($oldPhashValues !== []) {
                foreach ($tablesToClean as $table) {
                    $deleteQueryBuilder = $connectionPool->getQueryBuilderForTable($table);
                    $deleteQueryBuilder->delete($table)
                        ->where(
                            $deleteQueryBuilder->expr()->in(
                                'phash',
                                $deleteQueryBuilder->createNamedParameter(
                                    $oldPhashValues,
                                    Connection::PARAM_INT_ARRAY
                                )
                            )
                        )
                        ->execute();
                }
            }

            // End process by updating index-config record:
            $connectionPool->getConnectionForTable('index_config')
                ->update(
                    'index_config',
                    [
                        'set_id' => 0,
                        'session_data' => '',
                    ],
                    ['uid' => (int)$cfgRec['uid']]
                );
        }
    }
582
583
    /*****************************************
584
     *
585
     * Helper functions
586
     *
587
     *****************************************/
588
    /**
     * Check if an input URL is allowed to be indexed.
     *
     * The URL is cleaned first: a trailing double slash is reduced to a single one and
     * everything from the first "#" on is cut off. The cleaned URL is accepted when it
     * contains no "../", is "inside" the base URL and is not present in the URL log yet.
     *
     * @param string $url URL string to check
     * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
     * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
     * @return string|null Returns the cleaned URL if OK, otherwise NULL
     */
    public function checkUrl($url, $urlLog, $baseUrl)
    {
        $cleanedUrl = preg_replace('/\\/\\/$/', '/', $url);
        [$cleanedUrl] = explode('#', $cleanedUrl);

        // No back paths allowed
        if (strstr($cleanedUrl, '../')) {
            return null;
        }
        // Must be located below the base URL
        if (!GeneralUtility::isFirstPartOfStr($cleanedUrl, $baseUrl)) {
            return null;
        }
        // Must not be known already
        if (in_array($cleanedUrl, $urlLog)) {
            return null;
        }

        return $cleanedUrl;
    }
608
609
    /**
     * Indexing External URL.
     *
     * Indexes the given URL and returns all hyperlinks found in the fetched content.
     * Links without a scheme are made absolute: links starting with "/" are appended
     * to scheme and host of $url, all others to the base href of the document (or,
     * if the document defines none, to the directory part of $url).
     *
     * @param string $url URL, http://....
     * @param int $pageId Page id to relate indexing to.
     * @param array $rl Rootline array to relate indexing to
     * @param int $cfgUid Configuration UID
     * @param int $setId Set ID value
     * @return array URLs found on this page
     */
    public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
    {
        // Index external URL:
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
        // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        $indexerObj->indexExternalUrl($url);

        // parse_url() returns FALSE for seriously malformed URLs and leaves out components
        // which are not present, e.g. "path" for "https://example.com"
        $url_qParts = parse_url($url);
        if (!is_array($url_qParts)) {
            $url_qParts = [];
        }
        $baseAbsoluteHref = ($url_qParts['scheme'] ?? '') . '://' . ($url_qParts['host'] ?? '');
        $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
        if (!$baseHref) {
            // Extract base href from current URL: everything up to the last slash of the path
            $baseHref = $baseAbsoluteHref;
            $path = (string)($url_qParts['path'] ?? '');
            $lastSlashPosition = strrpos($path, '/');
            if ($lastSlashPosition !== false) {
                $baseHref .= substr($path, 0, $lastSlashPosition);
            }
        }
        $baseHref = rtrim($baseHref, '/');

        // Get URLs on this page:
        $subUrls = [];
        $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
        // Traverse links:
        foreach ($list as $linkInfo) {
            // Decode entities:
            $subUrl = htmlspecialchars_decode($linkInfo['href']);
            $qParts = parse_url($subUrl);
            // No scheme (or unparsable): treat the link as relative and make it absolute
            if (empty($qParts['scheme'])) {
                $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
                // "??" keeps an empty link from triggering an invalid string offset access
                if (($relativeUrl[0] ?? '') === '/') {
                    $subUrl = $baseAbsoluteHref . $relativeUrl;
                } else {
                    $subUrl = $baseHref . '/' . $relativeUrl;
                }
            }
            $subUrls[] = $subUrl;
        }
        return $subUrls;
    }
657
658
    /**
659
     * Indexing Single Record
660
     *
661
     * @param array $r Record to index
662
     * @param array $cfgRec Configuration Record
663
     * @param array $rl Rootline array to relate indexing to
664
     */
665
    public function indexSingleRecord($r, $cfgRec, $rl = null): void
    {
        // Init: fall back to the rootline of the configuration record's page when none was passed in.
        $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
        // Not every table declares a language field in its TCA "ctrl" section.
        $languageField = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['languageField'] ?? '';
        $sys_language_uid = $languageField ? $r[$languageField] : 0;

        // (Re)-Indexing a row from a table:
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        parse_str(str_replace('###UID###', (string)$r['uid'], $cfgRec['get_params']), $GETparams);
        $indexerObj->backend_initIndexer(
            $cfgRec['pid'],
            0,
            $sys_language_uid,
            '',
            $rl,
            $GETparams,
            (bool)$cfgRec['chashcalc']
        );
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        $indexerObj->forceIndexing = true;

        // The first configured field supplies the title, all remaining fields are
        // concatenated into the content. Both start out empty so that an empty
        // field list does not leave $theTitle undefined.
        $theTitle = '';
        $theContent = '';
        foreach ($fieldList as $k => $v) {
            if (!$k) {
                $theTitle = $r[$v];
            } else {
                $theContent .= $r[$v] . ' ';
            }
        }

        // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid()).
        // A space is inserted before every "<" so that stripping the tags does not glue words together.
        $indexerObj->backend_indexAsTYPO3Page(
            strip_tags(str_replace('<', ' <', $theTitle)),
            '',
            '',
            strip_tags(str_replace('<', ' <', $theContent)),
            'utf-8',
            $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['tstamp']],
            $r[$GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']['crdate']],
            $r['uid']
        );
    }
689
690
    /**
691
     * Get rootline for closest TypoScript template root.
692
     * Algorithm same as used in Web > Template, Object browser
693
     *
694
     * @param int $id The page id to traverse rootline back from
695
     * @return array Array where the root lines uid values are found.
696
     */
697
    public function getUidRootLineForClosestTemplate($id)
    {
        $uidsByLevel = [];

        try {
            // Fetch the rootline of the given page.
            $pageRootLine = GeneralUtility::makeInstance(RootlineUtility::class, $id)->get();

            // Let the template service run through the templates along that rootline;
            // afterwards it exposes its own rootLine property.
            $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
            $templateService->runThroughTemplates($pageRootLine);

            // Keep the keys of the service's rootline, but store only the page uids.
            foreach ($templateService->rootLine as $level => $pageRecord) {
                $uidsByLevel[$level] = $pageRecord['uid'];
            }
        } catch (RootLineException $exception) {
            // Deliberately ignored: whatever was collected so far (normally nothing) is returned.
        }

        return $uidsByLevel;
    }
715
716
    /**
717
     * Generate the unix time stamp for next visit.
718
     *
719
     * @param array $cfgRec Index configuration record
720
     * @return int The next time stamp
721
     */
722
    public function generateNextIndexingTime($cfgRec)
    {
        $currentTime = $GLOBALS['EXEC_TIME'];

        // Find a midnight time to use for the offset calculation. For frequencies of
        // at most one day it does not matter which day is used, so yesterday's midnight
        // is taken. For longer frequencies the day of the currently stored next-indexing
        // time is respected regardless of when the script runs (so that e.g. the weekday
        // of a "Weekly" setting is kept).
        if ($cfgRec['timer_frequency'] <= 24 * 3600) {
            $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
        } else {
            // Database values may arrive as strings; date() requires an int timestamp under strict_types.
            $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
            // date() returns strings while mktime() requires ints, so cast explicitly.
            // The four-digit year avoids mktime()'s two-digit-year mapping.
            $aMidNight = mktime(
                0,
                0,
                0,
                (int)date('n', $lastTime),
                (int)date('j', $lastTime),
                (int)date('Y', $lastTime)
            );
        }

        // Find last offset time plus frequency in seconds:
        $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
        $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);

        // Number of whole frequency blocks from the offset until the current time is reached.
        // ceil() returns a float, so cast to keep the documented int return value.
        $frequencyBlocksUntilNextTime = (int)ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

        // Next time = offset + number of blocks multiplied by the block length in seconds.
        return $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;
    }
740
741
    /**
742
     * Checks if $url has any of the URLs in the $url_deny "list" in it and if so, returns TRUE.
743
     *
744
     * @param string $url URL to test
745
     * @param string $url_deny String where URLs are separated by line-breaks; If any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
746
     * @return bool TRUE if there is a matching URL (hence, do not index!)
747
     */
748
    public function checkDeniedSuburls($url, $url_deny)
    {
        // Nothing configured (empty after trimming): no URL is denied.
        if (!trim($url_deny)) {
            return false;
        }

        // One denied prefix per line; empty lines are dropped by trimExplode().
        foreach (GeneralUtility::trimExplode(LF, $url_deny, true) as $deniedPrefix) {
            if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
                return true;
            }
        }

        return false;
    }
760
761
    /**
762
     * Adding entry in queue for Hook
763
     *
764
     * @param array $cfgRec Configuration record
765
     * @param string $title Title/URL
766
     */
767
    public function addQueueEntryForHook($cfgRec, $title): void
    {
        // The queue entry must ALWAYS carry the uid of the configuration record itself.
        $configurationUid = $cfgRec['uid'];

        $queueParameters = [];
        $queueParameters['indexConfigUid'] = $configurationUid;
        $queueParameters['url'] = $title;
        $queueParameters['procInstructions'] = ['[Index Cfg UID#' . $configurationUid . ']'];

        // Hand the entry to the parent object, together with this hook's callback reference.
        $this->pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $queueParameters,
            $this->callBack,
            $cfgRec['pid']
        );
    }
777
778
    /**
779
     * Deletes all data stored by indexed search for a given page
780
     *
781
     * @param int $id Uid of the page to delete all pHash
782
     */
783
    public function deleteFromIndex($id): void
    {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Collect the phash values that index_section holds for this page.
        $lookup = $pool->getQueryBuilderForTable('index_section');
        $lookup->select('phash')->from('index_section');
        $lookup->where(
            $lookup->expr()->eq('page_id', $lookup->createNamedParameter($id, \PDO::PARAM_INT))
        );
        $phashRows = $lookup->execute()->fetchAll();

        // Nothing indexed for this page: nothing to delete.
        if (empty($phashRows)) {
            return;
        }

        // The list of hashes is the same for every table, so it is computed once.
        $phashList = array_column($phashRows, 'phash');
        $indexTables = [
            'index_debug',
            'index_fulltext',
            'index_grlist',
            'index_phash',
            'index_rel',
            'index_section',
        ];

        // Remove every row carrying one of those hashes from each index table.
        foreach ($indexTables as $indexTable) {
            $deletion = $pool->getQueryBuilderForTable($indexTable);
            $deletion->delete($indexTable);
            $deletion->where(
                $deletion->expr()->in(
                    'phash',
                    $deletion->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY)
                )
            );
            $deletion->execute();
        }
    }
828
829
    /*************************
830
     *
831
     * Hook functions for DataHandler (indexing of records)
832
     *
833
     *************************/
834
    /**
835
     * DataHandler hook function for on-the-fly indexing of database records
836
     *
837
     * @param string $command DataHandler command
838
     * @param string $table Table name
839
     * @param string $id Record ID. If new record, it's a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
840
     */
841
    public function processCmdmap_preProcess($command, $table, $id): void
    {
        // Only the deletion of a page is of interest here.
        if ($command !== 'delete' || $table !== 'pages') {
            return;
        }

        // Clean up the index for the page that is about to be deleted.
        $this->deleteFromIndex($id);
    }
848
849
    /**
850
     * DataHandler hook function for on-the-fly indexing of database records
851
     *
852
     * @param string $status Status "new" or "update"
853
     * @param string $table Table name
854
     * @param string $id Record ID. If new record, it's a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
855
     * @param array $fieldArray Field array of updated fields in the operation
856
     * @param DataHandler $pObj DataHandler calling object
857
     */
858
    public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj): void
    {
        // Without any updated fields there is nothing to do.
        if (empty($fieldArray)) {
            return;
        }

        if ($status === 'new') {
            // Translate the placeholder id of a new record into its real uid.
            $id = $pObj->substNEWwithIDs[$id];
        } elseif ($table === 'pages' && $status === 'update') {
            // A page that gets hidden or excluded from search by this update loses its index data.
            $becomesHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
            $becomesUnsearchable = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
            if ($becomesHidden || $becomesUnsearchable) {
                $this->deleteFromIndex($id);
            }
        }

        // Get the full record; without it there is nothing to index.
        $currentRecord = BackendUtility::getRecord($table, $id);
        if (!is_array($currentRecord)) {
            return;
        }

        // Select all (not running) indexing configurations of type "record" (1) which
        // point to this table, are flagged for indexing on change, and are either located
        // on the same page as the record or point to that page as alternative source PID.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_config');
        $queryBuilder->select('*')->from('index_config');

        $expr = $queryBuilder->expr();
        $recordPid = $currentRecord['pid'];

        $notRunning = $expr->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT));
        $isRecordType = $expr->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT));
        $matchesTable = $expr->eq('table2index', $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR));
        $onSamePage = $expr->andX(
            $expr->eq('alternative_source_pid', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)),
            $expr->eq('pid', $queryBuilder->createNamedParameter($recordPid, \PDO::PARAM_INT))
        );
        $viaAlternativeSource = $expr->eq(
            'alternative_source_pid',
            $queryBuilder->createNamedParameter($recordPid, \PDO::PARAM_INT)
        );
        $indexOnChange = $expr->eq(
            'records_indexonchange',
            $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
        );

        $queryBuilder->where(
            $notRunning,
            $isRecordType,
            $matchesTable,
            $expr->orX($onSamePage, $viaAlternativeSource),
            $indexOnChange
        );
        $result = $queryBuilder->execute();

        // Index the record once for every matching configuration.
        while ($cfgRec = $result->fetch()) {
            $this->indexSingleRecord($currentRecord, $cfgRec);
        }
    }
916
}
917