Completed
Push — master ( ee3b45...4fa3ae )
by
unknown
16:22
created

CrawlerHook::checkUrl()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 13
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 7
nc 4
nop 3
dl 0
loc 13
rs 10
c 0
b 0
f 0
1
<?php
2
namespace TYPO3\CMS\IndexedSearch\Hook;
3
4
/*
5
 * This file is part of the TYPO3 CMS project.
6
 *
7
 * It is free software; you can redistribute it and/or modify it under
8
 * the terms of the GNU General Public License, either version 2
9
 * of the License, or any later version.
10
 *
11
 * For the full copyright and license information, please read the
12
 * LICENSE.txt file that was distributed with this source code.
13
 *
14
 * The TYPO3 project - inspiring people to share!
15
 */
16
17
use TYPO3\CMS\Backend\Utility\BackendUtility;
18
use TYPO3\CMS\Core\Core\Environment;
19
use TYPO3\CMS\Core\Database\Connection;
20
use TYPO3\CMS\Core\Database\ConnectionPool;
21
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
22
use TYPO3\CMS\Core\DataHandling\DataHandler;
23
use TYPO3\CMS\Core\Exception\Page\RootLineException;
24
use TYPO3\CMS\Core\Utility\GeneralUtility;
25
use TYPO3\CMS\Core\Utility\MathUtility;
26
use TYPO3\CMS\Core\Utility\RootlineUtility;
27
use TYPO3\CMS\IndexedSearch\Indexer;
28
29
/**
30
 * Crawler hook for indexed search. Works with the "crawler" extension
31
 * @internal this is a TYPO3-internal hook implementation and not part of TYPO3's Core API.
32
 */
33
class CrawlerHook
34
{
35
    /**
36
     * Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
37
     *
38
     * @var int
39
     */
40
    public $secondsPerExternalUrl = 3;
41
42
    /**
43
     * Counts up for each added URL (type 3)
44
     *
45
     * @var int
46
     */
47
    public $instanceCounter = 0;
48
49
    /**
50
     * @var string
51
     */
52
    public $callBack = self::class;
53
54
    /**
55
     * @var object
56
     */
57
    private $pObj;
58
59
    /**
     * Initialization of crawler hook.
     *
     * This function is asked for each instance of the crawler. It selects the indexing
     * configurations which are due ("timer_next_indexing" lower than the current execution time)
     * and not running ("set_id" = 0), marks each of them as started and adds an initial
     * call back entry to the crawler queue. Finally, finished configurations are reset.
     *
     * @param object $pObj Parent object (tx_crawler lib)
     */
    public function crawler_init(&$pObj)
    {
        $this->pObj = $pObj;

        // Passed on to initMessage() of custom hook objects
        $message = null;
        // Select all indexing configurations which are waiting to be activated:
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
        $queryBuilder = $connection->createQueryBuilder();

        $result = $queryBuilder->select('*')
            ->from('index_config')
            ->where(
                $queryBuilder->expr()->lt(
                    'timer_next_indexing',
                    $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
            )
            ->execute();

        // For each configuration, mark it as running and queue the first entry:
        while ($cfgRec = $result->fetch()) {
            // Generate a unique set-ID:
            $setId = GeneralUtility::md5int(microtime());
            // Get next time:
            $nextTime = $this->generateNextIndexingTime($cfgRec);
            // Start process by updating index-config record:
            $connection->update(
                'index_config',
                [
                    'set_id' => $setId,
                    'timer_next_indexing' => $nextTime,
                    'session_data' => ''
                ],
                [
                    'uid' => (int)$cfgRec['uid']
                ]
            );

            // Identical for all built-in types; the key order of $params is kept as before
            $procInstructions = ['[Index Cfg UID#' . $cfgRec['uid'] . ']'];

            // Based on configuration type:
            switch ($cfgRec['type']) {
                case 1:
                    // RECORDS:
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => $procInstructions,
                        'url' => 'Records (start)'
                    ];
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    break;
                case 2:
                    // FILES:
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => $procInstructions,
                        'url' => $cfgRec['filepath'],
                        'depth' => 0
                    ];
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    break;
                case 3:
                    // External URL:
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => $procInstructions,
                        'url' => $cfgRec['externalUrl'],
                        'depth' => 0
                    ];
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    break;
                case 4:
                    // Page tree (the "url" is the uid of the root page):
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => $procInstructions,
                        'url' => (int)$cfgRec['alternative_source_pid'],
                        'depth' => 0
                    ];
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    break;
                case 5:
                    // Meta configuration, nothing to do:
                    // NOOP
                    break;
                default:
                    // Custom type: only handled if a hook class is registered for it.
                    // The null coalescing lookup avoids an undefined index notice otherwise.
                    $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                    if ($hookClass) {
                        $hookObj = GeneralUtility::makeInstance($hookClass);
                        $params = [
                            'indexConfigUid' => $cfgRec['uid'],
                            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                            'url' => $hookObj->initMessage($message)
                        ];
                        $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    }
            }
        }
        // Finally, look up all old index configurations which are finished and needs to be reset and done.
        $this->cleanUpOldRunningConfigurations();
    }
181
182
    /**
     * Call back function for execution of a log element
     *
     * Loads the indexing configuration referenced by $params['indexConfigUid'], dispatches to the
     * handler of its type and writes the (possibly modified) session data back to the record.
     *
     * @param array $params Params from log element. Must contain $params['indexConfigUid']
     * @param object $pObj Parent object (tx_crawler lib)
     * @return array Result array, always ['log' => $params]
     */
    public function crawler_execute($params, &$pObj)
    {
        $result = ['log' => $params];

        // Indexer configuration ID must exist:
        if (empty($params['indexConfigUid'])) {
            return $result;
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
        $queryBuilder->getRestrictions()->removeAll();
        // Load the indexing configuration record:
        $cfgRec = $queryBuilder
            ->select('*')
            ->from('index_config')
            ->where(
                $queryBuilder->expr()->eq(
                    'uid',
                    $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetch();
        if (!is_array($cfgRec)) {
            return $result;
        }

        // Unpack session data. Class instantiation is disabled, since the value comes from the
        // database and the built-in handlers only store arrays of scalars.
        // An empty string yields false, which the type handlers treat as "not initialized yet".
        $session_data = unserialize((string)$cfgRec['session_data'], ['allowed_classes' => false]);

        // Select which type:
        switch ($cfgRec['type']) {
            case 1:
                // Records:
                $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
                break;
            case 2:
                // Files
                $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
                break;
            case 3:
                // External URL:
                $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
                break;
            case 4:
                // Page tree:
                $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
                break;
            case 5:
                // Meta
                // NOOP (should never enter here!)
                break;
            default:
                // Custom type: only handled if a hook class is registered for it
                $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                if ($hookClass) {
                    $hookObj = GeneralUtility::makeInstance($hookClass);
                    // For addQueueEntryForHook()
                    $this->pObj = $pObj;
                    $ref = $this; // introduced for phpstan to not lose type information when passing $this into callUserFunction
                    $hookObj->indexOperation($cfgRec, $session_data, $params, $ref);
                }
        }

        // Save process data which might be modified:
        $connectionPool
            ->getConnectionForTable('index_config')
            ->update(
                'index_config',
                ['session_data' => serialize($session_data)],
                ['uid' => (int)$cfgRec['uid']]
            );

        return $result;
    }
254
255
    /**
     * Indexing records from a table
     *
     * Indexes the next batch of records (ordered by uid) of the configured table and, if records
     * with a higher uid remain, adds a queue entry for the following batch.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue (not used by this type).
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
    {
        $table = $cfgRec['table2index'];
        if (!$table || !isset($GLOBALS['TCA'][$table])) {
            return;
        }

        // Init session data array if not already:
        if (!is_array($session_data)) {
            $session_data = [
                'uid' => 0
            ];
        }

        // Init:
        $pid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
        $numberOfRecords = $cfgRec['recordsbatch']
            ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
            : 100;

        // Get root line:
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

        // Select the next batch of records after the last processed uid:
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
        $result = $queryBuilder->select('*')
            ->from($table)
            ->where(
                $queryBuilder->expr()->eq(
                    'pid',
                    $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->gt(
                    'uid',
                    $queryBuilder->createNamedParameter($session_data['uid'] ?? 0, \PDO::PARAM_INT)
                )
            )
            ->orderBy('uid')
            ->setMaxResults($numberOfRecords)
            ->execute();

        // Traverse:
        while ($row = $result->fetch()) {
            // Index single record:
            $this->indexSingleRecord($row, $cfgRec, $rootLine);
            // Update the UID we last processed:
            $session_data['uid'] = $row['uid'];
        }

        // Count the records which are still left. A separate query builder is used on purpose:
        // re-using the one above would keep its ORDER BY / LIMIT and the uid parameter bound
        // before this batch, which counts the rows just indexed instead of the remaining ones.
        $countQueryBuilder = $connectionPool->getQueryBuilderForTable($table);
        $remainingRecords = (int)$countQueryBuilder->count('uid')
            ->from($table)
            ->where(
                $countQueryBuilder->expr()->eq(
                    'pid',
                    $countQueryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
                ),
                $countQueryBuilder->expr()->gt(
                    'uid',
                    $countQueryBuilder->createNamedParameter($session_data['uid'] ?? 0, \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchColumn(0);

        // Finally, set entry for next indexing of batch of records:
        if ($remainingRecords > 0) {
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
            ];
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
        }
    }
321
322
    /**
     * Indexing files from fileadmin
     *
     * If $params['url'] points to a file, the file is indexed. If it points to a directory, a queue
     * entry is added for every matching file in it and - as long as the configured depth is not
     * reached - for every sub directory.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script (not used by this type).
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
    {
        // Prepare path, making it absolute and checking:
        $readpath = $params['url'];
        if (!GeneralUtility::isAbsPath($readpath)) {
            $readpath = GeneralUtility::getFileAbsFileName($readpath);
        }
        if (!GeneralUtility::isAllowedAbsPath($readpath)) {
            return;
        }

        if (@is_file($readpath)) {
            // If file, index it!
            // Get root line (need to provide this when indexing external files)
            $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
            // (Re)-Indexing file on page.
            $indexerObj = $this->initializeIndexer($cfgRec['pid'], 0, 0, '', $rl, $cfgRec['uid'], $cfgRec['set_id']);
            // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
            $indexerObj->hash['phash'] = -1;
            // Index document:
            $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
            return;
        }

        if (!@is_dir($readpath)) {
            return;
        }

        // If dir, read content and create new pending items for log:
        // Select files and directories in path:
        $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
        $fileArr = [];
        $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
        $directoryList = GeneralUtility::get_dirs($readpath);
        if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
            // Make sure there is exactly one slash between the directory and its sub directory,
            // no matter whether the configured path ends with a slash or not
            $directoryPrefix = rtrim($readpath, '/') . '/';
            foreach ($directoryList as $subdir) {
                if ((string)$subdir != '') {
                    $files[] = $directoryPrefix . $subdir . '/';
                }
            }
        }
        $files = GeneralUtility::removePrefixPathFromList($files, Environment::getPublicPath() . '/');

        // Traverse the items and create log entries. The counter is increased for every item
        // so the scheduled execution times are spread by $this->secondsPerExternalUrl.
        foreach ($files as $path) {
            $this->instanceCounter++;
            if ($path !== $params['url']) {
                // Parameters:
                $nparams = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $path,
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'depth' => $params['depth'] + 1
                ];
                $pObj->addQueueEntry_callBack(
                    $cfgRec['set_id'],
                    $nparams,
                    $this->callBack,
                    $cfgRec['pid'],
                    $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
                );
            }
        }
    }
380
381
    /**
     * Indexing External URLs
     *
     * Indexes the URL given in $params['url'] and, while the configured depth is not reached,
     * queues every link found on it which passes checkUrl() and is not denied by "url_deny".
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
    {
        // First call of this indexing session: start the URL log with the current URL
        if (!is_array($session_data)) {
            $session_data = ['urlLog' => [$params['url']]];
        }

        // Index the URL and collect the links found on it:
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

        // Maximum depth reached, do not add more elements to the log:
        if (!($params['depth'] < $cfgRec['depth'])) {
            return;
        }

        foreach ($foundUrls as $candidate) {
            $url = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
            if (!$url) {
                continue;
            }
            if ($this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
                continue;
            }

            $this->instanceCounter++;
            $session_data['urlLog'][] = $url;
            // Schedule the entries one after another, $this->secondsPerExternalUrl apart
            $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
            $pObj->addQueueEntry_callBack(
                $cfgRec['set_id'],
                [
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $url,
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'depth' => $params['depth'] + 1
                ],
                $this->callBack,
                $cfgRec['pid'],
                $scheduledTime
            );
        }
    }
420
421
    /**
     * Page tree indexing type
     *
     * Submits the URLs of the page given in $params['url'] (a page uid) to the crawler and, while
     * the configured depth is not reached, queues all non-deleted sub pages.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
    {
        // Init session data array if not already, so appending to "urlLog" below never
        // operates on a non-array value (e.g. false from unserializing an empty string):
        if (!is_array($session_data)) {
            $session_data = [
                'urlLog' => []
            ];
        }

        // Base page uid:
        $pageUid = (int)$params['url'];
        // Get array of URLs from page:
        $pageRow = BackendUtility::getRecord('pages', $pageUid);
        // The record may not be available anymore; in this case there is nothing to submit.
        if (is_array($pageRow)) {
            $res = $pObj->getUrlsForPageRow($pageRow);
            // Registry for duplicates
            $duplicateTrack = [];
            // Dummy.
            $downloadUrls = [];
            // Submit URLs:
            if (!empty($res)) {
                foreach ($res as $vv) {
                    $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
                }
            }
        }

        // Add subpages to log now:
        if ($params['depth'] < $cfgRec['depth']) {
            // Subpages selected
            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
            $queryBuilder->getRestrictions()
                ->removeAll()
                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
            $result = $queryBuilder->select('uid', 'title')
                ->from('pages')
                ->where(
                    $queryBuilder->expr()->eq(
                        'pid',
                        $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
                    )
                )
                ->execute();
            // Traverse subpages and add to queue:
            while ($row = $result->fetch()) {
                $this->instanceCounter++;
                $url = 'pages:' . $row['uid'] . ': ' . $row['title'];
                $session_data['urlLog'][] = $url;
                // Parameters:
                $nparams = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $row['uid'],
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'depth' => $params['depth'] + 1
                ];
                $pObj->addQueueEntry_callBack(
                    $cfgRec['set_id'],
                    $nparams,
                    $this->callBack,
                    $cfgRec['pid'],
                    $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
                );
            }
        }
    }
484
485
    /**
     * Look up all old index configurations which are finished and needs to be reset and done
     *
     * A running configuration (set_id != 0) is regarded as finished when the crawler queue holds
     * no entry of its set with exec_time = 0. For such a configuration, the index data of
     * previous sets is removed and the configuration is reset (set_id = 0, empty session data).
     */
    public function cleanUpOldRunningConfigurations()
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        // List of tables that store information related to the phash value
        $tablesToClean = [
            'index_phash',
            'index_rel',
            'index_section',
            'index_grlist',
            'index_fulltext',
            'index_debug'
        ];

        $queryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        // Lookup running index configurations:
        $runningIndexingConfigurations = $queryBuilder->select('*')
            ->from('index_config')
            ->where($queryBuilder->expr()->neq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
            ->execute()
            ->fetchAll();

        // For each running configuration, look up how many log entries there are which are scheduled
        // for execution and if none, clear the "set_id" (means; Processing was DONE)
        foreach ($runningIndexingConfigurations as $cfgRec) {
            // Look for ended processes:
            $queuedItems = $connectionPool->getConnectionForTable('tx_crawler_queue')
                ->count(
                    '*',
                    'tx_crawler_queue',
                    [
                        'set_id' => (int)$cfgRec['set_id'],
                        'exec_time' => 0
                    ]
                );
            if ($queuedItems) {
                // Still entries waiting for execution, the process has not ended yet
                continue;
            }

            // Lookup old phash rows (written for this configuration by another set):
            $queryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
            $oldPhashRows = $queryBuilder
                ->select('phash')
                ->from('index_phash')
                ->where(
                    $queryBuilder->expr()->eq(
                        'freeIndexUid',
                        $queryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                    ),
                    $queryBuilder->expr()->neq(
                        'freeIndexSetId',
                        $queryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();
            $oldPhashes = array_column($oldPhashRows, 'phash');

            // Removing old registrations for all tables.
            // Skipped if there is nothing to remove, to not issue DELETE statements with an empty IN list.
            if ($oldPhashes !== []) {
                foreach ($tablesToClean as $table) {
                    $queryBuilder = $connectionPool->getQueryBuilderForTable($table);
                    $queryBuilder->delete($table)
                        ->where(
                            $queryBuilder->expr()->in(
                                'phash',
                                $queryBuilder->createNamedParameter($oldPhashes, Connection::PARAM_INT_ARRAY)
                            )
                        )
                        ->execute();
                }
            }

            // End process by updating index-config record:
            $connectionPool->getConnectionForTable('index_config')
                ->update(
                    'index_config',
                    [
                        'set_id' => 0,
                        'session_data' => ''
                    ],
                    ['uid' => (int)$cfgRec['uid']]
                );
        }
    }
573
574
    /*****************************************
575
     *
576
     * Helper functions
577
     *
578
     *****************************************/
579
    /**
     * Check if an input URL is allowed to be indexed.
     *
     * A trailing double slash is reduced to a single slash and a fragment ("#...") is removed
     * before checking. The URL is accepted if it contains no "../", starts with the base URL and
     * is not present in the URL log yet.
     *
     * @param string $url URL string to check
     * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
     * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
     * @return string Returns the (normalized) URL if OK, otherwise empty string
     */
    public function checkUrl($url, $urlLog, $baseUrl)
    {
        // preg_replace() returns null on error, which is treated like an empty URL
        $url = (string)preg_replace('/\\/\\/$/', '/', (string)$url);
        [$url] = explode('#', $url);

        if (strpos($url, '../') !== false) {
            return '';
        }
        if (!GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
            return '';
        }
        // Strict comparison: URLs are strings and must match exactly, without type juggling
        if (in_array($url, (array)$urlLog, true)) {
            return '';
        }

        return $url;
    }
601
602
    /**
     * Indexing External URL
     *
     * Indexes the given URL and returns all links found in its content. Links without a scheme
     * are made absolute: links starting with "/" relative to scheme, host (and port) of $url,
     * all other links relative to the base href of the document or - if there is none - to the
     * directory part of $url.
     *
     * @param string $url URL, http://....
     * @param int $pageId Page id to relate indexing to.
     * @param array $rl Rootline array to relate indexing to
     * @param int $cfgUid Configuration UID
     * @param int $setId Set ID value
     * @return array URLs found on this page
     */
    public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
    {
        // Index external URL:
        // NOTE(review): crawler_execute_type2() calls initializeIndexer() without the "[]" argument;
        // confirm against the signature of initializeIndexer() which of both calls is correct.
        $indexerObj = $this->initializeIndexer($pageId, 0, 0, '', $rl, [], $cfgUid, $setId);
        // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        $indexerObj->indexExternalUrl($url);

        // parse_url() returns false for seriously malformed URLs and omits components which are not present
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            $urlParts = [];
        }
        $baseAbsoluteHref = ($urlParts['scheme'] ?? '') . '://' . ($urlParts['host'] ?? '');
        if (isset($urlParts['port'])) {
            // Keep a non-default port, otherwise resolved links point to another server
            $baseAbsoluteHref .= ':' . $urlParts['port'];
        }

        $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
        if (!$baseHref) {
            // Extract base href from current URL (everything up to the last slash of the path)
            $path = $urlParts['path'] ?? '';
            $lastSlashPosition = strrpos($path, '/');
            $baseHref = $baseAbsoluteHref;
            if ($lastSlashPosition !== false) {
                $baseHref .= substr($path, 0, $lastSlashPosition);
            }
        }
        $baseHref = rtrim($baseHref, '/');

        // Get URLs on this page:
        $subUrls = [];
        $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
        // Traverse links:
        foreach ($list as $linkInfo) {
            // Decode entities:
            $subUrl = htmlspecialchars_decode($linkInfo['href']);
            $qParts = parse_url($subUrl);
            // No scheme (or not parsable): handle as link relative to the current document
            if (empty($qParts['scheme'])) {
                $relativeUrl = (string)GeneralUtility::resolveBackPath($subUrl);
                if (strpos($relativeUrl, '/') === 0) {
                    $subUrl = $baseAbsoluteHref . $relativeUrl;
                } else {
                    $subUrl = $baseHref . '/' . $relativeUrl;
                }
            }
            $subUrls[] = $subUrl;
        }
        return $subUrls;
    }
648
649
    /**
     * Index a single database record as if it was the content of a TYPO3 page.
     *
     * The first field of the configured field list is used as title, all further fields
     * are concatenated (separated by a space) into the content. HTML tags are stripped
     * from both before indexing.
     *
     * @param array $r Record to index
     * @param array $cfgRec Configuration Record
     * @param array $rl Rootline array to relate indexing to; looked up from the configuration's pid if not an array
     */
    public function indexSingleRecord($r, $cfgRec, $rl = null)
    {
        $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);

        // Not every table configures languageField / tstamp / crdate in its TCA "ctrl"
        // section, so each of them is read with a fallback instead of raising a notice.
        $tableControl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] ?? [];
        $languageField = $tableControl['languageField'] ?? '';
        $sys_language_uid = $languageField ? ($r[$languageField] ?? 0) : 0;
        $modificationTime = $r[$tableControl['tstamp'] ?? ''] ?? 0;
        $creationTime = $r[$tableControl['crdate'] ?? ''] ?? 0;

        parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
        // (Re)-Indexing a row from a table
        $indexerObj = $this->initializeIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['uid'], $cfgRec['set_id']);
        $indexerObj->forceIndexing = true;

        $theContent = '';
        $theTitle = '';
        foreach ($fieldList as $k => $v) {
            // A field that is missing in the record contributes an empty string
            $fieldValue = $r[$v] ?? '';
            if (!$k) {
                $theTitle = $fieldValue;
            } else {
                $theContent .= $fieldValue . ' ';
            }
        }

        // Indexing the record as a page (but with parameters set).
        // A space is put in front of every "<" so that words separated only by tags do not get glued together.
        $this->indexAsTYPO3Page(
            $indexerObj,
            strip_tags(str_replace('<', ' <', $theTitle)),
            strip_tags(str_replace('<', ' <', $theContent)),
            $modificationTime,
            $creationTime,
            $r['uid']
        );
    }
685
686
    /**
     * Get rootline for closest TypoScript template root.
     * Algorithm same as used in Web > Template, Object browser
     *
     * If a RootLineException is raised while the rootline is fetched or the templates are
     * processed, an empty array is returned.
     *
     * @param int $id The page id to traverse rootline back from
     * @return array Page uids of the rootline, keyed like the template service's rootLine property
     */
    public function getUidRootLineForClosestTemplate($id)
    {
        try {
            // Rootline of the given page
            $pageRootLine = GeneralUtility::makeInstance(RootlineUtility::class, $id)->get();
            // This generates the constants/config + hierarchy info for the template.
            $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
            $templateService->runThroughTemplates($pageRootLine);
        } catch (RootLineException $e) {
            // No usable rootline: report "no uids"
            return [];
        }

        $uids = [];
        foreach ($templateService->rootLine as $level => $page) {
            $uids[$level] = $page['uid'];
        }

        return $uids;
    }
711
712
    /**
     * Generate the unix time stamp for next visit.
     *
     * The result is "reference midnight + timer_offset" advanced by as many whole
     * timer_frequency blocks as are needed to reach or pass $GLOBALS['EXEC_TIME'].
     *
     * @param array $cfgRec Index configuration record (reads timer_frequency, timer_offset, timer_next_indexing)
     * @return int The next time stamp
     */
    public function generateNextIndexingTime($cfgRec)
    {
        $currentTime = $GLOBALS['EXEC_TIME'];
        // Now, find a midnight time to use for offset calculation. This has to differ depending on whether
        // we have frequencies within a day or more than a day; Less than a day, we don't care which day to
        // use for offset, more than a day we want to respect the currently entered day as offset regardless
        // of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
        if ($cfgRec['timer_frequency'] <= 24 * 3600) {
            $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
        } else {
            $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
            // Hand mktime() integers and a four digit year: a two digit year is subject to
            // mktime()'s century guessing (0-69 => 20xx, 70-100 => 19xx).
            $aMidNight = mktime(
                0,
                0,
                0,
                (int)date('n', $lastTime),
                (int)date('j', $lastTime),
                (int)date('Y', $lastTime)
            );
        }
        // Find last offset time plus frequency in seconds:
        $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
        $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
        // Now, find out how many blocks of the length of frequency there is until the next time.
        // ceil() yields a float, so cast back to keep the documented integer return type.
        $frequencyBlocksUntilNextTime = (int)ceil(($currentTime - $lastSureOffset) / $frequencySeconds);

        // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
        return $lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds;
    }
736
737
    /**
     * Tell whether a URL is covered by one of the entries of a deny list.
     *
     * @param string $url URL to test
     * @param string $url_deny Deny list with one URL per line; an entry matches if GeneralUtility::isFirstPartOfStr() reports it as the first part of $url
     * @return bool TRUE if there is a matching entry (hence, do not index!), FALSE otherwise or if the list is empty
     */
    public function checkDeniedSuburls($url, $url_deny)
    {
        // Nothing configured, nothing denied
        if (!trim($url_deny)) {
            return false;
        }

        foreach (GeneralUtility::trimExplode(LF, $url_deny, true) as $deniedPrefix) {
            if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
                return true;
            }
        }

        return false;
    }
756
757
    /**
     * Register a callback entry for this hook in the queue of the parent object.
     *
     * @param array $cfgRec Configuration record (uid, set_id and pid are read)
     * @param string $title Title/URL
     */
    public function addQueueEntryForHook($cfgRec, $title)
    {
        $configurationUid = $cfgRec['uid'];

        $parameters = [];
        // This must ALWAYS be the cfgRec uid!
        $parameters['indexConfigUid'] = $configurationUid;
        $parameters['url'] = $title;
        $parameters['procInstructions'] = ['[Index Cfg UID#' . $configurationUid . ']'];

        $this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $parameters, $this->callBack, $cfgRec['pid']);
    }
773
774
    /**
     * Deletes all data stored by indexed search for a given page
     *
     * The phash values registered for the page in "index_section" are collected first;
     * afterwards all rows carrying one of these phash values are removed from each index
     * table. Nothing is deleted if the page has no rows in "index_section".
     *
     * @param int $id Uid of the page to delete all pHash
     */
    public function deleteFromIndex($id)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Lookup old phash rows:
        $sectionQuery = $connectionPool->getQueryBuilderForTable('index_section');
        $sectionQuery->select('phash')->from('index_section');
        $sectionQuery->where(
            $sectionQuery->expr()->eq('page_id', $sectionQuery->createNamedParameter($id, \PDO::PARAM_INT))
        );
        $oldPhashRows = $sectionQuery->execute()->fetchAll();

        if (empty($oldPhashRows)) {
            return;
        }

        // The list of phash values is the same for every table
        $phashList = array_column($oldPhashRows, 'phash');
        $indexTables = [
            'index_debug',
            'index_fulltext',
            'index_grlist',
            'index_phash',
            'index_rel',
            'index_section',
        ];
        foreach ($indexTables as $indexTable) {
            $deleteQuery = $connectionPool->getQueryBuilderForTable($indexTable);
            $deleteQuery->delete($indexTable);
            $deleteQuery->where(
                $deleteQuery->expr()->in(
                    'phash',
                    $deleteQuery->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY)
                )
            );
            $deleteQuery->execute();
        }
    }
824
825
    /*************************
826
     *
827
     * Hook functions for DataHandler (indexing of records)
828
     *
829
     *************************/
830
    /**
     * DataHandler hook function: removes the index data of a page when the page is deleted.
     *
     * Only the command "delete" on table "pages" is acted upon. $value and $pObj are part
     * of the hook signature and are intentionally unused.
     *
     * @param string $command DataHandler command
     * @param string $table Table name
     * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
     * @param mixed $value Target value (ignored)
     * @param DataHandler $pObj DataHandler calling object
     */
    public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
    {
        // Clean up the index
        if ($command === 'delete' && $table === 'pages') {
            // The hook delivers the id as string while deleteFromIndex() works on an integer page uid
            $this->deleteFromIndex((int)$id);
        }
    }
846
847
    /**
     * DataHandler hook function for on-the-fly indexing of database records
     *
     * For an updated page that gets hidden or excluded from search the existing index data
     * is removed. Afterwards the record is (re)indexed with every matching indexing
     * configuration of type "record" that has "records_indexonchange" set.
     *
     * @param string $status Status "new" or "update"
     * @param string $table Table name
     * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
     * @param array $fieldArray Field array of updated fields in the operation
     * @param DataHandler $pObj DataHandler calling object
     */
    public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
    {
        // Check if any fields are actually updated:
        if (empty($fieldArray)) {
            return;
        }

        if ($status === 'new') {
            // Translate new ids. A placeholder without a registered uid resolves to null
            // instead of raising an undefined index notice.
            $id = $pObj->substNEWwithIDs[$id] ?? null;
        } elseif ($table === 'pages' && $status === 'update') {
            $becomesHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
            $becomesUnsearchable = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
            if ($becomesHidden || $becomesUnsearchable) {
                // If the page should be hidden or not indexed after update, delete index for this page
                $this->deleteFromIndex((int)$id);
            }
        }

        // Get full record and if exists, search for indexing configurations:
        $currentRecord = BackendUtility::getRecord($table, $id);
        if (!is_array($currentRecord)) {
            return;
        }

        // Select all (not running) indexing configurations of type "record" (1) and
        // which points to this table and is located on the same page as the record
        // or pointing to the right source PID
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_config');
        $result = $queryBuilder->select('*')
            ->from('index_config')
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)),
                $queryBuilder->expr()->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)),
                $queryBuilder->expr()->eq(
                    'table2index',
                    $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR)
                ),
                $queryBuilder->expr()->orX(
                    $queryBuilder->expr()->andX(
                        $queryBuilder->expr()->eq(
                            'alternative_source_pid',
                            $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
                        ),
                        $queryBuilder->expr()->eq(
                            'pid',
                            $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
                        )
                    ),
                    $queryBuilder->expr()->eq(
                        'alternative_source_pid',
                        $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
                    )
                ),
                $queryBuilder->expr()->eq(
                    'records_indexonchange',
                    $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
                )
            )
            ->execute();

        while ($cfgRec = $result->fetch()) {
            $this->indexSingleRecord($currentRecord, $cfgRec);
        }
    }
914
915
    /**
     * Create an Indexer instance and initialize it for the page (phash "combined ID")
     * being indexed (or for which external media is attached).
     *
     * @param int $id The page uid, &id=
     * @param int $type The page type, &type=
     * @param int $sys_language_uid sys_language uid, typically &L=
     * @param string $MP The MP variable (Mount Points), &MP=
     * @param array $uidRL Rootline array of only UIDs.
     * @param array $queryArguments Array of GET variables to register with this indexing
     * @param int $freeIndexUid Free index UID
     * @param int $freeIndexSetId Set id - an integer identifying the "set" of indexing operations.
     * @return Indexer The initialized indexer
     */
    protected function initializeIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $queryArguments = [], $freeIndexUid = 0, $freeIndexSetId = 0): Indexer
    {
        $indexer = GeneralUtility::makeInstance(Indexer::class);
        $indexer->init([
            // --- Information about the page for which the indexing takes place ---
            'id' => $id, // page id (int)
            'type' => $type, // page type (int)
            'sys_language_uid' => $sys_language_uid, // language of the indexing (int)
            'MP' => $MP, // mount point variable, if any (string)
            'gr_list' => '0,-1', // group list (hardcoded for now...)
            'staticPageArguments' => $queryArguments,
            'freeIndexUid' => $freeIndexUid,
            'freeIndexSetId' => $freeIndexSetId,
            'rootline_uids' => $uidRL, // root line uids

            // --- Configuration of behavior ---
            'index_externals' => 1, // index external documents like PDF, DOC etc. (if possible)
            'index_descrLgd' => 200, // length of description text (max 250, default 200)
            'index_metatags' => true // index document keywords and description (if present)
        ]);

        return $indexer;
    }
962
963
    /**
     * Indexing records as the content of a TYPO3 page.
     *
     * Title and content are wrapped (HTML-escaped) into a minimal fake HTML document which
     * is then handed to the indexer as if it was a rendered TYPO3 page.
     *
     * @param Indexer $indexer
     * @param string $title Title equivalent
     * @param string $content The main content to index
     * @param int $mtime Last modification time, in seconds
     * @param int $crdate The creation date of the content, in seconds
     * @param int $recordUid The record UID that the content comes from (for registration with the indexed rows)
     */
    protected function indexAsTYPO3Page(Indexer $indexer, $title, $content, $mtime, $crdate = 0, $recordUid = 0)
    {
        $pageSettings = [
            // Most recent modification time (seconds) of the content
            'mtime' => $mtime,
            // The creation date of the TYPO3 content
            'crdate' => $crdate,
            // UID of the record, if applicable
            'recordUid' => $recordUid,
            // Content string (HTML of TYPO3 page): fake HTML constructed for parsing
            'content' => '
		<html>
			<head>
				<title>' . htmlspecialchars($title) . '</title>
			</head>
			<body>
				' . htmlspecialchars($content) . '
			</body>
		</html>',
            // Character set of content (will be converted to utf-8 during indexing)
            'metaCharset' => 'utf-8',
            // Alternative title for indexing
            'indexedDocTitle' => '',
        ];
        foreach ($pageSettings as $settingName => $settingValue) {
            $indexer->conf[$settingName] = $settingValue;
        }

        // Index content as if it was a TYPO3 page:
        $indexer->indexTypo3PageContent();
    }
1001
}
1002