Completed
Push — typo3v9 ( c8617d...d49016 )
by Tomas Norre
05:58
created

IndexedSearchCrawlerHook::checkDeniedSuburls()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 12

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
cc 4
nc 3
nop 2
dl 0
loc 12
ccs 0
cts 12
cp 0
crap 20
rs 9.8666
c 0
b 0
f 0
1
<?php
2
namespace AOEPeople\Crawler\Hooks;
3
4
/*
5
 * This file is part of the TYPO3 CMS project.
6
 *
7
 * It is free software; you can redistribute it and/or modify it under
8
 * the terms of the GNU General Public License, either version 2
9
 * of the License, or any later version.
10
 *
11
 * For the full copyright and license information, please read the
12
 * LICENSE.txt file that was distributed with this source code.
13
 *
14
 * The TYPO3 project - inspiring people to share!
15
 */
16
17
use TYPO3\CMS\Backend\Utility\BackendUtility;
18
use TYPO3\CMS\Core\Core\Environment;
19
use TYPO3\CMS\Core\Database\Connection;
20
use TYPO3\CMS\Core\Database\ConnectionPool;
21
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
22
use TYPO3\CMS\Core\DataHandling\DataHandler;
23
use TYPO3\CMS\Core\Exception\Page\RootLineException;
24
use TYPO3\CMS\Core\Utility\GeneralUtility;
25
use TYPO3\CMS\Core\Utility\MathUtility;
26
use TYPO3\CMS\Core\Utility\RootlineUtility;
27
28
/**
29
 * Crawler hook for indexed search. Works with the "crawler" extension
30
 * @internal this is a TYPO3-internal hook implementation and not part of TYPO3's Core API.
31
 */
32
class IndexedSearchCrawlerHook
33
{
34
    /**
     * Number of seconds to use as interval between queued indexing operations.
     * Multiplied with $instanceCounter to stagger the scheduled execution time
     * of entries queued by the type 2, 3 and 4 handlers.
     *
     * @var int
     */
    public $secondsPerExternalUrl = 3;

    /**
     * Counts up for each item handled by the type 2, 3 and 4 handlers
     * (files, external URLs, sub pages).
     *
     * @var int
     */
    public $instanceCounter = 0;

    /**
     * Class name handed to the crawler as call back for every queue entry added by this hook.
     *
     * @var string
     */
    public $callBack = self::class;

    /**
     * Parent crawler object. Assigned in crawler_execute() right before a custom
     * (non built-in) indexing configuration hook is invoked.
     *
     * @var object|null
     */
    public $pObj;
52
53
    /**
     * Makes sure a language service object is available in $GLOBALS['LANG'].
     */
    public function __construct()
    {
        // A language service object is already in place: nothing to set up.
        if (is_object($GLOBALS['LANG'])) {
            return;
        }
        // Otherwise create one and initialise it with the backend user's language setting.
        $languageService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Localization\LanguageService::class);
        $GLOBALS['LANG'] = $languageService;
        $languageService->init($GLOBALS['BE_USER']->uc['lang']);
    }
64
65
    /**
     * Initialization of crawler hook.
     * Selects every indexing configuration whose "timer_next_indexing" lies before the
     * current execution time and which has no set-ID assigned (set_id = 0), marks it as
     * started and adds an initial entry to the crawler queue according to its type.
     * Afterwards finished configurations are cleaned up.
     *
     * @param object $pObj Parent object (tx_crawler lib)
     */
    public function crawler_init(&$pObj)
    {
        // Select all indexing configuration which are waiting to be activated:
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_config');
        $queryBuilder = $connection->createQueryBuilder();

        $result = $queryBuilder->select('*')
            ->from('index_config')
            ->where(
                $queryBuilder->expr()->lt(
                    'timer_next_indexing',
                    $queryBuilder->createNamedParameter($GLOBALS['EXEC_TIME'], \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT))
            )
            ->execute();

        // Custom configuration types registered by other extensions (may be absent entirely):
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'] ?? [];

        // For each configuration, check if it should be executed and if so, start:
        while ($cfgRec = $result->fetch()) {
            // Generate a unique set-ID:
            $setId = GeneralUtility::md5int(microtime());
            // Get next time:
            $nextTime = $this->generateNextIndexingTime($cfgRec);
            // Start process by updating index-config record:
            $connection->update(
                'index_config',
                [
                    'set_id' => $setId,
                    'timer_next_indexing' => $nextTime,
                    'session_data' => ''
                ],
                [
                    'uid' => (int)$cfgRec['uid']
                ]
            );
            // Based on configuration type:
            switch ($cfgRec['type']) {
                case 1:
                    // RECORDS:
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                        'url' => 'Records (start)'
                    ];
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    break;
                case 2:
                    // FILES:
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                        'url' => $cfgRec['filepath'],
                        // Recursion level, compared against the configured depth when executing
                        'depth' => 0
                    ];
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    break;
                case 3:
                    // External URL:
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                        'url' => $cfgRec['externalUrl'],
                        'depth' => 0
                    ];
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    break;
                case 4:
                    // Page tree: the "url" carries the uid of the root page
                    $params = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                        'url' => (int)$cfgRec['alternative_source_pid'],
                        'depth' => 0
                    ];
                    $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    break;
                case 5:
                    // Meta configuration, nothing to do:
                    // NOOP
                    break;
                default:
                    // Custom type: only handled when a hook class is registered for it.
                    $hookClass = $registeredHooks[$cfgRec['type']] ?? null;
                    if ($hookClass) {
                        $hookObj = GeneralUtility::makeInstance($hookClass);
                        // Defined explicitly so the hook never receives an undefined variable.
                        // NOTE(review): no value is available for it at this point - confirm what hooks expect.
                        $message = null;
                        $params = [
                            'indexConfigUid' => $cfgRec['uid'],
                            'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . '/CUSTOM]'],
                            'url' => $hookObj->initMessage($message)
                        ];
                        $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
                    }
            }
        }
        // Finally, look up all old index configurations which are finished and needs to be reset and done.
        $this->cleanUpOldRunningConfigurations();
    }
184
185
    /**
     * Call back function for execution of a log element.
     * Loads the indexing configuration referenced by $params['indexConfigUid'], dispatches to
     * the handler for its type and writes the (possibly modified) session data back to the record.
     *
     * @param array $params Params from log element. Must contain $params['indexConfigUid']
     * @param object $pObj Parent object (tx_crawler lib)
     * @return array Result array: the input parameters under the key "log"
     */
    public function crawler_execute($params, &$pObj)
    {
        // Indexer configuration ID must exist:
        if (!empty($params['indexConfigUid'])) {
            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable('index_config');
            $queryBuilder->getRestrictions()->removeAll();
            // Load the indexing configuration record:
            $cfgRec = $queryBuilder
                ->select('*')
                ->from('index_config')
                ->where(
                    $queryBuilder->expr()->eq(
                        'uid',
                        $queryBuilder->createNamedParameter($params['indexConfigUid'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetch();
            if (is_array($cfgRec)) {
                // Unpack session data (FALSE when nothing has been stored yet):
                // NOTE(review): unserialize() without "allowed_classes" restriction - consider limiting
                // it if session data is guaranteed to hold scalars/arrays only.
                $session_data = unserialize($cfgRec['session_data']);
                // Select which type:
                switch ($cfgRec['type']) {
                    case 1:
                        // Records:
                        $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
                        break;
                    case 2:
                        // Files
                        $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
                        break;
                    case 3:
                        // External URL:
                        $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
                        break;
                    case 4:
                        // Page tree:
                        $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
                        break;
                    case 5:
                        // Meta
                        // NOOP (should never enter here!)
                        break;
                    default:
                        // Custom type: only handled when a hook class is registered for it.
                        $hookClass = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']] ?? null;
                        if ($hookClass) {
                            $hookObj = GeneralUtility::makeInstance($hookClass);
                            // For addQueueEntryForHook()
                            $this->pObj = $pObj;
                            $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
                        }
                }
                // Save process data which might be modified:
                GeneralUtility::makeInstance(ConnectionPool::class)
                    ->getConnectionForTable('index_config')
                    ->update(
                        'index_config',
                        ['session_data' => serialize($session_data)],
                        ['uid' => (int)$cfgRec['uid']]
                    );
            }
        }
        return ['log' => $params];
    }
256
257
    /**
     * Indexing records from a table.
     * Processes one batch of records (ordered by uid, starting after the uid stored in the
     * session data) and queues a follow-up entry as long as further records remain.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue (not used by this type).
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type1($cfgRec, &$session_data, $params, &$pObj)
    {
        // The table to index must be configured and known to TCA:
        if (!$cfgRec['table2index'] || !isset($GLOBALS['TCA'][$cfgRec['table2index']])) {
            return;
        }
        // Init session data array if not already:
        if (!is_array($session_data)) {
            $session_data = [
                'uid' => 0
            ];
        }
        // Init:
        $pid = (int)$cfgRec['alternative_source_pid'] ?: $cfgRec['pid'];
        $numberOfRecords = $cfgRec['recordsbatch']
            ? MathUtility::forceIntegerInRange($cfgRec['recordsbatch'], 1)
            : 100;

        // Get root line:
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

        // Select the next batch of records:
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $queryBuilder = $connectionPool->getQueryBuilderForTable($cfgRec['table2index']);
        $result = $queryBuilder->select('*')
            ->from($cfgRec['table2index'])
            ->where(
                $queryBuilder->expr()->eq(
                    'pid',
                    $queryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
                ),
                $queryBuilder->expr()->gt(
                    'uid',
                    $queryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
                )
            )
            ->orderBy('uid')
            ->setMaxResults($numberOfRecords)
            ->execute();

        // Traverse:
        while ($row = $result->fetch()) {
            // Index single record:
            $this->indexSingleRecord($row, $cfgRec, $rootLine);
            // Update the UID we last processed:
            $session_data['uid'] = $row['uid'];
        }

        // Count the records still waiting AFTER the last processed uid. A separate query builder
        // is used on purpose: re-using the one above would keep its limit, its ordering and the
        // uid parameter bound before the batch was processed.
        $countQueryBuilder = $connectionPool->getQueryBuilderForTable($cfgRec['table2index']);
        $rowCount = (int)$countQueryBuilder->count('uid')
            ->from($cfgRec['table2index'])
            ->where(
                $countQueryBuilder->expr()->eq(
                    'pid',
                    $countQueryBuilder->createNamedParameter($pid, \PDO::PARAM_INT)
                ),
                $countQueryBuilder->expr()->gt(
                    'uid',
                    $countQueryBuilder->createNamedParameter($session_data['uid'], \PDO::PARAM_INT)
                )
            )
            ->execute()
            ->fetchColumn(0);

        // Finally, set entry for next indexing of batch of records:
        if ($rowCount > 0) {
            $nparams = [
                'indexConfigUid' => $cfgRec['uid'],
                'url' => 'Records from UID#' . ($session_data['uid'] + 1) . '-?',
                'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']']
            ];
            $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
        }
    }
323
324
    /**
     * Indexing files.
     * $params['url'] is either a file, which is indexed right away, or a directory, whose
     * files (and sub directories, while the configured depth is not reached) are added to
     * the crawler queue as new entries.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script (not used by this type).
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type2($cfgRec, &$session_data, $params, &$pObj)
    {
        // Prepare path, making it absolute and checking:
        $readpath = $params['url'];
        if (!GeneralUtility::isAbsPath($readpath)) {
            $readpath = GeneralUtility::getFileAbsFileName($readpath);
        }
        if (!GeneralUtility::isAllowedAbsPath($readpath)) {
            return;
        }
        if (@is_file($readpath)) {
            // If file, index it!
            // Get root line (need to provide this when indexing external files)
            $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
            // (Re)-Indexing file on page.
            $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
            $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
            $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
            // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
            $indexerObj->hash['phash'] = -1;
            // Index document:
            $indexerObj->indexRegularDocument(\TYPO3\CMS\Core\Utility\PathUtility::stripPathSitePrefix($readpath), true);
        } elseif (@is_dir($readpath)) {
            // If dir, read content and create new pending items for log:
            // Select files and directories in path:
            $extList = implode(',', GeneralUtility::trimExplode(',', $cfgRec['extensions'], true));
            $fileArr = [];
            $files = GeneralUtility::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);
            $directoryList = GeneralUtility::get_dirs($readpath);
            if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
                // Make sure exactly one slash separates the directory and the sub directory name,
                // regardless of whether the configured path ends with a slash.
                $directoryPrefix = rtrim($readpath, '/') . '/';
                foreach ($directoryList as $subdir) {
                    if ((string)$subdir !== '') {
                        $files[] = $directoryPrefix . $subdir . '/';
                    }
                }
            }
            $files = GeneralUtility::removePrefixPathFromList($files, Environment::getPublicPath() . '/');
            // traverse the items and create log entries:
            foreach ($files as $path) {
                $this->instanceCounter++;
                if ($path !== $params['url']) {
                    // Parameters:
                    $nparams = [
                        'indexConfigUid' => $cfgRec['uid'],
                        'url' => $path,
                        'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                        'depth' => $params['depth'] + 1
                    ];
                    // Stagger the execution time of the queued entries:
                    $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid'], $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl);
                }
            }
        }
    }
384
385
    /**
     * Indexing External URLs.
     * Indexes the URL given in $params['url'] and, while the configured depth is not reached,
     * queues every accepted link found on it as a new entry.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type3($cfgRec, &$session_data, $params, &$pObj)
    {
        // No session data yet: start the URL log with the URL of this entry.
        if (!is_array($session_data)) {
            $session_data = ['urlLog' => [$params['url']]];
        }

        // Index the URL itself and collect the links found on it:
        $rootLine = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $foundUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rootLine, $cfgRec['uid'], $cfgRec['set_id']);

        // Only descend further while the configured depth has not been reached:
        if (!($params['depth'] < $cfgRec['depth'])) {
            return;
        }

        foreach ($foundUrls as $foundUrl) {
            // Skip URLs which are rejected by checkUrl() ...
            $url = $this->checkUrl($foundUrl, $session_data['urlLog'], $cfgRec['externalUrl']);
            if (!$url) {
                continue;
            }
            // ... or which are denied by the configuration:
            if ($this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
                continue;
            }
            $this->instanceCounter++;
            $session_data['urlLog'][] = $url;
            $scheduledTime = $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl;
            $pObj->addQueueEntry_callBack(
                $cfgRec['set_id'],
                [
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $url,
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'depth' => $params['depth'] + 1
                ],
                $this->callBack,
                $cfgRec['pid'],
                $scheduledTime
            );
        }
    }
424
425
    /**
     * Page tree indexing type.
     * Submits the URLs of the page whose uid is given in $params['url'] to the crawler and,
     * while the configured depth is not reached, queues one entry per sub page.
     *
     * @param array $cfgRec Indexing Configuration Record
     * @param array $session_data Session data for the indexing session spread over multiple instances of the script. Passed by reference so changes hereto will be saved for the next call!
     * @param array $params Parameters from the log queue.
     * @param object $pObj Parent object (from "crawler" extension!)
     */
    public function crawler_execute_type4($cfgRec, &$session_data, $params, &$pObj)
    {
        // Base page uid:
        $pageUid = (int)$params['url'];
        // Get array of URLs from page:
        $pageRow = BackendUtility::getRecord('pages', $pageUid);
        // The record lookup yields no array when the page is not available (anymore);
        // in that case there is nothing to submit for the page itself.
        // NOTE(review): presumably the crawler expects an array here - confirm against its API.
        if (is_array($pageRow)) {
            $res = $pObj->getUrlsForPageRow($pageRow);
            // Registry for duplicates
            $duplicateTrack = [];
            // Dummy.
            $downloadUrls = [];
            // Submit URLs:
            if (!empty($res)) {
                foreach ($res as $vv) {
                    $pObj->urlListFromUrlArray($vv, $pageRow, $GLOBALS['EXEC_TIME'], 30, 1, 0, $duplicateTrack, $downloadUrls, ['tx_indexedsearch_reindex']);
                }
            }
        }
        // Add subpages to log now:
        if ($params['depth'] < $cfgRec['depth']) {
            // Subpages selected (hidden pages included, deleted pages excluded)
            $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
            $queryBuilder->getRestrictions()
                ->removeAll()
                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
            $result = $queryBuilder->select('uid', 'title')
                ->from('pages')
                ->where(
                    $queryBuilder->expr()->eq(
                        'pid',
                        $queryBuilder->createNamedParameter($pageUid, \PDO::PARAM_INT)
                    )
                )
                ->execute();
            // Traverse subpages and add to queue:
            while ($row = $result->fetch()) {
                $this->instanceCounter++;
                $url = 'pages:' . $row['uid'] . ': ' . $row['title'];
                // Session data is FALSE until something has been stored; turn it into an
                // array explicitly instead of relying on implicit conversion when appending.
                if (!is_array($session_data)) {
                    $session_data = [];
                }
                $session_data['urlLog'][] = $url;
                // Parameters:
                $nparams = [
                    'indexConfigUid' => $cfgRec['uid'],
                    'url' => $row['uid'],
                    'procInstructions' => ['[Index Cfg UID#' . $cfgRec['uid'] . ']'],
                    'depth' => $params['depth'] + 1
                ];
                $pObj->addQueueEntry_callBack(
                    $cfgRec['set_id'],
                    $nparams,
                    $this->callBack,
                    $cfgRec['pid'],
                    $GLOBALS['EXEC_TIME'] + $this->instanceCounter * $this->secondsPerExternalUrl
                );
            }
        }
    }
488
489
    /**
     * Look up all old index configurations which are finished and needs to be reset and done.
     * A configuration counts as finished when it has a set-ID but no crawler queue entry of
     * that set is left with exec_time = 0. For such a configuration the index data written by
     * previous sets is removed and the set-ID and session data are cleared.
     */
    public function cleanUpOldRunningConfigurations()
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        // List of tables that store information related to the phash value
        $tablesToClean = [
            'index_phash',
            'index_rel',
            'index_section',
            'index_grlist',
            'index_fulltext',
            'index_debug'
        ];

        $configQueryBuilder = $connectionPool->getQueryBuilderForTable('index_config');
        $configQueryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        // Lookup running index configurations:
        $runningIndexingConfigurations = $configQueryBuilder->select('*')
            ->from('index_config')
            ->where($configQueryBuilder->expr()->neq('set_id', $configQueryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
            ->execute()
            ->fetchAll();
        // For each running configuration, look up how many log entries there are which are scheduled
        // for execution and if none, clear the "set_id" (means; Processing was DONE)
        foreach ($runningIndexingConfigurations as $cfgRec) {
            // Look for ended processes:
            $queued_items = $connectionPool->getConnectionForTable('tx_crawler_queue')
                ->count(
                    '*',
                    'tx_crawler_queue',
                    [
                        'set_id' => (int)$cfgRec['set_id'],
                        'exec_time' => 0
                    ]
                );
            if ($queued_items) {
                // Still entries waiting for execution: configuration is not finished yet.
                continue;
            }

            // Lookup old phash rows (same configuration, but written by another set):
            $phashQueryBuilder = $connectionPool->getQueryBuilderForTable('index_phash');
            $oldPhashRows = $phashQueryBuilder
                ->select('phash')
                ->from('index_phash')
                ->where(
                    $phashQueryBuilder->expr()->eq(
                        'freeIndexUid',
                        $phashQueryBuilder->createNamedParameter($cfgRec['uid'], \PDO::PARAM_INT)
                    ),
                    $phashQueryBuilder->expr()->neq(
                        'freeIndexSetId',
                        $phashQueryBuilder->createNamedParameter($cfgRec['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            // Removing old registrations for all tables. The list of phash values is the same
            // for every table, and without any value there is nothing to delete.
            $oldPhashes = array_column($oldPhashRows, 'phash');
            if (!empty($oldPhashes)) {
                foreach ($tablesToClean as $table) {
                    $deleteQueryBuilder = $connectionPool->getQueryBuilderForTable($table);
                    $deleteQueryBuilder->delete($table)
                        ->where(
                            $deleteQueryBuilder->expr()->in(
                                'phash',
                                $deleteQueryBuilder->createNamedParameter(
                                    $oldPhashes,
                                    Connection::PARAM_INT_ARRAY
                                )
                            )
                        )
                        ->execute();
                }
            }

            // End process by updating index-config record:
            $connectionPool->getConnectionForTable('index_config')
                ->update(
                    'index_config',
                    [
                        'set_id' => 0,
                        'session_data' => ''
                    ],
                    ['uid' => (int)$cfgRec['uid']]
                );
        }
    }
577
578
    /*****************************************
579
     *
580
     * Helper functions
581
     *
582
     *****************************************/
583
    /**
     * Check if an input URL is allowed to be indexed.
     * The URL is normalized first (a trailing double slash is reduced to a single slash and
     * everything from the first "#" on is dropped). It is accepted when it contains no "../",
     * starts with the base URL and is not present in the URL log yet.
     *
     * @param string $url URL string to check
     * @param array $urlLog Array of already indexed URLs (input url is looked up here and must not exist already)
     * @param string $baseUrl Base URL of the indexing process (input URL must be "inside" the base URL!)
     * @return string|bool Returns the normalized URL if OK, otherwise FALSE
     */
    public function checkUrl($url, $urlLog, $baseUrl)
    {
        $url = preg_replace('/\\/\\/$/', '/', $url);
        // Strip the fragment:
        $url = explode('#', $url)[0];
        // No back paths allowed:
        if (strpos($url, '../') !== false) {
            return false;
        }
        // Must be located "inside" the base URL:
        if (!GeneralUtility::isFirstPartOfStr($url, $baseUrl)) {
            return false;
        }
        // Must not be registered already (URLs are strings, so compare strictly):
        if (in_array($url, (array)$urlLog, true)) {
            return false;
        }
        return $url;
    }
603
604
    /**
     * Indexing External URL.
     * Indexes the URL and returns the links found in its content. Links without a scheme are
     * made absolute: those starting with "/" relative to scheme and host of the indexed URL,
     * all others relative to the base href of the document (or, if the document defines none,
     * the directory of the indexed URL).
     *
     * @param string $url URL, http://....
     * @param int $pageId Page id to relate indexing to.
     * @param array $rl Rootline array to relate indexing to
     * @param int $cfgUid Configuration UID
     * @param int $setId Set ID value
     * @return array URLs found on this page
     */
    public function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
    {
        // Index external URL:
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
        // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)
        $indexerObj->hash['phash'] = -1;
        $indexerObj->indexExternalUrl($url);

        // parse_url() returns FALSE for seriously malformed URLs and omits absent components,
        // so every component is read with a fallback.
        $url_qParts = parse_url($url) ?: [];
        $baseAbsoluteHref = ($url_qParts['scheme'] ?? '') . '://' . ($url_qParts['host'] ?? '');
        $baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
        if (!$baseHref) {
            // Extract base href from current URL: everything up to (excluding) the last slash of the path
            $path = $url_qParts['path'] ?? '';
            $lastSlash = strrpos($path, '/');
            $baseHref = $baseAbsoluteHref . ($lastSlash === false ? '' : substr($path, 0, $lastSlash));
        }
        $baseHref = rtrim($baseHref, '/');

        // Get URLs on this page:
        $subUrls = [];
        $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
        // Traverse links:
        foreach ($list as $linkInfo) {
            // Decode entities:
            $subUrl = htmlspecialchars_decode($linkInfo['href']);
            $qParts = parse_url($subUrl);
            if (empty($qParts['scheme'])) {
                $relativeUrl = GeneralUtility::resolveBackPath($subUrl);
                if (strpos($relativeUrl, '/') === 0) {
                    // Host-relative link
                    $subUrl = $baseAbsoluteHref . $relativeUrl;
                } else {
                    // Document-relative link (also covers an empty href)
                    $subUrl = $baseHref . '/' . $relativeUrl;
                }
            }
            $subUrls[] = $subUrl;
        }
        return $subUrls;
    }
652
653
    /**
     * Indexing Single Record.
     * The first field of the configured field list provides the title, all further fields
     * are concatenated (space separated) to the content. Tags are stripped from both.
     *
     * @param array $r Record to index
     * @param array $cfgRec Configuration Record
     * @param array $rl Rootline array to relate indexing to (looked up from the configuration's pid if not given)
     */
    public function indexSingleRecord($r, $cfgRec, $rl = null)
    {
        // Init:
        $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
        $fieldList = GeneralUtility::trimExplode(',', $cfgRec['fieldlist'], true);
        // Not every table defines all of the ctrl settings read below:
        $ctrl = $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] ?? [];
        $languageField = $ctrl['languageField'] ?? '';
        $sys_language_uid = $languageField ? $r[$languageField] : 0;
        // (Re)-Indexing a row from a table:
        $indexerObj = GeneralUtility::makeInstance(\TYPO3\CMS\IndexedSearch\Indexer::class);
        parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, (bool)$cfgRec['chashcalc']);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        $indexerObj->forceIndexing = true;
        // Title stays empty when the field list is empty:
        $theTitle = '';
        $theContent = '';
        foreach ($fieldList as $k => $v) {
            $fieldValue = $r[$v] ?? '';
            if (!$k) {
                $theTitle = $fieldValue;
            } else {
                $theContent .= $fieldValue . ' ';
            }
        }
        // Modification / creation time of the record, NULL when the table has no such field:
        $mtime = isset($ctrl['tstamp']) ? ($r[$ctrl['tstamp']] ?? null) : null;
        $crdate = isset($ctrl['crdate']) ? ($r[$ctrl['crdate']] ?? null) : null;
        // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
        // A space is put in front of every tag so words separated by tags only do not get glued together.
        $indexerObj->backend_indexAsTYPO3Page(
            strip_tags(str_replace('<', ' <', $theTitle)),
            '',
            '',
            strip_tags(str_replace('<', ' <', $theContent)),
            'utf-8',
            $mtime,
            $crdate,
            $r['uid']
        );
    }
684
685
    /**
     * Collects the page uids of the rootline that belongs to the closest TypoScript template root.
     * Algorithm same as used in Web > Template, Object browser
     *
     * @param int $id The page id to traverse rootline back from
     * @return array Uid values of the rootline, keyed like the template service's rootline; empty if a RootLineException was thrown.
     */
    public function getUidRootLineForClosestTemplate($id)
    {
        $uidsByLevel = [];
        try {
            // Resolve the rootline of the page and let the template service walk through it,
            // which generates the constants/config + hierarchy info for the template.
            $pageRootLine = GeneralUtility::makeInstance(RootlineUtility::class, $id)->get();
            $templateService = GeneralUtility::makeInstance(\TYPO3\CMS\Core\TypoScript\ExtendedTemplateService::class);
            $templateService->runThroughTemplates($pageRootLine);
            // Keep only the uid of every rootline entry, preserving the entry's key.
            foreach ($templateService->rootLine as $level => $pageRow) {
                $uidsByLevel[$level] = $pageRow['uid'];
            }
        } catch (RootLineException $e) {
            // Deliberately ignored: the uids collected so far (none) are returned.
        }
        return $uidsByLevel;
    }
710
711
    /**
     * Generate the unix time stamp for next visit.
     *
     * @param array $cfgRec Index configuration record; the keys "timer_frequency", "timer_offset" and "timer_next_indexing" are read
     * @return int The next time stamp
     */
    public function generateNextIndexingTime($cfgRec)
    {
        $currentTime = $GLOBALS['EXEC_TIME'];
        // Now, find a midnight time to use for offset calculation. This has to differ depending on whether
        // we have frequencies within a day or more than a day: less than a day, we don't care which day to
        // use for offset; more than a day, we want to respect the currently entered day as offset regardless
        // of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected.
        if ($cfgRec['timer_frequency'] <= 24 * 3600) {
            $aMidNight = mktime(0, 0, 0) - 1 * 24 * 3600;
        } else {
            $lastTime = (int)($cfgRec['timer_next_indexing'] ?: $GLOBALS['EXEC_TIME']);
            // The four-digit year ("Y") is passed on purpose: a two-digit year would depend on
            // mktime()'s remapping of the values 0-69 and 70-100.
            $aMidNight = mktime(0, 0, 0, (int)date('n', $lastTime), (int)date('j', $lastTime), (int)date('Y', $lastTime));
        }
        // Find last offset time plus frequency in seconds:
        $lastSureOffset = $aMidNight + MathUtility::forceIntegerInRange($cfgRec['timer_offset'], 0, 86400);
        $frequencySeconds = MathUtility::forceIntegerInRange($cfgRec['timer_frequency'], 1);
        // Now, find out how many blocks of the length of frequency there is until the next time.
        // ceil() yields a float, so it is cast back to keep the documented integer return value.
        $frequencyBlocksUntilNextTime = (int)ceil(($currentTime - $lastSureOffset) / $frequencySeconds);
        // Set next time to the offset + the frequency blocks multiplied with the frequency length in seconds.
        return (int)($lastSureOffset + $frequencyBlocksUntilNextTime * $frequencySeconds);
    }
735
736
    /**
     * Tells whether $url starts with one of the URLs listed in $url_deny.
     *
     * @param string $url URL to test
     * @param string $url_deny String where URLs are separated by line-breaks; if any of these strings is the first part of $url, the function returns TRUE (to indicate denial of descent)
     * @return bool TRUE if there is a matching URL (hence, do not index!)
     */
    public function checkDeniedSuburls($url, $url_deny)
    {
        // Without a usable deny list nothing can be denied.
        if (!trim($url_deny)) {
            return false;
        }
        // One denied URL prefix per line; empty lines are dropped.
        foreach (GeneralUtility::trimExplode(LF, $url_deny, true) as $deniedPrefix) {
            if (GeneralUtility::isFirstPartOfStr($url, $deniedPrefix)) {
                return true;
            }
        }
        return false;
    }
755
756
    /**
     * Registers a call-back entry for this hook in the queue of the parent object.
     *
     * @param array $cfgRec Configuration record; its "uid", "set_id" and "pid" values are used
     * @param string $title Title/URL
     */
    public function addQueueEntryForHook($cfgRec, $title)
    {
        $configurationUid = $cfgRec['uid'];
        $queueParameters = [];
        // This must ALWAYS be the cfgRec uid!
        $queueParameters['indexConfigUid'] = $configurationUid;
        $queueParameters['url'] = $title;
        $queueParameters['procInstructions'] = ['[Index Cfg UID#' . $configurationUid . ']'];
        $this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParameters, $this->callBack, $cfgRec['pid']);
    }
772
773
    /**
     * Removes the rows of all indexed search tables that belong to a given page.
     *
     * The phash values of the page are looked up in "index_section"; every row carrying one of
     * them is then deleted from each index table listed below.
     *
     * @param int $id Uid of the page to delete all pHash
     */
    public function deleteFromIndex($id)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Look up the phash rows currently stored for the page.
        $lookupBuilder = $connectionPool->getQueryBuilderForTable('index_section');
        $pageConstraint = $lookupBuilder->expr()->eq(
            'page_id',
            $lookupBuilder->createNamedParameter($id, \PDO::PARAM_INT)
        );
        $oldPhashRows = $lookupBuilder->select('phash')
            ->from('index_section')
            ->where($pageConstraint)
            ->execute()
            ->fetchAll();

        // Nothing is indexed for this page, so there is nothing to delete.
        if (empty($oldPhashRows)) {
            return;
        }

        $phashList = array_column($oldPhashRows, 'phash');
        $indexTables = [
            'index_debug',
            'index_fulltext',
            'index_grlist',
            'index_phash',
            'index_rel',
            'index_section',
        ];
        foreach ($indexTables as $indexTable) {
            $deleteBuilder = $connectionPool->getQueryBuilderForTable($indexTable);
            $phashConstraint = $deleteBuilder->expr()->in(
                'phash',
                $deleteBuilder->createNamedParameter($phashList, Connection::PARAM_INT_ARRAY)
            );
            $deleteBuilder->delete($indexTable)
                ->where($phashConstraint)
                ->execute();
        }
    }
823
824
    /*************************
825
     *
826
     * Hook functions for DataHandler (indexing of records)
827
     *
828
     *************************/
829
    /**
     * DataHandler command hook that cleans up the index when a page is deleted.
     *
     * Only the command "delete" on the table "pages" triggers any action; every other
     * command/table combination is ignored.
     *
     * @param string $command DataHandler command
     * @param string $table Table name
     * @param string $id Record ID, handed on to deleteFromIndex() as the page uid
     * @param mixed $value Target value (ignored)
     * @param DataHandler $pObj DataHandler calling object (not used)
     */
    public function processCmdmap_preProcess($command, $table, $id, $value, $pObj)
    {
        $isPageDeletion = $command === 'delete' && $table === 'pages';
        if (!$isPageDeletion) {
            return;
        }
        // Clean up the index
        $this->deleteFromIndex($id);
    }
845
846
    /**
     * DataHandler hook function for on-the-fly indexing of database records
     *
     * A page that was just updated to "hidden" or "no_search" gets its index data removed.
     * Afterwards the record is indexed once for every matching "index_config" row
     * (see the constraints in the method body).
     *
     * @param string $status Status "new" or "update"
     * @param string $table Table name
     * @param string $id Record ID. If new record its a string pointing to index inside \TYPO3\CMS\Core\DataHandling\DataHandler::substNEWwithIDs
     * @param array $fieldArray Field array of updated fields in the operation
     * @param DataHandler $pObj DataHandler calling object
     */
    public function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, $pObj)
    {
        // Without any updated field there is nothing to do.
        if (empty($fieldArray)) {
            return;
        }

        if ($status === 'new') {
            // Translate the placeholder id of a new record into its real id.
            $id = $pObj->substNEWwithIDs[$id];
        } elseif ($table === 'pages' && $status === 'update') {
            $isNowHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
            $isNowExcludedFromSearch = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;
            if ($isNowHidden || $isNowExcludedFromSearch) {
                // If the page should be hidden or not indexed after update, delete index for this page
                $this->deleteFromIndex($id);
            }
        }

        // Get the full record; if it does not exist there is nothing to index.
        $currentRecord = BackendUtility::getRecord($table, $id);
        if (!is_array($currentRecord)) {
            return;
        }

        // Select all (not running) indexing configurations of type "record" (1) which point to this
        // table and are located on the same page as the record or point to the right source PID.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_config');
        $expr = $queryBuilder->expr();

        $notRunning = $expr->eq('set_id', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT));
        $isRecordType = $expr->eq('type', $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT));
        $pointsToTable = $expr->eq(
            'table2index',
            $queryBuilder->createNamedParameter($table, \PDO::PARAM_STR)
        );
        // Either no alternative source page is configured and the configuration sits on the record's page ...
        $locatedOnRecordPage = $expr->andX(
            $expr->eq(
                'alternative_source_pid',
                $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)
            ),
            $expr->eq(
                'pid',
                $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
            )
        );
        // ... or the alternative source page is the record's page.
        $sourcePidIsRecordPage = $expr->eq(
            'alternative_source_pid',
            $queryBuilder->createNamedParameter($currentRecord['pid'], \PDO::PARAM_INT)
        );
        $indexOnChange = $expr->eq(
            'records_indexonchange',
            $queryBuilder->createNamedParameter(1, \PDO::PARAM_INT)
        );

        $result = $queryBuilder->select('*')
            ->from('index_config')
            ->where(
                $notRunning,
                $isRecordType,
                $pointsToTable,
                $expr->orX($locatedOnRecordPage, $sourcePidIsRecordPage),
                $indexOnChange
            )
            ->execute();

        while ($indexConfiguration = $result->fetch()) {
            $this->indexSingleRecord($currentRecord, $indexConfiguration);
        }
    }
913
}
914