Completed
Push — master ( ee3b45...4fa3ae )
by
unknown
16:22
created

Indexer::setExtHashes()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 13
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 7
nc 1
nop 2
dl 0
loc 13
rs 10
c 0
b 0
f 0
1
<?php
2
namespace TYPO3\CMS\IndexedSearch;
3
4
/*
5
 * This file is part of the TYPO3 CMS project.
6
 *
7
 * It is free software; you can redistribute it and/or modify it under
8
 * the terms of the GNU General Public License, either version 2
9
 * of the License, or any later version.
10
 *
11
 * For the full copyright and license information, please read the
12
 * LICENSE.txt file that was distributed with this source code.
13
 *
14
 * The TYPO3 project - inspiring people to share!
15
 */
16
17
use TYPO3\CMS\Core\Configuration\ExtensionConfiguration;
18
use TYPO3\CMS\Core\Core\Environment;
19
use TYPO3\CMS\Core\Database\Connection;
20
use TYPO3\CMS\Core\Database\ConnectionPool;
21
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
22
use TYPO3\CMS\Core\Utility\GeneralUtility;
23
use TYPO3\CMS\Core\Utility\MathUtility;
24
use TYPO3\CMS\Core\Utility\PathUtility;
25
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
26
use TYPO3\CMS\IndexedSearch\Utility\IndexedSearchUtility;
27
28
/**
29
 * Indexing class for TYPO3 frontend
30
 */
31
class Indexer
32
{
33
34
    /**
35
     * @var array
36
     */
37
    public $reasons = [
38
        -1 => 'mtime matched the document, so no changes detected and no content updated',
39
        -2 => 'The minimum age was not exceeded',
40
        1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.',
41
        2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
42
        3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
43
        4 => 'Page has never been indexed (is not represented in the index_phash table).'
44
    ];
45
46
    /**
47
     * HTML code blocks to exclude from indexing
48
     *
49
     * @var string
50
     */
51
    public $excludeSections = 'script,style';
52
53
    /**
54
     * Supported Extensions for external files
55
     *
56
     * @var array
57
     */
58
    public $external_parsers = [];
59
60
    /**
61
     * External parser objects, keys are file extension names. Values are objects with certain methods.
62
     * Fe-group list (pages might be indexed separately for each usergroup combination to support search
63
     * in access limited pages!)
64
     *
65
     * @var string
66
     */
67
    public $defaultGrList = '0,-1';
68
69
    /**
70
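     * Maximum age in seconds of an indexed document; regardless of mtime the document is re-indexed once this limit is exceeded.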
71
     *
72
     * @var int
73
     */
74
    public $tstamp_maxAge = 0;
75
76
    /**
77
     * If set, this tells a minimum limit in seconds before a document can be indexed again.
78
     * This is regardless of mtime.
79
     *
80
     * @var int
81
     */
82
    public $tstamp_minAge = 0;
83
84
    /**
85
     * Max number of external files to index.
86
     *
87
     * @var int
88
     */
89
    public $maxExternalFiles = 0;
90
91
    /**
92
     * If set, the page is indexed regardless of the mtime and gr_list checks.
93
     *
94
     * @var bool
95
     */
96
    public $forceIndexing = false;
97
98
    /**
99
     * Default (empty) content parts: title, description, keywords and body.
100
     *
101
     * @var array
102
     */
103
    public $defaultContentArray = [
104
        'title' => '',
105
        'description' => '',
106
        'keywords' => '',
107
        'body' => ''
108
    ];
109
110
    /**
111
     * @var int
112
     */
113
    public $wordcount = 0;
114
115
    /**
116
     * @var int
117
     */
118
    public $externalFileCounter = 0;
119
120
    /**
121
     * @var array
122
     */
123
    public $conf = [];
124
125
    /**
126
     * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search
127
     *
128
     * @var array
129
     */
130
    public $indexerConfig = [];
131
132
    /**
133
     * Hash array, contains phash and phash_grouping
134
     *
135
     * @var array
136
     */
137
    public $hash = [];
138
139
    /**
140
     * Hash array for files
141
     *
142
     * @var array
143
     */
144
    public $file_phash_arr = [];
145
146
    /**
147
     * Content of TYPO3 page, split into the parts title, description, keywords and body
148
     *
149
     * @var array
150
     */
151
    public $contentParts = [];
152
153
    /**
154
     * Hash of the concatenated content parts of the TYPO3 page
155
     *
156
     * @var string
157
     */
158
    public $content_md5h = '';
159
160
    /**
161
     * @var array
162
     */
163
    public $internal_log = [];
164
165
    /**
166
     * Content fetched for the external URL that is being indexed
167
     *
168
     * @var string
169
     */
170
    public $indexExternalUrl_content = '';
171
172
    /**
173
     * @var int
174
     */
175
    public $freqRange = 32000;
176
177
    /**
178
     * @var float
179
     */
180
    public $freqMax = 0.1;
181
182
    /**
183
     * @var bool
184
     */
185
    public $enableMetaphoneSearch = false;
186
187
    /**
188
     * @var bool
189
     */
190
    public $storeMetaphoneInfoAsWords;
191
192
    /**
193
     * @var string
194
     */
195
    public $metaphoneContent = '';
196
197
    /**
198
     * Metaphone object, if any
199
     *
200
     * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility
201
     */
202
    public $metaphoneObj;
203
204
    /**
205
     * Lexer object for word splitting
206
     *
207
     * @var \TYPO3\CMS\IndexedSearch\Lexer
208
     */
209
    public $lexerObj;
210
211
    /**
212
     * @var int
213
     */
214
    public $flagBitMask;
215
216
    /**
217
     * @var TimeTracker
218
     */
219
    protected $timeTracker;
220
221
    /**
222
     * Indexer constructor.
223
     */
224
    public function __construct()
    {
        $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
        // Indexer configuration from Extension Manager interface
        $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
        // minAge / maxAge are multiplied by 3600 before being stored and are never negative.
        $this->tstamp_minAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
        $this->tstamp_maxAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
        // Like minAge/maxAge above, a missing key falls back to 0 instead of triggering an "undefined index" notice.
        $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 0, 0, 1000, 5);
        // The stored value is an integer bit mask limited to the range 0-255.
        $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
        // Workaround: If the extension configuration was not updated yet, the value is not existing
        $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
        // Metaphone info is only stored as words when the index_words table is not in use.
        $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
    }
237
238
    /********************************
239
     *
240
     * Initialization
241
     *
242
     *******************************/
243
244
    /**
245
     * Initializes the object.
246
     * @param array|null $configuration will be used to set $this->conf, otherwise $this->conf MUST be set with proper values prior to this call
247
     */
248
    public function init(?array $configuration = null)
    {
        // An explicitly passed configuration replaces $this->conf; otherwise the caller must have set it already.
        if (is_array($configuration)) {
            $this->conf = $configuration;
        }
        // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
        $this->setT3Hashes();
        // Initialize external document parsers:
        // Example configuration, see ext_localconf.php of this file!
        if (!empty($this->conf['index_externals'])) {
            $this->initializeExternalParsers();
        }
        // Initialize lexer (class that deconstructs the text into words).
        // A missing or empty registration falls back to the default Lexer class.
        $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? '') ?: Lexer::class;
        $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
        $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? false;
        // Initialize metaphone hook:
        // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
        $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? '';
        if ($this->enableMetaphoneSearch && $metaphoneClassName) {
            $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
            // Give the metaphone object a back reference to this indexer.
            $this->metaphoneObj->pObj = $this;
        }
    }
271
272
    /**
273
     * Initialize external parsers
274
     *
275
     * @internal
276
     * @see init()
277
     */
278
    public function initializeExternalParsers()
    {
        // Registered parser classes, keyed by the file extension they handle.
        $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
        foreach ($registeredParsers as $fileExtension => $parserClassName) {
            $parser = GeneralUtility::makeInstance($parserClassName);
            // Register the parser first, then hand it a back reference to this indexer.
            $this->external_parsers[$fileExtension] = $parser;
            $parser->pObj = $this;
            if ($parser->initParser($fileExtension)) {
                continue;
            }
            // The parser declined the extension: drop its entry again.
            unset($this->external_parsers[$fileExtension]);
        }
    }
289
290
    /********************************
291
     *
292
     * Indexing; TYPO3 pages (HTML content)
293
     *
294
     *******************************/
295
    /**
296
     * Start indexing of the TYPO3 page
297
     */
298
    public function indexTypo3PageContent()
    {
        // Decide first whether any (re-)indexing work is required.
        $mtimeCheck = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
        $grListIsSet = $this->is_grlist_set($this->hash['phash']);
        $indexingNeeded = $mtimeCheck > 0 || !$grListIsSet || $this->forceIndexing;
        if (!$indexingNeeded) {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$mtimeCheck]);
            return;
        }

        // Log why the page is indexed.
        if ($this->forceIndexing) {
            $this->log_setTSlogMessage('Indexing needed, reason: Forced', 1);
        } elseif ($mtimeCheck > 0) {
            $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$mtimeCheck], 1);
        } else {
            $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!', 1);
        }

        // Divide the page into title, keywords, description and body.
        $this->log_push('Split content', '');
        $this->contentParts = $this->splitHTMLContent($this->conf['content']);
        if ($this->conf['indexedDocTitle']) {
            // An explicitly configured document title wins over the parsed one.
            $this->contentParts['title'] = $this->conf['indexedDocTitle'];
        }
        $this->log_pull();

        // Hash over all content parts (title, description and keywords included on purpose,
        // so that a changed page title is detected as well).
        $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

        // Look for an already indexed page with the very same content hash. When one exists (and the
        // max-age reason "1" does not apply) only the bookkeeping data has to be refreshed.
        $existingPage = $this->checkContentHash();
        if (is_array($existingPage) && $mtimeCheck !== 1) {
            // Update the timestamp
            $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
            $this->updateSetId($this->hash['phash']);
            // $existingPage['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
            $this->update_grlist($existingPage['phash'], $this->hash['phash']);
            $this->updateRootline();
            $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
            return;
        }

        // Full indexing run; the start time is kept to record the parse time afterwards.
        $startTime = GeneralUtility::milliseconds();

        $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
        $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
        $this->log_pull();

        // Splitting words
        $this->log_push('Extract words from content', '');
        $splitInWords = $this->processWordsInArrays($this->contentParts);
        $this->log_pull();

        // Analyze the indexed words.
        $this->log_push('Analyze the extracted words', '');
        $indexArr = $this->indexAnalyze($splitInWords);
        $this->log_pull();

        // Submitting page (phash) record
        $this->log_push('Submitting page', '');
        $this->submitPage();
        $this->log_pull();

        // Check words and submit to word list if not there
        $this->log_push('Check word list and submit words', '');
        if (IndexedSearchUtility::isTableUsed('index_words')) {
            $this->checkWordList($indexArr);
            $this->submitWords($indexArr, $this->hash['phash']);
        }
        $this->log_pull();

        // Set parsetime
        $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $startTime);

        // Checking external files if configured for.
        $this->log_push('Checking external files', '');
        if ($this->conf['index_externals']) {
            $this->extractLinks($this->conf['content']);
        }
        $this->log_pull();
    }
369
370
    /**
371
     * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
372
     *
373
     * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
374
     * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
375
     * @see splitRegularContent()
376
     */
377
    public function splitHTMLContent($content)
    {
        $contentArr = $this->defaultContentArray;
        // Divide head from body: everything from the first "<body" on is the body part.
        $bodyPart = stristr($content, '<body');
        if ($bodyPart === false) {
            // No body tag at all: treat the whole document as head. Cutting with a length of
            // "-0" would return an empty head and silently drop title and meta tags.
            $contentArr['body'] = '';
            $headPart = (string)$content;
        } else {
            $contentArr['body'] = $bodyPart;
            $headPart = (string)substr($content, 0, -strlen($bodyPart));
        }
        // Get title; if it contains a ":" only the part after the first colon is used.
        $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $dummy2, $dummy);
        $titleParts = explode(':', $contentArr['title'], 2);
        $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
        // Get keywords and description metatags
        if (!empty($this->conf['index_metatags'])) {
            $metaParams = '';
            // Each successful call removes the found meta tag from $headPart.
            while ($this->embracingTags($headPart, 'meta', $dummy, $headPart, $metaParams)) {
                $attributes = GeneralUtility::get_tag_attributes($metaParams);
                // Meta tags without "name" or "content" (e.g. charset or property tags) are tolerated.
                $metaName = $attributes['name'] ?? '';
                $metaContent = $attributes['content'] ?? '';
                if (stripos($metaName, 'keywords') !== false) {
                    $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
                }
                if (stripos($metaName, 'description') !== false) {
                    $contentArr['description'] .= ',' . $metaContent;
                }
            }
        }
        // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
        $this->typoSearchTags($contentArr['body']);
        // Get rid of unwanted sections (ie. scripting and style stuff) in body
        $tagList = explode(',', $this->excludeSections);
        foreach ($tagList as $tag) {
            // Every iteration strips one occurrence of the tag from the body.
            while ($this->embracingTags($contentArr['body'], $tag, $dummy, $contentArr['body'], $dummy2)) {
            }
        }
        // Remove tags, but first make sure we don't concatenate words by doing it
        $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
        $contentArr['body'] = trim(strip_tags($contentArr['body']));
        $contentArr['keywords'] = trim($contentArr['keywords']);
        $contentArr['description'] = trim($contentArr['description']);

        return $contentArr;
    }
421
422
    /**
423
     * Extract the charset value from HTML meta tag.
424
     *
425
     * @param string $content HTML content
426
     * @return string The charset value if found.
427
     */
428
    public function getHTMLcharset($content)
    {
        // Step 1: locate the <meta http-equiv="CONTENT-TYPE" ...> tag (case-insensitive).
        $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
        if (!preg_match($metaTagPattern, $content, $metaTagMatch)) {
            return '';
        }
        // Step 2: pull the charset value out of the matched tag.
        $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';
        if (!preg_match($charsetPattern, $metaTagMatch[0], $charsetMatch)) {
            return '';
        }

        return $charsetMatch[1];
    }
438
439
    /**
440
     * Converts a HTML document to utf-8
441
     *
442
     * @param string $content HTML content, any charset
443
     * @param string $charset Optional charset (otherwise extracted from HTML)
444
     * @return string Converted HTML
445
     */
446
    public function convertHTMLToUtf8($content, $charset = '')
    {
        // Without an explicit charset, fall back to the one declared in the HTML itself.
        if (!$charset) {
            $charset = $this->getHTMLcharset($content);
        }
        $normalizedCharset = trim(strtolower($charset));
        // Only convert when a charset is known and it is not utf-8 already.
        $needsConversion = $normalizedCharset && $normalizedCharset !== 'utf-8';
        if ($needsConversion) {
            $content = mb_convert_encoding($content, 'utf-8', $normalizedCharset);
        }

        // Convert entities, assuming document is now UTF-8:
        return html_entity_decode($content);
    }
458
459
    /**
460
     * Finds first occurrence of embracing tags and returns the embraced content and the original string with
461
     * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
462
     * <title> of document or removing <script>-sections
463
     *
464
     * @param string $string String to search in
465
     * @param string $tagName Tag name, eg. "script
466
     * @param string $tagContent Passed by reference: Content inside found tag
467
     * @param string $stringAfter Passed by reference: Content after found tag
468
     * @param string $paramList Passed by reference: Attributes of the found tag.
469
     * @return bool Returns FALSE if tag was not found, otherwise TRUE.
470
     */
471
    public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
    {
        $endTag = '</' . $tagName . '>';
        $startTag = '<' . $tagName;
        // stristr used because we want a case-insensitive search for the tag.
        $fromStartTag = stristr($string, $startTag);
        // If the tag was not found, return FALSE and leave the reference arguments untouched.
        if (!$fromStartTag) {
            return false;
        }
        // Split "<attributes>" from everything behind the closing ">" of the start tag.
        // An unterminated start tag (no ">") yields an empty remainder instead of an undefined offset.
        $tagParts = explode('>', (string)substr($fromStartTag, strlen($startTag)), 2);
        $paramList = $tagParts[0];
        $afterStartTag = $tagParts[1] ?? '';
        $fromEndTag = stristr($afterStartTag, $endTag);
        if ($fromEndTag) {
            // End tag found: return the embraced content and the input with the whole element cut out.
            $stringBefore = substr($string, 0, stripos($string, $startTag));
            $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
            $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
        } else {
            // No end tag: no content, and only the text behind the start tag is handed back.
            $tagContent = '';
            $stringAfter = $afterStartTag;
        }
        return true;
    }
493
494
    /**
495
     * Removes content that shouldn't be indexed according to TYPO3SEARCH-tags.
496
     *
497
     * @param string $body HTML Content, passed by reference
498
     * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
499
     */
500
    public function typoSearchTags(&$body)
    {
        // Split at every "<!--TYPO3SEARCH_" marker (an optional whitespace character after "<!--" is allowed).
        $expBody = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', (string)$body);
        // preg_split() returns FALSE on a PCRE error; treat that like "no marker found" and keep $body untouched.
        if ($expBody === false || count($expBody) <= 1) {
            return false;
        }
        $body = '';
        $prev = '';
        foreach ($expBody as $val) {
            $part = explode('-->', $val, 2);
            $marker = trim($part[0]);
            if ($marker === 'begin') {
                // Content behind a "begin" marker is kept (empty if the marker comment is not closed).
                $body .= $part[1] ?? '';
                $prev = '';
            } elseif ($marker === 'end') {
                // An "end" marker keeps the pending unmarked chunk that preceded it.
                $body .= $prev;
            } else {
                // Unmarked chunk: remember it in case an "end" marker follows.
                $prev = $val;
            }
        }
        return true;
    }
521
522
    /**
523
     * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
524
     *
525
     * @param string $content HTML content
526
     */
527
    public function extractLinks($content)
    {
        $crawler = null;
        // Get links:
        $list = $this->extractHyperLinks($content);
        if (!empty($this->indexerConfig['useCrawlerForExternalFiles'])
            && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')
        ) {
            /**
             * todo: remove dependency to class tx_crawler_lib
             * @link https://forge.typo3.org/issues/83603
             */
            $crawler = GeneralUtility::makeInstance('tx_crawler_lib');
        }
        // Traverse links:
        foreach ($list as $linkInfo) {
            // localPath means: This file is sent by a download script. While the indexed URL has to point
            // to $linkInfo['href'], the absolute path to the file is specified here!
            $localPath = $linkInfo['localPath'] ?? '';
            // Decode entities:
            $linkSource = htmlspecialchars_decode($localPath ?: $linkInfo['href']);
            // Parse URL; parse_url() returns FALSE for seriously malformed URLs.
            $qParts = parse_url($linkSource) ?: [];
            $query = $qParts['query'] ?? '';
            // Check for jumpurl (TYPO3 specific thing...)
            if ($query && strpos($query, 'jumpurl=') !== false) {
                parse_str($query, $getP);
                // The substring test above also matches other parameter names ending in "jumpurl",
                // so only follow the target when the parameter really exists as a string.
                if (isset($getP['jumpurl']) && is_string($getP['jumpurl'])) {
                    $linkSource = $getP['jumpurl'];
                    $qParts = parse_url($linkSource) ?: [];
                    $query = $qParts['query'] ?? '';
                }
            }
            if (!$localPath && !empty($qParts['scheme'])) {
                if (!empty($this->indexerConfig['indexExternalURLs'])) {
                    // Index external URL (http or otherwise)
                    $this->indexExternalUrl($linkSource);
                }
                continue;
            }
            if ($query) {
                // Local links carrying a query string are not treated as files.
                continue;
            }
            $linkSource = urldecode($linkSource);
            if (GeneralUtility::isAllowedAbsPath($linkSource)) {
                $localFile = $linkSource;
            } else {
                $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
            }
            if (!$localFile || !@is_file($localFile)) {
                continue;
            }
            // Index local file:
            if (is_object($crawler)) {
                // Hand the file over to the crawler queue instead of indexing it right away.
                $params = ['document' => $linkSource];
                if ($localPath) {
                    $params['alturl'] = $linkInfo['href'];
                }
                $params['conf'] = $this->conf;
                // The page content is not passed along with the queue entry.
                unset($params['conf']['content']);
                $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
                $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
            } elseif ($localPath) {
                // Files without an extension are passed on with an empty extension string.
                $fileInfo = pathinfo($linkSource);
                $ext = strtolower($fileInfo['extension'] ?? '');
                $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
            } else {
                $this->indexRegularDocument($linkSource);
            }
        }
    }
602
603
    /**
604
     * Extracts all links to external documents from the HTML content string
605
     *
606
     * @param string $html
607
     * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
608
     * @see extractLinks()
609
     */
610
    public function extractHyperLinks($html)
    {
        $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
        $htmlParts = $htmlParser->splitTags('a', $html);
        $hyperLinksData = [];
        foreach ($htmlParts as $index => $tagData) {
            // Only the odd entries of the split result are inspected as tags.
            if ($index % 2 === 0) {
                continue;
            }
            $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
            $firstTagName = $htmlParser->getFirstTagName($tagData);
            if (strtolower($firstTagName) !== 'a') {
                continue;
            }
            // Anchors without a href (e.g. <a name="...">) and pure fragment links ("#...") are skipped.
            $href = $tagAttributes[0]['href'] ?? '';
            if ($href && $href[0] !== '#') {
                $hyperLinksData[] = [
                    'tag' => $tagData,
                    'href' => $href,
                    'localPath' => $this->createLocalPath($href)
                ];
            }
        }
        return $hyperLinksData;
    }
632
633
    /**
634
     * Extracts the "base href" from content string.
635
     *
636
     * @param string $html Content to analyze
637
     * @return string The base href or an empty string if not found
638
     */
639
    public function extractBaseHref($html)
    {
        $href = '';
        $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
        $htmlParts = $htmlParser->splitTags('base', $html);
        foreach ($htmlParts as $index => $tagData) {
            // Only the odd entries of the split result are inspected as tags.
            if ($index % 2 === 0) {
                continue;
            }
            $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
            $firstTagName = $htmlParser->getFirstTagName($tagData);
            if (strtolower($firstTagName) !== 'base') {
                continue;
            }
            // A <base> tag without href (e.g. only "target") keeps the result an empty string, not NULL.
            $href = $tagAttributes[0]['href'] ?? '';
            if ($href) {
                // The first non-empty base href wins.
                break;
            }
        }
        return $href;
    }
658
659
    /******************************************
660
     *
661
     * Indexing; external URL
662
     *
663
     ******************************************/
664
    /**
665
     * Index External URLs HTML content
666
     *
667
     * @param string $externalUrl URL, eg. "http://typo3.org/
668
     * @see indexRegularDocument()
669
     */
670
    public function indexExternalUrl($externalUrl)
    {
        // Get headers; getUrlHeaders() is documented to return FALSE when there is no answer.
        $urlHeaders = $this->getUrlHeaders($externalUrl);
        if (!is_array($urlHeaders) || stripos($urlHeaders['Content-Type'] ?? '', 'text/html') === false) {
            // Only HTML documents are indexed here.
            return;
        }
        // A failed download (non-string result) is stored as an empty string so that the
        // property keeps holding a string.
        $content = GeneralUtility::getUrl($externalUrl);
        $content = is_string($content) ? $content : '';
        $this->indexExternalUrl_content = $content;
        if ($content === '') {
            return;
        }
        // Create temporary file:
        $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
        if (!$tmpFile) {
            return;
        }
        try {
            GeneralUtility::writeFile($tmpFile, $content);
            // Index that file. Using "TRUE" for second parameter to force indexing of external URLs
            // (mtime doesn't make sense, does it?)
            $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
        } finally {
            // Remove the temporary file even when indexing throws.
            unlink($tmpFile);
        }
    }
689
690
    /**
     * Getting HTTP request headers of URL
     *
     * Each response line is split at its first colon into header name and
     * header value. Lines without a colon (such as a status line) are kept as
     * a key with an empty string value.
     *
     * @param string $url The URL
     * @return array|false If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
     */
    public function getUrlHeaders($url)
    {
        // Try to get the headers only
        $content = GeneralUtility::getUrl($url, 2);
        if ((string)$content === '') {
            // Return the documented FALSE explicitly instead of falling off the end of the method.
            return false;
        }
        // Compile headers:
        $headers = GeneralUtility::trimExplode(LF, $content, true);
        $retVal = [];
        foreach ($headers as $line) {
            if (trim($line) === '') {
                break;
            }
            // Pad the result so that a line without ":" does not trigger an undefined offset.
            [$headKey, $headValue] = array_pad(explode(':', $line, 2), 2, '');
            // Strip the optional whitespace around the field name and value.
            $retVal[trim($headKey)] = trim($headValue);
        }
        return $retVal;
    }
714
715
    /**
     * Checks if the file is local
     *
     * Runs the path resolution strategies one after another and stops at the
     * first one that produces a non-empty path.
     *
     * @param string $sourcePath
     * @return string Absolute path to file if file is local, else empty string
     */
    protected function createLocalPath($sourcePath)
    {
        $resolvedPath = '';
        // Order matters: the strategies are tried from first to last.
        $strategies = [
            'createLocalPathUsingAbsRefPrefix',
            'createLocalPathUsingDomainURL',
            'createLocalPathFromAbsoluteURL',
            'createLocalPathFromRelativeURL'
        ];
        foreach ($strategies as $strategy) {
            $resolvedPath = $this->{$strategy}($sourcePath);
            if ($resolvedPath != '') {
                return $resolvedPath;
            }
        }
        // No strategy matched; this is the (empty) result of the last attempt.
        return $resolvedPath;
    }
738
739
    /**
     * Attempts to create a local file path by matching a current request URL.
     *
     * The path is only returned if $sourcePath starts with the site URL and
     * the resulting file passes isAllowedLocalFile().
     *
     * @param string $sourcePath
     * @return string Local path, or an empty string if the URL does not belong to the current site
     */
    protected function createLocalPathUsingDomainURL($sourcePath)
    {
        $baseURL = (string)GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
        $baseURLLength = strlen($baseURL);
        // An empty needle makes strpos() warn on PHP 7 and match every string on PHP 8,
        // so rule it out first (same guard as in createLocalPathUsingAbsRefPrefix()).
        if ($baseURLLength === 0 || strpos($sourcePath, $baseURL) !== 0) {
            return '';
        }
        $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, $baseURLLength);
        return self::isAllowedLocalFile($localPath) ? $localPath : '';
    }
759
760
    /**
     * Attempts to create a local file path by matching absRefPrefix. This
     * requires TSFE. If TSFE is missing, this function does nothing.
     *
     * @param string $sourcePath
     * @return string Local path, or an empty string if no absRefPrefix is configured or it does not match
     */
    protected function createLocalPathUsingAbsRefPrefix($sourcePath)
    {
        if (!isset($GLOBALS['TSFE']) || !($GLOBALS['TSFE'] instanceof TypoScriptFrontendController)) {
            return '';
        }
        // The option may be absent from the configuration; treat that like an empty prefix.
        $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
        $absRefPrefixLength = strlen($absRefPrefix);
        if ($absRefPrefixLength === 0 || strpos($sourcePath, $absRefPrefix) !== 0) {
            return '';
        }
        $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, $absRefPrefixLength);
        return self::isAllowedLocalFile($localPath) ? $localPath : '';
    }
783
784
    /**
     * Attempts to create a local file path from the absolute URL without
     * schema.
     *
     * @param string $sourcePath
     * @return string Local path, or an empty string if $sourcePath does not start with "/" or is not an allowed local file
     */
    protected function createLocalPathFromAbsoluteURL($sourcePath)
    {
        // strpos() instead of $sourcePath[0]: reading offset 0 of an empty string is an error.
        if (strpos((string)$sourcePath, '/') !== 0) {
            return '';
        }
        $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
        return self::isAllowedLocalFile($localPath) ? $localPath : '';
    }
803
804
    /**
     * Attempts to create a local file path from the relative URL.
     *
     * The relative URL is appended to the public web path; the result is only
     * returned if it passes isAllowedLocalFile().
     *
     * @param string $sourcePath
     * @return string Local path or an empty string
     */
    protected function createLocalPathFromRelativeURL($sourcePath)
    {
        if (!self::isRelativeURL($sourcePath)) {
            return '';
        }
        $candidate = Environment::getPublicPath() . '/' . $sourcePath;
        return self::isAllowedLocalFile($candidate) ? $candidate : '';
    }
821
822
    /**
     * Checks if URL is relative.
     *
     * A URL counts as relative when it has no (non-empty) scheme and its path
     * does not start with a slash. URLs that cannot be parsed, or that have no
     * path at all, are reported as relative as well.
     *
     * @param string $url
     * @return bool
     */
    protected static function isRelativeURL($url)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            // parse_url() returns FALSE for seriously malformed URLs; handle it like a URL without components.
            $urlParts = [];
        }
        $hasScheme = isset($urlParts['scheme']) && $urlParts['scheme'] !== '';
        // The "path" component is optional, and may be an empty string.
        $path = (string)($urlParts['path'] ?? '');
        return !$hasScheme && strpos($path, '/') !== 0;
    }
833
834
    /**
     * Checks if the path points to the file inside the web site
     *
     * @param string $filePath
     * @return bool TRUE if the path (after resolving "../" segments) is an existing file below the public web path
     */
    protected static function isAllowedLocalFile($filePath)
    {
        $filePath = GeneralUtility::resolveBackPath($filePath);
        // Compare against the public path WITH a trailing slash. A bare prefix check would
        // also accept sibling directories, e.g. "/var/www/site-backup" for "/var/www/site".
        $publicPathPrefix = rtrim(Environment::getPublicPath(), '/') . '/';
        $insideWebPath = strpos($filePath, $publicPathPrefix) === 0;
        return $insideWebPath && is_file($filePath);
    }
847
848
    /******************************************
849
     *
850
     * Indexing; external files (PDF, DOC, etc)
851
     *
852
     ******************************************/
853
    /**
     * Indexing a regular document given as $file (relative to public web path, local file)
     *
     * The document is split into sections by the external parser registered
     * for its extension. Each section is indexed when checkMtimeTstamp()
     * reports a need for it or when $force is set; a section record is
     * submitted for every section regardless of whether it was re-indexed.
     *
     * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL
     * @param bool $force If set, indexing is forced (despite content hashes, mtime etc).
     * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL.
     * @param string $altExtension File extension for temporary file.
     */
    public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
    {
        // Init
        $fI = pathinfo($file);
        // pathinfo() has no "extension" key for names without a dot.
        $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
        // Create abs-path:
        if (!$contentTmpFile) {
            if (!GeneralUtility::isAbsPath($file)) {
                // Relative, prepend public web path:
                $absFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
            } else {
                // Absolute, pass-through:
                $absFile = $file;
            }
            $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
        } else {
            $absFile = $contentTmpFile;
        }
        if (!$absFile || !@is_file($absFile)) {
            $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
            return;
        }
        // empty() keeps the truthiness check but does not complain about unregistered extensions.
        if (empty($this->external_parsers[$ext])) {
            $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
            return;
        }
        // Indexing the document:
        $fileInfo = stat($absFile);
        $cParts = $this->fileContentParts($ext, $absFile);
        foreach ($cParts as $cPKey) {
            $this->internal_log = [];
            $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
            $Pstart = GeneralUtility::milliseconds();
            $subinfo = ['key' => $cPKey];
            // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
            $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
            $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
            if ($check > 0 || $force) {
                if ($check > 0) {
                    $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
                } else {
                    $this->log_setTSlogMessage('Indexing forced by flag', 1);
                }
                // Check external file counter:
                if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                    // Divide into title,keywords,description and body:
                    $this->log_push('Split content', '');
                    $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                    $this->log_pull();
                    if (is_array($contentParts)) {
                        // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                        $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                        if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                            // Increment counter:
                            $this->externalFileCounter++;
                            // Splitting words
                            $this->log_push('Extract words from content', '');
                            $splitInWords = $this->processWordsInArrays($contentParts);
                            $this->log_pull();
                            // Analyze the indexed words.
                            $this->log_push('Analyze the extracted words', '');
                            $indexArr = $this->indexAnalyze($splitInWords);
                            $this->log_pull();
                            // Submitting page (phash) record
                            $this->log_push('Submitting page', '');
                            // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
                            $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                            $this->log_pull();
                            // Check words and submit to word list if not there
                            $this->log_push('Check word list and submit words', '');
                            if (IndexedSearchUtility::isTableUsed('index_words')) {
                                $this->checkWordList($indexArr);
                                $this->submitWords($indexArr, $phash_arr['phash']);
                            }
                            $this->log_pull();
                            // Set parsetime
                            $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                        } else {
                            // Update the timestamp
                            $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                            $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                        }
                    } else {
                        $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                    }
                } else {
                    $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
                }
            } else {
                $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
            }
            // Checking and setting sections:
            // Setting a section-record for the file. This is done also if the file is not indexed.
            // Notice that section records are deleted when the page is indexed.
            $this->submitFile_section($phash_arr['phash']);
            $this->log_pull();
        }
    }
958
959
    /**
     * Reads the content of an external file being indexed.
     * The content from the external parser MUST be returned in utf-8!
     *
     * @param string $fileExtension File extension, eg. "pdf", "doc" etc.
     * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function)
     * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
     * @return array|null Standard content array (title, description, keywords, body keys) as delivered by the parser, NULL if no parser object is registered for the extension
     */
    public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
    {
        // Consult relevant external document parser; an unregistered extension simply has none.
        $parser = $this->external_parsers[$fileExtension] ?? null;
        if (!is_object($parser)) {
            return null;
        }
        return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
    }
977
978
    /**
     * Creates an array with pointers to divisions of document.
     *
     * @param string $ext File extension
     * @param string $absFile Absolute filename (must exist and be validated OK before calling function)
     * @return array Array of pointers to sections that the document should be divided into; [0] if no parser object is registered for the extension
     */
    public function fileContentParts($ext, $absFile)
    {
        // Consult relevant external document parser; an unregistered extension simply has none.
        $parser = $this->external_parsers[$ext] ?? null;
        if (!is_object($parser)) {
            // Default: one single section.
            return [0];
        }
        return $parser->fileContentParts($ext, $absFile);
    }
994
995
    /**
     * Splits non-HTML content (from external files for instance)
     *
     * Starts from a copy of the default content array and stores the whole
     * input as its "body" entry.
     *
     * @param string $content Input content (non-HTML) to index.
     * @return array Array of content, having the key "body" set (plus whatever keys the default content array provides)
     * @see splitHTMLContent()
     */
    public function splitRegularContent($content)
    {
        // Arrays are copied on assignment, so the default array itself stays untouched.
        $parts = $this->defaultContentArray;
        $parts['body'] = $content;

        return $parts;
    }
1008
1009
    /**********************************
1010
     *
1011
     * Analysing content, Extracting words
1012
     *
1013
     **********************************/
1014
    /**
     * Convert character set and HTML entities in the value of input content array keys
     *
     * Empty values are left untouched. All other values are converted to
     * utf-8 (unless $charset already is "utf-8") and have their HTML entities
     * decoded afterwards.
     *
     * @param array $contentArr Standard content array, modified in place
     * @param string $charset Charset of the input content (converted to utf-8)
     */
    public function charsetEntity2utf8(&$contentArr, $charset)
    {
        foreach ($contentArr as $key => $value) {
            if ((string)$value === '') {
                continue;
            }
            // Convert charset if necessary
            if ($charset !== 'utf-8') {
                $value = mb_convert_encoding($value, 'utf-8', $charset);
            }
            // Decode all numeric / html-entities in the string to real characters.
            // Flags and target encoding are passed explicitly: the default flags differ between
            // PHP versions and the default encoding depends on the "default_charset" ini setting,
            // while the value is known to be utf-8 at this point.
            $contentArr[$key] = html_entity_decode($value, ENT_QUOTES | ENT_HTML401, 'UTF-8');
        }
    }
1033
1034
    /**
     * Processing words in the array from split*Content -functions
     *
     * Every entry of the content array is run through the lexer, after which
     * duplicates are removed from the title, keywords and description entries.
     *
     * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent()
     * @return array Content input array modified so each key is now an array of words
     */
    public function processWordsInArrays($contentArr)
    {
        // Split all parts to words
        $wordLists = [];
        foreach ($contentArr as $section => $text) {
            $wordLists[$section] = $this->lexerObj->split2Words($text);
        }
        // For title, keywords, and description we don't want duplicates:
        foreach (['title', 'keywords', 'description'] as $section) {
            $wordLists[$section] = array_unique($wordLists[$section]);
        }

        return $wordLists;
    }
1053
1054
    /**
     * Extracts the sample description text from the content array.
     *
     * Whitespace runs in the body are collapsed to single spaces and the
     * result is cut to the configured length ("index_descrLgd", limited to
     * 0-255, default 200). A length of zero yields an empty description.
     *
     * @param array $contentArr Content array
     * @return string Description string
     */
    public function bodyDescription($contentArr)
    {
        $limit = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
        if (!$limit) {
            return '';
        }
        $collapsed = preg_replace('/\s+/u', ' ', $contentArr['body']);

        // Shorten the string:
        return mb_strcut($collapsed, 0, $limit, 'utf-8');
    }
1072
1073
    /**
     * Analyzes content to use for indexing,
     *
     * Feeds title, keywords and description through analyzeHeaderinfo() (with
     * the offsets 7, 6 and 5) and the body through analyzeBody(), all
     * collecting into one shared index array.
     *
     * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words.
     * @return array Index array as filled by analyzeHeaderinfo() and analyzeBody()
     */
    public function indexAnalyze($content)
    {
        $indexArr = [];
        // Content key => offset handed to analyzeHeaderinfo()
        $headerOffsets = [
            'title' => 7,
            'keywords' => 6,
            'description' => 5
        ];
        foreach ($headerOffsets as $contentKey => $offset) {
            $this->analyzeHeaderinfo($indexArr, $content, $contentKey, $offset);
        }
        $this->analyzeBody($indexArr, $content);

        return $indexArr;
    }
1088
1089
    /**
     * Calculates relevant information for headercontent
     *
     * For every word of $content[$key] the entry in $retArr gets a word hash
     * and metaphone value on first sight, the bit 2^$offset set in "cmp" and
     * its "count" increased by one.
     *
     * @param array $retArr Index array, passed by reference
     * @param array $content Standard content array
     * @param string $key Key from standard content array
     * @param int $offset Bit-wise priority to type
     */
    public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
    {
        // A missing content key is handled like an empty word list.
        foreach ($content[$key] ?? [] as $val) {
            // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
            // NOTE(review): substr() counts bytes, not characters - confirm against the code that looks words up
            // before changing this.
            $val = substr($val, 0, 60);
            if (!isset($retArr[$val])) {
                // Word ID (wid)
                $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
                // Metaphone value is also 60 only chars long
                $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
                $retArr[$val]['metaphone'] = $metaphone;
            }
            // Build metaphone fulltext string (can be used for fulltext indexing)
            if ($this->storeMetaphoneInfoAsWords) {
                $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
            }
            // Priority used for flagBitMask feature (see extension configuration).
            // "cmp" does not exist yet when the word is seen for the first time.
            $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | 2 ** $offset;
            // Increase number of occurrences ("count" starts out undefined as well)
            $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
            $this->wordcount++;
        }
    }
1120
1121
    /**
     * Calculates relevant information for bodycontent
     *
     * For every word of $content['body'] the entry in $retArr gets its first
     * position, word hash and metaphone value on first sight and its "count"
     * increased by one.
     *
     * @param array $retArr Index array, passed by reference
     * @param array $content Standard content array
     */
    public function analyzeBody(&$retArr, $content)
    {
        // A missing body is handled like an empty word list.
        foreach ($content['body'] ?? [] as $key => $val) {
            // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
            // NOTE(review): substr() counts bytes, not characters - confirm against the code that looks words up
            // before changing this.
            $val = substr($val, 0, 60);
            if (!isset($retArr[$val])) {
                // First occurrence (used for ranking results)
                $retArr[$val]['first'] = $key;
                // Word ID (wid)
                $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
                // Metaphone value is also only 60 chars long
                $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
                $retArr[$val]['metaphone'] = $metaphone;
            }
            // Build metaphone fulltext string (can be used for fulltext indexing)
            if ($this->storeMetaphoneInfoAsWords) {
                $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
            }
            // Increase number of occurrences ("count" does not exist yet for a new word)
            $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
            $this->wordcount++;
        }
    }
1150
1151
    /**
     * Creating metaphone based hash from input word
     *
     * Uses the configured metaphone object if there is one, PHP's native
     * metaphone() otherwise.
     *
     * @param string $word Word to convert
     * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed)
     * @return mixed Metaphone hash integer (or raw value, string); 0 if the raw value is an empty string and no raw value was requested
     */
    public function metaphone($word, $returnRawMetaphoneValue = false)
    {
        // Native PHP function is the fallback for the advanced doubleMetaphone class
        $rawValue = is_object($this->metaphoneObj)
            ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
            : metaphone($word);

        if ($returnRawMetaphoneValue) {
            return $rawValue;
        }
        if ($rawValue === '') {
            return 0;
        }
        // Create hash and return integer
        return IndexedSearchUtility::md5inthash($rawValue);
    }
1176
1177
    /********************************
1178
     *
1179
     * SQL; TYPO3 Pages
1180
     *
1181
     *******************************/
1182
    /**
     * Updates db with information about the page (TYPO3 page, not external media)
     *
     * Removes the existing records for the current phash, then writes the
     * index_phash record, the section and gr_list records, the index_fulltext
     * record and - in debug mode - an index_debug record. Each insert is
     * skipped if IndexedSearchUtility::isTableUsed() denies the table.
     */
    public function submitPage()
    {
        // Remove any current data for this phash:
        $this->removeOldIndexedPages($this->hash['phash']);
        // setting new phash_row
        $fields = [
            'phash' => $this->hash['phash'],
            'phash_grouping' => $this->hash['phash_grouping'],
            'static_page_arguments' => is_array($this->conf['staticPageArguments']) ? json_encode($this->conf['staticPageArguments']) : null,
            'contentHash' => $this->content_md5h,
            'data_page_id' => $this->conf['id'],
            'data_page_type' => $this->conf['type'],
            'data_page_mp' => $this->conf['MP'],
            'gr_list' => $this->conf['gr_list'],
            // TYPO3 page
            'item_type' => 0,
            'item_title' => $this->contentParts['title'],
            'item_description' => $this->bodyDescription($this->contentParts),
            'item_mtime' => (int)$this->conf['mtime'],
            'item_size' => strlen($this->conf['content']),
            'tstamp' => $GLOBALS['EXEC_TIME'],
            'crdate' => $GLOBALS['EXEC_TIME'],
            // Creation date of page
            'item_crdate' => $this->conf['crdate'],
            // Sys language uid of the page. Should reflect which language it DOES actually display!
            'sys_language_uid' => $this->conf['sys_language_uid'],
            'externalUrl' => 0,
            'recordUid' => (int)$this->conf['recordUid'],
            'freeIndexUid' => (int)$this->conf['freeIndexUid'],
            'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
        ];
        if (IndexedSearchUtility::isTableUsed('index_phash')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_phash');
            $connection->insert(
                'index_phash',
                $fields
            );
        }
        // PROCESSING index_section
        $this->submit_section($this->hash['phash'], $this->hash['phash']);
        // PROCESSING index_grlist
        $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
        // PROCESSING index_fulltext
        $fields = [
            'phash' => $this->hash['phash'],
            'fulltextdata' => implode(' ', $this->contentParts),
            'metaphonedata' => $this->metaphoneContent
        ];
        if ($this->indexerConfig['fullTextDataLength'] > 0) {
            // mb_strcut() respects the byte limit like substr() did, but never cuts a
            // multibyte character in half (same approach as in bodyDescription()).
            $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, (int)$this->indexerConfig['fullTextDataLength'], 'utf-8');
        }
        if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_fulltext');
            $connection->insert('index_fulltext', $fields);
        }
        // PROCESSING index_debug
        if ($this->indexerConfig['debugMode']) {
            // The excerpts are cut on character boundaries: json_encode() fails on a string
            // that ends in the middle of a multibyte character.
            // NOTE(review): assumes conf['content'] and contentParts['body'] are utf-8 at this point - confirm.
            $fields = [
                'phash' => $this->hash['phash'],
                'debuginfo' => json_encode([
                    'external_parsers initialized' => array_keys($this->external_parsers),
                    'conf' => array_merge($this->conf, ['content' => mb_strcut($this->conf['content'], 0, 1000, 'utf-8')]),
                    'contentParts' => array_merge($this->contentParts, ['body' => mb_strcut($this->contentParts['body'], 0, 1000, 'utf-8')]),
                    'logs' => $this->internal_log,
                    'lexer' => $this->lexerObj->debugString
                ])
            ];
            if (IndexedSearchUtility::isTableUsed('index_debug')) {
                $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                    ->getConnectionForTable('index_debug');
                $connection->insert('index_debug', $fields);
            }
        }
    }
1261
1262
    /**
     * Stores gr_list in the database.
     *
     * Writes one index_grlist record holding both hashes, the configured
     * gr_list and its integer hash. Nothing is written if
     * IndexedSearchUtility::isTableUsed() denies the table.
     *
     * @param int $hash Search result record phash
     * @param int $phash_x Actual phash of current content
     * @see update_grlist()
     */
    public function submit_grlist($hash, $phash_x)
    {
        // Setting the gr_list record
        $record = [
            'phash' => $hash,
            'phash_x' => $phash_x,
            'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list']),
            'gr_list' => $this->conf['gr_list']
        ];
        if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
            return;
        }
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_grlist')
            ->insert('index_grlist', $record);
    }
1284
1285
    /**
     * Stores section
     * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files.
     *
     * The record is passed through getRootLineFields() before it is inserted
     * into index_section. Nothing is written if
     * IndexedSearchUtility::isTableUsed() denies the table.
     *
     * @param int $hash phash of TYPO3 parent search result record
     * @param int $hash_t3 phash of the file indexation search record
     */
    public function submit_section($hash, $hash_t3)
    {
        $record = [
            'phash' => $hash,
            'phash_t3' => $hash_t3,
            'page_id' => (int)$this->conf['id']
        ];
        $this->getRootLineFields($record);
        if (!IndexedSearchUtility::isTableUsed('index_section')) {
            return;
        }
        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section')
            ->insert('index_section', $record);
    }
1306
1307
    /**
     * Deletes every index record that belongs to the indexed page identified by $phash.
     *
     * @param int $phash phash value to flush
     */
    public function removeOldIndexedPages($phash)
    {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);
        $pageHash = (int)$phash;

        // Old registrations in all per-page tables. Since these are TYPO3 pages,
        // only 1-1 relations can exist here.
        foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
            if (!IndexedSearchUtility::isTableUsed($tableName)) {
                continue;
            }
            $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $pageHash]);
        }

        // Additionally drop all index_section records whose phash_t3 is this hash (that
        // covers records set for external media on the page as well). Those records are
        // re-inserted by indexRegularDocument($file).
        if (!IndexedSearchUtility::isTableUsed('index_section')) {
            return;
        }
        $pool->getConnectionForTable('index_section')
            ->delete('index_section', ['phash_t3' => $pageHash]);
    }
1332
1333
    /********************************
1334
     *
1335
     * SQL; External media
1336
     *
1337
     *******************************/
1338
    /**
     * Updates db with information about the file
     *
     * Existing records for the file's phash are removed first (see removeOldIndexedFiles()).
     * Then one row each is inserted into index_phash and index_fulltext and, if
     * $this->indexerConfig['debugMode'] is set, into index_debug. A table that is not
     * in use is skipped.
     *
     * @param array $hash Array with phash and phash_grouping keys for file
     * @param string $file File name
     * @param array $subinfo Array of "static_page_arguments" for files: This is for instance the page index for a PDF file (other document types it will be a zero)
     * @param string $ext File extension determining the type of media.
     * @param int $mtime Modification time of file.
     * @param int $ctime Creation time of file.
     * @param int $size Size of file in bytes
     * @param int $content_md5h Content HASH value.
     * @param array $contentParts Standard content array (using only title and body for a file)
     */
    public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
    {
        // Find item type. The null coalescing guard covers extensions without a registered
        // parser or without an entry in the parser's map; in both cases (and for an empty
        // mapping) the plain extension is stored.
        $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? null) ?: $ext;
        // Remove any current data for this phash:
        $this->removeOldIndexedFiles($hash['phash']);
        // Split filename. parse_url() returns false for seriously malformed input and leaves
        // out the "scheme" key for plain local paths, so the key must not be read unguarded.
        $fileParts = parse_url($file);
        $isExternalUrl = is_array($fileParts) && !empty($fileParts['scheme']);
        // Setting new
        $fields = [
            'phash' => $hash['phash'],
            'phash_grouping' => $hash['phash_grouping'],
            'static_page_arguments' => json_encode($subinfo),
            'contentHash' => $content_md5h,
            'data_filename' => $file,
            'item_type' => $storeItemType,
            // Fall back to the file's base name when there is no (non-blank) title.
            'item_title' => trim($contentParts['title'] ?? '') ?: PathUtility::basename($file),
            'item_description' => $this->bodyDescription($contentParts),
            'item_mtime' => $mtime,
            'item_size' => $size,
            'item_crdate' => $ctime,
            'tstamp' => $GLOBALS['EXEC_TIME'],
            'crdate' => $GLOBALS['EXEC_TIME'],
            'gr_list' => $this->conf['gr_list'],
            'externalUrl' => $isExternalUrl ? 1 : 0,
            'recordUid' => (int)$this->conf['recordUid'],
            'freeIndexUid' => (int)$this->conf['freeIndexUid'],
            'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
            'sys_language_uid' => (int)$this->conf['sys_language_uid']
        ];
        if (IndexedSearchUtility::isTableUsed('index_phash')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_phash');
            $connection->insert(
                'index_phash',
                $fields
            );
        }
        // PROCESSING index_fulltext
        $fields = [
            'phash' => $hash['phash'],
            'fulltextdata' => implode(' ', $contentParts),
            'metaphonedata' => $this->metaphoneContent
        ];
        // Optionally cap the stored full text at the configured length.
        if ($this->indexerConfig['fullTextDataLength'] > 0) {
            $fields['fulltextdata'] = substr($fields['fulltextdata'], 0, $this->indexerConfig['fullTextDataLength']);
        }
        if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('index_fulltext');
            $connection->insert('index_fulltext', $fields);
        }
        // PROCESSING index_debug
        if ($this->indexerConfig['debugMode']) {
            $fields = [
                'phash' => $hash['phash'],
                'debuginfo' => json_encode([
                    'static_page_arguments' => $subinfo,
                    // Only the first 1000 bytes of the body are kept in the debug record.
                    'contentParts' => array_merge($contentParts, ['body' => substr($contentParts['body'], 0, 1000)]),
                    'logs' => $this->internal_log,
                    'lexer' => $this->lexerObj->debugString
                ])
            ];
            if (IndexedSearchUtility::isTableUsed('index_debug')) {
                $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                    ->getConnectionForTable('index_debug');
                $connection->insert('index_debug', $fields);
            }
        }
    }
1422
1423
    /**
     * Stores a gr_list record for a file unless a suitable one already exists.
     *
     * A record counts as existing when index_grlist holds a row for the phash whose
     * hash_gr_list matches either the default group list or the current group list.
     *
     * @param int $hash phash value of file
     */
    public function submitFile_grlist($hash)
    {
        if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_grlist');
        $expr = $queryBuilder->expr();

        // Constraints are built in the same order as they appear in the WHERE clause.
        $forThisFile = $expr->eq(
            'phash',
            $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
        );
        // If a record for the default group list is present, another one is not needed.
        $forDefaultGroups = $expr->eq(
            'hash_gr_list',
            $queryBuilder->createNamedParameter(
                IndexedSearchUtility::md5inthash($this->defaultGrList),
                \PDO::PARAM_INT
            )
        );
        $forCurrentGroups = $expr->eq(
            'hash_gr_list',
            $queryBuilder->createNamedParameter(
                IndexedSearchUtility::md5inthash($this->conf['gr_list']),
                \PDO::PARAM_INT
            )
        );

        $existing = (int)$queryBuilder->count('*')
            ->from('index_grlist')
            ->where($forThisFile, $expr->orX($forDefaultGroups, $forCurrentGroups))
            ->execute()
            ->fetchColumn();

        if ($existing !== 0) {
            return;
        }
        $this->submit_grlist($hash, $hash);
    }
1468
1469
    /**
     * Stores a section record for a file unless index_section already has a row
     * for this phash on the current page ($this->conf['id']).
     *
     * @param int $hash phash value of file
     */
    public function submitFile_section($hash)
    {
        if (!IndexedSearchUtility::isTableUsed('index_section')) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('index_section');
        $expr = $queryBuilder->expr();

        $forThisFile = $expr->eq(
            'phash',
            $queryBuilder->createNamedParameter($hash, \PDO::PARAM_INT)
        );
        $onThisPage = $expr->eq(
            'page_id',
            $queryBuilder->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
        );

        // Testing if there is already a section
        $existing = (int)$queryBuilder->count('phash')
            ->from('index_section')
            ->where($forThisFile, $onThisPage)
            ->execute()
            ->fetchColumn();

        if ($existing !== 0) {
            return;
        }
        // The section links the file's phash to the phash currently held in $this->hash.
        $this->submit_section($hash, $this->hash['phash']);
    }
1502
1503
    /**
     * Deletes the index records of an indexed file identified by $phash.
     * Unlike removeOldIndexedPages(), the index_section table is left untouched.
     *
     * @param int $phash phash value to flush
     */
    public function removeOldIndexedFiles($phash)
    {
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);
        $fileHash = (int)$phash;
        // Removing old registrations for tables.
        foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
            if (IndexedSearchUtility::isTableUsed($tableName)) {
                $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $fileHash]);
            }
        }
    }
1520
1521
    /********************************
1522
     *
1523
     * SQL Helper functions
1524
     *
1525
     *******************************/
1526
    /**
     * Decides, based on mtime / tstamp of the stored index_phash record, whether the
     * currently handled page/file has to be (re-)indexed.
     *
     * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
     * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
     * @return int Result code, see $this->reasons. Negative = no indexing, positive = do indexing:
     *             -2) Min age was NOT exceeded, so indexing cannot occur.
     *             -1) mtime matched, so there is no need to reindex.
     *              1) Max age exceeded, must be indexed again.
     *              2) mtime of the indexed record differs from the given mtime, must be indexed.
     *              3) No mtime was given, so it is indexed.
     *              4) No indexed record found (or index_phash is not in use), so it is indexed.
     */
    public function checkMtimeTstamp($mtime, $phash)
    {
        if (!IndexedSearchUtility::isTableUsed('index_phash')) {
            // Not indexed (not in index_phash)
            return 4;
        }

        $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->select(
                ['item_mtime', 'tstamp'],
                'index_phash',
                ['phash' => (int)$phash],
                [],
                [],
                1
            )
            ->fetch();

        if (empty($indexedRow)) {
            // Page has never been indexed (is not represented in the index_phash table).
            return 4;
        }

        $now = $GLOBALS['EXEC_TIME'];

        if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $now) {
            // The configured max-age was exceeded for the document and thus it's indexed.
            return 1;
        }

        if ($this->tstamp_minAge && $indexedRow['tstamp'] + $this->tstamp_minAge >= $now) {
            // A minimum age is configured and was not exceeded
            return -2;
        }

        // From here on minAge is either not set or exceeded, so mtime decides.
        if (!$mtime) {
            // The minimum age was exceeded, but mtime was not set, so the page is indexed.
            return 3;
        }

        if ($indexedRow['item_mtime'] != $mtime) {
            // mtime differs from the one stored in index_phash, so it is time to re-index.
            return 2;
        }

        // mtime matched the document, so no changes detected and no content updated.
        if ($this->tstamp_maxAge) {
            $secondsLeft = $indexedRow['tstamp'] + $this->tstamp_maxAge - $now;
            $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . $secondsLeft . ' seconds to expire time).', 1);
        } else {
            $this->updateTstamp($phash);
            $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
        }
        return -1;
    }
1591
1592
    /**
     * Looks up an index_phash record with the current phash_grouping and content hash.
     *
     * With this lookup a page is only indexed if its content differs from an already
     * indexed page of the same "phash_grouping".
     *
     * @return mixed TRUE if the page needs to be indexed (no matching record, or index_phash is not in use), otherwise the matching row (an array holding "phash") to which the grlist record should be related
     */
    public function checkContentHash()
    {
        if (!IndexedSearchUtility::isTableUsed('index_phash')) {
            return true;
        }

        $criteria = [
            'phash_grouping' => (int)$this->hash['phash_grouping'],
            'contentHash' => (int)$this->content_md5h
        ];
        $match = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->select(['phash'], 'index_phash', $criteria, [], [], 1)
            ->fetch();

        return empty($match) ? true : $match;
    }
1622
1623
    /**
     * Check content hash for external documents by counting index_phash rows that
     * carry the given grouping hash and content hash.
     *
     * @param int $hashGr phash value to check (phash_grouping)
     * @param int $content_md5h Content hash to check
     * @return bool TRUE if the document needs to be indexed (no matching row, or index_phash is not in use)
     */
    public function checkExternalDocContentHash($hashGr, $content_md5h)
    {
        if (!IndexedSearchUtility::isTableUsed('index_phash')) {
            return true;
        }

        $criteria = [
            'phash_grouping' => (int)$hashGr,
            'contentHash' => (int)$content_md5h
        ];
        $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash')
            ->count('*', 'index_phash', $criteria);

        return $matches === 0;
    }
1650
1651
    /**
     * Tells whether index_grlist contains a record for the given phash_x, i.e. the
     * "real" phash of the current content rather than the linked-to phash of the
     * common search result page.
     *
     * @param int $phash_x Phash integer to test.
     * @return bool TRUE if at least one record exists; FALSE otherwise or if index_grlist is not in use
     */
    public function is_grlist_set($phash_x)
    {
        if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
            return false;
        }

        $matches = (int)GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_grlist')
            ->count('phash_x', 'index_grlist', ['phash_x' => (int)$phash_x]);

        return $matches > 0;
    }
1673
1674
    /**
     * Makes sure a grlist entry exists for the phash and the current group list;
     * writes one (and logs it) when none is found.
     *
     * @param int $phash phash of the search result that should be found
     * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; this is the whole reason for the grlist table.
     * @see submit_grlist()
     */
    public function update_grlist($phash, $phash_x)
    {
        if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
            return;
        }

        $criteria = [
            'phash' => (int)$phash,
            'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
        ];
        $existing = (int)GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_grlist')
            ->count('phash', 'index_grlist', $criteria);

        if ($existing !== 0) {
            return;
        }

        $this->submit_grlist($phash, $phash_x);
        $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
    }
1701
1702
    /**
     * Sets tstamp of the index_phash row to the current execution time and
     * optionally stores a new item_mtime.
     *
     * @param int $phash phash value
     * @param int $mtime If set (non-zero), the item_mtime field is updated to this value as well.
     */
    public function updateTstamp($phash, $mtime = 0)
    {
        if (!IndexedSearchUtility::isTableUsed('index_phash')) {
            return;
        }

        $changes = ['tstamp' => $GLOBALS['EXEC_TIME']];
        if ($mtime) {
            $changes['item_mtime'] = (int)$mtime;
        }

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
    }
1732
1733
    /**
     * Writes the current freeIndexSetId (from $this->conf) into the index_phash record.
     *
     * @param int $phash phash value
     */
    public function updateSetId($phash)
    {
        if (!IndexedSearchUtility::isTableUsed('index_phash')) {
            return;
        }

        $changes = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
    }
1756
1757
    /**
     * Stores the parsetime in the index_phash row of the given phash.
     *
     * @param int $phash phash value.
     * @param int $parsetime Parsetime value to set.
     */
    public function updateParsetime($phash, $parsetime)
    {
        if (!IndexedSearchUtility::isTableUsed('index_phash')) {
            return;
        }

        $changes = ['parsetime' => (int)$parsetime];
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_phash');
        $connection->update('index_phash', $changes, ['phash' => (int)$phash]);
    }
1781
1782
    /**
     * Rewrites the root-line fields of all index_section rows that belong to the
     * current page ($this->conf['id']).
     */
    public function updateRootline()
    {
        if (!IndexedSearchUtility::isTableUsed('index_section')) {
            return;
        }

        // Filled by reference with rl0, rl1, rl2 and any configured extra levels.
        $rootLineValues = [];
        $this->getRootLineFields($rootLineValues);

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('index_section');
        $connection->update('index_section', $rootLineValues, ['page_id' => (int)$this->conf['id']]);
    }
1804
1805
    /**
     * Adding values for root-line fields.
     * rl0, rl1 and rl2 are standard. Further fields can be configured in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
     * as a map of field name => root-line level.
     *
     * A level that is not present in $this->conf['rootline_uids'] (e.g. for pages
     * close to the root) is stored as 0.
     *
     * @param array $fieldArray Field array, passed by reference
     */
    public function getRootLineFields(array &$fieldArray)
    {
        // The root line may hold fewer than three levels, so every offset is guarded;
        // a missing level yields 0 without raising an undefined-offset notice.
        $fieldArray['rl0'] = (int)($this->conf['rootline_uids'][0] ?? 0);
        $fieldArray['rl1'] = (int)($this->conf['rootline_uids'][1] ?? 0);
        $fieldArray['rl2'] = (int)($this->conf['rootline_uids'][2] ?? 0);
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
            $fieldArray[$fieldName] = (int)($this->conf['rootline_uids'][$rootLineLevel] ?? 0);
        }
    }
1820
1821
    /********************************
1822
     *
1823
     * SQL; Submitting words
1824
     *
1825
     *******************************/
1826
    /**
     * Adds the words of the list that are not yet present in index_words.
     *
     * A count query first checks whether all word ids already exist; only if the
     * numbers differ are the known base words fetched, removed from the list, and
     * the remaining words inserted.
     *
     * @param array $wordListArray Word List array (where each word has information about position etc).
     */
    public function checkWordList($wordListArray)
    {
        if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
            return;
        }

        $expectedTotal = count($wordListArray);
        $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

        $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
        $knownTotal = (int)$countQuery->count('baseword')
            ->from('index_words')
            ->where(
                $countQuery->expr()->in(
                    'wid',
                    $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
                )
            )
            ->execute()
            ->fetchColumn();

        if ($knownTotal === $expectedTotal) {
            // Every word id is already stored.
            return;
        }

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
        $selectQuery = $connection->createQueryBuilder();

        $knownWords = $selectQuery->select('baseword')
            ->from('index_words')
            ->where(
                $selectQuery->expr()->in(
                    'wid',
                    $selectQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
                )
            )
            ->execute();

        $this->log_setTSlogMessage('Inserting words: ' . ($expectedTotal - $knownTotal), 1);

        // Drop every word that is already stored; the list is keyed by base word.
        while ($knownWord = $knownWords->fetch()) {
            unset($wordListArray[$knownWord['baseword']]);
        }

        foreach ($wordListArray as $baseWord => $wordData) {
            // A duplicate-key error occurs here if a stored word was NOT removed by the
            // unset() above. As long as the words are not longer than 60 characters (the
            // size of the baseword varchar) this is not a problem.
            $connection->insert(
                'index_words',
                [
                    'wid' => $wordData['hash'],
                    'baseword' => $baseWord,
                    'metaphone' => $wordData['metaphone']
                ]
            );
        }
    }
1886
1887
    /**
     * Submits RELATIONS between words and phash.
     *
     * All index_rel rows of the phash are deleted and rebuilt from the word list;
     * words flagged as stop words in index_words are left out.
     *
     * @param array $wordList Word list array
     * @param int $phash phash value
     */
    public function submitWords($wordList, $phash)
    {
        if (!IndexedSearchUtility::isTableUsed('index_rel')) {
            return;
        }
        $pool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Collect the ids of all words marked as stop words.
        $stopWordQuery = $pool->getQueryBuilderForTable('index_words');
        $stopWordResult = $stopWordQuery->select('wid')
            ->from('index_words')
            ->where(
                $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, \PDO::PARAM_INT))
            )
            ->groupBy('wid')
            ->execute();

        $isStopWord = [];
        while ($stopWordRow = $stopWordResult->fetch()) {
            $isStopWord[$stopWordRow['wid']] = true;
        }

        // Remove the relations currently stored for this phash.
        $pool->getConnectionForTable('index_rel')->delete('index_rel', ['phash' => (int)$phash]);

        $relationRows = [];
        foreach ($wordList as $word) {
            if (!isset($isStopWord[$word['hash']])) {
                // Value order must match the column list passed to bulkInsert() below.
                $relationRows[] = [
                    (int)$phash,
                    (int)$word['hash'],
                    (int)$word['count'],
                    (int)$word['first'],
                    // Relative frequency of the word, mapped by freqMap()
                    $this->freqMap($word['count'] / $this->wordcount),
                    $word['cmp'] & $this->flagBitMask
                ];
            }
        }

        if (empty($relationRows)) {
            return;
        }
        $pool->getConnectionForTable('index_rel')->bulkInsert(
            'index_rel',
            $relationRows,
            ['phash', 'wid', 'count', 'first', 'freq', 'flags']
        );
    }
1935
1936
    /**
     * Maps a frequency given as a real number in [0;1] into the range [0;$this->freqRange]
     * (values above the range are capped at $this->freqRange), and back: an input
     * greater than 1 is divided by the same factor instead of being multiplied.
     *
     * @param float $freq Frequency
     * @return int Frequency in range.
     */
    public function freqMap($freq)
    {
        $factor = $this->freqMax * 100 * $this->freqRange;
        if ($freq <= 1) {
            $scaled = $freq * $factor;
            if ($scaled > $this->freqRange) {
                // Cap at the upper end of the range
                return $this->freqRange;
            }
            return $scaled;
        }
        // Reverse direction
        return $freq / $factor;
    }
1954
1955
    /********************************
1956
     *
1957
     * Hashing
1958
     *
1959
     *******************************/
1960
    /**
     * Calculates the search hashes for a TYPO3 page and stores them in
     * $this->hash['phash_grouping'] and $this->hash['phash'].
     */
    public function setT3Hashes()
    {
        $staticPageArguments = null;
        if (is_array($this->conf['staticPageArguments'])) {
            $staticPageArguments = json_encode($this->conf['staticPageArguments']);
        }

        // The array is serialized for hashing, so the order of the keys is significant.
        $identity = [
            'id' => (int)$this->conf['id'],
            'type' => (int)$this->conf['type'],
            'sys_lang' => (int)$this->conf['sys_language_uid'],
            'MP' => (string)$this->conf['MP'],
            'staticPageArguments' => $staticPageArguments,
        ];

        // Grouping hash: identifies a "page" by id, type, language, mount point and
        // static page arguments.
        $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($identity));

        // Plain phash: additionally takes the gr_list into account, i.e. a subdivision for
        // page compositions that depend on the login. Such pages are expected to be
        // similar in most cases regardless of the login.
        $identity['gr_list'] = (string)$this->conf['gr_list'];
        $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($identity));
    }
1979
1980
    /**
     * Calculates the search hashes for an external file.
     *
     * @param string $file File name / path which identifies it on the server
     * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
     * @return array Array with "phash_grouping" and "phash" inside.
     */
    public function setExtHashes($file, $subinfo = [])
    {
        // The grouping hash is derived from the file alone ...
        $identity = ['file' => $file];
        $groupingHash = IndexedSearchUtility::md5inthash(serialize($identity));

        // ... while the phash also covers the sub information.
        $identity['subinfo'] = $subinfo;

        return [
            'phash_grouping' => $groupingHash,
            'phash' => IndexedSearchUtility::md5inthash(serialize($identity))
        ];
    }
2001
2002
    /*********************************
2003
     *
2004
     * Internal logging functions
2005
     *
2006
     *********************************/
2007
    /**
     * Forwards a push() call to the time tracker held in $this->timeTracker.
     *
     * @param string $msg Title to set
     * @param string $key Key (?)
     */
    public function log_push($msg, $key)
    {
        $tracker = $this->timeTracker;
        $tracker->push($msg, $key);
    }
2017
2018
    /**
     * Forwards a pull() call to the time tracker held in $this->timeTracker.
     */
    public function log_pull()
    {
        $tracker = $this->timeTracker;
        $tracker->pull();
    }
2025
2026
    /**
     * Passes a log message to the time tracker and also appends it to
     * $this->internal_log.
     *
     * @param string $msg Message to set
     * @param int $errorNum Error number
     */
    public function log_setTSlogMessage($msg, $errorNum = 0)
    {
        $tracker = $this->timeTracker;
        $tracker->setTSlogMessage($msg, $errorNum);
        // Keep a copy of the message in the indexer's own log.
        $this->internal_log[] = $msg;
    }
2037
2038
    /**
     * Re-joins a comma-separated keyword list with ", " and wraps the result in a
     * leading and a trailing space, so that keywords are space-separated. This is
     * important for displaying them properly as part of the fulltext index.
     *
     * @param string $keywordList
     * @return string
     * @see http://forge.typo3.org/issues/14959
     */
    protected function addSpacesToKeywordList($keywordList)
    {
        $spacedList = implode(', ', GeneralUtility::trimExplode(',', $keywordList));
        return sprintf(' %s ', $spacedList);
    }
2051
}
2052