| Total Complexity | 238 |
| Total Lines | 2019 |
| Duplicated Lines | 0 % |
| Changes | 3 | ||
| Bugs | 0 | Features | 0 |
Complex classes like Indexer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Indexer, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 31 | class Indexer |
||
| 32 | { |
||
| 33 | |||
| 34 | /** |
||
| 35 | * @var array |
||
| 36 | */ |
||
| 37 | public $reasons = [ |
||
| 38 | -1 => 'mtime matched the document, so no changes detected and no content updated', |
||
| 39 | -2 => 'The minimum age was not exceeded', |
||
| 40 | 1 => 'The configured max-age was exceeded for the document and thus it\'s indexed.', |
||
| 41 | 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.', |
||
| 42 | 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.', |
||
| 43 | 4 => 'Page has never been indexed (is not represented in the index_phash table).' |
||
| 44 | ]; |
||
| 45 | |||
| 46 | /** |
||
| 47 | * HTML code blocks to exclude from indexing |
||
| 48 | * |
||
| 49 | * @var string |
||
| 50 | */ |
||
| 51 | public $excludeSections = 'script,style'; |
||
| 52 | |||
| 53 | /** |
||
| 54 | * External parser objects, keys are file extension names. Values are objects with certain methods. |
||
| 55 | * |
||
| 56 | * @var array |
||
| 57 | */ |
||
| 58 | public $external_parsers = []; |
||
| 59 | |||
| 60 | /** |
||
| 61 | * Default frontend user group list. |
||
| 62 | * Fe-group list (pages might be indexed separately for each usergroup combination to support search |
||
| 63 | * in access limited pages!) |
||
| 64 | * |
||
| 65 | * @var string |
||
| 66 | */ |
||
| 67 | public $defaultGrList = '0,-1'; |
||
| 68 | |||
| 69 | /** |
||
| 70 | * If set, this tells a number of seconds that is the maximum age of an indexed document. |
||
| 71 | * |
||
| 72 | * @var int |
||
| 73 | */ |
||
| 74 | public $tstamp_maxAge = 0; |
||
| 75 | |||
| 76 | /** |
||
| 77 | * If set, this tells a minimum limit before a document can be indexed again. |
||
| 78 | * This is regardless of mtime. |
||
| 79 | * |
||
| 80 | * @var int |
||
| 81 | */ |
||
| 82 | public $tstamp_minAge = 0; |
||
| 83 | |||
| 84 | /** |
||
| 85 | * Max number of external files to index. |
||
| 86 | * |
||
| 87 | * @var int |
||
| 88 | */ |
||
| 89 | public $maxExternalFiles = 0; |
||
| 90 | |||
| 91 | /** |
||
| 92 | * If TRUE, indexing is forced even if the mtime and group list checks find no need for it. |
||
| 93 | * |
||
| 94 | * @var bool |
||
| 95 | */ |
||
| 96 | public $forceIndexing = false; |
||
| 97 | |||
| 98 | /** |
||
| 99 | * Default (empty) content parts that splitHTMLContent() starts from. |
||
| 100 | * |
||
| 101 | * @var array |
||
| 102 | */ |
||
| 103 | public $defaultContentArray = [ |
||
| 104 | 'title' => '', |
||
| 105 | 'description' => '', |
||
| 106 | 'keywords' => '', |
||
| 107 | 'body' => '' |
||
| 108 | ]; |
||
| 109 | |||
| 110 | /** |
||
| 111 | * @var int |
||
| 112 | */ |
||
| 113 | public $wordcount = 0; |
||
| 114 | |||
| 115 | /** |
||
| 116 | * @var int |
||
| 117 | */ |
||
| 118 | public $externalFileCounter = 0; |
||
| 119 | |||
| 120 | /** |
||
| 121 | * @var array Configuration set internally (see init functions for required keys and their meaning) |
||
| 122 | */ |
||
| 123 | public $conf = []; |
||
| 124 | |||
| 125 | /** |
||
| 126 | * Indexer configuration, coming from TYPO3's system configuration for EXT:indexed_search |
||
| 127 | * |
||
| 128 | * @var array |
||
| 129 | */ |
||
| 130 | public $indexerConfig = []; |
||
| 131 | |||
| 132 | /** |
||
| 133 | * Hash array, contains phash and phash_grouping |
||
| 134 | * |
||
| 135 | * @var array |
||
| 136 | */ |
||
| 137 | public $hash = []; |
||
| 138 | |||
| 139 | /** |
||
| 140 | * Hash array for files |
||
| 141 | * |
||
| 142 | * @var array |
||
| 143 | */ |
||
| 144 | public $file_phash_arr = []; |
||
| 145 | |||
| 146 | /** |
||
| 147 | * Content of TYPO3 page, split into parts (see splitHTMLContent()) |
||
| 148 | * |
||
| 149 | * @var array |
||
| 150 | */ |
||
| 151 | public $contentParts = []; |
||
| 152 | |||
| 153 | /** |
||
| 154 | * Hash of the concatenated content parts (see indexTypo3PageContent()) |
||
| 155 | * |
||
| 156 | * @var string |
||
| 157 | */ |
||
| 158 | public $content_md5h = ''; |
||
| 159 | |||
| 160 | /** |
||
| 161 | * @var array Internal log |
||
| 162 | */ |
||
| 163 | public $internal_log = []; |
||
| 164 | |||
| 165 | /** |
||
| 166 | * Content fetched by the last indexExternalUrl() call |
||
| 167 | * |
||
| 168 | * @var string |
||
| 169 | */ |
||
| 170 | public $indexExternalUrl_content = ''; |
||
| 171 | |||
| 172 | /** |
||
| 173 | * @var int |
||
| 174 | */ |
||
| 175 | public $freqRange = 32000; |
||
| 176 | |||
| 177 | /** |
||
| 178 | * @var float |
||
| 179 | */ |
||
| 180 | public $freqMax = 0.1; |
||
| 181 | |||
| 182 | /** |
||
| 183 | * @var bool |
||
| 184 | */ |
||
| 185 | public $enableMetaphoneSearch = false; |
||
| 186 | |||
| 187 | /** |
||
| 188 | * @var bool |
||
| 189 | */ |
||
| 190 | public $storeMetaphoneInfoAsWords; |
||
| 191 | |||
| 192 | /** |
||
| 193 | * @var string |
||
| 194 | */ |
||
| 195 | public $metaphoneContent = ''; |
||
| 196 | |||
| 197 | /** |
||
| 198 | * Metaphone object, if any |
||
| 199 | * |
||
| 200 | * @var \TYPO3\CMS\IndexedSearch\Utility\DoubleMetaPhoneUtility |
||
| 201 | */ |
||
| 202 | public $metaphoneObj; |
||
| 203 | |||
| 204 | /** |
||
| 205 | * Lexer object for word splitting |
||
| 206 | * |
||
| 207 | * @var \TYPO3\CMS\IndexedSearch\Lexer |
||
| 208 | */ |
||
| 209 | public $lexerObj; |
||
| 210 | |||
| 211 | /** |
||
| 212 | * @var int |
||
| 213 | */ |
||
| 214 | public $flagBitMask; |
||
| 215 | |||
| 216 | /** |
||
| 217 | * @var TimeTracker |
||
| 218 | */ |
||
| 219 | protected $timeTracker; |
||
| 220 | |||
/**
 * Indexer constructor.
 *
 * Loads the extension configuration of EXT:indexed_search and derives the
 * age limits, the external file limit, the flag bit mask and the metaphone
 * settings from it. Every setting is read defensively, because keys may be
 * missing as long as the extension configuration has not been saved yet.
 */
public function __construct()
{
    $this->timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
    // Indexer configuration from Extension Manager interface
    $this->indexerConfig = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get('indexed_search');
    // The configured values are multiplied by 3600 before being stored (presumably hours -> seconds).
    $this->tstamp_minAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['minAge'] ?? 0) * 3600, 0);
    $this->tstamp_maxAge = MathUtility::forceIntegerInRange((int)($this->indexerConfig['maxAge'] ?? 0) * 3600, 0);
    // Fall back to the same default (5) that forceIntegerInRange() applies for an empty value.
    $this->maxExternalFiles = MathUtility::forceIntegerInRange($this->indexerConfig['maxExternalFiles'] ?? 5, 0, 1000, 5);
    $this->flagBitMask = MathUtility::forceIntegerInRange($this->indexerConfig['flagBitMask'] ?? 0, 0, 255);
    // Workaround: If the extension configuration was not updated yet, the value is not existing
    $this->enableMetaphoneSearch = !isset($this->indexerConfig['enableMetaphoneSearch']) || $this->indexerConfig['enableMetaphoneSearch'];
    // Metaphone information is only stored as words when the index_words table is not in use.
    $this->storeMetaphoneInfoAsWords = !IndexedSearchUtility::isTableUsed('index_words') && $this->enableMetaphoneSearch;
}
||
| 237 | |||
| 238 | /******************************** |
||
| 239 | * |
||
| 240 | * Initialization |
||
| 241 | * |
||
| 242 | *******************************/ |
||
| 243 | |||
/**
 * Initializes the object.
 *
 * Sets up the page hashes, the external document parsers (if enabled), the
 * lexer and the optional metaphone hook object.
 *
 * @param array|null $configuration will be used to set $this->conf, otherwise $this->conf MUST be set with proper values prior to this call
 */
public function init(?array $configuration = null)
{
    if (is_array($configuration)) {
        $this->conf = $configuration;
    }
    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();
    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if (!empty($this->conf['index_externals'])) {
        $this->initializeExternalParsers();
    }
    // Initialize lexer (class that deconstructs the text into words).
    // A missing or empty registration falls back to the default Lexer class.
    $lexerObjectClassName = ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'] ?? '') ?: Lexer::class;
    $this->lexerObj = GeneralUtility::makeInstance($lexerObjectClassName);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'] ?? false;
    // Initialize metaphone hook:
    // Make sure that the hook is loaded _after_ indexed_search as this may overwrite the hook depending on the configuration.
    $metaphoneClassName = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone'] ?? '';
    if ($this->enableMetaphoneSearch && $metaphoneClassName) {
        $this->metaphoneObj = GeneralUtility::makeInstance($metaphoneClassName);
        $this->metaphoneObj->pObj = $this;
    }
}
||
| 271 | |||
/**
 * Instantiates every external parser registered in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers']
 * and keeps only those whose initParser() call succeeds.
 *
 * @internal
 * @see init()
 */
public function initializeExternalParsers()
{
    $registeredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'] ?? [];
    foreach ($registeredParsers as $extension => $className) {
        $parser = GeneralUtility::makeInstance($className);
        // Register the parser before initialising it, so it is reachable through this object during initParser().
        $this->external_parsers[$extension] = $parser;
        $parser->pObj = $this;
        if ($parser->initParser($extension)) {
            continue;
        }
        // The parser reported that it cannot be used: drop its entry again.
        unset($this->external_parsers[$extension]);
    }
}
||
| 289 | |||
| 290 | /******************************** |
||
| 291 | * |
||
| 292 | * Indexing; TYPO3 pages (HTML content) |
||
| 293 | * |
||
| 294 | *******************************/ |
||
/**
 * Start indexing of the TYPO3 page.
 *
 * Indexing only happens when it is forced, when the mtime/age check returns
 * a positive reason code, or when the group list is not registered yet for
 * the current phash. If the content hash is already known (and the max-age
 * was not exceeded), only timestamp, set id, group list and rootline are
 * refreshed.
 */
public function indexTypo3PageContent()
{
    $mtimeCheck = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $groupListIsSet = $this->is_grlist_set($this->hash['phash']);

    // Guard: neither forced, nor a positive reason code, nor a missing group list.
    if (!($mtimeCheck > 0) && $groupListIsSet && !$this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$mtimeCheck]);
        return;
    }

    // Report why indexing is going to happen.
    if ($this->forceIndexing) {
        $message = 'Indexing needed, reason: Forced';
    } elseif ($mtimeCheck > 0) {
        $message = 'Indexing needed, reason: ' . $this->reasons[$mtimeCheck];
    } else {
        $message = 'Indexing needed, reason: Updates gr_list!';
    }
    $this->log_setTSlogMessage($message, 1);

    // Break the HTML apart into title, keywords, description and body.
    $this->log_push('Split content', '');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // The hash covers all content parts (title, description and keywords included),
    // so that e.g. a changed page title also counts as changed content.
    $this->content_md5h = IndexedSearchUtility::md5inthash(implode('', $this->contentParts));

    // Look for an already indexed page carrying the very same content hash.
    $knownContent = $this->checkContentHash();
    if (is_array($knownContent) && $mtimeCheck !== 1) {
        // Same content is indexed already: only refresh the bookkeeping data.
        $this->updateTstamp($this->hash['phash'], $this->conf['mtime']);
        $this->updateSetId($this->hash['phash']);
        // $knownContent['phash'] is the phash of the row that matches the current content hash.
        $this->update_grlist($knownContent['phash'], $this->hash['phash']);
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $this->content_md5h . ', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $startedAt = GeneralUtility::milliseconds();

    $this->log_push('Converting charset of content (' . $this->conf['metaCharset'] . ') to utf-8', '');
    $this->charsetEntity2utf8($this->contentParts, $this->conf['metaCharset']);
    $this->log_pull();

    $this->log_push('Extract words from content', '');
    $wordsPerPart = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    $this->log_push('Analyze the extracted words', '');
    $analyzedWords = $this->indexAnalyze($wordsPerPart);
    $this->log_pull();

    // Store the page (phash) record.
    $this->log_push('Submitting page', '');
    $this->submitPage();
    $this->log_pull();

    // Words are only stored when the index_words table is in use.
    $this->log_push('Check word list and submit words', '');
    if (IndexedSearchUtility::isTableUsed('index_words')) {
        $this->checkWordList($analyzedWords);
        $this->submitWords($analyzedWords, $this->hash['phash']);
    }
    $this->log_pull();

    // Record how long the indexing of the page itself took.
    $this->updateParsetime($this->hash['phash'], GeneralUtility::milliseconds() - $startedAt);

    // Follow links to external files if configured for.
    $this->log_push('Checking external files', '');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
||
| 369 | |||
/**
 * Splits HTML content and returns an associative array, with title, a list of metatags, and a list of words in the body.
 *
 * Everything before the first "<body" is treated as the head part. If the
 * content contains no "<body" at all, both head part and body are empty.
 *
 * @param string $content HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return array Array of content, having keys "title", "body", "keywords" and "description" set.
 * @see splitRegularContent()
 */
public function splitHTMLContent($content)
{
    $contentArr = $this->defaultContentArray;
    // stristr() returns false when no <body> tag exists; normalise to '' so
    // that all string functions below receive a real string.
    $contentArr['body'] = stristr($content, '<body') ?: '';
    $headPart = substr($content, 0, -strlen($contentArr['body']));
    // Get title. Only the part after the first ":" is kept, if there is one.
    $ignoredContent = '';
    $ignoredRest = '';
    $ignoredAttributes = '';
    $this->embracingTags($headPart, 'TITLE', $contentArr['title'], $ignoredRest, $ignoredAttributes);
    $titleParts = explode(':', $contentArr['title'], 2);
    $contentArr['title'] = trim($titleParts[1] ?? $titleParts[0]);
    // Get keywords and description metatags
    if (!empty($this->conf['index_metatags'])) {
        // Collect the attribute strings of all <meta> tags; every successful
        // call consumes one tag and shortens $headPart to what follows it.
        $metaTags = [];
        $attributeString = '';
        while ($this->embracingTags($headPart, 'meta', $ignoredContent, $headPart, $attributeString)) {
            $metaTags[] = $attributeString;
        }
        foreach ($metaTags as $metaTag) {
            $attributes = GeneralUtility::get_tag_attributes($metaTag);
            // Tags like <meta charset="..."> or <meta http-equiv="..."> carry
            // no "name"/"content" attribute and must not raise warnings.
            $metaName = (string)($attributes['name'] ?? '');
            $metaContent = (string)($attributes['content'] ?? '');
            if (stripos($metaName, 'keywords') !== false) {
                $contentArr['keywords'] .= ',' . $this->addSpacesToKeywordList($metaContent);
            }
            if (stripos($metaName, 'description') !== false) {
                $contentArr['description'] .= ',' . $metaContent;
            }
        }
    }
    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);
    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    foreach (explode(',', $this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'], $tag, $ignoredContent, $contentArr['body'], $ignoredAttributes)) {
            // Each iteration removes one occurrence of the section.
        }
    }
    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<', ' <', $contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));
    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
||
| 421 | |||
/**
 * Reads the charset from a <meta http-equiv="Content-Type" ...> tag.
 *
 * @param string $content HTML content
 * @return string The charset value if found, otherwise an empty string.
 */
public function getHTMLcharset($content)
{
    // Step 1: locate the content-type meta tag.
    $hasContentTypeMeta = preg_match(
        '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',
        $content,
        $metaTagMatch
    );
    if (!$hasContentTypeMeta) {
        return '';
    }

    // Step 2: pick the charset declaration out of that tag.
    $hasCharset = preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i', $metaTagMatch[0], $charsetMatch);

    return $hasCharset ? $charsetMatch[1] : '';
}
||
| 438 | |||
/**
 * Converts a HTML document to utf-8 and decodes its HTML entities.
 *
 * @param string $content HTML content, any charset
 * @param string $charset Optional charset (otherwise extracted from HTML)
 * @return string Converted HTML
 */
public function convertHTMLToUtf8($content, $charset = '')
{
    // Prefer the given charset, fall back to the one declared in the document.
    $sourceCharset = trim(strtolower($charset ?: $this->getHTMLcharset($content)));

    // Unknown charset or already utf-8: only the entities need decoding.
    if (!$sourceCharset || $sourceCharset === 'utf-8') {
        return html_entity_decode($content);
    }

    // Convert first, then decode entities on the (now utf-8) document.
    return html_entity_decode(mb_convert_encoding($content, 'utf-8', $sourceCharset));
}
||
| 458 | |||
/**
 * Finds first occurrence of embracing tags and returns the embraced content and the original string with
 * the tag removed in the two passed variables. Returns FALSE if no match found. ie. useful for finding
 * <title> of document or removing <script>-sections
 *
 * If the opening tag is found but no closing tag follows, $tagContent is
 * empty and $stringAfter holds everything after the opening tag. An opening
 * tag that is never terminated by ">" is handled the same way, with an empty
 * remainder.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "script"
 * @param string $tagContent Passed by reference: Content inside found tag
 * @param string $stringAfter Passed by reference: Content after found tag
 * @param string $paramList Passed by reference: Attributes of the found tag.
 * @return bool Returns FALSE if tag was not found, otherwise TRUE.
 */
public function embracingTags($string, $tagName, &$tagContent, &$stringAfter, &$paramList)
{
    $endTag = '</' . $tagName . '>';
    $startTag = '<' . $tagName;
    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string, $startTag);
    if (!$fromStartTag) {
        return false;
    }
    // Split "attributes>rest". array_pad() guarantees both parts exist even
    // when the opening tag is never closed with ">".
    [$paramList, $afterOpeningTag] = array_pad(explode('>', substr($fromStartTag, strlen($startTag)), 2), 2, '');
    $fromEndTag = stristr($afterOpeningTag, $endTag);
    if ($fromEndTag) {
        // $fromStartTag is the tail of $string beginning at the tag, so the
        // length difference is the offset of the tag.
        $stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($afterOpeningTag, 0, strlen($afterOpeningTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore . substr($fromEndTag, strlen($endTag));
    } else {
        $tagContent = '';
        $stringAfter = $afterOpeningTag;
    }
    return true;
}
||
| 493 | |||
/**
 * Reduces the body to the parts selected by <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> markers. The body is left untouched if it contains
 * no such marker.
 *
 * @param string $body HTML Content, passed by reference
 * @return bool Returns TRUE if a TYPOSEARCH_ tag was found, otherwise FALSE.
 */
public function typoSearchTags(&$body)
{
    $segments = preg_split('/\\<\\!\\-\\-[\\s]?TYPO3SEARCH_/', $body);
    if (count($segments) <= 1) {
        return false;
    }

    $kept = '';
    // Holds the last segment that was not introduced by a begin/end marker.
    $pending = '';
    foreach ($segments as $segment) {
        $pieces = explode('-->', $segment, 2);
        $marker = trim($pieces[0]);
        if ($marker === 'begin') {
            // Content following a begin marker is always kept.
            $kept .= $pieces[1];
            $pending = '';
            continue;
        }
        if ($marker === 'end') {
            // An end marker keeps the pending unmarked segment, if any.
            $kept .= $pending;
            continue;
        }
        $pending = $segment;
    }
    $body = $kept;

    return true;
}
||
| 521 | |||
/**
 * Extract links (hrefs) from HTML content and if indexable media is found, it is indexed.
 *
 * Links with a URL scheme (and no local path) are handed to
 * indexExternalUrl() when "indexExternalURLs" is enabled. Links without a
 * query string that resolve to an existing local file are either queued in
 * the crawler (if available and enabled) or indexed directly.
 *
 * @param string $content HTML content
 */
public function extractLinks($content)
{
    $crawler = null;
    // Get links:
    $list = $this->extractHyperLinks($content);
    if (!empty($this->indexerConfig['useCrawlerForExternalFiles']) && \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('crawler')) {
        /**
         * todo: remove dependency to class tx_crawler_lib
         * @link https://forge.typo3.org/issues/83603
         */
        $crawler = GeneralUtility::makeInstance('tx_crawler_lib');
    }
    // Traverse links:
    foreach ($list as $linkInfo) {
        // localPath means: This file is sent by a download script. While the indexed URL has to
        // point to $linkInfo['href'], the absolute path to the file is specified here!
        $localPath = $linkInfo['localPath'] ?? '';
        // Decode entities:
        $linkSource = htmlspecialchars_decode($localPath ?: $linkInfo['href']);
        // Parse URL. parse_url() returns false for seriously malformed URLs.
        $qParts = parse_url($linkSource) ?: [];
        // Check for jumpurl (TYPO3 specific thing...)
        if (!empty($qParts['query']) && strpos($qParts['query'], 'jumpurl=') !== false) {
            parse_str($qParts['query'], $getP);
            // "jumpurl=" may match a differently named parameter or an array parameter.
            $linkSource = is_string($getP['jumpurl'] ?? null) ? $getP['jumpurl'] : '';
            $qParts = parse_url($linkSource) ?: [];
        }
        if (!$localPath && !empty($qParts['scheme'])) {
            if (!empty($this->indexerConfig['indexExternalURLs'])) {
                // Index external URL (http or otherwise)
                $this->indexExternalUrl($linkSource);
            }
            continue;
        }
        if (!empty($qParts['query'])) {
            // Local links with a query string are not treated as files.
            continue;
        }
        $linkSource = urldecode($linkSource);
        if (GeneralUtility::isAllowedAbsPath($linkSource)) {
            $localFile = $linkSource;
        } else {
            $localFile = GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $linkSource);
        }
        if (!$localFile || !@is_file($localFile)) {
            continue;
        }
        // Index local file:
        if (is_object($crawler)) {
            $params = ['document' => $linkSource];
            if ($localPath) {
                // Keep the public URL of the file that is served through a download script.
                $params['alturl'] = $linkInfo['href'];
            }
            $params['conf'] = $this->conf;
            // The page content is not needed in the queue entry.
            unset($params['conf']['content']);
            $crawler->addQueueEntry_callBack(0, $params, Hook\CrawlerFilesHook::class, $this->conf['id']);
            $this->log_setTSlogMessage('media "' . $params['document'] . '" added to "crawler" queue.', 1);
        } elseif ($localPath) {
            // pathinfo() yields '' for files without extension.
            $ext = strtolower(pathinfo($linkSource, PATHINFO_EXTENSION));
            $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
        } else {
            $this->indexRegularDocument($linkSource);
        }
    }
}
||
| 602 | |||
/**
 * Extracts all links to external documents from the HTML content string
 *
 * Anchors without a href attribute and pure fragment links ("#...") are
 * skipped.
 *
 * @param string $html
 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 * @see extractLinks()
 */
public function extractHyperLinks($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('a', $html);
    $hyperLinksData = [];
    foreach ($htmlParts as $index => $tagData) {
        // Only odd indexes are inspected (presumably these hold the tags, the even ones the text in between).
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'a') {
            continue;
        }
        // <a name="..."> style anchors have no href at all.
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if (!$href || $href[0] === '#') {
            continue;
        }
        $hyperLinksData[] = [
            'tag' => $tagData,
            'href' => $href,
            'localPath' => $this->createLocalPath($href)
        ];
    }
    return $hyperLinksData;
}
||
| 632 | |||
/**
 * Extracts the "base href" from content string.
 *
 * The href of the first <base> tag that has a non-empty href is returned.
 *
 * @param string $html Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($html)
{
    $htmlParser = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Html\HtmlParser::class);
    $htmlParts = $htmlParser->splitTags('base', $html);
    foreach ($htmlParts as $index => $tagData) {
        // Only odd indexes are inspected (presumably these hold the tags).
        if ($index % 2 === 0) {
            continue;
        }
        $tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
        $firstTagName = $htmlParser->getFirstTagName($tagData);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }
        // A <base> tag may carry only a target attribute; never return null then.
        $href = (string)($tagAttributes[0]['href'] ?? '');
        if ($href) {
            return $href;
        }
    }
    return '';
}
||
| 658 | |||
| 659 | /****************************************** |
||
| 660 | * |
||
| 661 | * Indexing; external URL |
||
| 662 | * |
||
| 663 | ******************************************/ |
||
/**
 * Index External URLs HTML content
 *
 * The URL is only indexed when its response headers announce a text/html
 * content type. The fetched content is written to a temporary file, which is
 * indexed and removed again afterwards.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @see indexRegularDocument()
 */
public function indexExternalUrl($externalUrl)
{
    // Get headers. No usable answer means there is nothing to index.
    $urlHeaders = $this->getUrlHeaders($externalUrl);
    if (!is_array($urlHeaders)) {
        return;
    }
    // HTTP header names are case-insensitive ("content-type" is just as valid).
    $contentType = (string)(array_change_key_case($urlHeaders, CASE_LOWER)['content-type'] ?? '');
    if (stripos($contentType, 'text/html') === false) {
        return;
    }
    $content = ($this->indexExternalUrl_content = GeneralUtility::getUrl($externalUrl));
    if ((string)$content === '') {
        return;
    }
    // Create temporary file:
    $tmpFile = GeneralUtility::tempnam('EXTERNAL_URL');
    if (!$tmpFile) {
        return;
    }
    try {
        GeneralUtility::writeFile($tmpFile, $content);
        // Index that file. Using "TRUE" for second parameter to force indexing of
        // external URLs (mtime doesn't make sense, does it?)
        $this->indexRegularDocument($externalUrl, true, $tmpFile, 'html');
    } finally {
        // Remove the temporary file even if indexing fails.
        unlink($tmpFile);
    }
}
||
| 689 | |||
/**
 * Getting HTTP request headers of URL
 *
 * Lines without a colon (such as the HTTP status line) are skipped. Header
 * names and values are returned without surrounding whitespace.
 *
 * @param string $url The URL
 * @return mixed If no answer, returns FALSE. Otherwise an array where HTTP headers are keys
 */
public function getUrlHeaders($url)
{
    // Try to get the headers only
    $content = GeneralUtility::getUrl($url, 2);
    if ((string)$content === '') {
        return false;
    }
    // Compile headers. trimExplode() is asked to drop empty lines, so only
    // non-empty, trimmed lines arrive here.
    $retVal = [];
    foreach (GeneralUtility::trimExplode(LF, $content, true) as $line) {
        if (strpos($line, ':') === false) {
            // Not a "Name: value" pair, e.g. "HTTP/1.1 200 OK".
            continue;
        }
        [$headKey, $headValue] = explode(':', $line, 2);
        $retVal[trim($headKey)] = trim($headValue);
    }
    return $retVal;
}
||
| 714 | |||
/**
 * Tries the available path resolving strategies in a fixed order and returns
 * the result of the first one that yields a non-empty path.
 *
 * @param string $sourcePath
 * @return string Absolute path to file if file is local, else empty string
 */
protected function createLocalPath($sourcePath)
{
    $resolvers = [
        'createLocalPathUsingAbsRefPrefix',
        'createLocalPathUsingDomainURL',
        'createLocalPathFromAbsoluteURL',
        'createLocalPathFromRelativeURL'
    ];
    $resolvedPath = '';
    foreach ($resolvers as $resolver) {
        $resolvedPath = $this->{$resolver}($sourcePath);
        if ($resolvedPath != '') {
            return $resolvedPath;
        }
    }
    // No strategy matched: hand back the (empty) result of the last attempt.
    return $resolvedPath;
}
||
| 738 | |||
/**
 * Attempts to create a local file path by matching a current request URL.
 *
 * The source path must start with the site URL (TYPO3_SITE_URL); the rest is
 * resolved relative to the public path and has to pass isAllowedLocalFile().
 *
 * @param string $sourcePath
 * @return string Local path, or an empty string if the path does not belong to this site
 */
protected function createLocalPathUsingDomainURL($sourcePath)
{
    $baseURL = (string)GeneralUtility::getIndpEnv('TYPO3_SITE_URL');
    // An empty base URL must never match: strpos() with an empty needle
    // reports position 0 on PHP 8, which would treat every link as local.
    if ($baseURL === '' || strpos($sourcePath, $baseURL) !== 0) {
        return '';
    }
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($baseURL));

    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
||
| 759 | |||
| 760 | /** |
||
| 761 | * Attempts to create a local file path by matching absRefPrefix. This |
||
| 762 | * requires TSFE. If TSFE is missing, this function does nothing. |
||
| 763 | * |
||
| 764 | * @param string $sourcePath |
||
| 765 | * @return string |
||
| 766 | */ |
||
protected function createLocalPathUsingAbsRefPrefix($sourcePath)
{
    // Without a frontend controller there is no absRefPrefix to match against.
    if (!isset($GLOBALS['TSFE']) || !($GLOBALS['TSFE'] instanceof TypoScriptFrontendController)) {
        return '';
    }
    // The setting is optional; treat a missing value like an empty prefix
    // instead of reading an undefined index and passing null to strlen().
    $absRefPrefix = (string)($GLOBALS['TSFE']->config['config']['absRefPrefix'] ?? '');
    if ($absRefPrefix === '' || strpos($sourcePath, $absRefPrefix) !== 0) {
        return '';
    }
    // Strip the prefix and map the remainder below the public web path.
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, strlen($absRefPrefix));
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
||
| 783 | |||
| 784 | /** |
||
| 785 | * Attempts to create a local file path from the absolute URL without |
||
| 786 | * schema. |
||
| 787 | * |
||
| 788 | * @param string $sourcePath |
||
| 789 | * @return string |
||
| 790 | */ |
||
protected function createLocalPathFromAbsoluteURL($sourcePath)
{
    // Only paths starting with "/" are handled here. strpos() is used instead
    // of $sourcePath[0] so that an empty string does not trigger an
    // "uninitialized string offset" error.
    if (strpos((string)$sourcePath, '/') !== 0) {
        return '';
    }
    // Drop the leading slash and map the remainder below the public web path.
    $localPath = Environment::getPublicPath() . '/' . substr($sourcePath, 1);
    return self::isAllowedLocalFile($localPath) ? $localPath : '';
}
||
| 803 | |||
| 804 | /** |
||
| 805 | * Attempts to create a local file path from the relative URL. |
||
| 806 | * |
||
| 807 | * @param string $sourcePath |
||
| 808 | * @return string |
||
| 809 | */ |
||
protected function createLocalPathFromRelativeURL($sourcePath)
{
    // Anything that is not a relative URL is left to the other resolvers.
    if (!self::isRelativeURL($sourcePath)) {
        return '';
    }
    // Relative URLs are interpreted relative to the public web path.
    $candidate = Environment::getPublicPath() . '/' . $sourcePath;
    if (self::isAllowedLocalFile($candidate)) {
        return $candidate;
    }
    return '';
}
||
| 821 | |||
| 822 | /** |
||
| 823 | * Checks if URL is relative. |
||
| 824 | * |
||
| 825 | * @param string $url |
||
| 826 | * @return bool |
||
| 827 | */ |
||
protected static function isRelativeURL($url)
{
    $urlParts = parse_url($url);
    if (!is_array($urlParts)) {
        // parse_url() returns false for seriously malformed URLs. Such input
        // has neither a scheme nor a leading-slash path, so it keeps being
        // reported as relative (the result the previous implementation
        // produced), now without accessing offsets on a non-array.
        return true;
    }
    // Both components are optional in the parse_url() result.
    $scheme = $urlParts['scheme'] ?? '';
    $path = $urlParts['path'] ?? '';
    // Relative means: no scheme and the path does not start at the root.
    return $scheme === '' && strpos($path, '/') !== 0;
}
||
| 833 | |||
| 834 | /** |
||
| 835 | * Checks if the path points to the file inside the web site |
||
| 836 | * |
||
| 837 | * @param string $filePath |
||
| 838 | * @return bool |
||
| 839 | */ |
||
protected static function isAllowedLocalFile($filePath)
{
    // Collapse "../" segments first so the prefix check sees the real target.
    $filePath = GeneralUtility::resolveBackPath($filePath);
    // Compare against the public path INCLUDING a trailing slash. A bare
    // prefix match would also accept sibling directories whose name merely
    // starts with the public path (e.g. "<public>-backup/secret.pdf").
    $publicRoot = rtrim(Environment::getPublicPath(), '/') . '/';
    return strpos($filePath, $publicRoot) === 0 && is_file($filePath);
}
||
| 847 | |||
| 848 | /****************************************** |
||
| 849 | * |
||
| 850 | * Indexing; external files (PDF, DOC, etc) |
||
| 851 | * |
||
| 852 | ******************************************/ |
||
| 853 | /** |
||
| 854 | * Indexing a regular document given as $file (relative to public web path, local file) |
||
| 855 | * |
||
| 856 | * @param string $file Relative Filename, relative to public web path. It can also be an absolute path as long as it is inside the lockRootPath (validated with \TYPO3\CMS\Core\Utility\GeneralUtility::isAbsPath()). Finally, if $contentTmpFile is set, this value can be anything, most likely a URL |
||
| 857 | * @param bool $force If set, indexing is forced (despite content hashes, mtime etc). |
||
| 858 | * @param string $contentTmpFile Temporary file with the content to read it from (instead of $file). Used when the $file is a URL. |
||
| 859 | * @param string $altExtension File extension for temporary file. |
||
| 860 | */ |
||
public function indexRegularDocument($file, $force = false, $contentTmpFile = '', $altExtension = '')
{
    // The extension selects the external parser. Files without an extension
    // have no "extension" key in pathinfo(), so fall back to an empty string.
    $fI = pathinfo($file);
    $ext = $altExtension ?: strtolower($fI['extension'] ?? '');
    // Determine the file to read from:
    if ($contentTmpFile) {
        // Content was already fetched into a temporary file (e.g. for a URL).
        $absFile = $contentTmpFile;
    } else {
        // Relative names get the public web path prepended, absolute ones pass through.
        $absFile = GeneralUtility::isAbsPath($file)
            ? $file
            : GeneralUtility::getFileAbsFileName(Environment::getPublicPath() . '/' . $file);
        $absFile = GeneralUtility::isAllowedAbsPath($absFile) ? $absFile : '';
    }
    if (!$absFile || !@is_file($absFile)) {
        $this->log_setTSlogMessage('Indexing not possible; File "' . $absFile . '" not found or valid.');
        return;
    }
    // empty() also covers extensions that have no parser registered at all.
    if (empty($this->external_parsers[$ext])) {
        $this->log_setTSlogMessage('Indexing not possible; The extension "' . $ext . '" was not supported.');
        return;
    }
    // Indexing the document, one pass per content part reported by the parser:
    $fileInfo = stat($absFile);
    $cParts = $this->fileContentParts($ext, $absFile);
    foreach ($cParts as $cPKey) {
        $this->internal_log = [];
        $this->log_push('Index: ' . str_replace('.', '_', PathUtility::basename($file)) . ($cPKey ? '#' . $cPKey : ''), '');
        $Pstart = GeneralUtility::milliseconds();
        $subinfo = ['key' => $cPKey];
        // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
        $phash_arr = ($this->file_phash_arr = $this->setExtHashes($file, $subinfo));
        $check = $this->checkMtimeTstamp($fileInfo['mtime'], $phash_arr['phash']);
        if ($check > 0 || $force) {
            if ($check > 0) {
                $this->log_setTSlogMessage('Indexing needed, reason: ' . $this->reasons[$check], 1);
            } else {
                $this->log_setTSlogMessage('Indexing forced by flag', 1);
            }
            // Check external file counter ($force bypasses the limit):
            if ($this->externalFileCounter < $this->maxExternalFiles || $force) {
                // Divide into title,keywords,description and body:
                $this->log_push('Split content', '');
                $contentParts = $this->readFileContent($ext, $absFile, $cPKey);
                $this->log_pull();
                if (is_array($contentParts)) {
                    // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
                    $content_md5h = IndexedSearchUtility::md5inthash(implode('', $contentParts));
                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {
                        // Increment counter:
                        $this->externalFileCounter++;
                        // Splitting words
                        $this->log_push('Extract words from content', '');
                        $splitInWords = $this->processWordsInArrays($contentParts);
                        $this->log_pull();
                        // Analyze the indexed words.
                        $this->log_push('Analyze the extracted words', '');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $this->log_pull();
                        // Submitting page (phash) record
                        $this->log_push('Submitting page', '');
                        // The real creation time of a file cannot be determined here,
                        // so the values reported by stat() are stored.
                        $this->submitFilePage($phash_arr, $file, $subinfo, $ext, $fileInfo['mtime'], $fileInfo['ctime'], $fileInfo['size'], $content_md5h, $contentParts);
                        $this->log_pull();
                        // Check words and submit to word list if not there
                        $this->log_push('Check word list and submit words', '');
                        if (IndexedSearchUtility::isTableUsed('index_words')) {
                            $this->checkWordList($indexArr);
                            $this->submitWords($indexArr, $phash_arr['phash']);
                        }
                        $this->log_pull();
                        // Set parsetime
                        $this->updateParsetime($phash_arr['phash'], GeneralUtility::milliseconds() - $Pstart);
                    } else {
                        // Content is unchanged: only update the timestamp
                        $this->updateTstamp($phash_arr['phash'], $fileInfo['mtime']);
                        $this->log_setTSlogMessage('Indexing not needed, the contentHash, ' . $content_md5h . ', has not changed. Timestamp updated.');
                    }
                } else {
                    $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $this->log_setTSlogMessage('The limit of ' . $this->maxExternalFiles . ' has already been exceeded, so no indexing will take place this time.');
            }
        } else {
            $this->log_setTSlogMessage('Indexing not needed, reason: ' . $this->reasons[$check]);
        }
        // Setting a section-record for the file. This is done also if the file is not indexed.
        // Notice that section records are deleted when the page is indexed.
        $this->submitFile_section($phash_arr['phash']);
        $this->log_pull();
    }
}
||
| 958 | |||
| 959 | /** |
||
| 960 | * Reads the content of an external file being indexed. |
||
| 961 | * The content from the external parser MUST be returned in utf-8! |
||
| 962 | * |
||
| 963 | * @param string $fileExtension File extension, eg. "pdf", "doc" etc. |
||
| 964 | * @param string $absoluteFileName Absolute filename of file (must exist and be validated OK before calling function) |
||
| 965 | * @param string $sectionPointer Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be split.) |
||
| 966 | * @return array Standard content array (title, description, keywords, body keys) |
||
| 967 | */ |
||
public function readFileContent($fileExtension, $absoluteFileName, $sectionPointer)
{
    // Look up the parser registered for this extension.
    $parser = $this->external_parsers[$fileExtension];
    if (!is_object($parser)) {
        // No usable parser object: there is no content array to return.
        return null;
    }
    // Delegate the actual reading to the external document parser.
    return $parser->readFileContent($fileExtension, $absoluteFileName, $sectionPointer);
}
||
| 977 | |||
| 978 | /** |
||
| 979 | * Creates an array with pointers to divisions of document. |
||
| 980 | * |
||
| 981 | * @param string $ext File extension |
||
| 982 | * @param string $absFile Absolute filename (must exist and be validated OK before calling function) |
||
| 983 | * @return array Array of pointers to sections that the document should be divided into |
||
| 984 | */ |
||
| 985 | public function fileContentParts($ext, $absFile) |
||
| 993 | } |
||
| 994 | |||
| 995 | /** |
||
| 996 | * Splits non-HTML content (from external files for instance) |
||
| 997 | * |
||
| 998 | * @param string $content Input content (non-HTML) to index. |
||
| 999 | * @return array Array of content, having the key "body" set (plus "title", "description" and "keywords", but empty) |
||
| 1000 | * @see splitHTMLContent() |
||
| 1001 | */ |
||
public function splitRegularContent($content)
{
    // Start from the default content array and put the whole input into
    // "body". array_replace() keeps the position of an existing "body" key
    // and appends it otherwise, just like a plain key assignment.
    return array_replace($this->defaultContentArray, ['body' => $content]);
}
||
| 1008 | |||
| 1009 | /********************************** |
||
| 1010 | * |
||
| 1011 | * Analysing content, Extracting words |
||
| 1012 | * |
||
| 1013 | **********************************/ |
||
| 1014 | /** |
||
| 1015 | * Convert character set and HTML entities in the value of input content array keys |
||
| 1016 | * |
||
| 1017 | * @param array $contentArr Standard content array |
||
| 1018 | * @param string $charset Charset of the input content (converted to utf-8) |
||
| 1019 | */ |
||
public function charsetEntity2utf8(&$contentArr, $charset)
{
    foreach ($contentArr as $key => $value) {
        // Empty parts need neither conversion nor decoding.
        if ((string)$value === '') {
            continue;
        }
        // Convert charset if necessary
        if ($charset !== 'utf-8') {
            $value = mb_convert_encoding($value, 'utf-8', $charset);
        }
        // Decode all numeric / html-entities in the string to real characters.
        // Flags and charset are passed explicitly: the defaults depend on the
        // PHP version (single-quote entities were skipped before PHP 8.1) and
        // on the "default_charset" ini setting, while the content is known to
        // be utf-8 at this point.
        $contentArr[$key] = html_entity_decode($value, ENT_QUOTES | ENT_HTML401, 'UTF-8');
    }
}
||
| 1033 | |||
| 1034 | /** |
||
| 1035 | * Processing words in the array from split*Content -functions |
||
| 1036 | * |
||
| 1037 | * @param array $contentArr Array of content to index, see splitHTMLContent() and splitRegularContent() |
||
| 1038 | * @return array Content input array modified so each key is now an array of words (without duplicates for title, keywords and description) |
||
| 1039 | */ |
||
public function processWordsInArrays($contentArr)
{
    // Let the lexer turn every content part into a list of words.
    foreach (array_keys($contentArr) as $part) {
        $contentArr[$part] = $this->lexerObj->split2Words($contentArr[$part]);
    }
    // Header-like parts must not contain the same word twice.
    foreach (['title', 'keywords', 'description'] as $headerPart) {
        $contentArr[$headerPart] = array_unique($contentArr[$headerPart]);
    }
    return $contentArr;
}
||
| 1053 | |||
| 1054 | /** |
||
| 1055 | * Extracts the sample description text from the content array. |
||
| 1056 | * |
||
| 1057 | * @param array $contentArr Content array |
||
| 1058 | * @return string Description string |
||
| 1059 | */ |
||
public function bodyDescription($contentArr)
{
    // Configured description length, forced into 0..255 with 200 as default.
    $limit = MathUtility::forceIntegerInRange($this->conf['index_descrLgd'], 0, 255, 200);
    if (!$limit) {
        // A length of zero disables the description.
        return '';
    }
    // Collapse every run of whitespace into a single space, then shorten.
    $collapsed = preg_replace('/\s+/u', ' ', $contentArr['body']);
    return mb_strcut($collapsed, 0, $limit, 'utf-8');
}
||
| 1072 | |||
| 1073 | /** |
||
| 1074 | * Analyzes content to use for indexing, |
||
| 1075 | * |
||
| 1076 | * @param array $content Standard content array: an array with the keys title,keywords,description and body, which all contain an array of words. |
||
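| 1077 | * @return array Index array keyed by word; each entry holds "hash", "metaphone" and "count", plus "cmp" (priority bits for words found in title/keywords/description) and "first" (position of the first occurrence in the body) where applicable |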
||
| 1078 | */ |
||
public function indexAnalyze($content)
{
    $indexArr = [];
    // Header parts and the bit position each one contributes to a word's
    // "cmp" mask, processed in this order.
    $headerOffsets = ['title' => 7, 'keywords' => 6, 'description' => 5];
    foreach ($headerOffsets as $part => $offset) {
        $this->analyzeHeaderinfo($indexArr, $content, $part, $offset);
    }
    // The body is analyzed last.
    $this->analyzeBody($indexArr, $content);
    return $indexArr;
}
||
| 1088 | |||
| 1089 | /** |
||
| 1090 | * Calculates relevant information for headercontent |
||
| 1091 | * |
||
| 1092 | * @param array $retArr Index array, passed by reference |
||
| 1093 | * @param array $content Standard content array |
||
| 1094 | * @param string $key Key from standard content array |
||
| 1095 | * @param int $offset Bit-wise priority to type |
||
| 1096 | */ |
||
public function analyzeHeaderinfo(&$retArr, $content, $key, $offset)
{
    foreach ($content[$key] ?? [] as $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Priority used for flagBitMask feature (see extension configuration).
        // "cmp" does not exist yet for a word seen for the first time (or one
        // only registered from the body), so start from 0 instead of reading
        // an undefined key.
        $retArr[$val]['cmp'] = ($retArr[$val]['cmp'] ?? 0) | 2 ** $offset;
        // Increase number of occurrences; a missing counter starts at 0.
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
||
| 1120 | |||
| 1121 | /** |
||
| 1122 | * Calculates relevant information for bodycontent |
||
| 1123 | * |
||
| 1124 | * @param array $retArr Index array, passed by reference |
||
| 1125 | * @param array $content Standard content array |
||
| 1126 | */ |
||
public function analyzeBody(&$retArr, $content)
{
    foreach ($content['body'] ?? [] as $key => $val) {
        // Cut after 60 chars because the index_words.baseword varchar field has this length. This MUST be the same.
        $val = substr($val, 0, 60);
        if (!isset($retArr[$val])) {
            // First occurrence (used for ranking results)
            $retArr[$val]['first'] = $key;
            // Word ID (wid)
            $retArr[$val]['hash'] = IndexedSearchUtility::md5inthash($val);
            // Metaphone value is also only 60 chars long
            $metaphone = $this->enableMetaphoneSearch ? substr($this->metaphone($val, $this->storeMetaphoneInfoAsWords), 0, 60) : '';
            $retArr[$val]['metaphone'] = $metaphone;
        }
        // Build metaphone fulltext string (can be used for fulltext indexing)
        if ($this->storeMetaphoneInfoAsWords) {
            $this->metaphoneContent .= ' ' . $retArr[$val]['metaphone'];
        }
        // Increase number of occurrences. The counter does not exist yet for
        // a word seen for the first time, so start from 0 instead of
        // incrementing an undefined key.
        $retArr[$val]['count'] = ($retArr[$val]['count'] ?? 0) + 1;
        $this->wordcount++;
    }
}
||
| 1150 | |||
| 1151 | /** |
||
| 1152 | * Creating metaphone based hash from input word |
||
| 1153 | * |
||
| 1154 | * @param string $word Word to convert |
||
| 1155 | * @param bool $returnRawMetaphoneValue If set, returns the raw metaphone value (not hashed) |
||
| 1156 | * @return mixed Metaphone hash integer (or raw value, string) |
||
| 1157 | */ |
||
public function metaphone($word, $returnRawMetaphoneValue = false)
{
    // Prefer the configured metaphone object; otherwise use PHP's native
    // metaphone() function.
    $rawValue = is_object($this->metaphoneObj)
        ? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
        : metaphone($word);
    if ($returnRawMetaphoneValue) {
        return $rawValue;
    }
    if ($rawValue === '') {
        // Nothing to hash.
        return 0;
    }
    // Create hash and return integer
    return IndexedSearchUtility::md5inthash($rawValue);
}
||
| 1176 | |||
| 1177 | /******************************** |
||
| 1178 | * |
||
| 1179 | * SQL; TYPO3 Pages |
||
| 1180 | * |
||
| 1181 | *******************************/ |
||
| 1182 | /** |
||
| 1183 | * Updates db with information about the page (TYPO3 page, not external media) |
||
| 1184 | */ |
||
public function submitPage()
{
    // Remove any current data for this phash:
    $this->removeOldIndexedPages($this->hash['phash']);
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Static page arguments are optional and only stored when given as array.
    $staticPageArguments = $this->conf['staticPageArguments'] ?? null;
    // Setting new phash_row
    $fields = [
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'static_page_arguments' => is_array($staticPageArguments) ? json_encode($staticPageArguments) : null,
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->conf['id'],
        'data_page_type' => $this->conf['type'],
        'data_page_mp' => $this->conf['MP'],
        'gr_list' => $this->conf['gr_list'],
        // item_type 0 = TYPO3 page
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => (int)$this->conf['mtime'],
        'item_size' => strlen($this->conf['content']),
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        // Creation date of page
        'item_crdate' => $this->conf['crdate'],
        // Sys language uid of the page. Should reflect which language it DOES actually display!
        'sys_language_uid' => $this->conf['sys_language_uid'],
        'externalUrl' => 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);
    }
    // PROCESSING index_section
    $this->submit_section($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_grlist
    $this->submit_grlist($this->hash['phash'], $this->hash['phash']);
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ', $this->contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        // mb_strcut() honours the same byte limit as substr() but never cuts
        // a multi-byte character in half, so the stored text stays valid utf-8.
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, $fullTextDataLength, 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if (!empty($this->indexerConfig['debugMode'])) {
        // The excerpts are cut on character boundaries as well: json_encode()
        // returns false for strings containing broken utf-8 sequences, which
        // would discard the whole debug record.
        $fields = [
            'phash' => $this->hash['phash'],
            'debuginfo' => json_encode([
                'external_parsers initialized' => array_keys($this->external_parsers),
                'conf' => array_merge($this->conf, ['content' => mb_strcut((string)$this->conf['content'], 0, 1000, 'utf-8')]),
                'contentParts' => array_merge($this->contentParts, ['body' => mb_strcut((string)$this->contentParts['body'], 0, 1000, 'utf-8')]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $fields);
        }
    }
}
||
| 1261 | |||
| 1262 | /** |
||
| 1263 | * Stores gr_list in the database. |
||
| 1264 | * |
||
| 1265 | * @param int $hash Search result record phash |
||
| 1266 | * @param int $phash_x Actual phash of current content |
||
| 1267 | * @see update_grlist() |
||
| 1268 | */ |
||
public function submit_grlist($hash, $phash_x)
{
    // Assemble the gr_list record: the group list itself plus its hash.
    $groupList = $this->conf['gr_list'];
    $record = [
        'phash' => $hash,
        'phash_x' => $phash_x,
        'hash_gr_list' => IndexedSearchUtility::md5inthash($groupList),
        'gr_list' => $groupList
    ];
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }
    GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist')
        ->insert('index_grlist', $record);
}
||
| 1284 | |||
| 1285 | /** |
||
| 1286 | * Stores section |
||
| 1287 | * $hash and $hash_t3 are the same for TYPO3 pages, but different when it is external files. |
||
| 1288 | * |
||
| 1289 | * @param int $hash phash of TYPO3 parent search result record |
||
| 1290 | * @param int $hash_t3 phash of the file indexation search record |
||
| 1291 | */ |
||
| 1292 | public function submit_section($hash, $hash_t3) |
||
| 1304 | } |
||
| 1305 | } |
||
| 1306 | |||
| 1307 | /** |
||
| 1308 | * Removes records for the indexed page, $phash |
||
| 1309 | * |
||
| 1310 | * @param int $phash phash value to flush |
||
| 1311 | */ |
||
public function removeOldIndexedPages($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Removing old registrations for all tables. Because the pages are TYPO3 pages
    // there can be nothing else than 1-1 relations here.
    $pageHash = (int)$phash;
    foreach (['index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (!IndexedSearchUtility::isTableUsed($tableName)) {
            continue;
        }
        $pool->getConnectionForTable($tableName)->delete($tableName, ['phash' => $pageHash]);
    }

    // Removing all index_section records with phash_t3 set to this hash (this includes such
    // records set for external media on the page as well!). The re-insert of these records
    // is done in indexRegularDocument($file).
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }
    $pool->getConnectionForTable('index_section')->delete('index_section', ['phash_t3' => $pageHash]);
}
||
| 1332 | |||
| 1333 | /******************************** |
||
| 1334 | * |
||
| 1335 | * SQL; External media |
||
| 1336 | * |
||
| 1337 | *******************************/ |
||
| 1338 | /** |
||
| 1339 | * Updates db with information about the file |
||
| 1340 | * |
||
| 1341 | * @param array $hash Array with phash and phash_grouping keys for file |
||
| 1342 | * @param string $file File name |
||
| 1343 | * @param array $subinfo Array of "static_page_arguments" for files: This is for instance the page index for a PDF file (other document types it will be a zero) |
||
| 1344 | * @param string $ext File extension determining the type of media. |
||
| 1345 | * @param int $mtime Modification time of file. |
||
| 1346 | * @param int $ctime Creation time of file. |
||
| 1347 | * @param int $size Size of file in bytes |
||
| 1348 | * @param int $content_md5h Content HASH value. |
||
| 1349 | * @param array $contentParts Standard content array (using only title and body for a file) |
||
| 1350 | */ |
||
public function submitFilePage($hash, $file, $subinfo, $ext, $mtime, $ctime, $size, $content_md5h, $contentParts)
{
    // Find item type: the parser may map the extension to an item type; when
    // it has no (or an empty) mapping, the extension itself is stored.
    $storeItemType = ($this->external_parsers[$ext]->ext2itemtype_map[$ext] ?? '') ?: $ext;
    // Remove any current data for this phash:
    $this->removeOldIndexedFiles($hash['phash']);
    $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
    // Split filename. Local file names have no "scheme" component, and
    // parse_url() may return false, hence the !empty() check below.
    $fileParts = parse_url($file);
    // Setting new
    $fields = [
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'static_page_arguments' => json_encode($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        // Fall back to the file name when the document provides no title.
        'item_title' => trim((string)($contentParts['title'] ?? '')) ?: PathUtility::basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $GLOBALS['EXEC_TIME'],
        'crdate' => $GLOBALS['EXEC_TIME'],
        'gr_list' => $this->conf['gr_list'],
        // A file name carrying a URL scheme is flagged as external URL.
        'externalUrl' => !empty($fileParts['scheme']) ? 1 : 0,
        'recordUid' => (int)$this->conf['recordUid'],
        'freeIndexUid' => (int)$this->conf['freeIndexUid'],
        'freeIndexSetId' => (int)$this->conf['freeIndexSetId'],
        'sys_language_uid' => (int)$this->conf['sys_language_uid']
    ];
    if (IndexedSearchUtility::isTableUsed('index_phash')) {
        $connectionPool->getConnectionForTable('index_phash')->insert('index_phash', $fields);
    }
    // PROCESSING index_fulltext
    $fields = [
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts),
        'metaphonedata' => $this->metaphoneContent
    ];
    $fullTextDataLength = (int)($this->indexerConfig['fullTextDataLength'] ?? 0);
    if ($fullTextDataLength > 0) {
        // mb_strcut() honours the same byte limit as substr() but never cuts
        // a multi-byte character in half, so the stored text stays valid utf-8.
        $fields['fulltextdata'] = mb_strcut($fields['fulltextdata'], 0, $fullTextDataLength, 'utf-8');
    }
    if (IndexedSearchUtility::isTableUsed('index_fulltext')) {
        $connectionPool->getConnectionForTable('index_fulltext')->insert('index_fulltext', $fields);
    }
    // PROCESSING index_debug
    if (!empty($this->indexerConfig['debugMode'])) {
        // The body excerpt is cut on a character boundary as well: json_encode()
        // returns false for strings containing broken utf-8 sequences, which
        // would discard the whole debug record.
        $fields = [
            'phash' => $hash['phash'],
            'debuginfo' => json_encode([
                'static_page_arguments' => $subinfo,
                'contentParts' => array_merge($contentParts, ['body' => mb_strcut((string)$contentParts['body'], 0, 1000, 'utf-8')]),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString
            ])
        ];
        if (IndexedSearchUtility::isTableUsed('index_debug')) {
            $connectionPool->getConnectionForTable('index_debug')->insert('index_debug', $fields);
        }
    }
}
||
| 1422 | |||
| 1423 | /** |
||
| 1424 | * Stores file gr_list for a file IF it does not exist already |
||
| 1425 | * |
||
| 1426 | * @param int $hash phash value of file |
||
| 1427 | */ |
||
| 1428 | public function submitFile_grlist($hash) |
||
| 1466 | } |
||
| 1467 | } |
||
| 1468 | |||
/**
 * Registers a section record for a file unless one already exists.
 *
 * Counts the index_section rows whose "phash" equals $hash and whose
 * "page_id" equals the current page id ($this->conf['id']). Only when no
 * such row is found, submit_section() is called with the file hash and the
 * phash of the current page.
 *
 * Does nothing when the index_section table is not in use.
 *
 * @param int $hash phash value of file
 */
public function submitFile_section($hash)
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    $sectionQuery = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getQueryBuilderForTable('index_section');

    // Both constraints are combined (AND) by where() below.
    $matchesFile = $sectionQuery->expr()->eq(
        'phash',
        $sectionQuery->createNamedParameter($hash, \PDO::PARAM_INT)
    );
    $matchesPage = $sectionQuery->expr()->eq(
        'page_id',
        $sectionQuery->createNamedParameter($this->conf['id'], \PDO::PARAM_INT)
    );

    $existingSections = (int)$sectionQuery
        ->count('phash')
        ->from('index_section')
        ->where($matchesFile, $matchesPage)
        ->execute()
        ->fetchColumn();

    if ($existingSections !== 0) {
        // A section for this file on this page is already registered.
        return;
    }

    $this->submit_section($hash, $this->hash['phash']);
}
||
| 1502 | |||
/**
 * Deletes all rows belonging to $phash from the index tables
 * index_phash, index_grlist, index_fulltext and index_debug.
 *
 * Tables that are reported as not in use are skipped.
 *
 * @param int $phash phash value to flush
 */
public function removeOldIndexedFiles($phash)
{
    $pool = GeneralUtility::makeInstance(ConnectionPool::class);
    $phashCondition = ['phash' => (int)$phash];

    foreach (['index_phash', 'index_grlist', 'index_fulltext', 'index_debug'] as $tableName) {
        if (IndexedSearchUtility::isTableUsed($tableName)) {
            $pool->getConnectionForTable($tableName)->delete($tableName, $phashCondition);
        }
    }
}
||
| 1520 | |||
| 1521 | /******************************** |
||
| 1522 | * |
||
| 1523 | * SQL Helper functions |
||
| 1524 | * |
||
| 1525 | *******************************/ |
||
/**
 * Decides, based on the index_phash row for $phash, whether the current
 * page/file has to be (re-)indexed.
 *
 * The returned code corresponds to the keys of $this->reasons:
 *   4  no index_phash row exists (or the table is not in use): index
 *   1  tstamp_maxAge is set and exceeded: index
 *  -2  tstamp_minAge is set and not yet exceeded: do not index
 *   3  min age passed (or not set) and no $mtime was given: index
 *   2  min age passed (or not set) and $mtime differs from the stored item_mtime: index
 *  -1  $mtime equals the stored item_mtime: do not index
 *
 * Side effect for result -1: when no tstamp_maxAge is configured the row's
 * tstamp is refreshed via updateTstamp(); in either case a message is logged.
 *
 * @param int $mtime mtime value to test against limits and indexed page (usually this is the mtime of the cached document)
 * @param int $phash "phash" used to select any already indexed page to see what its mtime is.
 * @return int Result code: <0 = no indexing, >0 = do indexing (see $this->reasons)
 */
public function checkMtimeTstamp($mtime, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        // Not indexed (not in index_phash)
        return 4;
    }

    $indexedRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(['item_mtime', 'tstamp'], 'index_phash', ['phash' => (int)$phash], [], [], 1)
        ->fetch();

    if (empty($indexedRow)) {
        // Page has never been indexed (is not represented in the index_phash table).
        return 4;
    }

    if ($this->tstamp_maxAge && $indexedRow['tstamp'] + $this->tstamp_maxAge < $GLOBALS['EXEC_TIME']) {
        // The configured max-age was exceeded for the document and thus it's indexed.
        return 1;
    }

    if ($this->tstamp_minAge && $indexedRow['tstamp'] + $this->tstamp_minAge >= $GLOBALS['EXEC_TIME']) {
        // The minimum age was not exceeded
        return -2;
    }

    // From here on: min age is either not configured or has been exceeded.
    if (!$mtime) {
        // No mtime to compare against, so the page is indexed.
        return 3;
    }

    // Loose comparison on purpose: the database value and $mtime may differ in type.
    if ($indexedRow['item_mtime'] != $mtime) {
        // mtime differs from the one stored in index_phash: re-index.
        return 2;
    }

    // mtime matched the document, so no changes detected and no content updated
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set (' . ($indexedRow['tstamp'] + $this->tstamp_maxAge - $GLOBALS['EXEC_TIME']) . ' seconds to expire time).', 1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.', 1);
    }

    return -1;
}
||
| 1591 | |||
/**
 * Looks for an index_phash row that has the same "phash_grouping" and the
 * same content hash as the content currently being indexed.
 *
 * @return mixed TRUE if no such row exists (or index_phash is not in use), meaning the page needs to be indexed; otherwise the matching row as an array containing "phash", to which the grlist record should be related.
 */
public function checkContentHash()
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $matchingRow = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash')
        ->select(
            ['phash'],
            'index_phash',
            [
                'phash_grouping' => (int)$this->hash['phash_grouping'],
                'contentHash' => (int)$this->content_md5h
            ],
            [],
            [],
            1
        )
        ->fetch();

    // Identical content within the same grouping was already indexed: hand back that row.
    return empty($matchingRow) ? true : $matchingRow;
}
||
| 1622 | |||
/**
 * Checks whether an external document with the given grouping hash and
 * content hash is already present in index_phash.
 *
 * @param int $hashGr phash value to check (phash_grouping)
 * @param int $content_md5h Content hash to check
 * @return bool TRUE if no matching row exists (or index_phash is not in use), i.e. the document needs to be indexed
 */
public function checkExternalDocContentHash($hashGr, $content_md5h)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return true;
    }

    $phashConnection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $matches = (int)$phashConnection->count(
        '*',
        'index_phash',
        [
            'phash_grouping' => (int)$hashGr,
            'contentHash' => (int)$content_md5h
        ]
    );

    return $matches === 0;
}
||
| 1650 | |||
/**
 * Tells whether index_grlist contains at least one row whose "phash_x"
 * equals the given value (the "real" phash of the current content, not the
 * linked-to phash of the common search result page).
 *
 * @param int $phash_x Phash integer to test.
 * @return bool TRUE if a row exists; FALSE if none exists or index_grlist is not in use
 */
public function is_grlist_set($phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return false;
    }

    $grlistConnection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');
    $matches = (int)$grlistConnection->count(
        'phash_x',
        'index_grlist',
        ['phash_x' => (int)$phash_x]
    );

    return $matches > 0;
}
||
| 1673 | |||
/**
 * Writes a grlist entry for $phash and the current gr_list unless one exists.
 *
 * The lookup matches on "phash" and on the integer hash of
 * $this->conf['gr_list']. When nothing is found, submit_grlist() is called
 * and a log message is recorded. Does nothing when index_grlist is not in use.
 *
 * @param int $phash phash of the search result that should be found
 * @param int $phash_x The real phash of the current content. The two values are different when a page with userlogin turns out to contain the exact same content as another already indexed version of the page; This is the whole reason for the grlist table in fact...
 * @see submit_grlist()
 */
public function update_grlist($phash, $phash_x)
{
    if (!IndexedSearchUtility::isTableUsed('index_grlist')) {
        return;
    }

    $grlistConnection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_grlist');
    $existingEntries = (int)$grlistConnection->count(
        'phash',
        'index_grlist',
        [
            'phash' => (int)$phash,
            'hash_gr_list' => IndexedSearchUtility::md5inthash($this->conf['gr_list'])
        ]
    );

    if ($existingEntries !== 0) {
        return;
    }

    $this->submit_grlist($phash, $phash_x);
    $this->log_setTSlogMessage('Inserted gr_list \'' . $this->conf['gr_list'] . '\' for phash \'' . $phash . '\'', 1);
}
||
| 1701 | |||
/**
 * Sets the "tstamp" of the index_phash row(s) for $phash to the current
 * execution time and, if a truthy $mtime is given, also stores it as
 * "item_mtime".
 *
 * Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value
 * @param int $mtime If set, update the mtime field to this value.
 */
public function updateTstamp($phash, $mtime = 0)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $changes = $mtime
        ? ['tstamp' => $GLOBALS['EXEC_TIME'], 'item_mtime' => (int)$mtime]
        : ['tstamp' => $GLOBALS['EXEC_TIME']];

    $phashConnection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $phashConnection->update('index_phash', $changes, ['phash' => (int)$phash]);
}
||
| 1732 | |||
/**
 * Writes $this->conf['freeIndexSetId'] (cast to int) into the
 * "freeIndexSetId" column of the index_phash row(s) for $phash.
 *
 * Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value
 */
public function updateSetId($phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $phashConnection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $changes = ['freeIndexSetId' => (int)$this->conf['freeIndexSetId']];
    $phashConnection->update('index_phash', $changes, ['phash' => (int)$phash]);
}
||
| 1756 | |||
/**
 * Stores $parsetime (cast to int) in the "parsetime" column of the
 * index_phash row(s) for $phash.
 *
 * Does nothing when index_phash is not in use.
 *
 * @param int $phash phash value.
 * @param int $parsetime Parsetime value to set.
 */
public function updateParsetime($phash, $parsetime)
{
    if (!IndexedSearchUtility::isTableUsed('index_phash')) {
        return;
    }

    $phashConnection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_phash');
    $changes = ['parsetime' => (int)$parsetime];
    $phashConnection->update('index_phash', $changes, ['phash' => (int)$phash]);
}
||
| 1781 | |||
/**
 * Rewrites the root-line columns of all index_section rows that belong to
 * the current page ($this->conf['id']).
 *
 * The column values are collected by getRootLineFields(). Does nothing when
 * index_section is not in use.
 */
public function updateRootline()
{
    if (!IndexedSearchUtility::isTableUsed('index_section')) {
        return;
    }

    // Filled by reference.
    $rootLineColumns = [];
    $this->getRootLineFields($rootLineColumns);

    $sectionConnection = GeneralUtility::makeInstance(ConnectionPool::class)
        ->getConnectionForTable('index_section');
    $sectionConnection->update(
        'index_section',
        $rootLineColumns,
        ['page_id' => (int)$this->conf['id']]
    );
}
||
| 1804 | |||
/**
 * Adds the root-line field values to the given field array.
 *
 * "rl0", "rl1" and "rl2" are always set from levels 0, 1 and 2 of
 * $this->conf['rootline_uids']. Additional fields can be configured in
 * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']
 * as a map of field name => root-line level; these are applied afterwards
 * and may therefore override the standard fields.
 *
 * A level that is not present in the root line (for example a page that
 * sits less than three levels deep) results in 0 instead of triggering an
 * undefined-offset notice.
 *
 * @param array $fieldArray Field array, passed by reference
 */
public function getRootLineFields(array &$fieldArray)
{
    $rootLineUids = $this->conf['rootline_uids'] ?? [];

    foreach (['rl0' => 0, 'rl1' => 1, 'rl2' => 2] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }

    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] ?? [] as $fieldName => $rootLineLevel) {
        $fieldArray[$fieldName] = (int)($rootLineUids[$rootLineLevel] ?? 0);
    }
}
||
| 1820 | |||
| 1821 | /******************************** |
||
| 1822 | * |
||
| 1823 | * SQL; Submitting words |
||
| 1824 | * |
||
| 1825 | *******************************/ |
||
/**
 * Inserts those words of the word list into index_words that are not stored yet.
 *
 * First the number of index_words rows whose "wid" matches one of the word
 * hashes is counted. Only if that number differs from the size of the word
 * list are the already known base words fetched and removed from the list,
 * after which every remaining word is inserted.
 *
 * Does nothing when index_words is not in use or the word list is empty.
 *
 * @param array $wordListArray Word List array (where each word has information about position etc). Keys are the base words; each entry is read for "hash" and "metaphone".
 */
public function checkWordList($wordListArray)
{
    if (!IndexedSearchUtility::isTableUsed('index_words') || empty($wordListArray)) {
        return;
    }

    $expectedWords = count($wordListArray);
    $wordIds = array_map('intval', array_column($wordListArray, 'hash'));

    $countQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('index_words');
    $knownWords = (int)$countQuery->count('baseword')
        ->from('index_words')
        ->where(
            $countQuery->expr()->in(
                'wid',
                $countQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute()
        ->fetchColumn();

    if ($knownWords === $expectedWords) {
        // Every word is already stored.
        return;
    }

    $wordsConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('index_words');
    $lookupQuery = $wordsConnection->createQueryBuilder();
    $knownRows = $lookupQuery->select('baseword')
        ->from('index_words')
        ->where(
            $lookupQuery->expr()->in(
                'wid',
                $lookupQuery->createNamedParameter($wordIds, Connection::PARAM_INT_ARRAY)
            )
        )
        ->execute();

    $this->log_setTSlogMessage('Inserting words: ' . ($expectedWords - $knownWords), 1);

    // Drop every word that the database already knows.
    while ($knownRow = $knownRows->fetch()) {
        unset($wordListArray[$knownRow['baseword']]);
    }

    foreach ($wordListArray as $baseWord => $wordInfo) {
        // A duplicate-key error will occur here if a word was NOT removed by the unset() above.
        // However, as long as the words are NOT longer than 60 chars (the baseword varchar is
        // 60 characters...) this is not a problem.
        $wordsConnection->insert(
            'index_words',
            [
                'wid' => $wordInfo['hash'],
                'baseword' => $baseWord,
                'metaphone' => $wordInfo['metaphone']
            ]
        );
    }
}
||
| 1886 | |||
/**
 * Stores the relations between the words of a document and its phash in index_rel.
 *
 * All existing index_rel rows for $phash are deleted first. Afterwards one
 * row per word is bulk-inserted, skipping words whose hash is flagged in
 * index_words with a non-zero "is_stopword".
 *
 * Does nothing when index_rel is not in use.
 *
 * @param array $wordList Word list array; each entry is read for "hash", "count", "first" and "cmp"
 * @param int $phash phash value
 */
public function submitWords($wordList, $phash)
{
    if (!IndexedSearchUtility::isTableUsed('index_rel')) {
        return;
    }

    $pool = GeneralUtility::makeInstance(ConnectionPool::class);

    // Collect the ids of all stop words as a lookup set.
    $stopWordQuery = $pool->getQueryBuilderForTable('index_words');
    $stopWordRows = $stopWordQuery->select('wid')
        ->from('index_words')
        ->where(
            $stopWordQuery->expr()->neq('is_stopword', $stopWordQuery->createNamedParameter(0, \PDO::PARAM_INT))
        )
        ->groupBy('wid')
        ->execute();

    $stopWordIds = [];
    while ($stopWordRow = $stopWordRows->fetch()) {
        $stopWordIds[$stopWordRow['wid']] = true;
    }

    $relationConnection = $pool->getConnectionForTable('index_rel');
    $relationConnection->delete('index_rel', ['phash' => (int)$phash]);

    $relationRows = [];
    foreach ($wordList as $wordInfo) {
        if (!isset($stopWordIds[$wordInfo['hash']])) {
            // Value order must match the column list passed to bulkInsert() below.
            $relationRows[] = [
                (int)$phash,
                (int)$wordInfo['hash'],
                (int)$wordInfo['count'],
                (int)$wordInfo['first'],
                $this->freqMap($wordInfo['count'] / $this->wordcount),
                $wordInfo['cmp'] & $this->flagBitMask
            ];
        }
    }

    if (empty($relationRows)) {
        return;
    }

    $relationConnection->bulkInsert(
        'index_rel',
        $relationRows,
        ['phash', 'wid', 'count', 'first', 'freq', 'flags']
    );
}
||
| 1935 | |||
/**
 * Maps a frequency between its relative form and its scaled form.
 *
 * The map factor is $this->freqMax * 100 * $this->freqRange.
 * - For $freq <= 1 the value is multiplied by the map factor and capped at
 *   $this->freqRange.
 * - For $freq > 1 the value is divided by the map factor (mapping back).
 *
 * NOTE: the result is not cast to an integer; apart from the capped case it
 * is whatever the multiplication/division yields.
 *
 * @param float $freq Frequency
 * @return int|float Frequency in range.
 */
public function freqMap($freq)
{
    $mapFactor = $this->freqMax * 100 * $this->freqRange;

    if ($freq <= 1) {
        $scaled = $freq * $mapFactor;
        if ($scaled > $this->freqRange) {
            return $this->freqRange;
        }
        return $scaled;
    }

    return $freq / $mapFactor;
}
||
| 1954 | |||
| 1955 | /******************************** |
||
| 1956 | * |
||
| 1957 | * Hashing |
||
| 1958 | * |
||
| 1959 | *******************************/ |
||
/**
 * Computes the search hashes for a TYPO3 page and stores them in $this->hash.
 *
 * "phash_grouping" is the integer hash of the serialized combination of page
 * id, type, language, mount point and static page arguments. "phash" uses the
 * same data extended by the gr_list, so that page variants depending on the
 * login are told apart as well.
 */
public function setT3Hashes()
{
    $staticArguments = $this->conf['staticPageArguments'];

    // Key order and value types are part of the hash input and must not change.
    $pageIdentity = [
        'id' => (int)$this->conf['id'],
        'type' => (int)$this->conf['type'],
        'sys_lang' => (int)$this->conf['sys_language_uid'],
        'MP' => (string)$this->conf['MP'],
        'staticPageArguments' => is_array($staticArguments) ? json_encode($staticArguments) : null,
    ];
    $this->hash['phash_grouping'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));

    $pageIdentity['gr_list'] = (string)$this->conf['gr_list'];
    $this->hash['phash'] = IndexedSearchUtility::md5inthash(serialize($pageIdentity));
}
||
| 1979 | |||
/**
 * Computes the search hashes for an external file.
 *
 * "phash_grouping" is derived from the file name alone, "phash" from the
 * file name together with $subinfo.
 *
 * @param string $file File name / path which identifies it on the server
 * @param array $subinfo Additional content identifying the (subpart of) content. For instance; PDF files are divided into groups of pages for indexing.
 * @return array Array with "phash_grouping" and "phash" inside.
 */
public function setExtHashes($file, $subinfo = [])
{
    $identity = ['file' => $file];
    $groupingHash = IndexedSearchUtility::md5inthash(serialize($identity));

    $identity['subinfo'] = $subinfo;

    return [
        'phash_grouping' => $groupingHash,
        'phash' => IndexedSearchUtility::md5inthash(serialize($identity)),
    ];
}
||
| 2001 | |||
| 2002 | /********************************* |
||
| 2003 | * |
||
| 2004 | * Internal logging functions |
||
| 2005 | * |
||
| 2006 | *********************************/ |
||
/**
 * Forwards a push() call to the time tracker.
 *
 * @param string $msg Title to set
 * @param string $key Key, handed to the time tracker unchanged as second argument
 */
public function log_push($msg, $key)
{
    $tracker = $this->timeTracker;
    $tracker->push($msg, $key);
}
||
| 2017 | |||
/**
 * Forwards a pull() call to the time tracker.
 */
public function log_pull()
{
    $tracker = $this->timeTracker;
    $tracker->pull();
}
||
| 2025 | |||
/**
 * Passes a log message to the time tracker and additionally appends it to
 * the indexer's own $this->internal_log list.
 *
 * @param string $msg Message to set
 * @param int $errorNum Error number, handed to the time tracker only
 */
public function log_setTSlogMessage($msg, $errorNum = 0)
{
    $tracker = $this->timeTracker;
    $tracker->setTSlogMessage($msg, $errorNum);

    $this->internal_log[] = $msg;
}
||
| 2037 | |||
| 2038 | /** |
||
| 2039 | * Makes sure that keywords are space-separated. This is important for their |
||
| 2040 | * proper displaying as a part of fulltext index. |
||
| 2041 | * |
||
| 2042 | * @param string $keywordList |
||
| 2043 | * @return string |
||
| 2044 | * @see http://forge.typo3.org/issues/14959 |
||
| 2045 | */ |
||
| 2046 | protected function addSpacesToKeywordList($keywordList) |
||
| 2050 | } |
||
| 2051 | } |
||
| 2052 |
This check looks for assignments to scalar types that may be of the wrong type.
To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.