dkd-kaehm /
ext-solr
| 1 | <?php |
||
| 2 | namespace ApacheSolrForTypo3\Solr; |
||
| 3 | |||
| 4 | /*************************************************************** |
||
| 5 | * Copyright notice |
||
| 6 | * |
||
| 7 | * (c) 2010-2015 Ingo Renner <[email protected]> |
||
| 8 | * All rights reserved |
||
| 9 | * |
||
| 10 | * This script is part of the TYPO3 project. The TYPO3 project is |
||
| 11 | * free software; you can redistribute it and/or modify |
||
| 12 | * it under the terms of the GNU General Public License as published by |
||
| 13 | * the Free Software Foundation; either version 3 of the License, or |
||
| 14 | * (at your option) any later version. |
||
| 15 | * |
||
| 16 | * The GNU General Public License can be found at |
||
| 17 | * http://www.gnu.org/copyleft/gpl.html. |
||
| 18 | * |
||
| 19 | * This script is distributed in the hope that it will be useful, |
||
| 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 22 | * GNU General Public License for more details. |
||
| 23 | * |
||
| 24 | * This copyright notice MUST APPEAR in all copies of the script! |
||
| 25 | ***************************************************************/ |
||
| 26 | |||
| 27 | use ApacheSolrForTypo3\Solr\System\Logging\SolrLogManager; |
||
| 28 | use TYPO3\CMS\Core\Utility\GeneralUtility; |
||
| 29 | |||
| 30 | /** |
||
| 31 | * Content extraction class for TYPO3 pages. |
||
| 32 | * |
||
| 33 | * @author Ingo Renner <[email protected]> |
||
| 34 | */ |
||
class Typo3PageContentExtractor extends HtmlContentExtractor
{

    /**
     * Logger instance; only created when a warning about missing
     * TYPO3SEARCH markers actually has to be written.
     *
     * @var \ApacheSolrForTypo3\Solr\System\Logging\SolrLogManager
     */
    protected $logger = null;

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * Runs the marker extraction on the content held by this extractor
     * without any further cleaning.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        // @extensionScannerIgnoreLine
        return $this->extractContentMarkedForIndexing($this->content);
    }

    /**
     * Extracts the markup wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
     * markers.
     *
     * All marker-wrapped sections (markers included) are concatenated in
     * document order and then passed through excludeContentByClass().
     * A warning is logged when the final result is empty and the
     * corresponding logging option is enabled; note that this is also the
     * case when the exclusion step removed everything.
     *
     * @param string $html HTML markup with TYPO3SEARCH markers for content that should be indexed
     * @return string HTML markup found between TYPO3SEARCH markers
     */
    protected function extractContentMarkedForIndexing($html)
    {
        preg_match_all(
            '/<!--\s*?TYPO3SEARCH_begin\s*?-->.*?<!--\s*?TYPO3SEARCH_end\s*?-->/mis',
            $html,
            $indexableContents
        );
        $indexableContent = implode('', $indexableContents[0]);

        $indexableContent = $this->excludeContentByClass($indexableContent);
        if (empty($indexableContent) && $this->getConfiguration()->getLoggingIndexingMissingTypo3SearchMarkers()) {
            $this->logger = GeneralUtility::makeInstance(SolrLogManager::class, /** @scrutinizer ignore-type */ __CLASS__);
            $this->logger->log(SolrLogManager::WARNING, 'No TYPO3SEARCH markers found.');
        }

        return $indexableContent;
    }

    /**
     * Exclude some html parts by class inside content wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
     * markers.
     *
     * The content is returned untouched when it is blank, when no exclude
     * classes are configured, or when none of the configured strings occurs
     * in it. Otherwise it is parsed as HTML, every element whose class
     * attribute contains one of the configured strings is removed, and the
     * document is serialised again. In that case the XML preamble, the
     * doctype, all line breaks and the <html>/<body> wrappers are stripped
     * from the result.
     *
     * NOTE: matching uses XPath contains(), i.e. it is a substring match on
     * the class attribute ("foo" also matches class="foobar").
     *
     * @param string $indexableContent HTML markup
     * @return string HTML
     */
    public function excludeContentByClass($indexableContent)
    {
        if (empty(trim($indexableContent))) {
            return $indexableContent;
        }

        $excludeClasses = $this->getConfiguration()->getIndexQueuePagesExcludeContentByClassArray();
        if (count($excludeClasses) === 0) {
            return $indexableContent;
        }

        $isInContent = Util::containsOneOfTheStrings($indexableContent, $excludeClasses);
        if (!$isInContent) {
            return $indexableContent;
        }

        // Collect libxml parse errors silently, but do not leave the global
        // libxml error handling switched on for unrelated code afterwards.
        $previousErrorHandling = libxml_use_internal_errors(true);
        $doc = new \DOMDocument('1.0', 'UTF-8');
        $isLoaded = $doc->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . PHP_EOL . $indexableContent);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorHandling);

        if (!$isLoaded || $doc->documentElement === null) {
            // Nothing parseable to filter; hand back the input instead of
            // failing on a missing document element.
            return $indexableContent;
        }

        $xpath = new \DOMXPath($doc);
        foreach ($excludeClasses as $excludePart) {
            $excludePart = (string)$excludePart;
            if ($excludePart === '') {
                // contains(@class, '') is true for every element and would
                // remove the whole document.
                continue;
            }

            $elements = $xpath->query('//*[contains(@class,' . $this->buildXPathStringLiteral($excludePart) . ')]');
            // query() yields false for an invalid expression; count() must
            // not be used on that (and is not needed for a DOMNodeList).
            if ($elements === false || $elements->length === 0) {
                continue;
            }

            foreach ($elements as $element) {
                if ($element->parentNode !== null) {
                    $element->parentNode->removeChild($element);
                }
            }
        }

        if ($doc->documentElement === null) {
            // The root element itself matched and was removed.
            return '';
        }

        $html = $doc->saveHTML($doc->documentElement->parentNode);
        // remove XML-Preamble, newlines and doctype
        $html = preg_replace('/(<\?xml[^>]+\?>|\r?\n|<!DOCTYPE.+?>)/imS', '', $html);
        $html = str_replace(['<html>', '</html>', '<body>', '</body>'], '', $html);

        return $html;
    }

    /**
     * Builds an XPath 1.0 string literal for an arbitrary value.
     *
     * XPath 1.0 has no escape character inside string literals, so a value
     * containing both quote kinds has to be assembled with concat().
     * Values without quotes yield the plain single-quoted literal.
     *
     * @param string $value Raw value to embed into an XPath expression
     * @return string XPath expression evaluating to exactly $value
     */
    private function buildXPathStringLiteral($value)
    {
        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }

        // a'b  =>  concat('a',"'",'b')
        return "concat('" . str_replace("'", "',\"'\",'", $value) . "')";
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * Steps: extract the marker-wrapped content, run it through
     * cleanContent() (defined outside this class body), trim it and collapse
     * every whitespace run into a single space.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        // @extensionScannerIgnoreLine
        $content = $this->extractContentMarkedForIndexing($this->content);

        // clean content
        $content = self::cleanContent($content);
        $content = trim($content);
        // reduce multiple whitespace characters to one space
        $content = preg_replace('!\s+!', ' ', $content);

        return $content;
    }

    /**
     * Retrieves the page's title by checking the indexedDocTitle, altPageTitle,
     * and regular page title - in that order.
     *
     * The values are read from the global TSFE object; the first non-empty
     * candidate wins.
     *
     * @return string the page's title
     */
    public function getPageTitle()
    {
        $page = $GLOBALS['TSFE'];

        if ($page->indexedDocTitle) {
            return $page->indexedDocTitle;
        }

        if ($page->altPageTitle) {
            return $page->altPageTitle;
        }

        return $page->page['title'];
    }

    /**
     * Retrieves the page's body
     *
     * Returns everything from the first case-insensitive occurrence of
     * "<body" to the end of the content.
     *
     * @return string|false the page's body, or false when the content has no "<body"
     */
    public function getPageBody()
    {
        // @extensionScannerIgnoreLine
        $pageContent = $this->content;

        return stristr($pageContent, '<body');
    }
}
||
| 176 |