1 | <?php |
||
35 | class Typo3PageContentExtractor extends HtmlContentExtractor |
||
36 | { |
||
37 | |||
38 | /** |
||
39 | * @var \ApacheSolrForTypo3\Solr\System\Logging\SolrLogManager |
||
40 | */ |
||
41 | protected $logger = null; |
||
42 | |||
43 | /** |
||
44 | * Shortcut method to retrieve the raw content marked for indexing. |
||
45 | * |
||
46 | * @return string Content marked for indexing. |
||
47 | */ |
||
48 | 45 | public function getContentMarkedForIndexing() |
|
52 | |||
/**
 * Extracts the markup wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
 * markers.
 *
 * Every marked section (including its marker comments) is collected,
 * the sections are concatenated in document order and the result is passed
 * through excludeContentByClass(). If nothing is left afterwards and the
 * configuration enables logging of missing markers, a warning is logged.
 *
 * @param string $html HTML markup with TYPO3SEARCH markers for content that should be indexed
 * @return string HTML markup found between TYPO3SEARCH markers
 */
protected function extractContentMarkedForIndexing($html)
{
    preg_match_all(
        '/<!--\s*?TYPO3SEARCH_begin\s*?-->.*?<!--\s*?TYPO3SEARCH_end\s*?-->/mis',
        $html,
        $indexableContents
    );

    // Defensive: fall back to an empty list when no full-match list is available.
    $markedSections = empty($indexableContents[0]) ? [] : $indexableContents[0];

    // Separator goes first: the legacy (array, separator) argument order was
    // deprecated in PHP 7.4 and removed in PHP 8.0.
    $indexableContent = implode('', $markedSections);

    $indexableContent = $this->excludeContentByClass($indexableContent);
    if (empty($indexableContent) && $this->getConfiguration()->getLoggingIndexingMissingTypo3SearchMarkers()) {
        $this->logger = GeneralUtility::makeInstance(SolrLogManager::class, __CLASS__);
        $this->logger->log(
            SolrLogManager::WARNING,
            'No TYPO3SEARCH markers found.'
        );
    }

    return $indexableContent;
}
||
77 | |||
/**
 * Exclude some html parts by class inside content wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
 * markers.
 *
 * The content is returned entity-decoded and otherwise untouched when it is
 * blank, when no exclude classes are configured, or when none of the
 * configured strings occurs in the content. Otherwise the content is parsed
 * as HTML, every element whose class attribute contains one of the configured
 * strings (substring match) is removed, and the document is serialized again
 * with the XML preamble, doctype, newlines and html/body wrappers stripped.
 *
 * @param string $indexableContent HTML markup
 * @return string HTML
 */
public function excludeContentByClass($indexableContent)
{
    if (empty(trim($indexableContent))) {
        return html_entity_decode($indexableContent);
    }

    $excludeClasses = $this->getConfiguration()->getIndexQueuePagesExcludeContentByClassArray();
    if (count($excludeClasses) === 0) {
        return html_entity_decode($indexableContent);
    }

    $isInContent = Util::containsOneOfTheStrings($indexableContent, $excludeClasses);
    if (!$isInContent) {
        return html_entity_decode($indexableContent);
    }

    $doc = new \DOMDocument('1.0', 'UTF-8');

    // Silence libxml parse warnings for this parse only; the previous
    // setting is restored below so other code is not affected.
    $previousErrorHandling = libxml_use_internal_errors(true);
    try {
        // The XML preamble makes libxml treat the markup as UTF-8.
        $isLoaded = $doc->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . PHP_EOL . $indexableContent);
        if ($isLoaded === false || $doc->documentElement === null) {
            // Nothing parseable: behave like the other "nothing to exclude" paths.
            return html_entity_decode($indexableContent);
        }

        $xpath = new \DOMXPath($doc);
        foreach ($excludeClasses as $excludePart) {
            $elements = $xpath->query("//*[contains(@class,'" . $excludePart . "')]");
            // query() yields false for an invalid expression, e.g. when the
            // configured class contains a quote character.
            if ($elements === false || $elements->length === 0) {
                continue;
            }

            foreach ($elements as $element) {
                if ($element->parentNode !== null) {
                    $element->parentNode->removeChild($element);
                }
            }
        }
        $html = $doc->saveHTML($doc->documentElement->parentNode);
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorHandling);
    }

    // remove XML-Preamble, newlines and doctype
    $html = preg_replace('/(<\?xml[^>]+\?>|\r?\n|<!DOCTYPE.+?>)/imS', '', $html);
    $html = str_replace(['<html>', '</html>', '<body>', '</body>'], ['', '', '', ''], $html);

    return $html;
}
||
122 | |||
123 | /** |
||
124 | * Returns the cleaned indexable content from the page's HTML markup. |
||
125 | * |
||
126 | * The content is cleaned from HTML tags and control chars Solr could |
||
127 | * stumble on. |
||
128 | * |
||
129 | * @return string Indexable, cleaned content ready for indexing. |
||
130 | */ |
||
131 | 46 | public function getIndexableContent() |
|
143 | |||
/**
 * Determines the title of the current page.
 *
 * Reads the object stored in $GLOBALS['TSFE'] and returns the first truthy
 * value of its indexedDocTitle and altPageTitle properties, in that order.
 * If neither is set, the 'title' entry of its page array is returned.
 *
 * @return string the page's title
 */
public function getPageTitle()
{
    $tsfe = $GLOBALS['TSFE'];

    if ($tsfe->indexedDocTitle) {
        return $tsfe->indexedDocTitle;
    }

    if ($tsfe->altPageTitle) {
        return $tsfe->altPageTitle;
    }

    return $tsfe->page['title'];
}
||
164 | |||
165 | /** |
||
166 | * Retrieves the page's body |
||
167 | * |
||
168 | * @return string the page's body |
||
169 | */ |
||
170 | public function getPageBody() |
||
176 | } |
||
177 |