1 | <?php |
||
34 | class Typo3PageContentExtractor extends HtmlContentExtractor |
||
35 | { |
||
36 | |||
37 | /** |
||
38 | * Shortcut method to retrieve the raw content marked for indexing. |
||
39 | * |
||
40 | * @return string Content marked for indexing. |
||
41 | */ |
||
42 | 32 | public function getContentMarkedForIndexing() |
|
46 | |||
/**
 * Extracts the markup wrapped with TYPO3SEARCH_begin and TYPO3SEARCH_end
 * markers.
 *
 * Every marked section found in $html is concatenated in document order and
 * the result is passed through excludeContentByClass(). When the final
 * content is empty and the configuration enables logging of missing
 * markers, a message is written to the devLog.
 *
 * @param string $html HTML markup with TYPO3SEARCH markers for content that should be indexed
 * @return string HTML markup found between TYPO3SEARCH markers
 */
protected function extractContentMarkedForIndexing($html)
{
    preg_match_all(
        '/<!--\s*?TYPO3SEARCH_begin\s*?-->.*?<!--\s*?TYPO3SEARCH_end\s*?-->/mis',
        $html,
        $indexableContents
    );

    // Index 0 holds the full pattern matches (the marker comments are part
    // of each match). Guard the access in case matching failed.
    $markedSections = isset($indexableContents[0]) ? $indexableContents[0] : array();

    // Separator first: the legacy (array, separator) argument order was
    // deprecated in PHP 7.4 and removed in PHP 8.0.
    $indexableContent = implode('', $markedSections);

    $indexableContent = $this->excludeContentByClass($indexableContent);
    if (empty($indexableContent) && $this->getConfiguration()->getLoggingIndexingMissingTypo3SearchMarkers()) {
        GeneralUtility::devLog('No TYPO3SEARCH markers found.', 'solr', 2);
    }

    return $indexableContent;
}
||
67 | |||
/**
 * Exclude some HTML parts by class inside content wrapped with
 * TYPO3SEARCH_begin and TYPO3SEARCH_end markers.
 *
 * The configured exclude classes are matched with XPath contains() against
 * the class attribute, i.e. as a substring test. Content that is blank, has
 * no configured exclude classes, or does not contain any of the configured
 * strings is returned after html_entity_decode() without being parsed.
 *
 * @param string $indexableContent HTML markup
 * @return string HTML
 */
public function excludeContentByClass($indexableContent)
{
    if (empty(trim($indexableContent))) {
        return html_entity_decode($indexableContent);
    }

    $excludeClasses = $this->getConfiguration()->getIndexQueuePagesExcludeContentByClassArray();
    if (count($excludeClasses) === 0) {
        return html_entity_decode($indexableContent);
    }

    // Cheap string pre-check so the DOM is only built when needed.
    $isInContent = Util::containsOneOfTheStrings($indexableContent, $excludeClasses);
    if (!$isInContent) {
        return html_entity_decode($indexableContent);
    }

    $doc = new \DOMDocument('1.0', 'UTF-8');

    // Collect libxml parse errors internally while loading the fragment,
    // then drop them and restore the caller's setting so that this method
    // does not leak global libxml state.
    $previousErrorHandling = libxml_use_internal_errors(true);
    $doc->loadHTML('<?xml version="1.0" encoding="UTF-8"?>' . PHP_EOL . $indexableContent);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorHandling);

    if ($doc->documentElement === null) {
        // Nothing parseable: fall back to the same result as the early returns.
        return html_entity_decode($indexableContent);
    }

    $xpath = new \DOMXPath($doc);
    foreach ($excludeClasses as $excludePart) {
        // XPath 1.0 string literals have no escape sequence, so pick a
        // quoting style that cannot be broken by the class name itself.
        if (strpos($excludePart, "'") === false) {
            $literal = "'" . $excludePart . "'";
        } elseif (strpos($excludePart, '"') === false) {
            $literal = '"' . $excludePart . '"';
        } else {
            $literal = "concat('" . str_replace("'", "',\"'\",'", $excludePart) . "')";
        }

        $elements = $xpath->query('//*[contains(@class,' . $literal . ')]');

        // query() returns false for an invalid expression; use ->length
        // rather than count() to inspect the node list.
        if ($elements === false || $elements->length === 0) {
            continue;
        }

        foreach ($elements as $element) {
            if ($element->parentNode !== null) {
                $element->parentNode->removeChild($element);
            }
        }
    }

    $html = $doc->saveHTML($doc->documentElement->parentNode);
    // remove XML-Preamble, newlines and doctype
    $html = preg_replace('/(<\?xml[^>]+\?>|\r?\n|<!DOCTYPE.+?>)/imS', '', $html);
    $html = str_replace(array('<html>', '</html>', '<body>', '</body>'), array('', '', '', ''), $html);

    return $html;
}
||
112 | |||
113 | /** |
||
114 | * Returns the cleaned indexable content from the page's HTML markup. |
||
115 | * |
||
116 | * The content is cleaned from HTML tags and control chars Solr could |
||
117 | * stumble on. |
||
118 | * |
||
119 | * @return string Indexable, cleaned content ready for indexing. |
||
120 | */ |
||
121 | 33 | public function getIndexableContent() |
|
133 | |||
/**
 * Retrieves the page's title by checking the indexedDocTitle, altPageTitle,
 * and regular page title - in that order.
 *
 * The values are read from the object stored in $GLOBALS['TSFE']; the first
 * truthy candidate wins and the regular page title is the fallback.
 *
 * @return string the page's title
 */
public function getPageTitle()
{
    $page = $GLOBALS['TSFE'];

    // Guard-clause returns replace the former pre-initialised $pageTitle
    // local, which was overwritten on every branch.
    if ($page->indexedDocTitle) {
        return $page->indexedDocTitle;
    }

    if ($page->altPageTitle) {
        return $page->altPageTitle;
    }

    return $page->page['title'];
}
||
155 | |||
156 | /** |
||
157 | * Retrieves the page's body |
||
158 | * |
||
159 | * @return string the page's body |
||
160 | */ |
||
161 | public function getPageBody() |
||
167 | } |
||
168 |
This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently.
Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first because $myVar is never used, and the second because $higher is always overwritten on every possible execution path.