1
|
|
|
<?php |
2
|
|
|
namespace ApacheSolrForTypo3\Solr; |
3
|
|
|
|
4
|
|
|
/***************************************************************
 * Copyright notice
 *
 * (c) 2011-2015 Ingo Renner <[email protected]>
 * All rights reserved
 *
 * This script is part of the TYPO3 project. The TYPO3 project is
 * free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * This copyright notice MUST APPEAR in all copies of the script!
 ***************************************************************/
26
|
|
|
use ApacheSolrForTypo3\Solr\System\Configuration\TypoScriptConfiguration; |
27
|
|
|
|
28
|
|
|
/**
 * A content extractor to get clean, indexable content from HTML markup.
 *
 * @author Ingo Renner <[email protected]>
 */
class HtmlContentExtractor
{

    /**
     * Unicode ranges which should get stripped before sending a document to solr.
     * This is necessary if a document (PDF, etc.) contains unicode characters which
     * are valid in the font being used in the document but are not available in the
     * font being used for displaying results.
     *
     * This is often the case if PDFs are being indexed where special fonts are used
     * for displaying bullets, etc. Usually those bullets reside in one of the unicode
     * "Private Use Zones" or the "Private Use Area" (plane 15 + 16)
     *
     * Each entry is a [start, end] pair of uppercase hexadecimal code points.
     *
     * @see http://en.wikipedia.org/wiki/Unicode_block
     * @var array
     */
    protected static $stripUnicodeRanges = [
        // Replacement Character (U+FFFD) @see http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
        ['FFFD', 'FFFD'],
        // Private Use Area (part of Plane 0)
        ['E000', 'F8FF'],
        // Supplementary Private Use Area (Plane 15)
        ['F0000', 'FFFFF'],
        // Supplementary Private Use Area (Plane 16)
        ['100000', '10FFFF'],
    ];

    /**
     * The raw HTML markup content to extract clean content from.
     *
     * @var string
     */
    protected $content;

    /**
     * Mapping of HTML tags to Solr document fields.
     *
     * @var array
     */
    protected $tagToFieldMapping = [
        'h1' => 'tagsH1',
        'h2' => 'tagsH2H3',
        'h3' => 'tagsH2H3',
        'h4' => 'tagsH4H5H6',
        'h5' => 'tagsH4H5H6',
        'h6' => 'tagsH4H5H6',
        'u' => 'tagsInline',
        'b' => 'tagsInline',
        'strong' => 'tagsInline',
        'i' => 'tagsInline',
        'em' => 'tagsInline',
        'a' => 'tagsA',
    ];

    /**
     * @var TypoScriptConfiguration
     */
    private $configuration;

    /**
     * Constructor.
     *
     * @param string $content Content HTML markup
     */
    public function __construct($content)
    {
        $this->content = $content;
    }

    /**
     * Returns the configuration, lazily fetching it via
     * Util::getSolrConfiguration() when none has been set explicitly.
     *
     * @return TypoScriptConfiguration|array
     */
    protected function getConfiguration()
    {
        if ($this->configuration === null) {
            $this->configuration = Util::getSolrConfiguration();
        }

        return $this->configuration;
    }

    /**
     * Sets the configuration to use instead of the lazily loaded default.
     *
     * @param TypoScriptConfiguration $configuration
     */
    public function setConfiguration(TypoScriptConfiguration $configuration)
    {
        $this->configuration = $configuration;
    }

    /**
     * Returns the cleaned indexable content from the page's HTML markup.
     *
     * The content is cleaned from HTML tags and control chars Solr could
     * stumble on.
     *
     * @return string Indexable, cleaned content ready for indexing.
     */
    public function getIndexableContent()
    {
        $content = self::cleanContent($this->content);
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');
        // after entity decoding we might have tags again
        $content = strip_tags($content);

        return trim($content);
    }

    /**
     * Strips html tags, and tab, new-line, carriage-return, whitespace
     * characters.
     *
     * Script and style elements are removed including their content, runs of
     * whitespace are collapsed into a single space and the configured
     * unicode ranges are stripped.
     *
     * @param string $content String to clean
     * @return string String cleaned from tags and special whitespace characters
     */
    public static function cleanContent($content)
    {
        $content = self::stripControlCharacters($content);

        // remove Javascript
        $content = self::replacePattern('@<script[^>]*>.*?<\/script>@msi', '', $content);

        // remove internal CSS styles
        $content = self::replacePattern('@<style[^>]*>.*?<\/style>@msi', '', $content);

        // prevents concatenated words when stripping tags afterwards
        $content = str_replace(['<', '>'], [' <', '> '], $content);
        $content = static::stripTags($content);

        // tabs, line breaks and non-breaking spaces (as entity or as UTF-8
        // encoded U+00A0) become plain spaces so they collapse below
        $content = str_replace(["\t", "\n", "\r", '&nbsp;', "\xC2\xA0"], ' ', $content);
        $content = self::stripUnicodeRanges($content);
        $content = self::replacePattern('/\s{2,}/', ' ', $content);

        return trim($content);
    }

    /**
     * Strips html tags, but keeps single < and > characters.
     *
     * A "<" that is not closed by a ">" before the next "<" or the end of
     * the string is masked with a placeholder while strip_tags() runs and is
     * restored afterwards.
     *
     * @param string $content
     * @return string
     */
    protected static function stripTags($content)
    {
        $content = self::replacePattern('@<([^>]+(<|\z))@msi', '##lt##$1', $content);
        $content = strip_tags($content);

        // unescape < that are not used to open a tag
        return str_replace('##lt##', '<', $content);
    }

    /**
     * Strips control characters that cause Jetty/Solr to fail.
     *
     * Each control character is replaced by a space; tab, line feed and
     * carriage return are left untouched.
     *
     * @param string $content the content to sanitize
     * @return string the sanitized content
     * @see http://w3.org/International/questions/qa-forms-utf-8.html
     */
    public static function stripControlCharacters($content)
    {
        // Printable utf-8 does not include any of these chars below x7F
        return self::replacePattern('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $content);
    }

    /**
     * Strips unusable unicode ranges
     *
     * Applies stripUnicodeRange() for every entry of self::$stripUnicodeRanges.
     *
     * @param string $content Content to sanitize
     * @return string Sanitized content
     */
    public static function stripUnicodeRanges($content)
    {
        foreach (self::$stripUnicodeRanges as $range) {
            $content = self::stripUnicodeRange($content, $range[0], $range[1]);
        }

        return $content;
    }

    /**
     * Strips a UTF-8 character range
     *
     * If the content cannot be processed (e.g. it is not valid UTF-8, which
     * makes the unicode-aware replacement fail) it is returned unchanged
     * instead of being discarded.
     *
     * @param string $content Content to sanitize
     * @param string $start Unicode range start character as uppercase hexadecimal string
     * @param string $end Unicode range end character as uppercase hexadecimal string
     * @return string Sanitized content
     */
    public static function stripUnicodeRange($content, $start, $end)
    {
        return self::replacePattern(
            '/[\x{' . $start . '}-\x{' . $end . '}]/u',
            '',
            $content
        );
    }

    /**
     * Shortcut method to retrieve the raw content marked for indexing.
     *
     * @return string Content marked for indexing.
     */
    public function getContentMarkedForIndexing()
    {
        return $this->content;
    }

    /**
     * Extracts HTML tag content from tags in the content marked for indexing.
     *
     * Content of several tags mapped to the same field is joined with a
     * single space. Links whose text looks like a URL are skipped.
     *
     * @return array A mapping of Solr document field names to content found in defined tags.
     */
    public function getTagContent()
    {
        $result = [];
        $matches = [];
        $tagNames = array_keys($this->tagToFieldMapping);

        // strip all ignored tags
        $content = strip_tags(
            $this->getContentMarkedForIndexing(),
            '<' . implode('><', $tagNames) . '>'
        );

        preg_match_all(
            '@<(' . implode('|', $tagNames) . ')[^>]*>(.*)</\1>@Ui',
            $content,
            $matches
        );

        if (empty($matches[1])) {
            return $result;
        }

        // Tags are matched case-insensitively, so the field lookup has to be
        // case-insensitive as well.
        $fieldNamesByTag = array_change_key_case($this->tagToFieldMapping, CASE_LOWER);

        // We don't want to index links auto-generated by the url filter.
        $autoLinkPattern = '@(?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9]+@';

        foreach ($matches[1] as $key => $matchedTag) {
            $tag = strtolower($matchedTag);
            $tagContent = $matches[2][$key];

            if ($tag === 'a' && preg_match($autoLinkPattern, $tagContent)) {
                continue;
            }

            $fieldName = $fieldNamesByTag[$tag];
            if (!isset($result[$fieldName]) || $result[$fieldName] === '') {
                $result[$fieldName] = $tagContent;
            } else {
                $result[$fieldName] .= ' ' . $tagContent;
            }
        }

        return $result;
    }

    /**
     * Runs preg_replace() and falls back to the unmodified subject when the
     * replacement fails (preg_replace() returns null on error), so a PCRE
     * error never wipes the content.
     *
     * @param string $pattern Regular expression to search for
     * @param string $replacement Replacement string
     * @param string $subject Content to run the replacement on
     * @return string The replaced content, or $subject if the replacement failed
     */
    private static function replacePattern($pattern, $replacement, $subject)
    {
        $replaced = preg_replace($pattern, $replacement, $subject);

        return $replaced === null ? $subject : $replaced;
    }
}
271
|
|
|
|
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.