1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Elgg\Search; |
4
|
|
|
|
5
|
|
|
/**
 * Highlights relevant substrings in search results
 *
 * @access private
 */
class Highlighter {

	/**
	 * @var array Search params; the 'query_parts' entry lists the words to highlight
	 */
	protected $params = [];

	/**
	 * Constructor
	 *
	 * @param array $params Search params
	 *
	 * @access private
	 */
	public function __construct(array $params = []) {
		$this->params = $params;
	}

	/**
	 * Safely highlights search query words found in $text avoiding recursion
	 *
	 * Every query part is wrapped in
	 * <span class="search-highlight search-highlight-colorN">, where N is the
	 * 1-based position of the part in 'query_parts'.
	 *
	 * All parts are matched in one pass over the text, so the markup inserted
	 * for one part can never be matched by another part and no placeholder
	 * tokens are needed.
	 *
	 * @param string $text Text to highlight
	 *
	 * @return string
	 *
	 * @access public
	 */
	public function highlightWords($text) {

		// NOTE(review): presumably prepares/escapes the string for display - confirm against the helper
		$text = _elgg_get_display_query($text);

		$alternatives = [];
		$colors = [];
		foreach ($this->getQueryParts() as $index => $part) {
			// remove any boolean mode operators
			$part = preg_replace("/([\-\+~])([\w]+)/i", '$2', $part);
			if ($part === null || $part === '') {
				continue;
			}

			// escape the delimiter and any other regexp special chars
			$alternatives[] = '(' . preg_quote($part, '/') . ')';
			$colors[] = $index + 1;
		}

		if (empty($alternatives)) {
			return $text;
		}

		$pattern = implode('|', $alternatives);

		// unicode aware matching is only possible if pattern and subject are valid UTF-8
		$is_utf8 = preg_match('//u', $pattern) === 1 && preg_match('//u', $text) === 1;
		$modifiers = $is_utf8 ? 'iu' : 'i';

		$highlighted = preg_replace_callback("/{$pattern}/{$modifiers}", function (array $matches) use ($colors) {
			// groups after the alternative that matched are dropped from $matches,
			// so the last entry identifies the query part that was found
			$group = count($matches) - 1;
			$color = $colors[$group - 1];

			return "<span class=\"search-highlight search-highlight-color{$color}\">{$matches[$group]}</span>";
		}, $text);

		// a regexp failure must not swallow the text
		return ($highlighted === null) ? $text : $highlighted;
	}

	/**
	 * Return a string with highlighted matched queries and relevant context
	 * Determines context based upon occurrence and distance of words with each other.
	 *
	 * Texts that are not longer than $max_length are returned completely. For
	 * longer texts a window of $min_match_context characters around every match
	 * is collected, overlapping windows are merged and as many windows as fit in
	 * $max_length are joined with '...'. If no window can be used (no query
	 * parts, no match, or every merged window is too long) the excerpt falls
	 * back to $max_length characters starting at the first window, or at the
	 * start of the text.
	 *
	 * @todo This also highlights partials even if partial search is not allowed.
	 *
	 * @param string $text              Text to highlight
	 * @param int    $min_match_context Number of characters of context to keep on each side of a match (default: 30)
	 * @param int    $max_length        Maximum length of the truncated and highlighted text (default: 300)
	 *
	 * @return string
	 *
	 * @access public
	 */
	public function highlight($text, $min_match_context = 30, $max_length = 300) {

		$text = strip_tags($text);

		$haystack_length = elgg_strlen($text);

		// if haystack <= $max_length return the entire haystack w/formatting immediately
		if ($haystack_length <= $max_length) {
			return $this->highlightWords($text);
		}

		$haystack_lc = elgg_strtolower($text);
		$haystack_lc_length = elgg_strlen($haystack_lc);

		// get the starting positions and lengths for all matching words
		$starts = [];
		$lengths = [];
		foreach ($this->getQueryParts() as $part) {
			$part = elgg_strtolower($part);
			$word_len = elgg_strlen($part);

			$offset = 0;
			while ($offset < $haystack_lc_length) {
				$pos = elgg_strpos($haystack_lc, $part, $offset);
				if ($pos === false) {
					// word not (or no longer) present, nothing to record
					break;
				}

				$start = max(0, $pos - $min_match_context);
				$stop = $pos + $word_len + $min_match_context;

				$starts[] = $start;
				$lengths[] = $stop - $start;

				// $pos is absolute, so continue right behind this occurrence
				$offset = $pos + $word_len;
			}
		}

		$offsets = $this->consolidateSubstrings($starts, $lengths);

		// figure out if we can adjust the offsets and lengths
		// in order to return more context
		$total_length = array_sum($offsets);
		if ($offsets && $total_length < $max_length) {
			$add_length = (int) floor((($max_length - $total_length) / count($offsets)) / 2);

			$starts = [];
			$lengths = [];
			foreach ($offsets as $offset => $length) {
				$starts[] = max(0, $offset - $add_length);
				$lengths[] = $length + $add_length;
			}

			$offsets = $this->consolidateSubstrings($starts, $lengths);
		}

		// sort by order of string size descending (which is roughly
		// the proximity of matched terms) so we can keep the
		// substrings with terms closest together and discard
		// the others as needed to fit within $max_length.
		arsort($offsets);

		$return_strs = [];
		$total_length = 0;
		foreach ($offsets as $start => $length) {
			// continue past if adding this substring exceeds max length
			if ($total_length + $length > $max_length) {
				continue;
			}

			$total_length += $length;
			$return_strs[$start] = trim(elgg_substr($text, $start, $length));
		}

		if (empty($return_strs)) {
			// nothing usable was selected, fall back to a plain excerpt
			$start = $offsets ? min(array_keys($offsets)) : 0;
			$return_strs[$start] = trim(elgg_substr($text, $start, $max_length));
		}

		// put the strings in order of occurrence
		ksort($return_strs);

		// add ...s where needed
		$return = implode('...', $return_strs);
		if (!array_key_exists(0, $return_strs)) {
			$return = "...$return";
		}

		// add to end of string if last substring doesn't hit the end.
		$starts = array_keys($return_strs);
		$last_pos = end($starts);
		if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) {
			$return .= '...';
		}

		return $this->highlightWords($return);
	}

	/**
	 * Returns the usable query parts from the search params
	 *
	 * A missing or non-array 'query_parts' entry results in an empty list.
	 * Empty and non-scalar parts are dropped. The remaining parts keep their
	 * 0-based position as array key, so callers can still derive the highlight
	 * color of a part from its position.
	 *
	 * @return string[]
	 */
	protected function getQueryParts() {
		$parts = elgg_extract('query_parts', $this->params);
		if (!is_array($parts)) {
			return [];
		}

		$result = [];
		foreach (array_values($parts) as $index => $part) {
			if (!is_scalar($part) || (string) $part === '') {
				continue;
			}

			$result[$index] = (string) $part;
		}

		return $result;
	}

	/**
	 * Takes an array of offsets and lengths and consolidates any
	 * overlapping entries, returning an array of new offsets and lengths
	 *
	 * Offsets and lengths are specified in separate arrays because of possible
	 * index collisions with the offsets. $lengths[$i] belongs to $offsets[$i].
	 *
	 * @param array $offsets offsets
	 * @param array $lengths lengths
	 *
	 * @return array merged substrings as [offset => length], ordered by offset
	 */
	protected function consolidateSubstrings($offsets, $lengths) {
		// sort offsets by occurrence, the preserved keys still point to the matching length
		asort($offsets, SORT_NUMERIC);

		$return = [];
		$current_start = null;
		$current_end = null;
		foreach ($offsets as $i => $offset) {
			$end_pos = $offset + $lengths[$i];

			if ($current_start !== null && $current_end > $offset) {
				// overlaps the substring being built, a contained entry must not shrink it
				$current_end = max($current_end, $end_pos);
				continue;
			}

			if ($current_start !== null) {
				// merged entries share the first offset, so the key can't collide
				$return[$current_start] = $current_end - $current_start;
			}

			$current_start = $offset;
			$current_end = $end_pos;
		}

		if ($current_start !== null) {
			$return[$current_start] = $current_end - $current_start;
		}

		return $return;
	}
}
252
|
|
|
|