1 | <?php |
||
2 | |||
3 | namespace Elgg\Search; |
||
4 | |||
/**
 * Highlights relevant substrings in search results
 *
 * The search terms are read from the 'query_parts' entry of the params
 * passed to the constructor.
 *
 * @access private
 */
class Highlighter {

    /**
     * @var array
     */
    protected $params = [];

    /**
     * Constructor
     *
     * @param array $params Search params
     *
     * @access private
     */
    public function __construct(array $params = []) {
        $this->params = $params;
    }

    /**
     * Safely highlights search query words found in $text avoiding recursion
     *
     * All query parts are matched in a single pass over the text, so the
     * markup inserted for one part can never be matched by another part
     * (e.g. a numeric search term matching the digit of a colour class).
     * When several parts could match at the same position, the part listed
     * first wins.
     *
     * @param string $text Text to highlight
     *
     * @return string
     *
     * @access public
     */
    public function highlightWords($text) {

        $text = _elgg_get_display_query($text);

        // one capture group per query part; group N is rendered with colour class N
        $groups = [];
        foreach ($this->getQueryParts() as $part) {
            // remove any boolean mode operators
            $part = preg_replace("/([\-\+~])([\w]+)/i", '$2', $part);
            if (!is_string($part) || $part === '') {
                continue;
            }

            // escape the delimiter and any other regexp special chars
            $groups[] = '(' . preg_quote($part, '/') . ')';
        }

        if (!$groups) {
            return $text;
        }

        $search = '/' . implode('|', $groups) . '/i';

        $highlighted = preg_replace_callback($search, function ($matches) {
            // Trailing unmatched groups are omitted from $matches, therefore
            // the last index identifies the group (query part) that matched.
            $i = count($matches) - 1;

            return "<span class=\"search-highlight search-highlight-color{$i}\">{$matches[0]}</span>";
        }, $text);

        // preg_replace_callback() returns null on a PCRE error; keep the plain text then
        return ($highlighted === null) ? $text : $highlighted;
    }

    /**
     * Return a string with highlighted matched queries and relevant context
     * Determines context based upon occurrence and distance of words with each other.
     *
     * Texts not longer than $max_length are highlighted in full. Longer texts
     * are reduced to excerpts around the matches, joined with '...'.
     *
     * @todo This also highlights partials even if partial search is not allowed.
     *
     * @param string $text              Text to highlight
     * @param int    $min_match_context Number of characters of context to keep on each side of a match (default: 30)
     * @param int    $max_length        Maximum length of the truncated and highlighted text (default: 300)
     *
     * @return string
     *
     * @access public
     */
    public function highlight($text, $min_match_context = 30, $max_length = 300) {

        $text = strip_tags($text);

        $haystack_length = elgg_strlen($text);

        // if haystack < $max_length return the entire haystack w/formatting immediately
        if ($haystack_length <= $max_length) {
            return $this->highlightWords($text);
        }

        $haystack_lc = elgg_strtolower($text);
        $haystack_lc_length = elgg_strlen($haystack_lc);

        // get the starting positions and lengths for all matching words
        $starts = [];
        $lengths = [];
        foreach ($this->getQueryParts() as $part) {
            $part = elgg_strtolower($part);
            $word_len = elgg_strlen($part);
            if ($word_len < 1) {
                continue;
            }

            // Walk over every occurrence. Parts that do not occur at all
            // contribute nothing (instead of a bogus excerpt at position 0).
            $offset = 0;
            while ($offset < $haystack_lc_length) {
                $pos = elgg_strpos($haystack_lc, $part, $offset);
                if ($pos === false) {
                    break;
                }

                $start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
                $stop = $pos + $word_len + $min_match_context;

                $starts[] = $start;
                $lengths[] = $stop - $start;

                // continue searching right after this occurrence
                $offset = $pos + $word_len;
            }
        }

        $offsets = $this->consolidateSubstrings($starts, $lengths);

        // figure out if we can adjust the offsets and lengths
        // in order to return more context
        $total_length = array_sum($offsets);

        if ($offsets && $total_length < $max_length) {
            $add_length = (int) floor((($max_length - $total_length) / count($offsets)) / 2);

            // NOTE(review): the extra context is only added in front of each
            // excerpt (start moves back, end stays put). The division by two
            // suggests both sides were intended - confirm before changing,
            // as it alters every generated excerpt.
            $starts = [];
            $lengths = [];
            foreach ($offsets as $offset => $length) {
                $starts[] = ($offset - $add_length > 0) ? $offset - $add_length : 0;
                $lengths[] = $length + $add_length;
            }

            $offsets = $this->consolidateSubstrings($starts, $lengths);
        }

        // sort by order of string size descending (which is roughly
        // the proximity of matched terms) so we can keep the
        // substrings with terms closest together and discard
        // the others as needed to fit within $max_length.
        arsort($offsets);

        $return_strs = [];
        $total_length = 0;
        foreach ($offsets as $start => $length) {
            // continue past if adding this substring exceeds max length
            if ($total_length + $length > $max_length) {
                continue;
            }

            $total_length += $length;
            $return_strs[$start] = trim(elgg_substr($text, $start, $length));
        }

        if (!$return_strs) {
            // No query part was found (or no excerpt fits into $max_length):
            // fall back to the beginning of the text.
            $return = trim(elgg_substr($text, 0, $max_length)) . '...';

            return $this->highlightWords($return);
        }

        // put the strings in order of occurrence
        ksort($return_strs);

        // add ...s where needed
        $return = implode('...', $return_strs);
        if (!array_key_exists(0, $return_strs)) {
            $return = "...$return";
        }

        // add to end of string if last substring doesn't hit the end.
        $starts = array_keys($return_strs);
        $last_pos = $starts[count($starts) - 1];
        if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) {
            $return .= '...';
        }

        return $this->highlightWords($return);
    }

    /**
     * Takes an array of offsets and lengths and consolidates any
     * overlapping entries, returning an array of new offsets and lengths
     *
     * Offsets and lengths are specified in separate arrays because of possible
     * index collisions with the offsets. $lengths[$k] belongs to $offsets[$k].
     *
     * @param array $offsets offsets
     * @param array $lengths lengths
     *
     * @return array Consolidated substrings as [offset => length], ordered by offset
     */
    protected function consolidateSubstrings($offsets, $lengths) {
        // sort offsets by occurrence; asort() keeps the keys, which are
        // needed to look up the length belonging to each offset
        asort($offsets, SORT_NUMERIC);

        $sorted_offsets = [];
        $sorted_lengths = [];
        foreach ($offsets as $key => $offset) {
            $sorted_offsets[] = $offset;
            $sorted_lengths[] = isset($lengths[$key]) ? $lengths[$key] : 0;
        }

        $return = [];
        $count = count($sorted_offsets);
        for ($i = 0; $i < $count; $i++) {
            $offset = $sorted_offsets[$i];
            $end_pos = $offset + $sorted_lengths[$i];

            // swallow all following entries that start before the current end
            while ($i + 1 < $count && $end_pos > $sorted_offsets[$i + 1]) {
                $i++;

                // an entry fully contained in the current span must not shrink it
                $end_pos = max($end_pos, $sorted_offsets[$i] + $sorted_lengths[$i]);
            }

            // will never have a colliding offset, so can return as a single array
            $return[$offset] = $end_pos - $offset;
        }

        return $return;
    }

    /**
     * Returns the usable query parts from the search params
     *
     * A missing 'query_parts' entry results in an empty list. Non-scalar and
     * empty parts are dropped, the remaining parts are returned as strings.
     *
     * @return string[]
     */
    private function getQueryParts() {
        $parts = (array) elgg_extract('query_parts', $this->params, []);

        $result = [];
        foreach ($parts as $part) {
            if (is_scalar($part) && (string) $part !== '') {
                $result[] = (string) $part;
            }
        }

        return $result;
    }
}
||
252 |