1 | <?php |
||
2 | |||
3 | namespace Elgg\Search; |
||
4 | |||
5 | /** |
||
6 | * Highlights relevant substrings in search results |
||
7 | * |
||
8 | * @access private |
||
9 | */ |
||
class Highlighter {

	/**
	 * @var array Search params; only the 'query_parts' key is read by this class
	 */
	protected $params = [];

	/**
	 * Constructor
	 *
	 * @param array $params Search params
	 *
	 * @access private
	 */
	public function __construct(array $params = []) {
		$this->params = $params;
	}

	/**
	 * Safely highlights search query words found in $text avoiding recursion
	 *
	 * Every query part is wrapped in
	 * <span class="search-highlight search-highlight-colorN">, where N is the
	 * 1-based position of the part in 'query_parts'.
	 *
	 * All parts are matched in a single pass over the text, so the inserted
	 * markup is never re-scanned by a later part. Where two parts could match
	 * at the same position, the one listed first in 'query_parts' wins.
	 *
	 * @param string $text Text to highlight
	 *
	 * @return string
	 *
	 * @access public
	 */
	public function highlightWords($text) {

		$text = _elgg_get_display_query($text);

		$parts = (array) elgg_extract('query_parts', $this->params, []);

		// One named group per query part. A single alternation removes the need
		// for numeric placeholders, which could collide with digits already
		// present in the text or in a query part.
		$alternatives = [];
		$color = 0;
		foreach ($parts as $part) {
			// the colour index follows the position in query_parts,
			// also for parts that end up being skipped
			$color++;

			// remove any boolean mode operators
			$part = (string) preg_replace("/([\-\+~])([\w]+)/i", '$2', (string) $part);
			if ($part === '') {
				// an empty pattern would match at every position of the text
				continue;
			}

			// escape the delimiter and any other regexp special chars
			$alternatives[$color] = "(?P<c{$color}>" . preg_quote($part, '/') . ')';
		}

		if (empty($alternatives)) {
			return $text;
		}

		$pattern = '/' . implode('|', $alternatives) . '/i';
		$colors = array_keys($alternatives);

		$highlighted = preg_replace_callback($pattern, function (array $match) use ($colors) {
			foreach ($colors as $color) {
				$group = "c{$color}";
				// groups that did not take part in the match are absent or empty
				if (isset($match[$group]) && $match[$group] !== '') {
					return "<span class=\"search-highlight search-highlight-color{$color}\">{$match[0]}</span>";
				}
			}

			return $match[0];
		}, $text);

		// preg_replace_callback() returns null on a PCRE error; keep the plain text then
		return ($highlighted === null) ? $text : $highlighted;
	}

	/**
	 * Return a string with highlighted matched queries and relevant context
	 * Determines context based upon occurrence and distance of words with each other.
	 *
	 * Texts no longer than $max_length are highlighted as a whole. Longer texts
	 * are reduced to windows of $min_match_context characters around each
	 * occurrence of a query part; the windows are joined with '...'. If no
	 * query part occurs in the text, the leading $max_length characters are used.
	 *
	 * @todo This also highlights partials even if partial search is not allowed.
	 *
	 * @param string $text              Text to highlight
	 * @param int    $min_match_context Number of characters of context kept on each side of a match (default: 30)
	 * @param int    $max_length        Maximum length of the truncated and highlighted text (default: 300)
	 *
	 * @return string
	 *
	 * @access public
	 */
	public function highlight($text, $min_match_context = 30, $max_length = 300) {

		$text = strip_tags($text);

		$haystack_length = elgg_strlen($text);

		// if haystack <= $max_length return the entire haystack w/formatting immediately
		if ($haystack_length <= $max_length) {
			return $this->highlightWords($text);
		}

		$haystack_lc = elgg_strtolower($text);
		$lc_length = elgg_strlen($haystack_lc);

		$parts = (array) elgg_extract('query_parts', $this->params, []);

		// get the starting positions and lengths for all matching words
		$starts = [];
		$lengths = [];
		foreach ($parts as $part) {
			$part = elgg_strtolower((string) $part);
			$word_len = elgg_strlen($part);
			if ($word_len === 0) {
				// nothing to look for
				continue;
			}

			// Walk over every occurrence. The next search starts right behind the
			// previous match; parts that do not occur at all add no window.
			$offset = 0;
			while ($offset < $lc_length && false !== ($pos = elgg_strpos($haystack_lc, $part, $offset))) {
				$start = max(0, $pos - $min_match_context);
				$stop = $pos + $word_len + $min_match_context;

				$starts[] = $start;
				$lengths[] = $stop - $start;

				$offset = $pos + $word_len;
			}
		}

		if (empty($starts)) {
			// no query part occurs in the text: fall back to the beginning of the text
			$starts[] = 0;
			$lengths[] = $max_length;
		}

		$offsets = $this->consolidateSubstrings($starts, $lengths);

		// figure out if we can adjust the offsets and lengths
		// in order to return more context
		$total_length = array_sum($offsets);

		if ($total_length < $max_length && !empty($offsets)) {
			$add_length = (int) floor((($max_length - $total_length) / count($offsets)) / 2);

			$starts = [];
			$lengths = [];
			foreach ($offsets as $offset => $length) {
				// the start moves back by $add_length and the length grows by the
				// same amount, i.e. the extra context is put in front of the window
				$starts[] = max(0, $offset - $add_length);
				$lengths[] = $length + $add_length;
			}

			$offsets = $this->consolidateSubstrings($starts, $lengths);
		}

		// sort by order of string size descending (which is roughly
		// the proximity of matched terms) so we can keep the
		// substrings with terms closest together and discard
		// the others as needed to fit within $max_length.
		arsort($offsets);

		$return_strs = [];
		$total_length = 0;
		foreach ($offsets as $start => $length) {
			// continue past if adding this substring exceeds max length
			if ($total_length + $length > $max_length) {
				continue;
			}

			$total_length += $length;
			$return_strs[$start] = trim(elgg_substr($text, $start, $length));
		}

		if (empty($return_strs)) {
			// every window on its own is longer than $max_length:
			// keep the largest one, cut down to the allowed length
			reset($offsets);
			$start = key($offsets);
			$return_strs[$start] = trim(elgg_substr($text, $start, $max_length));
		}

		// put the strings in order of occurrence
		ksort($return_strs);

		// add ...s where needed
		$return = implode('...', $return_strs);
		if (!array_key_exists(0, $return_strs)) {
			$return = "...$return";
		}

		// add to end of string if last substring doesn't hit the end.
		$starts = array_keys($return_strs);
		$last_pos = end($starts);
		if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) {
			$return .= '...';
		}

		return $this->highlightWords($return);
	}

	/**
	 * Takes an array of offsets and lengths and consolidates any
	 * overlapping entries, returning an array of new offsets and lengths
	 *
	 * Offsets and lengths are specified in separate arrays because of possible
	 * index collisions with the offsets. $lengths[$k] belongs to $offsets[$k].
	 *
	 * @param array $offsets offsets
	 * @param array $lengths lengths
	 *
	 * @return array Map of offset => length, in order of occurrence, without overlapping entries
	 */
	protected function consolidateSubstrings($offsets, $lengths) {
		// sort offsets by occurrence; asort() keeps the original keys, which are
		// needed to look up the length that belongs to each offset
		asort($offsets, SORT_NUMERIC);

		$sorted_offsets = [];
		$sorted_lengths = [];
		foreach ($offsets as $key => $offset) {
			$sorted_offsets[] = $offset;
			$sorted_lengths[] = $lengths[$key];
		}

		$return = [];
		$count = count($sorted_offsets);
		for ($i = 0; $i < $count; $i++) {
			$offset = $sorted_offsets[$i];
			$end_pos = $offset + $sorted_lengths[$i];

			// swallow all following entries that overlap with the current range
			while ($i + 1 < $count && $end_pos > $sorted_offsets[$i + 1]) {
				$i++;
				// an entry that lies completely inside the range must not shorten it
				$end_pos = max($end_pos, $sorted_offsets[$i] + $sorted_lengths[$i]);
			}

			// entries that share an offset are merged above or overwrite each other here,
			// so a single offset => length array is sufficient
			$return[$offset] = $end_pos - $offset;
		}

		return $return;
	}

}
||
252 |
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `!empty(...)` instead.