Elgg /
Elgg
| 1 | <?php |
||
| 2 | |||
| 3 | namespace Elgg\Search; |
||
| 4 | |||
/**
 * Highlights relevant substrings in search results
 *
 * The search terms are read from the 'query_parts' entry of the params
 * given to the constructor.
 *
 * @access private
 */
class Highlighter {

	/**
	 * @var array
	 */
	protected $params = [];

	/**
	 * Constructor
	 *
	 * @param array $params Search params
	 *
	 * @access private
	 */
	public function __construct(array $params = []) {
		$this->params = $params;
	}

	/**
	 * Safely highlights search query words found in $text avoiding recursion
	 *
	 * All query parts are matched in a single pass over the text, so the
	 * highlight markup inserted for one part can never be matched by
	 * another part (e.g. a numeric term or the word "span").
	 *
	 * Each match is wrapped in
	 * <span class="search-highlight search-highlight-colorN">, where N is the
	 * 1-based position of the matching part in 'query_parts'.
	 *
	 * @param string $text Text to highlight
	 *
	 * @return string
	 *
	 * @access public
	 */
	public function highlightWords($text) {

		$text = _elgg_get_display_query($text);

		// one capture group per usable query part, keyed by its colour number
		$alternatives = [];
		$i = 0;
		foreach ($this->getQueryParts() as $part) {
			// the colour number follows the position in the query,
			// even when a part has to be skipped
			$i++;

			// remove any boolean mode operators
			$part = preg_replace("/([\-\+~])([\w]+)/i", '$2', $part);

			// an empty part would produce a pattern that matches everywhere
			if ($part === null || $part === '') {
				continue;
			}

			// escape the delimiter and any other regexp special chars
			$alternatives[$i] = '(' . preg_quote($part, '/') . ')';
		}

		if (empty($alternatives)) {
			return $text;
		}

		// position in this list + 1 is the capture group number
		$colors = array_keys($alternatives);
		$search = '/' . implode('|', $alternatives) . '/i';

		$highlighted = preg_replace_callback($search, function (array $matches) use ($colors) {
			foreach ($colors as $group => $color) {
				// groups that did not take part in the match are empty or absent
				if (isset($matches[$group + 1]) && $matches[$group + 1] !== '') {
					return "<span class=\"search-highlight search-highlight-color{$color}\">{$matches[0]}</span>";
				}
			}

			return $matches[0];
		}, $text);

		// preg_replace_callback() returns null on a PCRE error; keep the text readable
		return $highlighted === null ? $text : $highlighted;
	}

	/**
	 * Return a string with highlighted matched queries and relevant context
	 * Determines context based upon occurrence and distance of words with each other.
	 *
	 * Texts not longer than $max_length are highlighted as a whole. For longer
	 * texts a window of $min_match_context characters around every match is
	 * extracted, overlapping windows are merged, and as many windows as fit
	 * into $max_length are joined with '...'. If none of the query parts
	 * occurs in the text, the beginning of the text is returned.
	 *
	 * @todo This also highlights partials even if partial search is not allowed.
	 *
	 * @param string $text              Text to highlight
	 * @param int    $min_match_context Number of characters of context kept on each side of a match (default: 30)
	 * @param int    $max_length        Maximum length of the truncated and highlighted text (default: 300)
	 *
	 * @return string
	 *
	 * @access public
	 */
	public function highlight($text, $min_match_context = 30, $max_length = 300) {

		$text = strip_tags($text);

		$haystack_length = elgg_strlen($text);

		// if haystack <= $max_length return the entire haystack w/formatting immediately
		if ($haystack_length <= $max_length) {
			return $this->highlightWords($text);
		}

		$haystack_lc = elgg_strtolower($text);
		$haystack_lc_length = elgg_strlen($haystack_lc);

		// get the starting positions and lengths for all matching words
		$starts = [];
		$lengths = [];
		foreach ($this->getQueryParts() as $part) {
			$part = elgg_strtolower($part);
			$word_len = elgg_strlen($part);
			if ($word_len === 0) {
				// an empty needle cannot be searched for
				continue;
			}

			// walk over every occurrence; a part that does not occur adds nothing
			$offset = 0;
			while ($offset < $haystack_lc_length) {
				$pos = elgg_strpos($haystack_lc, $part, $offset);
				if ($pos === false) {
					break;
				}

				$start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
				$stop = $pos + $word_len + $min_match_context;
				$starts[] = $start;
				$lengths[] = $stop - $start;

				// continue right behind this match
				$offset = $pos + $word_len;
			}
		}

		if (empty($starts)) {
			// nothing to centre the excerpt on: show the beginning of the text
			$excerpt = trim(elgg_substr($text, 0, $max_length));

			return $this->highlightWords($excerpt . '...');
		}

		$offsets = $this->consolidateSubstrings($starts, $lengths);

		// figure out if we can adjust the offsets and lengths
		// in order to return more context
		$total_length = array_sum($offsets);

		if ($total_length < $max_length && !empty($offsets)) {
			// spare room per window, split between its two sides
			$add_length = (int) floor((($max_length - $total_length) / count($offsets)) / 2);

			$starts = [];
			$lengths = [];
			foreach ($offsets as $offset => $length) {
				$start = ($offset - $add_length > 0) ? $offset - $add_length : 0;
				$starts[] = $start;
				// keep the original end and extend it by the trailing context
				$lengths[] = ($offset - $start) + $length + $add_length;
			}

			$offsets = $this->consolidateSubstrings($starts, $lengths);
		}

		// sort by order of string size descending (which is roughly
		// the proximity of matched terms) so we can keep the
		// substrings with terms closest together and discard
		// the others as needed to fit within $max_length.
		arsort($offsets);

		$return_strs = [];
		$kept_lengths = [];
		$total_length = 0;
		foreach ($offsets as $start => $length) {
			// continue past if adding this substring exceeds max length
			if ($total_length + $length > $max_length) {
				continue;
			}

			$total_length += $length;
			$kept_lengths[$start] = $length;
			$return_strs[$start] = trim(elgg_substr($text, $start, $length));
		}

		if (empty($return_strs)) {
			// every window on its own is longer than $max_length:
			// keep the beginning of the longest one
			reset($offsets);
			$start = key($offsets);
			$kept_lengths[$start] = $max_length;
			$return_strs[$start] = trim(elgg_substr($text, $start, $max_length));
		}

		// put the strings in order of occurrence
		ksort($return_strs);

		// add ...s where needed
		$return = implode('...', $return_strs);
		if (!array_key_exists(0, $return_strs)) {
			$return = "...$return";
		}

		// add to end of string if last substring doesn't hit the end.
		$kept_starts = array_keys($return_strs);
		$last_pos = $kept_starts[count($kept_starts) - 1];
		if ($last_pos + $kept_lengths[$last_pos] < $haystack_length) {
			$return .= '...';
		}

		return $this->highlightWords($return);
	}

	/**
	 * Takes an array of offsets and lengths and consolidates any
	 * overlapping entries, returning an array of new offsets and lengths
	 *
	 * Offsets and lengths are specified in separate arrays because of possible
	 * index collisions with the offsets. The entry at index N of $lengths
	 * belongs to the entry at index N of $offsets.
	 *
	 * @param array $offsets offsets
	 * @param array $lengths lengths
	 *
	 * @return array Consolidated lengths keyed by offset, ordered by offset
	 */
	protected function consolidateSubstrings($offsets, $lengths) {
		// sort offsets by occurrence; asort() keeps the keys, so every
		// offset can still be paired with its own length
		asort($offsets, SORT_NUMERIC);

		$return = [];
		$current_start = null;
		$current_end = null;

		foreach ($offsets as $i => $offset) {
			if (!array_key_exists($i, $lengths)) {
				// no length known for this offset
				continue;
			}

			$end_pos = $offset + $lengths[$i];

			if ($current_start !== null && $offset < $current_end) {
				// overlaps the range being built: extend it, never shrink it
				if ($end_pos > $current_end) {
					$current_end = $end_pos;
				}
				continue;
			}

			if ($current_start !== null) {
				$return[$current_start] = $current_end - $current_start;
			}

			$current_start = $offset;
			$current_end = $end_pos;
		}

		if ($current_start !== null) {
			$return[$current_start] = $current_end - $current_start;
		}

		// will never have a colliding offset, so can return as a single array
		return $return;
	}

	/**
	 * Returns the 'query_parts' search param as a list of strings
	 *
	 * A missing param yields an empty list. Entries that cannot be used as
	 * a search term become empty strings so that positions are preserved.
	 *
	 * @return string[]
	 */
	protected function getQueryParts() {
		$parts = [];
		foreach ((array) elgg_extract('query_parts', $this->params) as $part) {
			$parts[] = is_scalar($part) ? (string) $part : '';
		}

		return $parts;
	}
}
||
| 252 |
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `! empty(...)` instead.