Passed
Push — master ( c0a3a7...3b84a4 )
by Jeroen
58:51
created

mod/search/classes/Elgg/Search/Highlighter.php (1 issue)

1
<?php
2
3
namespace Elgg\Search;
4
5
/**
6
 * Highlights relevant substrings in search results
7
 *
8
 * @access private
9
 */
10
class Highlighter {

	/**
	 * @var array Search params; the 'query_parts' entry holds the terms to highlight
	 */
	protected $params = [];

	/**
	 * Constructor
	 *
	 * @param array $params Search params
	 *
	 * @access private
	 */
	public function __construct(array $params = []) {
		$this->params = $params;
	}

	/**
	 * Safely highlights search query words found in $text avoiding recursion
	 *
	 * All terms are matched in a single pass over the text, so the markup
	 * inserted for one term can never be matched by another term.
	 * Each match is wrapped in
	 * <span class="search-highlight search-highlight-colorN">, where N is the
	 * 1-based position of the term in 'query_parts'.
	 *
	 * @param string $text Text to highlight
	 *
	 * @return string
	 *
	 * @access public
	 */
	public function highlightWords($text) {

		$text = _elgg_get_display_query($text);

		$alternatives = [];
		$colors = [];
		foreach ($this->getQueryParts() as $position => $part) {
			// remove any boolean mode operators
			$part = preg_replace("/([\-\+~])([\w]+)/i", '$2', $part);
			if (!is_string($part) || $part === '') {
				continue;
			}

			// escape the delimiter and any other regexp special chars;
			// one capture group per term tells us which term matched
			$alternatives[] = '(' . preg_quote($part, '/') . ')';
			$colors[] = $position;
		}

		if (empty($alternatives)) {
			return $text;
		}

		$wrap = function (array $match) use ($colors) {
			foreach ($colors as $group => $color) {
				// groups before the matching alternative are empty strings,
				// groups after it are not reported at all
				if (isset($match[$group + 1]) && $match[$group + 1] !== '') {
					return "<span class=\"search-highlight search-highlight-color{$color}\">{$match[0]}</span>";
				}
			}

			return $match[0];
		};

		$pattern = '/' . implode('|', $alternatives) . '/i';

		// prefer unicode-aware matching; fall back to byte-wise matching
		// if the text or a term is not valid UTF-8
		$highlighted = preg_replace_callback($pattern . 'u', $wrap, $text);
		if ($highlighted === null) {
			$highlighted = preg_replace_callback($pattern, $wrap, $text);
		}

		return ($highlighted === null) ? $text : $highlighted;
	}

	/**
	 * Return a string with highlighted matched queries and relevant context
	 * Determines context based upon occurrence and distance of words with each other.
	 *
	 * Texts that are not longer than $max_length are returned in full.
	 * If none of the query terms occurs in a longer text, the beginning of
	 * the text is returned, cut to $max_length.
	 *
	 * @todo   This also highlights partials even if partial search is not allowed.
	 *
	 * @param string $text              Text to highlight
	 * @param int    $min_match_context Number of characters of context to keep on each side of a match (default: 30)
	 * @param int    $max_length        Maximum length of the truncated and highlighted text (default: 300)
	 *
	 * @return string
	 *
	 * @access public
	 */
	public function highlight($text, $min_match_context = 30, $max_length = 300) {

		$text = strip_tags($text);

		$haystack_length = elgg_strlen($text);

		// if haystack <= $max_length return the entire haystack w/formatting immediately
		if ($haystack_length <= $max_length) {
			return $this->highlightWords($text);
		}

		$haystack_lc = elgg_strtolower($text);
		$haystack_lc_length = elgg_strlen($haystack_lc);

		// get the starting positions and lengths of the context windows
		// around every occurrence of every term
		$starts = [];
		$lengths = [];
		foreach ($this->getQueryParts() as $part) {
			$part = elgg_strtolower($part);
			$word_len = elgg_strlen($part);
			if ($word_len < 1) {
				continue;
			}

			$offset = 0;
			while ($offset < $haystack_lc_length) {
				$pos = elgg_strpos($haystack_lc, $part, $offset);
				if ($pos === false) {
					break;
				}

				$start = max(0, $pos - $min_match_context);
				$starts[] = $start;
				$lengths[] = $pos + $word_len + $min_match_context - $start;

				// continue searching right after this occurrence
				$offset = $pos + $word_len;
			}
		}

		if (empty($starts)) {
			// nothing to center the excerpt on, show the beginning of the text
			return $this->highlightWords(trim(elgg_substr($text, 0, $max_length)) . '...');
		}

		$offsets = $this->consolidateSubstrings($starts, $lengths);

		// figure out if we can adjust the offsets and lengths
		// in order to return more context
		$total_length = array_sum($offsets);

		if ($total_length < $max_length && $offsets) {
			$add_length = (int) floor((($max_length - $total_length) / count($offsets)) / 2);

			$starts = [];
			$lengths = [];
			foreach ($offsets as $offset => $length) {
				$starts[] = ($offset - $add_length > 0) ? $offset - $add_length : 0;
				$lengths[] = $length + $add_length;
			}

			$offsets = $this->consolidateSubstrings($starts, $lengths);
		}

		// sort by order of string size descending (which is roughly
		// the proximity of matched terms) so we can keep the
		// substrings with terms closest together and discard
		// the others as needed to fit within $max_length.
		arsort($offsets);

		$return_strs = [];
		$total_length = 0;
		foreach ($offsets as $start => $length) {
			// continue past if adding this substring exceeds max length
			if ($total_length + $length > $max_length) {
				continue;
			}

			$total_length += $length;
			$return_strs[$start] = trim(elgg_substr($text, $start, $length));
		}

		if (empty($return_strs)) {
			// every window is longer than $max_length: keep the largest one, cut to size
			reset($offsets);
			$start = key($offsets);
			$return_strs[$start] = trim(elgg_substr($text, $start, $max_length));
		}

		// put the strings in order of occurrence
		ksort($return_strs);

		// add ...s where needed
		$return = implode('...', $return_strs);
		if (!array_key_exists(0, $return_strs)) {
			$return = "...$return";
		}

		// add to end of string if last substring doesn't hit the end.
		end($return_strs);
		$last_pos = key($return_strs);
		if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) {
			$return .= '...';
		}

		return $this->highlightWords($return);
	}

	/**
	 * Takes an array of offsets and lengths and consolidates any
	 * overlapping entries, returning an array of new offsets and lengths
	 *
	 * Offsets and lengths are specified in separate arrays because of possible
	 * index collisions with the offsets.
	 *
	 * @param array $offsets offsets
	 * @param array $lengths lengths, keyed like $offsets
	 *
	 * @return array consolidated lengths keyed by offset, in ascending order of offset
	 */
	protected function consolidateSubstrings($offsets, $lengths) {
		// sort offsets by occurrence, keeping the keys that link them to their lengths
		asort($offsets, SORT_NUMERIC);

		// bring the lengths into the same order before the keys are reset
		$sorted_lengths = [];
		foreach (array_keys($offsets) as $key) {
			$sorted_lengths[] = isset($lengths[$key]) ? $lengths[$key] : 0;
		}

		$offsets = array_values($offsets);
		$lengths = $sorted_lengths;

		$return = [];
		$count = count($offsets);
		$i = 0;
		while ($i < $count) {
			$offset = $offsets[$i];
			$end_pos = $offset + $lengths[$i];
			$i++;

			// absorb every following entry that overlaps the current one;
			// an entry fully inside the current one must not shrink it
			while ($i < $count && $end_pos > $offsets[$i]) {
				$end_pos = max($end_pos, $offsets[$i] + $lengths[$i]);
				$i++;
			}

			// will never have a colliding offset, so can return as a single array
			$return[$offset] = $end_pos - $offset;
		}

		return $return;
	}

	/**
	 * Returns the usable query terms from the search params
	 *
	 * Non-scalar and empty terms are dropped. The keys are the 1-based
	 * positions of the terms in 'query_parts'.
	 *
	 * @return string[]
	 */
	protected function getQueryParts() {
		$parts = [];
		$position = 0;
		foreach ((array) elgg_extract('query_parts', $this->params) as $part) {
			$position++;
			if (!is_scalar($part)) {
				continue;
			}

			$part = (string) $part;
			if ($part === '') {
				continue;
			}

			$parts[$position] = $part;
		}

		return $parts;
	}
}
252