Passed
Pull Request — development (#3442)
by Elk
12:13 queued 06:23
created

SearchArray::searchArrayExtended()   D

Complexity

Conditions 18
Paths 176

Size

Total Lines 85
Code Lines 35

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 38.736

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 18
eloc 35
nc 176
nop 1
dl 0
loc 85
rs 4.2333
c 1
b 0
f 0
ccs 9
cts 15
cp 0.6
crap 38.736

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * Utility class for search functionality.
5
 *
6
 * @package   ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause (see accompanying LICENSE.txt file)
9
 *
10
 * This file contains code covered by:
11
 * copyright: 2011 Simple Machines (http://www.simplemachines.org)
12
 *
13
 * @version 2.0 dev
14
 *
15
 */
16
17
namespace ElkArte\Search;
18
19
use ElkArte\Util;
20
21
/**
22
 * Breaks a search string down into the words and phrases to search for and the ones to exclude
23
 */
24
class SearchArray
25
{
26
	/** @var string The search string as provided, e.g. orwell "striking thirteen" -movie */
	protected $_search_string = '';

	/** @var string[] Words not to be found in the search results (-word) */
	private $_excludedWords = [];

	/** @var bool Simplify the fulltext search */
	private $_search_simple_fulltext = false;

	/** @var bool If we are performing a boolean or simple search */
	private $_no_regexp = false;

	/** @var string[] Holds the words and phrases to be searched on */
	private $_searchArray = [];

	/** @var string[] Words we do not search due to length or common terms */
	private $_blocklist_words = [];

	/** @var bool If search words were found on the blocklist */
	private $_foundBlockListedWords = false;

	/** @var string[] Holds words that will not be searched on to inform the user they were skipped */
	private $_ignored = [];
53
54
	/**
	 * Stores the search settings and parses the search string right away.
	 *
	 * @param string $search_string The query to break down into words and phrases
	 * @param string[] $blocklist_words Words that are never searched on
	 * @param bool $search_simple_fulltext When true, double quotes are stripped from the cleaned query before phrases are looked for
	 */
	public function __construct($search_string, $blocklist_words, $search_simple_fulltext = false)
	{
		// Everything searchArray() reads has to be in place before it runs.
		$this->_blocklist_words = $blocklist_words;
		$this->_search_simple_fulltext = $search_simple_fulltext;
		$this->_search_string = $search_string;

		$this->searchArray();
	}
68
69
	/**
	 * Builds the search array
	 *
	 * - Pulls the "quoted phrases" and the single words out of the search string
	 * - Moves -word and -"a phrase" over to the excluded words
	 * - Drops empty, blocklisted and too short (less than 2 characters) terms
	 * - Keeps no more than 10 unique terms
	 *
	 * @return string[] The words and phrases to search for
	 */
	protected function searchArray()
	{
		// Change non-word characters into spaces.
		$stripped_query = $this->cleanString($this->_search_string);

		// This option will do fulltext searching in the most basic way.
		if ($this->_search_simple_fulltext)
		{
			$stripped_query = strtr($stripped_query, ['"' => '']);
		}

		// NOTE(review): cleanString() has already turned &, # and ; into spaces, so this test
		// looks like it can never match - confirm whether it should run on the raw string instead.
		$this->_no_regexp = preg_match('~&#(?:\d{1,7}|x[0-9a-fA-F]{1,6});~', $stripped_query) === 1;

		// Extract phrase parts first (e.g. some words "this is a phrase" some more words.)
		// The closing boundary is a lookahead, not a consumed character, so the single space
		// between "one phrase" "and another" is still there to start the second match.
		preg_match_all('/(?:^|\s)([-]?)"([^"]+)"(?=$|\s)/', $stripped_query, $matches, PREG_PATTERN_ORDER);
		$phraseArray = $matches[2];

		// Remove the phrase parts and extract the words.
		$wordArray = preg_replace('~(?:^|\s)(?:[-]?)"(?:[^"]+)"(?=$|\s)~u', ' ', $this->_search_string);
		$wordArray = explode(' ', Util::htmlspecialchars(un_htmlspecialchars($wordArray), ENT_QUOTES));

		// A minus sign in front of a word excludes the word.... so...
		// .. first, we check for things like -"some words", but not "-some words".
		$phraseArray = $this->_checkExcludePhrase($matches[1], $phraseArray);

		// Now we look for -test, etc.... normaller.
		$wordArray = $this->_checkExcludeWord($wordArray);

		// The remaining words and phrases are all included.
		$this->_searchArray = array_merge($phraseArray, $wordArray);

		// Trim everything and make sure there are no words that are the same.
		foreach ($this->_searchArray as $index => $value)
		{
			$word = trim($value, '-_\' ');
			$this->_searchArray[$index] = $word;

			// Skip anything practically empty.
			if ($word === '')
			{
				unset($this->_searchArray[$index]);
			}
			// Skip blocklisted words. Make sure to note we skipped them in case we end up with nothing.
			// Strict, so that numeric looking words (007 and 7) are not treated as the same word.
			elseif (in_array($word, $this->_blocklist_words, true))
			{
				$this->_foundBlockListedWords = true;
				unset($this->_searchArray[$index]);
			}
			// Don't allow very, very short words. Measure what is left after trimming,
			// otherwise something like a_ would slip through as the one character word a.
			elseif (Util::strlen($word) < 2)
			{
				$this->_ignored[] = $word;
				unset($this->_searchArray[$index]);
			}
		}

		$this->_searchArray = array_slice(array_unique($this->_searchArray), 0, 10);

		return $this->_searchArray;
	}
131
132
	/**
	 * Looks for phrases that should be excluded from results
	 *
	 * - Check for things like -"some words", but not "-some words"
	 * - Prevents redundancy with blocklist words
	 * - Excluded phrases are added to the excluded words and removed from the returned array
	 *
	 * @param string[] $matches For each phrase, the minus sign found in front of it or an empty string
	 * @param string[] $phraseArray The phrases, using the same indexes as $matches
	 *
	 * @return string[] The phrases that were not excluded, original indexes preserved
	 */
	private function _checkExcludePhrase($matches, $phraseArray)
	{
		foreach ($matches as $index => $word)
		{
			// No minus in front of it, so this phrase stays in the search
			if ($word !== '-')
			{
				continue;
			}

			// Strict comparison so numeric looking phrases (007 and 7) are not seen as the same
			$phrase = trim($phraseArray[$index], '-_\' ');
			if ($phrase !== '' && !in_array($phrase, $this->_blocklist_words, true))
			{
				$this->_excludedWords[] = $phrase;
			}

			unset($phraseArray[$index]);
		}

		return $phraseArray;
	}
160
161
	/**
	 * Looks for words that should be excluded in the results (-word)
	 *
	 * - Look for -test, etc
	 * - Prevents excluding blocklist words since it is redundant
	 * - Excluded words are added to the excluded words and removed from the returned array
	 *
	 * @param string[] $wordArray The single words found in the search string
	 *
	 * @return string[] The words that were not excluded, original indexes preserved
	 */
	private function _checkExcludeWord($wordArray)
	{
		foreach ($wordArray as $index => $word)
		{
			// Only words that start with a minus sign are of interest here
			if (strpos(trim($word), '-') !== 0)
			{
				continue;
			}

			// Strict comparison so numeric looking words (007 and 7) are not seen as the same
			$excluded = trim($word, '-_\' ');
			if ($excluded !== '' && !in_array($excluded, $this->_blocklist_words, true))
			{
				$this->_excludedWords[] = $excluded;
			}

			unset($wordArray[$index]);
		}

		return $wordArray;
	}
188
189
	/**
	 * Constructs a binary mode query to pass back to a search API
	 *
	 * - Understands the use of OR | AND & as search modifiers
	 * - A leading - excludes the word or "quoted phrase" that follows it
	 * - OR'd keywords are grouped as (one | two), everything else is joined with a space
	 * - Currently used by the sphinx API's
	 *
	 * @param string $string The user entered query to construct with
	 * @return string A binary mode query, or an empty string when there is nothing left to search for
	 */
	public function searchArrayExtended($string)
	{
		// Split our search string and return an empty string if no matches
		if (!preg_match_all('~(-?)("[^"]+"|[^" ]+)~', $string, $tokens, PREG_SET_ORDER))
		{
			return '';
		}

		// First we split our string into included and excluded words and phrases
		$keywords = ['include' => [], 'exclude' => []];
		$or_part = false;
		foreach ($tokens as $token)
		{
			// Strip the quotes off of a phrase
			$text = $token[2];
			$phrase = $text[0] === '"';
			if ($phrase)
			{
				$text = substr($text, 1, -1);
			}

			// Prepare this token
			$addWords = $this->_extendedTokenWords($text, $phrase);

			// Excluding this word?
			if ($token[1] === '-')
			{
				$keywords['exclude'] = array_merge($keywords['exclude'], $addWords);
			}
			// OR'd keywords (we only do this if we have something to OR with)
			elseif (($text === 'OR' || $text === '|') && !empty($keywords['include']))
			{
				// Make the previous keyword an OR group, unless it already is one, for the next keyword to join
				$last = array_pop($keywords['include']);
				$keywords['include'][] = is_array($last) ? $last : [$last];
				$or_part = true;
				continue;
			}
			// AND is implied in a Sphinx Search, and a token that cleaned down to nothing is of no use
			elseif ($text === 'AND' || $text === '&' || empty($addWords))
			{
				continue;
			}
			// Must be something they want to search for!
			elseif ($or_part)
			{
				// If this was part of an OR branch, add it to the proper section
				$group = count($keywords['include']) - 1;
				$keywords['include'][$group] = array_merge($keywords['include'][$group], $addWords);
			}
			else
			{
				$keywords['include'] = array_merge($keywords['include'], $addWords);
			}

			// Start fresh on this...
			$or_part = false;
		}

		// Let's make sure they're not canceling each other out, serialize is used since OR groups are arrays
		$remaining = array_diff(array_map('serialize', $keywords['include']), array_map('serialize', $keywords['exclude']));
		if (empty($remaining))
		{
			return '';
		}

		// Now we compile our arrays into a valid search string
		return $this->_compileExtendedQuery($keywords['include'], $keywords['exclude']);
	}

	/**
	 * Cleans a single token of an extended query and returns the search terms found in it
	 *
	 * @param string $text The token without its leading minus sign and surrounding quotes
	 * @param bool $phrase If the token was a "quoted phrase"
	 * @return string[] A single quoted term for a phrase, one term per word otherwise, empty if nothing survived the cleaning
	 */
	private function _extendedTokenWords($text, $phrase)
	{
		$cleanWords = trim((string) $this->cleanString($text));
		if ($cleanWords === '')
		{
			return [];
		}

		if ($phrase)
		{
			return ['"' . $cleanWords . '"'];
		}

		// Explode the cleanWords again in case the cleaning puts more spaces into it
		$words = preg_split('~\s+~u', $cleanWords, -1, PREG_SPLIT_NO_EMPTY);

		return $words === false ? [] : $words;
	}

	/**
	 * Joins the collected keywords into the final query string
	 *
	 * @param array $include Terms to search for, an array entry is a group of OR'd terms
	 * @param string[] $exclude Terms that must not be found
	 * @return string The terms joined with spaces, OR groups as (one | two), excluded terms with a leading -
	 */
	private function _compileExtendedQuery($include, $exclude)
	{
		$query_parts = [];
		foreach ($include as $keyword)
		{
			$query_parts[] = is_array($keyword) ? '(' . implode(' | ', $keyword) . ')' : $keyword;
		}

		foreach ($exclude as $keyword)
		{
			$query_parts[] = '-' . $keyword;
		}

		return implode(' ', $query_parts);
	}
284
285
	/**
	 * Reduces a string to letters, numbers, underscores, double quotes and hyphens,
	 * so that search syntax such as -movie or "animal farm" survives the cleaning
	 *
	 * @param string $string A string to clean
	 * @return string A cleaned up string
	 */
	public function cleanString($string)
	{
		// Work on the real characters instead of their entities, and in lowercase
		$lowered = Util::strtolower(html_entity_decode($string, ENT_QUOTES, 'UTF-8'));

		// Join groups of digits that are separated by punctuation (phone numbers, SSN, dates, etc) 123-45-6789 => 123456789
		$joined = preg_replace('~([\d]+)\pP+(?=[\d])~u', '$1', $lowered);

		// Each run of anything that is not a letter, a number or one of _ " - becomes a single space
		return preg_replace('~[^\pL\pN_"-]+~u', ' ', $joined);
	}
308
309
	/**
	 * Returns the words and phrases to search for, as built by searchArray()
	 *
	 * @return string[]
	 */
	public function getSearchArray()
	{
		return $this->_searchArray;
	}
313
314
	/**
	 * Returns the words and phrases that were excluded with a leading minus sign
	 *
	 * @return string[]
	 */
	public function getExcludedWords()
	{
		return $this->_excludedWords;
	}
318
319
	/**
	 * Returns the no regexp flag, set by searchArray() when it finds a numeric entity in the cleaned query
	 *
	 * @return bool
	 */
	public function getNoRegexp()
	{
		return $this->_no_regexp;
	}
323
324
	/**
	 * If any search term was dropped because it is on the blocklist
	 *
	 * @return bool
	 */
	public function foundBlockListedWords()
	{
		return $this->_foundBlockListedWords;
	}
328
329
	/**
	 * Returns the search terms that were skipped for being shorter than two characters
	 *
	 * @return string[]
	 */
	public function getIgnored()
	{
		return $this->_ignored;
	}
333
}
334