custom_search::prepareIndexes()   B
last analyzed

Complexity

Conditions 8
Paths 10

Size

Total Lines 21
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 12
c 0
b 0
f 0
nop 4
dl 0
loc 21
rs 8.4444
nc 10
1
<?php
2
3
/**
4
 * Simple Machines Forum (SMF)
5
 *
6
 * @package SMF
7
 * @author Simple Machines https://www.simplemachines.org
8
 * @copyright 2025 Simple Machines and individual contributors
9
 * @license https://www.simplemachines.org/about/smf/license.php BSD
10
 *
11
 * @version 2.1.5
12
 */
13
14
// Stop right here unless the SMF constant has been defined.
if (defined('SMF') === false)
{
	die('No direct access...');
}
16
17
/**
 * Used for the "custom search index" option
 * Class custom_search
 */
class custom_search extends search_api
{
	/**
	 * @var array Index settings (decoded from $modSettings['search_custom_index_config'])
	 */
	protected $indexSettings = array();

	/**
	 * @var array An array of banned words (from $modSettings['search_stopwords'])
	 */
	protected $bannedWords = array();

	/**
	 * @var int|null Minimum word length (null for no minimum)
	 */
	protected $min_word_length = null;

	/**
	 * @var array Which databases support this method
	 */
	protected $supported_databases = array('mysql', 'postgresql');

	/**
	 * Constructor function
	 *
	 * Flags the API as unsupported when $db_type is not in
	 * $supported_databases; otherwise loads the index settings, the stop
	 * words and the minimum word length from $modSettings. Nothing is loaded
	 * when no custom index configuration has been stored.
	 */
	public function __construct()
	{
		global $smcFunc, $modSettings, $db_type;

		// Is this database supported?
		if (!in_array($db_type, $this->supported_databases))
		{
			$this->is_supported = false;
			return;
		}

		if (empty($modSettings['search_custom_index_config']))
			return;

		$this->indexSettings = $smcFunc['json_decode']($modSettings['search_custom_index_config'], true);

		$this->bannedWords = empty($modSettings['search_stopwords']) ? array() : explode(',', $modSettings['search_stopwords']);
		$this->min_word_length = $this->indexSettings['bytes_per_word'];
	}

	/**
	 * {@inheritDoc}
	 *
	 * The methods listed below are handled by this class; any other name is
	 * passed on to the parent implementation.
	 */
	public function supportsMethod($methodName, $query_params = null)
	{
		switch ($methodName)
		{
			case 'isValid':
			case 'searchSort':
			case 'prepareIndexes':
			case 'indexedWordQuery':
			case 'postCreated':
			case 'postModified':
				return true;
		}

		// Maybe parent got support
		return parent::supportsMethod($methodName, $query_params);
	}

	/**
	 * {@inheritDoc}
	 *
	 * Valid as soon as a custom index configuration has been stored.
	 */
	public function isValid()
	{
		global $modSettings;

		return !empty($modSettings['search_custom_index_config']);
	}

	/**
	 * {@inheritDoc}
	 *
	 * Orders words by byte length, ascending. Words found in the global
	 * $excludedWords list get a 1000 point penalty so they sort first.
	 */
	public function searchSort($a, $b)
	{
		global $excludedWords;

		$x = strlen($a) - (in_array($a, $excludedWords) ? 1000 : 0);
		$y = strlen($b) - (in_array($b, $excludedWords) ? 1000 : 0);

		return $x <=> $y;
	}

	/**
	 * {@inheritDoc}
	 *
	 * Appends $word to $wordsSearch['words'] (unless the index is forced) and
	 * its usable subwords to $wordsSearch['indexed_words']. Subwords of an
	 * excluded word are also appended to $wordsExclude.
	 */
	public function prepareIndexes($word, array &$wordsSearch, array &$wordsExclude, $isExcluded)
	{
		global $modSettings, $smcFunc;

		$subwords = text2words($word, $this->min_word_length, true);

		if (empty($modSettings['search_force_index']))
			$wordsSearch['words'][] = $word;

		// Excluded phrases don't benefit from being split into subwords.
		if (count($subwords) > 1 && $isExcluded)
			return;

		foreach ($subwords as $subword)
		{
			// Skip subwords that are too short or banned.
			if ($smcFunc['strlen']($subword) < $this->min_word_length || in_array($subword, $this->bannedWords))
				continue;

			$wordsSearch['indexed_words'][] = $subword;
			if ($isExcluded)
				$wordsExclude[] = $subword;
		}
	}

	/**
	 * {@inheritDoc}
	 *
	 * Builds and runs the query that selects matching message ids from
	 * {db_prefix}messages, joined against {db_prefix}log_search_words once
	 * per indexed word.
	 *
	 * @param array $words Reads the 'words' and 'indexed_words' lists
	 * @param array $search_data Reads 'params', 'no_regexp', 'insert_into', 'max_results' and 'indexed_results'
	 * @return mixed Whatever $smcFunc['db_search_query'] returns
	 */
	public function indexedWordQuery(array $words, array $search_data)
	{
		global $modSettings, $smcFunc;

		// Specify the function to search with. Regex is for word boundaries.
		$is_search_regex = !empty($modSettings['search_match_words']) && !$search_data['no_regexp'];
		$query_match_type = $is_search_regex ? 'RLIKE' : 'LIKE';
		$word_boundary_wrapper = function(string $str) use ($smcFunc): string
		{
			return sprintf($smcFunc['db_supports_pcre'] ? '\\b%s\\b' : '[[:<:]]%s[[:>:]]', $str);
		};
		$escape_sql_regex = function(string $str): string
		{
			return addcslashes(preg_replace('/[\[\]$.+*?&^|{}()]/', '[$0]', $str), '\\\'');
		};

		// Turns a word or phrase into the value bound to its placeholder:
		// a word-boundary regex when matching whole words, a LIKE pattern otherwise.
		$build_pattern = function($text) use ($smcFunc, $is_search_regex, $word_boundary_wrapper, $escape_sql_regex)
		{
			if ($is_search_regex)
				return $word_boundary_wrapper($escape_sql_regex($text));

			return '%' . $smcFunc['db_escape_wildcard_string']($text) . '%';
		};

		$query_select = array(
			'id_msg' => 'm.id_msg',
		);
		$query_inner_join = array();
		$query_left_join = array();
		$query_where = array();
		$query_params = $search_data['params'];

		if ($query_params['id_search'])
			$query_select['id_search'] = '{int:id_search}';

		$count = 0;
		foreach ($words['words'] as $regularWord)
		{
			$negation = in_array($regularWord, $query_params['excluded_words']) ? 'NOT ' : '';

			$query_where[] = 'm.body ' . $negation . $query_match_type . ' {string:complex_body_' . $count . '}';
			$query_params['complex_body_' . $count++] = $build_pattern($regularWord);
		}

		if ($query_params['user_query'])
			$query_where[] = '{raw:user_query}';
		if ($query_params['board_query'])
			$query_where[] = 'm.id_board {raw:board_query}';

		if ($query_params['topic'])
			$query_where[] = 'm.id_topic = {int:topic}';
		if ($query_params['min_msg_id'])
			$query_where[] = 'm.id_msg >= {int:min_msg_id}';
		if ($query_params['max_msg_id'])
			$query_where[] = 'm.id_msg <= {int:max_msg_id}';

		// Phrases get their own placeholder prefix. Sharing one with the
		// excluded subject words below (both counted from zero) would let the
		// second loop overwrite the values bound by the first.
		$count = 0;
		if (!empty($query_params['excluded_phrases']) && empty($modSettings['search_force_index']))
			foreach ($query_params['excluded_phrases'] as $phrase)
			{
				$query_where[] = 'subject NOT ' . $query_match_type . ' {string:exclude_subject_phrase_' . $count . '}';
				$query_params['exclude_subject_phrase_' . $count++] = $build_pattern($phrase);
			}

		$count = 0;
		if (!empty($query_params['excluded_subject_words']) && empty($modSettings['search_force_index']))
			foreach ($query_params['excluded_subject_words'] as $excludedWord)
			{
				$query_where[] = 'subject NOT ' . $query_match_type . ' {string:exclude_subject_words_' . $count . '}';
				$query_params['exclude_subject_words_' . $count++] = $build_pattern($excludedWord);
			}

		$numTables = 0;
		$prev_join = 0;
		foreach ($words['indexed_words'] as $indexedWord)
		{
			$numTables++;
			if (in_array($indexedWord, $query_params['excluded_index_words']))
			{
				// Excluded word: left join and require that no matching row exists.
				$query_left_join[] = '{db_prefix}log_search_words AS lsw' . $numTables . ' ON (lsw' . $numTables . '.id_word = ' . $indexedWord . ' AND lsw' . $numTables . '.id_msg = m.id_msg)';
				$query_where[] = '(lsw' . $numTables . '.id_word IS NULL)';
			}
			else
			{
				// Required word: chain the join onto the previous required word (or onto m for the first one).
				$query_inner_join[] = '{db_prefix}log_search_words AS lsw' . $numTables . ' ON (lsw' . $numTables . '.id_msg = ' . ($prev_join === 0 ? 'm' : 'lsw' . $prev_join) . '.id_msg)';
				$query_where[] = 'lsw' . $numTables . '.id_word = ' . $indexedWord;
				$prev_join = $numTables;
			}
		}

		return $smcFunc['db_search_query']('insert_into_log_messages_fulltext', ($smcFunc['db_support_ignore'] ? ('
			INSERT IGNORE INTO {db_prefix}' . $search_data['insert_into'] . '
				(' . implode(', ', array_keys($query_select)) . ')') : '') . '
			SELECT ' . implode(', ', $query_select) . '
			FROM {db_prefix}messages AS m' . (empty($query_inner_join) ? '' : '
				INNER JOIN ' . implode('
				INNER JOIN ', $query_inner_join)) . (empty($query_left_join) ? '' : '
				LEFT JOIN ' . implode('
				LEFT JOIN ', $query_left_join)) . '
			WHERE ' . implode('
				AND ', $query_where) . (empty($search_data['max_results']) ? '' : '
			LIMIT ' . ($search_data['max_results'] - $search_data['indexed_results'])),
			$query_params
		);
	}

	/**
	 * {@inheritDoc}
	 *
	 * Indexes every word of the new message body in {db_prefix}log_search_words.
	 */
	public function postCreated(array &$msgOptions, array &$topicOptions, array &$posterOptions)
	{
		global $modSettings, $smcFunc;

		$customIndexSettings = $smcFunc['json_decode']($modSettings['search_custom_index_config'], true);

		$inserts = array();
		foreach (text2words($msgOptions['body'], $customIndexSettings['bytes_per_word'], true) as $word)
			$inserts[] = array($word, $msgOptions['id']);

		if (!empty($inserts))
			$smcFunc['db_insert']('ignore',
				'{db_prefix}log_search_words',
				array('id_word' => 'int', 'id_msg' => 'int'),
				$inserts,
				array('id_word', 'id_msg')
			);
	}

	/**
	 * {@inheritDoc}
	 *
	 * Updates the index rows of an edited message: words that left the body
	 * are removed and words that entered it are added. Does nothing when
	 * $msgOptions['body'] is not set.
	 */
	public function postModified(array &$msgOptions, array &$topicOptions, array &$posterOptions)
	{
		global $modSettings, $smcFunc;

		if (!isset($msgOptions['body']))
			return;

		$customIndexSettings = $smcFunc['json_decode']($modSettings['search_custom_index_config'], true);
		$stopwords = empty($modSettings['search_stopwords']) ? array() : explode(',', $modSettings['search_stopwords']);
		$old_body = isset($msgOptions['old_body']) ? $msgOptions['old_body'] : '';

		// Create the new and old index.
		$old_index = text2words($old_body, $customIndexSettings['bytes_per_word'], true);
		$new_index = text2words($msgOptions['body'], $customIndexSettings['bytes_per_word'], true);

		// Calculate the words to be added and removed from the index.
		$removed_words = array_diff(array_diff($old_index, $new_index), $stopwords);
		$inserted_words = array_diff(array_diff($new_index, $old_index), $stopwords);

		// Delete the removed words AND the added ones to avoid key constraints.
		// This must also happen when nothing was removed: the plain insert
		// below would otherwise run without clearing rows that may already exist.
		$words_to_delete = array_merge($removed_words, $inserted_words);
		if (!empty($words_to_delete))
		{
			$smcFunc['db_query']('', '
				DELETE FROM {db_prefix}log_search_words
				WHERE id_msg = {int:id_msg}
					AND id_word IN ({array_int:removed_words})',
				array(
					'removed_words' => $words_to_delete,
					'id_msg' => $msgOptions['id'],
				)
			);
		}

		// Add the new words to be indexed.
		if (!empty($inserted_words))
		{
			$inserts = array();
			foreach ($inserted_words as $word)
				$inserts[] = array($word, $msgOptions['id']);

			// id_word is declared as int, matching postCreated() and the array_int binding above.
			$smcFunc['db_insert']('insert',
				'{db_prefix}log_search_words',
				array('id_word' => 'int', 'id_msg' => 'int'),
				$inserts,
				array('id_word', 'id_msg')
			);
		}
	}
}
336
337
?>