Completed
Push — development ( 6aa9a7...e1c66d )
by Thorsten
15s
created

ManageSearch.subs.php ➔ SphinxVersion()   A

Complexity

Conditions 3
Paths 2

Size

Total Lines 13
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
eloc 6
nc 2
nop 0
dl 0
loc 13
ccs 0
cts 9
cp 0
crap 12
rs 9.4285
c 1
b 0
f 0
1
<?php
2
3
/**
4
 * Support functions for setting up the search features and creating search indexes
5
 *
6
 * @name      ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause
9
 *
10
 * This file contains code covered by:
11
 * copyright:	2011 Simple Machines (http://www.simplemachines.org)
12
 * license:  	BSD, See included LICENSE.TXT for terms and conditions.
13
 *
14
 * @version 1.1 beta 1
15
 *
16
 */
17
18
// Refuse to run when this file is requested directly instead of being loaded by the application
if (!defined('ELK'))
{
	die('No access...');
}
20
21
/**
 * Checks if the message table already has a fulltext index created and records the key name(s)
 * Determines if a db is capable of creating a fulltext index
 *
 * What it does:
 * - Sets $context['fulltext_index'] to an array of the unique key names of every fulltext
 *   index found on the body column, or to an empty string when none is found
 * - Sets $context['cannot_create_fulltext'] to true when the table status reports a
 *   Type / Engine other than MyISAM (it is never set to false here)
 *
 * @package Search
 */
function detectFulltextIndex()
{
	global $context, $db_prefix;

	$db = database();

	$request = $db->query('', '
		SHOW INDEX
		FROM {db_prefix}messages',
		array(
		)
	);

	// Gather the names in a real array; using [] on a string value is a fatal error as of PHP 7.1
	$fulltext_indexes = array();
	if ($request !== false)
	{
		while ($row = $db->fetch_assoc($request))
		{
			// Depending on the server the marker is reported in Index_type or in Comment
			$is_fulltext = (isset($row['Index_type']) && $row['Index_type'] == 'FULLTEXT')
				|| (isset($row['Comment']) && $row['Comment'] == 'FULLTEXT');

			if ($row['Column_name'] == 'body' && $is_fulltext)
				$fulltext_indexes[] = $row['Key_name'];
		}
		$db->free_result($request);
	}

	// Keep the established contract: empty string when there is no index, array of names otherwise
	$context['fulltext_index'] = empty($fulltext_indexes) ? '' : array_unique($fulltext_indexes);

	// A prefix in the form `database`.prefix means the database has to be named in the query
	if (preg_match('~^`(.+?)`\.(.+?)$~', $db_prefix, $match) === 1)
		$request = $db->query('', '
			SHOW TABLE STATUS
			FROM {string:database_name}
			LIKE {string:table_name}',
			array(
				'database_name' => '`' . strtr($match[1], array('`' => '')) . '`',
				'table_name' => str_replace('_', '\_', $match[2]) . 'messages',
			)
		);
	else
		$request = $db->query('', '
			SHOW TABLE STATUS
			LIKE {string:table_name}',
			array(
				'table_name' => str_replace('_', '\_', $db_prefix) . 'messages',
			)
		);

	if ($request !== false)
	{
		while ($row = $db->fetch_assoc($request))
		{
			// The storage engine is reported as Type or as Engine, anything but MyISAM is flagged
			if ((isset($row['Type']) && strtolower($row['Type']) != 'myisam') || (isset($row['Engine']) && strtolower($row['Engine']) != 'myisam'))
				$context['cannot_create_fulltext'] = true;
		}

		$db->free_result($request);
	}
}
79
80
/**
 * Attempts to determine the version of the Sphinx daemon
 *
 * - Runs "searchd --help" and looks for "Sphinx x.y.z" in the first line of its output
 *
 * @return string the version found, or '0.0.0' when it could not be determined
 */
function SphinxVersion()
{
	$version = '0.0.0';

	// When exec is disabled the call itself can fail in a way that @ does not silence
	if (!function_exists('exec'))
		return $version;

	// Can we get the version that is running/installed?
	$sphver = array();
	@exec('searchd --help', $sphver);
	if (!empty($sphver) && preg_match('~Sphinx (\d\.\d\.\d\d?)~', $sphver[0], $match))
	{
		$version = $match[1];
	}

	return $version;
}
96
97
/**
 * Creates and outputs the Sphinx configuration file
 *
 * What it does:
 * - Discards any open output buffers and sends the headers for a sphinx.conf download
 * - Reads the five search_weight_* settings, using built in defaults when they add up to zero
 * - Fills in $modSettings['sphinx_data_path'] and $modSettings['sphinx_log_path'] when they are empty
 * - Writes the stopwords line only when a stopword path is set, charset_type only for
 *   Sphinx versions below 2.2.2 and max_matches only for versions below 2.2.3
 * - Ends by calling obExit(false, false)
 *
 * @package Search
 */
function createSphinxConfig()
{
	global $db_server, $db_name, $db_user, $db_passwd, $db_prefix, $modSettings;

	$version = SphinxVersion();

	// Set up to output a file to the users browser
	while (ob_get_level() > 0)
		@ob_end_clean();

	header('Content-Encoding: none');
	header('Pragma: ');
	if (!isBrowser('is_gecko'))
		header('Content-Transfer-Encoding: binary');
	header('Connection: close');
	header('Content-Disposition: attachment; filename="sphinx.conf"');
	header('Content-Type: application/octet-stream');

	$weight_factors = array(
		'age',
		'length',
		'first_message',
		'sticky',
		'likes',
	);

	$weight = array();
	$weight_total = 0;
	foreach ($weight_factors as $weight_factor)
	{
		$weight[$weight_factor] = empty($modSettings['search_weight_' . $weight_factor]) ? 0 : (int) $modSettings['search_weight_' . $weight_factor];
		$weight_total += $weight[$weight_factor];
	}

	// Weightless, then use defaults
	if ($weight_total === 0)
	{
		$weight = array(
			'age' => 25,
			'length' => 25,
			'first_message' => 25,
			'sticky' => 15,
			'likes' => 10
		);
		$weight_total = 100;
	}

	// Check paths are set, if not use some defaults
	$modSettings['sphinx_data_path'] = empty($modSettings['sphinx_data_path']) ? '/var/sphinx/data' : $modSettings['sphinx_data_path'];
	$modSettings['sphinx_log_path'] = empty($modSettings['sphinx_log_path']) ? '/var/sphinx/log' : $modSettings['sphinx_log_path'];

	// Numeric settings with their fallbacks
	$indexer_mem = empty($modSettings['sphinx_indexer_mem']) ? 128 : (int) $modSettings['sphinx_indexer_mem'];
	$searchd_port = empty($modSettings['sphinx_searchd_port']) ? 9312 : (int) $modSettings['sphinx_searchd_port'];
	$sphinxql_port = empty($modSettings['sphinxql_searchd_port']) ? 9306 : (int) $modSettings['sphinxql_searchd_port'];
	$max_results = empty($modSettings['sphinx_max_results']) ? 2000 : (int) $modSettings['sphinx_max_results'];

	// Optional lines carry their own leading newline, so leaving one out leaves no trace in the file.
	// The stopword path is only read inside this check, an unset or empty setting outputs nothing at all.
	$stopwords_line = empty($modSettings['sphinx_stopword_path']) ? '' : "\n\tstopwords\t\t= " . $modSettings['sphinx_stopword_path'];
	$charset_type_line = version_compare($version, '2.2.2') < 0 ? "\n\tcharset_type\t= utf-8" : '';
	$max_matches_line = version_compare($version, '2.2.3') < 0 ? "\n\tmax_matches\t\t\t\t= " . $max_results : '';

	// Output our minimal configuration file to get them started
	echo '#
# Sphinx configuration file (sphinx.conf), configured for ElkArte
#
# This is the minimum needed clean, simple, functional
#
# By default the location of this file would probably be:
# /usr/local/etc/sphinx.conf or /etc/sphinxsearch/sphinx.conf
#

source elkarte_source
{
	type				= mysql
	sql_host			= ', $db_server, '
	sql_user			= ', $db_user, '
	sql_pass			= ', $db_passwd, '
	sql_db				= ', $db_name, '
	sql_port			= 3306
	sql_query_pre		= SET NAMES utf8
	# If you do not have query_cache enabled in my.cnf, then you can comment out the next line
	sql_query_pre		= SET SESSION query_cache_type=OFF
	sql_query_pre		= \
		REPLACE INTO ', $db_prefix, 'settings (variable, value) \
		SELECT \'sphinx_indexed_msg_until\', MAX(id_msg) \
		FROM ', $db_prefix, 'messages
	sql_query_range		= \
		SELECT 1, value \
		FROM ', $db_prefix, 'settings \
		WHERE variable	= \'sphinx_indexed_msg_until\'
	sql_range_step		= 1000
	sql_query			= \
		SELECT \
			m.id_msg, m.id_topic, m.id_board, IF(m.id_member = 0, 4294967295, m.id_member) AS id_member, m.poster_time, m.body, m.subject, \
			t.num_replies + 1 AS num_replies, CEILING(1000000 * ( \
				IF(m.id_msg < 0.7 * s.value, 0, (m.id_msg - 0.7 * s.value) / (0.3 * s.value)) * ' . $weight['age'] . ' + \
				IF(t.num_replies < 50, t.num_replies / 50, 1) * ' . $weight['length'] . ' + \
				IF(m.id_msg = t.id_first_msg, 1, 0) * ' . $weight['first_message'] . ' + \
				IF(t.num_likes < 10, t.num_likes / 10, 1) * ' . $weight['likes'] . ' + \
				IF(t.is_sticky = 0, 0, 100) * ' . $weight['sticky'] . ' \
			) / ' . $weight_total . ') AS relevance \
		FROM ', $db_prefix, 'messages AS m, ', $db_prefix, 'topics AS t, ', $db_prefix, 'settings AS s \
		WHERE t.id_topic = m.id_topic \
			AND s.variable = \'maxMsgID\' \
			AND m.id_msg BETWEEN $start AND $end
	sql_attr_uint		= id_topic
	sql_attr_uint		= id_board
	sql_attr_uint		= id_member
	sql_attr_timestamp	= poster_time
	sql_attr_uint		= relevance
	sql_attr_uint		= num_replies
}

source elkarte_delta_source : elkarte_source
{
	sql_query_pre = SET NAMES utf8
	# If you do not have query_cache enabled in my.cnf, then you can comment out the next line
	sql_query_pre = SET SESSION query_cache_type=OFF
	sql_query_range	= \
		SELECT s1.value, s2.value \
		FROM ', $db_prefix, 'settings AS s1, ', $db_prefix, 'settings AS s2 \
		WHERE s1.variable = \'sphinx_indexed_msg_until\' \
			AND s2.variable = \'maxMsgID\'
}

index elkarte_base_index
{
	html_strip		= 1
	source			= elkarte_source
	path			= ', $modSettings['sphinx_data_path'], '/elkarte_sphinx_base.index', $stopwords_line, '
	min_word_len	= 2', $charset_type_line, '
	charset_table	= 0..9, A..Z->a..z, _, a..z, U+451->U+435, U+401->U+435, U+410..U+42F->U+430..U+44F, U+430..U+44F
	ignore_chars	= -, U+AD
}

index elkarte_delta_index : elkarte_base_index
{
	source			= elkarte_delta_source
	path			= ', $modSettings['sphinx_data_path'], '/elkarte_sphinx_delta.index
}

index elkarte_index
{
	type			= distributed
	local			= elkarte_base_index
	local			= elkarte_delta_index
}

indexer
{
	mem_limit		= ', $indexer_mem, 'M
}

searchd
{
	listen					= ', $searchd_port, '
	listen					= ', $sphinxql_port, ':mysql41
	log						= ', $modSettings['sphinx_log_path'], '/searchd.log
	query_log				= ', $modSettings['sphinx_log_path'], '/query.log
	read_timeout			= 5
	max_children			= 30
	pid_file				= ', $modSettings['sphinx_data_path'], '/searchd.pid', $max_matches_line, '
}
';
	obExit(false, false);
}
261
262
/**
 * Drops one or more indexes from a table and, when asked to, adds them back as fulltext indexes
 *
 * - All names are dropped in a single ALTER TABLE, errors from that query are skipped
 * - Each name is re-added with its own ALTER TABLE, using the name both as index and column name
 *
 * @package Search
 * @param string $table
 * @param string[]|string $indexes
 * @param boolean $add
 */
function alterFullTextIndex($table, $indexes, $add = false)
{
	$db = database();

	// A single name is treated as a list of one
	if (!is_array($indexes))
	{
		$indexes = array($indexes);
	}

	// Make sure it's gone before creating it.
	$drop_list = implode(',
		DROP INDEX ', $indexes);
	$db->query('', '
		ALTER TABLE ' . $table . '
		DROP INDEX ' . $drop_list,
		array(
			'db_error_skip' => true,
		)
	);

	if ($add)
	{
		foreach ($indexes as $index_name)
		{
			$db->query('', '
				ALTER TABLE ' . $table . '
				ADD FULLTEXT {raw:name} ({raw:name})',
				array(
					'name' => $index_name
				)
			);
		}
	}
}
298
299
/**
 * Creates a custom search index
 *
 * What it does:
 * - On a fresh run ($start === 0) drops the word table, creates it again and blanks the
 *   search_index setting (when it was 'custom') and the search_custom_index_config setting
 * - Indexes message bodies in batches for roughly three seconds per call and stores the
 *   position to continue from in the search_custom_index_resume setting
 *
 * @package Search
 * @param int $start
 * @param int $messages_per_batch
 * @param string $column_size_definition
 * @param mixed[] $index_settings array containing specifics of what to create e.g. bytes per word
 *
 * @return array array($start, $step, $percentage), step is 2 and start is 0 once every message is indexed
 */
function createSearchIndex($start, $messages_per_batch, $column_size_definition, $index_settings)
{
	global $modSettings;

	$db = database();
	$db_search = db_search();
	$step = 1;

	// Starting a new index we set up for the run
	if ($start === 0)
	{
		drop_log_search_words();

		$db_search->create_word_search($column_size_definition);

		// Temporarily switch back to not using a search index.
		if (!empty($modSettings['search_index']) && $modSettings['search_index'] == 'custom')
			updateSettings(array('search_index' => ''));

		// Don't let simultaneous processes be updating the search index.
		if (!empty($modSettings['search_custom_index_config']))
			updateSettings(array('search_custom_index_config' => ''));
	}

	$num_messages = array(
		'done' => 0,
		'todo' => 0,
	);

	// How many messages are behind us and how many are still ahead
	$request = $db->query('', '
		SELECT id_msg >= {int:starting_id} AS todo, COUNT(*) AS num_messages
		FROM {db_prefix}messages
		GROUP BY todo',
		array(
			'starting_id' => $start,
		)
	);
	while ($row = $db->fetch_assoc($request))
		$num_messages[empty($row['todo']) ? 'done' : 'todo'] = $row['num_messages'];
	// Release the result, $request is reused for the batch queries below
	$db->free_result($request);

	// Done with indexing the messages, on to the next step
	if (empty($num_messages['todo']))
	{
		$step = 2;
		$percentage = 80;
		$start = 0;
	}
	// Still on step one, inserting all the indexed words.
	else
	{
		// Number of seconds before the next step.
		$stop = time() + 3;
		while (time() < $stop)
		{
			$inserts = array();
			$request = $db->query('', '
				SELECT id_msg, body
				FROM {db_prefix}messages
				WHERE id_msg BETWEEN {int:starting_id} AND {int:ending_id}
				LIMIT {int:limit}',
				array(
					'starting_id' => $start,
					'ending_id' => $start + $messages_per_batch - 1,
					'limit' => $messages_per_batch,
				)
			);
			$forced_break = false;
			$number_processed = 0;
			while ($row = $db->fetch_assoc($request))
			{
				// In theory it's possible for one of these to take friggin ages so add more timeout protection.
				if ($stop < time())
				{
					$forced_break = true;
					break;
				}

				$number_processed++;
				foreach (text2words($row['body'], $index_settings['bytes_per_word'], true) as $id_word)
					$inserts[] = array($id_word, $row['id_msg']);
			}
			$num_messages['done'] += $number_processed;
			$num_messages['todo'] -= $number_processed;
			$db->free_result($request);

			// An interrupted batch only moves forward by what was actually handled
			$start += $forced_break ? $number_processed : $messages_per_batch;

			if (!empty($inserts))
				$db->insert('ignore',
					'{db_prefix}log_search_words',
					array('id_word' => 'int', 'id_msg' => 'int'),
					$inserts,
					array('id_word', 'id_msg')
				);

			// Done then set up for the next step, set up for the next loop.
			if ($num_messages['todo'] === 0)
			{
				$step = 2;
				$start = 0;
				break;
			}
			else
				updateSettings(array('search_custom_index_resume' => serialize(array_merge($index_settings, array('resume_at' => $start)))));
		}

		// Since there are still steps to go, 80% is the maximum here.
		$percentage = round($num_messages['done'] / ($num_messages['done'] + $num_messages['todo']), 3) * 80;
	}

	return array($start, $step, $percentage);
}
420
421
/**
 * Removes common stop words from the index as they inhibit search performance
 *
 * - Walks the word ids in slices of $column_definition['step_size'] for about three seconds per call
 * - A word counts as common when it has more rows than 60% of $modSettings['totalMessages']
 * - The collected ids are saved in the search_stopwords setting and deleted from the word table
 *
 * @package Search
 * @param int $start
 * @param mixed[] $column_definition
 *
 * @return array array($start, $complete), complete is true once $start has passed $column_definition['max_size']
 */
function removeCommonWordsFromIndex($start, $column_definition)
{
	global $modSettings;

	$db = database();

	// A fresh run, or nothing stored so far, begins with an empty list
	if ($start === 0 || empty($modSettings['search_stopwords']))
	{
		$common_words = array();
	}
	else
	{
		$common_words = explode(',', $modSettings['search_stopwords']);
	}

	$deadline = time() + 3;
	$threshold = ceil(60 * $modSettings['totalMessages'] / 100);

	while (time() < $deadline)
	{
		// Which words in this slice show up in too many messages?
		$request = $db->query('', '
			SELECT id_word, COUNT(id_word) AS num_words
			FROM {db_prefix}log_search_words
			WHERE id_word BETWEEN {int:starting_id} AND {int:ending_id}
			GROUP BY id_word
			HAVING COUNT(id_word) > {int:minimum_messages}',
			array(
				'starting_id' => $start,
				'ending_id' => $start + $column_definition['step_size'] - 1,
				'minimum_messages' => $threshold,
			)
		);
		while ($row = $db->fetch_assoc($request))
		{
			$common_words[] = $row['id_word'];
		}
		$db->free_result($request);

		// Remember what has been found so far
		updateSettings(array('search_stopwords' => implode(',', $common_words)));

		if (!empty($common_words))
		{
			$db->query('', '
				DELETE FROM {db_prefix}log_search_words
				WHERE id_word in ({array_int:stop_words})',
				array(
					'stop_words' => $common_words,
				)
			);
		}

		// On to the next slice, unless that was the last one
		$start += $column_definition['step_size'];
		if ($start > $column_definition['max_size'])
		{
			return array($start, true);
		}
	}

	return array($start, false);
}
478
479
/**
 * Drops the {db_prefix}log_search_words table through the db_table() handler
 *
 * @package Search
 */
function drop_log_search_words()
{
	db_table()->db_drop_table('{db_prefix}log_search_words');
}