Completed
Push — add/search ( 1ce891...804004 )
by
unknown
09:45
created

boost_lang_probs()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 6
nc 2
nop 1
dl 0
loc 9
rs 9.6666
c 0
b 0
f 0
1
<?php
2
3
/**
4
 * Parse a pure text query into WordPress Elasticsearch query. This builds on
5
 * the Jetpack_WPES_Query_Builder() to provide search query parsing.
6
 *
7
 * The key part of this parser is taking a user's query string typed into a box
8
 * and converting it into an ES search query.
9
 *
10
 * This varies by application, but roughly it means extracting some parts of the query
11
 * (authors, tags, and phrases) that are treated as a filter. Then taking the
12
 * remaining words and building the correct query (possibly with prefix searching
13
 * if we are doing search as you type)
14
 *
15
 * This class only supports ES 2.x+
16
 *
17
 * This parser builds queries of the form:
18
 *   bool:
19
 *     must:
20
 *       AND match of a single field (ideally an edgengram field)
21
 *     filter:
22
 *       filter clauses from context (eg @gibrown, #news, etc)
23
 *     should:
24
 *       boosting of results by various fields
25
 *
26
 * Features supported:
27
 *  - search as you type
28
 *  - phrases
29
 *  - supports querying across multiple languages at once
30
 *
31
 * Example usage (from Search on Reader Manage):
32
 *
33
 *		require_lib( 'jetpack-wpes-query-builder/jetpack-wpes-search-query-parser' );
34
 *		$parser = new Jetpack_WPES_Search_Query_Parser( $args['q'], array( $lang ) );
35
 *
36
 *		//author
37
 *		$parser->author_field_filter( array(
38
 *			'prefixes' => array( '@' ),
39
 *			'wpcom_id_field' => 'author_id',
40
 *			'must_query_fields' => array( 'author.engram', 'author_login.engram' ),
41
 *			'boost_query_fields' => array( 'author^2', 'author_login^2', 'title.default.engram' ),
42
 *		) );
43
 *
44
 *		//remainder of query
45
 *		$match_content_fields = $parser->merge_ml_fields(
46
 *			array(
47
 *				'all_content' => 0.1,
48
 *			),
49
 *			array(
50
 *				'all_content.default.engram^0.1',
51
 *			)
52
 *		);
53
 *		$boost_content_fields = $parser->merge_ml_fields(
54
 *			array(
55
 *				'title' => 2,
56
 *				'description' => 1,
57
 *				'tags' => 1,
58
 *			),
59
 *			array(
60
 *				'author_login^2',
61
 *				'author^2',
62
 *			)
63
 *		);
64
 *
65
 *		$parser->phrase_filter( array(
66
 *			'must_query_fields' => $match_content_fields,
67
 *			'boost_query_fields' => $boost_content_fields,
68
 *		) );
69
 *		$parser->remaining_query( array(
70
 *			'must_query_fields' => $match_content_fields,
71
 *			'boost_query_fields' => $boost_content_fields,
72
 *		) );
73
 *
74
 *		//Boost on phrases
75
 *		$parser->remaining_query( array(
76
 *			'boost_query_fields' => $boost_content_fields,
77
 *			'boost_query_type'   => 'phrase',
78
 *		) );
79
 *
80
 *		//boosting
81
 *		$parser->add_max_boost_to_functions( 20 );
82
 *		$parser->add_function( 'field_value_factor', array(
83
 *			'follower_count' => array(
84
 *				'modifier' => 'sqrt',
85
 *				'factor' => 1,
86
 *				'missing' => 0,
87
 *			) ) );
88
 *
89
 *		//Filtering
90
 *		$parser->add_filter( array(
91
 *			'exists' => array( 'field' => 'langs.' . $lang )
92
 *		) );
93
 *
94
 *		//run the query
95
 *		$es_query_args = array(
96
 *			'name' => 'feeds',
97
 *			'blog_id' => false,
98
 *			'security_strategy' => 'a8c',
99
 *			'type' => 'feed,blog',
100
 *			'fields' => array( 'blog_id', 'feed_id' ),
101
 *			'query' => $parser->build_query(),
102
 *			'filter' => $parser->build_filter(),
103
 *			'size' => $size,
104
 *			'from' => $from
105
 *		);
106
 *		$es_results = es_api_search_index( $es_query_args, 'api-feed-find' );
107
 *
108
 */
109
110
jetpack_require_lib( 'jetpack-wpes-query-builder' );
111
112
class Jetpack_WPES_Search_Query_Parser extends Jetpack_WPES_Query_Builder {
113
114
	// The user's query exactly as it was handed to the constructor; never modified afterwards in this class.
	public $orig_query = '';
0 ignored issues
show
Coding Style introduced by
The visibility should be declared for property $orig_query.

The PSR-2 coding standard requires that all properties in a class have their visibility explicitly declared. If you declare a property using

class A {
    var $property;
}

the property is implicitly public.

To learn more about the PSR-2, please see the PHP-FIG site on the PSR-2.

Loading history...
115
	// Working copy of the query: the filter methods strip the parts they consume (mentions, tags, phrases) from it.
	public $current_query = '';
0 ignored issues
show
Coding Style introduced by
The visibility should be declared for property $current_query.

The PSR-2 coding standard requires that all properties in a class have their visibility explicitly declared. If you declare a property using

class A {
    var $property;
}

the property is implicitly public.

To learn more about the PSR-2, please see the PHP-FIG site on the PSR-2.

Loading history...
116
	// Normalized language codes, as returned by norm_langs() in the constructor.
	public $langs;
0 ignored issues
show
Coding Style introduced by
The visibility should be declared for property $langs.

The PSR-2 coding standard requires that all properties in a class have their visibility explicitly declared. If you declare a property using

class A {
    var $property;
}

the property is implicitly public.

To learn more about the PSR-2, please see the PHP-FIG site on the PSR-2.

Loading history...
117
	// Language codes norm_langs() accepts as-is; any other code is mapped to 'default'.
	public $avail_langs = array( 'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja', 'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' );
0 ignored issues
show
Coding Style introduced by
The visibility should be declared for property $avail_langs.

The PSR-2 coding standard requires that all properties in a class have their visibility explicitly declared. If you declare a property using

class A {
    var $property;
}

the property is implicitly public.

To learn more about the PSR-2, please see the PHP-FIG site on the PSR-2.

Loading history...
118
119
	/*
	 * Remember the raw user query, start the working query as a copy of it,
	 * and normalize the requested languages.
	 *
	 *  $user_query: the query text to parse
	 *  $langs: list of language codes, passed through norm_langs()
	 */
	function __construct( $user_query, $langs ) {
		$this->langs         = $this->norm_langs( $langs );
		$this->orig_query    = $user_query;
		$this->current_query = $user_query;
	}
124
125
	// NOTE(review): no method visible in this file reads or writes this property — confirm whether it is still needed.
	public $extracted_phrases = array();
0 ignored issues
show
Coding Style introduced by
The visibility should be declared for property $extracted_phrases.

The PSR-2 coding standard requires that all properties in a class have their visibility explicitly declared. If you declare a property using

class A {
    var $property;
}

the property is implicitly global.

To learn more about the PSR-2, please see the PHP-FIG site on the PSR-2.

Loading history...
126
127
	///////////////////////////////////////////////////////
128
	// Methods for Building arrays of multilingual fields
129
130
	/*
	 * Normalize language codes
	 *
	 *  Each code is reduced to its first token when split on '-' or '_'.
	 *  Tokens listed in $this->avail_langs are kept; every other code
	 *  collapses into the single entry 'default'.
	 *
	 *  returns the distinct resulting codes, in first-seen order
	 */
	function norm_langs( $langs ) {
		$seen = array();
		foreach ( $langs as $code ) {
			$base = strtok( $code, '-_' );
			$key  = in_array( $base, $this->avail_langs ) ? $base : 'default';
			//keyed by code so duplicates collapse
			$seen[ $key ] = true;
		}
		return array_keys( $seen );
	}
144
145
	/*
	 * Take a list of field prefixes and expand them for multi-lingual
	 * with the provided boostings.
	 *
	 *  $fields2boosts: map of field prefix => boost; each entry yields one
	 *    "prefix.lang^boost" string per language in $this->langs
	 *  $additional_fields: field strings appended unchanged after the
	 *    expanded ones
	 *
	 *  returns the combined list of field strings
	 */
	function merge_ml_fields( $fields2boosts, $additional_fields ) {
		$expanded = array();

		foreach ( $fields2boosts as $prefix => $boost ) {
			foreach ( $this->langs as $lang ) {
				$expanded[] = "{$prefix}.{$lang}^{$boost}";
			}
		}

		foreach ( $additional_fields as $extra ) {
			$expanded[] = $extra;
		}

		return $expanded;
	}
161
162
	////////////////////////////////////
163
	// Extract Fields for Filtering on
164
165
	/*
	 * Extract any @mentions from the user query
	 *  use them as a filter if we can find a wp.com id
	 *  otherwise use them as a must query on must_query_fields
	 *
	 *  Every mention, whether or not it matched a user, is also added as a
	 *  should query on boost_query_fields when that list is not empty.
	 *
	 *  args:
	 *    wpcom_id_field: wp.com id field
	 *    must_query_fields: array of fields to search for matching results (optional)
	 *    boost_query_fields: array of fields to search in for boosting results (optional)
	 *    prefixes: array of prefixes that the user can use to indicate an author
	 *
	 *  returns true/false of whether any were found
	 *
	 * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
	 */
	function author_field_filter( $args ) {
		$defaults = array(
			'wpcom_id_field' => 'author_id',
			'must_query_fields' => null,
			'boost_query_fields' => null,
			'prefixes' => array( '@' ),
		);
		$args = wp_parse_args( $args, $defaults );

		//collect the mentions for every prefix before touching the query
		$names = array();
		foreach ( $args['prefixes'] as $p ) {
			$found = $this->get_fields( $p );
			if ( $found ) {
				foreach ( $found as $f ) {
					$names[] = $f;
				}
			}
		}

		if ( empty( $names ) ) {
			return false;
		}

		foreach ( $args['prefixes'] as $p ) {
			$this->remove_fields( $p );
		}

		$user_ids = array();

		//loop through the matches and separate into filters and queries
		foreach ( $names as $n ) {
			//check for exact match on login
			//NOTE(review): the lookup runs before the quotes are stripped, so a quoted
			//mention such as @"Greg Brown" is looked up with its quotes - confirm intended
			$userdata  = get_user_by( 'login', strtolower( $n ) );
			$filtering = false;
			if ( $userdata ) {
				//keyed by ID so the same user is only filtered on once
				$user_ids[ $userdata->ID ] = true;
				$filtering = true;
			}

			//a quoted mention is searched as a phrase, without its quotes
			$is_phrase = ( false !== strpos( $n, '"' ) );
			if ( $is_phrase ) {
				$n = str_replace( '"', '', $n );
			}
			$phrase_type = $is_phrase ? array( 'type' => 'phrase' ) : array();

			if ( ! empty( $args['must_query_fields'] ) && ! $filtering ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query' => $n,
					) + $phrase_type,
				) );
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $n,
					) + $phrase_type,
				), 'should' );
			}
		}

		if ( ! empty( $user_ids ) ) {
			$user_ids = array_keys( $user_ids );
			$this->add_filter( array( 'terms' => array( $args['wpcom_id_field'] => $user_ids ) ) );
		}

		return true;
	}
266
267
	/*
	 * Extract any prefix followed by text use them as a must clause,
	 *   and optionally as a boost to the should query
	 *   This can be used for hashtags. eg #News, or #"current events",
	 *   but also works for any arbitrary field. eg from:Greg
	 *
	 *  A value that contains double quotes is searched as a phrase, with
	 *  the quotes removed.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the tag (optional)
	 *    boost_query_fields: array of fields to boost search on (optional)
	 *    prefixes: array of prefixes that the user can use to indicate a tag
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	function text_field_filter( $args ) {
		$args = wp_parse_args( $args, array(
			'must_query_fields' => array( 'tag.name' ),
			'boost_query_fields' => array( 'tag.name' ),
			'prefixes' => array( '#' ),
		) );

		//gather the values for every prefix before modifying the query
		$tags = array();
		foreach ( $args['prefixes'] as $prefix ) {
			$found = $this->get_fields( $prefix );
			if ( ! $found ) {
				continue;
			}
			foreach ( $found as $value ) {
				$tags[] = $value;
			}
		}

		if ( empty( $tags ) ) {
			return false;
		}

		foreach ( $args['prefixes'] as $prefix ) {
			$this->remove_fields( $prefix );
		}

		$has_must  = ! empty( $args['must_query_fields'] );
		$has_boost = ! empty( $args['boost_query_fields'] );

		foreach ( $tags as $tag ) {
			$is_phrase = (bool) preg_match( '/"/', $tag );
			if ( $is_phrase ) {
				$tag = preg_replace( '/"/', '', $tag );
			}

			if ( $has_must ) {
				$must = array(
					'fields' => $args['must_query_fields'],
					'query' => $tag,
				);
				if ( $is_phrase ) {
					$must['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $must ) );
			}

			if ( $has_boost ) {
				$boost = array(
					'fields' => $args['boost_query_fields'],
					'query' => $tag,
				);
				if ( $is_phrase ) {
					$boost['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $boost ), 'should' );
			}
		}

		return true;
	}
349
350
	/*
	 * Extract anything surrounded by quotes or if there is an opening quote
	 *   that is not complete, and add them to the query as a phrase query.
	 *   Quotes can be either '' or ""
	 *
	 *  Every extracted phrase is removed from the remaining query text.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the phrases
	 *    boost_query_fields: array of fields to boost the phrases on (optional)
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	function phrase_filter( $args ) {
		$defaults = array(
			'must_query_fields' => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
		);
		$args = wp_parse_args( $args, $defaults );

		$phrases = array();

		//completed phrases: double quoted first, then single quoted
		foreach ( array( '/"([^"]+)"/', "/'([^']+)'/" ) as $pattern ) {
			if ( preg_match_all( $pattern, $this->current_query, $matches ) ) {
				foreach ( $matches[1] as $match ) {
					$phrases[] = $match;
				}
				$this->current_query = preg_replace( $pattern, '', $this->current_query );
			}
		}

		//look for a final, uncompleted phrase
		//each one found is kept: the text is removed from the query below, so
		//dropping it here (for example the phrase "0") would lose it entirely
		foreach ( array( '/"([^"]+)$/', "/'([^']+)$/" ) as $pattern ) {
			if ( preg_match( $pattern, $this->current_query, $matches ) ) {
				$phrases[] = $matches[1];
				$this->current_query = preg_replace( $pattern, '', $this->current_query );
			}
		}

		if ( empty( $phrases ) ) {
			return false;
		}

		foreach ( $phrases as $p ) {
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['must_query_fields'],
					'query' => $p,
					'type' => 'phrase',
				) ) );

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $p,
						'operator' => 'and',
				) ), 'should' );
			}
		}

		return true;
	}
420
421
	/*
	 * Query fields based on the remaining parts of the query
	 *   This could be the final AND part of the query terms to match, or it
	 *   could be boosting certain elements of the query
	 *
	 *  Does nothing when the remaining query is empty or only whitespace.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator for the boost query (default 'and')
	 *    boost_query_type: multi_match type for the boost query (default 'best_fields')
	 *
	 */
	function remaining_query( $args ) {
		$defaults = array(
			'must_query_fields' => null,
			'boost_query_fields' => null,
			'boost_operator' => 'and',
			'boost_query_type' => 'best_fields',
		);
		$args = wp_parse_args( $args, $defaults );

		$query = $this->current_query;

		//empty() alone would also discard the legitimate one character query "0"
		if ( ( '0' !== $query && empty( $query ) ) || ctype_space( $query ) ) {
			return;
		}

		if ( ! empty( $args['must_query_fields'] ) ) {
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['must_query_fields'],
					'query' => $query,
					'operator' => 'and',
			) ) );
		}

		if ( ! empty( $args['boost_query_fields'] ) ) {
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['boost_query_fields'],
					'query' => $query,
					'operator' => $args['boost_operator'],
					'type' => $args['boost_query_type'],
			) ), 'should' );
		}
	}
463
464
	/*
	 * Query fields using a prefix query (alphabetical expansions on the index).
	 *   This is not recommended. Slower performance and worse relevancy.
	 *
	 *  (UNTESTED! Copied from old prefix expansion code)
	 *
	 *  The last space-delimited word of the remaining query is treated as a
	 *  possibly unfinished word (the prefix word); everything before it is the
	 *  prefix remainder. When the query ends in a space there is no prefix
	 *  word and plain multi_match queries are used instead.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator for the exact word boost query (default 'and')
	 *    boost_query_type: multi_match type for the exact word boost query (default 'best_fields')
	 *
	 */
	function remaining_prefix_query( $args ) {
		$defaults = array(
			'must_query_fields' => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
			'boost_operator' => 'and',
			'boost_query_type' => 'best_fields',
		);
		$args = wp_parse_args( $args, $defaults );

		$query = $this->current_query;

		//empty() alone would also discard the legitimate one character query "0"
		if ( ( '0' !== $query && empty( $query ) ) || ctype_space( $query ) ) {
			return;
		}

		//////////////////////////////////
		// Example cases to think about:
		// "elasticse"
		// "elasticsearch"
		// "elasticsearch "
		// "elasticsearch lucen"
		// "elasticsearch lucene"
		// "the future"  - note the stopword which will match nothing!
		// "F1" - an exact match that also has tons of expansions
		// "こんにちは" ja "hello"
		// "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
		//   - this could still be better I bet. Maybe we need to analyze with ES first?
		//

		/////////////////////////////
		//extract pieces of query
		// eg: "PREFIXREMAINDER PREFIXWORD"
		//     "elasticsearch lucen"

		$prefix_word = false;
		if ( preg_match( '/([^ ]+)$/', $query, $matches ) ) {
			$prefix_word = $matches[1];
		}

		$prefix_remainder = preg_replace( '/([^ ]+)$/', '', $query );
		if ( null === $prefix_remainder || '' === $prefix_remainder || ctype_space( $prefix_remainder ) ) {
			$prefix_remainder = false;
		}

		//strict comparison: the word "0" is a real prefix word
		if ( false === $prefix_word ) {
			//Space at the end of the query, so skip using a prefix query
			if ( ! empty( $args['must_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query' => $query,
						'operator' => 'and',
					) ) );
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $query,
						'operator' => $args['boost_operator'],
						'type' => $args['boost_query_type'],
					) ), 'should' );
			}

			return;
		}

		//must match the prefix word and the prefix remainder
		if ( ! empty( $args['must_query_fields'] ) ) {
			//NOTE(review): the copied code read $this->all_fields, $this->phrase_fields and
			//$full_text, none of which exist in this class. must_query_fields and the
			//remaining query are used in their place - confirm that separate exact and
			//phrase field lists are not required.
			$fields = $args['must_query_fields'];

			//need to do an OR across a few fields to handle all cases
			$must_q = array( 'bool' => array( 'should' => array(), 'minimum_should_match' => 1 ) );

			//treat all words as an exact search (boosts complete word like "news"
			//from prefixes of "newspaper")
			$must_q['bool']['should'][] = array( 'multi_match' => array(
				'fields' => $fields,
				'query' => $query,
				'operator' => 'and',
				'type' => 'cross_fields',
			) );

			//always optimistically try and match the full text as a phrase
			//prefix "the futu" should try to match "the future"
			//otherwise the first stopword kinda breaks
			//This also works as the prefix match for a single word "elasticsea"
			$must_q['bool']['should'][] = array( 'multi_match' => array(
				'fields' => $fields,
				'query' => $query,
				'operator' => 'and',
				'type' => 'phrase_prefix',
				'max_expansions' => 100,
			) );

			if ( false !== $prefix_remainder ) {
				//Multiple words found, so treat each word on its own and not just as
				//a part of a phrase
				//"elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
				//(this clause has to land in $must_q, the query that is actually added)
				$must_q['bool']['should'][] = array( 'bool' => array(
					'must' => array(
						array( 'multi_match' => array(
							'fields' => $fields,
							'query' => $prefix_word,
							'operator' => 'and',
							'type' => 'phrase_prefix',
							'max_expansions' => 100,
						) ),
						array( 'multi_match' => array(
							'fields' => $fields,
							'query' => $prefix_remainder,
							'operator' => 'and',
							'type' => 'cross_fields',
						) ),
					)
				) );
			}

			$this->add_query( $must_q );
		}

		//Now add any boosting of the query
		if ( ! empty( $args['boost_query_fields'] ) ) {
			//treat all words as an exact search (boosts complete word like "news"
			//from prefixes of "newspaper")
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['boost_query_fields'],
					'query' => $query,
					'operator' => $args['boost_operator'],
					'type' => $args['boost_query_type'],
				) ), 'should' );

			//optimistically boost the full phrase prefix match
			//added as a should clause like every other boost in this class
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['boost_query_fields'],
					'query' => $query,
					'operator' => 'and',
					'type' => 'phrase_prefix',
					'max_expansions' => 100,
				) ), 'should' );
		}
	}
613
614
	/*
	 * Boost results based on the lang probability overlaps
	 *
	 *  Registers one field_value_factor function per entry, using the
	 *  entry's value as the factor.
	 *
	 *  args:
	 *    langs2prob: list of languages to search in with associated boosts
	 *
	 *  NOTE(review): the language key of each entry is never used and no field
	 *  name is passed, unlike the field_value_factor example in the class
	 *  header which keys the settings by field - confirm this is intended.
	 */
	function boost_lang_probs( $langs2prob ) {
		foreach ( $langs2prob as $prob ) {
			$settings = array(
				'modifier' => 'none',
				'factor' => $prob,
				'missing' => 0.01, //1% chance doc did not have right lang detected
			);
			$this->add_function( 'field_value_factor', $settings );
		}
	}
629
630
	////////////////////////////////////
631
	// Helper Methods
632
633
	//Get the text after some prefix. eg @gibrown, or @"Greg Brown"
	// returns the list of values found (quoted values keep their quotes),
	// or false when the prefix does not occur in the remaining query
	protected function get_fields( $field_prefix ) {
		$regex = $this->build_field_regex( $field_prefix );
		if ( preg_match_all( $regex, $this->current_query, $match ) ) {
			return $match[1];
		}
		return false;
	}

	//Remove the prefix and text from the query
	protected function remove_fields( $field_name ) {
		$regex    = $this->build_field_regex( $field_name );
		$stripped = preg_replace( $regex, '', $this->current_query );

		//preg_replace() gives null on error (eg invalid UTF-8); keep the query rather than wiping it
		if ( null !== $stripped ) {
			$this->current_query = $stripped;
		}
	}

	//Build the shared pattern for "prefix followed by a quoted string or a run of non separators"
	// - the prefix is escaped so prefixes such as '+' or '$' are matched literally
	//   NOTE(review): a caller passing a regex fragment as a prefix would now be matched literally
	// - the u modifier makes \p{Z} work on characters instead of bytes; without it the
	//   byte 0xA0 inside a multi byte character (eg "à") is taken for a separator
	private function build_field_regex( $prefix ) {
		return '/' . preg_quote( $prefix, '/' ) . '(("[^"]+")|([^\\p{Z}]+))/u';
	}
647
648
	//Best effort string truncation that splits on word breaks
	//  $string: text to shorten
	//  $limit: maximum display width (as measured by mb_strwidth) of the result
	//  $break: single character to break on
	// returns $string unchanged when it already fits; otherwise the text before
	// the last $break found at or before the cut, or a mid-word cut when there
	// is no such break
	function truncate_string( $string, $limit, $break=" " ) {
		if ( mb_strwidth( $string ) <= $limit )
			return $string;

		// longest leading part that fits into $limit columns
		$fitted = mb_strimwidth( $string, 0, $limit );

		// walk backwards from the cut to find first break; positions are counted
		// in characters throughout, so double width characters cannot shift the
		// cut away from the break that was found
		$breakpoint = mb_strlen( $fitted );
		while ( $breakpoint > 0 ) {
			if ( $break === mb_substr( $string, $breakpoint, 1 ) ) {
				return mb_substr( $string, 0, $breakpoint );
			}
			$breakpoint--;
		}

		// if we weren't able to find a break, need to chop mid-word
		return $fitted;
	}
670
671
}
672