Completed
Push — renovate/history-4.x ( 8706da...6c1ea7 )
by
unknown
17:57 queued 11:18
created

jetpack-wpes-query-parser.php (5 issues)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
/**
4
 * Parse a pure text query into WordPress Elasticsearch query. This builds on
5
 * the Jetpack_WPES_Query_Builder() to provide search query parsing.
6
 *
7
 * The key part of this parser is taking a user's query string typed into a box
8
 * and converting it into an ES search query.
9
 *
10
 * This varies by application, but roughly it means extracting some parts of the query
11
 * (authors, tags, and phrases) that are treated as a filter. Then taking the
12
 * remaining words and building the correct query (possibly with prefix searching
13
 * if we are doing search as you type)
14
 *
15
 * This class only supports ES 2.x+
16
 *
17
 * This parser builds queries of the form:
18
 *   bool:
19
 *     must:
20
 *       AND match of a single field (ideally an edgengram field)
21
 *     filter:
22
 *       filter clauses from context (eg @gibrown, #news, etc)
23
 *     should:
24
 *       boosting of results by various fields
25
 *
26
 * Features supported:
27
 *  - search as you type
28
 *  - phrases
29
 *  - supports querying across multiple languages at once
30
 *
31
 * Example usage (from Search on Reader Manage):
32
 *
33
 *		require_lib( 'jetpack-wpes-query-builder/jetpack-wpes-search-query-parser' );
34
 *		$parser = new Jetpack_WPES_Search_Query_Parser( $args['q'], array( $lang ) );
35
 *
36
 *		//author
37
 *		$parser->author_field_filter( array(
38
 *			'prefixes' => array( '@' ),
39
 *			'wpcom_id_field' => 'author_id',
40
 *			'must_query_fields' => array( 'author.engram', 'author_login.engram' ),
41
 *			'boost_query_fields' => array( 'author^2', 'author_login^2', 'title.default.engram' ),
42
 *		) );
43
 *
44
 *		//remainder of query
45
 *		$match_content_fields = $parser->merge_ml_fields(
46
 *			array(
47
 *				'all_content' => 0.1,
48
 *			),
49
 *			array(
50
 *				'all_content.default.engram^0.1',
51
 *			)
52
 *		);
53
 *		$boost_content_fields = $parser->merge_ml_fields(
54
 *			array(
55
 *				'title' => 2,
56
 *				'description' => 1,
57
 *				'tags' => 1,
58
 *			),
59
 *			array(
60
 *				'author_login^2',
61
 *				'author^2',
62
 *			)
63
 *		);
64
 *
65
 *		$parser->phrase_filter( array(
66
 *			'must_query_fields' => $match_content_fields,
67
 *			'boost_query_fields' => $boost_content_fields,
68
 *		) );
69
 *		$parser->remaining_query( array(
70
 *			'must_query_fields' => $match_content_fields,
71
 *			'boost_query_fields' => $boost_content_fields,
72
 *		) );
73
 *
74
 *		//Boost on phrases
75
 *		$parser->remaining_query( array(
76
 *			'boost_query_fields' => $boost_content_fields,
77
 *			'boost_query_type'   => 'phrase',
78
 *		) );
79
 *
80
 *		//boosting
81
 *		$parser->add_max_boost_to_functions( 20 );
82
 *		$parser->add_function( 'field_value_factor', array(
83
 *			'follower_count' => array(
84
 *				'modifier' => 'sqrt',
85
 *				'factor' => 1,
86
 *				'missing' => 0,
87
 *			) ) );
88
 *
89
 *		//Filtering
90
 *		$parser->add_filter( array(
91
 *			'exists' => array( 'field' => 'langs.' . $lang )
92
 *		) );
93
 *
94
 *		//run the query
95
 *		$es_query_args = array(
96
 *			'name' => 'feeds',
97
 *			'blog_id' => false,
98
 *			'security_strategy' => 'a8c',
99
 *			'type' => 'feed,blog',
100
 *			'fields' => array( 'blog_id', 'feed_id' ),
101
 *			'query' => $parser->build_query(),
102
 *			'filter' => $parser->build_filter(),
103
 *			'size' => $size,
104
 *			'from' => $from
105
 *		);
106
 *		$es_results = es_api_search_index( $es_query_args, 'api-feed-find' );
107
 *
108
 */
109
110
jetpack_require_lib( 'jetpack-wpes-query-builder' );
111
112
class Jetpack_WPES_Search_Query_Parser extends Jetpack_WPES_Query_Builder {
113
114
	// The query string exactly as it was handed to the constructor.
	protected $orig_query = '';

	// Working copy of the query; it starts out equal to $orig_query and is
	// rewritten as pieces are extracted from it (see remove_fields() and
	// phrase_filter()) or replaced via set_current_query().
	protected $current_query = '';

	// Language codes produced by norm_langs() from the constructor argument.
	protected $langs;

	// Language codes that norm_langs() keeps as-is; anything else maps to 'default'.
	protected $avail_langs = array(
		'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu',
		'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja',
		'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh',
	);

	// Declared but not read or written by any method visible in this class.
	protected $extracted_phrases = array();

	/*
	 * Set up the parser for one user query.
	 *
	 *  $user_query: the raw text typed by the user
	 *  $langs: list of language codes to search in (normalized by norm_langs())
	 */
	public function __construct( $user_query, $langs ) {
		$this->orig_query    = $user_query;
		$this->current_query = $user_query;
		$this->langs         = $this->norm_langs( $langs );
	}
126
127
	/*
	 * Get the working query, i.e. whatever is left of the user's query after
	 * any extraction done so far.
	 */
	public function get_current_query() {
		return $this->current_query;
	}
130
131
	/*
	 * Replace the working query. The original query passed to the
	 * constructor is left untouched.
	 */
	public function set_current_query( $q ) {
		$this->current_query = $q;
	}
134
135
	///////////////////////////////////////////////////////
136
	// Methods for Building arrays of multilingual fields
137
138
	/*
	 * Normalize language codes
	 *
	 *  Each code is cut at its first '-' or '_' (eg 'pt-br' => 'pt'). Codes
	 *  found in $avail_langs are kept; every other code collapses into the
	 *  single entry 'default'. Duplicates are removed.
	 *
	 *  langs: a list of language codes, or a single code as a string
	 *
	 *  returns the list of normalized codes (empty if no codes were given)
	 */
	public function norm_langs( $langs ) {
		$lst = array();
		// Cast so that a single code passed as a string is handled like a
		// one-element list instead of making foreach warn and return nothing.
		foreach ( (array) $langs as $l ) {
			$l = strtok( $l, '-_' );
			// Strict comparison: strtok() returns false for an empty code and
			// that must never loosely match a language string.
			if ( in_array( $l, $this->avail_langs, true ) ) {
				$lst[ $l ] = true;
			} else {
				$lst['default'] = true;
			}
		}
		// The codes were collected as keys to de-duplicate them.
		return array_keys( $lst );
	}
153
154
	/*
	 * Expand field prefixes into one boosted field per configured language
	 * and append any extra fields unchanged.
	 *
	 *  fields2boosts: map of field prefix => boost; each entry becomes
	 *    "<prefix>.<lang>^<boost>" for every language in $this->langs
	 *  additional_fields: list of complete field strings added after the
	 *    expanded ones, in the order given
	 *
	 *  returns the combined list of field strings
	 */
	public function merge_ml_fields( $fields2boosts, $additional_fields ) {
		$merged = array();

		foreach ( $fields2boosts as $prefix => $boost ) {
			foreach ( $this->langs as $lang ) {
				$merged[] = sprintf( '%s.%s^%s', $prefix, $lang, $boost );
			}
		}

		foreach ( $additional_fields as $extra ) {
			$merged[] = $extra;
		}

		return $merged;
	}
170
171
	////////////////////////////////////
172
	// Extract Fields for Filtering on
173
174
	/*
	 * Extract any @mentions from the user query
	 *  use them as a filter if we can find a wp.com id
	 *  otherwise use them as a must query against must_query_fields.
	 *  Every mention is also added as a should query against
	 *  boost_query_fields. Mentions are removed from the current query.
	 *
	 *  args:
	 *    wpcom_id_field: wp.com id field
	 *    must_query_fields: array of fields to search for matching results (optional)
	 *    boost_query_fields: array of fields to search in for boosting results (optional)
	 *    prefixes: array of prefixes that the user can use to indicate an author
	 *
	 *  returns true/false of whether any were found
	 *
	 * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
	 */
	public function author_field_filter( $args ) {
		$args = wp_parse_args(
			$args,
			array(
				'wpcom_id_field' => 'author_id',
				'must_query_fields' => null,
				'boost_query_fields' => null,
				'prefixes' => array( '@' ),
			)
		);

		//collect every token that follows one of the prefixes
		$names = array();
		foreach ( $args['prefixes'] as $prefix ) {
			$found = $this->get_fields( $prefix );
			if ( ! $found ) {
				continue;
			}
			foreach ( $found as $name ) {
				$names[] = $name;
			}
		}

		if ( ! $names ) {
			return false;
		}

		//the mentions are handled here, so take them out of the remaining query
		foreach ( $args['prefixes'] as $prefix ) {
			$this->remove_fields( $prefix );
		}

		$has_must_fields  = ! empty( $args['must_query_fields'] );
		$has_boost_fields = ! empty( $args['boost_query_fields'] );
		$user_ids         = array();

		//loop through the matches and separate into filters and queries
		foreach ( $names as $name ) {
			//check for exact match on login (done on the token as typed,
			//before any surrounding quotes are stripped)
			$userdata  = get_user_by( 'login', strtolower( $name ) );
			$filtering = (bool) $userdata;
			if ( $filtering ) {
				//keyed by id so the same user is only filtered on once
				$user_ids[ $userdata->ID ] = true;
			}

			//a quoted mention is searched as a phrase, without its quotes
			$is_phrase = (bool) preg_match( '/"/', $name );
			if ( $is_phrase ) {
				$name = preg_replace( '/"/', '', $name );
			}

			//only require a text match when we could not filter on a user id
			if ( $has_must_fields && ! $filtering ) {
				$must = array(
					'fields' => $args['must_query_fields'],
					'query' => $name,
				);
				if ( $is_phrase ) {
					$must['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $must ) );
			}

			if ( $has_boost_fields ) {
				$boost = array(
					'fields' => $args['boost_query_fields'],
					'query' => $name,
				);
				if ( $is_phrase ) {
					$boost['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $boost ), 'should' );
			}
		}

		if ( $user_ids ) {
			$this->add_filter( array(
				'terms' => array( $args['wpcom_id_field'] => array_keys( $user_ids ) ),
			) );
		}

		return true;
	}
277
278
	/*
	 * Extract any prefix followed by text use them as a must clause,
	 *   and optionally as a boost to the should query
	 *   This can be used for hashtags. eg #News, or #"current events",
	 *   but also works for any arbitrary field. eg from:Greg
	 *   Extracted tokens are removed from the current query.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the tag (optional)
	 *    boost_query_fields: array of fields to boost search on (optional)
	 *    prefixes: array of prefixes that the user can use to indicate a tag
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	public function text_field_filter( $args ) {
		$args = wp_parse_args(
			$args,
			array(
				'must_query_fields' => array( 'tag.name' ),
				'boost_query_fields' => array( 'tag.name' ),
				'prefixes' => array( '#' ),
			)
		);

		//collect every token that follows one of the prefixes
		$tags = array();
		foreach ( $args['prefixes'] as $prefix ) {
			$found = $this->get_fields( $prefix );
			if ( ! $found ) {
				continue;
			}
			foreach ( $found as $tag ) {
				$tags[] = $tag;
			}
		}

		if ( ! $tags ) {
			return false;
		}

		//the tags are handled here, so take them out of the remaining query
		foreach ( $args['prefixes'] as $prefix ) {
			$this->remove_fields( $prefix );
		}

		$has_must_fields  = ! empty( $args['must_query_fields'] );
		$has_boost_fields = ! empty( $args['boost_query_fields'] );

		foreach ( $tags as $tag ) {
			//a quoted tag is searched as a phrase, without its quotes
			$is_phrase = (bool) preg_match( '/"/', $tag );
			if ( $is_phrase ) {
				$tag = preg_replace( '/"/', '', $tag );
			}

			if ( $has_must_fields ) {
				$must = array(
					'fields' => $args['must_query_fields'],
					'query' => $tag,
				);
				if ( $is_phrase ) {
					$must['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $must ) );
			}

			if ( $has_boost_fields ) {
				$boost = array(
					'fields' => $args['boost_query_fields'],
					'query' => $tag,
				);
				if ( $is_phrase ) {
					$boost['type'] = 'phrase';
				}
				$this->add_query( array( 'multi_match' => $boost ), 'should' );
			}
		}

		return true;
	}
362
363
	/*
	 * Extract anything surrounded by quotes or if there is an opening quote
	 *   that is not complete, and add them to the query as a phrase query.
	 *   Quotes can be either '' or ""
	 *   Extracted phrases are removed from the current query.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the phrases
	 *    boost_query_fields: array of fields to boost the phrases on (optional)
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	public function phrase_filter( $args ) {
		$defaults = array(
			'must_query_fields' => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
		);
		$args = wp_parse_args( $args, $defaults );

		$phrases = array();

		//completed phrases: double quoted first, then single quoted
		foreach ( array( '/"([^"]+)"/', "/'([^']+)'/" ) as $regex ) {
			if ( preg_match_all( $regex, $this->current_query, $matches ) ) {
				foreach ( $matches[1] as $match ) {
					$phrases[] = $match;
				}
				$this->current_query = preg_replace( $regex, '', $this->current_query );
			}
		}

		//look for a final, uncompleted phrase
		//Every match is kept: each one is cut out of the current query, so
		//dropping it (because a second pattern also matched, or because the
		//text is "0") would lose that part of the user's search entirely.
		foreach ( array( '/"([^"]+)$/', "/(?:'\B|\B')([^']+)$/" ) as $regex ) {
			if ( preg_match( $regex, $this->current_query, $matches ) ) {
				$phrases[] = $matches[1];
				$this->current_query = preg_replace( $regex, '', $this->current_query );
			}
		}

		if ( empty( $phrases ) ) {
			return false;
		}

		foreach ( $phrases as $p ) {
			//the phrase itself must match
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['must_query_fields'],
					'query' => $p,
					'type' => 'phrase',
				) ) );

			//boosting only needs all of the words, not the exact phrase
			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $p,
						'operator' => 'and',
				) ), 'should' );
			}
		}

		return true;
	}
435
436
	/*
	 * Query fields based on the remaining parts of the query
	 *   This could be the final AND part of the query terms to match, or it
	 *   could be boosting certain elements of the query
	 *   Does nothing when the remaining query is empty or only whitespace.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator for the boost query (default 'and')
	 *    boost_query_type: multi_match type for the boost query (default 'best_fields')
	 *
	 */
	public function remaining_query( $args ) {
		$defaults = array(
			'must_query_fields' => null,
			'boost_query_fields' => null,
			'boost_operator' => 'and',
			'boost_query_type' => 'best_fields',
		);
		$args = wp_parse_args( $args, $defaults );

		//Compare against '' rather than using empty(): empty() is also true
		//for the string "0", which is a legitimate thing to search for.
		//The character list is the set ctype_space() recognizes.
		if ( '' === trim( (string) $this->current_query, " \t\n\r\x0B\x0C" ) ) {
			return;
		}

		if ( ! empty( $args['must_query_fields'] ) ) {
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['must_query_fields'],
					'query' => $this->current_query,
					'operator' => 'and',
			) ) );
		}

		if ( ! empty( $args['boost_query_fields'] ) ) {
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['boost_query_fields'],
					'query' => $this->current_query,
					'operator' => $args['boost_operator'],
					'type' => $args['boost_query_type'],
			) ), 'should' );
		}

	}
479
480
	/*
	 * Query fields using a prefix query (alphabetical expansions on the index).
	 *   This is not recommended. Slower performance and worse relevancy.
	 *
	 *  (UNTESTED! Copied from old prefix expansion code)
	 *
	 *  Does nothing when the remaining query is empty or only whitespace.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator for the boost query (default 'and')
	 *    boost_query_type: multi_match type for the boost query (default 'best_fields')
	 *
	 */
	public function remaining_prefix_query( $args ) {
		$defaults = array(
			'must_query_fields' => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
			'boost_operator' => 'and',
			'boost_query_type' => 'best_fields',
		);
		$args = wp_parse_args( $args, $defaults );

		//Compare against '' rather than using empty(): empty() is also true
		//for the string "0", which is a legitimate thing to search for.
		//The character list is the set ctype_space() recognizes.
		if ( '' === trim( (string) $this->current_query, " \t\n\r\x0B\x0C" ) ) {
			return;
		}

		//////////////////////////////////
		// Example cases to think about:
		// "elasticse"
		// "elasticsearch"
		// "elasticsearch "
		// "elasticsearch lucen"
		// "elasticsearch lucene"
		// "the future"  - note the stopword which will match nothing!
		// "F1" - an exact match that also has tons of expansions
		// "こんにちは" ja "hello"
		// "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
		//   - this could still be better I bet. Maybe we need to analyze with ES first?
		//

		/////////////////////////////
		//extract pieces of query
		// eg: "PREFIXREMAINDER PREFIXWORD"
		//     "elasticsearch lucen"

		//the last run of non-space characters is the word still being typed
		$prefix_word = false;
		if ( preg_match( '/([^ ]+)$/', $this->current_query, $matches ) ) {
			$prefix_word = $matches[1];
		}

		//everything before that word; false when there is nothing but whitespace
		$prefix_remainder = preg_replace( '/([^ ]+)$/', '', $this->current_query );
		if ( '' === $prefix_remainder || ctype_space( $prefix_remainder ) ) {
			$prefix_remainder = false;
		}

		if ( false === $prefix_word ) {
			//Space at the end of the query, so skip using a prefix query
			if ( ! empty( $args['must_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query' => $this->current_query,
						'operator' => 'and',
					) ) );
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $this->current_query,
						'operator' => $args['boost_operator'],
						'type' => $args['boost_query_type'],
					) ), 'should' );
			}
		} else {

			//must match the prefix word and the prefix remainder
			if ( ! empty( $args['must_query_fields'] ) ) {
				//NOTE(review): the copied code read $full_text, $this->all_fields
				//and $this->phrase_fields, none of which are defined by anything
				//visible in this class. The full remaining query and
				//must_query_fields are used instead - confirm whether the
				//phrase_prefix clauses need a separate field list.
				$full_text = $this->current_query;
				$fields = $args['must_query_fields'];

				//need to do an OR across a few fields to handle all cases
				$must_q = array( 'bool' => array( 'should' => array( ), 'minimum_should_match' => 1 ) );

				//treat all words as an exact search (boosts complete word like "news"
				//from prefixes of "newspaper")
				$must_q['bool']['should'][] = array( 'multi_match' => array(
					'fields' => $fields,
					'query' => $full_text,
					'operator' => 'and',
					'type' => 'cross_fields',
				) );

				//always optimistically try and match the full text as a phrase
				//prefix "the futu" should try to match "the future"
				//otherwise the first stopword kinda breaks
				//This also works as the prefix match for a single word "elasticsea"
				$must_q['bool']['should'][] = array( 'multi_match' => array(
					'fields' => $fields,
					'query' => $full_text,
					'operator' => 'and',
					'type' => 'phrase_prefix',
					'max_expansions' => 100,
				) );

				if ( false !== $prefix_remainder ) {
					//Multiple words found, so treat each word on its own and not just as
					//a part of a phrase
					//"elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
					//(appended to $must_q so it is part of the query that gets added)
					$must_q['bool']['should'][] = array( 'bool' => array(
						'must' => array(
							array( 'multi_match' => array(
								'fields' => $fields,
								'query' => $prefix_word,
								'operator' => 'and',
								'type' => 'phrase_prefix',
								'max_expansions' => 100,
							) ),
							array( 'multi_match' => array(
								'fields' => $fields,
								'query' => $prefix_remainder,
								'operator' => 'and',
								'type' => 'cross_fields',
							) ),
						)
					) );
				}

				$this->add_query( $must_q );
			}

			//Now add any boosting of the query
			if ( ! empty( $args['boost_query_fields'] ) ) {
				//treat all words as an exact search (boosts complete word like "news"
				//from prefixes of "newspaper")
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $this->current_query,
						'operator' => $args['boost_operator'],
						'type' => $args['boost_query_type'],
					) ), 'should' );

				//optimistically boost the full phrase prefix match
				//(a should clause: it is a boost, not a requirement)
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $this->current_query,
						'operator' => 'and',
						'type' => 'phrase_prefix',
						'max_expansions' => 100,
					) ), 'should' );
			}
		}
	}
632
633
	/*
	 * Boost results based on the lang probability overlaps
	 *   Adds one field_value_factor function per entry, using the entry's
	 *   value as the factor.
	 *
	 *  args:
	 *    langs2prob: list of languages to search in with associated boosts
	 *
	 *  NOTE(review): the language key of each entry is never used, so the
	 *  settings passed to add_function() do not name a field. The usage
	 *  example in the class header keys the settings by field name
	 *  ('follower_count' => array( ... )) - confirm which shape is intended.
	 */
	public function boost_lang_probs( $langs2prob ) {
		foreach ( $langs2prob as $prob ) {
			$settings = array(
				'modifier' => 'none',
				'factor' => $prob,
				'missing' => 0.01, //1% chance doc did not have right lang detected
			);
			$this->add_function( 'field_value_factor', $settings );
		}
	}
648
649
	////////////////////////////////////
650
	// Helper Methods
651
652
	//Build the pattern shared by get_fields() and remove_fields(): the prefix
	//followed by either "quoted text" or a run of non-separator characters.
	private function build_field_regex( $field_prefix ) {
		//The prefix is literal text typed by the user, not a pattern, so
		//characters such as + . ? $ or / must not be read as regex syntax.
		$regex = '/' . preg_quote( $field_prefix, '/' ) . '(("[^"]+")|([^\\p{Z}]+))/';

		//Without the u modifier \p{Z} is tested byte by byte, and byte 0xA0
		//(part of many multibyte UTF-8 characters) counts as a separator,
		//cutting tokens in the middle of a character. Only use it when both
		//strings are valid UTF-8, since it makes matching fail otherwise.
		if ( 1 === preg_match( '//u', $field_prefix . $this->current_query ) ) {
			$regex .= 'u';
		}

		return $regex;
	}

	//Get the text after some prefix. eg @gibrown, or @"Greg Brown"
	//Returns the list of matches (quotes included), or false if there are none
	protected function get_fields( $field_prefix ) {
		$regex = $this->build_field_regex( $field_prefix );
		if ( preg_match_all( $regex, $this->current_query, $match ) ) {
			return $match[1];
		}
		return false;
	}

	//Remove the prefix and text from the query
	protected function remove_fields( $field_name ) {
		$regex = $this->build_field_regex( $field_name );
		$stripped = preg_replace( $regex, '', $this->current_query );
		//preg_replace() returns null on error; keep the query rather than wipe it
		if ( null !== $stripped ) {
			$this->current_query = $stripped;
		}
	}
666
667
	//Best effort string truncation that splits on word breaks
	//  string: the text to shorten
	//  limit: maximum display width (as measured by mb_strwidth) of the result
	//  break: the single character to split on
	//Returns the string unchanged if it already fits, otherwise the part
	//before the last break found at or before $limit, or a hard cut at
	//$limit when there is no such break.
	protected function truncate_string( $string, $limit, $break=" " ) {
		if ( mb_strwidth( $string ) <= $limit ) {
			return $string;
		}

		// walk backwards from $limit to find first break
		// $limit is a width but the walk indexes characters: wide (eg CJK)
		// text can exceed the width limit while having fewer than $limit
		// characters, so never start past the last character.
		$breakpoint = min( $limit, mb_strlen( $string ) - 1 );
		while ( $breakpoint > 0 ) {
			if ( $break === mb_substr( $string, $breakpoint, 1 ) ) {
				return mb_strimwidth( $string, 0, $breakpoint );
			}
			$breakpoint--;
		}

		// if we weren't able to find a break, need to chop mid-word
		return mb_strimwidth( $string, 0, $limit );
	}
690
691
}
692