Completed
Push — sync/require-lib ( cb01a1 )
by
unknown
17:14
created

jetpack-wpes-query-parser.php (1 issue)

Severity

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
/**
4
 * Parse a pure text query into WordPress Elasticsearch query. This builds on
5
 * the Jetpack_WPES_Query_Builder() to provide search query parsing.
6
 *
7
 * The key part of this parser is taking a user's query string typed into a box
8
 * and converting it into an ES search query.
9
 *
10
 * This varies by application, but roughly it means extracting some parts of the query
11
 * (authors, tags, and phrases) that are treated as a filter. Then taking the
12
 * remaining words and building the correct query (possibly with prefix searching
13
 * if we are doing search as you type)
14
 *
15
 * This class only supports ES 2.x+
16
 *
17
 * This parser builds queries of the form:
18
 *   bool:
19
 *     must:
20
 *       AND match of a single field (ideally an edgengram field)
21
 *     filter:
22
 *       filter clauses from context (eg @gibrown, #news, etc)
23
 *     should:
24
 *       boosting of results by various fields
25
 *
26
 * Features supported:
27
 *  - search as you type
28
 *  - phrases
29
 *  - supports querying across multiple languages at once
30
 *
31
 * Example usage (from Search on Reader Manage):
32
 *
33
 *		require_lib( 'jetpack-wpes-query-builder/jetpack-wpes-search-query-parser' );
34
 *		$parser = new WPES_Search_Query_Parser( $args['q'], array( $lang ) );
35
 *
36
 *		//author
37
 *		$parser->author_field_filter( array(
38
 *			'prefixes' => array( '@' ),
39
 *			'wpcom_id_field' => 'author_id',
40
 *			'must_query_fields' => array( 'author.engram', 'author_login.engram' ),
41
 *			'boost_query_fields' => array( 'author^2', 'author_login^2', 'title.default.engram' ),
42
 *		) );
43
 *
44
 *		//remainder of query
45
 *		$match_content_fields = $parser->merge_ml_fields(
46
 *			array(
47
 *				'all_content' => 0.1,
48
 *			),
49
 *			array(
50
 *				'all_content.default.engram^0.1',
51
 *			)
52
 *		);
53
 *		$boost_content_fields = $parser->merge_ml_fields(
54
 *			array(
55
 *				'title' => 2,
56
 *				'description' => 1,
57
 *				'tags' => 1,
58
 *			),
59
 *			array(
60
 *				'author_login^2',
61
 *				'author^2',
62
 *			)
63
 *		);
64
 *
65
 *		$parser->phrase_filter( array(
66
 *			'must_query_fields' => $match_content_fields,
67
 *			'boost_query_fields' => $boost_content_fields,
68
 *		) );
69
 *		$parser->remaining_query( array(
70
 *			'must_query_fields' => $match_content_fields,
71
 *			'boost_query_fields' => $boost_content_fields,
72
 *		) );
73
 *
74
 *		//Boost on phrases
75
 *		$parser->remaining_query( array(
76
 *			'boost_query_fields' => $boost_content_fields,
77
 *			'boost_query_type'   => 'phrase',
78
 *		) );
79
 *
80
 *		//boosting
81
 *		$parser->add_max_boost_to_functions( 20 );
82
 *		$parser->add_function( 'field_value_factor', array(
83
 *			'follower_count' => array(
84
 *				'modifier' => 'sqrt',
85
 *				'factor' => 1,
86
 *				'missing' => 0,
87
 *			) ) );
88
 *
89
 *		//Filtering
90
 *		$parser->add_filter( array(
91
 *			'exists' => array( 'field' => 'langs.' . $lang )
92
 *		) );
93
 *
94
 *		//run the query
95
 *		$es_query_args = array(
96
 *			'name' => 'feeds',
97
 *			'blog_id' => false,
98
 *			'security_strategy' => 'a8c',
99
 *			'type' => 'feed,blog',
100
 *			'fields' => array( 'blog_id', 'feed_id' ),
101
 *			'query' => $parser->build_query(),
102
 *			'filter' => $parser->build_filter(),
103
 *			'size' => $size,
104
 *			'from' => $from
105
 *		);
106
 *		$es_results = es_api_search_index( $es_query_args, 'api-feed-find' );
107
 *
108
 */
109
110
jetpack_require_lib( 'jetpack-wpes-query-builder' );
0 ignored issues
show
The call to the function jetpack_require_lib() seems unnecessary as the function has no side-effects.
Loading history...
111
112
class Jetpack_WPES_Search_Query_Parser extends Jetpack_WPES_Query_Builder {
113
114
	protected $orig_query = '';
115
	protected $current_query = '';
116
	protected $langs;
117
	protected $avail_langs = array( 'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja', 'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' );
118
119
	/*
	 * Set up the parser for one user query.
	 *
	 *  user_query: the text typed by the user; stored both as the original
	 *              query and as the current (working) query
	 *  langs: list of language codes, stored after passing through norm_langs()
	 */
	public function __construct( $user_query, $langs ) {
		$this->langs         = $this->norm_langs( $langs );
		$this->orig_query    = $user_query;
		$this->current_query = $user_query;
	}
124
125
	protected $extracted_phrases = array();
126
127
	///////////////////////////////////////////////////////
128
	// Methods for Building arrays of multilingual fields
129
130
	/*
	 * Normalize language codes
	 *
	 *  Each code is lowercased and cut at the first '-' or '_' (language tags
	 *  are case-insensitive, so 'EN-us' and 'en_US' both become 'en').
	 *  Codes that are not listed in $this->avail_langs collapse into the
	 *  single entry 'default'.
	 *
	 *  args:
	 *    langs: a list of language codes, or a single code as a string
	 *
	 *  returns the list of unique normalized codes in first-seen order
	 */
	public function norm_langs( $langs ) {
		$lst = array();
		//a lone code such as 'en' is handled as a one element list
		foreach ( (array) $langs as $l ) {
			$l = strtok( strtolower( $l ), '-_' );
			//strtok() returns false for an empty code, which the strict
			//comparison sends to 'default'
			if ( in_array( $l, $this->avail_langs, true ) ) {
				$lst[ $l ] = true;
			} else {
				$lst['default'] = true;
			}
		}
		return array_keys( $lst );
	}
145
146
	/*
	 * Expand field prefixes into one boosted field per language.
	 *
	 *  For every entry of fields2boosts (field prefix => boost) and every
	 *  language in $this->langs this emits "<prefix>.<lang>^<boost>".
	 *  The additional fields are appended afterwards exactly as given.
	 *
	 *  args:
	 *    fields2boosts: array of field prefix => boost
	 *    additional_fields: array of complete field strings to append
	 *
	 *  returns the combined list of field strings
	 */
	public function merge_ml_fields( $fields2boosts, $additional_fields ) {
		$expanded = array();
		foreach ( $fields2boosts as $prefix => $boost ) {
			foreach ( $this->langs as $lang ) {
				$expanded[] = "{$prefix}.{$lang}^{$boost}";
			}
		}
		foreach ( $additional_fields as $extra ) {
			$expanded[] = $extra;
		}
		return $expanded;
	}
162
163
	////////////////////////////////////
164
	// Extract Fields for Filtering on
165
166
	/*
	 * Extract any @mentions from the user query
	 *  A mention for which get_user_by( 'login', ... ) finds a user is turned
	 *  into a terms filter on the wp.com id field. Any other mention is
	 *  searched for as text in must_query_fields. Every mention is also added
	 *  as a 'should' clause on boost_query_fields.
	 *
	 *  args:
	 *    wpcom_id_field: wp.com id field
	 *    must_query_fields: array of fields to search for matching results (optional)
	 *    boost_query_fields: array of fields to search in for boosting results (optional)
	 *    prefixes: array of prefixes that the user can use to indicate an author
	 *
	 *  returns true/false of whether any were found
	 *
	 * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
	 */
	public function author_field_filter( $args ) {
		$defaults = array(
			'wpcom_id_field' => 'author_id',
			'must_query_fields' => null,
			'boost_query_fields' => null,
			'prefixes' => array( '@' ),
		);
		$args = wp_parse_args( $args, $defaults );

		$names = array();
		foreach ( $args['prefixes'] as $p ) {
			$found = $this->get_fields( $p );
			if ( $found ) {
				foreach ( $found as $f ) {
					$names[] = $f;
				}
			}
		}

		if ( empty( $names ) ) {
			return false;
		}

		foreach ( $args['prefixes'] as $p ) {
			$this->remove_fields( $p );
		}

		$user_ids = array();

		//loop through the matches and separate into filters and queries
		foreach ( $names as $n ) {
			//a quoted mention, eg @"Greg Brown", is matched as a phrase. The
			//quotes are stripped before the login lookup, otherwise a quoted
			//login could never be found.
			$phrase_opts = array();
			if ( false !== strpos( $n, '"' ) ) {
				$phrase_opts['type'] = 'phrase';
				$n = str_replace( '"', '', $n );
			}

			//check for exact match on login
			$userdata = get_user_by( 'login', strtolower( $n ) );
			$filtering = false;
			if ( $userdata ) {
				$user_ids[ $userdata->ID ] = true;
				$filtering = true;
			}

			//names that resolved to a user are filtered on below, so only
			//unresolved names become a required text match
			if ( ! empty( $args['must_query_fields'] ) && ! $filtering ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query' => $n,
					) + $phrase_opts,
				) );
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $n,
					) + $phrase_opts,
				), 'should' );
			}
		}

		if ( ! empty( $user_ids ) ) {
			$user_ids = array_keys( $user_ids );
			$this->add_filter( array( 'terms' => array( $args['wpcom_id_field'] => $user_ids ) ) );
		}

		return true;
	}
269
270
	/*
	 * Pull "prefix + text" terms out of the query (eg #News, #"current events"
	 *   or from:Greg) and add each one as a query clause on must_query_fields
	 *   and as a 'should' clause on boost_query_fields.
	 *   A term written with double quotes is matched with type 'phrase'.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the tag (optional)
	 *    boost_query_fields: array of fields to boost search on (optional)
	 *    prefixes: array of prefixes that the user can use to indicate a tag
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	public function text_field_filter( $args ) {
		$args = wp_parse_args( $args, array(
			'must_query_fields' => array( 'tag.name' ),
			'boost_query_fields' => array( 'tag.name' ),
			'prefixes' => array( '#' ),
		) );

		//collect the terms for every prefix before touching the query
		$terms = array();
		foreach ( $args['prefixes'] as $prefix ) {
			$hits = $this->get_fields( $prefix );
			if ( ! $hits ) {
				continue;
			}
			foreach ( $hits as $hit ) {
				$terms[] = $hit;
			}
		}

		if ( empty( $terms ) ) {
			return false;
		}

		foreach ( $args['prefixes'] as $prefix ) {
			$this->remove_fields( $prefix );
		}

		$use_must  = ! empty( $args['must_query_fields'] );
		$use_boost = ! empty( $args['boost_query_fields'] );

		foreach ( $terms as $term ) {
			//quoted terms lose their quotes and gain 'type' => 'phrase'
			$phrase_opts = array();
			if ( preg_match( '/"/', $term ) ) {
				$phrase_opts['type'] = 'phrase';
				$term = preg_replace( '/"/', '', $term );
			}

			if ( $use_must ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query' => $term,
					) + $phrase_opts,
				) );
			}

			if ( $use_boost ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $term,
					) + $phrase_opts,
				), 'should' );
			}
		}

		return true;
	}
354
355
	/*
	 * Extract anything surrounded by quotes or if there is an opening quote
	 *   that is not complete, and add them to the query as a phrase query.
	 *   Quotes can be either '' or ""
	 *
	 *   Every extracted phrase is removed from the current query, added as a
	 *   'phrase' multi_match on must_query_fields and, when boost fields are
	 *   given, as an 'and' multi_match 'should' clause on boost_query_fields.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the phrases
	 *    boost_query_fields: array of fields to boost the phrases on (optional)
	 *
	 *  returns true/false of whether any were found
	 *
	 */
	public function phrase_filter( $args ) {
		$defaults = array(
			'must_query_fields' => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
		);
		$args = wp_parse_args( $args, $defaults );

		$phrases = array();

		//completed phrases: "..." first, then '...'
		foreach ( array( '/"([^"]+)"/', "/'([^']+)'/" ) as $regex ) {
			if ( preg_match_all( $regex, $this->current_query, $matches ) ) {
				foreach ( $matches[1] as $match ) {
					$phrases[] = $match;
				}
				$this->current_query = preg_replace( $regex, '', $this->current_query );
			}
		}

		//look for a final, uncompleted phrase.
		//Both quote styles are collected: each match is cut out of the query,
		//so a match that is not kept as a phrase would vanish from the search.
		foreach ( array( '/"([^"]+)$/', "/(?:'\B|\B')([^']+)$/" ) as $regex ) {
			if ( preg_match( $regex, $this->current_query, $matches ) ) {
				$phrases[] = $matches[1];
				$this->current_query = preg_replace( $regex, '', $this->current_query );
			}
		}

		if ( empty( $phrases ) ) {
			return false;
		}

		foreach ( $phrases as $p ) {
			$this->add_query( array(
				'multi_match' => array(
					'fields' => $args['must_query_fields'],
					'query' => $p,
					'type' => 'phrase',
				) ) );

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $p,
						'operator' => 'and',
				) ), 'should' );
			}
		}

		return true;
	}
427
428
	/*
	 * Build query clauses from whatever is left of the current query.
	 *   The leftover text can be added as an 'and' multi_match on
	 *   must_query_fields and/or as a 'should' clause on boost_query_fields.
	 *   Nothing is added when the leftover text is empty or only whitespace.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator of the boost clause (default 'and')
	 *    boost_query_type: multi_match type of the boost clause (default 'best_fields')
	 *
	 */
	public function remaining_query( $args ) {
		$args = wp_parse_args( $args, array(
			'must_query_fields' => null,
			'boost_query_fields' => null,
			'boost_operator' => 'and',
			'boost_query_type' => 'best_fields',
		) );

		$nothing_left = empty( $this->current_query ) || ctype_space( $this->current_query );
		if ( $nothing_left ) {
			return;
		}

		if ( ! empty( $args['must_query_fields'] ) ) {
			$must_match = array(
				'fields' => $args['must_query_fields'],
				'query' => $this->current_query,
				'operator' => 'and',
			);
			$this->add_query( array( 'multi_match' => $must_match ) );
		}

		if ( ! empty( $args['boost_query_fields'] ) ) {
			$boost_match = array(
				'fields' => $args['boost_query_fields'],
				'query' => $this->current_query,
				'operator' => $args['boost_operator'],
				'type' => $args['boost_query_type'],
			);
			$this->add_query( array( 'multi_match' => $boost_match ), 'should' );
		}
	}
471
472
	/*
	 * Query fields using a prefix query (alphabetical expansions on the index).
	 *   This is not recommended. Slower performance and worse relevancy.
	 *
	 *  (UNTESTED! Copied from old prefix expansion code)
	 *
	 *   When the query ends in a space there is no partial word, so the
	 *   remaining text is queried like remaining_query() does. Otherwise the
	 *   last word is treated as a prefix and the must clause becomes an OR of:
	 *   the full text as exact words, the full text as a phrase prefix, and
	 *   (for multi word queries) "earlier words exact AND last word prefix".
	 *
	 *   NOTE(review): this class declares no separate exact/phrase field
	 *   lists, so must_query_fields is used for every must clause - confirm
	 *   that is the intended field set.
	 *
	 *  args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator of the boost clause (default 'and')
	 *    boost_query_type: multi_match type of the boost clause (default 'best_fields')
	 *
	 */
	public function remaining_prefix_query( $args ) {
		$defaults = array(
			'must_query_fields' => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
			'boost_operator' => 'and',
			'boost_query_type' => 'best_fields',
		);
		$args = wp_parse_args( $args, $defaults );

		if ( empty( $this->current_query ) || ctype_space( $this->current_query ) ) {
			return;
		}

		//////////////////////////////////
		// Example cases to think about:
		// "elasticse"
		// "elasticsearch"
		// "elasticsearch "
		// "elasticsearch lucen"
		// "elasticsearch lucene"
		// "the future"  - note the stopword which will match nothing!
		// "F1" - an exact match that also has tons of expansions
		// "こんにちは" ja "hello"
		// "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
		//   - this could still be better I bet. Maybe we need to analyze with ES first?
		//

		/////////////////////////////
		//extract pieces of query
		// eg: "PREFIXREMAINDER PREFIXWORD"
		//     "elasticsearch lucen"

		$full_text = $this->current_query;

		$prefix_word = false;
		if ( preg_match( '/([^ ]+)$/', $full_text, $matches ) ) {
			$prefix_word = $matches[1];
		}

		$prefix_remainder = preg_replace( '/([^ ]+)$/', '', $full_text );
		if ( '' === $prefix_remainder || ctype_space( $prefix_remainder ) ) {
			$prefix_remainder = false;
		}

		if ( false === $prefix_word ) {
			//Space at the end of the query, so skip using a prefix query
			if ( ! empty( $args['must_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['must_query_fields'],
						'query' => $full_text,
						'operator' => 'and',
					) ) );
			}

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $full_text,
						'operator' => $args['boost_operator'],
						'type' => $args['boost_query_type'],
					) ), 'should' );
			}
		} else {

			//must match the prefix word and the prefix remainder
			if ( ! empty( $args['must_query_fields'] ) ) {
				//need to do an OR across a few fields to handle all cases
				$must_q = array( 'bool' => array( 'should' => array( ), 'minimum_should_match' => 1 ) );

				//treat all words as an exact search (boosts complete word like "news"
				//from prefixes of "newspaper")
				$must_q['bool']['should'][] = array( 'multi_match' => array(
					'fields' => $args['must_query_fields'],
					'query' => $full_text,
					'operator' => 'and',
					'type' => 'cross_fields',
				) );

				//always optimistically try and match the full text as a phrase
				//prefix "the futu" should try to match "the future"
				//otherwise the first stopword kinda breaks
				//This also works as the prefix match for a single word "elasticsea"
				$must_q['bool']['should'][] = array( 'multi_match' => array(
					'fields' => $args['must_query_fields'],
					'query' => $full_text,
					'operator' => 'and',
					'type' => 'phrase_prefix',
					'max_expansions' => 100,
				) );

				if ( $prefix_remainder ) {
					//Multiple words found, so treat each word on its own and not just as
					//a part of a phrase
					//"elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
					$must_q['bool']['should'][] = array( 'bool' => array(
						'must' => array(
							array( 'multi_match' => array(
								'fields' => $args['must_query_fields'],
								'query' => $prefix_word,
								'operator' => 'and',
								'type' => 'phrase_prefix',
								'max_expansions' => 100,
							) ),
							array( 'multi_match' => array(
								'fields' => $args['must_query_fields'],
								'query' => $prefix_remainder,
								'operator' => 'and',
								'type' => 'cross_fields',
							) ),
						)
					) );
				}

				$this->add_query( $must_q );
			}

			//Now add any boosting of the query
			if ( ! empty( $args['boost_query_fields'] ) ) {
				//treat all words as an exact search (boosts complete word like "news"
				//from prefixes of "newspaper")
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $full_text,
						'operator' => $args['boost_operator'],
						'type' => $args['boost_query_type'],
					) ), 'should' );

				//optimistically boost the full phrase prefix match.
				//Added as 'should' so that it only boosts and never restricts the results
				$this->add_query( array(
					'multi_match' => array(
						'fields' => $args['boost_query_fields'],
						'query' => $full_text,
						'operator' => 'and',
						'type' => 'phrase_prefix',
						'max_expansions' => 100,
					) ), 'should' );
			}
		}
	}
624
625
	/*
	 * Boost results based on the lang probability overlaps
	 *
	 *  Adds one 'field_value_factor' function per language, using the
	 *  probability as the factor.
	 *
	 *  args:
	 *    langs2prob: list of languages to search in with associated boosts
	 *    field_prefix: (optional) when given, each function is keyed by the
	 *      field name field_prefix . language, the "field => settings" shape
	 *      used for add_function() in the usage example of the file header.
	 *      When omitted the settings are passed without a field name, which
	 *      makes every language register the same field-less function.
	 *      NOTE(review): the index field that holds the per language
	 *      probability is not visible in this file - confirm the prefix to
	 *      use with the index mapping.
	 */
	public function boost_lang_probs( $langs2prob, $field_prefix = null ) {
		foreach ( $langs2prob as $l => $p ) {
			$factor = array(
				'modifier' => 'none',
				'factor' => $p,
				'missing' => 0.01, //1% chance doc did not have right lang detected
			);
			if ( null !== $field_prefix ) {
				$factor = array( $field_prefix . $l => $factor );
			}
			$this->add_function( 'field_value_factor', $factor );
		}
	}
640
641
	////////////////////////////////////
642
	// Helper Methods
643
644
	//Get the text after some prefix. eg @gibrown, or @"Greg Brown"
	// Returns the list of matched texts (a quoted match keeps its quotes),
	// or false when nothing matched.
	protected function get_fields( $field_prefix ) {
		$found = preg_match_all( $this->field_regex( $field_prefix ), $this->current_query, $match );
		if ( false === $found ) {
			//the query is not valid UTF-8: fall back to byte-wise matching
			$found = preg_match_all( $this->field_regex( $field_prefix, false ), $this->current_query, $match );
		}
		if ( $found ) {
			return $match[1];
		}
		return false;
	}

	//Remove the prefix and text from the query
	protected function remove_fields( $field_name ) {
		$stripped = preg_replace( $this->field_regex( $field_name ), '', $this->current_query );
		if ( null === $stripped ) {
			//the query is not valid UTF-8: fall back to byte-wise matching
			$stripped = preg_replace( $this->field_regex( $field_name, false ), '', $this->current_query );
		}
		//keep the query untouched rather than replacing it with null on a regex error
		if ( null !== $stripped ) {
			$this->current_query = $stripped;
		}
	}

	//Build the regex shared by get_fields() and remove_fields().
	// The prefix is escaped so that prefixes such as '+', '*' or '.' are
	// matched literally. The 'u' modifier makes \p{Z} test whole UTF-8
	// characters instead of single bytes.
	private function field_regex( $field_prefix, $utf8 = true ) {
		return '/' . preg_quote( $field_prefix, '/' ) . '(("[^"]+")|([^\\p{Z}]+))/' . ( $utf8 ? 'u' : '' );
	}
658
659
	//Best effort string truncation that splits on word breaks
	// Strings whose mb_strwidth() is within $limit are returned unchanged.
	// Otherwise the string is cut at the last $break found at or before
	// offset $limit, or at $limit itself when there is no $break to cut at.
	protected function truncate_string( $string, $limit, $break=" " ) {
		if ( mb_strwidth( $string ) <= $limit ) {
			return $string;
		}

		// scan backwards from $limit looking for a break to cut at
		for ( $pos = $limit; $pos > 0; $pos-- ) {
			if ( mb_strimwidth( $string, $pos, 1 ) === $break ) {
				return mb_strimwidth( $string, 0, $pos );
			}
		}

		// no break found, so the cut has to happen mid-word
		return mb_strimwidth( $string, 0, $limit );
	}
682
683
}
684