|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace dokuwiki\Search; |
|
4
|
|
|
|
|
5
|
|
|
use dokuwiki\Search\Tokenizer; |
|
6
|
|
|
use dokuwiki\Utf8; |
|
7
|
|
|
|
|
8
|
|
|
/** |
|
9
|
|
|
* DokuWuki QueryParser class |
|
10
|
|
|
*/ |
|
11
|
|
|
class QueryParser |
|
12
|
|
|
{ |
|
13
|
|
|
/** |
|
14
|
|
|
* Transforms given search term into intermediate representation |
|
15
|
|
|
* |
|
16
|
|
|
* This function is used in QueryParser::convert() and not for general purpose use. |
|
17
|
|
|
* |
|
18
|
|
|
* @author Kazutaka Miyasaka <[email protected]> |
|
19
|
|
|
* |
|
20
|
|
|
* @param string $term |
|
21
|
|
|
* @param bool $consider_asian |
|
22
|
|
|
* @param bool $phrase_mode |
|
23
|
|
|
* @return string |
|
24
|
|
|
*/ |
|
25
|
|
|
public function termParser($term, $consider_asian = true, $phrase_mode = false) |
|
26
|
|
|
{ |
|
27
|
|
|
$parsed = ''; |
|
28
|
|
|
if ($consider_asian) { |
|
29
|
|
|
// successive asian characters need to be searched as a phrase |
|
30
|
|
|
$words = Utf8\Asian::splitAsianWords($term); |
|
31
|
|
|
foreach ($words as $word) { |
|
32
|
|
|
$phrase_mode = $phrase_mode ? true : Utf8\Asian::isAsianWords($word); |
|
33
|
|
|
$parsed .= $this->termParser($word, false, $phrase_mode); |
|
34
|
|
|
} |
|
35
|
|
|
} else { |
|
36
|
|
|
$term_noparen = str_replace(['(',')'], ' ', $term); |
|
37
|
|
|
$Tokenizer = Tokenizer::getInstance(); |
|
38
|
|
|
$words = $Tokenizer->getWords($term_noparen, true); |
|
39
|
|
|
|
|
40
|
|
|
// W_: no need to highlight |
|
41
|
|
|
if (empty($words)) { |
|
42
|
|
|
$parsed = '()'; // important: do not remove |
|
43
|
|
|
} elseif ($words[0] === $term) { |
|
44
|
|
|
$parsed = '(W+:'.$words[0].')'; |
|
45
|
|
|
} elseif ($phrase_mode) { |
|
46
|
|
|
$term_encoded = str_replace(['(',')'], ['OP','CP'], $term); |
|
47
|
|
|
$parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))'; |
|
48
|
|
|
} else { |
|
49
|
|
|
$parsed = '((W+:'.implode(')(W+:', $words).'))'; |
|
50
|
|
|
} |
|
51
|
|
|
} |
|
52
|
|
|
return $parsed; |
|
53
|
|
|
} |
|
54
|
|
|
|
|
55
|
|
|
/** |
|
56
|
|
|
* Parses a search query and builds an array of search formulas |
|
57
|
|
|
* |
|
58
|
|
|
* @author Andreas Gohr <[email protected]> |
|
59
|
|
|
* @author Kazutaka Miyasaka <[email protected]> |
|
60
|
|
|
* |
|
61
|
|
|
* @param string $query search query |
|
62
|
|
|
* @return array of search formulas |
|
63
|
|
|
*/ |
|
64
|
|
|
public function convert($query) |
|
65
|
|
|
{ |
|
66
|
|
|
/** |
|
67
|
|
|
* parse a search query and transform it into intermediate representation |
|
68
|
|
|
* |
|
69
|
|
|
* in a search query, you can use the following expressions: |
|
70
|
|
|
* |
|
71
|
|
|
* words: |
|
72
|
|
|
* include |
|
73
|
|
|
* -exclude |
|
74
|
|
|
* phrases: |
|
75
|
|
|
* "phrase to be included" |
|
76
|
|
|
* -"phrase you want to exclude" |
|
77
|
|
|
* namespaces: |
|
78
|
|
|
* @include:namespace (or ns:include:namespace) |
|
79
|
|
|
* ^exclude:namespace (or -ns:exclude:namespace) |
|
80
|
|
|
* groups: |
|
81
|
|
|
* () |
|
82
|
|
|
* -() |
|
83
|
|
|
* operators: |
|
84
|
|
|
* and ('and' is the default operator: you can always omit this) |
|
85
|
|
|
* or (or pipe symbol '|', lower precedence than 'and') |
|
86
|
|
|
* |
|
87
|
|
|
* e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain |
|
88
|
|
|
* a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'". |
|
89
|
|
|
* this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ] |
|
90
|
|
|
* as long as you don't mind hit counts. |
|
91
|
|
|
* |
|
92
|
|
|
* intermediate representation consists of the following parts: |
|
93
|
|
|
* |
|
94
|
|
|
* ( ) - group |
|
95
|
|
|
* AND - logical and |
|
96
|
|
|
* OR - logical or |
|
97
|
|
|
* NOT - logical not |
|
98
|
|
|
* W+:, W-:, W_: - word (underscore: no need to highlight) |
|
99
|
|
|
* P+:, P-: - phrase (minus sign: logically in NOT group) |
|
100
|
|
|
* N+:, N-: - namespace |
|
101
|
|
|
*/ |
|
102
|
|
|
$parsed_query = ''; |
|
103
|
|
|
$parens_level = 0; |
|
104
|
|
|
$terms = preg_split('/(-?".*?")/u', Utf8\PhpString::strtolower($query), |
|
105
|
|
|
-1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY |
|
106
|
|
|
); |
|
107
|
|
|
|
|
108
|
|
|
foreach ($terms as $term) { |
|
109
|
|
|
$parsed = ''; |
|
110
|
|
|
if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { |
|
111
|
|
|
// phrase-include and phrase-exclude |
|
112
|
|
|
$not = $matches[1] ? 'NOT' : ''; |
|
113
|
|
|
$parsed = $not . $this->termParser($matches[2], false, true); |
|
114
|
|
|
} else { |
|
115
|
|
|
// fix incomplete phrase |
|
116
|
|
|
$term = str_replace('"', ' ', $term); |
|
117
|
|
|
|
|
118
|
|
|
// fix parentheses |
|
119
|
|
|
$term = str_replace(')' , ' ) ', $term); |
|
120
|
|
|
$term = str_replace('(' , ' ( ', $term); |
|
121
|
|
|
$term = str_replace('- (', ' -(', $term); |
|
122
|
|
|
|
|
123
|
|
|
// treat pipe symbols as 'OR' operators |
|
124
|
|
|
$term = str_replace('|', ' or ', $term); |
|
125
|
|
|
|
|
126
|
|
|
// treat ideographic spaces (U+3000) as search term separators |
|
127
|
|
|
// FIXME: some more separators? |
|
128
|
|
|
$term = preg_replace('/[ \x{3000}]+/u', ' ', $term); |
|
129
|
|
|
$term = trim($term); |
|
130
|
|
|
if ($term === '') continue; |
|
131
|
|
|
|
|
132
|
|
|
$tokens = explode(' ', $term); |
|
133
|
|
|
foreach ($tokens as $token) { |
|
134
|
|
|
if ($token === '(') { |
|
135
|
|
|
// parenthesis-include-open |
|
136
|
|
|
$parsed .= '('; |
|
137
|
|
|
++$parens_level; |
|
138
|
|
|
} elseif ($token === '-(') { |
|
139
|
|
|
// parenthesis-exclude-open |
|
140
|
|
|
$parsed .= 'NOT('; |
|
141
|
|
|
++$parens_level; |
|
142
|
|
|
} elseif ($token === ')') { |
|
143
|
|
|
// parenthesis-any-close |
|
144
|
|
|
if ($parens_level === 0) continue; |
|
145
|
|
|
$parsed .= ')'; |
|
146
|
|
|
$parens_level--; |
|
147
|
|
|
} elseif ($token === 'and') { |
|
148
|
|
|
// logical-and (do nothing) |
|
149
|
|
|
} elseif ($token === 'or') { |
|
150
|
|
|
// logical-or |
|
151
|
|
|
$parsed .= 'OR'; |
|
152
|
|
|
} elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) { |
|
153
|
|
|
// namespace-exclude |
|
154
|
|
|
$parsed .= 'NOT(N+:'.$matches[1].')'; |
|
155
|
|
|
} elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) { |
|
156
|
|
|
// namespace-include |
|
157
|
|
|
$parsed .= '(N+:'.$matches[1].')'; |
|
158
|
|
|
} elseif (preg_match('/^-(.+)$/', $token, $matches)) { |
|
159
|
|
|
// word-exclude |
|
160
|
|
|
$parsed .= 'NOT('.$this->termParser($matches[1]).')'; |
|
161
|
|
|
} else { |
|
162
|
|
|
// word-include |
|
163
|
|
|
$parsed .= $this->termParser($token); |
|
164
|
|
|
} |
|
165
|
|
|
} |
|
166
|
|
|
} |
|
167
|
|
|
$parsed_query .= $parsed; |
|
168
|
|
|
} |
|
169
|
|
|
|
|
170
|
|
|
// cleanup (very sensitive) |
|
171
|
|
|
$parsed_query .= str_repeat(')', $parens_level); |
|
172
|
|
|
do { |
|
173
|
|
|
$parsed_query_old = $parsed_query; |
|
174
|
|
|
$parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query); |
|
175
|
|
|
} while ($parsed_query !== $parsed_query_old); |
|
176
|
|
|
$parsed_query = preg_replace('/(NOT|OR)+\)/u', ')' , $parsed_query); |
|
177
|
|
|
$parsed_query = preg_replace('/(OR)+/u' , 'OR' , $parsed_query); |
|
178
|
|
|
$parsed_query = preg_replace('/\(OR/u' , '(' , $parsed_query); |
|
179
|
|
|
$parsed_query = preg_replace('/^OR|OR$/u' , '' , $parsed_query); |
|
180
|
|
|
$parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query); |
|
181
|
|
|
|
|
182
|
|
|
// adjustment: make highlightings right |
|
183
|
|
|
$parens_level = 0; |
|
184
|
|
|
$notgrp_levels = array(); |
|
185
|
|
|
$parsed_query_new = ''; |
|
186
|
|
|
$tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, |
|
187
|
|
|
-1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY |
|
188
|
|
|
); |
|
189
|
|
|
foreach ($tokens as $token) { |
|
190
|
|
|
if ($token === 'NOT(') { |
|
191
|
|
|
$notgrp_levels[] = ++$parens_level; |
|
192
|
|
|
} elseif ($token === '(') { |
|
193
|
|
|
++$parens_level; |
|
194
|
|
|
} elseif ($token === ')') { |
|
195
|
|
|
if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels); |
|
196
|
|
|
} elseif (count($notgrp_levels) % 2 === 1) { |
|
197
|
|
|
// turn highlight-flag off if terms are logically in "NOT" group |
|
198
|
|
|
$token = preg_replace('/([WPN])\+\:/u', '$1-:', $token); |
|
199
|
|
|
} |
|
200
|
|
|
$parsed_query_new .= $token; |
|
201
|
|
|
} |
|
202
|
|
|
$parsed_query = $parsed_query_new; |
|
203
|
|
|
|
|
204
|
|
|
/** |
|
205
|
|
|
* convert infix notation string into postfix (Reverse Polish notation) array |
|
206
|
|
|
* by Shunting-yard algorithm |
|
207
|
|
|
* |
|
208
|
|
|
* see: http://en.wikipedia.org/wiki/Reverse_Polish_notation |
|
209
|
|
|
* see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm |
|
210
|
|
|
*/ |
|
211
|
|
|
$parsed_ary = array(); |
|
212
|
|
|
$ope_stack = array(); |
|
213
|
|
|
$ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5); |
|
214
|
|
|
$ope_regex = '/([()]|OR|AND|NOT)/u'; |
|
215
|
|
|
|
|
216
|
|
|
$tokens = preg_split($ope_regex, $parsed_query, |
|
217
|
|
|
-1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY |
|
218
|
|
|
); |
|
219
|
|
|
foreach ($tokens as $token) { |
|
220
|
|
|
if (preg_match($ope_regex, $token)) { |
|
221
|
|
|
// operator |
|
222
|
|
|
$last_ope = end($ope_stack); |
|
223
|
|
|
while ($last_ope !== false |
|
224
|
|
|
&& $ope_precedence[$token] <= $ope_precedence[$last_ope] |
|
225
|
|
|
&& $last_ope != '(' |
|
226
|
|
|
) { |
|
227
|
|
|
$parsed_ary[] = array_pop($ope_stack); |
|
228
|
|
|
$last_ope = end($ope_stack); |
|
229
|
|
|
} |
|
230
|
|
|
if ($token == ')') { |
|
231
|
|
|
array_pop($ope_stack); // this array_pop always deletes '(' |
|
232
|
|
|
} else { |
|
233
|
|
|
$ope_stack[] = $token; |
|
234
|
|
|
} |
|
235
|
|
|
} else { |
|
236
|
|
|
// operand |
|
237
|
|
|
$token_decoded = str_replace(['OP','CP'], ['(',')'], $token); |
|
238
|
|
|
$parsed_ary[] = $token_decoded; |
|
239
|
|
|
} |
|
240
|
|
|
} |
|
241
|
|
|
$parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack))); |
|
242
|
|
|
|
|
243
|
|
|
// cleanup: each double "NOT" in RPN array actually does nothing |
|
244
|
|
|
$parsed_ary_count = count($parsed_ary); |
|
245
|
|
|
for ($i = 1; $i < $parsed_ary_count; ++$i) { |
|
246
|
|
|
if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') { |
|
247
|
|
|
unset($parsed_ary[$i], $parsed_ary[$i - 1]); |
|
248
|
|
|
} |
|
249
|
|
|
} |
|
250
|
|
|
$parsed_ary = array_values($parsed_ary); |
|
251
|
|
|
|
|
252
|
|
|
// build return value |
|
253
|
|
|
$q = array(); |
|
254
|
|
|
$q['query'] = $query; |
|
255
|
|
|
$q['parsed_str'] = $parsed_query; |
|
256
|
|
|
$q['parsed_ary'] = $parsed_ary; |
|
257
|
|
|
|
|
258
|
|
|
foreach ($q['parsed_ary'] as $token) { |
|
259
|
|
|
if ($token[2] !== ':') continue; |
|
260
|
|
|
$body = substr($token, 3); |
|
261
|
|
|
|
|
262
|
|
|
switch (substr($token, 0, 3)) { |
|
263
|
|
|
case 'N+:': |
|
264
|
|
|
$q['ns'][] = $body; // for backward compatibility |
|
265
|
|
|
break; |
|
266
|
|
|
case 'N-:': |
|
267
|
|
|
$q['notns'][] = $body; // for backward compatibility |
|
268
|
|
|
break; |
|
269
|
|
|
case 'W_:': |
|
270
|
|
|
$q['words'][] = $body; |
|
271
|
|
|
break; |
|
272
|
|
|
case 'W-:': |
|
273
|
|
|
$q['words'][] = $body; |
|
274
|
|
|
$q['not'][] = $body; // for backward compatibility |
|
275
|
|
|
break; |
|
276
|
|
|
case 'W+:': |
|
277
|
|
|
$q['words'][] = $body; |
|
278
|
|
|
$q['highlight'][] = $body; |
|
279
|
|
|
$q['and'][] = $body; // for backward compatibility |
|
280
|
|
|
break; |
|
281
|
|
|
case 'P-:': |
|
282
|
|
|
$q['phrases'][] = $body; |
|
283
|
|
|
break; |
|
284
|
|
|
case 'P+:': |
|
285
|
|
|
$q['phrases'][] = $body; |
|
286
|
|
|
$q['highlight'][] = $body; |
|
287
|
|
|
break; |
|
288
|
|
|
} |
|
289
|
|
|
} |
|
290
|
|
|
foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) { |
|
291
|
|
|
$q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key])); |
|
292
|
|
|
} |
|
293
|
|
|
|
|
294
|
|
|
return $q; |
|
295
|
|
|
} |
|
296
|
|
|
|
|
297
|
|
|
/** |
|
298
|
|
|
* Recreate a search query string based on parsed parts, |
|
299
|
|
|
* doesn't support negated phrases and `OR` searches |
|
300
|
|
|
* |
|
301
|
|
|
* @param array $and |
|
302
|
|
|
* @param array $not |
|
303
|
|
|
* @param array $phrases |
|
304
|
|
|
* @param array $ns |
|
305
|
|
|
* @param array $notns |
|
306
|
|
|
* |
|
307
|
|
|
* @return string |
|
308
|
|
|
*/ |
|
309
|
|
|
public function revert(array $and, array $not, array $phrases, array $ns, array $notns) |
|
310
|
|
|
{ |
|
311
|
|
|
$query = implode(' ', $and); |
|
312
|
|
|
|
|
313
|
|
|
if (!empty($not)) { |
|
314
|
|
|
$query .= ' -' . implode(' -', $not); |
|
315
|
|
|
} |
|
316
|
|
|
if (!empty($phrases)) { |
|
317
|
|
|
$query .= ' "' . implode('" "', $phrases) . '"'; |
|
318
|
|
|
} |
|
319
|
|
|
if (!empty($ns)) { |
|
320
|
|
|
$query .= ' @' . implode(' @', $ns); |
|
321
|
|
|
} |
|
322
|
|
|
if (!empty($notns)) { |
|
323
|
|
|
$query .= ' ^' . implode(' ^', $notns); |
|
324
|
|
|
} |
|
325
|
|
|
return $query; |
|
326
|
|
|
} |
|
327
|
|
|
} |
|
328
|
|
|
|