Failed Conditions
Pull Request — master (#2943)
by
unknown
03:12
created

QueryParser::revert()   A

Complexity

Conditions 5
Paths 16

Size

Total Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
nc 16
nop 5
dl 0
loc 18
rs 9.3554
c 0
b 0
f 0
1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Search\Tokenizer;
6
use dokuwiki\Utf8;
7
8
/**
9
 * DokuWuki QueryParser class
10
 */
11
class QueryParser
12
{
13
    /**
14
     * Transforms given search term into intermediate representation
15
     *
16
     * This function is used in QueryParser::convert() and not for general purpose use.
17
     *
18
     * @author Kazutaka Miyasaka <[email protected]>
19
     *
20
     * @param string       $term
21
     * @param bool         $consider_asian
22
     * @param bool         $phrase_mode
23
     * @return string
24
     */
25
    public function termParser($term, $consider_asian = true, $phrase_mode = false)
26
    {
27
        $parsed = '';
28
        if ($consider_asian) {
29
            // successive asian characters need to be searched as a phrase
30
            $words = Utf8\Asian::splitAsianWords($term);
31
            foreach ($words as $word) {
32
                $phrase_mode = $phrase_mode ? true : Utf8\Asian::isAsianWords($word);
33
                $parsed .= $this->termParser($word, false, $phrase_mode);
34
            }
35
        } else {
36
            $term_noparen = str_replace(['(',')'], ' ', $term);
37
            $words = Tokenizer::getWords($term_noparen, true);
38
39
            // W_: no need to highlight
40
            if (empty($words)) {
41
                $parsed = '()'; // important: do not remove
42
            } elseif ($words[0] === $term) {
43
                $parsed = '(W+:'.$words[0].')';
44
            } elseif ($phrase_mode) {
45
                $term_encoded = str_replace(['(',')'], ['OP','CP'], $term);
46
                $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
47
            } else {
48
                $parsed = '((W+:'.implode(')(W+:', $words).'))';
49
            }
50
        }
51
        return $parsed;
52
    }
53
54
    /**
55
     * Parses a search query and builds an array of search formulas
56
     *
57
     * @author Andreas Gohr <[email protected]>
58
     * @author Kazutaka Miyasaka <[email protected]>
59
     *
60
     * @param string $query search query
61
     * @return array of search formulas
62
     */
63
    public function convert($query)
64
    {
65
        /**
66
         * parse a search query and transform it into intermediate representation
67
         *
68
         * in a search query, you can use the following expressions:
69
         *
70
         *   words:
71
         *     include
72
         *     -exclude
73
         *   phrases:
74
         *     "phrase to be included"
75
         *     -"phrase you want to exclude"
76
         *   namespaces:
77
         *     @include:namespace (or ns:include:namespace)
78
         *     ^exclude:namespace (or -ns:exclude:namespace)
79
         *   groups:
80
         *     ()
81
         *     -()
82
         *   operators:
83
         *     and ('and' is the default operator: you can always omit this)
84
         *     or  (or pipe symbol '|', lower precedence than 'and')
85
         *
86
         * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
87
         *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
88
         *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
89
         *      as long as you don't mind hit counts.
90
         *
91
         * intermediate representation consists of the following parts:
92
         *
93
         *   ( )           - group
94
         *   AND           - logical and
95
         *   OR            - logical or
96
         *   NOT           - logical not
97
         *   W+:, W-:, W_: - word      (underscore: no need to highlight)
98
         *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
99
         *   N+:, N-:      - namespace
100
         */
101
        $parsed_query = '';
102
        $parens_level = 0;
103
        $terms = preg_split('/(-?".*?")/u', Utf8\PhpString::strtolower($query),
104
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
105
        );
106
107
        foreach ($terms as $term) {
108
            $parsed = '';
109
            if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
110
                // phrase-include and phrase-exclude
111
                $not = $matches[1] ? 'NOT' : '';
112
                $parsed = $not . $this->termParser($matches[2], false, true);
113
            } else {
114
                // fix incomplete phrase
115
                $term = str_replace('"', ' ', $term);
116
117
                // fix parentheses
118
                $term = str_replace(')'  , ' ) ', $term);
119
                $term = str_replace('('  , ' ( ', $term);
120
                $term = str_replace('- (', ' -(', $term);
121
122
                // treat pipe symbols as 'OR' operators
123
                $term = str_replace('|', ' or ', $term);
124
125
                // treat ideographic spaces (U+3000) as search term separators
126
                // FIXME: some more separators?
127
                $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
128
                $term = trim($term);
129
                if ($term === '') continue;
130
131
                $tokens = explode(' ', $term);
132
                foreach ($tokens as $token) {
133
                    if ($token === '(') {
134
                        // parenthesis-include-open
135
                        $parsed .= '(';
136
                        ++$parens_level;
137
                    } elseif ($token === '-(') {
138
                        // parenthesis-exclude-open
139
                        $parsed .= 'NOT(';
140
                        ++$parens_level;
141
                    } elseif ($token === ')') {
142
                        // parenthesis-any-close
143
                        if ($parens_level === 0) continue;
144
                        $parsed .= ')';
145
                        $parens_level--;
146
                    } elseif ($token === 'and') {
147
                        // logical-and (do nothing)
148
                    } elseif ($token === 'or') {
149
                        // logical-or
150
                        $parsed .= 'OR';
151
                    } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
152
                        // namespace-exclude
153
                        $parsed .= 'NOT(N+:'.$matches[1].')';
154
                    } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
155
                        // namespace-include
156
                        $parsed .= '(N+:'.$matches[1].')';
157
                    } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
158
                        // word-exclude
159
                        $parsed .= 'NOT('.$this->termParser($matches[1]).')';
160
                    } else {
161
                        // word-include
162
                        $parsed .= $this->termParser($token);
163
                    }
164
                }
165
            }
166
            $parsed_query .= $parsed;
167
        }
168
169
        // cleanup (very sensitive)
170
        $parsed_query .= str_repeat(')', $parens_level);
171
        do {
172
            $parsed_query_old = $parsed_query;
173
            $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
174
        } while ($parsed_query !== $parsed_query_old);
175
        $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
176
        $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
177
        $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
178
        $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
179
        $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);
180
181
        // adjustment: make highlightings right
182
        $parens_level     = 0;
183
        $notgrp_levels    = array();
184
        $parsed_query_new = '';
185
        $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query,
186
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
187
        );
188
        foreach ($tokens as $token) {
189
            if ($token === 'NOT(') {
190
                $notgrp_levels[] = ++$parens_level;
191
            } elseif ($token === '(') {
192
                ++$parens_level;
193
            } elseif ($token === ')') {
194
                if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
195
            } elseif (count($notgrp_levels) % 2 === 1) {
196
                // turn highlight-flag off if terms are logically in "NOT" group
197
                $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
198
            }
199
            $parsed_query_new .= $token;
200
        }
201
        $parsed_query = $parsed_query_new;
202
203
        /**
204
         * convert infix notation string into postfix (Reverse Polish notation) array
205
         * by Shunting-yard algorithm
206
         *
207
         * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
208
         * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
209
         */
210
        $parsed_ary     = array();
211
        $ope_stack      = array();
212
        $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
213
        $ope_regex      = '/([()]|OR|AND|NOT)/u';
214
215
        $tokens = preg_split($ope_regex, $parsed_query,
216
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
217
        );
218
        foreach ($tokens as $token) {
219
            if (preg_match($ope_regex, $token)) {
220
                // operator
221
                $last_ope = end($ope_stack);
222
                while ($last_ope !== false
223
                    && $ope_precedence[$token] <= $ope_precedence[$last_ope]
224
                    && $last_ope != '('
225
                ) {
226
                    $parsed_ary[] = array_pop($ope_stack);
227
                    $last_ope = end($ope_stack);
228
                }
229
                if ($token == ')') {
230
                    array_pop($ope_stack); // this array_pop always deletes '('
231
                } else {
232
                    $ope_stack[] = $token;
233
                }
234
            } else {
235
                // operand
236
                $token_decoded = str_replace(['OP','CP'], ['(',')'], $token);
237
                $parsed_ary[] = $token_decoded;
238
            }
239
        }
240
        $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));
241
242
        // cleanup: each double "NOT" in RPN array actually does nothing
243
        $parsed_ary_count = count($parsed_ary);
244
        for ($i = 1; $i < $parsed_ary_count; ++$i) {
245
            if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
246
                unset($parsed_ary[$i], $parsed_ary[$i - 1]);
247
            }
248
        }
249
        $parsed_ary = array_values($parsed_ary);
250
251
        // build return value
252
        $q = array();
253
        $q['query']      = $query;
254
        $q['parsed_str'] = $parsed_query;
255
        $q['parsed_ary'] = $parsed_ary;
256
257
        foreach ($q['parsed_ary'] as $token) {
258
            if ($token[2] !== ':') continue;
259
            $body = substr($token, 3);
260
261
            switch (substr($token, 0, 3)) {
262
                case 'N+:':
263
                     $q['ns'][]        = $body; // for backward compatibility
264
                     break;
265
                case 'N-:':
266
                     $q['notns'][]     = $body; // for backward compatibility
267
                     break;
268
                case 'W_:':
269
                     $q['words'][]     = $body;
270
                     break;
271
                case 'W-:':
272
                     $q['words'][]     = $body;
273
                     $q['not'][]       = $body; // for backward compatibility
274
                     break;
275
                case 'W+:':
276
                     $q['words'][]     = $body;
277
                     $q['highlight'][] = $body;
278
                     $q['and'][]       = $body; // for backward compatibility
279
                     break;
280
                case 'P-:':
281
                     $q['phrases'][]   = $body;
282
                     break;
283
                case 'P+:':
284
                     $q['phrases'][]   = $body;
285
                     $q['highlight'][] = $body;
286
                     break;
287
            }
288
        }
289
        foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) {
290
            $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
291
        }
292
293
        return $q;
294
    }
295
296
    /**
297
     * Recreate a search query string based on parsed parts,
298
     * doesn't support negated phrases and `OR` searches
299
     *
300
     * @param array $and
301
     * @param array $not
302
     * @param array $phrases
303
     * @param array $ns
304
     * @param array $notns
305
     *
306
     * @return string
307
     */
308
    public function revert(array $and, array $not, array $phrases, array $ns, array $notns)
309
    {
310
        $query = implode(' ', $and);
311
312
        if (!empty($not)) {
313
            $query .= ' -' . implode(' -', $not);
314
        }
315
        if (!empty($phrases)) {
316
            $query .= ' "' . implode('" "', $phrases) . '"';
317
        }
318
        if (!empty($ns)) {
319
            $query .= ' @' . implode(' @', $ns);
320
        }
321
        if (!empty($notns)) {
322
            $query .= ' ^' . implode(' ^', $notns);
323
        }
324
        return $query;
325
    }
326
}
327