Failed Conditions
Pull Request — master (#2943)
by Andreas
03:32
created

QueryParser   C

Complexity

Total Complexity 54

Size/Duplication

Total Lines 317
Duplicated Lines 0 %

Coupling/Cohesion

Components 0
Dependencies 3

Importance

Changes 0
Metric Value
dl 0
loc 317
rs 6.4799
c 0
b 0
f 0
wmc 54
lcom 0
cbo 3

3 Methods

Rating   Name   Duplication   Size   Complexity  
B termParser() 0 29 7
F convert() 0 232 42
A revert() 0 18 5

How to fix   Complexity   

Complex Class

Complex classes like QueryParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use QueryParser, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace dokuwiki\Search;
4
5
use dokuwiki\Search\Tokenizer;
6
use dokuwiki\Utf8;
7
8
/**
9
 * DokuWuki QueryParser class
10
 */
11
class QueryParser
12
{
13
    /**
14
     * Transforms given search term into intermediate representation
15
     *
16
     * This function is used in QueryParser::convert() and not for general purpose use.
17
     *
18
     * @author Kazutaka Miyasaka <[email protected]>
19
     *
20
     * @param string       $term
21
     * @param bool         $consider_asian
22
     * @param bool         $phrase_mode
23
     * @return string
24
     */
25
    public function termParser($term, $consider_asian = true, $phrase_mode = false)
26
    {
27
        $parsed = '';
28
        if ($consider_asian) {
29
            // successive asian characters need to be searched as a phrase
30
            $words = Utf8\Asian::splitAsianWords($term);
31
            foreach ($words as $word) {
32
                $phrase_mode = $phrase_mode ? true : Utf8\Asian::isAsianWords($word);
33
                $parsed .= $this->termParser($word, false, $phrase_mode);
34
            }
35
        } else {
36
            $term_noparen = str_replace(['(',')'], ' ', $term);
37
            $Tokenizer = Tokenizer::getInstance();
38
            $words = $Tokenizer->getWords($term_noparen, true);
39
40
            // W_: no need to highlight
41
            if (empty($words)) {
42
                $parsed = '()'; // important: do not remove
43
            } elseif ($words[0] === $term) {
44
                $parsed = '(W+:'.$words[0].')';
45
            } elseif ($phrase_mode) {
46
                $term_encoded = str_replace(['(',')'], ['OP','CP'], $term);
47
                $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
48
            } else {
49
                $parsed = '((W+:'.implode(')(W+:', $words).'))';
50
            }
51
        }
52
        return $parsed;
53
    }
54
55
    /**
56
     * Parses a search query and builds an array of search formulas
57
     *
58
     * @author Andreas Gohr <[email protected]>
59
     * @author Kazutaka Miyasaka <[email protected]>
60
     *
61
     * @param string $query search query
62
     * @return array of search formulas
63
     */
64
    public function convert($query)
65
    {
66
        /**
67
         * parse a search query and transform it into intermediate representation
68
         *
69
         * in a search query, you can use the following expressions:
70
         *
71
         *   words:
72
         *     include
73
         *     -exclude
74
         *   phrases:
75
         *     "phrase to be included"
76
         *     -"phrase you want to exclude"
77
         *   namespaces:
78
         *     @include:namespace (or ns:include:namespace)
79
         *     ^exclude:namespace (or -ns:exclude:namespace)
80
         *   groups:
81
         *     ()
82
         *     -()
83
         *   operators:
84
         *     and ('and' is the default operator: you can always omit this)
85
         *     or  (or pipe symbol '|', lower precedence than 'and')
86
         *
87
         * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
88
         *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
89
         *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
90
         *      as long as you don't mind hit counts.
91
         *
92
         * intermediate representation consists of the following parts:
93
         *
94
         *   ( )           - group
95
         *   AND           - logical and
96
         *   OR            - logical or
97
         *   NOT           - logical not
98
         *   W+:, W-:, W_: - word      (underscore: no need to highlight)
99
         *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
100
         *   N+:, N-:      - namespace
101
         */
102
        $parsed_query = '';
103
        $parens_level = 0;
104
        $terms = preg_split('/(-?".*?")/u', Utf8\PhpString::strtolower($query),
105
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
106
        );
107
108
        foreach ($terms as $term) {
109
            $parsed = '';
110
            if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
111
                // phrase-include and phrase-exclude
112
                $not = $matches[1] ? 'NOT' : '';
113
                $parsed = $not . $this->termParser($matches[2], false, true);
114
            } else {
115
                // fix incomplete phrase
116
                $term = str_replace('"', ' ', $term);
117
118
                // fix parentheses
119
                $term = str_replace(')'  , ' ) ', $term);
120
                $term = str_replace('('  , ' ( ', $term);
121
                $term = str_replace('- (', ' -(', $term);
122
123
                // treat pipe symbols as 'OR' operators
124
                $term = str_replace('|', ' or ', $term);
125
126
                // treat ideographic spaces (U+3000) as search term separators
127
                // FIXME: some more separators?
128
                $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
129
                $term = trim($term);
130
                if ($term === '') continue;
131
132
                $tokens = explode(' ', $term);
133
                foreach ($tokens as $token) {
134
                    if ($token === '(') {
135
                        // parenthesis-include-open
136
                        $parsed .= '(';
137
                        ++$parens_level;
138
                    } elseif ($token === '-(') {
139
                        // parenthesis-exclude-open
140
                        $parsed .= 'NOT(';
141
                        ++$parens_level;
142
                    } elseif ($token === ')') {
143
                        // parenthesis-any-close
144
                        if ($parens_level === 0) continue;
145
                        $parsed .= ')';
146
                        $parens_level--;
147
                    } elseif ($token === 'and') {
148
                        // logical-and (do nothing)
149
                    } elseif ($token === 'or') {
150
                        // logical-or
151
                        $parsed .= 'OR';
152
                    } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
153
                        // namespace-exclude
154
                        $parsed .= 'NOT(N+:'.$matches[1].')';
155
                    } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
156
                        // namespace-include
157
                        $parsed .= '(N+:'.$matches[1].')';
158
                    } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
159
                        // word-exclude
160
                        $parsed .= 'NOT('.$this->termParser($matches[1]).')';
161
                    } else {
162
                        // word-include
163
                        $parsed .= $this->termParser($token);
164
                    }
165
                }
166
            }
167
            $parsed_query .= $parsed;
168
        }
169
170
        // cleanup (very sensitive)
171
        $parsed_query .= str_repeat(')', $parens_level);
172
        do {
173
            $parsed_query_old = $parsed_query;
174
            $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
175
        } while ($parsed_query !== $parsed_query_old);
176
        $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
177
        $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
178
        $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
179
        $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
180
        $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);
181
182
        // adjustment: make highlightings right
183
        $parens_level     = 0;
184
        $notgrp_levels    = array();
185
        $parsed_query_new = '';
186
        $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query,
187
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
188
        );
189
        foreach ($tokens as $token) {
190
            if ($token === 'NOT(') {
191
                $notgrp_levels[] = ++$parens_level;
192
            } elseif ($token === '(') {
193
                ++$parens_level;
194
            } elseif ($token === ')') {
195
                if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
196
            } elseif (count($notgrp_levels) % 2 === 1) {
197
                // turn highlight-flag off if terms are logically in "NOT" group
198
                $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
199
            }
200
            $parsed_query_new .= $token;
201
        }
202
        $parsed_query = $parsed_query_new;
203
204
        /**
205
         * convert infix notation string into postfix (Reverse Polish notation) array
206
         * by Shunting-yard algorithm
207
         *
208
         * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
209
         * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
210
         */
211
        $parsed_ary     = array();
212
        $ope_stack      = array();
213
        $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
214
        $ope_regex      = '/([()]|OR|AND|NOT)/u';
215
216
        $tokens = preg_split($ope_regex, $parsed_query,
217
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
218
        );
219
        foreach ($tokens as $token) {
220
            if (preg_match($ope_regex, $token)) {
221
                // operator
222
                $last_ope = end($ope_stack);
223
                while ($last_ope !== false
224
                    && $ope_precedence[$token] <= $ope_precedence[$last_ope]
225
                    && $last_ope != '('
226
                ) {
227
                    $parsed_ary[] = array_pop($ope_stack);
228
                    $last_ope = end($ope_stack);
229
                }
230
                if ($token == ')') {
231
                    array_pop($ope_stack); // this array_pop always deletes '('
232
                } else {
233
                    $ope_stack[] = $token;
234
                }
235
            } else {
236
                // operand
237
                $token_decoded = str_replace(['OP','CP'], ['(',')'], $token);
238
                $parsed_ary[] = $token_decoded;
239
            }
240
        }
241
        $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));
242
243
        // cleanup: each double "NOT" in RPN array actually does nothing
244
        $parsed_ary_count = count($parsed_ary);
245
        for ($i = 1; $i < $parsed_ary_count; ++$i) {
246
            if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
247
                unset($parsed_ary[$i], $parsed_ary[$i - 1]);
248
            }
249
        }
250
        $parsed_ary = array_values($parsed_ary);
251
252
        // build return value
253
        $q = array();
254
        $q['query']      = $query;
255
        $q['parsed_str'] = $parsed_query;
256
        $q['parsed_ary'] = $parsed_ary;
257
258
        foreach ($q['parsed_ary'] as $token) {
259
            if ($token[2] !== ':') continue;
260
            $body = substr($token, 3);
261
262
            switch (substr($token, 0, 3)) {
263
                case 'N+:':
264
                     $q['ns'][]        = $body; // for backward compatibility
265
                     break;
266
                case 'N-:':
267
                     $q['notns'][]     = $body; // for backward compatibility
268
                     break;
269
                case 'W_:':
270
                     $q['words'][]     = $body;
271
                     break;
272
                case 'W-:':
273
                     $q['words'][]     = $body;
274
                     $q['not'][]       = $body; // for backward compatibility
275
                     break;
276
                case 'W+:':
277
                     $q['words'][]     = $body;
278
                     $q['highlight'][] = $body;
279
                     $q['and'][]       = $body; // for backward compatibility
280
                     break;
281
                case 'P-:':
282
                     $q['phrases'][]   = $body;
283
                     break;
284
                case 'P+:':
285
                     $q['phrases'][]   = $body;
286
                     $q['highlight'][] = $body;
287
                     break;
288
            }
289
        }
290
        foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) {
291
            $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
292
        }
293
294
        return $q;
295
    }
296
297
    /**
298
     * Recreate a search query string based on parsed parts,
299
     * doesn't support negated phrases and `OR` searches
300
     *
301
     * @param array $and
302
     * @param array $not
303
     * @param array $phrases
304
     * @param array $ns
305
     * @param array $notns
306
     *
307
     * @return string
308
     */
309
    public function revert(array $and, array $not, array $phrases, array $ns, array $notns)
310
    {
311
        $query = implode(' ', $and);
312
313
        if (!empty($not)) {
314
            $query .= ' -' . implode(' -', $not);
315
        }
316
        if (!empty($phrases)) {
317
            $query .= ' "' . implode('" "', $phrases) . '"';
318
        }
319
        if (!empty($ns)) {
320
            $query .= ' @' . implode(' @', $ns);
321
        }
322
        if (!empty($notns)) {
323
            $query .= ' ^' . implode(' ^', $notns);
324
        }
325
        return $query;
326
    }
327
}
328