QueryParser - Code Metrics - Inspection of "Refactor fulltext search functions and class Doku_..." - splitbrain/dokuwiki - Measure and Improve Code Quality continuously with Scrutinizer

Failed Conditions

Pull Request — master (#2943)

by Andreas

created 2020-09-10 15:40 UTC

QueryParser C

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	317
Duplicated Lines	0 %

Coupling/Cohesion

Components	0
Dependencies	3

Importance

Changes

Metric	Value
dl	0
loc	317
rs	6.4799
c	0
b	0
f	0
wmc	54
lcom	0
cbo	3

3 Methods

Rating	Name	Size	Complexity
B	termParser()	29	7
F	convert()	232	42
A	revert()	18	5

How to fix Complexity

<?php

namespace dokuwiki\Search;

use dokuwiki\Search\Tokenizer;
use dokuwiki\Utf8;

/**
 * DokuWuki QueryParser class
 */
class QueryParser
{
    /**
     * Transforms given search term into intermediate representation
     *
     * This function is used in QueryParser::convert() and not for general purpose use.
     *
     * @author Kazutaka Miyasaka <[email protected]>
     *
     * @param string       $term
     * @param bool         $consider_asian
     * @param bool         $phrase_mode
     * @return string
     */
    public function termParser($term, $consider_asian = true, $phrase_mode = false)
    {
        $parsed = '';
        if ($consider_asian) {
            // successive asian characters need to be searched as a phrase
            $words = Utf8\Asian::splitAsianWords($term);
            foreach ($words as $word) {
                $phrase_mode = $phrase_mode ? true : Utf8\Asian::isAsianWords($word);
                $parsed .= $this->termParser($word, false, $phrase_mode);
            }
        } else {
            $term_noparen = str_replace(['(',')'], ' ', $term);
            $Tokenizer = Tokenizer::getInstance();
            $words = $Tokenizer->getWords($term_noparen, true);

            // W_: no need to highlight
            if (empty($words)) {
                $parsed = '()'; // important: do not remove
            } elseif ($words[0] === $term) {
                $parsed = '(W+:'.$words[0].')';
            } elseif ($phrase_mode) {
                $term_encoded = str_replace(['(',')'], ['OP','CP'], $term);
                $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
            } else {
                $parsed = '((W+:'.implode(')(W+:', $words).'))';
            }
        }
        return $parsed;
    }

    /**
     * Parses a search query and builds an array of search formulas
     *
     * @author Andreas Gohr <[email protected]>
     * @author Kazutaka Miyasaka <[email protected]>
     *
     * @param string $query search query
     * @return array of search formulas
     */
    public function convert($query)
    {
        /**
         * parse a search query and transform it into intermediate representation
         *
         * in a search query, you can use the following expressions:
         *
         *   words:
         *     include
         *     -exclude
         *   phrases:
         *     "phrase to be included"
         *     -"phrase you want to exclude"
         *   namespaces:
         *     @include:namespace (or ns:include:namespace)
         *     ^exclude:namespace (or -ns:exclude:namespace)
         *   groups:
         *     ()
         *     -()
         *   operators:
         *     and ('and' is the default operator: you can always omit this)
         *     or  (or pipe symbol '|', lower precedence than 'and')
         *
         * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
         *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
         *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
         *      as long as you don't mind hit counts.
         *
         * intermediate representation consists of the following parts:
         *
         *   ( )           - group
         *   AND           - logical and
         *   OR            - logical or
         *   NOT           - logical not
         *   W+:, W-:, W_: - word      (underscore: no need to highlight)
         *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
         *   N+:, N-:      - namespace
         */
        $parsed_query = '';
        $parens_level = 0;
        $terms = preg_split('/(-?".*?")/u', Utf8\PhpString::strtolower($query),
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        foreach ($terms as $term) {
            $parsed = '';
            if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
                // phrase-include and phrase-exclude
                $not = $matches[1] ? 'NOT' : '';
                $parsed = $not . $this->termParser($matches[2], false, true);
            } else {
                // fix incomplete phrase
                $term = str_replace('"', ' ', $term);

                // fix parentheses
                $term = str_replace(')'  , ' ) ', $term);
                $term = str_replace('('  , ' ( ', $term);
                $term = str_replace('- (', ' -(', $term);

                // treat pipe symbols as 'OR' operators
                $term = str_replace('|', ' or ', $term);

                // treat ideographic spaces (U+3000) as search term separators
                // FIXME: some more separators?
                $term = preg_replace('/[ \x{3000}]+/u', ' ',  $term);
                $term = trim($term);
                if ($term === '') continue;

                $tokens = explode(' ', $term);
                foreach ($tokens as $token) {
                    if ($token === '(') {
                        // parenthesis-include-open
                        $parsed .= '(';
                        ++$parens_level;
                    } elseif ($token === '-(') {
                        // parenthesis-exclude-open
                        $parsed .= 'NOT(';
                        ++$parens_level;
                    } elseif ($token === ')') {
                        // parenthesis-any-close
                        if ($parens_level === 0) continue;
                        $parsed .= ')';
                        $parens_level--;
                    } elseif ($token === 'and') {
                        // logical-and (do nothing)
                    } elseif ($token === 'or') {
                        // logical-or
                        $parsed .= 'OR';
                    } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                        // namespace-exclude
                        $parsed .= 'NOT(N+:'.$matches[1].')';
                    } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                        // namespace-include
                        $parsed .= '(N+:'.$matches[1].')';
                    } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                        // word-exclude
                        $parsed .= 'NOT('.$this->termParser($matches[1]).')';
                    } else {
                        // word-include
                        $parsed .= $this->termParser($token);
                    }
                }
            }
            $parsed_query .= $parsed;
        }

        // cleanup (very sensitive)
        $parsed_query .= str_repeat(')', $parens_level);
        do {
            $parsed_query_old = $parsed_query;
            $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
        } while ($parsed_query !== $parsed_query_old);
        $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')'      , $parsed_query);
        $parsed_query = preg_replace('/(OR)+/u'      , 'OR'     , $parsed_query);
        $parsed_query = preg_replace('/\(OR/u'       , '('      , $parsed_query);
        $parsed_query = preg_replace('/^OR|OR$/u'    , ''       , $parsed_query);
        $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);

        // adjustment: make highlightings right
        $parens_level     = 0;
        $notgrp_levels    = array();
        $parsed_query_new = '';
        $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query,
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        foreach ($tokens as $token) {
            if ($token === 'NOT(') {
                $notgrp_levels[] = ++$parens_level;
            } elseif ($token === '(') {
                ++$parens_level;
            } elseif ($token === ')') {
                if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
            } elseif (count($notgrp_levels) % 2 === 1) {
                // turn highlight-flag off if terms are logically in "NOT" group
                $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
            }
            $parsed_query_new .= $token;
        }
        $parsed_query = $parsed_query_new;

        /**
         * convert infix notation string into postfix (Reverse Polish notation) array
         * by Shunting-yard algorithm
         *
         * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
         * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
         */
        $parsed_ary     = array();
        $ope_stack      = array();
        $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
        $ope_regex      = '/([()]|OR|AND|NOT)/u';

        $tokens = preg_split($ope_regex, $parsed_query,
                    -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        foreach ($tokens as $token) {
            if (preg_match($ope_regex, $token)) {
                // operator
                $last_ope = end($ope_stack);
                while ($last_ope !== false
                    && $ope_precedence[$token] <= $ope_precedence[$last_ope]
                    && $last_ope != '('
                ) {
                    $parsed_ary[] = array_pop($ope_stack);
                    $last_ope = end($ope_stack);
                }
                if ($token == ')') {
                    array_pop($ope_stack); // this array_pop always deletes '('
                } else {
                    $ope_stack[] = $token;
                }
            } else {
                // operand
                $token_decoded = str_replace(['OP','CP'], ['(',')'], $token);
                $parsed_ary[] = $token_decoded;
            }
        }
        $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

        // cleanup: each double "NOT" in RPN array actually does nothing
        $parsed_ary_count = count($parsed_ary);
        for ($i = 1; $i < $parsed_ary_count; ++$i) {
            if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
                unset($parsed_ary[$i], $parsed_ary[$i - 1]);
            }
        }
        $parsed_ary = array_values($parsed_ary);

        // build return value
        $q = array();
        $q['query']      = $query;
        $q['parsed_str'] = $parsed_query;
        $q['parsed_ary'] = $parsed_ary;

        foreach ($q['parsed_ary'] as $token) {
            if ($token[2] !== ':') continue;
            $body = substr($token, 3);

            switch (substr($token, 0, 3)) {
                case 'N+:':
                     $q['ns'][]        = $body; // for backward compatibility
                     break;
                case 'N-:':
                     $q['notns'][]     = $body; // for backward compatibility
                     break;
                case 'W_:':
                     $q['words'][]     = $body;
                     break;
                case 'W-:':
                     $q['words'][]     = $body;
                     $q['not'][]       = $body; // for backward compatibility
                     break;
                case 'W+:':
                     $q['words'][]     = $body;
                     $q['highlight'][] = $body;
                     $q['and'][]       = $body; // for backward compatibility
                     break;
                case 'P-:':
                     $q['phrases'][]   = $body;
                     break;
                case 'P+:':
                     $q['phrases'][]   = $body;
                     $q['highlight'][] = $body;
                     break;
            }
        }
        foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) {
            $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
        }

        return $q;
    }

    /**
     * Recreate a search query string based on parsed parts,
     * doesn't support negated phrases and `OR` searches
     *
     * @param array $and
     * @param array $not
     * @param array $phrases
     * @param array $ns
     * @param array $notns
     *
     * @return string
     */
    public function revert(array $and, array $not, array $phrases, array $ns, array $notns)
    {
        $query = implode(' ', $and);

        if (!empty($not)) {
            $query .= ' -' . implode(' -', $not);
        }
        if (!empty($phrases)) {
            $query .= ' "' . implode('" "', $phrases) . '"';
        }
        if (!empty($ns)) {
            $query .= ' @' . implode(' @', $ns);
        }
        if (!empty($notns)) {
            $query .= ' ^' . implode(' ^', $notns);
        }
        return $query;
    }
}


1			<?php
2
3			namespace dokuwiki\Search;
4
5			use dokuwiki\Search\Tokenizer;
6			use dokuwiki\Utf8;
7
8			/**
9			* DokuWuki QueryParser class
10			*/
11			class QueryParser
12			{
13			/**
14			* Transforms given search term into intermediate representation
15			*
16			* This function is used in QueryParser::convert() and not for general purpose use.
17			*
18			* @author Kazutaka Miyasaka <[email protected]>
19			*
20			* @param string $term
21			* @param bool $consider_asian
22			* @param bool $phrase_mode
23			* @return string
24			*/
25			public function termParser($term, $consider_asian = true, $phrase_mode = false)
26			{
27			$parsed = '';
28			if ($consider_asian) {
29			// successive asian characters need to be searched as a phrase
30			$words = Utf8\Asian::splitAsianWords($term);
31			foreach ($words as $word) {
32			$phrase_mode = $phrase_mode ? true : Utf8\Asian::isAsianWords($word);
33			$parsed .= $this->termParser($word, false, $phrase_mode);
34			}
35			} else {
36			$term_noparen = str_replace(['(',')'], ' ', $term);
37			$Tokenizer = Tokenizer::getInstance();
38			$words = $Tokenizer->getWords($term_noparen, true);
39
40			// W_: no need to highlight
41			if (empty($words)) {
42			$parsed = '()'; // important: do not remove
43			} elseif ($words[0] === $term) {
44			$parsed = '(W+:'.$words[0].')';
45			} elseif ($phrase_mode) {
46			$term_encoded = str_replace(['(',')'], ['OP','CP'], $term);
47			$parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))';
48			} else {
49			$parsed = '((W+:'.implode(')(W+:', $words).'))';
50			}
51			}
52			return $parsed;
53			}
54
55			/**
56			* Parses a search query and builds an array of search formulas
57			*
58			* @author Andreas Gohr <[email protected]>
59			* @author Kazutaka Miyasaka <[email protected]>
60			*
61			* @param string $query search query
62			* @return array of search formulas
63			*/
64			public function convert($query)
65			{
66			/**
67			* parse a search query and transform it into intermediate representation
68			*
69			* in a search query, you can use the following expressions:
70			*
71			* words:
72			* include
73			* -exclude
74			* phrases:
75			* "phrase to be included"
76			* -"phrase you want to exclude"
77			* namespaces:
78			* @include:namespace (or ns:include:namespace)
79			* ^exclude:namespace (or -ns:exclude:namespace)
80			* groups:
81			* ()
82			* -()
83			* operators:
84			* and ('and' is the default operator: you can always omit this)
85			* or (or pipe symbol '\|', lower precedence than 'and')
86			*
87			* e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
88			* a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
89			* this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
90			* as long as you don't mind hit counts.
91			*
92			* intermediate representation consists of the following parts:
93			*
94			* ( ) - group
95			* AND - logical and
96			* OR - logical or
97			* NOT - logical not
98			* W+:, W-:, W_: - word (underscore: no need to highlight)
99			* P+:, P-: - phrase (minus sign: logically in NOT group)
100			* N+:, N-: - namespace
101			*/
102			$parsed_query = '';
103			$parens_level = 0;
104			$terms = preg_split('/(-?".*?")/u', Utf8\PhpString::strtolower($query),
105			-1, PREG_SPLIT_DELIM_CAPTURE \| PREG_SPLIT_NO_EMPTY
106			);
107
108			foreach ($terms as $term) {
109			$parsed = '';
110			if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
111			// phrase-include and phrase-exclude
112			$not = $matches[1] ? 'NOT' : '';
113			$parsed = $not . $this->termParser($matches[2], false, true);
114			} else {
115			// fix incomplete phrase
116			$term = str_replace('"', ' ', $term);
117
118			// fix parentheses
119			$term = str_replace(')' , ' ) ', $term);
120			$term = str_replace('(' , ' ( ', $term);
121			$term = str_replace('- (', ' -(', $term);
122
123			// treat pipe symbols as 'OR' operators
124			$term = str_replace('\|', ' or ', $term);
125
126			// treat ideographic spaces (U+3000) as search term separators
127			// FIXME: some more separators?
128			$term = preg_replace('/[ \x{3000}]+/u', ' ', $term);
129			$term = trim($term);
130			if ($term === '') continue;
131
132			$tokens = explode(' ', $term);
133			foreach ($tokens as $token) {
134			if ($token === '(') {
135			// parenthesis-include-open
136			$parsed .= '(';
137			++$parens_level;
138			} elseif ($token === '-(') {
139			// parenthesis-exclude-open
140			$parsed .= 'NOT(';
141			++$parens_level;
142			} elseif ($token === ')') {
143			// parenthesis-any-close
144			if ($parens_level === 0) continue;
145			$parsed .= ')';
146			$parens_level--;
147			} elseif ($token === 'and') {
148			// logical-and (do nothing)
149			} elseif ($token === 'or') {
150			// logical-or
151			$parsed .= 'OR';
152			} elseif (preg_match('/^(?:\^\|-ns:)(.+)$/u', $token, $matches)) {
153			// namespace-exclude
154			$parsed .= 'NOT(N+:'.$matches[1].')';
155			} elseif (preg_match('/^(?:@\|ns:)(.+)$/u', $token, $matches)) {
156			// namespace-include
157			$parsed .= '(N+:'.$matches[1].')';
158			} elseif (preg_match('/^-(.+)$/', $token, $matches)) {
159			// word-exclude
160			$parsed .= 'NOT('.$this->termParser($matches[1]).')';
161			} else {
162			// word-include
163			$parsed .= $this->termParser($token);
164			}
165			}
166			}
167			$parsed_query .= $parsed;
168			}
169
170			// cleanup (very sensitive)
171			$parsed_query .= str_repeat(')', $parens_level);
172			do {
173			$parsed_query_old = $parsed_query;
174			$parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
175			} while ($parsed_query !== $parsed_query_old);
176			$parsed_query = preg_replace('/(NOT\|OR)+\)/u', ')' , $parsed_query);
177			$parsed_query = preg_replace('/(OR)+/u' , 'OR' , $parsed_query);
178			$parsed_query = preg_replace('/\(OR/u' , '(' , $parsed_query);
179			$parsed_query = preg_replace('/^OR\|OR$/u' , '' , $parsed_query);
180			$parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);
181
182			// adjustment: make highlightings right
183			$parens_level = 0;
184			$notgrp_levels = array();
185			$parsed_query_new = '';
186			$tokens = preg_split('/(NOT\(\|[()])/u', $parsed_query,
187			-1, PREG_SPLIT_DELIM_CAPTURE \| PREG_SPLIT_NO_EMPTY
188			);
189			foreach ($tokens as $token) {
190			if ($token === 'NOT(') {
191			$notgrp_levels[] = ++$parens_level;
192			} elseif ($token === '(') {
193			++$parens_level;
194			} elseif ($token === ')') {
195			if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
196			} elseif (count($notgrp_levels) % 2 === 1) {
197			// turn highlight-flag off if terms are logically in "NOT" group
198			$token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
199			}
200			$parsed_query_new .= $token;
201			}
202			$parsed_query = $parsed_query_new;
203
204			/**
205			* convert infix notation string into postfix (Reverse Polish notation) array
206			* by Shunting-yard algorithm
207			*
208			* see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
209			* see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
210			*/
211			$parsed_ary = array();
212			$ope_stack = array();
213			$ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
214			$ope_regex = '/([()]\|OR\|AND\|NOT)/u';
215
216			$tokens = preg_split($ope_regex, $parsed_query,
217			-1, PREG_SPLIT_DELIM_CAPTURE \| PREG_SPLIT_NO_EMPTY
218			);
219			foreach ($tokens as $token) {
220			if (preg_match($ope_regex, $token)) {
221			// operator
222			$last_ope = end($ope_stack);
223			while ($last_ope !== false
224			&& $ope_precedence[$token] <= $ope_precedence[$last_ope]
225			&& $last_ope != '('
226			) {
227			$parsed_ary[] = array_pop($ope_stack);
228			$last_ope = end($ope_stack);
229			}
230			if ($token == ')') {
231			array_pop($ope_stack); // this array_pop always deletes '('
232			} else {
233			$ope_stack[] = $token;
234			}
235			} else {
236			// operand
237			$token_decoded = str_replace(['OP','CP'], ['(',')'], $token);
238			$parsed_ary[] = $token_decoded;
239			}
240			}
241			$parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));
242
243			// cleanup: each double "NOT" in RPN array actually does nothing
244			$parsed_ary_count = count($parsed_ary);
245			for ($i = 1; $i < $parsed_ary_count; ++$i) {
246			if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') {
247			unset($parsed_ary[$i], $parsed_ary[$i - 1]);
248			}
249			}
250			$parsed_ary = array_values($parsed_ary);
251
252			// build return value
253			$q = array();
254			$q['query'] = $query;
255			$q['parsed_str'] = $parsed_query;
256			$q['parsed_ary'] = $parsed_ary;
257
258			foreach ($q['parsed_ary'] as $token) {
259			if ($token[2] !== ':') continue;
260			$body = substr($token, 3);
261
262			switch (substr($token, 0, 3)) {
263			case 'N+:':
264			$q['ns'][] = $body; // for backward compatibility
265			break;
266			case 'N-:':
267			$q['notns'][] = $body; // for backward compatibility
268			break;
269			case 'W_:':
270			$q['words'][] = $body;
271			break;
272			case 'W-:':
273			$q['words'][] = $body;
274			$q['not'][] = $body; // for backward compatibility
275			break;
276			case 'W+:':
277			$q['words'][] = $body;
278			$q['highlight'][] = $body;
279			$q['and'][] = $body; // for backward compatibility
280			break;
281			case 'P-:':
282			$q['phrases'][] = $body;
283			break;
284			case 'P+:':
285			$q['phrases'][] = $body;
286			$q['highlight'][] = $body;
287			break;
288			}
289			}
290			foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) {
291			$q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
292			}
293
294			return $q;
295			}
296
297			/**
298			* Recreate a search query string based on parsed parts,
299			* doesn't support negated phrases and `OR` searches
300			*
301			* @param array $and
302			* @param array $not
303			* @param array $phrases
304			* @param array $ns
305			* @param array $notns
306			*
307			* @return string
308			*/
309			public function revert(array $and, array $not, array $phrases, array $ns, array $notns)
310			{
311			$query = implode(' ', $and);
312
313			if (!empty($not)) {
314			$query .= ' -' . implode(' -', $not);
315			}
316			if (!empty($phrases)) {
317			$query .= ' "' . implode('" "', $phrases) . '"';
318			}
319			if (!empty($ns)) {
320			$query .= ' @' . implode(' @', $ns);
321			}
322			if (!empty($notns)) {
323			$query .= ' ^' . implode(' ^', $notns);
324			}
325			return $query;
326			}
327			}
328

splitbrain / dokuwiki

Pull Request — master (#2943)

QueryParser C

Complexity

Size/Duplication

Coupling/Cohesion

Importance

3 Methods

How to fix Complexity

Complex Class

Duplication Side-by-Side

Filter issues like