ParallelRegex::getCompoundedRegex() - Code Metrics - Inspection of "Lexer with support for Unicode character propertie..." - splitbrain/dokuwiki - Measure and Improve Code Quality continuously with Scrutinizer

Failed Conditions

Pull Request — master (#3255)

unknown

created 2020-09-06 16:58 UTC

ParallelRegex::getCompoundedRegex() B

↳ Parent: ParallelRegex

Complexity

Conditions	9
Paths	2

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	9
nc	2
nop	1
dl	0
loc	52
rs	7.4917
c	0
b	0
f	0

How to fix Long Method

<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 * @author Moisés Braga Ribeiro <[email protected]>
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 *
 * Patterns are kept in two groups: those that need Unicode-aware matching
 * (PCRE "u" modifier) and those that can be matched byte-oriented. Each group
 * is compiled into its own alternation and both are tried on every call; the
 * match that starts first wins, ties go to the longer match and then to the
 * byte-oriented group.
 */
class ParallelRegex
{
    /** @var string[][] raw patterns as passed to addPattern(), per group */
    protected $patterns;
    /** @var array[] labels for above patterns (string or true), per group */
    protected $labels;
    /** @var string[] the compound regexes matching all patterns of a group (null when stale) */
    protected $regexes;
    /** @var string[][] escaped patterns wrapped in "(...)", rebuilt together with $regexes */
    protected $compiled;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->regexes = array();
        $this->compiled = array();
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex of its group, which
     * is rebuilt from the raw patterns on the next match or split.
     *
     * @param mixed $pattern       Perl style regex. Must be UTF-8 encoded. If its a string,
     *                             the (, ) lose their meaning unless they form part of
     *                             a lookahead or lookbehind assertation.
     * @param bool|string $label   Label of regex to be returned on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $unicode = $this->needsUnicodeAware($pattern);
        if (! isset($this->patterns[$unicode])) {
            $this->patterns[$unicode] = array();
            $this->labels[$unicode] = array();
        }
        $count = count($this->patterns[$unicode]);
        $this->patterns[$unicode][$count] = $pattern;
        $this->labels[$unicode][$count] = $label;
        $this->regexes[$unicode] = null;
    }

    /**
     * Decides whether the given pattern needs Unicode-aware regex treatment.
     * Reference: https://www.php.net/manual/en/regexp.reference.unicode.php
     *
     * A pattern is considered Unicode-aware when it contains a byte in the
     * range 0x80-0xFF or one of the escapes \X, \pX, \PX, \p{...}, \P{...}.
     *
     * @param mixed $pattern       Perl style regex. Must be UTF-8 encoded.
     * @return bool                True for Unicode-aware, false for byte-oriented.
     *
     * @author Moisés Braga Ribeiro <[email protected]>
     */
    protected function needsUnicodeAware($pattern)
    {
        return (bool) preg_match("/[\\x80-\\xFF]|\\\\(X|([pP]([A-Z]|\{\^?[A-Za-z_]+\})))/S", $pattern);
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * Both pattern groups are tried. If both match, the one with the smaller
     * offset is used; on equal offsets the longer match wins and, if the
     * lengths are equal too, the byte-oriented one.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of subject.
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Moisés Braga Ribeiro <[email protected]>
     */
    public function match($subject, &$match)
    {
        $resultByteOriented = $this->partialMatch($subject, $matchByteOriented, $offsetByteOriented, false);
        $resultUnicodeAware = $this->partialMatch($subject, $matchUnicodeAware, $offsetUnicodeAware, true);
        if (! $resultUnicodeAware) {
            $match = $matchByteOriented;
            return $resultByteOriented;
        }
        if (! $resultByteOriented) {
            $match = $matchUnicodeAware;
            return $resultUnicodeAware;
        }
        $chooseByteOriented = ($offsetByteOriented < $offsetUnicodeAware) ||
                              ($offsetByteOriented == $offsetUnicodeAware &&
                                  (strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
        $match = $chooseByteOriented ? $matchByteOriented : $matchUnicodeAware;
        return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
    }

    /**
     * Attempts to match all patterns of a certain type at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of subject, "" if nothing matched.
     * @param int $offset          Offset of the first matched portion of subject (only set on a match).
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    protected function partialMatch($subject, &$match, &$offset, $unicode)
    {
        // always hand back a string, also when this group has no patterns at all
        $match = "";
        if (! isset($this->patterns[$unicode]) || count($this->patterns[$unicode]) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            return false;
        }

        $match = $matches[0][0];
        $offset = $matches[0][1];
        $size = count($matches);
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            // compare against "" explicitly: a matched text of "0" is falsy in PHP
            // and would otherwise be mistaken for a group that did not take part
            if ($matches[$i][0] !== "" && isset($this->labels[$unicode][$i - 1])) {
                return $this->labels[$unicode][$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * Both pattern groups are tried. If both match, the split with the shorter
     * pre-match is used; on equal pre-match lengths the longer match wins and,
     * if the lengths are equal too, the byte-oriented one.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing pre-match, match & post-match strings
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Moisés Braga Ribeiro <[email protected]>
     */
    public function split($subject, &$split)
    {
        $resultByteOriented = $this->partialSplit($subject, $splitByteOriented, false);
        $resultUnicodeAware = $this->partialSplit($subject, $splitUnicodeAware, true);
        if (! $resultUnicodeAware) {
            $split = $splitByteOriented;
            return $resultByteOriented;
        }
        if (! $resultByteOriented) {
            $split = $splitUnicodeAware;
            return $resultUnicodeAware;
        }
        list($preByteOriented, $matchByteOriented, /* $postByteOriented */) = $splitByteOriented;
        list($preUnicodeAware, $matchUnicodeAware, /* $postUnicodeAware */) = $splitUnicodeAware;
        $chooseByteOriented = (strlen($preByteOriented) < strlen($preUnicodeAware)) ||
                              (strlen($preByteOriented) == strlen($preUnicodeAware) &&
                                  (strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
        $split = $chooseByteOriented ? $splitByteOriented : $splitUnicodeAware;
        return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
    }

    /**
     * Attempts to split the string against all patterns of a certain type at once.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing pre-match, match & post-match strings.
     *                             If nothing matched this is array($subject, "", "").
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <[email protected]>
     */
    protected function partialSplit($subject, &$split, $unicode)
    {
        if (! isset($this->patterns[$unicode]) || count($this->patterns[$unicode]) == 0) {
            // same "nothing matched" shape as below, so callers never see an unset $split
            $split = array($subject, "", "");
            return false;
        }

        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches)) {
            if (function_exists('preg_last_error')) {
                $err = preg_last_error();
                switch ($err) {
                    case PREG_BACKTRACK_LIMIT_ERROR:
                        msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                        break;
                    case PREG_RECURSION_LIMIT_ERROR:
                        msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                        break;
                    case PREG_BAD_UTF8_ERROR:
                        msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                        break;
                    case PREG_INTERNAL_ERROR:
                        msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                        break;
                }
            }

            $split = array($subject, "", "");
            return false;
        }

        // preg_match() drops trailing groups that did not take part, so the last
        // entry of $matches belongs to the pattern that matched
        $idx = count($matches)-2;
        // the compiled pattern is wrapped in "(...)"; the parentheses double as regex delimiters here
        $pattern = $this->compiled[$unicode][$idx] . $this->getPerlMatchingFlags($unicode);
        list($pre, $post) = preg_split($pattern, $subject, 2);
        $split = array($pre, $matches[0], $post);

        return isset($this->labels[$unicode][$idx]) ? $this->labels[$unicode][$idx] : true;
    }

    /**
     * Compounds the patterns into a single regular expression separated with the
     * "or" operator. Caches the regex. Will automatically escape (, ) and / tokens.
     *
     * The raw patterns are left untouched; the escaped forms are stored separately
     * so that rebuilding the regex after a later addPattern() call does not escape
     * already escaped patterns a second time.
     *
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return null|string
     */
    protected function getCompoundedRegex($unicode)
    {
        if (! isset($this->regexes[$unicode])) {
            $this->compiled[$unicode] = array();
            foreach ($this->patterns[$unicode] as $i => $pattern) {
                // one capturing group per pattern, used to find out which pattern matched
                $this->compiled[$unicode][$i] = '(' . $this->escapePattern($pattern) . ')';
            }
            $this->regexes[$unicode] = "/" . implode("|", $this->compiled[$unicode]) .
                                       "/" . $this->getPerlMatchingFlags($unicode);
        }
        return $this->regexes[$unicode];
    }

    /**
     * Escapes a single pattern for use inside the compound regex.
     *
     * Plain "(" and ")" are turned into literals, "(?...)" constructs keep their
     * meaning, and "/" is escaped outside of backslash sequences.
     *
     * @param mixed $pattern       Perl style regex as passed to addPattern().
     * @return string              The escaped pattern, without the surrounding group.
     */
    protected function escapePattern($pattern)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $pattern, $elts);

        $escaped = "";
        $level = 0; // number of currently open "(?" constructs

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            switch ($elt) {
                case '(':
                    $escaped .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $escaped .= '\\';
                    }
                    $escaped .= ')';
                    break;
                case '(?':
                    $level++;
                    $escaped .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) == '\\') {
                        // backslash sequences are passed through untouched
                        $escaped .= $elt;
                    } else {
                        $escaped .= str_replace('/', '\/', $elt);
                    }
            }
        }
        return $escaped;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return string              Perl regex flags.
     */
    protected function getPerlMatchingFlags($unicode)
    {
        $u = ($unicode ? "u" : "");
        $i = ($this->case ? "" : "i");
        return $u . "msS" . $i;
    }
}


1			<?php
2			/**
3			* Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4			* For an intro to the Lexer see:
5			* https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6			*
7			* @author Marcus Baker http://www.lastcraft.com
8			* @author Moisés Braga Ribeiro <[email protected]>
9			*/
10
11			namespace dokuwiki\Parsing\Lexer;
12
13			/**
14			* Compounded regular expression.
15			*
16			* Any of the contained patterns could match and when one does it's label is returned.
17			*/
18			class ParallelRegex
19			{
20			/** @var string[][] patterns to match */
21			protected $patterns;
22			/** @var string[][] labels for above patterns */
23			protected $labels;
24			/** @var string[] the compound regexes matching all patterns */
25			protected $regexes;
26			/** @var bool case sensitive matching? */
27			protected $case;
28
29			/**
30			* Constructor. Starts with no patterns.
31			*
32			* @param boolean $case True for case sensitive, false for insensitive.
33			*/
34			public function __construct($case)
35			{
36			$this->case = $case;
37			$this->patterns = array();
38			$this->labels = array();
39			$this->regexes = array();
40			}
41
42			/**
43			* Adds a pattern with an optional label.
44			*
45			* @param mixed $pattern Perl style regex. Must be UTF-8 encoded. If its a string,
46			* the (, ) lose their meaning unless they form part of
47			* a lookahead or lookbehind assertation.
48			* @param bool\|string $label Label of regex to be returned on a match. Label must be ASCII
49			*/
50			public function addPattern($pattern, $label = true)
51			{
52			$unicode = $this->needsUnicodeAware($pattern);
53			if (! isset($this->patterns[$unicode])) {
54			$this->patterns[$unicode] = array();
55			$this->labels[$unicode] = array();
56			}
57			$count = count($this->patterns[$unicode]);
58			$this->patterns[$unicode][$count] = $pattern;
59			$this->labels[$unicode][$count] = $label;
60			$this->regexes[$unicode] = null;
61			}
62
63			/**
64			* Decides whether the given pattern needs Unicode-aware regex treatment.
65			* Reference: https://www.php.net/manual/en/regexp.reference.unicode.php
66			*
67			* @param mixed $pattern Perl style regex. Must be UTF-8 encoded.
68			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
			0 ignored issues – show Bug introduced 2020-09-06 17:01 UTC by Report Bug Copy Issue Report There is no parameter named `$unicode`. Was it maybe removed? This check looks for PHPDoc comments describing methods or function parameters that do not exist on the corresponding method or function. Consider the following example. The parameter `$italy` is not defined by the method `finale(...)`. /** * @param array $germany * @param array $island * @param array $italy */ function finale($germany, $island) { return "2:1"; } The most likely cause is that the parameter was removed, but the annotation was not. Loading history...
69			*
70			* @author Moisés Braga Ribeiro <[email protected]>
71			*/
72			protected function needsUnicodeAware($pattern)
73			{
74			return preg_match("/[\\x80-\\xFF]\|\\\\(X\|([pP]([A-Z]\|\{\^?[A-Za-z_]+\})))/S", $pattern);
75			}
76
77			/**
78			* Attempts to match all patterns at once against a string.
79			*
80			* @param string $subject String to match against.
81			* @param string $match First matched portion of subject.
82			* @return bool\|string False if no match found, label if label exists, true if not
83			*
84			* @author Moisés Braga Ribeiro <[email protected]>
85			*/
86			public function match($subject, &$match)
87			{
88			$resultByteOriented = $this->partialMatch($subject, $matchByteOriented, $offsetByteOriented, false);
89			$resultUnicodeAware = $this->partialMatch($subject, $matchUnicodeAware, $offsetUnicodeAware, true);
90			if (! $resultUnicodeAware) {
91			$match = $matchByteOriented;
92			return $resultByteOriented;
93			}
94			if (! $resultByteOriented) {
95			$match = $matchUnicodeAware;
96			return $resultUnicodeAware;
97			}
98			$chooseByteOriented = ($offsetByteOriented < $offsetUnicodeAware) \|\|
99			($offsetByteOriented == $offsetUnicodeAware &&
100			(strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
101			$match = $chooseByteOriented ? $matchByteOriented : $matchUnicodeAware;
102			return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
103			}
104
105			/**
106			* Attempts to match all patterns of a certain type at once against a string.
107			*
108			* @param string $subject String to match against.
109			* @param string $match First matched portion of subject.
110			* @param int $offset Offset of the first matched portion of subject.
111			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
112			* @return bool\|string False if no match found, label if label exists, true if not
113			*/
114			protected function partialMatch($subject, &$match, &$offset, $unicode)
115			{
116			if (! isset($this->patterns[$unicode]) \|\| count($this->patterns[$unicode]) == 0) {
117			return false;
118			}
119			if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches, PREG_OFFSET_CAPTURE)) {
120			$match = "";
121			return false;
122			}
123
124			$match = $matches[0][0];
125			$offset = $matches[0][1];
126			$size = count($matches);
127			// FIXME this could be made faster by storing the labels as keys in a hashmap
128			for ($i = 1; $i < $size; $i++) {
129			if ($matches[$i][0] && isset($this->labels[$unicode][$i - 1])) {
130			return $this->labels[$unicode][$i - 1];
131			}
132			}
133			return true;
134			}
135
136			/**
137			* Attempts to split the string against all patterns at once.
138			*
139			* @param string $subject String to match against.
140			* @param array $split The split result: array containing pre-match, match & post-match strings
141			* @return boolean True on success.
142			*
143			* @author Moisés Braga Ribeiro <[email protected]>
144			*/
145			public function split($subject, &$split)
146			{
147			$resultByteOriented = $this->partialSplit($subject, $splitByteOriented, false);
148			$resultUnicodeAware = $this->partialSplit($subject, $splitUnicodeAware, true);
149			if (! $resultUnicodeAware) {
150			$split = $splitByteOriented;
151			return $resultByteOriented;
152			}
153			if (! $resultByteOriented) {
154			$split = $splitUnicodeAware;
155			return $resultUnicodeAware;
156			}
157			list($preByteOriented, $matchByteOriented, /* $postByteOriented */) = $splitByteOriented;
158			list($preUnicodeAware, $matchUnicodeAware, /* $postUnicodeAware */) = $splitUnicodeAware;
159			$chooseByteOriented = (strlen($preByteOriented) < strlen($preUnicodeAware)) \|\|
160			(strlen($preByteOriented) == strlen($preUnicodeAware) &&
161			(strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
162			$split = $chooseByteOriented ? $splitByteOriented : $splitUnicodeAware;
163			return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
164			}
165
166			/**
167			* Attempts to split the string against all patterns of a certain type at once.
168			*
169			* @param string $subject String to match against.
170			* @param array $split The split result: array containing pre-match, match & post-match strings
171			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
172			* @return boolean True on success.
173			*
174			* @author Christopher Smith <[email protected]>
175			*/
176			protected function partialSplit($subject, &$split, $unicode)
177			{
178			if (! isset($this->patterns[$unicode]) \|\| count($this->patterns[$unicode]) == 0) {
179			return false;
180			}
181
182			if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches)) {
183			if (function_exists('preg_last_error')) {
184			$err = preg_last_error();
185			switch ($err) {
186			case PREG_BACKTRACK_LIMIT_ERROR:
187			msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
188			break;
189			case PREG_RECURSION_LIMIT_ERROR:
190			msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
191			break;
192			case PREG_BAD_UTF8_ERROR:
193			msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
194			break;
195			case PREG_INTERNAL_ERROR:
196			msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
197			break;
198			}
199			}
200
201			$split = array($subject, "", "");
202			return false;
203			}
204
205			$idx = count($matches)-2;
206			$pattern = $this->patterns[$unicode][$idx] . $this->getPerlMatchingFlags($unicode);
207			list($pre, $post) = preg_split($pattern, $subject, 2);
208			$split = array($pre, $matches[0], $post);
209
210			return isset($this->labels[$unicode][$idx]) ? $this->labels[$unicode][$idx] : true;
211			}
212
213			/**
214			* Compounds the patterns into a single regular expression separated with the
215			* "or" operator. Caches the regex. Will automatically escape (, ) and / tokens.
216			*
217			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
218			* @return null\|string
219			*/
220			protected function getCompoundedRegex($unicode)
221			{
222			if ($this->regexes[$unicode] == null) {
223			$cnt = count($this->patterns[$unicode]);
224			for ($i = 0; $i < $cnt; $i++) {
225			/*
226			* decompose the input pattern into "(", "(?", ")",
227			* "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
228			* elements.
229			*/
230			preg_match_all('/\\\\.\|' .
231			'\(\?\|' .
232			'[()]\|' .
233			'\[\^?\]?(?:\\\\.\|\[:[^]]*:\]\|[^]\\\\])*\]\|' .
234			'[^[()\\\\]+/', $this->patterns[$unicode][$i], $elts);
235
236			$pattern = "";
237			$level = 0;
238
239			foreach ($elts[0] as $elt) {
240			/*
241			* for "(", ")" remember the nesting level, add "\"
242			* only to the non-"(?" ones.
243			*/
244
245			switch ($elt) {
246			case '(':
247			$pattern .= '\(';
248			break;
249			case ')':
250			if ($level > 0)
251			$level--; /* closing (? */
252			else $pattern .= '\\';
253			$pattern .= ')';
254			break;
255			case '(?':
256			$level++;
257			$pattern .= '(?';
258			break;
259			default:
260			if (substr($elt, 0, 1) == '\\')
261			$pattern .= $elt;
262			else $pattern .= str_replace('/', '\/', $elt);
263			}
264			}
265			$this->patterns[$unicode][$i] = "($pattern)";
266			}
267			$this->regexes[$unicode] = "/" . implode("\|", $this->patterns[$unicode]) .
268			"/" . $this->getPerlMatchingFlags($unicode);
269			}
270			return $this->regexes[$unicode];
271			}
272
273			/**
274			* Accessor for perl regex mode flags to use.
275			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
276			* @return string Perl regex flags.
277			*/
278			protected function getPerlMatchingFlags($unicode)
279			{
280			$u = ($unicode ? "u" : "");
281			$i = ($this->case ? "" : "i");
282			return $u . "msS" . $i;
283			}
284			}
285

splitbrain / dokuwiki

Pull Request — master (#3255)

ParallelRegex::getCompoundedRegex() B

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like