ParallelRegex::getCompoundedRegex() - Code Metrics - Inspection of "Lexer with support for Unicode character propertie..." - splitbrain/dokuwiki - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#3255)

unknown

created 2020-09-06 17:30 UTC

ParallelRegex::getCompoundedRegex() B

↳ Parent: ParallelRegex

Complexity

Conditions	9
Paths	2

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	9
nc	2
nop	1
dl	0
loc	52
rs	7.4917
c	0
b	0
f	0

How to fix Long Method

<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 * @author Moisés Braga Ribeiro <[email protected]>
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 *
 * Patterns are kept in two groups, keyed by the result of needsUnicodeAware():
 * one compiled with the "u" flag and one matched byte-oriented. match() and split()
 * query both groups and prefer the match that starts first; on a tie the longer
 * match wins and, if still tied, the byte-oriented one.
 */
class ParallelRegex
{
    /** @var string[][] raw patterns exactly as passed to addPattern(), per group */
    protected $patterns;
    /** @var string[][] escaped patterns, each wrapped in "(...)", per group; built on demand */
    protected $compiled = array();
    /** @var string[][] labels for above patterns */
    protected $labels;
    /** @var string[] the compound regexes matching all patterns (null while stale) */
    protected $regexes;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->compiled = array();
        $this->labels = array();
        $this->regexes = array();
    }

    /**
     * Adds a pattern with an optional label.
     *
     * May be called at any time, also after match() or split() have been used:
     * the cached compound regex of the affected group is rebuilt on next use.
     *
     * @param mixed $pattern       Perl style regex. Must be UTF-8 encoded. If it's a string,
     *                             the (, ) lose their meaning unless they form part of
     *                             a lookahead or lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $unicode = $this->needsUnicodeAware($pattern);
        if (! isset($this->patterns[$unicode])) {
            $this->patterns[$unicode] = array();
            $this->labels[$unicode] = array();
        }
        // patterns and labels share the same index within a group
        $count = count($this->patterns[$unicode]);
        $this->patterns[$unicode][$count] = $pattern;
        $this->labels[$unicode][$count] = $label;
        // invalidate the cached compound regex of this group
        $this->regexes[$unicode] = null;
    }

    /**
     * Decides whether the given pattern needs Unicode-aware regex treatment.
     * Reference: https://www.php.net/manual/en/regexp.reference.unicode.php
     *
     * A pattern is considered Unicode-aware when it contains a non-ASCII byte,
     * \X, or a \p / \P character property.
     *
     * @param mixed $pattern       Perl style regex. Must be UTF-8 encoded.
     * @return boolean             True for Unicode-aware, false for byte-oriented.
     *
     * @author Moisés Braga Ribeiro <[email protected]>
     */
    protected function needsUnicodeAware($pattern)
    {
        // cast so the documented boolean is really returned; as an array key
        // it still maps to 0/1 exactly like the int result of preg_match()
        return (bool) preg_match("/[\\x80-\\xFF]|\\\\(X|([pP]([A-Z]|\{\^?[A-Za-z_]+\})))/S", $pattern);
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of subject.
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Moisés Braga Ribeiro <[email protected]>
     */
    public function match($subject, &$match)
    {
        $resultByteOriented = $this->partialMatch($subject, $matchByteOriented, $offsetByteOriented, false);
        $resultUnicodeAware = $this->partialMatch($subject, $matchUnicodeAware, $offsetUnicodeAware, true);
        if (! $resultUnicodeAware) {
            $match = $matchByteOriented;
            return $resultByteOriented;
        }
        if (! $resultByteOriented) {
            $match = $matchUnicodeAware;
            return $resultUnicodeAware;
        }

        // both groups matched: earliest match wins, then the longest one
        $chooseByteOriented = ($offsetByteOriented < $offsetUnicodeAware) ||
                              ($offsetByteOriented === $offsetUnicodeAware &&
                                  (strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
        $match = $chooseByteOriented ? $matchByteOriented : $matchUnicodeAware;
        return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
    }

    /**
     * Attempts to match all patterns of a certain type at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of subject, "" when nothing matched.
     * @param int $offset          Offset of the first matched portion of subject (only set on a match).
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    protected function partialMatch($subject, &$match, &$offset, $unicode)
    {
        $match = "";
        if (empty($this->patterns[$unicode])) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            return false;
        }

        $match = $matches[0][0];
        $offset = $matches[0][1];
        $size = count($matches);
        for ($i = 1; $i < $size; $i++) {
            // with PREG_OFFSET_CAPTURE a group that did not take part in the match
            // has offset -1; testing the text instead would wrongly skip "0" and
            // zero-length matches
            if ($matches[$i][1] !== -1 && isset($this->labels[$unicode][$i - 1])) {
                return $this->labels[$unicode][$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing pre-match, match & post-match strings
     * @return boolean             True on success.
     *
     * @author Moisés Braga Ribeiro <[email protected]>
     */
    public function split($subject, &$split)
    {
        $resultByteOriented = $this->partialSplit($subject, $splitByteOriented, false);
        $resultUnicodeAware = $this->partialSplit($subject, $splitUnicodeAware, true);
        if (! $resultUnicodeAware) {
            $split = $splitByteOriented;
            return $resultByteOriented;
        }
        if (! $resultByteOriented) {
            $split = $splitUnicodeAware;
            return $resultUnicodeAware;
        }

        // both groups matched: the shorter pre-match (earliest match) wins, then the longest match
        list($preByteOriented, $matchByteOriented) = $splitByteOriented;
        list($preUnicodeAware, $matchUnicodeAware) = $splitUnicodeAware;
        $chooseByteOriented = (strlen($preByteOriented) < strlen($preUnicodeAware)) ||
                              (strlen($preByteOriented) === strlen($preUnicodeAware) &&
                                  (strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
        $split = $chooseByteOriented ? $splitByteOriented : $splitUnicodeAware;
        return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
    }

    /**
     * Attempts to split the string against all patterns of a certain type at once.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing pre-match, match & post-match strings;
     *                             array($subject, "", "") when nothing matched
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return boolean             True on success.
     *
     * @author Christopher Smith <[email protected]>
     */
    protected function partialSplit($subject, &$split, $unicode)
    {
        $split = array($subject, "", "");
        if (empty($this->patterns[$unicode])) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches)) {
            $this->reportPregError();
            return false;
        }

        // trailing groups that did not match are not reported by preg_match(),
        // so the last reported group belongs to the pattern that matched
        $idx = count($matches) - 2;
        // the compiled pattern is wrapped in "(...)"; the parentheses double as
        // regex delimiters here, so only the flags need to be appended
        $pattern = $this->compiled[$unicode][$idx] . $this->getPerlMatchingFlags($unicode);
        list($pre, $post) = preg_split($pattern, $subject, 2);
        $split = array($pre, $matches[0], $post);

        return isset($this->labels[$unicode][$idx]) ? $this->labels[$unicode][$idx] : true;
    }

    /**
     * Reports the error of the last PCRE call, if any, through msg().
     *
     * @return void
     */
    protected function reportPregError()
    {
        switch (preg_last_error()) {
            case PREG_BACKTRACK_LIMIT_ERROR:
                msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                break;
            case PREG_RECURSION_LIMIT_ERROR:
                msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                break;
            case PREG_BAD_UTF8_ERROR:
                msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                break;
            case PREG_INTERNAL_ERROR:
                msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                break;
        }
    }

    /**
     * Compounds the patterns into a single regular expression separated with the
     * "or" operator. Caches the regex. Will automatically escape (, ) and / tokens.
     *
     * The raw patterns are left untouched; the escaped versions are stored
     * separately so that rebuilding after a later addPattern() call does not
     * escape already escaped patterns a second time.
     *
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return string
     */
    protected function getCompoundedRegex($unicode)
    {
        if (! isset($this->regexes[$unicode])) {
            $this->compiled[$unicode] = array();
            foreach ($this->patterns[$unicode] as $idx => $raw) {
                $this->compiled[$unicode][$idx] = $this->escapePattern($raw);
            }
            $this->regexes[$unicode] = "/" . implode("|", $this->compiled[$unicode]) .
                                       "/" . $this->getPerlMatchingFlags($unicode);
        }
        return $this->regexes[$unicode];
    }

    /**
     * Escapes a single raw pattern and wraps it in a capturing group.
     *
     * Plain "(" and ")" are escaped, "(?" constructs and their closing ")" are
     * kept, and "/" is escaped outside of backslash sequences.
     *
     * @param string $raw          Pattern as passed to addPattern().
     * @return string              Escaped pattern wrapped in "(...)".
     */
    protected function escapePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $escaped = "";
        $level = 0; // number of currently open "(?" constructs

        foreach ($elts[0] as $elt) {
            if ($elt === '(') {
                $escaped .= '\(';
            } elseif ($elt === ')') {
                if ($level > 0) {
                    $level--; /* closing (? */
                    $escaped .= ')';
                } else {
                    $escaped .= '\)';
                }
            } elseif ($elt === '(?') {
                $level++;
                $escaped .= '(?';
            } elseif (substr($elt, 0, 1) === '\\') {
                // backslash sequences are passed through unchanged
                $escaped .= $elt;
            } else {
                $escaped .= str_replace('/', '\/', $elt);
            }
        }

        return "($escaped)";
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return string              Perl regex flags.
     */
    protected function getPerlMatchingFlags($unicode)
    {
        $u = ($unicode ? "u" : "");
        $i = ($this->case ? "" : "i");
        return $u . "msS" . $i;
    }
}


1			<?php
2			/**
3			* Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4			* For an intro to the Lexer see:
5			* https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6			*
7			* @author Marcus Baker http://www.lastcraft.com
8			* @author Moisés Braga Ribeiro <[email protected]>
9			*/
10
11			namespace dokuwiki\Parsing\Lexer;
12
13			/**
14			* Compounded regular expression.
15			*
16			* Any of the contained patterns could match and when one does its label is returned.
17			*/
18			class ParallelRegex
19			{
20			/** @var string[][] patterns to match */
21			protected $patterns;
22			/** @var string[][] labels for above patterns */
23			protected $labels;
24			/** @var string[] the compound regexes matching all patterns */
25			protected $regexes;
26			/** @var bool case sensitive matching? */
27			protected $case;
28
29			/**
30			* Constructor. Starts with no patterns.
31			*
32			* @param boolean $case True for case sensitive, false for insensitive.
33			*/
34			public function __construct($case)
35			{
36			$this->case = $case;
37			$this->patterns = array();
38			$this->labels = array();
39			$this->regexes = array();
40			}
41
42			/**
43			* Adds a pattern with an optional label.
44			*
45			* @param mixed $pattern Perl style regex. Must be UTF-8 encoded. If it's a string,
46			* the (, ) lose their meaning unless they form part of
47			* a lookahead or lookbehind assertion.
48			* @param bool\|string $label Label of regex to be returned on a match. Label must be ASCII
49			*/
50			public function addPattern($pattern, $label = true)
51			{
52			$unicode = $this->needsUnicodeAware($pattern);
53			if (! isset($this->patterns[$unicode])) {
54			$this->patterns[$unicode] = array();
55			$this->labels[$unicode] = array();
56			}
57			$count = count($this->patterns[$unicode]);
58			$this->patterns[$unicode][$count] = $pattern;
59			$this->labels[$unicode][$count] = $label;
60			$this->regexes[$unicode] = null;
61			}
62
63			/**
64			* Decides whether the given pattern needs Unicode-aware regex treatment.
65			* Reference: https://www.php.net/manual/en/regexp.reference.unicode.php
66			*
67			* @param mixed $pattern Perl style regex. Must be UTF-8 encoded.
68			* @return boolean True for Unicode-aware, false for byte-oriented.
69			*
70			* @author Moisés Braga Ribeiro <[email protected]>
71			*/
72			protected function needsUnicodeAware($pattern)
73			{
74			return preg_match("/[\\x80-\\xFF]\|\\\\(X\|([pP]([A-Z]\|\{\^?[A-Za-z_]+\})))/S", $pattern);
75			}
76
77			/**
78			* Attempts to match all patterns at once against a string.
79			*
80			* @param string $subject String to match against.
81			* @param string $match First matched portion of subject.
82			* @return bool\|string False if no match found, label if label exists, true if not
83			*
84			* @author Moisés Braga Ribeiro <[email protected]>
85			*/
86			public function match($subject, &$match)
87			{
88			$resultByteOriented = $this->partialMatch($subject, $matchByteOriented, $offsetByteOriented, false);
89			$resultUnicodeAware = $this->partialMatch($subject, $matchUnicodeAware, $offsetUnicodeAware, true);
90			if (! $resultUnicodeAware) {
91			$match = $matchByteOriented;
92			return $resultByteOriented;
93			}
94			if (! $resultByteOriented) {
95			$match = $matchUnicodeAware;
96			return $resultUnicodeAware;
97			}
98			$chooseByteOriented = ($offsetByteOriented < $offsetUnicodeAware) \|\|
99			($offsetByteOriented == $offsetUnicodeAware &&
100			(strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
101			$match = $chooseByteOriented ? $matchByteOriented : $matchUnicodeAware;
102			return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
103			}
104
105			/**
106			* Attempts to match all patterns of a certain type at once against a string.
107			*
108			* @param string $subject String to match against.
109			* @param string $match First matched portion of subject.
110			* @param int $offset Offset of the first matched portion of subject.
111			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
112			* @return bool\|string False if no match found, label if label exists, true if not
113			*/
114			protected function partialMatch($subject, &$match, &$offset, $unicode)
115			{
116			if (! isset($this->patterns[$unicode]) \|\| count($this->patterns[$unicode]) == 0) {
117			return false;
118			}
119			if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches, PREG_OFFSET_CAPTURE)) {
120			$match = "";
121			return false;
122			}
123
124			$match = $matches[0][0];
125			$offset = $matches[0][1];
126			$size = count($matches);
127			// FIXME this could be made faster by storing the labels as keys in a hashmap
128			for ($i = 1; $i < $size; $i++) {
129			if ($matches[$i][0] && isset($this->labels[$unicode][$i - 1])) {
130			return $this->labels[$unicode][$i - 1];
131			}
132			}
133			return true;
134			}
135
136			/**
137			* Attempts to split the string against all patterns at once.
138			*
139			* @param string $subject String to match against.
140			* @param array $split The split result: array containing pre-match, match & post-match strings
141			* @return boolean True on success.
142			*
143			* @author Moisés Braga Ribeiro <[email protected]>
144			*/
145			public function split($subject, &$split)
146			{
147			$resultByteOriented = $this->partialSplit($subject, $splitByteOriented, false);
148			$resultUnicodeAware = $this->partialSplit($subject, $splitUnicodeAware, true);
149			if (! $resultUnicodeAware) {
150			$split = $splitByteOriented;
151			return $resultByteOriented;
152			}
153			if (! $resultByteOriented) {
154			$split = $splitUnicodeAware;
155			return $resultUnicodeAware;
156			}
157			list($preByteOriented, $matchByteOriented, /* $postByteOriented */) = $splitByteOriented;
158			list($preUnicodeAware, $matchUnicodeAware, /* $postUnicodeAware */) = $splitUnicodeAware;
159			$chooseByteOriented = (strlen($preByteOriented) < strlen($preUnicodeAware)) \|\|
160			(strlen($preByteOriented) == strlen($preUnicodeAware) &&
161			(strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
162			$split = $chooseByteOriented ? $splitByteOriented : $splitUnicodeAware;
163			return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
164			}
165
166			/**
167			* Attempts to split the string against all patterns of a certain type at once.
168			*
169			* @param string $subject String to match against.
170			* @param array $split The split result: array containing pre-match, match & post-match strings
171			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
172			* @return boolean True on success.
173			*
174			* @author Christopher Smith <[email protected]>
175			*/
176			protected function partialSplit($subject, &$split, $unicode)
177			{
178			if (! isset($this->patterns[$unicode]) \|\| count($this->patterns[$unicode]) == 0) {
179			return false;
180			}
181
182			if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches)) {
183			if (function_exists('preg_last_error')) {
184			$err = preg_last_error();
185			switch ($err) {
186			case PREG_BACKTRACK_LIMIT_ERROR:
187			msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
188			break;
189			case PREG_RECURSION_LIMIT_ERROR:
190			msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
191			break;
192			case PREG_BAD_UTF8_ERROR:
193			msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
194			break;
195			case PREG_INTERNAL_ERROR:
196			msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
197			break;
198			}
199			}
200
201			$split = array($subject, "", "");
202			return false;
203			}
204
205			$idx = count($matches)-2;
206			$pattern = $this->patterns[$unicode][$idx] . $this->getPerlMatchingFlags($unicode);
207			list($pre, $post) = preg_split($pattern, $subject, 2);
208			$split = array($pre, $matches[0], $post);
209
210			return isset($this->labels[$unicode][$idx]) ? $this->labels[$unicode][$idx] : true;
211			}
212
213			/**
214			* Compounds the patterns into a single regular expression separated with the
215			* "or" operator. Caches the regex. Will automatically escape (, ) and / tokens.
216			*
217			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
218			* @return null\|string
219			*/
220			protected function getCompoundedRegex($unicode)
221			{
222			if ($this->regexes[$unicode] == null) {
223			$cnt = count($this->patterns[$unicode]);
224			for ($i = 0; $i < $cnt; $i++) {
225			/*
226			* decompose the input pattern into "(", "(?", ")",
227			* "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
228			* elements.
229			*/
230			preg_match_all('/\\\\.\|' .
231			'\(\?\|' .
232			'[()]\|' .
233			'\[\^?\]?(?:\\\\.\|\[:[^]]*:\]\|[^]\\\\])*\]\|' .
234			'[^[()\\\\]+/', $this->patterns[$unicode][$i], $elts);
235
236			$pattern = "";
237			$level = 0;
238
239			foreach ($elts[0] as $elt) {
240			/*
241			* for "(", ")" remember the nesting level, add "\"
242			* only to the non-"(?" ones.
243			*/
244
245			switch ($elt) {
246			case '(':
247			$pattern .= '\(';
248			break;
249			case ')':
250			if ($level > 0)
251			$level--; /* closing (? */
252			else $pattern .= '\\';
253			$pattern .= ')';
254			break;
255			case '(?':
256			$level++;
257			$pattern .= '(?';
258			break;
259			default:
260			if (substr($elt, 0, 1) == '\\')
261			$pattern .= $elt;
262			else $pattern .= str_replace('/', '\/', $elt);
263			}
264			}
265			$this->patterns[$unicode][$i] = "($pattern)";
266			}
267			$this->regexes[$unicode] = "/" . implode("\|", $this->patterns[$unicode]) .
268			"/" . $this->getPerlMatchingFlags($unicode);
269			}
270			return $this->regexes[$unicode];
271			}
272
273			/**
274			* Accessor for perl regex mode flags to use.
275			* @param boolean $unicode True for Unicode-aware, false for byte-oriented.
276			* @return string Perl regex flags.
277			*/
278			protected function getPerlMatchingFlags($unicode)
279			{
280			$u = ($unicode ? "u" : "");
281			$i = ($this->case ? "" : "i");
282			return $u . "msS" . $i;
283			}
284			}
285

splitbrain / dokuwiki

Pull Request — master (#3255)

ParallelRegex::getCompoundedRegex() B

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like