Completed
Pull Request — master (#3255)
by
unknown
02:46
created

ParallelRegex::getCompoundedRegex()   B

Complexity

Conditions 9
Paths 2

Size

Total Lines 52

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 9
nc 2
nop 1
dl 0
loc 52
rs 7.4917
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

- Extract Method

1
<?php
2
/**
3
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4
 * For an intro to the Lexer see:
5
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6
 *
7
 * @author Marcus Baker http://www.lastcraft.com
8
 * @author Moisés Braga Ribeiro <[email protected]>
9
 */
10
11
namespace dokuwiki\Parsing\Lexer;
12
13
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 *
 * Patterns are kept in two buckets, selected by needsUnicodeAware(): byte-oriented
 * patterns (key false/0) and Unicode-aware patterns (key true/1). Each bucket is
 * compiled lazily into its own compound regex; the Unicode-aware one gets the "u" flag.
 */
class ParallelRegex
{
    /** @var string[][] raw patterns to match, exactly as passed to addPattern() */
    protected $patterns;
    /** @var string[][] labels for above patterns */
    protected $labels;
    /** @var string[] the compound regexes matching all patterns (null entry = needs rebuild) */
    protected $regexes;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->regexes = array();
    }

    /**
     * Adds a pattern with an optional label.
     *
     * The pattern is stored unmodified and the cached compound regex of its bucket
     * is invalidated, so patterns may be added at any time, also after matching.
     *
     * @param mixed $pattern       Perl style regex. Must be UTF-8 encoded. If its a string,
     *                             the (, ) lose their meaning unless they form part of
     *                             a lookahead or lookbehind assertation.
     * @param bool|string $label   Label of regex to be returned on a match. Label must be ASCII
     */
    public function addPattern($pattern, $label = true)
    {
        $unicode = $this->needsUnicodeAware($pattern);
        if (! isset($this->patterns[$unicode])) {
            $this->patterns[$unicode] = array();
            $this->labels[$unicode] = array();
        }
        // pattern and label share the same numeric index within the bucket
        $count = count($this->patterns[$unicode]);
        $this->patterns[$unicode][$count] = $pattern;
        $this->labels[$unicode][$count] = $label;
        $this->regexes[$unicode] = null;
    }

    /**
     * Decides whether the given pattern needs Unicode-aware regex treatment.
     * Reference: https://www.php.net/manual/en/regexp.reference.unicode.php
     *
     * A pattern is Unicode-aware when it contains a byte >= 0x80 or one of the
     * escapes \X, \pL / \PL, \p{...} / \P{...}.
     *
     * @param mixed $pattern       Perl style regex. Must be UTF-8 encoded.
     * @return boolean             True for Unicode-aware, false for byte-oriented.
     *
     * @author Moisés Braga Ribeiro
     */
    protected function needsUnicodeAware($pattern)
    {
        // compare against 1 so a real boolean is returned, as documented
        return preg_match("/[\\x80-\\xFF]|\\\\(X|([pP]([A-Z]|\{\^?[A-Za-z_]+\})))/S", $pattern) === 1;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * Both buckets are tried. If both match, the match starting earlier wins; on
     * equal offsets the longer match wins, with ties going to the byte-oriented one.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of subject.
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Moisés Braga Ribeiro
     */
    public function match($subject, &$match)
    {
        $resultByteOriented = $this->partialMatch($subject, $matchByteOriented, $offsetByteOriented, false);
        $resultUnicodeAware = $this->partialMatch($subject, $matchUnicodeAware, $offsetUnicodeAware, true);
        if (! $resultUnicodeAware) {
            $match = $matchByteOriented;
            return $resultByteOriented;
        }
        if (! $resultByteOriented) {
            $match = $matchUnicodeAware;
            return $resultUnicodeAware;
        }
        // offsets and lengths are both measured in bytes, so they are comparable
        $chooseByteOriented = ($offsetByteOriented < $offsetUnicodeAware) ||
                              ($offsetByteOriented == $offsetUnicodeAware &&
                                  (strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
        $match = $chooseByteOriented ? $matchByteOriented : $matchUnicodeAware;
        return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
    }

    /**
     * Attempts to match all patterns of a certain type at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of subject, "" if nothing matched.
     * @param int $offset          Byte offset of the first matched portion of subject, -1 if nothing matched.
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    protected function partialMatch($subject, &$match, &$offset, $unicode)
    {
        // always leave the out-parameters in a defined state
        $match = "";
        $offset = -1;

        if (empty($this->patterns[$unicode])) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            return false;
        }

        $match = $matches[0][0];
        $offset = $matches[0][1];
        $size = count($matches);
        // group $i belongs to pattern $i - 1
        for ($i = 1; $i < $size; $i++) {
            // With PREG_OFFSET_CAPTURE a group that did not take part in the match has
            // offset -1. Testing the offset instead of the text keeps matches like "0"
            // from being mistaken for "no match".
            if ($matches[$i][1] !== -1 && isset($this->labels[$unicode][$i - 1])) {
                return $this->labels[$unicode][$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * Both buckets are tried. If both match, the split with the shorter pre-match
     * wins; on equal pre-match length the longer match wins, with ties going to
     * the byte-oriented one.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing pre-match, match & post-match strings
     * @return boolean             True on success.
     *
     * @author Moisés Braga Ribeiro
     */
    public function split($subject, &$split)
    {
        $resultByteOriented = $this->partialSplit($subject, $splitByteOriented, false);
        $resultUnicodeAware = $this->partialSplit($subject, $splitUnicodeAware, true);
        if (! $resultUnicodeAware) {
            $split = $splitByteOriented;
            return $resultByteOriented;
        }
        if (! $resultByteOriented) {
            $split = $splitUnicodeAware;
            return $resultUnicodeAware;
        }
        list($preByteOriented, $matchByteOriented, /* $postByteOriented */) = $splitByteOriented;
        list($preUnicodeAware, $matchUnicodeAware, /* $postUnicodeAware */) = $splitUnicodeAware;
        $chooseByteOriented = (strlen($preByteOriented) < strlen($preUnicodeAware)) ||
                              (strlen($preByteOriented) == strlen($preUnicodeAware) &&
                                  (strlen($matchByteOriented) >= strlen($matchUnicodeAware)));
        $split = $chooseByteOriented ? $splitByteOriented : $splitUnicodeAware;
        return $chooseByteOriented ? $resultByteOriented : $resultUnicodeAware;
    }

    /**
     * Attempts to split the string against all patterns of a certain type at once.
     *
     * The pre-match and post-match parts are cut out of the subject using the byte
     * offset reported for the compound match, so no second regex run is needed.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing pre-match, match & post-match strings.
     *                             When nothing matched this is array($subject, "", "").
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return boolean             True on success.
     *
     * @author Christopher Smith
     */
    protected function partialSplit($subject, &$split, $unicode)
    {
        // always leave the out-parameter in a defined state
        $split = array($subject, "", "");

        if (empty($this->patterns[$unicode])) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            $this->reportPcreError();
            return false;
        }

        $match = $matches[0][0];
        $offset = $matches[0][1];
        // the string casts guard against substr() returning false on PHP < 8
        $pre = (string) substr($subject, 0, $offset);
        $post = (string) substr($subject, $offset + strlen($match));
        $split = array($pre, $match, $post);

        // trailing groups that did not match are dropped by preg_match(), so the
        // last group present belongs to the pattern that matched
        $idx = count($matches) - 2;

        return isset($this->labels[$unicode][$idx]) ? $this->labels[$unicode][$idx] : true;
    }

    /**
     * Reports the last PCRE error, if any, through msg().
     *
     * Does nothing when preg_last_error() reports none of the handled error codes.
     */
    private function reportPcreError()
    {
        switch (preg_last_error()) {
            case PREG_BACKTRACK_LIMIT_ERROR:
                msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                break;
            case PREG_RECURSION_LIMIT_ERROR:
                msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                break;
            case PREG_BAD_UTF8_ERROR:
                msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                break;
            case PREG_INTERNAL_ERROR:
                msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                break;
        }
    }

    /**
     * Compounds the patterns into a single regular expression separated with the
     * "or" operator. Caches the regex. Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are not modified, so the regex can be rebuilt safely
     * after further patterns have been added.
     *
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return null|string         The compound regex, null if the bucket has no patterns.
     */
    protected function getCompoundedRegex($unicode)
    {
        if (empty($this->patterns[$unicode])) {
            return null;
        }

        if (! isset($this->regexes[$unicode])) {
            $alternatives = array();
            foreach ($this->patterns[$unicode] as $pattern) {
                // one capturing group per pattern: group n belongs to pattern n - 1
                $alternatives[] = '(' . $this->escapePattern($pattern) . ')';
            }
            $this->regexes[$unicode] = "/" . implode("|", $alternatives) .
                                       "/" . $this->getPerlMatchingFlags($unicode);
        }
        return $this->regexes[$unicode];
    }

    /**
     * Escapes a single pattern for use as one alternative of the compound regex.
     *
     * Plain "(" and ")" are turned into literals, "(?" groups are kept, and "/" is
     * escaped everywhere except in elements that already start with a backslash.
     *
     * NOTE(review): a ")" is taken as the end of a "(?" group whenever such a group
     * is open, even if a plain "(" was opened after it; this mirrors the previous
     * behaviour and is left unchanged here.
     *
     * @param string $pattern      Raw pattern as passed to addPattern().
     * @return string              Escaped pattern, without surrounding group or delimiters.
     */
    private function escapePattern($pattern)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $pattern, $elts);

        $escaped = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            switch ($elt) {
                case '(':
                    $escaped .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $escaped .= '\\';
                    }
                    $escaped .= ')';
                    break;
                case '(?':
                    $level++;
                    $escaped .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) == '\\') {
                        // already an escape sequence, keep as is
                        $escaped .= $elt;
                    } else {
                        // "/" is the delimiter of the compound regex
                        $escaped .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $escaped;
    }

    /**
     * Accessor for perl regex mode flags to use.
     *
     * @param boolean $unicode     True for Unicode-aware, false for byte-oriented.
     * @return string              Perl regex flags.
     */
    protected function getPerlMatchingFlags($unicode)
    {
        $u = ($unicode ? "u" : "");
        $i = ($this->case ? "" : "i");
        return $u . "msS" . $i;
    }
}
285