Completed
Pull Request — master (#3255)
by
unknown
02:51
created

ParallelRegex::split()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 2
dl 0
loc 8
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4
 * For an intro to the Lexer see:
5
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6
 *
7
 * @author Marcus Baker http://www.lastcraft.com
8
 */
9
10
namespace dokuwiki\Parsing\Lexer;
11
12
/**
 * Compounded regular expression.
 *
 * Any of the contained patterns could match and when one does its label is returned.
 *
 * Patterns are kept in two independent groups, selected by the $unicode flag given to
 * addPattern(): one group is compiled without and one with the PCRE "u" modifier.
 * match() and split() always try the single-byte group first and only fall back to the
 * Unicode-aware group when the single-byte group yields no match.
 */
class ParallelRegex
{
    /**
     * @var string[][] raw patterns exactly as passed to addPattern(); the outer key is the
     *                 $unicode flag (PHP casts the boolean to the integer key 0 or 1)
     */
    protected $patterns;
    /** @var string[][] labels for above patterns, indexed the same way */
    protected $labels;
    /** @var string[] cached compound regex per $unicode flag, null when it has to be rebuilt */
    protected $regexes;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param bool $case True for case sensitive, false for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->regexes = array();
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex of its group, so patterns
     * may be added at any time, also after match() or split() has been used.
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8
     *                             encoded. If it's a string, the (, )
     *                             lose their meaning unless they
     *                             form part of a lookahead or
     *                             lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned
     *                             on a match. Label must be ASCII
     * @param bool        $unicode True for Unicode-aware, false for single-byte treatment.
     */
    public function addPattern($pattern, $label = true, $unicode = false)
    {
        if (! isset($this->patterns[$unicode])) {
            $this->patterns[$unicode] = array();
            $this->labels[$unicode] = array();
        }
        $count = count($this->patterns[$unicode]);
        $this->patterns[$unicode][$count] = $pattern;
        $this->labels[$unicode][$count] = $label;
        // force a rebuild of the compound regex on next use
        $this->regexes[$unicode] = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * The single-byte patterns are tried first; the Unicode-aware ones are only
     * consulted when none of the single-byte patterns matched.
     *
     * @param string $subject String to match against.
     * @param string $match   First matched portion of subject.
     * @return bool|string    False if no match found, label if label exists, true if not
     */
    public function match($subject, &$match)
    {
        $trySingleByte = $this->partialMatch($subject, $match, false);
        if ($trySingleByte !== false) {
            return $trySingleByte;
        }
        return $this->partialMatch($subject, $match, true);
    }

    /**
     * Attempts to match all patterns of one group at once against a string.
     *
     * @param string $subject String to match against.
     * @param string $match   First matched portion of subject. Set to "" when the group
     *                        has patterns but none matched; left untouched when the
     *                        group has no patterns at all.
     * @param bool   $unicode True for Unicode-aware, false for single-byte treatment.
     * @return bool|string    False if no match found, label if label exists, true if not
     */
    protected function partialMatch($subject, &$match, $unicode)
    {
        if (empty($this->patterns[$unicode])) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            // Groups of alternatives that did not take part in the match are reported as
            // empty strings. Compare strictly: a plain truthiness test would also skip a
            // pattern whose matched text is "0".
            if ($matches[$i] !== '' && isset($this->labels[$unicode][$i - 1])) {
                return $this->labels[$unicode][$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once
     *
     * The single-byte patterns are tried first; the Unicode-aware ones are only
     * consulted when none of the single-byte patterns matched.
     *
     * @param string $subject String to match against.
     * @param array  $split   The split result: array containing, pre-match, match & post-match strings
     * @return bool|string    False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <[email protected]>
     */
    public function split($subject, &$split)
    {
        $trySingleByte = $this->partialSplit($subject, $split, false);
        if ($trySingleByte !== false) {
            return $trySingleByte;
        }
        return $this->partialSplit($subject, $split, true);
    }

    /**
     * Attempts to split the string against all patterns of one group at once
     *
     * @param string $subject String to match against.
     * @param array  $split   The split result: array containing, pre-match, match & post-match strings.
     *                        Set to array($subject, "", "") when the group has patterns but
     *                        none matched; left untouched when the group has no patterns.
     * @param bool   $unicode True for Unicode-aware, false for single-byte treatment.
     * @return bool|string    False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <[email protected]>
     */
    protected function partialSplit($subject, &$split, $unicode)
    {
        if (empty($this->patterns[$unicode])) {
            return false;
        }

        $regex = $this->getCompoundedRegex($unicode);
        if (! preg_match($regex, $subject, $matches, PREG_OFFSET_CAPTURE)) {
            $this->reportPregError();

            $split = array($subject, "", "");
            return false;
        }

        // preg_match() drops trailing groups that did not participate, so the last
        // reported group belongs to the alternative that matched.
        $idx = count($matches) - 2;

        // Cut the subject at the byte offset of the match instead of running the single
        // pattern a second time through preg_split().
        list($matched, $offset) = $matches[0];
        $split = array(
            (string) substr($subject, 0, $offset),
            $matched,
            // the cast guards against substr() returning false on old PHP versions
            (string) substr($subject, $offset + strlen($matched)),
        );

        return isset($this->labels[$unicode][$idx]) ? $this->labels[$unicode][$idx] : true;
    }

    /**
     * Shows a message for the PCRE error of the last preg_* call, if there was one.
     *
     * Does nothing when the last call simply found no match.
     */
    private function reportPregError()
    {
        switch (preg_last_error()) {
            case PREG_BACKTRACK_LIMIT_ERROR:
                msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                break;
            case PREG_RECURSION_LIMIT_ERROR:
                msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                break;
            case PREG_BAD_UTF8_ERROR:
                msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                break;
            case PREG_INTERNAL_ERROR:
                msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                break;
        }
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     *
     * The stored patterns are left untouched, so rebuilding the regex after another
     * addPattern() call never escapes a pattern twice.
     *
     * @param bool $unicode True for Unicode-aware, false for single-byte treatment.
     * @return null|string
     */
    protected function getCompoundedRegex($unicode)
    {
        if (! isset($this->regexes[$unicode])) {
            $patterns = isset($this->patterns[$unicode]) ? $this->patterns[$unicode] : array();

            // every pattern becomes one capturing group; the group number identifies
            // which pattern matched
            $alternatives = array();
            foreach ($patterns as $pattern) {
                $alternatives[] = '(' . $this->escapePattern($pattern) . ')';
            }

            $this->regexes[$unicode] = "/" . implode("|", $alternatives) . "/"
                . $this->getPerlMatchingFlags($unicode);
        }
        return $this->regexes[$unicode];
    }

    /**
     * Escapes a single pattern so it can be embedded as one alternative of the compound regex.
     *
     * Plain "(" and ")" are turned into literals, "(?...)" constructs are kept, and "/"
     * is escaped because it is the delimiter of the compound regex. Character classes
     * and backslash escapes are passed through (with "/" escaped inside classes).
     *
     * Note that a ")" always closes the innermost open "(?" group if there is one; only
     * a ")" met outside of any "(?" group is escaped.
     *
     * @param string $pattern the pattern as given to addPattern()
     * @return string the escaped pattern, without surrounding parentheses
     */
    private function escapePattern($pattern)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $pattern, $elts);

        $escaped = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            switch ($elt) {
                case '(':
                    $escaped .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $escaped .= '\\';
                    }
                    $escaped .= ')';
                    break;
                case '(?':
                    $level++;
                    $escaped .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) == '\\') {
                        // already an escape sequence, keep it as it is
                        $escaped .= $elt;
                    } else {
                        $escaped .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $escaped;
    }

    /**
     * Accessor for perl regex mode flags to use.
     *
     * @param bool $unicode True for Unicode-aware, false for single-byte treatment.
     * @return string       Perl regex flags.
     */
    protected function getPerlMatchingFlags($unicode)
    {
        $u = ($unicode ? "u" : "");
        return ($this->case ? $u . "msS" : $u . "msSi");
    }
}
249