Completed
Pull Request — master (#3255)
by
unknown
02:59
created

ParallelRegex::match()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 2
dl 0
loc 8
rs 10
c 0
b 0
f 0
1
<?php
2
/**
3
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
4
 * For an intro to the Lexer see:
5
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
6
 *
7
 * @author Marcus Baker http://www.lastcraft.com
8
 */
9
10
namespace dokuwiki\Parsing\Lexer;
11
12
/**
13
 * Compounded regular expression.
14
 *
15
 * Any of the contained patterns could match and when one does, its label is returned.
16
 */
17
class ParallelRegex
{
    /** @var string[][] raw patterns as handed to addPattern(), grouped by unicode flag (0 = single-byte, 1 = unicode) */
    protected $patterns;
    /** @var (bool|string)[][] labels for above patterns, same grouping and indexes */
    protected $labels;
    /** @var string[][] escaped and parenthesised form of each pattern, rebuilt by getCompoundedRegex() */
    protected $compiled;
    /** @var (string|null)[] the compound regexes matching all patterns of a group, null when stale */
    protected $regexes;
    /** @var bool case sensitive matching? */
    protected $case;

    /**
     * Constructor. Starts with no patterns.
     *
     * @param boolean $case    True for case sensitive, false for insensitive.
     */
    public function __construct($case)
    {
        $this->case = $case;
        $this->patterns = array();
        $this->labels = array();
        $this->compiled = array();
        $this->regexes = array();
    }

    /**
     * Adds a pattern with an optional label.
     *
     * Adding a pattern invalidates the cached compound regex of its group, it is
     * rebuilt from the raw patterns on the next match or split.
     *
     * @param mixed       $pattern Perl style regex. Must be UTF-8 encoded. If it's a string,
     *                             the (, ) lose their meaning unless they form part of
     *                             a lookahead or lookbehind assertion.
     * @param bool|string $label   Label of regex to be returned on a match. Label must be ASCII
     * @param boolean $unicode     True for Unicode-aware, false for single-byte treatment.
     */
    public function addPattern($pattern, $label = true, $unicode = false)
    {
        // normalise so that any truthy/falsy value ends up in one of the two
        // groups that match() and split() actually look at
        $unicode = (bool) $unicode;

        if (! isset($this->patterns[$unicode])) {
            $this->patterns[$unicode] = array();
            $this->labels[$unicode] = array();
        }
        $count = count($this->patterns[$unicode]);
        $this->patterns[$unicode][$count] = $pattern;
        $this->labels[$unicode][$count] = $label;
        $this->regexes[$unicode] = null;
    }

    /**
     * Attempts to match all patterns at once against a string.
     *
     * The single-byte patterns are tried first, the Unicode-aware ones only when
     * none of the single-byte patterns matched.
     * NOTE(review): a single-byte match therefore wins even when a Unicode-aware
     * pattern would match earlier in the subject - confirm this is intended.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of subject.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    public function match($subject, &$match)
    {
        $trySingleByte = $this->partialMatch($subject, $match, false);
        if ($trySingleByte !== false) {
            return $trySingleByte;
        }
        return $this->partialMatch($subject, $match, true);
    }

    /**
     * Attempts to match all patterns of a certain type at once against a string.
     *
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of subject. Set to "" when the group
     *                             has patterns but none matched, left untouched when the
     *                             group has no patterns at all.
     * @param boolean $unicode     True for Unicode-aware, false for single-byte treatment.
     * @return bool|string         False if no match found, label if label exists, true if not
     */
    protected function partialMatch($subject, &$match, $unicode)
    {
        if (! isset($this->patterns[$unicode]) || count($this->patterns[$unicode]) == 0) {
            return false;
        }
        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches)) {
            $match = "";
            return false;
        }

        $match = $matches[0];
        $size = count($matches);
        // FIXME this could be made faster by storing the labels as keys in a hashmap
        for ($i = 1; $i < $size; $i++) {
            // every pattern is one capture group; groups of alternatives that did
            // not take part are empty strings. Compare against '' explicitly, a
            // plain truthiness check would skip a pattern that matched "0".
            if ($matches[$i] !== '' && isset($this->labels[$unicode][$i - 1])) {
                return $this->labels[$unicode][$i - 1];
            }
        }
        return true;
    }

    /**
     * Attempts to split the string against all patterns at once.
     *
     * The single-byte patterns are tried first, the Unicode-aware ones only when
     * none of the single-byte patterns matched.
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <[email protected]>
     */
    public function split($subject, &$split)
    {
        $trySingleByte = $this->partialSplit($subject, $split, false);
        if ($trySingleByte !== false) {
            return $trySingleByte;
        }
        return $this->partialSplit($subject, $split, true);
    }

    /**
     * Attempts to split the string against all patterns of a certain type at once.
     *
     * When the compound regex does not match, a message is emitted through msg()
     * if PCRE reports an error, and $split is set to array($subject, "", "").
     *
     * @param string $subject      String to match against.
     * @param array $split         The split result: array containing, pre-match, match & post-match strings
     * @param boolean $unicode     True for Unicode-aware, false for single-byte treatment.
     * @return bool|string         False if no match found, label if label exists, true if not
     *
     * @author Christopher Smith <[email protected]>
     */
    protected function partialSplit($subject, &$split, $unicode)
    {
        if (! isset($this->patterns[$unicode]) || count($this->patterns[$unicode]) == 0) {
            return false;
        }

        if (! preg_match($this->getCompoundedRegex($unicode), $subject, $matches)) {
            if (function_exists('preg_last_error')) {
                $err = preg_last_error();
                switch ($err) {
                    case PREG_BACKTRACK_LIMIT_ERROR:
                        msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini', -1);
                        break;
                    case PREG_RECURSION_LIMIT_ERROR:
                        msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini', -1);
                        break;
                    case PREG_BAD_UTF8_ERROR:
                        msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin', -1);
                        break;
                    case PREG_INTERNAL_ERROR:
                        msg('A PCRE internal error occured. This might be caused by a faulty plugin', -1);
                        break;
                }
            }

            $split = array($subject, "", "");
            return false;
        }

        // the index of the last reported capture group is used to identify the
        // pattern that produced the match
        $idx = count($matches) - 2;
        // the compiled pattern has the form "(...)": its outer parentheses serve
        // as the regex delimiters here, followed by the matching flags
        $pattern = $this->compiled[$unicode][$idx] . $this->getPerlMatchingFlags($unicode);
        list($pre, $post) = preg_split($pattern, $subject, 2);
        $split = array($pre, $matches[0], $post);

        return isset($this->labels[$unicode][$idx]) ? $this->labels[$unicode][$idx] : true;
    }

    /**
     * Compounds the patterns into a single regular expression separated with the
     * "or" operator. Caches the regex. Will automatically escape (, ) and / tokens.
     *
     * The escaped patterns are kept apart from the raw ones, so rebuilding the
     * regex after another addPattern() call never escapes a pattern twice.
     *
     * @param boolean $unicode     True for Unicode-aware, false for single-byte treatment.
     * @return string
     */
    protected function getCompoundedRegex($unicode)
    {
        if (! isset($this->regexes[$unicode])) {
            $raw = isset($this->patterns[$unicode]) ? $this->patterns[$unicode] : array();

            $this->compiled[$unicode] = array();
            foreach ($raw as $pattern) {
                // one capture group per pattern
                $this->compiled[$unicode][] = '(' . $this->escapePattern($pattern) . ')';
            }

            $this->regexes[$unicode] = "/" . implode("|", $this->compiled[$unicode]) .
                                       "/" . $this->getPerlMatchingFlags($unicode);
        }
        return $this->regexes[$unicode];
    }

    /**
     * Escapes a single raw pattern for use inside the compound regex.
     *
     * Plain "(" and ")" are escaped, "(?" groups and their closing ")" are kept,
     * "/" is escaped everywhere except in already backslash-escaped elements.
     *
     * @param string $raw          The pattern as passed to addPattern()
     * @return string              The escaped pattern, without surrounding parentheses
     */
    protected function escapePattern($raw)
    {
        /*
         * decompose the input pattern into "(", "(?", ")",
         * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
         * elements.
         */
        preg_match_all('/\\\\.|' .
                       '\(\?|' .
                       '[()]|' .
                       '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                       '[^[()\\\\]+/', $raw, $elts);

        $pattern = "";
        $level = 0; // number of currently open "(?" groups

        foreach ($elts[0] as $elt) {
            /*
             * for "(", ")" remember the nesting level, add "\"
             * only to the non-"(?" ones.
             */
            switch ($elt) {
                case '(':
                    $pattern .= '\(';
                    break;
                case ')':
                    if ($level > 0) {
                        $level--; /* closing (? */
                    } else {
                        $pattern .= '\\';
                    }
                    $pattern .= ')';
                    break;
                case '(?':
                    $level++;
                    $pattern .= '(?';
                    break;
                default:
                    if (substr($elt, 0, 1) == '\\') {
                        $pattern .= $elt;
                    } else {
                        $pattern .= str_replace('/', '\/', $elt);
                    }
            }
        }

        return $pattern;
    }

    /**
     * Accessor for perl regex mode flags to use.
     *
     * @param boolean $unicode     True for Unicode-aware, false for single-byte treatment.
     * @return string              Perl regex flags.
     */
    protected function getPerlMatchingFlags($unicode)
    {
        $u = ($unicode ? "u" : "");
        $i = ($this->case ? "" : "i");
        return $u . "msS" . $i;
    }
}
244