|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
/** |
|
4
|
|
|
* @package s9e\TextFormatter |
|
5
|
|
|
* @copyright Copyright (c) The s9e authors |
|
6
|
|
|
* @license http://www.opensource.org/licenses/mit-license.php The MIT License |
|
7
|
|
|
*/ |
|
8
|
|
|
namespace s9e\TextFormatter\Configurator\Helpers; |
|
9
|
|
|
|
|
10
|
|
|
use RuntimeException; |
|
11
|
|
|
|
|
12
|
|
|
abstract class RegexpParser
{
	/**
	 * Generate a regexp that matches any single character allowed in a regexp
	 *
	 * This method will generate a regexp that can be used to determine whether a given character
	 * could in theory be allowed in a string that matches the source regexp. For example, the
	 * source regexp /^a+$/D would generate /[a]/ while /^foo\d+$/D would generate /[foo\d]/
	 * whereas the regexp /foo/ would generate // because it's not anchored so any characters could
	 * be found before or after the literal "foo".
	 *
	 * The result is an over-approximation: quantifiers, anchors, assertions and back references
	 * are discarded, and inline options such as (?i) are applied to the whole result.
	 *
	 * @param  string $regexp Source regexp
	 * @return string         Regexp that matches any single character allowed in the source regexp
	 *
	 * @throws RuntimeException if the source regexp cannot be parsed
	 */
	public static function getAllowedCharacterRegexp($regexp)
	{
		$def = self::parse($regexp);

		// Inline options such as (?i) or (?s:...) change what the source regexp can match. They
		// are treated as if they were global modifiers, which can only make the result more
		// permissive. Only the options that matter to this method are kept
		$modifiers = $def['modifiers'];
		foreach ($def['tokens'] as $token)
		{
			$modifiers .= preg_replace('/[^ims]+/', '', $token['options'] ?? '');
		}

		// If the regexp uses the multiline modifier, this regexp can't match the whole string if
		// it contains newlines, so in effect it could allow any content
		if (strpos($modifiers, 'm') !== false)
		{
			return '//';
		}

		// An unanchored regexp can be preceded or followed by anything
		if (substr($def['regexp'], 0, 1) !== '^'
		 || substr($def['regexp'], -1) !== '$')
		{
			return '//';
		}

		// Append a token to mark the end of the regexp
		$def['tokens'][] = [
			'pos'  => strlen($def['regexp']),
			'len'  => 0,
			'type' => 'end'
		];

		$patterns = [];

		// Collect the literal portions of the source regexp while testing for alternations
		$literal = '';
		$pos     = 0;
		$skipPos = 0;
		$depth   = 0;
		foreach ($def['tokens'] as $token)
		{
			// Capture the content between last position and current position. This must happen
			// before $skipPos is moved by the current token, otherwise the literal that precedes
			// an option or an assertion would be lost. $pos is never lower than $skipPos at this
			// point, so tokens located inside of a skipped region never pass this test
			if ($token['pos'] > $pos)
			{
				$tmp = substr($def['regexp'], $pos, $token['pos'] - $pos);

				// Append the content to the literal portion
				$literal .= $tmp;

				// Test for alternations if it's the root of the regexp
				if (!$depth)
				{
					// Remove literal backslashes for convenience
					$tmp = str_replace('\\\\', '', $tmp);

					// Look for an unescaped | that is not followed by ^
					if (preg_match('/(?<!\\\\)\\|(?!\\^)/', $tmp))
					{
						return '//';
					}

					// Look for an unescaped | that is not preceded by $
					if (preg_match('/(?<![$\\\\])\\|/', $tmp))
					{
						return '//';
					}
				}
			}

			// Skip options
			if ($token['type'] === 'option')
			{
				$skipPos = max($skipPos, $token['pos'] + $token['len']);
			}

			// Skip assertions, they do not consume any characters
			if (strpos($token['type'], 'AssertionStart') !== false)
			{
				$endToken = $def['tokens'][$token['endToken']];
				$skipPos  = max($skipPos, $endToken['pos'] + $endToken['len']);
			}

			// Character classes are used as-is unless they're located in a skipped region
			if ($token['pos'] >= $skipPos && $token['type'] === 'characterClass')
			{
				$patterns[] = '[' . $token['content'] . ']';
			}

			// Keep track of the nesting level
			if (substr($token['type'], -5) === 'Start')
			{
				++$depth;
			}
			elseif (substr($token['type'], -3) === 'End')
			{
				--$depth;
			}

			$pos = max($skipPos, $token['pos'] + $token['len']);
		}

		// Test for the presence of an unescaped dot
		if (preg_match('#(?<!\\\\)(?:\\\\\\\\)*\\.#', $literal))
		{
			// A dot that can match a newline can match anything
			if (strpos($modifiers, 's') !== false
			 || strpos($literal, "\n") !== false)
			{
				return '//';
			}

			$patterns[] = '.';

			// Remove unescaped dots
			$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\.#', '$1', $literal);
		}

		// Remove unescaped quantifiers *, + and ?
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[*+?]#', '$1', $literal);

		// Remove unescaped quantifiers {}
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\{[^}]+\\}#', '$1', $literal);

		// Remove backslash assertions \b, \B, \A, \Z, \z and \G, as well as back references
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\\\[bBAZzG1-9]#', '$1', $literal);

		// Remove unescaped ^, | and $
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[$^|]#', '$1', $literal);

		// Escape unescaped - and ] so they are safe to use in a character class
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)([-^\\]])#', '$1\\\\$2', $literal);

		// If the regexp doesn't use PCRE_DOLLAR_ENDONLY, it could end with a \n
		if (strpos($modifiers, 'D') === false)
		{
			$literal .= "\n";
		}

		// Add the literal portion of the regexp to the patterns, as a character class
		if ($literal !== '')
		{
			$patterns[] = '[' . $literal . ']';
		}

		// Test whether this regexp actually matches anything
		if (empty($patterns))
		{
			return '/^$/D';
		}

		// Build the allowed characters regexp
		$regexp = $def['delimiter'] . implode('|', $patterns) . $def['delimiter'];

		// Add the modifiers
		if (strpos($modifiers, 'i') !== false)
		{
			$regexp .= 'i';
		}
		if (strpos($modifiers, 'u') !== false)
		{
			$regexp .= 'u';
		}

		return $regexp;
	}

	/**
	 * Return the name of each capture in given regexp
	 *
	 * The first element (index 0) stands for the whole match and is always an empty string.
	 * Unnamed captures are represented by an empty string as well.
	 *
	 * @param  string   $regexp
	 * @return string[]
	 *
	 * @throws RuntimeException if the regexp cannot be parsed
	 */
	public static function getCaptureNames($regexp)
	{
		$map        = [''];
		$regexpInfo = self::parse($regexp);
		foreach ($regexpInfo['tokens'] as $tok)
		{
			if ($tok['type'] === 'capturingSubpatternStart')
			{
				$map[] = $tok['name'] ?? '';
			}
		}

		return $map;
	}

	/**
	 * Parse a regexp into its delimiter, modifiers, body and a list of structural tokens
	 *
	 * The returned array contains the following elements:
	 *  - "delimiter": the single character used as delimiter
	 *  - "modifiers": the letters found after the closing delimiter
	 *  - "regexp":    the body of the regexp, without delimiters
	 *  - "tokens":    a list of tokens in order of appearance. Each token has a "pos", a "len"
	 *                 and a "type". Tokens are created for character classes, options such as
	 *                 (?i), and the start and end of subpatterns and assertions. Start tokens
	 *                 receive an "endToken" index and a "content" once their end is found
	 *
	 * @param  string $regexp
	 * @return array
	 *
	 * @throws RuntimeException if the delimiters, a bracket or a parenthesis cannot be matched, or
	 *                          if an unsupported type of subpattern is encountered
	 */
	public static function parse($regexp)
	{
		if (!preg_match('#^(.)(.*?)\\1([a-zA-Z]*)$#Ds', $regexp, $m))
		{
			throw new RuntimeException('Could not parse regexp delimiters');
		}

		$ret = [
			'delimiter' => $m[1],
			'modifiers' => $m[3],
			'regexp'    => $m[2],
			'tokens'    => []
		];

		$regexp = $m[2];

		// Indexes of the start tokens whose matching right parenthesis hasn't been seen yet
		$openSubpatterns = [];

		$pos = 0;
		$regexpLen = strlen($regexp);

		while ($pos < $regexpLen)
		{
			switch ($regexp[$pos])
			{
				case '\\':
					// skip next character
					$pos += 2;
					break;

				case '[':
					if (!preg_match('#\\[(.*?(?<!\\\\)(?:\\\\\\\\)*+)\\]((?:[+*][+?]?|\\?)?)#A', $regexp, $m, 0, $pos))
					{
						throw new RuntimeException('Could not find matching bracket from pos ' . $pos);
					}

					$ret['tokens'][] = [
						'pos'         => $pos,
						'len'         => strlen($m[0]),
						'type'        => 'characterClass',
						'content'     => $m[1],
						'quantifiers' => $m[2]
					];

					$pos += strlen($m[0]);
					break;

				case '(':
					if (preg_match('#\\(\\?([a-z]*)\\)#iA', $regexp, $m, 0, $pos))
					{
						// This is an option (?i) so we skip past the right parenthesis
						$ret['tokens'][] = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'option',
							'options' => $m[1]
						];

						$pos += strlen($m[0]);
						break;
					}

					// This should be a subpattern, we just have to sniff which kind
					if (preg_match("#(?J)\\(\\?(?:P?<(?<name>[A-Za-z_0-9]+)>|'(?<name>[A-Za-z_0-9]+)')#A", $regexp, $m, \PREG_OFFSET_CAPTURE, $pos))
					{
						// This is a named capture. Names can contain letters of either case,
						// digits and underscores
						$tok = [
							'pos'  => $pos,
							'len'  => strlen($m[0][0]),
							'type' => 'capturingSubpatternStart',
							'name' => $m['name'][0]
						];

						$pos += strlen($m[0][0]);
					}
					elseif (preg_match('#\\(\\?([a-z]*):#iA', $regexp, $m, 0, $pos))
					{
						// This is a non-capturing subpattern (?:xxx)
						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'nonCapturingSubpatternStart',
							'options' => $m[1]
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?>#iA', $regexp, $m, 0, $pos))
					{
						/* This is a non-capturing subpattern with atomic grouping "(?>x+)" */
						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'nonCapturingSubpatternStart',
							'subtype' => 'atomic'
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?(<?[!=])#A', $regexp, $m, 0, $pos))
					{
						// This is an assertion
						$assertions = [
							'='  => 'lookahead',
							'<=' => 'lookbehind',
							'!'  => 'negativeLookahead',
							'<!' => 'negativeLookbehind'
						];

						$tok = [
							'pos'  => $pos,
							'len'  => strlen($m[0]),
							'type' => $assertions[$m[1]] . 'AssertionStart'
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?#A', $regexp, $m, 0, $pos))
					{
						throw new RuntimeException('Unsupported subpattern type at pos ' . $pos);
					}
					else
					{
						// This should be a normal capture
						$tok = [
							'pos'  => $pos,
							'len'  => 1,
							'type' => 'capturingSubpatternStart'
						];

						++$pos;
					}

					// Remember the index of this token so it can be paired with its end token
					$openSubpatterns[] = count($ret['tokens']);
					$ret['tokens'][]   = $tok;
					break;

				case ')':
					if (empty($openSubpatterns))
					{
						throw new RuntimeException('Could not find matching pattern start for right parenthesis at pos ' . $pos);
					}

					// Add the key to this token to its matching token and capture this subpattern's
					// content
					$k = array_pop($openSubpatterns);
					$startToken =& $ret['tokens'][$k];
					$startToken['endToken'] = count($ret['tokens']);
					$startToken['content']  = substr(
						$regexp,
						$startToken['pos'] + $startToken['len'],
						$pos - ($startToken['pos'] + $startToken['len'])
					);

					// Look for quantifiers after the subpattern, e.g. (?:ab)++
					$spn = strspn($regexp, '+*?', 1 + $pos);
					$quantifiers = substr($regexp, 1 + $pos, $spn);

					$ret['tokens'][] = [
						'pos'         => $pos,
						'len'         => 1 + $spn,
						'type'        => substr($startToken['type'], 0, -5) . 'End',
						'quantifiers' => $quantifiers
					];

					// Break the reference so that the next iteration doesn't overwrite this token
					unset($startToken);

					$pos += 1 + $spn;
					break;

				default:
					++$pos;
			}
		}

		if (!empty($openSubpatterns))
		{
			throw new RuntimeException('Could not find matching pattern end for left parenthesis at pos ' . $ret['tokens'][$openSubpatterns[0]]['pos']);
		}

		return $ret;
	}
}