1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/**
* @package   s9e\TextFormatter
* @copyright Copyright (c) The s9e authors
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
*/
8
|
|
|
namespace s9e\TextFormatter\Configurator\Helpers; |
9
|
|
|
|
10
|
|
|
use RuntimeException; |
11
|
|
|
|
12
|
|
|
abstract class RegexpParser
{
	/**
	* Generate a regexp that matches any single character allowed in a regexp
	*
	* This method will generate a regexp that can be used to determine whether a given character
	* could in theory be allowed in a string that matches the source regexp. The literal portions
	* of the source regexp are collected into a character class, so the source regexp /^a+$/D
	* would generate /[a]/ while /^foo\d+$/D would generate a class made of "f", "o" and \d,
	* whereas the regexp /foo/ would generate // because it's not anchored so any characters
	* could be found before or after the literal "foo".
	*
	* The returned regexp reuses the delimiter of the source regexp and only carries over the
	* "i" and "u" modifiers.
	*
	* @param  string $regexp Source regexp
	* @return string         Regexp that matches any single character allowed in the source
	*                        regexp, '//' if anything could be allowed, or '/^$/D' if nothing is
	*
	* @throws RuntimeException if the source regexp cannot be parsed
	*/
	public static function getAllowedCharacterRegexp($regexp)
	{
		$def = self::parse($regexp);

		// If the regexp uses the multiline modifier, this regexp can't match the whole string if
		// it contains newlines, so in effect it could allow any content
		if (strpos($def['modifiers'], 'm') !== false)
		{
			return '//';
		}

		// A regexp that is not anchored at both ends could match in the middle of any string
		if (substr($def['regexp'], 0, 1) !== '^'
		 || substr($def['regexp'], -1) !== '$')
		{
			return '//';
		}

		// Append a token to mark the end of the regexp
		$def['tokens'][] = [
			'pos'  => strlen($def['regexp']),
			'len'  => 0,
			'type' => 'end'
		];

		$patterns = [];

		// Collect the literal portions of the source regexp while testing for alternations
		$literal = '';
		$pos     = 0;
		$skipPos = 0;
		$depth   = 0;
		foreach ($def['tokens'] as $token)
		{
			// Skip options
			if ($token['type'] === 'option')
			{
				$skipPos = max($skipPos, $token['pos'] + $token['len']);
			}

			// Skip assertions: everything up to the end of the assertion's closing token is
			// ignored because it does not consume any character
			if (strpos($token['type'], 'AssertionStart') !== false)
			{
				$endToken = $def['tokens'][$token['endToken']];
				$skipPos  = max($skipPos, $endToken['pos'] + $endToken['len']);
			}

			if ($token['pos'] >= $skipPos)
			{
				if ($token['type'] === 'characterClass')
				{
					$patterns[] = '[' . $token['content'] . ']';
				}

				if ($token['pos'] > $pos)
				{
					// Capture the content between last position and current position
					$tmp = substr($def['regexp'], $pos, $token['pos'] - $pos);

					// Append the content to the literal portion
					$literal .= $tmp;

					// Test for alternations if it's the root of the regexp
					if (!$depth)
					{
						// Remove literal backslashes for convenience
						$tmp = str_replace('\\\\', '', $tmp);

						// Look for an unescaped | that is not followed by ^
						if (preg_match('/(?<!\\\\)\\|(?!\\^)/', $tmp))
						{
							return '//';
						}

						// Look for an unescaped | that is not preceded by $
						if (preg_match('/(?<![$\\\\])\\|/', $tmp))
						{
							return '//';
						}
					}
				}
			}

			// Keep track of the nesting level, based on the suffix of the token's type
			if (substr($token['type'], -5) === 'Start')
			{
				++$depth;
			}
			elseif (substr($token['type'], -3) === 'End')
			{
				--$depth;
			}

			$pos = max($skipPos, $token['pos'] + $token['len']);
		}

		// Test for the presence of an unescaped dot
		if (preg_match('#(?<!\\\\)(?:\\\\\\\\)*\\.#', $literal))
		{
			// With the "s" modifier (or a literal newline) the dot could match anything
			if (strpos($def['modifiers'], 's') !== false
			 || strpos($literal, "\n") !== false)
			{
				return '//';
			}

			$patterns[] = '.';

			// Remove unescaped dots
			$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\.#', '$1', $literal);
		}

		// Remove unescaped quantifiers *, + and ?
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[*+?]#', '$1', $literal);

		// Remove unescaped quantifiers {}
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\{[^}]+\\}#', '$1', $literal);

		// Remove backslash assertions \b, \B, \A, \Z, \z and \G, as well as back references
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\\\[bBAZzG1-9]#', '$1', $literal);

		// Remove unescaped ^, | and $
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[$^|]#', '$1', $literal);

		// Escape unescaped - and ] so they are safe to use in a character class
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)([-^\\]])#', '$1\\\\$2', $literal);

		// If the regexp doesn't use PCRE_DOLLAR_ENDONLY, it could end with a \n
		if (strpos($def['modifiers'], 'D') === false)
		{
			$literal .= "\n";
		}

		// Add the literal portion of the regexp to the patterns, as a character class
		if ($literal !== '')
		{
			$patterns[] = '[' . $literal . ']';
		}

		// Test whether this regexp actually matches anything
		if (empty($patterns))
		{
			return '/^$/D';
		}

		// Build the allowed characters regexp
		$regexp = $def['delimiter'] . implode('|', $patterns) . $def['delimiter'];

		// Add the modifiers
		if (strpos($def['modifiers'], 'i') !== false)
		{
			$regexp .= 'i';
		}
		if (strpos($def['modifiers'], 'u') !== false)
		{
			$regexp .= 'u';
		}

		return $regexp;
	}

	/**
	* Return the name of each capture in given regexp
	*
	* The returned list is indexed by capture number. Index 0 stands for the whole match and is
	* always an empty string. Unnamed captures are represented by an empty string too.
	*
	* @param  string   $regexp
	* @return string[]
	*
	* @throws RuntimeException if the regexp cannot be parsed
	*/
	public static function getCaptureNames($regexp)
	{
		$map        = [''];
		$regexpInfo = self::parse($regexp);
		foreach ($regexpInfo['tokens'] as $tok)
		{
			if ($tok['type'] === 'capturingSubpatternStart')
			{
				$map[] = $tok['name'] ?? '';
			}
		}

		return $map;
	}

	/**
	* Parse a regexp into its delimiter, modifiers, body and a list of structural tokens
	*
	* The returned array has the following elements:
	*
	*  - "delimiter": the first character of the string, which must also close the regexp
	*  - "modifiers": the letters found after the closing delimiter
	*  - "regexp":    the body of the regexp, between delimiters
	*  - "tokens":    a list of tokens in order of appearance. Every token has a "pos" (offset in
	*                 the body), a "len" and a "type". Character classes have the type
	*                 "characterClass" plus "content" and "quantifiers". Inline options such as
	*                 (?i) have the type "option" plus "options". Subpatterns and assertions are
	*                 described by a pair of tokens whose types end in "Start" and "End"; the
	*                 start token receives "endToken" (index of the matching end token) and
	*                 "content" once its right parenthesis is found, and the end token has
	*                 "quantifiers". Named captures carry their "name".
	*
	* Escaped characters and plain literals do not generate tokens.
	*
	* @param  string $regexp
	* @return array
	*
	* @throws RuntimeException if the delimiters cannot be identified, if brackets or parentheses
	*                          are not balanced, or if a (?...) construct is not supported
	*/
	public static function parse($regexp)
	{
		if (!preg_match('#^(.)(.*?)\\1([a-zA-Z]*)$#Ds', $regexp, $m))
		{
			throw new RuntimeException('Could not parse regexp delimiters');
		}

		$ret = [
			'delimiter' => $m[1],
			'modifiers' => $m[3],
			'regexp'    => $m[2],
			'tokens'    => []
		];

		$regexp = $m[2];

		// Stack of indexes (in $ret['tokens']) of the subpatterns that are still open
		$openSubpatterns = [];

		$pos       = 0;
		$regexpLen = strlen($regexp);

		while ($pos < $regexpLen)
		{
			switch ($regexp[$pos])
			{
				case '\\':
					// skip next character
					$pos += 2;
					break;

				case '[':
					if (!preg_match('#\\[(.*?(?<!\\\\)(?:\\\\\\\\)*+)\\]((?:[+*][+?]?|\\?)?)#A', $regexp, $m, 0, $pos))
					{
						throw new RuntimeException('Could not find matching bracket from pos ' . $pos);
					}

					$ret['tokens'][] = [
						'pos'         => $pos,
						'len'         => strlen($m[0]),
						'type'        => 'characterClass',
						'content'     => $m[1],
						'quantifiers' => $m[2]
					];

					$pos += strlen($m[0]);
					break;

				case '(':
					if (preg_match('#\\(\\?([a-z]*)\\)#iA', $regexp, $m, 0, $pos))
					{
						// This is an option (?i) so we skip past the right parenthesis
						$ret['tokens'][] = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'option',
							'options' => $m[1]
						];

						$pos += strlen($m[0]);
						break;
					}

					// This should be a subpattern, we just have to sniff which kind. Group names
					// may contain uppercase letters, so both cases are accepted in names
					if (preg_match("#(?J)\\(\\?(?:P?<(?<name>[a-zA-Z_0-9]+)>|'(?<name>[a-zA-Z_0-9]+)')#A", $regexp, $m, \PREG_OFFSET_CAPTURE, $pos))
					{
						// This is a named capture
						$tok = [
							'pos'  => $pos,
							'len'  => strlen($m[0][0]),
							'type' => 'capturingSubpatternStart',
							'name' => $m['name'][0]
						];

						$pos += strlen($m[0][0]);
					}
					elseif (preg_match('#\\(\\?([a-z]*):#iA', $regexp, $m, 0, $pos))
					{
						// This is a non-capturing subpattern (?:xxx)
						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'nonCapturingSubpatternStart',
							'options' => $m[1]
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?>#iA', $regexp, $m, 0, $pos))
					{
						// This is a non-capturing subpattern with atomic grouping "(?>x+)"
						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'nonCapturingSubpatternStart',
							'subtype' => 'atomic'
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?(<?[!=])#A', $regexp, $m, 0, $pos))
					{
						// This is an assertion
						$assertions = [
							'='  => 'lookahead',
							'<=' => 'lookbehind',
							'!'  => 'negativeLookahead',
							'<!' => 'negativeLookbehind'
						];

						$tok = [
							'pos'  => $pos,
							'len'  => strlen($m[0]),
							'type' => $assertions[$m[1]] . 'AssertionStart'
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?#A', $regexp, $m, 0, $pos))
					{
						throw new RuntimeException('Unsupported subpattern type at pos ' . $pos);
					}
					else
					{
						// This should be a normal capture
						$tok = [
							'pos'  => $pos,
							'len'  => 1,
							'type' => 'capturingSubpatternStart'
						];

						++$pos;
					}

					// Remember the index of this start token until its end is found
					$openSubpatterns[] = count($ret['tokens']);
					$ret['tokens'][]   = $tok;
					break;

				case ')':
					if (empty($openSubpatterns))
					{
						throw new RuntimeException('Could not find matching pattern start for right parenthesis at pos ' . $pos);
					}

					// Add the key to this token to its matching token and capture this subpattern's
					// content
					$k          = array_pop($openSubpatterns);
					$startToken =& $ret['tokens'][$k];
					$startToken['endToken'] = count($ret['tokens']);
					$startToken['content']  = substr(
						$regexp,
						$startToken['pos'] + $startToken['len'],
						$pos - ($startToken['pos'] + $startToken['len'])
					);

					// Look for quantifiers after the subpattern, e.g. (?:ab)++
					$spn         = strspn($regexp, '+*?', 1 + $pos);
					$quantifiers = substr($regexp, 1 + $pos, $spn);

					// The end token's type mirrors the start token's, e.g. "fooStart" => "fooEnd"
					$ret['tokens'][] = [
						'pos'         => $pos,
						'len'         => 1 + $spn,
						'type'        => substr($startToken['type'], 0, -5) . 'End',
						'quantifiers' => $quantifiers
					];

					// Break the reference so that later writes don't alter the start token
					unset($startToken);

					$pos += 1 + $spn;
					break;

				default:
					++$pos;
			}
		}

		if (!empty($openSubpatterns))
		{
			throw new RuntimeException('Could not find matching pattern end for left parenthesis at pos ' . $ret['tokens'][$openSubpatterns[0]]['pos']);
		}

		return $ret;
	}
}