RegexpParser::getAllowedCharacterRegexp()   F
last analyzed

Complexity

Conditions 23
Paths 3503

Size

Total Lines 156
Code Lines 61

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 62
CRAP Score 23

Importance

Changes 0
Metric Value
eloc 61
dl 0
loc 156
ccs 62
cts 62
cp 1
rs 0
c 0
b 0
f 0
cc 23
nc 3503
nop 1
crap 23

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive, commented part of the method body into a new, well-named method.

1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) The s9e authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Configurator\Helpers;
9
10
use RuntimeException;
11
12
/**
* Static helpers that tokenize a PCRE regexp (delimiters included) and derive information from
* the resulting tokens
*/
abstract class RegexpParser
{
	/**
	* Generate a regexp that matches any single character allowed in a regexp
	*
	* This method will generate a regexp that can be used to determine whether a given character
	* could in theory be allowed in a string that matches the source regexp. For example, the source
	* regexp /^a+$/D would generate /[a]/ while /^foo\d+$/D would generate /[foo\d]/ whereas the
	* regexp /foo/ would generate // because it's not anchored so any characters could be found
	* before or after the literal "foo".
	*
	* The empty regexp // is returned whenever the source regexp cannot be proven to constrain the
	* whole string: multiline modifier, missing ^ or $ anchors, an escaped trailing \$, top-level
	* alternations that are not individually anchored, or a dot that can match newlines.
	*
	* @param  string $regexp Source regexp
	* @return string         Regexp that matches any single character allowed in the source regexp
	*
	* @throws RuntimeException if the source regexp cannot be parsed
	*/
	public static function getAllowedCharacterRegexp($regexp)
	{
		$def = self::parse($regexp);

		// If the regexp uses the multiline modifier, this regexp can't match the whole string if
		// it contains newlines, so in effect it could allow any content
		if (strpos($def['modifiers'], 'm') !== false)
		{
			return '//';
		}

		// The regexp must be anchored at both ends to constrain the whole string
		if (substr($def['regexp'], 0, 1) !== '^'
		 || substr($def['regexp'], -1)   !== '$')
		{
			return '//';
		}

		// A trailing $ preceded by an odd number of backslashes is a literal dollar sign, not an
		// anchor, which means the regexp is not anchored at its end
		$beforeDollar = substr($def['regexp'], 0, -1);
		$backslashCnt = strlen($beforeDollar) - strlen(rtrim($beforeDollar, '\\'));
		if ($backslashCnt % 2)
		{
			return '//';
		}

		// Append a token to mark the end of the regexp
		$def['tokens'][] = [
			'pos'  => strlen($def['regexp']),
			'len'  => 0,
			'type' => 'end'
		];

		$patterns = [];

		// Collect the literal portions of the source regexp while testing for alternations
		//
		// $pos is the position right after the last token seen, $skipPos is the position before
		// which everything is ignored (options and assertions) and $depth is the number of
		// subpatterns currently open
		$literal = '';
		$pos     = 0;
		$skipPos = 0;
		$depth   = 0;
		foreach ($def['tokens'] as $token)
		{
			// Skip options
			if ($token['type'] === 'option')
			{
				$skipPos = max($skipPos, $token['pos'] + $token['len']);
			}

			// Skip assertions, all the way to the end of their closing token
			if (strpos($token['type'], 'AssertionStart') !== false)
			{
				$endToken = $def['tokens'][$token['endToken']];
				$skipPos  = max($skipPos, $endToken['pos'] + $endToken['len']);
			}

			if ($token['pos'] >= $skipPos)
			{
				// Character classes are reused as-is in the generated regexp
				if ($token['type'] === 'characterClass')
				{
					$patterns[] = '[' . $token['content'] . ']';
				}

				if ($token['pos'] > $pos)
				{
					// Capture the content between last position and current position
					$tmp = substr($def['regexp'], $pos, $token['pos'] - $pos);

					// Append the content to the literal portion
					$literal .= $tmp;

					// Test for alternations if it's the root of the regexp
					if (!$depth)
					{
						// Remove literal backslashes for convenience
						$tmp = str_replace('\\\\', '', $tmp);

						// Look for an unescaped | that is not followed by ^
						if (preg_match('/(?<!\\\\)\\|(?!\\^)/', $tmp))
						{
							return '//';
						}

						// Look for an unescaped | that is not preceded by $
						if (preg_match('/(?<![$\\\\])\\|/', $tmp))
						{
							return '//';
						}
					}
				}
			}

			// Keep track of the nesting level using the suffix of the token's type
			if (substr($token['type'], -5) === 'Start')
			{
				++$depth;
			}
			elseif (substr($token['type'], -3) === 'End')
			{
				--$depth;
			}

			$pos = max($skipPos, $token['pos'] + $token['len']);
		}

		// Test for the presence of an unescaped dot
		if (preg_match('#(?<!\\\\)(?:\\\\\\\\)*\\.#', $literal))
		{
			// A dot that can match newlines, or a literal newline alongside a dot, means any
			// content could be allowed
			if (strpos($def['modifiers'], 's') !== false
			 || strpos($literal, "\n") !== false)
			{
				return '//';
			}

			$patterns[] = '.';

			// Remove unescaped dots
			$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\.#', '$1', $literal);
		}

		// Remove unescaped quantifiers *, + and ?
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[*+?]#', '$1', $literal);

		// Remove unescaped quantifiers {}
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\{[^}]+\\}#', '$1', $literal);

		// Remove backslash assertions \b, \B, \A, \Z, \z and \G, as well as back references
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)\\\\[bBAZzG1-9]#', '$1', $literal);

		// Remove unescaped ^, | and $
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)[$^|]#', '$1', $literal);

		// Escape unescaped - and ] so they are safe to use in a character class
		$literal = preg_replace('#(?<!\\\\)((?:\\\\\\\\)*)([-^\\]])#', '$1\\\\$2', $literal);

		// If the regexp doesn't use PCRE_DOLLAR_ENDONLY, it could end with a \n
		if (strpos($def['modifiers'], 'D') === false)
		{
			$literal .= "\n";
		}

		// Add the literal portion of the regexp to the patterns, as a character class
		if ($literal !== '')
		{
			$patterns[] = '[' . $literal . ']';
		}

		// Test whether this regexp actually matches anything
		if (empty($patterns))
		{
			return '/^$/D';
		}

		// Build the allowed characters regexp
		$regexp = $def['delimiter'] . implode('|', $patterns) . $def['delimiter'];

		// Add the modifiers that change which characters are matched
		if (strpos($def['modifiers'], 'i') !== false)
		{
			$regexp .= 'i';
		}
		if (strpos($def['modifiers'], 'u') !== false)
		{
			$regexp .= 'u';
		}

		return $regexp;
	}

	/**
	* Return the name of each capture in given regexp
	*
	* Will return an empty string for unnamed captures. The first element of the returned array
	* is always an empty string, so a subpattern's number can be used as index.
	*
	* @param  string   $regexp
	* @return string[]
	*
	* @throws RuntimeException if the regexp cannot be parsed
	*/
	public static function getCaptureNames($regexp)
	{
		$map        = [''];
		$regexpInfo = self::parse($regexp);
		foreach ($regexpInfo['tokens'] as $tok)
		{
			if ($tok['type'] === 'capturingSubpatternStart')
			{
				$map[] = $tok['name'] ?? '';
			}
		}

		return $map;
	}

	/**
	* Split a regexp into its delimiter, modifiers, body and a list of tokens
	*
	* The returned array contains the following elements:
	*
	*  - delimiter: the character that starts and ends the regexp
	*  - modifiers: the letters that follow the closing delimiter
	*  - regexp:    the content found between the delimiters
	*  - tokens:    a list of tokens in order of appearance
	*
	* Every token has a "pos" (offset in the regexp's body), a "len" and a "type". Character classes
	* have a "content" and "quantifiers", options have "options", subpattern and assertion starts
	* receive an "endToken" (key of the matching end token) and a "content" once they are closed,
	* and the corresponding end tokens have "quantifiers". Named captures have a "name".
	*
	* @param  string $regexp
	* @return array
	*
	* @throws RuntimeException if the delimiters, a character class or a subpattern cannot be parsed
	*/
	public static function parse($regexp)
	{
		if (!preg_match('#^(.)(.*?)\\1([a-zA-Z]*)$#Ds', $regexp, $m))
		{
			throw new RuntimeException('Could not parse regexp delimiters');
		}

		$ret = [
			'delimiter' => $m[1],
			'modifiers' => $m[3],
			'regexp'    => $m[2],
			'tokens'    => []
		];

		$regexp = $m[2];

		// Keys of the start tokens whose subpattern has not been closed yet
		$openSubpatterns = [];

		$pos = 0;
		$regexpLen = strlen($regexp);

		while ($pos < $regexpLen)
		{
			switch ($regexp[$pos])
			{
				case '\\':
					// skip next character
					$pos += 2;
					break;

				case '[':
					if (!preg_match('#\\[(.*?(?<!\\\\)(?:\\\\\\\\)*+)\\]((?:[+*][+?]?|\\?)?)#A', $regexp, $m, 0, $pos))
					{
						throw new RuntimeException('Could not find matching bracket from pos ' . $pos);
					}

					$ret['tokens'][] = [
						'pos'         => $pos,
						'len'         => strlen($m[0]),
						'type'        => 'characterClass',
						'content'     => $m[1],
						'quantifiers' => $m[2]
					];

					$pos += strlen($m[0]);
					break;

				case '(':
					if (preg_match('#\\(\\?([a-z]*)\\)#iA', $regexp, $m, 0, $pos))
					{
						// This is an option (?i) so we skip past the right parenthesis
						$ret['tokens'][] = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'option',
							'options' => $m[1]
						];

						$pos += strlen($m[0]);
						break;
					}

					// This should be a subpattern, we just have to sniff which kind
					//
					// PCRE allows both lowercase and uppercase letters in the name of a capture
					if (preg_match("#(?J)\\(\\?(?:P?<(?<name>[A-Za-z_0-9]+)>|'(?<name>[A-Za-z_0-9]+)')#A", $regexp, $m, \PREG_OFFSET_CAPTURE, $pos))
					{
						// This is a named capture
						$tok = [
							'pos'  => $pos,
							'len'  => strlen($m[0][0]),
							'type' => 'capturingSubpatternStart',
							'name' => $m['name'][0]
						];

						$pos += strlen($m[0][0]);
					}
					elseif (preg_match('#\\(\\?([a-z]*):#iA', $regexp, $m, 0, $pos))
					{
						// This is a non-capturing subpattern (?:xxx)
						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'nonCapturingSubpatternStart',
							'options' => $m[1]
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?>#iA', $regexp, $m, 0, $pos))
					{
						/* This is a non-capturing subpattern with atomic grouping "(?>x+)" */
						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => 'nonCapturingSubpatternStart',
							'subtype' => 'atomic'
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?(<?[!=])#A', $regexp, $m, 0, $pos))
					{
						// This is an assertion
						$assertions = [
							'='  => 'lookahead',
							'<=' => 'lookbehind',
							'!'  => 'negativeLookahead',
							'<!' => 'negativeLookbehind'
						];

						$tok = [
							'pos'     => $pos,
							'len'     => strlen($m[0]),
							'type'    => $assertions[$m[1]] . 'AssertionStart'
						];

						$pos += strlen($m[0]);
					}
					elseif (preg_match('#\\(\\?#A', $regexp, $m, 0, $pos))
					{
						throw new RuntimeException('Unsupported subpattern type at pos ' . $pos);
					}
					else
					{
						// This should be a normal capture
						$tok = [
							'pos'  => $pos,
							'len'  => 1,
							'type' => 'capturingSubpatternStart'
						];

						++$pos;
					}

					// Record the key of this token so it can be paired with its closing parenthesis
					$openSubpatterns[] = count($ret['tokens']);
					$ret['tokens'][] = $tok;
					break;

				case ')':
					if (empty($openSubpatterns))
					{
						throw new RuntimeException('Could not find matching pattern start for right parenthesis at pos ' . $pos);
					}

					// Add the key to this token to its matching token and capture this subpattern's
					// content
					$k = array_pop($openSubpatterns);
					$startToken =& $ret['tokens'][$k];
					$startToken['endToken'] = count($ret['tokens']);
					$startToken['content']  = substr(
						$regexp,
						$startToken['pos'] + $startToken['len'],
						$pos - ($startToken['pos'] + $startToken['len'])
					);

					// Look for quantifiers after the subpattern, e.g. (?:ab)++
					$spn = strspn($regexp, '+*?', 1 + $pos);
					$quantifiers = substr($regexp, 1 + $pos, $spn);

					// The end token's type mirrors the start token's, with "End" replacing "Start"
					$ret['tokens'][] = [
						'pos'  => $pos,
						'len'  => 1 + $spn,
						'type' => substr($startToken['type'], 0, -5) . 'End',
						'quantifiers' => $quantifiers
					];

					// Break the reference so that later assignments don't overwrite the token
					unset($startToken);

					$pos += 1 + $spn;
					break;

				default:
					++$pos;
			}
		}

		if (!empty($openSubpatterns))
		{
			throw new RuntimeException('Could not find matching pattern end for left parenthesis at pos ' . $ret['tokens'][$openSubpatterns[0]]['pos']);
		}

		return $ret;
	}
}