Completed
Push — master ( d91fed...fd66aa )
by Josh
17:36
created

RegexpBuilder::optimizeDotChains()   C

Complexity

Conditions 14
Paths 12

Size

Total Lines 88

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 88
rs 5.2751
c 0
b 0
f 0
cc 14
nc 12
nop 1

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2018 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Configurator\Helpers;
9
10
use RuntimeException;
11
12
abstract class RegexpBuilder
13
{
14
	/**
15
	* @var CharacterClassBuilder
16
	*/
17
	protected static $characterClassBuilder;
18
19
	/**
20
	* Create a regexp pattern that matches a list of words
21
	*
22
	* @param  array  $words   Words to sort (must be UTF-8)
23
	* @param  array  $options
24
	* @return string
25
	*/
26
	public static function fromList(array $words, array $options = [])
27
	{
28
		if (empty($words))
29
		{
30
			return '';
31
		}
32
33
		$options += [
34
			'delimiter'       => '/',
35
			'caseInsensitive' => false,
36
			'specialChars'    => [],
37
			'unicode'         => true,
38
			'useLookahead'    => false
39
		];
40
41
		// Normalize ASCII if the regexp is meant to be case-insensitive
42
		if ($options['caseInsensitive'])
43
		{
44
			foreach ($words as &$word)
45
			{
46
				$word = strtr(
47
					$word,
48
					'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
49
					'abcdefghijklmnopqrstuvwxyz'
50
				);
51
			}
52
			unset($word);
53
		}
54
55
		// Deduplicate words in advance because some routines such as mergeChains() make assumptions
56
		// based on the size of some chains without deduplicating them first
57
		$words = array_unique($words);
58
59
		// Sort the words in order to produce the same regexp regardless of the words' order
60
		sort($words);
61
62
		// Used to store the first character of each word so that we can generate the lookahead
63
		// assertion
64
		$initials = [];
65
66
		// Used to store the escaped representation of each character, e.g. "a"=>"a", "."=>"\\."
67
		// Also used to give a special meaning to some characters, e.g. "*" => ".*?"
68
		$esc  = $options['specialChars'];
69
		$esc += [$options['delimiter'] => '\\' . $options['delimiter']];
70
71
		// preg_quote() errs on the safe side when escaping characters that could have a special
72
		// meaning in some situations. Since we're building the regexp in a controlled environment,
73
		// we don't have to escape those characters.
74
		$esc += [
75
			'!' => '!',
76
			'-' => '-',
77
			':' => ':',
78
			'<' => '<',
79
			'=' => '=',
80
			'>' => '>',
81
			'}' => '}'
82
		];
83
84
		// List of words, split by character
85
		$splitWords = [];
86
87
		foreach ($words as $word)
88
		{
89
			$regexp = ($options['unicode']) ? '(.)us' : '(.)s';
90
			if (preg_match_all($regexp, $word, $matches) === false)
91
			{
92
				throw new RuntimeException("Invalid UTF-8 string '" . $word . "'");
93
			}
94
95
			$splitWord = [];
96
			foreach ($matches[0] as $pos => $c)
97
			{
98
				if (!isset($esc[$c]))
99
				{
100
					$esc[$c] = preg_quote($c);
101
				}
102
103
				if ($pos === 0)
104
				{
105
					// Store the initial for later
106
					$initials[] = $esc[$c];
107
				}
108
109
				$splitWord[] = $esc[$c];
110
			}
111
112
			$splitWords[] = $splitWord;
113
		}
114
115
		self::$characterClassBuilder            = new CharacterClassBuilder;
116
		self::$characterClassBuilder->delimiter = $options['delimiter'];
117
		$regexp = self::assemble([self::mergeChains($splitWords)]);
118
119
		if ($options['useLookahead']
120
		 && count($initials) > 1
121
		 && $regexp[0] !== '[')
122
		{
123
			$useLookahead = true;
124
125
			foreach ($initials as $initial)
126
			{
127
				if (!self::canBeUsedInCharacterClass($initial))
128
				{
129
					$useLookahead = false;
130
					break;
131
				}
132
			}
133
134
			if ($useLookahead)
135
			{
136
				$regexp = '(?=' . self::generateCharacterClass($initials) . ')' . $regexp;
137
			}
138
		}
139
140
		return $regexp;
141
	}
142
143
	/**
144
	* Merge a 2D array of split words into a 1D array of expressions
145
	*
146
	* Each element in the passed array is called a "chain". It starts as an array where each element
147
	* is a character (a sort of UTF-8 aware str_split()) but successive iterations replace
148
	* individual characters with an equivalent expression.
149
	*
150
	* How it works:
151
	*
152
	* 1. Remove the longest prefix shared by all the chains
153
	* 2. Remove the longest suffix shared by all the chains
154
	* 3. Group each chain by their first element, e.g. all the chains that start with "a" (or in
155
	*    some cases, "[xy]") are grouped together
156
	* 4. If no group has more than 1 chain, we assemble them in a regexp, such as (aa|bb). If any
157
	*    group has more than 1 chain, for each group we merge the chains from that group together so
158
	*    that no group has more than 1 chain. When we're done, we remerge all the chains together.
159
	*
160
	* @param  array $chains
161
	* @param  bool  $preventRemerge
162
	* @return array
163
	*/
164
	protected static function mergeChains(array $chains, $preventRemerge = false)
165
	{
166
		// If there's only one chain, there's nothing to merge
167
		if (!isset($chains[1]))
168
		{
169
			return $chains[0];
170
		}
171
172
		// The merged chain starts with the chains' common prefix
173
		$mergedChain = self::removeLongestCommonPrefix($chains);
174
175
		if (!isset($chains[0][0])
176
		 && !array_filter($chains))
177
		{
178
			// The chains are empty, either they were already empty or they were identical and their
179
			// content was removed as their prefix. Nothing left to merge
180
			return $mergedChain;
181
		}
182
183
		// Remove the longest common suffix and save it for later
184
		$suffix = self::removeLongestCommonSuffix($chains);
185
186
		// Optimize the joker thing
187
		if (isset($chains[1]))
188
		{
189
			self::optimizeDotChains($chains);
190
			self::optimizeCatchallChains($chains);
191
		}
192
193
		// Whether one of the chain has been completely optimized away by prefix/suffix removal.
194
		// Signals that the middle part of the regexp is optional, e.g. (prefix)(foo)?(suffix)
195
		$endOfChain = false;
196
197
		// Whether these chains need to be remerged
198
		$remerge = false;
199
200
		// Here we group chains by their first atom (head of chain)
201
		$groups = [];
202
		foreach ($chains as $chain)
203
		{
204
			if (!isset($chain[0]))
205
			{
206
				$endOfChain = true;
207
				continue;
208
			}
209
210
			$head = $chain[0];
211
212
			if (isset($groups[$head]))
213
			{
214
				// More than one chain in a group means that we need to remerge
215
				$remerge = true;
216
			}
217
218
			$groups[$head][] = $chain;
219
		}
220
221
		// See if we can replace single characters with a character class
222
		$characterClass = [];
223
		foreach ($groups as $head => $groupChains)
224
		{
225
			$head = (string) $head;
226
227
			if ($groupChains === [[$head]]
228
			 && self::canBeUsedInCharacterClass($head))
229
			{
230
				// The whole chain is composed of exactly one token, a token that can be used in a
231
				// character class
232
				$characterClass[$head] = $head;
233
			}
234
		}
235
236
		// Sort the characters and reset their keys
237
		sort($characterClass);
238
239
		// Test whether there is more than 1 character in the character class
240
		if (isset($characterClass[1]))
241
		{
242
			// Remove each of those characters from the groups
243
			foreach ($characterClass as $char)
244
			{
245
				unset($groups[$char]);
246
			}
247
248
			// Create a new group for this character class
249
			$head = self::generateCharacterClass($characterClass);
250
			$groups[$head][] = [$head];
251
252
			// Ensure that the character class is first in the alternation. Not only it looks nice
253
			// and might be more performant, it's also how assemble() does it, so normalizing it
254
			// might help with generating identical regexps (or subpatterns that would then be
255
			// optimized away as a prefix/suffix)
256
			$groups = [$head => $groups[$head]]
257
			        + $groups;
258
		}
259
260
		if ($remerge && !$preventRemerge)
261
		{
262
			// Merge all chains sharing the same head together
263
			$mergedChains = [];
264
			foreach ($groups as $head => $groupChains)
265
			{
266
				$mergedChains[] = self::mergeChains($groupChains);
267
			}
268
269
			// Merge the tails of all chains if applicable. Helps with [ab][xy] (two chains with
270
			// identical tails)
271
			self::mergeTails($mergedChains);
272
273
			// Now merge all chains together and append it to our merged chain
274
			$regexp = implode('', self::mergeChains($mergedChains, true));
275
276
			if ($endOfChain)
277
			{
278
				$regexp = self::makeRegexpOptional($regexp);
279
			}
280
281
			$mergedChain[] = $regexp;
282
		}
283
		else
284
		{
285
			self::mergeTails($chains);
286
			$mergedChain[] = self::assemble($chains);
287
		}
288
289
		// Add the common suffix
290
		foreach ($suffix as $atom)
291
		{
292
			$mergedChain[] = $atom;
293
		}
294
295
		return $mergedChain;
296
	}
297
298
	/**
299
	* Merge the tails of an array of chains wherever applicable
300
	*
301
	* This method optimizes (a[xy]|b[xy]|c) into ([ab][xy]|c). The expression [xy] is not a suffix
302
	* to every branch of the alternation (common suffix), so it's not automatically removed. What we
303
	* do here is group chains by their last element (their tail) and then try to merge them together
304
	* group by group. This method should only be called AFTER chains have been group-merged by head.
305
	*
306
	* @param array &$chains
307
	*/
308
	protected static function mergeTails(array &$chains)
309
	{
310
		// (a[xy]|b[xy]|c) => ([ab][xy]|c)
311
		self::mergeTailsCC($chains);
312
313
		// (axx|ayy|bbxx|bbyy|c) => ((a|bb)(xx|yy)|c)
314
		self::mergeTailsAltern($chains);
315
316
		// Don't forget to reset the keys
317
		$chains = array_values($chains);
318
	}
319
320
	/**
321
	* Merge the tails of an array of chains if their head can become a character class
322
	*
323
	* @param array &$chains
324
	*/
325
	protected static function mergeTailsCC(array &$chains)
326
	{
327
		$groups = [];
328
329
		foreach ($chains as $k => $chain)
330
		{
331
			if (isset($chain[1])
332
			 && !isset($chain[2])
333
			 && self::canBeUsedInCharacterClass($chain[0]))
334
			{
335
				$groups[$chain[1]][$k] = $chain;
336
			}
337
		}
338
339
		foreach ($groups as $groupChains)
340
		{
341
			if (count($groupChains) < 2)
342
			{
343
				// Only 1 element, skip this group
344
				continue;
345
			}
346
347
			// Remove this group's chains from the original list
348
			$chains = array_diff_key($chains, $groupChains);
349
350
			// Merge this group's chains and add the result to the list
351
			$chains[] = self::mergeChains(array_values($groupChains));
352
		}
353
	}
354
355
	/**
356
	* Merge the tails of an array of chains if it makes the end result shorter
357
	*
358
	* This kind of merging used to be specifically avoided due to performance concerns but some
359
	* light benchmarking showed that there isn't any measurable difference in performance between
360
	*   (?:c|a(?:xx|yy)|bb(?:xx|yy))
361
	* and
362
	*   (?:c|(?:a|bb)(?:xx|yy))
363
	*
364
	* @param array &$chains
365
	*/
366
	protected static function mergeTailsAltern(array &$chains)
367
	{
368
		$groups = [];
369
		foreach ($chains as $k => $chain)
370
		{
371
			if (!empty($chain))
372
			{
373
				$tail = array_slice($chain, -1);
374
				$groups[$tail[0]][$k] = $chain;
375
			}
376
		}
377
378
		foreach ($groups as $tail => $groupChains)
379
		{
380
			if (count($groupChains) < 2)
381
			{
382
				// Only 1 element, skip this group
383
				continue;
384
			}
385
386
			// Create a single chain for this group
387
			$mergedChain = self::mergeChains(array_values($groupChains));
388
389
			// Test whether the merged chain is shorter than the sum of its components
390
			$oldLen = 0;
391
			foreach ($groupChains as $groupChain)
392
			{
393
				$oldLen += array_sum(array_map('strlen', $groupChain));
394
			}
395
396
			if ($oldLen <= array_sum(array_map('strlen', $mergedChain)))
397
			{
398
				continue;
399
			}
400
401
			// Remove this group's chains from the original list
402
			$chains = array_diff_key($chains, $groupChains);
403
404
			// Merge this group's chains and add the result to the list
405
			$chains[] = $mergedChain;
406
		}
407
	}
408
409
	/**
410
	* Remove the longest common prefix from an array of chains
411
	*
412
	* @param  array &$chains
413
	* @return array          Removed elements
414
	*/
415
	protected static function removeLongestCommonPrefix(array &$chains)
416
	{
417
		// Length of longest common prefix
418
		$pLen = 0;
419
420
		while (1)
421
		{
422
			// $c will be used to store the character we're matching against
423
			$c = null;
424
425
			foreach ($chains as $chain)
426
			{
427
				if (!isset($chain[$pLen]))
428
				{
429
					// Reached the end of a word
430
					break 2;
431
				}
432
433
				if (!isset($c))
434
				{
435
					$c = $chain[$pLen];
436
					continue;
437
				}
438
439
				if ($chain[$pLen] !== $c)
440
				{
441
					// Does not match -- don't increment sLen and break out of the loop
442
					break 2;
443
				}
444
			}
445
446
			// We have confirmed that all the words share a same prefix of at least ($pLen + 1)
447
			++$pLen;
448
		}
449
450
		if (!$pLen)
451
		{
452
			return [];
453
		}
454
455
		// Store prefix
456
		$prefix = array_slice($chains[0], 0, $pLen);
457
458
		// Remove prefix from each word
459
		foreach ($chains as &$chain)
460
		{
461
			$chain = array_slice($chain, $pLen);
462
		}
463
		unset($chain);
464
465
		return $prefix;
466
	}
467
468
	/**
469
	* Remove the longest common suffix from an array of chains
470
	*
471
	* NOTE: this method is meant to be called after removeLongestCommonPrefix(). If it's not, then
472
	*       the longest match return may be off by 1.
473
	*
474
	* @param  array &$chains
475
	* @return array          Removed elements
476
	*/
477
	protected static function removeLongestCommonSuffix(array &$chains)
478
	{
479
		// Cache the length of every word
480
		$chainsLen = array_map('count', $chains);
481
482
		// Length of the longest possible suffix
483
		$maxLen = min($chainsLen);
484
485
		// If all the words are the same length, the longest suffix is 1 less than the length of the
486
		// words because we've already extracted the longest prefix
487
		if (max($chainsLen) === $maxLen)
488
		{
489
			--$maxLen;
490
		}
491
492
		// Length of longest common suffix
493
		$sLen = 0;
494
495
		// Try to find the longest common suffix
496
		while ($sLen < $maxLen)
497
		{
498
			// $c will be used to store the character we're matching against
499
			$c = null;
500
501
			foreach ($chains as $k => $chain)
502
			{
503
				$pos = $chainsLen[$k] - ($sLen + 1);
504
505
				if (!isset($c))
506
				{
507
					$c = $chain[$pos];
508
					continue;
509
				}
510
511
				if ($chain[$pos] !== $c)
512
				{
513
					// Does not match -- don't increment sLen and break out of the loop
514
					break 2;
515
				}
516
			}
517
518
			// We have confirmed that all the words share a same suffix of at least ($sLen + 1)
519
			++$sLen;
520
		}
521
522
		if (!$sLen)
523
		{
524
			return [];
525
		}
526
527
		// Store suffix
528
		$suffix = array_slice($chains[0], -$sLen);
529
530
		// Remove suffix from each word
531
		foreach ($chains as &$chain)
532
		{
533
			$chain = array_slice($chain, 0, -$sLen);
534
		}
535
		unset($chain);
536
537
		return $suffix;
538
	}
539
540
	/**
541
	* Assemble an array of chains into one expression
542
	*
543
	* @param  array  $chains
544
	* @return string
545
	*/
546
	protected static function assemble(array $chains)
547
	{
548
		$endOfChain = false;
549
550
		$regexps        = [];
551
		$characterClass = [];
552
553
		foreach ($chains as $chain)
554
		{
555
			if (empty($chain))
556
			{
557
				$endOfChain = true;
558
				continue;
559
			}
560
561
			if (!isset($chain[1])
562
			 && self::canBeUsedInCharacterClass($chain[0]))
563
			{
564
				$characterClass[$chain[0]] = $chain[0];
565
			}
566
			else
567
			{
568
				$regexps[] = implode('', $chain);
569
			}
570
		}
571
572
		if (!empty($characterClass))
573
		{
574
			// Sort the characters and reset their keys
575
			sort($characterClass);
576
577
			// Use a character class if there are more than 1 characters in it
578
			$regexp = (isset($characterClass[1]))
579
					? self::generateCharacterClass($characterClass)
580
					: $characterClass[0];
581
582
			// Prepend the character class to the list of regexps
583
			array_unshift($regexps, $regexp);
584
		}
585
586
		if (empty($regexps))
587
		{
588
			return '';
589
		}
590
591
		if (isset($regexps[1]))
592
		{
593
			// There are several branches, coalesce them
594
			$regexp = implode('|', $regexps);
595
596
			// Put the expression in a subpattern
597
			$regexp = ((self::canUseAtomicGrouping($regexp)) ? '(?>' : '(?:') . $regexp . ')';
598
		}
599
		else
600
		{
601
			$regexp = $regexps[0];
602
		}
603
604
		// If we've reached the end of a chain, it means that the branches are optional
605
		if ($endOfChain)
606
		{
607
			$regexp = self::makeRegexpOptional($regexp);
608
		}
609
610
		return $regexp;
611
	}
612
613
	/**
614
	* Make an entire regexp optional through the use of the ? quantifier
615
	*
616
	* @param  string $regexp
617
	* @return string
618
	*/
619
	protected static function makeRegexpOptional($regexp)
620
	{
621
		// .+ and .+? become .* and .*?
622
		if (preg_match('#^\\.\\+\\??$#', $regexp))
623
		{
624
			return str_replace('+', '*', $regexp);
625
		}
626
627
		// Special case: xx? becomes x?x?, \w\w? becomes \w?\w?
628
		// It covers only the most common case of repetition, it's not a panacea
629
		if (preg_match('#^(\\\\?.)((?:\\1\\?)+)$#Du', $regexp, $m))
630
		{
631
			return $m[1] . '?' . $m[2];
632
		}
633
634
		// Optional assertions are a no-op
635
		if (preg_match('#^(?:[$^]|\\\\[bBAZzGQEK])$#', $regexp))
636
		{
637
			return '';
638
		}
639
640
		// One single character, optionally escaped
641
		if (preg_match('#^\\\\?.$#Dus', $regexp))
642
		{
643
			$isAtomic = true;
644
		}
645
		// At least two characters, but it's not a subpattern or a character class
646
		elseif (preg_match('#^[^[(].#s', $regexp))
647
		{
648
			$isAtomic = false;
649
		}
650
		else
651
		{
652
			$def    = RegexpParser::parse('#' . $regexp . '#');
653
			$tokens = $def['tokens'];
654
655
			switch (count($tokens))
656
			{
657
				// One character class
658
				case 1:
659
					$startPos = $tokens[0]['pos'];
660
					$len      = $tokens[0]['len'];
661
662
					$isAtomic = (bool) ($startPos === 0 && $len === strlen($regexp));
663
664
					// If the regexp is [..]+ it becomes [..]* (to which a ? will be appended)
665
					if ($isAtomic && $tokens[0]['type'] === 'characterClass')
666
					{
667
						$regexp = rtrim($regexp, '+*?');
668
669
						if (!empty($tokens[0]['quantifiers']) && $tokens[0]['quantifiers'] !== '?')
670
						{
671
							$regexp .= '*';
672
						}
673
					}
674
					break;
675
676
				// One subpattern covering the entire regexp
677
				case 2:
678
					if ($tokens[0]['type'] === 'nonCapturingSubpatternStart'
679
					 && $tokens[1]['type'] === 'nonCapturingSubpatternEnd')
680
					{
681
						$startPos = $tokens[0]['pos'];
682
						$len      = $tokens[1]['pos'] + $tokens[1]['len'];
683
684
						$isAtomic = (bool) ($startPos === 0 && $len === strlen($regexp));
685
686
						// If the tokens are not a non-capturing subpattern, we let it fall through
687
						break;
688
					}
689
					// no break; here
690
691
				default:
692
					$isAtomic = false;
693
			}
694
		}
695
696
		if (!$isAtomic)
697
		{
698
			$regexp = ((self::canUseAtomicGrouping($regexp)) ? '(?>' : '(?:') . $regexp . ')';
699
		}
700
701
		$regexp .= '?';
702
703
		return $regexp;
704
	}
705
706
	/**
707
	* Generate a character class from an array of characters
708
	*
709
	* @param  array  $chars
710
	* @return string
711
	*/
712
	protected static function generateCharacterClass(array $chars)
713
	{
714
		return self::$characterClassBuilder->fromList($chars);
715
	}
716
717
	/**
718
	* Test whether a given expression (usually one character) can be used in a character class
719
	*
720
	* @param  string $char
721
	* @return bool
722
	*/
723
	protected static function canBeUsedInCharacterClass($char)
724
	{
725
		/**
726
		* Encoded non-printable characters and generic character classes are allowed
727
		* @link http://docs.php.net/manual/en/regexp.reference.escape.php
728
		*/
729
		if (preg_match('#^\\\\[aefnrtdDhHsSvVwW]$#D', $char))
730
		{
731
			return true;
732
		}
733
734
		// Escaped literals are allowed (escaped sequences excluded)
735
		if (preg_match('#^\\\\[^A-Za-z0-9]$#Dus', $char))
736
		{
737
			return true;
738
		}
739
740
		// More than 1 character => cannot be used in a character class
741
		if (preg_match('#..#Dus', $char))
742
		{
743
			return false;
744
		}
745
746
		// Special characters such as $ or ^ are rejected, but we need to check for characters that
747
		// get escaped by preg_quote() even though it's not necessary, such as ! or =
748
		if (preg_quote($char) !== $char
749
		 && !preg_match('#^[-!:<=>}]$#D', $char))
750
		{
751
			return false;
752
		}
753
754
		return true;
755
	}
756
757
	/**
758
	* Remove chains that overlap with dot chains
759
	*
760
	* Will remove chains that are made redundant by the use of the dot metacharacter, e.g.
761
	* ["a","b","c"] and ["a",".","c"] or ["a","b","c"], ["a","c"] and ["a",".?","c"]
762
	*
763
	* @param  array &$chains
764
	* @return void
765
	*/
766
	protected static function optimizeDotChains(array &$chains)
767
	{
768
		/**
769
		* @var array List of valid atoms that should be matched by a dot but happen to be
770
		*            represented by more than one character
771
		*/
772
		$validAtoms = [
773
			// Escape sequences
774
			'\\d' => 1, '\\D' => 1, '\\h' => 1, '\\H' => 1,
775
			'\\s' => 1, '\\S' => 1, '\\v' => 1, '\\V' => 1,
776
			'\\w' => 1, '\\W' => 1,
777
778
			// Special chars that need to be escaped in order to be used as literals
779
			'\\^' => 1, '\\$' => 1, '\\.' => 1, '\\?' => 1,
780
			'\\[' => 1, '\\]' => 1, '\\(' => 1, '\\)' => 1,
781
			'\\+' => 1, '\\*' => 1, '\\\\' => 1
782
		];
783
784
		// First we replace chains such as ["a",".?","b"] with ["a",".","b"] and ["a","b"]
785
		do
786
		{
787
			$hasMoreDots = false;
788
			foreach ($chains as $k1 => $dotChain)
789
			{
790
				$dotKeys = array_keys($dotChain, '.?', true);
791
792
				if (!empty($dotKeys))
793
				{
794
					// Replace the .? atom in the original chain with a .
795
					$dotChain[$dotKeys[0]] = '.';
796
					$chains[$k1] = $dotChain;
797
798
					// Create a new chain without the atom
799
					array_splice($dotChain, $dotKeys[0], 1);
800
					$chains[] = $dotChain;
801
802
					if (isset($dotKeys[1]))
803
					{
804
						$hasMoreDots = true;
805
					}
806
				}
807
			}
808
		}
809
		while ($hasMoreDots);
810
811
		foreach ($chains as $k1 => $dotChain)
812
		{
813
			$dotKeys = array_keys($dotChain, '.', true);
814
815
			if (empty($dotKeys))
816
			{
817
				continue;
818
			}
819
820
			foreach ($chains as $k2 => $tmpChain)
821
			{
822
				if ($k2 === $k1)
823
				{
824
					continue;
825
				}
826
827
				foreach ($dotKeys as $dotKey)
828
				{
829
					if (!isset($tmpChain[$dotKey]))
830
					{
831
						// The chain is too short to match, skip this chain
832
						continue 2;
833
					}
834
835
					// Skip if the dot is neither a literal nor a valid atom
836
					if (!preg_match('#^.$#Du', preg_quote($tmpChain[$dotKey]))
837
					 && !isset($validAtoms[$tmpChain[$dotKey]]))
838
					{
839
						continue 2;
840
					}
841
842
					// Replace the atom with a dot
843
					$tmpChain[$dotKey] = '.';
844
				}
845
846
				if ($tmpChain === $dotChain)
847
				{
848
					// The chain matches our dot chain, which means we can remove it
849
					unset($chains[$k2]);
850
				}
851
			}
852
		}
853
	}
854
855
	/**
856
	* Remove chains that overlap with chains that contain a catchall expression such as .*
857
	*
858
	* NOTE: cannot handle possessive expressions such as .++ because we don't know whether that
859
	*       chain had its suffix/tail stashed by an earlier iteration
860
	*
861
	* @param  array &$chains
862
	* @return void
863
	*/
864
	protected static function optimizeCatchallChains(array &$chains)
865
	{
866
		// This is how catchall expressions will trump each other in our routine. For instance,
867
		// instead of (?:.*|.+) we will emit (?:.*). Zero-or-more trumps one-or-more and greedy
868
		// trumps non-greedy. In some cases, (?:.+|.*?) might be preferable to (?:.*?) but it does
869
		// not seem like a common enough case to warrant the extra logic
870
		$precedence = [
871
			'.*'  => 3,
872
			'.*?' => 2,
873
			'.+'  => 1,
874
			'.+?' => 0
875
		];
876
877
		$tails = [];
878
879
		foreach ($chains as $k => $chain)
880
		{
881
			if (!isset($chain[0]))
882
			{
883
				continue;
884
			}
885
886
			$head = $chain[0];
887
888
			// Test whether the head is a catchall expression by looking up its precedence
889
			if (!isset($precedence[$head]))
890
			{
891
				continue;
892
			}
893
894
			$tail = implode('', array_slice($chain, 1));
895
			if (!isset($tails[$tail])
896
			 || $precedence[$head] > $tails[$tail]['precedence'])
897
			{
898
				$tails[$tail] = [
899
					'key'        => $k,
900
					'precedence' => $precedence[$head]
901
				];
902
			}
903
		}
904
905
		$catchallChains = [];
906
		foreach ($tails as $tail => $info)
907
		{
908
			$catchallChains[$info['key']] = $chains[$info['key']];
909
		}
910
911
		foreach ($catchallChains as $k1 => $catchallChain)
912
		{
913
			$headExpr = $catchallChain[0];
914
			$tailExpr = false;
915
			$match    = array_slice($catchallChain, 1);
916
917
			// Test whether the catchall chain has another catchall expression at the end
918
			if (isset($catchallChain[1])
919
			 && isset($precedence[end($catchallChain)]))
920
			{
921
				// Remove the catchall expression from the end
922
				$tailExpr = array_pop($match);
923
			}
924
925
			$matchCnt = count($match);
926
927
			foreach ($chains as $k2 => $chain)
928
			{
929
				if ($k2 === $k1)
930
				{
931
					continue;
932
				}
933
934
				/**
935
				* @var integer Offset of the first atom we're trying to match the tail against
936
				*/
937
				$start = 0;
938
939
				/**
940
				* @var integer
941
				*/
942
				$end = count($chain);
943
944
				// If the catchall at the start of the chain must match at least one character, we
945
				// ensure the chain has at least one character at its beginning
946
				if ($headExpr[1] === '+')
947
				{
948
					$found = false;
949
950
					foreach ($chain as $start => $atom)
951
					{
952
						if (self::matchesAtLeastOneCharacter($atom))
953
						{
954
							$found = true;
955
							break;
956
						}
957
					}
958
959
					if (!$found)
960
					{
961
						continue;
962
					}
963
				}
964
965
				// Test whether the catchall chain has another catchall expression at the end
966
				if ($tailExpr === false)
967
				{
968
					$end = $start;
969
				}
970
				else
971
				{
972
					// If the expression must match at least one character, we ensure that the
973
					// chain satisfy the requirement and we adjust $end accordingly so the same atom
974
					// isn't used twice (e.g. used by two consecutive .+ expressions)
975
					if ($tailExpr[1] === '+')
976
					{
977
						$found = false;
978
979
						while (--$end > $start)
980
						{
981
							if (self::matchesAtLeastOneCharacter($chain[$end]))
982
							{
983
								$found = true;
984
								break;
985
							}
986
						}
987
988
						if (!$found)
989
						{
990
							continue;
991
						}
992
					}
993
994
					// Now, $start should point to the first atom we're trying to match the catchall
995
					// chain against, and $end should be equal to the index right after the last
996
					// atom we can match against. We adjust $end to point to the last position our
997
					// match can start at
998
					$end -= $matchCnt;
999
				}
1000
1001
				while ($start <= $end)
1002
				{
1003
					if (array_slice($chain, $start, $matchCnt) === $match)
1004
					{
1005
						unset($chains[$k2]);
1006
						break;
1007
					}
1008
1009
					++$start;
1010
				}
1011
			}
1012
		}
1013
	}
1014
1015
	/**
1016
	* Test whether a given expression can never match an empty space
1017
	*
1018
	* Only basic checks are performed and it only returns true if we're certain the expression
1019
	* will always match/consume at least one character. For instance, it doesn't properly recognize
1020
	* the expression [ab]+ as matching at least one character.
1021
	*
1022
	* @param  string $expr
1023
	* @return bool
1024
	*/
1025
	protected static function matchesAtLeastOneCharacter($expr)
1026
	{
1027
		if (preg_match('#^[$*?^]$#', $expr))
1028
		{
1029
			return false;
1030
		}
1031
1032
		// A single character should be fine
1033
		if (preg_match('#^.$#u', $expr))
1034
		{
1035
			return true;
1036
		}
1037
1038
		// Matches anything that starts with ".+", "a+", etc...
1039
		if (preg_match('#^.\\+#u', $expr))
1040
		{
1041
			return true;
1042
		}
1043
1044
		// Matches anything that starts with "\d", "\+", "\d+", etc... We avoid matching back
1045
		// references as we can't be sure they matched at least one character themselves
1046
		if (preg_match('#^\\\\[^bBAZzGQEK1-9](?![*?])#', $expr))
1047
		{
1048
			return true;
1049
		}
1050
1051
		// Anything else is either too complicated and too circumstancial to investigate further so
1052
		// we'll just return false
1053
		return false;
1054
	}
1055
1056
	/**
1057
	* Test whether given expression can be safely used with atomic grouping
1058
	*
1059
	* @param  string $expr
1060
	* @return bool
1061
	*/
1062
	protected static function canUseAtomicGrouping($expr)
1063
	{
1064
		if (preg_match('#(?<!\\\\)(?>\\\\\\\\)*\\.#', $expr))
1065
		{
1066
			// An unescaped dot should disable atomic grouping. Technically, we could still allow it
1067
			// depending on what comes next in the regexp but it's a lot of work for something very
1068
			// situational
1069
			return false;
1070
		}
1071
1072
		if (preg_match('#(?<!\\\\)(?>\\\\\\\\)*[+*]#', $expr))
1073
		{
1074
			// A quantifier disables atomic grouping. Again, this could be enabled depending on the
1075
			// context
1076
			return false;
1077
		}
1078
1079
		if (preg_match('#(?<!\\\\)(?>\\\\\\\\)*\\(?(?<!\\()\\?#', $expr))
1080
		{
1081
			// An unescaped ? is a quantifier, unless it's preceded by an unescaped (
1082
			return false;
1083
		}
1084
1085
		if (preg_match('#(?<!\\\\)(?>\\\\\\\\)*\\\\[a-z0-9]#', $expr))
1086
		{
1087
			// Escape sequences disable atomic grouping because they might overlap with another
1088
			// branch
1089
			return false;
1090
		}
1091
1092
		// The regexp looks harmless enough to enable atomic grouping
1093
		return true;
1094
	}
1095
}