1
|
|
|
<?php declare(strict_types = 1); |
2
|
|
|
/** |
3
|
|
|
* Created by Vitaly Iegorov <[email protected]>. |
4
|
|
|
* on 02.03.17 at 13:25 |
5
|
|
|
*/ |
6
|
|
|
namespace samsonframework\stringconditiontree; |
7
|
|
|
use phpDocumentor\Reflection\Types\Integer; |
8
|
|
|
|
9
|
|
|
/** |
10
|
|
|
* Class StringConditionTree |
11
|
|
|
* |
12
|
|
|
* @author Vitaly Egorov <[email protected]> |
13
|
|
|
*/ |
14
|
|
|
class StringConditionTree |
15
|
|
|
{ |
16
|
|
|
/** Tree node root element identifier, needed for recursion */ |
17
|
|
|
const ROOT_NAME = ''; |
18
|
|
|
|
19
|
|
|
/** Final tree node branch identifier */ |
20
|
|
|
const SELF_NAME = '@self'; |
21
|
|
|
|
22
|
|
|
/** String parameter start marker */ |
23
|
|
|
const PARAMETER_START = '{'; |
24
|
|
|
|
25
|
|
|
/** String parameter end marker */ |
26
|
|
|
const PARAMETER_END = '}'; |
27
|
|
|
|
28
|
|
|
/** Parameter sorting length value for counting */ |
29
|
|
|
const PARAMETER_COF = 2000; |
30
|
|
|
|
31
|
|
|
/** @var TreeNode Resulting collection for debugging */ |
32
|
|
|
protected $debug; |
33
|
|
|
|
34
|
|
|
/** @var string Parametrized string start marker */ |
35
|
|
|
protected $parameterStartMarker = self::PARAMETER_START; |
36
|
|
|
|
37
|
|
|
/** @var string Parametrized string end marker */ |
38
|
|
|
protected $parameterEndMarker = self::PARAMETER_END; |
39
|
|
|
|
40
|
|
|
/** |
41
|
|
|
* StringConditionTree constructor. |
42
|
|
|
* |
43
|
|
|
* @param string $parameterStartMarker Parametrized string start marker |
44
|
|
|
* @param string $parameterEndMarker Parametrized string end marker |
45
|
|
|
*/ |
46
|
|
|
public function __construct(string $parameterStartMarker = self::PARAMETER_START, string $parameterEndMarker = self::PARAMETER_END) |
47
|
|
|
{ |
48
|
|
|
$this->parameterStartMarker = $parameterStartMarker; |
49
|
|
|
$this->parameterEndMarker = $parameterEndMarker; |
50
|
|
|
} |
51
|
|
|
|
52
|
|
|
/** |
53
|
|
|
* Build similarity strings tree. |
54
|
|
|
* |
55
|
|
|
* @param array $input Collection of strings |
56
|
|
|
* |
57
|
|
|
* @return TreeNode Resulting similarity strings tree |
58
|
|
|
*/ |
59
|
|
|
public function process(array $input): TreeNode |
60
|
|
|
{ |
61
|
|
|
/** |
62
|
|
|
* We need to find first matching character that present at least at one two string |
63
|
|
|
* to start building tree. Otherwise there is no reason to build tree. |
64
|
|
|
*/ |
65
|
|
|
$this->innerProcessor(self::ROOT_NAME, $input, $this->debug = new TreeNode()); |
66
|
|
|
|
67
|
|
|
$this->debug = $this->debug->children[self::ROOT_NAME]; |
68
|
|
|
|
69
|
|
|
return $this->debug; |
70
|
|
|
} |
71
|
|
|
|
72
|
|
|
/** |
73
|
|
|
* Buil string character group structure considering parametrized |
74
|
|
|
* and not parametrized characted groups and their length(PCG, NPCG). |
75
|
|
|
* |
76
|
|
|
* @param string $prefix Prefix string |
77
|
|
|
* |
78
|
|
|
* @return array String character groups structure |
79
|
|
|
*/ |
80
|
|
|
protected function getPrefixStructure(string $prefix): array |
81
|
|
|
{ |
82
|
|
|
/** @var array $structureMatrix String PCG(0)/NPCG(1) structure matrix for comparison */ |
83
|
|
|
$structureMatrix = []; |
84
|
|
|
|
85
|
|
|
// Flags for showing current string character group |
86
|
|
|
/** @var bool $isPCG Flags showing PCG started */ |
87
|
|
|
$isPCG = false; |
88
|
|
|
/** @var bool $isNPCG Flags showing NPCG started */ |
89
|
|
|
$isNPCG = true; |
90
|
|
|
|
91
|
|
|
// Pointer to current CG to count string NPCG length |
92
|
|
|
$currentCG = 0; |
93
|
|
|
|
94
|
|
|
/** |
95
|
|
|
* TODO: Try to find PCG filter :... pattern and process it also as |
96
|
|
|
* PCG with filters should be prioritized over PSG without filter |
97
|
|
|
* even if filter is .* |
98
|
|
|
*/ |
99
|
|
|
|
100
|
|
|
// Iterate string by characters |
101
|
|
|
for ($i = 0, $length = strlen($prefix); $i < $length; $i++) { |
102
|
|
|
if (!$isPCG && $prefix{$i} === $this->parameterStartMarker) { |
103
|
|
|
$isPCG = true; |
104
|
|
|
$isNPCG = false; |
105
|
|
|
$structureMatrix[] = [0,0,$prefix]; |
106
|
|
|
$currentCG = &$structureMatrix[count($structureMatrix)-1][1]; |
107
|
|
|
} elseif ($isPCG && $prefix{$i} === $this->parameterEndMarker) { |
108
|
|
|
$isPCG = false; |
109
|
|
|
$isNPCG = true; |
110
|
|
|
} elseif ($isNPCG) { |
111
|
|
|
$isNPCG = false; |
112
|
|
|
$structureMatrix[] = [1,0,$prefix]; |
113
|
|
|
$currentCG = &$structureMatrix[count($structureMatrix)-1][1]; |
114
|
|
|
} |
115
|
|
|
|
116
|
|
|
// Store current character group length |
117
|
|
|
$currentCG++; |
118
|
|
|
} |
119
|
|
|
|
120
|
|
|
return $structureMatrix; |
121
|
|
|
} |
122
|
|
|
|
123
|
|
|
/** |
124
|
|
|
* Compare string structures. |
125
|
|
|
* |
126
|
|
|
* @param array $initial Initial string structure |
127
|
|
|
* @param array $compared Compared string structure |
128
|
|
|
* |
129
|
|
|
* @return int Result of array elements comparison |
130
|
|
|
*/ |
131
|
|
|
protected function compareStringStructure(array $initial, array $compared): int |
132
|
|
|
{ |
133
|
|
|
$maxStructureSize = max(count($initial), count($compared)); |
134
|
|
|
|
135
|
|
|
// Make structures same size preserving previous existing structure value |
136
|
|
|
for ($i = 1; $i < $maxStructureSize; $i++) { |
137
|
|
|
if (!array_key_exists($i, $initial)) { |
138
|
|
|
$initial[$i] = $initial[$i-1]; |
139
|
|
|
} |
140
|
|
|
if (!array_key_exists($i, $compared)) { |
141
|
|
|
$compared[$i] = $compared[$i-1]; |
142
|
|
|
} |
143
|
|
|
} |
144
|
|
|
|
145
|
|
|
// Iterate every structure group |
146
|
|
|
for ($i = 0; $i < $maxStructureSize; $i++) { |
147
|
|
|
// If initial structure has NPCG than it has higher priority |
148
|
|
|
if ($initial[$i][0] > $compared[$i][0]) { |
149
|
|
|
return -1; |
150
|
|
|
} |
151
|
|
|
|
152
|
|
|
// If compared structure has NPCG than it has higher priority |
153
|
|
|
if ($initial[$i][0] < $compared[$i][0]) { |
154
|
|
|
return 1; |
155
|
|
|
} |
156
|
|
|
|
157
|
|
|
// They are equal continue to next structure group comparison |
158
|
|
|
} |
159
|
|
|
|
160
|
|
|
// If both structures are equal compare lengths of NPCG |
161
|
|
View Code Duplication |
for ($i = 0; $i < $maxStructureSize; $i++) { |
|
|
|
|
162
|
|
|
// If current CG is NPCG |
163
|
|
|
if ($initial[$i][0] === 1) { |
164
|
|
|
if ($initial[$i][1] > $compared[$i][1]) { |
165
|
|
|
return 1; |
166
|
|
|
} |
167
|
|
|
|
168
|
|
|
if ($initial[$i][1] < $compared[$i][1]) { |
169
|
|
|
return -1; |
170
|
|
|
} |
171
|
|
|
} |
172
|
|
|
|
173
|
|
|
// Current NPCG character groups have equal length - continue |
174
|
|
|
} |
175
|
|
|
|
176
|
|
|
// If both structures are equal and NPCG length are equal - compare lengths of PCG |
177
|
|
View Code Duplication |
for ($i = 0; $i < $maxStructureSize; $i++) { |
|
|
|
|
178
|
|
|
// If current CG is PCG |
179
|
|
|
if ($initial[$i][0] === 0) { |
180
|
|
|
if ($initial[$i][1] > $compared[$i][1]) { |
181
|
|
|
return -1; |
182
|
|
|
} |
183
|
|
|
|
184
|
|
|
if ($initial[$i][1] < $compared[$i][1]) { |
185
|
|
|
return 1; |
186
|
|
|
} |
187
|
|
|
} |
188
|
|
|
|
189
|
|
|
// Current PCG character groups have equal length - continue |
190
|
|
|
} |
191
|
|
|
|
192
|
|
|
// Structures are absolutely equal |
193
|
|
|
return 0; |
194
|
|
|
} |
195
|
|
|
|
196
|
|
|
/** |
197
|
|
|
* Sort strings array considering PCG and NPCG string structure. |
198
|
|
|
* |
199
|
|
|
* @param array $input Input array for sorting |
200
|
|
|
* @return array Sorted array |
201
|
|
|
*/ |
202
|
|
|
protected function sortArrayByKeys(array &$input) |
203
|
|
|
{ |
204
|
|
|
$prefixes = array_map([$this, 'getPrefixStructure'], array_keys($input)); |
205
|
|
|
|
206
|
|
|
usort($prefixes, [$this, 'compareStringStructure']); |
207
|
|
|
|
208
|
|
|
$result = []; |
209
|
|
|
foreach ($prefixes as $sortingData) { |
210
|
|
|
$result[$sortingData[0][2]] = $input[$sortingData[0][2]]; |
211
|
|
|
} |
212
|
|
|
|
213
|
|
|
return $result; |
214
|
|
|
} |
215
|
|
|
|
216
|
|
|
/** |
217
|
|
|
* Add only unique value to array. |
218
|
|
|
* |
219
|
|
|
* @param mixed $value Unique value |
220
|
|
|
* @param array $array Array for adding unique value |
221
|
|
|
* @param bool $strict Strict uniqueness check |
222
|
|
|
* |
223
|
|
|
* @see in_array(); |
224
|
|
|
* |
225
|
|
|
* @return bool True if unique value was added |
226
|
|
|
*/ |
227
|
|
|
protected function addUniqueToArray($value, &$array, bool $strict = true) |
228
|
|
|
{ |
229
|
|
|
// Create array if not array is passed |
230
|
|
|
if (!is_array($array)) { |
231
|
|
|
$array = []; |
232
|
|
|
} |
233
|
|
|
|
234
|
|
|
// Add value to array if it is unique |
235
|
|
|
if (!in_array($value, $array, $strict)) { |
236
|
|
|
$array[] = $value; |
237
|
|
|
|
238
|
|
|
return true; |
239
|
|
|
} else { // Value is already in array |
240
|
|
|
return false; |
241
|
|
|
} |
242
|
|
|
} |
243
|
|
|
|
244
|
|
|
/** |
245
|
|
|
* Find longest matching prefix between two strings. |
246
|
|
|
* |
247
|
|
|
* @param string $initialString Initial string |
248
|
|
|
* @param string $comparedString Compared string |
249
|
|
|
* |
250
|
|
|
* @return string Longest matching prefix |
251
|
|
|
*/ |
252
|
|
|
protected function getLongestMatchingPrefix(string $initialString, string $comparedString): string |
253
|
|
|
{ |
254
|
|
|
// Iterate and compare how string matches |
255
|
|
|
$longestPrefix = ''; |
256
|
|
|
|
257
|
|
|
// Define shortest & longest strings to avoid errors |
258
|
|
|
$initialLength = strlen($initialString); |
259
|
|
|
$comparedLength = strlen($comparedString); |
260
|
|
|
|
261
|
|
|
// Define shortest and longest strings to avoid character matching errors |
262
|
|
|
$shortestString = $initialLength < $comparedLength ? $initialString : $comparedString; |
263
|
|
|
$longestString = $initialLength >= $comparedLength ? $initialString : $comparedString; |
264
|
|
|
|
265
|
|
|
// Iterate initial string characters |
266
|
|
|
$isPattern = false; |
267
|
|
|
$parametrizedPrefix = ''; |
268
|
|
|
for ($z = 0, $length = strlen($shortestString); $z < $length; $z++) { |
269
|
|
|
// Pattern support |
270
|
|
|
// TODO: Probably can be optimized |
271
|
|
|
if ($isPattern || $shortestString{$z} === $this->parameterStartMarker) { |
272
|
|
|
$isPattern = true; |
273
|
|
|
|
274
|
|
|
// Concatenate longest matching prefix |
275
|
|
|
$parametrizedPrefix .= $shortestString{$z}; |
276
|
|
|
|
277
|
|
|
// Compare characters with compared string |
278
|
|
|
if ($shortestString{$z} !== $longestString{$z}) { |
279
|
|
|
// Clear pattern as pattern characters mismatch |
280
|
|
|
$parametrizedPrefix = ''; |
|
|
|
|
281
|
|
|
break; |
282
|
|
|
} |
283
|
|
|
|
284
|
|
|
// If pattern id closed unset flag fro special behaviour |
285
|
|
|
if ($shortestString{$z} === $this->parameterEndMarker) { |
286
|
|
|
// If parametrized part ended append to longest matching prefix |
287
|
|
|
$longestPrefix .= $parametrizedPrefix; |
288
|
|
|
// Clear parametrized prefix as we can have other parametrized parts |
289
|
|
|
$parametrizedPrefix = ''; |
290
|
|
|
// Reset parametrized flag |
291
|
|
|
$isPattern = false; |
292
|
|
|
} |
293
|
|
|
} else { |
294
|
|
|
// Compare characters with compared string |
295
|
|
|
if ($shortestString{$z} !== $longestString{$z}) { |
296
|
|
|
break; // Exit on first mismatching character |
297
|
|
|
} |
298
|
|
|
|
299
|
|
|
// Concatenate matching part of two strings |
300
|
|
|
$longestPrefix .= $initialString{$z}; |
301
|
|
|
} |
302
|
|
|
} |
303
|
|
|
|
304
|
|
|
return $longestPrefix; |
305
|
|
|
} |
306
|
|
|
|
307
|
|
|
/** |
308
|
|
|
* Remove key string from the beginning of all sub-array strings. |
309
|
|
|
* |
310
|
|
|
* @param array $array Input array of key => [keyStrings...] |
311
|
|
|
* |
312
|
|
|
* @param string $selfMarker Marker for storing self pointer |
313
|
|
|
* |
314
|
|
|
* @return array Processed array with removed keys from beginning of sub arrays |
315
|
|
|
*/ |
316
|
|
|
protected function removeKeyFromArrayStrings(array $array, string $selfMarker): array |
317
|
|
|
{ |
318
|
|
|
$result = []; |
319
|
|
|
/** @var string[] $values */ |
320
|
|
|
foreach ($array as $key => $values) { |
321
|
|
|
$lmpLength = strlen((string)$key); |
322
|
|
|
foreach ($values as $string) { |
323
|
|
|
$newString = substr($string, $lmpLength); |
324
|
|
|
|
325
|
|
|
if ($newString === false || $newString === '' || $string === $selfMarker) { |
326
|
|
|
$result[$key][] = $selfMarker; |
327
|
|
|
} else { |
328
|
|
|
$result[$key][] = $newString; |
329
|
|
|
} |
330
|
|
|
} |
331
|
|
|
} |
332
|
|
|
|
333
|
|
|
return $result; |
334
|
|
|
} |
335
|
|
|
|
336
|
|
|
/** |
337
|
|
|
* Find all duplication of source array values in compared array and remove them. |
338
|
|
|
* |
339
|
|
|
* @param array $source Source array |
340
|
|
|
* @param array $compared Compared array for filtering duplicates |
341
|
|
|
*/ |
342
|
|
|
protected function removeDuplicatesInSubArray(array $source, array &$compared) |
343
|
|
|
{ |
344
|
|
|
foreach ($source as $value) { |
345
|
|
|
foreach ($compared as $key => &$subValue) { |
346
|
|
|
if ($subValue === $value) { |
347
|
|
|
unset($compared[$key]); |
348
|
|
|
} |
349
|
|
|
} |
350
|
|
|
} |
351
|
|
|
} |
352
|
|
|
|
353
|
|
|
/** |
354
|
|
|
* Recursive string similarity tree builder. |
355
|
|
|
* |
356
|
|
|
* @param string $prefix |
357
|
|
|
* @param array $input |
358
|
|
|
* @param TreeNode $result |
359
|
|
|
* @param string $selfMarker |
360
|
|
|
*/ |
361
|
|
|
protected function innerProcessor(string $prefix, array $input, TreeNode $result, $selfMarker = self::SELF_NAME) |
362
|
|
|
{ |
363
|
|
|
// Create tree node |
364
|
|
|
$newChild = $result->append($prefix); |
365
|
|
|
|
366
|
|
|
/** |
367
|
|
|
* Iterate all combinations of strings and group by LMP |
368
|
|
|
*/ |
369
|
|
|
$longestPrefixes = []; |
370
|
|
|
for ($i = 0, $count = count($input); $i < $count; $i++) { |
371
|
|
|
for ($j = $i + 1; $j < $count; $j++) { |
372
|
|
|
$longestMatchedPrefix = $this->getLongestMatchingPrefix($input[$i], $input[$j]); |
373
|
|
|
|
374
|
|
|
// We have found at least one matching character between strings |
375
|
|
|
if ($longestMatchedPrefix !== '') { |
376
|
|
|
$this->addUniqueToArray($input[$i], $longestPrefixes[$longestMatchedPrefix]); |
377
|
|
|
$this->addUniqueToArray($input[$j], $longestPrefixes[$longestMatchedPrefix]); |
378
|
|
|
} |
379
|
|
|
} |
380
|
|
|
} |
381
|
|
|
|
382
|
|
|
/** |
383
|
|
|
* Sort LMPs(array keys) ascending by key length |
384
|
|
|
*/ |
385
|
|
|
$longestPrefixes = $this->sortArrayByKeys($longestPrefixes); |
386
|
|
|
|
387
|
|
|
/** |
388
|
|
|
* Iterate all sorted LMP strings and remove duplicates from LMP string ordered lower |
389
|
|
|
*/ |
390
|
|
|
$keys = array_keys($longestPrefixes); |
391
|
|
|
for ($i = 0, $length = count($keys); $i < $length; $i++) { |
392
|
|
|
for ($j = $i + 1; $j < $length; $j++) { |
393
|
|
|
$this->removeDuplicatesInSubArray($longestPrefixes[$keys[$i]], $longestPrefixes[$keys[$j]]); |
394
|
|
|
} |
395
|
|
|
} |
396
|
|
|
|
397
|
|
|
// Remove empty LMPs as they are included in smaller LMPs |
398
|
|
|
$longestPrefixes = array_filter($longestPrefixes); |
399
|
|
|
|
400
|
|
|
/** |
401
|
|
|
* Search for input string that do not have LMP, and add missing as LMP |
402
|
|
|
*/ |
403
|
|
|
foreach ($input as $string) { |
404
|
|
|
$found = false; |
405
|
|
|
|
406
|
|
|
if ($string !== $selfMarker) { |
407
|
|
|
foreach ($longestPrefixes as $strings) { |
408
|
|
|
if (in_array($string, $strings, true)) { |
409
|
|
|
$found = true; |
410
|
|
|
break; |
411
|
|
|
} |
412
|
|
|
} |
413
|
|
|
|
414
|
|
|
if (!$found) { |
415
|
|
|
$longestPrefixes[$string] = [$selfMarker]; |
416
|
|
|
} |
417
|
|
|
} |
418
|
|
|
} |
419
|
|
|
|
420
|
|
|
/** |
421
|
|
|
* After filtering LMPs remove LMP from matched string arrays |
422
|
|
|
*/ |
423
|
|
|
$longestPrefixes = $this->removeKeyFromArrayStrings($longestPrefixes, $selfMarker); |
424
|
|
|
|
425
|
|
|
/** |
426
|
|
|
* Sort LMPs(array keys) ascending by key length |
427
|
|
|
*/ |
428
|
|
|
$longestPrefixes = $this->sortArrayByKeys($longestPrefixes); |
429
|
|
|
|
430
|
|
|
/** |
431
|
|
|
* If we have self marker as an input string - create LMP for it |
432
|
|
|
*/ |
433
|
|
|
if (in_array($selfMarker, $input, true)) { |
434
|
|
|
$longestPrefixes = array_merge([$selfMarker => []], $longestPrefixes); |
435
|
|
|
} |
436
|
|
|
|
437
|
|
|
/** |
438
|
|
|
* Recursively iterate current level LMPs |
439
|
|
|
*/ |
440
|
|
|
foreach ($longestPrefixes as $longestPrefix => $strings) { |
441
|
|
|
$this->innerProcessor((string)$longestPrefix, $strings, $newChild); |
442
|
|
|
} |
443
|
|
|
} |
444
|
|
|
} |
445
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.