1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
4
|
|
|
* |
5
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
6
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
7
|
|
|
* suck out of working with CSV. |
8
|
|
|
* |
9
|
|
|
* @copyright Copyright (c) 2018 Luke Visinoni |
10
|
|
|
* @author Luke Visinoni <[email protected]> |
11
|
|
|
* @license See LICENSE file (MIT license) |
12
|
|
|
*/ |
13
|
|
|
namespace CSVelte\Sniffer; |
14
|
|
|
|
15
|
|
|
use function Noz\collect; |
16
|
|
|
use Noz\Collection\Collection; |
17
|
|
|
use function Stringy\create as s; |
18
|
|
|
|
19
|
|
|
class SniffDelimiterByConsistency extends AbstractSniffer
{
    /**
     * Guess delimiter in a string of data
     *
     * Guesses the delimiter character by analyzing the count consistency of possible delimiters across several lines.
     * For every candidate in the "delimiters" option, the number of occurrences per line is counted and the most
     * common (modal) count is taken as the "expected" count. The candidate whose per-line count equals its expected
     * count on the largest number of lines wins. It is possible for this sniffer to return multiple characters if
     * there is a tie.
     *
     * @param string $data The data to analyze
     *
     * @return string[] The winning delimiter(s), in the order they are listed in the "delimiters" option
     */
    public function sniff($data)
    {
        // build a table of characters and their frequencies for each line. We
        // will use this frequency table to then build a table of frequencies of
        // each frequency (in 10 lines, "tab" occurred 5 times on 7 of those
        // lines, 6 times on 2 lines, and 7 times on 1 line)

        $delimiters = $this->getOption('delimiters');
        $lineTerminator = $this->getOption('lineTerminator') ?: "\n";

        // candidate delimiters as array keys; identical for every line, so it is built once up front
        $preferred = array_flip($delimiters);

        // @todo it would probably make for more consistent results if you popped the last line since it will most likely be truncated due to the arbitrary nature of the sample size
        $lines = collect(explode($lineTerminator, $this->removeQuotedStrings($data)));
        $frequencies = $lines->map(function ($line) use ($preferred) {
            // start every candidate at zero so each per-line table has an entry
            // for every delimiter, then overlay the counts found on this line
            return collect($preferred)
                ->map(function () { return 0; })
                ->merge(collect(s($line)->chars())->frequency()->kintersect($preferred))
                ->toArray();
        });

        // now determine the mode for each char to decide the "expected" amount
        // of times a char (possible delim) will occur on each line...
        // NOTE(review): the argument-less filter() presumably drops candidates
        // whose expected count is falsy (e.g. 0) - confirm against Collection
        $modes = collect($delimiters)
            ->flip()
            ->map(function ($freq, $delim) use ($frequencies) {
                return $frequencies->getColumn($delim)->mode();
            })
            ->filter();

        // for each candidate, count the lines on which it occurred exactly the
        // expected number of times
        /** @var Collection $consistencies */
        $consistencies = $frequencies->recollect(function (Collection $accum, $freq) use ($modes) {
            $modes->each(function ($expected, $char) use ($accum, $freq) {
                // $freq is one line's [delimiter => count] table; it is wrapped
                // with collect() so that get() can be used on it
                if (collect($freq)->get($char) == $expected) {
                    $matches = $accum->get($char, 0);
                    $accum->set($char, ++$matches);
                }
            });

            return $accum;
        })
            ->sort()
            ->reverse();

        // keep only the candidate(s) sharing the highest line-match count.
        // max() is intentionally evaluated inside the callback, exactly as
        // before, so it is only reached when there is at least one entry
        $winners = $consistencies->filter(function ($freq) use ($consistencies) {
            return $freq === $consistencies->max();
        })
            ->keys();

        // return winners in order of preference
        return collect($delimiters)
            ->intersect($winners)
            ->values()
            ->toArray();
    }
}
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.