1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace PhpOffice\PhpSpreadsheet\Reader\Csv; |
4
|
|
|
|
5
|
|
|
/**
 * Infers which character is used as the field delimiter in a CSV stream.
 *
 * On construction a sample of lines is read from the supplied file handle and,
 * for every candidate delimiter, the number of occurrences per line is
 * recorded (text inside enclosures is ignored). infer() then picks the
 * candidate whose per-line count is the most consistent.
 *
 * The file handle is read from, but never rewound or closed, by this class.
 */
class Delimiter
{
    /**
     * Candidate delimiters, in order of preference: earlier entries win ties
     * in infer(), and the first entry is the default delimiter.
     *
     * The (misspelt) name is kept as-is because the constant is protected and
     * may be referenced by subclasses.
     */
    protected const POTENTIAL_DELIMETERS = [',', ';', "\t", '|', ':', ' ', '~'];

    /**
     * Maximum number of logical lines sampled when counting delimiters.
     */
    protected const MAX_SAMPLE_LINES = 1000;

    /**
     * Stream the lines are read from with fgets().
     *
     * @var resource
     */
    protected $fileHandle;

    /**
     * Character that, when placed directly before an enclosure character,
     * stops it from being treated as an enclosure. An empty string disables
     * this check.
     *
     * @var string
     */
    protected $escapeCharacter;

    /**
     * Character that wraps field values; delimiters inside a pair of
     * enclosures are not counted.
     *
     * @var string
     */
    protected $enclosure;

    /**
     * For each candidate delimiter, the number of occurrences found in each
     * sampled line (one list entry per line).
     *
     * @var array<string, int[]>
     */
    protected $counts = [];

    /**
     * Number of logical lines that were sampled.
     *
     * @var int
     */
    protected $numberLines = 0;

    /**
     * Delimiter selected by the most recent successful call to infer().
     *
     * @var null|string
     */
    protected $delimiter;

    /**
     * Stores the reader settings and immediately samples the stream.
     *
     * @param resource $fileHandle Open, readable stream positioned where sampling should start
     * @param string $escapeCharacter Escape character ('' for none)
     * @param string $enclosure Enclosure character
     */
    public function __construct($fileHandle, $escapeCharacter, $enclosure)
    {
        $this->fileHandle = $fileHandle;
        $this->escapeCharacter = $escapeCharacter;
        $this->enclosure = $enclosure;

        $this->countPotentialDelimiters();
    }

    /**
     * Returns the delimiter to fall back on when none can be inferred:
     * the first entry of the candidate list.
     */
    public function getDefaultDelimiter(): string
    {
        return self::POTENTIAL_DELIMETERS[0];
    }

    /**
     * Returns how many logical lines were sampled from the stream.
     */
    public function linesCounted(): int
    {
        return $this->numberLines;
    }

    /**
     * Reads up to MAX_SAMPLE_LINES logical lines from the stream and records,
     * per line, how often each candidate delimiter occurs.
     *
     * Every line that increments the line counter is also tallied, so the
     * counter always equals the length of each per-delimiter series.
     */
    protected function countPotentialDelimiters(): void
    {
        $this->counts = array_fill_keys(self::POTENTIAL_DELIMETERS, []);
        $delimiterKeys = array_flip(self::POTENTIAL_DELIMETERS);

        // Count how many times each of the potential delimiters appears in each line
        $this->numberLines = 0;
        while ($this->numberLines < self::MAX_SAMPLE_LINES) {
            $line = $this->getNextLine();
            if ($line === false) {
                break;
            }

            ++$this->numberLines;
            $this->countDelimiterValues($line, $delimiterKeys);
        }
    }

    /**
     * Appends the occurrence count of every candidate delimiter in $line to
     * the per-delimiter series (0 when the delimiter does not occur).
     *
     * @param string $line Line content with enclosed text already removed
     * @param array<string, int> $delimiterKeys Candidate delimiters as array keys
     */
    protected function countDelimiterValues(string $line, array $delimiterKeys): void
    {
        // Byte-wise histogram of the line, reduced to the candidate delimiters.
        $distribution = array_count_values(str_split($line, 1));
        $countLine = array_intersect_key($distribution, $delimiterKeys);

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $this->counts[$delimiter][] = $countLine[$delimiter] ?? 0;
        }
    }

    /**
     * Picks the candidate delimiter whose per-line occurrence count deviates
     * least from its median.
     *
     * Candidates whose median count is 0 (i.e. not found consistently) are
     * ignored. On ties the order of the candidate list decides.
     *
     * @return null|string The inferred delimiter, or the previously stored
     *                     delimiter (null by default) when no candidate qualifies
     */
    public function infer(): ?string
    {
        $min = INF;

        foreach (self::POTENTIAL_DELIMETERS as $delimiter) {
            $series = $this->counts[$delimiter] ?? [];
            $sampleSize = count($series);
            if ($sampleSize === 0) {
                // Nothing was sampled: there is no median to compute.
                continue;
            }

            sort($series);

            // The median is derived from the series itself so that it can
            // never index outside of the recorded counts.
            $middleIdx = intdiv($sampleSize - 1, 2);
            $median = ($sampleSize % 2)
                ? $series[$middleIdx]
                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

            // Integer division result: int 0 means at least half the lines
            // do not contain this delimiter at all.
            if ($median === 0) {
                continue;
            }

            $meanSquareDeviation = array_reduce(
                $series,
                static function ($sum, $value) use ($median) {
                    return $sum + ($value - $median) ** 2;
                },
                0
            ) / $sampleSize;

            // Strict comparison keeps the earlier candidate on ties.
            if ($meanSquareDeviation < $min) {
                $min = $meanSquareDeviation;
                $this->delimiter = $delimiter;
            }
        }

        return $this->delimiter;
    }

    /**
     * Get the next full line from the file.
     *
     * Physical lines are joined for as long as an enclosure is still open, and
     * everything between a pair of enclosures (the enclosures included) is
     * removed from the returned text.
     *
     * If the stream ends while an enclosure is still open, the text gathered so
     * far is discarded and false is returned.
     *
     * @return false|string The line with enclosed text removed, or false when
     *                      there is no further line
     */
    public function getNextLine()
    {
        // Without an enclosure character nothing can be enclosed. The patterns
        // below would match the empty string on every line and swallow the
        // whole stream, so hand back the physical line untouched.
        if ((string) $this->enclosure === '') {
            return fgets($this->fileHandle);
        }

        $enclosure = ($this->escapeCharacter === '' ? ''
                : ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
            . preg_quote($this->enclosure, '/');
        $enclosedTextPattern = '/(' . $enclosure . '.*' . $enclosure . ')/Us';
        $openEnclosurePattern = '/(' . $enclosure . ')/';

        $line = '';

        do {
            // Get the next line in the file
            $newLine = fgets($this->fileHandle);

            // Return false if there is no next line
            if ($newLine === false) {
                return false;
            }

            // Add the new line to what has been read so far
            $line .= $newLine;

            // Drop everything that is enclosed to avoid counting false positives in enclosures.
            // preg_replace() yields null on a PCRE error; keep the text as-is in that case
            // so that a string is always returned.
            $stripped = preg_replace($enclosedTextPattern, '', $line);
            if ($stripped !== null) {
                $line = $stripped;
            }

            // See if we have any enclosures left in the line
            // if we still have an enclosure then we need to read the next line as well
        } while (preg_match($openEnclosurePattern, $line) > 0);

        return $line;
    }
}
145
|
|
|
|