1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
4
|
|
|
* |
5
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
6
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
7
|
|
|
* suck out of working with CSV. |
8
|
|
|
* |
9
|
|
|
* @copyright Copyright (c) 2018 Luke Visinoni |
10
|
|
|
* @author Luke Visinoni <[email protected]> |
11
|
|
|
* @license See LICENSE file (MIT license) |
12
|
|
|
*/ |
13
|
|
|
namespace CSVelte; |
14
|
|
|
|
15
|
|
|
use CSVelte\Contract\Streamable; |
16
|
|
|
|
17
|
|
|
use CSVelte\Exception\SnifferException; |
18
|
|
|
use CSVelte\Sniffer\SniffDelimiterByConsistency; |
19
|
|
|
use CSVelte\Sniffer\SniffDelimiterByDistribution; |
20
|
|
|
use CSVelte\Sniffer\SniffHeaderByDataType; |
21
|
|
|
use CSVelte\Sniffer\SniffLineTerminatorByCount; |
22
|
|
|
use CSVelte\Sniffer\SniffQuoteAndDelimByAdjacency; |
23
|
|
|
use CSVelte\Sniffer\SniffQuoteStyle; |
24
|
|
|
use Noz\Collection\Collection; |
25
|
|
|
use function Noz\to_array; |
26
|
|
|
use RuntimeException; |
27
|
|
|
|
28
|
|
|
use function Noz\collect; |
29
|
|
|
use function Stringy\create as s; |
30
|
|
|
|
31
|
|
|
class Sniffer
{
    /** CSV data sample size - sniffer will use this many bytes to make its determinations */
    const SAMPLE_SIZE = 2500;

    /**
     * ASCII character codes for "invisibles".
     */
    const HORIZONTAL_TAB = 9;
    const LINE_FEED = 10;
    const CARRIAGE_RETURN = 13;
    const SPACE = 32;

    /**
     * @var array A list of possible delimiters to check for (in order of preference)
     */
    protected $delims = [',', "\t", ';', '|', ':', '-', '_', '#', '/', '\\', '$', '+', '=', '&', '@'];

    /**
     * @var Streamable A stream of the sample data
     */
    protected $stream;

    /**
     * Sniffer constructor.
     *
     * @param Streamable $stream The data to sniff
     * @param array|null $delims A list of possible delimiter characters in order of preference;
     *                           when null, the built-in default list is kept
     */
    public function __construct(Streamable $stream, $delims = null)
    {
        $this->stream = $stream;
        if (!is_null($delims)) {
            $this->setPossibleDelimiters($delims);
        }
    }

    /**
     * Set possible delimiter characters
     *
     * Only single-character values are kept; anything else is silently dropped
     * and the remaining values are re-indexed from zero.
     *
     * @param array $delims A list of possible delimiter characters
     *
     * @return self
     */
    public function setPossibleDelimiters(array $delims)
    {
        $this->delims = collect($delims)
            ->filter(static function ($val) {
                // A delimiter must be exactly one character long.
                return s($val)->length() === 1;
            })
            ->values()
            ->toArray();

        return $this;
    }

    /**
     * Get list of possible delimiter characters
     *
     * @return array
     */
    public function getPossibleDelimiters()
    {
        return $this->delims;
    }

    /**
     * Sniff CSV data (determine its dialect)
     *
     * Since CSV is less a format than a collection of similar formats, you can never be certain how a particular CSV
     * file is formatted. This method inspects CSV data and returns its "dialect", an object that can be passed to
     * either a `CSVelte\Reader` or `CSVelte\Writer` object to tell it what "dialect" of CSV to use.
     *
     * Up to SAMPLE_SIZE bytes are read from the stream. The line terminator is sniffed first, then quote character
     * and delimiter together. If that combined sniff fails with ERR_QUOTE_AND_DELIM, the quote character falls back
     * to a double quote and the delimiter is sniffed on its own.
     *
     * @todo look into which other Dialect attributes you can sniff for
     *
     * @throws RuntimeException If the stream read reports failure (returns false)
     * @throws SnifferException If quote/delimiter sniffing fails for any reason other than ERR_QUOTE_AND_DELIM
     *
     * @return Dialect
     */
    public function sniff()
    {
        $sample = $this->stream->read(static::SAMPLE_SIZE);
        if ($sample === false) {
            // Never hand a failed read to the sniffers: they expect string data.
            throw new RuntimeException('Unable to read sample data from stream.');
        }

        $lineTerminator = $this->sniffLineTerminator($sample);
        try {
            list($quoteChar, $delimiter) = $this->sniffQuoteAndDelim($sample, $lineTerminator);
        } catch (SnifferException $e) {
            if ($e->getCode() !== SnifferException::ERR_QUOTE_AND_DELIM) {
                throw $e;
            }
            // Combined sniff failed: assume the conventional quote character
            // and determine the delimiter by itself.
            $quoteChar = '"';
            $delimiter = $this->sniffDelimiter($sample, $lineTerminator);
        }

        // @todo Should this be null? Because doubleQuote = true means this = null
        $escapeChar = '\\';
        $quoteStyle = $this->sniffQuotingStyle($sample, $delimiter, $lineTerminator);
        $header     = $this->sniffHasHeader($sample, $delimiter, $lineTerminator);
        $encoding   = s($sample)->getEncoding();

        return new Dialect(compact('quoteChar', 'escapeChar', 'delimiter', 'lineTerminator', 'quoteStyle', 'header', 'encoding'));
    }

    /**
     * Sniff sample data for line terminator character
     *
     * Delegates to SniffLineTerminatorByCount.
     *
     * @param string $data The sample data
     *
     * @return string
     */
    protected function sniffLineTerminator($data)
    {
        $sniffer = new SniffLineTerminatorByCount();

        return $sniffer->sniff($data);
    }

    /**
     * Sniff quote and delimiter chars
     *
     * The best way to determine quote and delimiter characters is when columns
     * are quoted, often you can seek out a pattern of delim, quote, stuff, quote, delim
     * but this only works if you have quoted columns. If you don't you have to
     * determine these characters some other way... (see sniffDelimiter).
     *
     * Delegates to SniffQuoteAndDelimByAdjacency.
     *
     * @throws SnifferException
     *
     * @param string $data           The data to analyze
     * @param string $lineTerminator The line terminator char/sequence
     *
     * @return array A two-element array containing quotechar, delimchar
     */
    protected function sniffQuoteAndDelim($data, $lineTerminator)
    {
        $sniffer = new SniffQuoteAndDelimByAdjacency(compact('lineTerminator'));

        return $sniffer->sniff($data);
    }

    /**
     * Sniff the delimiter character on its own
     *
     * Runs SniffDelimiterByConsistency over the configured possible delimiters. If that
     * yields more than one candidate, SniffDelimiterByDistribution is run over just those
     * candidates and its result is returned; otherwise the current element of the
     * candidate list is returned.
     *
     * NOTE(review): if the consistency sniffer can return an empty array, `current()`
     * yields `false` here and that value flows into the Dialect - confirm whether the
     * sniffer guarantees at least one candidate.
     *
     * @param string $data           The data to analyze
     * @param string $lineTerminator The line terminator char/sequence
     *
     * @return string|false The delimiter character (presumably a string; see note above)
     */
    protected function sniffDelimiter($data, $lineTerminator)
    {
        $delimiters  = $this->getPossibleDelimiters();
        $consistency = new SniffDelimiterByConsistency(compact('lineTerminator', 'delimiters'));
        $winners     = $consistency->sniff($data);

        if (count($winners) > 1) {
            // Tie: narrow the candidate list to the winners and break it by distribution.
            $delimiters = $winners;

            return (new SniffDelimiterByDistribution(compact('lineTerminator', 'delimiters')))
                ->sniff($data);
        }

        return current($winners);
    }

    /**
     * Sniff the quoting style used by the sample data
     *
     * Delegates to SniffQuoteStyle, configured with the given line terminator and delimiter.
     *
     * @param string $data           The data to analyze
     * @param string $delimiter      The delimiter character
     * @param string $lineTerminator The line terminator char/sequence
     *
     * @return mixed Whatever SniffQuoteStyle::sniff() returns (passed to Dialect as "quoteStyle")
     */
    protected function sniffQuotingStyle($data, $delimiter, $lineTerminator)
    {
        $sniffer = new SniffQuoteStyle(compact('lineTerminator', 'delimiter'));

        return $sniffer->sniff($data);
    }

    /**
     * Sniff whether the sample data has a header row
     *
     * Delegates to SniffHeaderByDataType, configured with the given line terminator and delimiter.
     *
     * @param string $data           The data to analyze
     * @param string $delimiter      The delimiter character
     * @param string $lineTerminator The line terminator char/sequence
     *
     * @return mixed Whatever SniffHeaderByDataType::sniff() returns (passed to Dialect as "header";
     *               presumably a boolean - confirm against the sniffer)
     */
    protected function sniffHasHeader($data, $delimiter, $lineTerminator)
    {
        $sniffer = new SniffHeaderByDataType(compact('lineTerminator', 'delimiter'));

        return $sniffer->sniff($data);
    }
}
191
|
|
|
|
This check looks for type mismatches where the missing type is `false`. This is usually indicative of an error condition.

Consider the following example: `DateTime::createFromFormat()` either returns a new `DateTime` object or `false` if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned `false` before passing on the value to another function or method that may not be able to handle a `false`.