1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
4
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
5
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
6
|
|
|
* suck out of working with CSV. |
7
|
|
|
* |
8
|
|
|
* @version v0.2.1 |
9
|
|
|
* @copyright Copyright (c) 2016 Luke Visinoni <[email protected]> |
10
|
|
|
* @author Luke Visinoni <[email protected]> |
11
|
|
|
* @license https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT) |
12
|
|
|
*/ |
13
|
|
|
namespace CSVelte; |
14
|
|
|
|
15
|
|
|
use \DateTime; |
16
|
|
|
use CSVelte\Contract\Streamable; |
17
|
|
|
|
18
|
|
|
use \Exception; |
19
|
|
|
use \OutOfBoundsException; |
20
|
|
|
use CSVelte\Exception\TasterException; |
21
|
|
|
|
22
|
|
|
use function CSVelte\collect; |
23
|
|
|
|
24
|
|
|
/** |
25
|
|
|
* CSVelte\Taster |
26
|
|
|
* Given CSV data, Taster will "taste" the data and provide its buest guess at |
27
|
|
|
* its "flavor". In other words, this class inspects CSV data and attempts to |
28
|
|
|
* auto-detect various CSV attributes such as line endings, quote characters, etc.. |
29
|
|
|
* |
30
|
|
|
* @package CSVelte |
31
|
|
|
* @copyright (c) 2016, Luke Visinoni <[email protected]> |
32
|
|
|
* @author Luke Visinoni <[email protected]> |
33
|
|
|
* @todo There are a ton of improvements that could be made to this class. |
34
|
|
|
* I'll do a refactor on this fella once I get at least one test |
35
|
|
|
* passing for each of its public methods. |
36
|
|
|
* @todo Should I have a lickEscapeChar method? The python version doesn't |
37
|
|
|
* have one. But then why does it even bother including one in its |
38
|
|
|
* flavor class? |
39
|
|
|
* @todo Examine each of the public methods in this class and determine |
40
|
|
|
* whether it makes sense to ask for the data as a param rather than |
41
|
|
|
* just pulling it from source. I don't think it makes sense... it |
42
|
|
|
* was just easier to write the methods that way during testing. |
43
|
|
|
* @todo There are at least portions of this class that could use the |
44
|
|
|
* Reader class rather than working directly with data. |
45
|
|
|
* @todo Refactor all of the anonymous functions used as callbacks. Rather |
46
|
|
|
* than passing $this all over, use $closure->bindTo() instead... |
47
|
|
|
* Actually, write a method called getBoundClosure() or something... |
48
|
|
|
* maybe even make it a trait I don't know yet. But here it would |
49
|
|
|
* allow me to bind any anon function to $this and give me a certain |
50
|
|
|
* set of commonly needed values ($delim, $eol, etc.) |
51
|
|
|
*/ |
52
|
|
|
class Taster |
53
|
|
|
{ |
54
|
|
|
/** |
55
|
|
|
* End-of-line constants |
56
|
|
|
*/ |
57
|
|
|
const EOL_UNIX = 'lf'; |
58
|
|
|
const EOL_TRS80 = 'cr'; |
59
|
|
|
const EOL_WINDOWS = 'crlf'; |
60
|
|
|
|
61
|
|
|
/** |
62
|
|
|
* ASCII character codes for "invisibles" |
63
|
|
|
*/ |
64
|
|
|
const HORIZONTAL_TAB = 9; |
65
|
|
|
const LINE_FEED = 10; |
66
|
|
|
const CARRIAGE_RETURN = 13; |
67
|
|
|
const SPACE = 32; |
68
|
|
|
|
69
|
|
|
/** |
70
|
|
|
* Data types -- Used within the lickQuotingStyle method |
71
|
|
|
*/ |
72
|
|
|
const DATA_NONNUMERIC = 'nonnumeric'; |
73
|
|
|
const DATA_SPECIAL = 'special'; |
74
|
|
|
const DATA_UNKNOWN = 'unknown'; |
75
|
|
|
|
76
|
|
|
/** |
77
|
|
|
* Placeholder strings -- hold the place of newlines and delimiters contained |
78
|
|
|
* within quoted text so that the explode method doesn't split incorrectly |
79
|
|
|
*/ |
80
|
|
|
const PLACEHOLDER_NEWLINE = '[__NEWLINE__]'; |
81
|
|
|
const PLACEHOLDER_DELIM = '[__DELIM__]'; |
82
|
|
|
|
83
|
|
|
/** |
84
|
|
|
* Recommended data sample size |
85
|
|
|
*/ |
86
|
|
|
const SAMPLE_SIZE = 2500; |
87
|
|
|
|
88
|
|
|
/** |
89
|
|
|
* Column data types -- used within the lickHeader method to determine |
90
|
|
|
* whether the first row contains different types of data than the rest of |
91
|
|
|
* the rows (and thus, is likely a header row) |
92
|
|
|
*/ |
93
|
|
|
// +-987 |
94
|
|
|
const TYPE_NUMBER = 'number'; |
95
|
|
|
// +-12.387 |
96
|
|
|
const TYPE_DOUBLE = 'double'; |
97
|
|
|
// I am a string. I can contain all kinds of stuff. |
98
|
|
|
const TYPE_STRING = 'string'; |
99
|
|
|
// 10-Jul-15, 9/1/2007, April 1st, 2006, etc. |
|
|
|
|
100
|
|
|
const TYPE_DATE = 'date'; |
101
|
|
|
// 10:00pm, 5pm, 13:08, etc. |
|
|
|
|
102
|
|
|
const TYPE_TIME = 'time'; |
103
|
|
|
// $98.96, ¥12389, £6.08, €87.00 |
|
|
|
|
104
|
|
|
const TYPE_CURRENCY = 'currency'; |
105
|
|
|
// 12ab44m1n2_asdf |
106
|
|
|
const TYPE_ALNUM = 'alnum'; |
107
|
|
|
// abababab |
108
|
|
|
const TYPE_ALPHA = 'alpha'; |
109
|
|
|
|
110
|
|
|
/** @var \CSVelte\Contract\Streamable The source of data to examine */ |
111
|
|
|
protected $input; |
112
|
|
|
|
113
|
|
|
/** @var string Sample of CSV data to use for tasting (determining CSV flavor) */ |
114
|
|
|
protected $sample; |
115
|
|
|
|
116
|
|
|
/** @var array Possible delimiter characters in (roughly) the order of likelihood */ |
117
|
|
|
protected $delims = [",", "\t", ";", "|", ":", "-", "_", "#", "/", '\\', '$', '+', '=', '&', '@']; |
118
|
|
|
|
119
|
|
|
/** |
120
|
|
|
* Class constructor--accepts a CSV input source |
121
|
|
|
* |
122
|
|
|
* @param \CSVelte\Contract\Streamable The source of CSV data |
123
|
|
|
* @todo It may be a good idea to skip the first line or two for the sample |
124
|
|
|
* so that the header line(s) don't throw things off (with the exception |
125
|
|
|
* of lickHeader() obviously) |
126
|
|
|
*/ |
127
|
22 |
|
public function __construct(Streamable $input) |
128
|
|
|
{ |
129
|
22 |
|
$this->input = $input; |
130
|
22 |
|
if (!$this->sample = $input->read(self::SAMPLE_SIZE)) { |
|
|
|
|
131
|
1 |
|
throw new TasterException("Invalid input, cannot read sample.", TasterException::ERR_INVALID_SAMPLE); |
132
|
|
|
} |
133
|
21 |
|
} |
134
|
|
|
|
135
|
|
|
/** |
136
|
|
|
* "Invoke" magic method. |
137
|
|
|
* |
138
|
|
|
* Called when an object is invoked as if it were a function. So, for instance, |
139
|
|
|
* $taster = new Taster(); |
140
|
|
|
* $taster(); <-- invoke |
141
|
|
|
* This is imply an alias to the lick method. |
142
|
|
|
* |
143
|
|
|
* @return \CSVelte\Flavor A flavor object |
144
|
|
|
* @throws \CSVelte\Exception\TasterException |
145
|
|
|
*/ |
146
|
20 |
|
public function __invoke() |
147
|
|
|
{ |
148
|
20 |
|
return $this->lick(); |
149
|
|
|
} |
150
|
|
|
|
151
|
|
|
/** |
152
|
|
|
* Examine the input source and determine what "Flavor" of CSV it contains. |
153
|
|
|
* The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180), |
154
|
|
|
* doesn't necessarily always conform to it. And it doesn't provide meta such as the delimiting character, quote character, or what types of data are quoted. |
155
|
|
|
* such as the delimiting character, quote character, or what types of data are quoted. |
156
|
|
|
* are quoted. |
157
|
|
|
* |
158
|
|
|
* @return \CSVelte\Flavor The metadata that the CSV format doesn't provide |
159
|
|
|
* @throws \CSVelte\Exception\TasterException |
160
|
|
|
* @todo Implement a lickQuote method for when lickQuoteAndDelim method fails |
161
|
|
|
* @todo Should there bea lickEscapeChar method? the python module that inspired |
162
|
|
|
* this library doesn't include one... |
163
|
|
|
* @todo This should cache the results and only regenerate if $this->sample |
164
|
|
|
* changes (or $this->input) |
165
|
|
|
*/ |
166
|
21 |
|
public function lick() |
167
|
|
|
{ |
168
|
21 |
|
$lineTerminator = $this->lickLineEndings(); |
169
|
|
|
try { |
170
|
21 |
|
list($quoteChar, $delimiter) = $this->lickQuoteAndDelim(); |
171
|
21 |
|
} catch (TasterException $e) { |
172
|
6 |
|
if ($e->getCode() !== TasterException::ERR_QUOTE_AND_DELIM) throw $e; |
173
|
6 |
|
$quoteChar = '"'; |
174
|
6 |
|
$delimiter = $this->lickDelimiter($lineTerminator); |
175
|
|
|
} |
176
|
|
|
/** |
177
|
|
|
* @todo Should this be null? Because doubleQuote = true means this = null |
178
|
|
|
*/ |
179
|
21 |
|
$escapeChar = '\\'; |
180
|
21 |
|
$quoteStyle = $this->lickQuotingStyle($delimiter, $lineTerminator); |
181
|
21 |
|
$header = $this->lickHeader($delimiter, $lineTerminator); |
182
|
21 |
|
return new Flavor(compact('quoteChar', 'escapeChar', 'delimiter', 'lineTerminator', 'quoteStyle', 'header')); |
183
|
|
|
} |
184
|
|
|
|
185
|
|
|
/** |
186
|
|
|
* Replaces all quoted columns with a blank string. I was using this method |
187
|
|
|
* to prevent explode() from incorrectly splitting at delimiters and newlines |
188
|
|
|
* within quotes when parsing a file. But this was before I wrote the |
189
|
|
|
* replaceQuotedSpecialChars method which (at least to me) makes more sense. |
190
|
|
|
* |
191
|
|
|
* @param string The string to replace quoted strings within |
192
|
|
|
* @return string The input string with quoted strings removed |
193
|
|
|
* @todo Replace code that uses this method with the replaceQuotedSpecialChars |
194
|
|
|
* method instead. I think it's cleaner. |
195
|
|
|
*/ |
196
|
21 |
|
protected function removeQuotedStrings($data) |
197
|
|
|
{ |
198
|
21 |
|
return preg_replace($pattern = '/(["\'])(?:(?=(\\\\?))\2.)*?\1/sm', $replace = '', $data); |
199
|
|
|
} |
200
|
|
|
|
201
|
|
|
/** |
202
|
|
|
* Examine the input source to determine which character(s) are being used |
203
|
|
|
* as the end-of-line character |
204
|
|
|
* |
205
|
|
|
* @return string The end-of-line char for the input data |
206
|
|
|
* @credit pulled from stackoverflow thread *tips hat to username "Harm"* |
207
|
|
|
* @todo This should throw an exception if it cannot determine the line ending |
208
|
|
|
* @todo I probably will make this method protected when I'm done with testing... |
209
|
|
|
* @todo If there is any way for this method to fail (for instance if a file ) |
210
|
|
|
* is totally empty or contains no line breaks), then it needs to throw |
211
|
|
|
* a relevant TasterException |
212
|
|
|
* @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings() |
213
|
|
|
*/ |
214
|
21 |
|
protected function lickLineEndings() |
215
|
|
|
{ |
216
|
21 |
|
$str = $this->removeQuotedStrings($this->sample); |
217
|
|
|
$eols = [ |
218
|
21 |
|
self::EOL_WINDOWS => "\r\n", // 0x0D - 0x0A - Windows, DOS OS/2 |
219
|
21 |
|
self::EOL_UNIX => "\n", // 0x0A - - Unix, OSX |
220
|
21 |
|
self::EOL_TRS80 => "\r", // 0x0D - - Apple ][, TRS80 |
221
|
21 |
|
]; |
222
|
|
|
|
223
|
21 |
|
$curCount = 0; |
224
|
|
|
// @todo This should return a default maybe? |
225
|
21 |
|
$curEol = PHP_EOL; |
226
|
21 |
|
foreach($eols as $k => $eol) { |
227
|
21 |
|
if( ($count = substr_count($str, $eol)) > $curCount) { |
228
|
21 |
|
$curCount = $count; |
229
|
21 |
|
$curEol = $eol; |
230
|
21 |
|
} |
231
|
21 |
|
} |
232
|
21 |
|
return $curEol; |
233
|
|
|
} |
234
|
|
|
|
235
|
|
|
/** |
236
|
|
|
* The best way to determine quote and delimiter characters is when columns |
237
|
|
|
* are quoted, often you can seek out a pattern of delim, quote, stuff, quote, delim |
238
|
|
|
* but this only works if you have quoted columns. If you don't you have to |
239
|
|
|
* determine these characters some other way... (see lickDelimiter) |
240
|
|
|
* |
241
|
|
|
* @return array A two-row array containing quotechar, delimchar |
242
|
|
|
* @todo make protected |
243
|
|
|
* @todo This should throw an exception if it cannot determine the delimiter |
244
|
|
|
* this way. |
245
|
|
|
* @todo This should check for any line endings not just \n |
246
|
|
|
*/ |
247
|
21 |
|
protected function lickQuoteAndDelim() |
248
|
|
|
{ |
249
|
|
|
/** |
250
|
|
|
* @var array An array of pattern matches |
251
|
|
|
*/ |
252
|
21 |
|
$matches = null; |
253
|
|
|
/** |
254
|
|
|
* @var array An array of patterns (regex) |
255
|
|
|
*/ |
256
|
21 |
|
$patterns = []; |
257
|
|
|
// delim can be anything but line breaks, quotes, alphanumeric, underscore, backslash, or any type of spaces |
258
|
21 |
|
$antidelims = implode(array("\r", "\n", "\w", preg_quote('"', '/'), preg_quote("'", '/')/*, preg_quote('\\', '/')*/, preg_quote(chr(self::SPACE), '/'))); |
|
|
|
|
259
|
21 |
|
$delim = '(?P<delim>[^' . $antidelims . '])'; |
260
|
21 |
|
$quote = '(?P<quoteChar>"|\'|`)'; // @todo I think MS Excel uses some strange encoding for fancy open/close quotes |
261
|
21 |
|
$patterns[] = '/' . $delim . ' ?' . $quote . '.*?\2\1/ms'; // ,"something", - anything but whitespace or quotes followed by a possible space followed by a quote followed by anything followed by same quote, followed by same anything but whitespace |
262
|
21 |
|
$patterns[] = '/(?:^|\n)' . $quote . '.*?\1' . $delim . ' ?/ms'; // 'something', - beginning of line or line break, followed by quote followed by anything followed by quote followed by anything but whitespace or quotes |
263
|
21 |
|
$patterns[] = '/' . $delim . ' ?' . $quote . '.*?\2(?:^|\n)/ms'; // ,'something' - anything but whitespace or quote followed by possible space followed by quote followed by anything followed by quote, followed by end of line |
264
|
21 |
|
$patterns[] = '/(?:^|\n)' . $quote . '.*?\2(?:$|\n)/ms'; // 'something' - beginning of line followed by quote followed by anything followed by quote followed by same quote followed by end of line |
265
|
21 |
|
foreach ($patterns as $pattern) { |
266
|
|
|
// @todo I had to add the error suppression char here because it was |
267
|
|
|
// causing undefined offset errors with certain data sets. strange... |
268
|
21 |
|
if (@preg_match_all($pattern, $this->sample, $matches) && $matches) break; |
269
|
21 |
|
} |
270
|
21 |
|
if ($matches) { |
271
|
|
|
try { |
272
|
|
|
return [ |
273
|
21 |
|
collect($matches) |
274
|
21 |
|
->frequency() |
275
|
21 |
|
->get('quoteChar') |
|
|
|
|
276
|
21 |
|
->sort() |
277
|
21 |
|
->reverse() |
278
|
21 |
|
->getKeyAtPosition(0), |
279
|
18 |
|
collect($matches) |
280
|
18 |
|
->frequency() |
281
|
18 |
|
->get('delim') |
|
|
|
|
282
|
18 |
|
->sort() |
283
|
18 |
|
->reverse() |
284
|
18 |
|
->getKeyAtPosition(0) |
285
|
18 |
|
]; |
286
|
6 |
|
} catch (OutOfBoundsException $e) { |
287
|
|
|
// eat this exception and let the taster exception below be thrown instead... |
288
|
|
|
} |
289
|
6 |
|
} |
290
|
6 |
|
throw new TasterException("quoteChar and delimiter cannot be determined", TasterException::ERR_QUOTE_AND_DELIM); |
291
|
|
|
} |
292
|
|
|
|
293
|
|
|
/** |
294
|
|
|
* Take a list of likely delimiter characters and find the one that occurs |
295
|
|
|
* the most consistent amount of times within the provided data. |
296
|
|
|
* |
297
|
|
|
* @param string The character(s) used for newlines |
298
|
|
|
* @return string One of four Flavor::QUOTING_* constants |
299
|
|
|
* @see \CSVelte\Flavor for possible quote style constants |
300
|
|
|
* @todo Refactor this method--It needs more thorough testing against a wider |
301
|
|
|
* variety of CSV data to be sure it works reliably. And I'm sure there |
302
|
|
|
* are many performance and logic improvements that could be made. This |
303
|
|
|
* is essentially a first draft. |
304
|
|
|
* @todo Can't use replaceQuotedSpecialChars rather than removeQuotedStrings |
305
|
|
|
* because the former requires u to know the delimiter |
306
|
|
|
*/ |
307
|
6 |
|
protected function lickDelimiter($eol = "\n") |
308
|
|
|
{ |
309
|
6 |
|
$frequencies = []; |
310
|
6 |
|
$consistencies = []; |
311
|
|
|
|
312
|
|
|
// build a table of characters and their frequencies for each line. We |
313
|
|
|
// will use this frequency table to then build a table of frequencies of |
314
|
|
|
// each frequency (in 10 lines, "tab" occurred 5 times on 7 of those |
315
|
|
|
// lines, 6 times on 2 lines, and 7 times on 1 line) |
316
|
6 |
|
collect(explode($eol, $this->removeQuotedStrings($this->sample))) |
317
|
|
|
->walk(function($line, $line_no) use (&$frequencies) { |
318
|
6 |
|
collect(str_split($line)) |
319
|
|
|
->filter(function($c) { return collect($this->delims)->contains($c); }) |
320
|
6 |
|
->frequency() |
321
|
6 |
|
->sort() |
322
|
6 |
|
->reverse() |
323
|
|
|
->walk(function($count, $char) use (&$frequencies, $line_no) { |
324
|
6 |
|
$frequencies[$char][$line_no] = $count; |
325
|
6 |
|
}); |
326
|
6 |
|
}) |
327
|
|
|
// the above only finds frequencies for characters if they exist in |
328
|
|
|
// a given line. This will go back and fill in zeroes where a char |
329
|
|
|
// didn't occur at all in a given line (needed to determine mode) |
330
|
|
|
->walk(function($line, $line_no) use (&$frequencies) { |
331
|
6 |
|
collect($frequencies) |
332
|
|
|
->walk(function($counts, $char) use ($line_no, &$frequencies) { |
333
|
6 |
|
if (!isset($frequencies[$char][$line_no])) { |
334
|
6 |
|
$frequencies[$char][$line_no] = 0; |
335
|
6 |
|
} |
336
|
6 |
|
}); |
337
|
6 |
|
}); |
338
|
|
|
|
339
|
|
|
// now determine the mode for each char to decide the "expected" amount |
340
|
|
|
// of times a char (possible delim) will occur on each line... |
341
|
6 |
|
$freqs = collect($frequencies); |
342
|
6 |
|
$modes = $freqs->mode(); |
343
|
|
|
$freqs->walk(function($f, $chr) use ($modes, &$consistencies) { |
344
|
|
|
collect($f)->walk(function($num) use ($modes, $chr, &$consistencies) { |
345
|
6 |
|
if ($expected = $modes->get($chr)) { |
346
|
6 |
|
if ($num == $expected) { |
347
|
|
|
// met the goal, yay! |
348
|
6 |
|
if (!isset($consistencies[$chr])) { |
349
|
6 |
|
$consistencies[$chr] = 0; |
350
|
6 |
|
} |
351
|
6 |
|
$consistencies[$chr]++; |
352
|
6 |
|
} |
353
|
6 |
|
} |
354
|
6 |
|
}); |
355
|
6 |
|
}); |
356
|
|
|
|
357
|
6 |
|
$delims = collect($consistencies); |
358
|
6 |
|
$max = $delims->max(); |
359
|
6 |
|
$dups = $delims->duplicates(); |
360
|
6 |
|
if ($dups->has($max, false)) { |
361
|
|
|
// if more than one candidate, then look at where the character appeared |
362
|
|
|
// in the data. Was it relatively evenly distributed or was there a |
363
|
|
|
// specific area that the character tended to appear? Dates will have a |
364
|
|
|
// consistent format (e.g. 04-23-1986) and so may easily provide a false |
365
|
|
|
// positive for delimiter. But the dash will be focused in that one area, |
366
|
|
|
// whereas the comma character is spread out. You can determine this by |
367
|
|
|
// finding out the number of chars between each occurrence and getting |
368
|
|
|
// the average. If the average is wildly different than any given distance |
369
|
|
|
// than bingo you probably aren't working with a delimiter there... |
370
|
|
|
|
371
|
|
|
// another option to find the delimiter if there is a tie, is to build |
372
|
|
|
// a table of character position within each line. Then use that to |
373
|
|
|
// determine if one character is consistently in the same position or |
374
|
|
|
// at least the same general area. Use the delimiter that is the most |
375
|
|
|
// consistent in that way... |
376
|
|
|
|
377
|
|
|
/** |
378
|
|
|
* @todo Add a method here to figure out where duplicate best-match |
379
|
|
|
* delimiter(s) fall within each line and then, depending on |
380
|
|
|
* which one has the best distribution, return that one. |
381
|
|
|
*/ |
382
|
|
|
|
383
|
|
|
$decision = $dups->get($max); |
384
|
|
|
try { |
385
|
|
|
return $this->guessDelimByDistribution($decision, $eol); |
386
|
|
|
} catch (TasterException $e) { |
387
|
|
|
// if somehow we STILL can't come to a consensus, then fall back to a |
388
|
|
|
// "preferred delimiters" list... |
389
|
|
|
foreach ($this->delims as $key => $val) { |
390
|
|
|
if ($delim = array_search($val, $decision)) return $delim; |
391
|
|
|
} |
392
|
|
|
} |
393
|
|
|
} |
394
|
|
|
return $delims |
395
|
6 |
|
->sort() |
396
|
6 |
|
->getKeyAtPosition(0); |
397
|
|
|
} |
398
|
|
|
|
399
|
|
|
/** |
400
|
|
|
* Compare positional consistency of several characters to determine the |
401
|
|
|
* probable delimiter character. The idea behind this is that the delimiter |
402
|
|
|
* character is likely more consistently distributed than false-positive |
403
|
|
|
* delimiter characters produced by lickDelimiter(). For instance, consider |
404
|
|
|
* a series of rows similar to the following: |
405
|
|
|
* |
406
|
|
|
* 1,luke,visinoni,[email protected],(530) 413-3076,04-23-1986 |
407
|
|
|
* |
408
|
|
|
* The lickDelimiter() method will often not be able to determine whether the |
409
|
|
|
* delimiter is a comma or a dash because they occur the same number of times |
410
|
|
|
* on just about every line (5 for comma, 3 for dash). The difference is |
411
|
|
|
* obvious to you, no doubt. But us humans are pattern-recognition machines! |
412
|
|
|
* The difference between the comma and the dash are that the comma is dist- |
413
|
|
|
* ributed almost evenly throughout the line. The dash characters occur |
414
|
|
|
* entirely at the end of the line. This method accepts any number of possible |
415
|
|
|
* delimiter characters and returns the one that is distributed |
416
|
|
|
* |
417
|
|
|
* If delim character cannot be determined by lickQuoteAndDelim(), taster |
418
|
|
|
* tries lickDelimiter(). When that method runs into a tie, it will use this |
419
|
|
|
* as a tie-breaker. |
420
|
|
|
* |
421
|
|
|
* @param array $delims Possible delimiter characters (method chooses from |
422
|
|
|
* this array of characters) |
423
|
|
|
* @return string The probable delimiter character |
424
|
|
|
*/ |
425
|
|
|
protected function guessDelimByDistribution(array $delims, $eol = "\n") |
426
|
|
|
{ |
427
|
|
|
try { |
428
|
|
|
// @todo Write a method that does this... |
429
|
|
|
$lines = collect(explode($eol, $this->removeQuotedStrings($this->sample))); |
430
|
|
|
return $delims[collect($delims)->map(function($delim) use (&$distrib, $lines) { |
431
|
|
|
$linedist = collect(); |
432
|
|
|
$lines->walk(function($line, $line_no) use (&$linedist, $delim) { |
433
|
|
|
if (!strlen($line)) return; |
434
|
|
|
$sectstot = 10; |
435
|
|
|
$sectlen = (int) (strlen($line) / $sectstot); |
436
|
|
|
$sections = collect(str_split($line, $sectlen)) |
437
|
|
|
->map(function($section) use($delim) { |
438
|
|
|
return substr_count($section, $delim); |
439
|
|
|
}) |
440
|
|
|
->filter(function($count) { return (bool) $count; }); |
441
|
|
|
if (is_numeric($count = $sections->count())) { |
442
|
|
|
$linedist->set($line_no, $count / $sectstot); |
443
|
|
|
} |
444
|
|
|
}); |
445
|
|
|
return $linedist; |
446
|
|
|
})->map(function($dists) { |
447
|
|
|
return $dists->average(); |
448
|
|
|
})->sort() |
449
|
|
|
->reverse() |
450
|
|
|
->getKeyAtPosition(0)]; |
451
|
|
|
} catch (Exception $e) { |
452
|
|
|
throw new TasterException("delimiter cannot be determined by distribution", TasterException::ERR_DELIMITER); |
453
|
|
|
} |
454
|
|
|
} |
455
|
|
|
|
456
|
|
|
/** |
457
|
|
|
* Determine the "style" of data quoting. The CSV format, while having an RFC |
458
|
|
|
* (https://tools.ietf.org/html/rfc4180), doesn't necessarily always conform |
459
|
|
|
* to it. And it doesn't provide metadata such as the delimiting character, |
460
|
|
|
* quote character, or what types of data are quoted. So this method makes a |
461
|
|
|
* logical guess by finding which columns have been quoted (if any) and |
462
|
|
|
* examining their data type. Most often, CSV files will only use quotes |
463
|
|
|
* around columns that contain special characters such as the dilimiter, |
464
|
|
|
* the quoting character, newlines, etc. (we refer to this style as ) |
465
|
|
|
* QUOTE_MINIMAL), but some quote all columns that contain nonnumeric data |
466
|
|
|
* (QUOTE_NONNUMERIC). Then there are CSV files that quote all columns |
467
|
|
|
* (QUOTE_ALL) and those that quote none (QUOTE_NONE). |
468
|
|
|
* |
469
|
|
|
* @param string $delim The character used as the column delimiter |
470
|
|
|
* @param string $eol The character used for newlines |
471
|
|
|
* @return string One of four "QUOTING_" constants defined above--see this |
472
|
|
|
* method's description for more info. |
473
|
|
|
* @todo Refactor this method--It needs more thorough testing against a wider |
474
|
|
|
* variety of CSV data to be sure it works reliably. And I'm sure there |
475
|
|
|
* are many performance and logic improvements that could be made. This |
476
|
|
|
* is essentially a first draft. |
477
|
|
|
*/ |
478
|
21 |
|
protected function lickQuotingStyle($delim, $eol) |
479
|
|
|
{ |
480
|
21 |
|
$quoting_styles = collect([ |
481
|
21 |
|
Flavor::QUOTE_ALL => true, |
482
|
21 |
|
Flavor::QUOTE_NONE => true, |
483
|
21 |
|
Flavor::QUOTE_MINIMAL => true, |
484
|
21 |
|
Flavor::QUOTE_NONNUMERIC => true, |
485
|
21 |
|
]); |
486
|
|
|
|
487
|
21 |
|
$lines = collect(explode($eol, $this->replaceQuotedSpecialChars($this->sample, $delim))); |
488
|
21 |
|
$freq = collect() |
489
|
21 |
|
->set('quoted', collect()) |
490
|
21 |
|
->set('unquoted', collect()); |
491
|
|
|
|
492
|
|
|
// walk through each line from the data sample to determine which fields |
493
|
|
|
// are quoted and which aren't |
494
|
|
|
$qsFunc = function($line, $line_no) use (&$quoting_styles, &$freq, $eol, $delim) { |
|
|
|
|
495
|
21 |
|
$line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line); |
496
|
|
|
$qnqaFunc = function($field, $colpos) use (&$quoting_styles, &$freq, $delim) { |
|
|
|
|
497
|
21 |
|
$field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field); |
498
|
21 |
|
if ($this->isQuoted($field)) { |
499
|
18 |
|
$field = $this->unQuote($field); |
500
|
18 |
|
$freq->get('quoted')->push($this->lickDataType($field)); |
|
|
|
|
501
|
|
|
// since we know there's at least one quoted field, |
502
|
|
|
// QUOTE_NONE can be ruled out |
503
|
18 |
|
$quoting_styles->set(Flavor::QUOTE_NONE, false); |
504
|
18 |
|
} else { |
505
|
21 |
|
$type = $this->lickDataType($field); |
|
|
|
|
506
|
21 |
|
$freq->get('unquoted')->push($this->lickDataType($field)); |
|
|
|
|
507
|
|
|
// since we know there's at least one unquoted field, |
508
|
|
|
// QUOTE_ALL can be ruled out |
509
|
21 |
|
$quoting_styles->set(Flavor::QUOTE_ALL, false); |
510
|
|
|
} |
511
|
21 |
|
}; |
512
|
21 |
|
collect(explode($delim, $line)) |
513
|
21 |
|
->walk($qnqaFunc->bindTo($this)); |
514
|
|
|
|
515
|
21 |
|
}; |
516
|
21 |
|
$lines->walk($qsFunc->bindTo($this)); |
517
|
|
|
|
518
|
21 |
|
$types = $freq->get('quoted')->unique(); |
|
|
|
|
519
|
|
|
$quoting_styles = $quoting_styles->filter(function($val) { return (bool) $val; }); |
520
|
|
|
// if quoting_styles still has QUOTE_ALL or QUOTE_NONE, then return |
521
|
|
|
// whichever of them it is, we don't need to do anything else |
522
|
21 |
|
if ($quoting_styles->has(Flavor::QUOTE_ALL)) return Flavor::QUOTE_ALL; |
523
|
21 |
|
if ($quoting_styles->has(Flavor::QUOTE_NONE)) return Flavor::QUOTE_NONE; |
524
|
18 |
|
if (count($types) == 1) { |
525
|
18 |
|
$style = $types->getValueAtPosition(0); |
526
|
18 |
|
if ($quoting_styles->has($style)) { |
527
|
|
|
return $style; |
528
|
|
|
} |
529
|
18 |
|
} else { |
530
|
2 |
|
if ($types->contains(self::DATA_NONNUMERIC)) { |
531
|
|
|
// allow for a SMALL amount of error here |
532
|
2 |
|
$counts = collect([self::DATA_SPECIAL => 0, self::DATA_NONNUMERIC => 0]); |
533
|
|
|
$freq->get('quoted')->walk(function ($type) use (&$counts) { |
|
|
|
|
534
|
2 |
|
$counts->increment($type); |
535
|
2 |
|
}); |
536
|
|
|
// @todo is all this even necessary? seems unnecessary to me... |
537
|
2 |
|
if ($most = $counts->max()) { |
538
|
2 |
|
$least = $counts->min(); |
539
|
2 |
|
$err_margin = $least / $most; |
540
|
2 |
|
if ($err_margin < 1) return Flavor::QUOTE_NONNUMERIC; |
541
|
|
|
} |
542
|
|
|
} |
543
|
|
|
} |
544
|
18 |
|
return Flavor::QUOTE_MINIMAL; |
545
|
|
|
} |
546
|
|
|
|
547
|
|
|
/** |
548
|
|
|
* Remove quotes around a piece of text (if there are any) |
549
|
|
|
* |
550
|
|
|
* @param string The data to "unquote" |
551
|
|
|
* @return string The data passed in, only with quotes stripped (off the edges) |
552
|
|
|
*/ |
553
|
21 |
|
protected function unQuote($data) |
554
|
|
|
{ |
555
|
21 |
|
return preg_replace('/^(["\'])(.*)\1$/', '\2', $data); |
556
|
|
|
} |
557
|
|
|
|
558
|
|
|
/** |
559
|
|
|
* Determine whether a particular string of data has quotes around it. |
560
|
|
|
* |
561
|
|
|
* @param string The data to check |
562
|
|
|
* @return boolean Whether the data is quoted or not |
563
|
|
|
*/ |
564
|
21 |
|
protected function isQuoted($data) |
565
|
|
|
{ |
566
|
21 |
|
return preg_match('/^([\'"])[^\1]*\1$/', $data); |
567
|
|
|
} |
568
|
|
|
|
569
|
|
|
/** |
570
|
|
|
* Determine what type of data is contained within a variable |
571
|
|
|
* Possible types: |
572
|
|
|
* - nonnumeric - only numbers |
573
|
|
|
* - special - contains characters that could potentially need to be quoted (possible delimiter characters) |
574
|
|
|
* - unknown - everything else |
575
|
|
|
* This method is really only used within the "lickQuotingStyle" method to |
576
|
|
|
* help determine whether a particular column has been quoted due to it being |
577
|
|
|
* nonnumeric or because it has some special character in it such as a delimiter |
578
|
|
|
* or newline or quote. |
579
|
|
|
* |
580
|
|
|
* @param string The data to determine the type of |
581
|
|
|
* @return string The type of data (one of the "DATA_" constants above) |
582
|
|
|
* @todo I could probably eliminate this method and use an anonymous function |
583
|
|
|
* instead. It isn't used anywhere else and its name could be misleading. |
584
|
|
|
* Especially since I also have a lickType method that is used within the |
585
|
|
|
* lickHeader method. |
586
|
|
|
*/ |
587
|
21 |
|
protected function lickDataType($data) |
588
|
|
|
{ |
589
|
|
|
// @todo make this check for only the quote and delim that are actually being used |
590
|
|
|
// that will make the guess more accurate |
591
|
21 |
|
if (preg_match('/[\'",\t\|:;-]/', $data)) { |
592
|
18 |
|
return self::DATA_SPECIAL; |
593
|
21 |
|
} elseif (preg_match('/[^0-9]/', $data)) { |
594
|
21 |
|
return self::DATA_NONNUMERIC; |
595
|
|
|
} |
596
|
21 |
|
return self::DATA_UNKNOWN; |
597
|
|
|
} |
598
|
|
|
|
599
|
|
|
/** |
600
|
|
|
* Replace all instances of newlines and whatever character you specify (as |
601
|
|
|
* the delimiter) that are contained within quoted text. The replacements are |
602
|
|
|
* simply a special placeholder string. This is done so that I can use the |
603
|
|
|
* very unsmart "explode" function and not have to worry about it exploding |
604
|
|
|
* on delimiters or newlines within quotes. Once I have exploded, I typically |
605
|
|
|
* sub back in the real characters before doing anything else. Although |
606
|
|
|
* currently there is no dedicated method for doing so I just use str_replace |
607
|
|
|
* |
608
|
|
|
* @param string The string to do the replacements on |
609
|
|
|
* @param string The delimiter character to replace |
610
|
|
|
* @return string The data with replacements performed |
611
|
|
|
* @todo I could probably pass in (maybe optionally) the newline character I |
612
|
|
|
* want to replace as well. I'll do that if I need to. |
613
|
|
|
*/ |
614
|
21 |
|
protected function replaceQuotedSpecialChars($data, $delim) |
615
|
|
|
{ |
616
|
|
|
return preg_replace_callback('/([\'"])(.*)\1/imsU', function($matches) use ($delim) { |
617
|
18 |
|
$ret = preg_replace("/([\r\n])/", self::PLACEHOLDER_NEWLINE, $matches[0]); |
618
|
18 |
|
$ret = str_replace($delim, self::PLACEHOLDER_DELIM, $ret); |
619
|
18 |
|
return $ret; |
620
|
21 |
|
}, $data); |
621
|
|
|
} |
622
|
|
|
|
623
|
|
|
/** |
624
|
|
|
* Determine the "type" of a particular string of data. Used for the lickHeader |
625
|
|
|
* method to assign a type to each column to try to determine whether the |
626
|
|
|
* first for is different than a consistent column type. |
627
|
|
|
* |
628
|
|
|
* @todo As I'm writing this method I'm beginning ot realize how expensive |
629
|
|
|
* the lickHeader method is going to end up being since it has to apply all |
630
|
|
|
* these regexes (potentially) to every column. I may end up writing a much |
631
|
|
|
* simpler type-checking method than this if it proves to be too expensive |
632
|
|
|
* to be practical. |
633
|
|
|
* |
634
|
|
|
* @param string The string of data to check the type of |
635
|
|
|
* @return string One of the TYPE_ string constants above |
636
|
|
|
*/ |
637
|
21 |
|
protected function lickType($data) |
638
|
|
|
{ |
639
|
21 |
|
if (preg_match('/^[+-]?[\d\.]+$/', $data)) { |
640
|
18 |
|
return self::TYPE_NUMBER; |
641
|
21 |
|
} elseif (preg_match('/^[+-]?[\d]+\.[\d]+$/', $data)) { |
642
|
|
|
return self::TYPE_DOUBLE; |
643
|
21 |
|
} elseif (preg_match('/^[+-]?[¥£€$]\d+(\.\d+)$/', $data)) { |
644
|
|
|
return self::TYPE_CURRENCY; |
645
|
21 |
|
} elseif (preg_match('/^[a-zA-Z]+$/', $data)) { |
646
|
21 |
|
return self::TYPE_ALPHA; |
647
|
|
|
} else { |
648
|
|
|
try { |
649
|
21 |
|
$year = '([01][0-9])?[0-9]{2}'; |
650
|
21 |
|
$month = '([01]?[0-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'; |
651
|
21 |
|
$day = '[0-3]?[0-9]'; |
652
|
21 |
|
$sep = '[\/\.\-]?'; |
653
|
21 |
|
$time = '([0-2]?[0-9](:[0-5][0-9]){1,2}(am|pm)?|[01]?[0-9](am|pm))'; |
654
|
21 |
|
$date = '(' . $month . $sep . $day . $sep . $year . '|' . $day . $sep . $month . $sep . $year . '|' . $year . $sep . $month . $sep . $day . ')'; |
655
|
21 |
|
$dt = new DateTime($data); |
656
|
21 |
|
$dt->setTime(0,0,0); |
657
|
21 |
|
$now = new DateTime(); |
658
|
21 |
|
$now->setTime(0,0,0); |
659
|
21 |
|
$diff = $dt->diff($now); |
660
|
21 |
|
$diffDays = (integer) $diff->format( "%R%a" ); |
661
|
21 |
|
if ($diffDays === 0) { |
662
|
|
|
// then this is most likely a time string... |
663
|
6 |
|
if (preg_match("/^{$time}$/i", $data)) { |
664
|
|
|
return self::TYPE_TIME; |
665
|
|
|
} |
666
|
6 |
|
} |
667
|
21 |
|
if (preg_match("/^{$date}$/i", $data)) { |
668
|
18 |
|
return self::TYPE_DATE; |
669
|
6 |
|
} elseif(preg_match("/^{$date} {$time}$/i")) { |
670
|
|
|
return self::TYPE_DATETIME; |
671
|
|
|
} |
672
|
21 |
|
} catch (\Exception $e) { |
673
|
|
|
// now go on checking remaining types |
674
|
21 |
|
if (preg_match('/^\w+$/', $data)) { |
675
|
3 |
|
return self::TYPE_ALNUM; |
676
|
|
|
} |
677
|
|
|
} |
678
|
|
|
} |
679
|
21 |
|
return self::TYPE_STRING; |
680
|
|
|
} |
681
|
|
|
|
682
|
|
|
/** |
683
|
|
|
* Examines the contents of the CSV data to make a determination of whether |
684
|
|
|
* or not it contains a header row. To make this determination, it creates |
685
|
|
|
* an array of each column's (in each row)'s data type and length and then |
686
|
|
|
* compares them. If all of the rows except the header look similar, it will |
687
|
|
|
* return true. This is only a guess though. There is no programmatic way to |
688
|
|
|
* determine 100% whether a CSV file has a header. The format does not |
689
|
|
|
* provide metadata such as that. |
690
|
|
|
* |
691
|
|
|
* @param string $delim The CSV data's delimiting char (can be a variety of chars but) |
692
|
|
|
* typically is either a comma or a tab, sometimes a pipe) |
693
|
|
|
* @param string $eol The CSV data's end-of-line char(s) (\n \r or \r\n) |
694
|
|
|
* @return boolean True if the data (most likely) contains a header row |
695
|
|
|
* @todo This method needs a total refactor. It's not necessary to loop twice |
696
|
|
|
* You could get away with one loop and that would allow for me to do |
697
|
|
|
* something like only examining enough rows to get to a particular |
698
|
|
|
* "hasHeader" score (+-100 for instance) & then just return true|false |
699
|
|
|
* @todo Also, break out of the first loop after a certain (perhaps even a |
700
|
|
|
* configurable) amount of lines (you only need to examine so much data ) |
701
|
|
|
* to reliably make a determination and this is an expensive method) |
702
|
|
|
* @todo I could remove the need for quote, delim, and eol by "licking" the |
703
|
|
|
* data sample provided in the first argument. Also, I could actually |
704
|
|
|
* create a Reader object to read the data here. |
705
|
|
|
*/ |
706
|
22 |
|
public function lickHeader($delim, $eol) |
707
|
|
|
{ |
708
|
22 |
|
$types = collect(); |
709
|
|
|
$buildTypes = function($line, $line_no) use (&$types, $delim, $eol) { |
710
|
22 |
|
$line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line); |
711
|
|
|
$getType = function($field, $colpos) use (&$types, $line, $line_no, $delim) { |
712
|
22 |
|
$field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field); |
713
|
|
|
// @todo Need a Collection::setTableField($x, $y) method |
714
|
|
|
// See notes in green binder about refactoring Collection |
715
|
22 |
|
if (!$types->has($line_no)) $types->set($line_no, collect()); |
716
|
22 |
|
$types->get($line_no)->set($colpos, [ |
717
|
22 |
|
'type' => $this->lickType($this->unQuote($field)), |
718
|
22 |
|
'length' => strlen($field) |
719
|
22 |
|
]); |
720
|
22 |
|
}; |
721
|
22 |
|
$fields = collect(explode($delim, $line))->walk($getType->bindTo($this)); |
|
|
|
|
722
|
22 |
|
}; |
723
|
22 |
|
$lines = collect(explode( |
|
|
|
|
724
|
22 |
|
$eol, |
725
|
22 |
|
$this->replaceQuotedSpecialChars($this->sample, $delim) |
726
|
22 |
|
)) |
727
|
22 |
|
->walk($buildTypes->bindTo($this)); |
728
|
|
|
|
729
|
22 |
|
$hasHeader = 0; |
730
|
22 |
|
$possibleHeader = $types->shift(); |
731
|
|
|
$types->walk(function($row, $line_no) use (&$hasHeader, $possibleHeader) { |
|
|
|
|
732
|
22 |
|
$row->walk(function($field_info, $col_no) use (&$hasHeader, $possibleHeader) { |
733
|
22 |
|
extract($field_info); |
734
|
|
|
try { |
735
|
22 |
|
extract($possibleHeader->get($col_no, null, true), EXTR_PREFIX_ALL, "header"); |
|
|
|
|
736
|
22 |
|
if ($header_type == self::TYPE_STRING) { |
737
|
|
|
// use length |
738
|
19 |
|
if ($length != $header_length) $hasHeader++; |
739
|
|
|
else $hasHeader--; |
740
|
19 |
|
} else { |
741
|
|
|
// use data type |
742
|
22 |
|
if ($type != $header_type) $hasHeader++; |
743
|
|
|
else $hasHeader--; |
744
|
|
|
} |
745
|
22 |
|
} catch (OutOfBoundsException $e) { |
746
|
|
|
// failure... |
747
|
2 |
|
return; |
748
|
|
|
} |
749
|
22 |
|
}); |
750
|
22 |
|
}); |
751
|
22 |
|
return $hasHeader > 0; |
752
|
|
|
} |
753
|
|
|
} |
754
|
|
|
|
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.