1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/* |
4
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
5
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
6
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
7
|
|
|
* suck out of working with CSV. |
8
|
|
|
* |
9
|
|
|
* @version {version} |
10
|
|
|
* @copyright Copyright (c) 2016 Luke Visinoni <[email protected]> |
11
|
|
|
* @author Luke Visinoni <[email protected]> |
12
|
|
|
* @license https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT) |
13
|
|
|
*/ |
14
|
|
|
namespace CSVelte; |
15
|
|
|
|
16
|
|
|
use CSVelte\Collection\AbstractCollection; |
17
|
|
|
use CSVelte\Collection\CharCollection; |
18
|
|
|
use CSVelte\Collection\Collection; |
19
|
|
|
use CSVelte\Collection\NumericCollection; |
20
|
|
|
use CSVelte\Collection\TabularCollection; |
21
|
|
|
use CSVelte\Contract\Streamable; |
22
|
|
|
use CSVelte\Exception\TasterException; |
23
|
|
|
|
24
|
|
|
use DateTime; |
25
|
|
|
use Exception; |
26
|
|
|
use OutOfBoundsException; |
27
|
|
|
|
28
|
|
|
use function CSVelte\collect; |
29
|
|
|
|
30
|
|
|
/** |
31
|
|
|
* CSVelte\Taster |
32
|
|
|
* Given CSV data, Taster will "taste" the data and provide its best guess at |
33
|
|
|
* its "flavor". In other words, this class inspects CSV data and attempts to |
34
|
|
|
* auto-detect various CSV attributes such as line endings, quote characters, etc. |
35
|
|
|
* |
36
|
|
|
* @package CSVelte |
37
|
|
|
* |
38
|
|
|
* @copyright (c) 2016, Luke Visinoni <[email protected]> |
39
|
|
|
* @author Luke Visinoni <[email protected]> |
40
|
|
|
* |
41
|
|
|
* @todo There are a ton of improvements that could be made to this class. |
42
|
|
|
* I'll do a refactor on this fella once I get at least one test |
43
|
|
|
* passing for each of its public methods. |
44
|
|
|
* @todo Should I have a lickEscapeChar method? The python version doesn't |
45
|
|
|
* have one. But then why does it even bother including one in its |
46
|
|
|
* flavor class? |
47
|
|
|
* @todo Examine each of the public methods in this class and determine |
48
|
|
|
* whether it makes sense to ask for the data as a param rather than |
49
|
|
|
* just pulling it from source. I don't think it makes sense... it |
50
|
|
|
* was just easier to write the methods that way during testing. |
51
|
|
|
* @todo There are at least portions of this class that could use the |
52
|
|
|
* Reader class rather than working directly with data. |
53
|
|
|
* @todo Refactor all of the anonymous functions used as callbacks. Rather |
54
|
|
|
* than passing $this all over, use $closure->bindTo() instead... |
55
|
|
|
* Actually, write a method called getBoundClosure() or something... |
56
|
|
|
* maybe even make it a trait I don't know yet. But here it would |
57
|
|
|
* allow me to bind any anon function to $this and give me a certain |
58
|
|
|
* set of commonly needed values ($delim, $eol, etc.) |
59
|
|
|
*/ |
60
|
|
|
class Taster |
61
|
|
|
{ |
62
|
|
|
/**
 * End-of-line identifiers. lickLineEndings() uses them as the keys of its
 * table of candidate line-ending sequences.
 */
const EOL_UNIX    = 'lf';
const EOL_TRS80   = 'cr';
const EOL_WINDOWS = 'crlf';

/**
 * ASCII character codes for "invisibles".
 */
const HORIZONTAL_TAB  = 9;
const LINE_FEED       = 10;
const CARRIAGE_RETURN = 13;
const SPACE           = 32;

/**
 * Data types -- used within the lickQuotingStyle method to classify the
 * content of quoted and unquoted fields.
 */
const DATA_NONNUMERIC = 'nonnumeric';
const DATA_SPECIAL    = 'special';
const DATA_UNKNOWN    = 'unknown';

/**
 * Placeholder strings -- hold the place of newlines and delimiters contained
 * within quoted text so that explode() doesn't split incorrectly. The lick*
 * methods swap them back for the real characters after splitting.
 */
const PLACEHOLDER_NEWLINE = '[__NEWLINE__]';
const PLACEHOLDER_DELIM   = '[__DELIM__]';

/**
 * Recommended data sample size (the amount requested from the input source
 * by the constructor).
 */
const SAMPLE_SIZE = 2500;

/**
 * Column data types -- used within the lickHeader method to determine
 * whether the first row contains different types of data than the rest of
 * the rows (and thus, is likely a header row).
 */
// +-987
const TYPE_NUMBER   = 'number';
// +-12.387
const TYPE_DOUBLE   = 'double';
// I am a string. I can contain all kinds of stuff.
const TYPE_STRING   = 'string';
// 2010-04-23 04:23:00
const TYPE_DATETIME = 'datetime';
// 10-Jul-15, 9/1/2007, April 1st, 2006, etc.
const TYPE_DATE     = 'date';
// 10:00pm, 5pm, 13:08, etc.
const TYPE_TIME     = 'time';
// $98.96, ¥12389, £6.08, €87.00
const TYPE_CURRENCY = 'currency';
// 12ab44m1n2_asdf
const TYPE_ALNUM    = 'alnum';
// abababab
const TYPE_ALPHA    = 'alpha';

/** @var Contract\Streamable The source of data to examine */
protected $input;

/** @var string Sample of CSV data to use for tasting (determining CSV flavor) */
protected $sample;

/** @var CharCollection Possible delimiter characters in (roughly) the order of likelihood */
protected $delims;
128
|
|
|
|
129
|
|
|
/**
 * Class constructor -- accepts a CSV input source, remembers it, and reads
 * the data sample that the lick* methods examine.
 *
 * @param Contract\Streamable $input The source of CSV data
 *
 * @throws TasterException If reading the sample from the input yields a
 *                         falsy value (code ERR_INVALID_SAMPLE)
 *
 * @todo It may be a good idea to skip the first line or two for the sample
 *     so that the header line(s) don't throw things off (with the exception
 *     of lickHeader() obviously)
 */
public function __construct(Streamable $input)
{
    // candidate delimiters, in (roughly) the order of likelihood
    $candidates = [',', "\t", ';', '|', ':', '-', '_', '#', '/', '\\', '$', '+', '=', '&', '@'];

    $this->delims = collect($candidates);
    $this->input = $input;
    $this->sample = $input->read(self::SAMPLE_SIZE);

    if ($this->sample) {
        return;
    }

    throw new TasterException('Invalid input, cannot read sample.', TasterException::ERR_INVALID_SAMPLE);
}
148
|
|
|
|
149
|
|
|
/**
 * "Invoke" magic method.
 *
 * Called when an object is invoked as if it were a function, e.g.
 * `$flavor = $taster();`. This is simply an alias to the lick method.
 *
 * @throws TasterException
 *
 * @return Flavor A flavor object
 */
public function __invoke()
{
    $flavor = $this->lick();

    return $flavor;
}
163
|
|
|
|
164
|
|
|
/**
 * Examine the input source and determine what "Flavor" of CSV it contains.
 * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),
 * doesn't necessarily always conform to it. And it doesn't provide metadata
 * such as the delimiting character, quote character, or what types of data
 * are quoted.
 *
 * The line terminator is detected first. Quote character and delimiter are
 * then detected together; when that fails with ERR_QUOTE_AND_DELIM the quote
 * character defaults to a double quote and the delimiter is detected on its
 * own. Any other TasterException is re-thrown.
 *
 * @throws TasterException
 *
 * @return Flavor The metadata that the CSV format doesn't provide
 *
 * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails
 * @todo Should there be a lickEscapeChar method? The python module that
 *     inspired this library doesn't include one...
 * @todo This should cache the results and only regenerate if $this->sample
 *     changes (or $this->input)
 */
public function lick()
{
    $eol = $this->lickLineEndings();

    try {
        $pair = $this->lickQuoteAndDelim();
        $quote = $pair[0];
        $delim = $pair[1];
    } catch (TasterException $e) {
        if ($e->getCode() === TasterException::ERR_QUOTE_AND_DELIM) {
            // no quoted columns to learn from: assume the common quote
            // character and work the delimiter out by frequency instead
            $quote = '"';
            $delim = $this->lickDelimiter($eol);
        } else {
            throw $e;
        }
    }

    // evaluated in the same order as before: quoting style, then header
    $style = $this->lickQuotingStyle($delim, $eol);
    $hasHeader = $this->lickHeader($delim, $eol);

    return new Flavor([
        'quoteChar'      => $quote,
        // @todo Should this be null? Because doubleQuote = true means this = null
        'escapeChar'     => '\\',
        'delimiter'      => $delim,
        'lineTerminator' => $eol,
        'quoteStyle'     => $style,
        'header'         => $hasHeader,
    ]);
}
202
|
|
|
|
203
|
|
|
/**
 * Guess whether the CSV sample starts with a header row.
 *
 * The type and length of every field in the leading lines of the sample are
 * recorded. The first recorded row is then treated as a possible header and
 * every field of the remaining rows "votes": +1 when it differs from the
 * possible header's field in the same column, -1 when it looks the same.
 * String fields are compared by length, all other fields by detected type.
 * A positive vote total means "has a header". This is only a guess; the CSV
 * format carries no metadata that could settle it.
 *
 * @param string $delim The CSV data's delimiting char (can be a variety of
 *                      chars but typically is either a comma or a tab,
 *                      sometimes a pipe)
 * @param string $eol   The CSV data's end-of-line char(s) (\n \r or \r\n)
 *
 * @return bool True if the data (most likely) contains a header row
 *
 * @todo This method needs a total refactor. It's not necessary to loop twice.
 *     One loop would allow examining only enough rows to reach a particular
 *     "hasHeader" score (+-100 for instance) and then return true|false
 * @todo Make the number of examined lines configurable (you only need to
 *     examine so much data to reliably make a determination and this is an
 *     expensive method)
 * @todo The need for delim and eol could be removed by "licking" the data
 *     sample. A Reader object could also be used to read the data here.
 */
public function lickHeader($delim, $eol)
{
    // filled with value/type/length metadata, keyed by line and then column
    $meta = new TabularCollection();

    $rawLines = explode($eol, $this->replaceQuotedSpecialChars($this->sample, $delim));

    collect($rawLines)->walk(function ($rawLine, $lineNo) use ($meta, $delim, $eol) {
        // lines whose index is greater than 2 are not examined
        if ($lineNo > 2) {
            return;
        }

        // put real newlines back in place of their placeholders
        $restored = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $rawLine);

        collect(explode($delim, $restored))->walk(function ($cell, $colNo) use ($meta, $lineNo, $delim) {
            // ...and real delimiters back in place of theirs
            $cell = str_replace(self::PLACEHOLDER_DELIM, $delim, $cell);
            $cellMeta = [
                'value'  => $cell,
                'type'   => $this->lickType($this->unQuote($cell)),
                'length' => strlen($cell),
            ];

            // @todo TabularCollection should have a way to set a value using [row,column]
            try {
                $current = $meta->get($lineNo);
            } catch (OutOfBoundsException $e) {
                $current = [];
            }
            $current[$colNo] = $cellMeta;
            $meta->set($lineNo, $current);
        });
    });

    $votes = new NumericCollection();
    $firstRow = collect($meta->shift());

    $meta->walk(function (AbstractCollection $row) use ($votes, $firstRow) {
        $row->walk(function (AbstractCollection $cellMeta, $colNo) use ($votes, $firstRow) {
            try {
                $headerCell = collect($firstRow->get($colNo, null, true));
                // strings are compared by length, everything else by type
                $aspect = $cellMeta->get('type') == self::TYPE_STRING ? 'length' : 'type';
                $differs = $cellMeta->get($aspect) != $headerCell->get($aspect);
                $votes->push($differs ? 1 : -1);
            } catch (OutOfBoundsException $e) {
                // nothing in the possible header to compare with: no vote
                return;
            }
        });
    });

    return $votes->sum() > 0;
}
295
|
|
|
|
296
|
|
|
/**
 * Strip every quoted string (single- or double-quoted, delimiters and
 * newlines included) out of the given data, leaving nothing in its place.
 * Backslash-escaped characters inside the quotes are skipped over so an
 * escaped quote does not end the string early.
 *
 * This predates replaceQuotedSpecialChars() and exists to keep explode()
 * from splitting at delimiters and newlines that sit inside quotes.
 *
 * @param string $data The string to remove quoted strings from
 *
 * @return string The input string with quoted strings removed
 *
 * @todo Replace code that uses this method with the replaceQuotedSpecialChars
 *     method instead. I think it's cleaner.
 */
protected function removeQuotedStrings($data)
{
    $quotedString = '/(["\'])(?:(?=(\\\\?))\2.)*?\1/sm';
    $stripped = preg_replace($quotedString, '', $data);

    return $stripped;
}
313
|
|
|
|
314
|
|
|
/**
 * Examine the sample to determine which character(s) are being used as the
 * end-of-line sequence.
 *
 * Quoted strings are removed first so that newlines inside quotes are not
 * counted. Each candidate sequence is then counted and the one with the
 * strictly highest count wins; on a tie the earlier candidate (CRLF, then
 * LF, then CR) is kept. PHP_EOL is returned when none of them occurs.
 *
 * @return string The end-of-line char(s) for the input data
 * @credit pulled from stackoverflow thread *tips hat to username "Harm"*
 *
 * @todo This should throw a relevant TasterException if it cannot determine
 *     the line ending (e.g. the sample contains no line breaks at all)
 * @todo I probably will make this method protected when I'm done with testing...
 * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings()
 */
protected function lickLineEndings()
{
    $haystack = $this->removeQuotedStrings($this->sample);

    // checked in this order so that CRLF wins a tie against its own halves
    $candidates = [
        "\r\n", // 0x0D - 0x0A - Windows, DOS OS/2
        "\n",   // 0x0A -      - Unix, OSX
        "\r",   // 0x0D -      - Apple ][, TRS80
    ];

    // @todo This should return a default maybe?
    $winner = PHP_EOL;
    $winnerHits = 0;
    foreach ($candidates as $candidate) {
        $hits = substr_count($haystack, $candidate);
        if ($hits <= $winnerHits) {
            continue;
        }
        $winnerHits = $hits;
        $winner = $candidate;
    }

    return $winner;
}
349
|
|
|
|
350
|
|
|
/**
 * Determine the quote and delimiter characters together by looking for
 * quoted columns. When columns are quoted you can often seek out a pattern
 * of delim, quote, stuff, quote, delim -- but this only works if the data
 * has quoted columns. If it doesn't, these characters have to be determined
 * some other way (see lickDelimiter).
 *
 * Four patterns are tried in order and the first one that matches is used:
 *   1. delim, quote, stuff, quote, delim      e.g. ,"something",
 *   2. line start, quote, stuff, quote, delim e.g. "something",
 *   3. delim, quote, stuff, quote, line end   e.g. ,"something"
 *   4. line start, quote, stuff, quote, line end (quote only, no delimiter)
 *
 * The result is only returned when both a quote character and a delimiter
 * were captured, so a match of pattern 4 alone still ends in the exception.
 *
 * @throws TasterException With code ERR_QUOTE_AND_DELIM when the characters
 *                         cannot be determined this way
 *
 * @return array A two-element array containing quotechar, delimchar
 *
 * @todo make protected
 * @todo This should check for any line endings not just \n
 */
protected function lickQuoteAndDelim()
{
    // delim can be anything but line breaks, quotes, alphanumeric, underscore, or a space
    $antidelims = implode('', [
        "\r",
        "\n",
        '\w',
        preg_quote('"', '/'),
        preg_quote("'", '/'),
        preg_quote(chr(self::SPACE), '/'),
    ]);
    $delim = '(?P<delim>[^' . $antidelims . '])';
    // @todo I think MS Excel uses some strange encoding for fancy open/close quotes
    $quote = '(?P<quoteChar>"|\'|`)';

    // NOTE: named groups are also numbered in order of appearance, which is
    // what the \1 and \2 back-references below rely on.
    $patterns = [
        // ,"something", -- delim is group 1, quote is group 2
        '/' . $delim . ' ?' . $quote . '.*?\2\1/ms',
        // 'something', -- quote is group 1, delim is group 2
        '/(?:^|\n)' . $quote . '.*?\1' . $delim . ' ?/ms',
        // ,'something' -- followed by end of line. This previously ended in
        // (?:^|\n); "^" can never match directly after the closing quote, so
        // a quoted last column at the very end of the sample was missed.
        '/' . $delim . ' ?' . $quote . '.*?\2(?:$|\n)/ms',
        // 'something' -- a whole line that is one quoted column. The quote
        // is the only group here, so it must be referenced as \1; the former
        // \2 made the pattern fail to compile on every call.
        '/(?:^|\n)' . $quote . '.*?\1(?:$|\n)/ms',
    ];

    $matches = null;
    foreach ($patterns as $pattern) {
        // all patterns compile now, so the error suppression operator that
        // used to hide the compile warning is no longer needed
        if (preg_match_all($pattern, $this->sample, $matches) && $matches) {
            break;
        }
    }

    if ($matches && !empty($matches['quoteChar']) && !empty($matches['delim'])) {
        try {
            // the key at position 0 after frequency()->sort()->reverse() is
            // taken as the winner (presumably the most frequent capture)
            return [
                collect($matches['quoteChar'])->frequency()->sort()->reverse()->getKeyAtPosition(0),
                collect($matches['delim'])->frequency()->sort()->reverse()->getKeyAtPosition(0),
            ];
        } catch (OutOfBoundsException $e) {
            // eat this exception and let the taster exception below be thrown instead...
        }
    }

    throw new TasterException('quoteChar and delimiter cannot be determined', TasterException::ERR_QUOTE_AND_DELIM);
}
405
|
|
|
|
406
|
|
|
/**
 * Take a list of likely delimiter characters and find the one that occurs
 * the most consistent amount of times within the sample data.
 *
 * Quoted strings are removed from the sample first. For every line the
 * occurrences of each candidate character are counted. Each character's
 * per-line counts are reduced to their mode (the "expected" count per line)
 * and the character scores one point for every line that hits its expected
 * count. The top scorer wins. When the top score is shared, the tie is
 * handed to guessDelimByDistribution(), and if that fails too, the first
 * tied character found in the preferred-delimiter list is returned.
 *
 * @param string $eol The character(s) used for newlines
 *
 * @return string The probable delimiter character
 *
 * @todo Refactor this method--It needs more thorough testing against a wider
 *     variety of CSV data to be sure it works reliably. And I'm sure there
 *     are many performance and logic improvements that could be made. This
 *     is essentially a first draft.
 * @todo Can't use replaceQuotedSpecialChars rather than removeQuotedStrings
 *     because the former requires you to know the delimiter
 */
protected function lickDelimiter($eol = "\n")
{
    // char => [line number => occurrences on that line]
    $frequencies = collect();
    // char => number of lines on which it occurred the expected number of times
    $consistencies = new NumericCollection();

    // the counts recorded so far for a character, or an empty array when
    // the character has not been recorded yet
    $countsFor = function ($char) use ($frequencies) {
        try {
            return $frequencies->get($char, null, true);
        } catch (OutOfBoundsException $e) {
            return [];
        }
    };

    $isCandidate = function ($c) {
        return collect($this->delims)->contains($c);
    };

    $lines = collect(explode($eol, $this->removeQuotedStrings($this->sample)));

    // build a table of characters and their frequencies for each line. This
    // table is then used to build a table of frequencies of each frequency
    // (in 10 lines, "tab" occurred 5 times on 7 of those lines, 6 times on
    // 2 lines, and 7 times on 1 line)
    $tallied = $lines->walk(function ($line, $line_no) use ($frequencies, $countsFor, $isCandidate) {
        $perLine = collect(str_split($line))
            ->filter($isCandidate)
            ->frequency()
            ->sort()
            ->reverse();

        $perLine->walk(function ($count, $char) use ($frequencies, $countsFor, $line_no) {
            $char_counts = $countsFor($char);
            $char_counts[$line_no] = $count;
            $frequencies->set($char, $char_counts);
        });
    });

    // the above only finds frequencies for characters if they exist in a
    // given line. This goes back and fills in zeroes where a char didn't
    // occur at all in a given line (needed to determine mode)
    $tallied->walk(function ($line, $line_no) use ($frequencies, $countsFor) {
        $frequencies->walk(function ($counts, $char) use ($frequencies, $countsFor, $line_no) {
            $char_counts = $countsFor($char);
            if (!array_key_exists($line_no, $char_counts)) {
                $char_counts[$line_no] = 0;
            }
            $frequencies->set($char, $char_counts);
        });
    });

    // now determine the mode for each char to decide the "expected" amount
    // of times a char (possible delim) will occur on each line...
    $modes = new NumericCollection([]);
    foreach ($frequencies as $candidate => $perLineCounts) {
        $modes->set($candidate, (new NumericCollection($perLineCounts))->mode());
    }

    // ...and score one point per line that met the expectation
    $frequencies->walk(function ($perLineCounts, $chr) use ($modes, $consistencies) {
        collect($perLineCounts)->walk(function ($num) use ($modes, $chr, $consistencies) {
            $expected = $modes->get($chr);
            if (!$expected || $num != $expected) {
                return;
            }
            // met the goal, yay!
            $score = $consistencies->get($chr, 0);
            $consistencies->set($chr, ++$score);
        });
    });

    $max = $consistencies->max();
    $dups = $consistencies->duplicates();
    if ($dups->has($max)) {
        // More than one candidate shares the best score. A delimiter tends
        // to be spread out over the line, whereas a false positive (e.g.
        // the dashes of a date such as 04-23-1986) clusters in one area, so
        // the tie is broken by looking at how each character is distributed.
        // NOTE(review): $dups->get($max) is presumably the list of
        // characters sharing the top score -- confirm against duplicates().
        $tied = $dups->get($max);
        try {
            return $this->guessDelimByDistribution($tied, $eol);
        } catch (TasterException $e) {
            // if somehow we STILL can't come to a consensus, then fall
            // back to the "preferred delimiters" list...
            foreach ($this->delims as $preferred) {
                if (collect($tied)->contains($preferred)) {
                    return $preferred;
                }
            }
        }
    }

    $ranked = $consistencies->sort()->reverse();

    return $ranked->getKeyAtPosition(0);
}
529
|
|
|
|
530
|
|
|
/**
 * Compare positional consistency of several characters to determine the
 * probable delimiter character. The idea behind this is that the delimiter
 * character is likely more consistently distributed than false-positive
 * delimiter characters produced by lickDelimiter(). For instance, consider
 * a series of rows similar to the following:
 *
 * 1,luke,visinoni,luke.visinoni@gmail.com,(530) 413-3076,04-23-1986
 *
 * The lickDelimiter() method will often not be able to determine whether the
 * delimiter is a comma or a dash because they occur the same number of times
 * on just about every line (5 for comma, 3 for dash). The difference is that
 * the comma is distributed almost evenly throughout the line, while the dash
 * characters occur entirely at the end of the line.
 *
 * Each non-empty line is cut into chunks of one tenth of its length and the
 * share of chunks containing the character is recorded for that line. The
 * per-line shares are averaged for every candidate, and the candidate at
 * position 0 after sort()->reverse() of those averages is returned.
 *
 * If the delim character cannot be determined by lickQuoteAndDelim(), taster
 * tries lickDelimiter(). When that method runs into a tie, it will use this
 * as a tie-breaker.
 *
 * @param array  $delims Possible delimiter characters (method chooses from
 *                       this array of characters)
 * @param string $eol    The end-of-line character (or set of characters)
 *
 * @throws TasterException With code ERR_DELIMITER when any exception occurs
 *                         while working out the distribution
 *
 * @return string The probable delimiter character
 */
protected function guessDelimByDistribution(array $delims, $eol = "\n")
{
    try {
        // @todo Write a method that does this...
        $lines = collect(explode($eol, $this->removeQuotedStrings($this->sample)));

        $averages = collect($delims)->map(function ($delim) use ($lines) {
            $linedist = collect();
            $lines->walk(function ($line, $line_no) use ($linedist, $delim) {
                $length = strlen($line);
                if (!$length) {
                    return;
                }
                $sectstot = 10;
                // Lines shorter than $sectstot characters would yield a chunk
                // length of 0, which str_split() rejects (warning and false
                // before PHP 8, an uncaught ValueError from PHP 8 on). The
                // last line of a truncated sample is often that short, so
                // never go below one character per chunk.
                $sectlen = max(1, (int) ($length / $sectstot));
                $hits = collect(str_split($line, $sectlen))
                    ->map(function ($section) use ($delim) {
                        return substr_count($section, $delim);
                    })
                    ->filter(function ($count) {
                        return (bool) $count;
                    })
                    ->count();
                $linedist->set($line_no, $hits / $sectstot);
            });

            return $linedist;
        })->map(function ($dists) {
            return $dists->average();
        });

        return $delims[$averages->sort()->reverse()->getKeyAtPosition(0)];
    } catch (Exception $e) {
        throw new TasterException('delimiter cannot be determined by distribution', TasterException::ERR_DELIMITER);
    }
}
596
|
|
|
|
597
|
|
|
/**
 * Determine the "style" of data quoting. The CSV format, while having an RFC
 * (https://tools.ietf.org/html/rfc4180), doesn't necessarily always conform
 * to it. And it doesn't provide metadata such as the delimiting character,
 * quote character, or what types of data are quoted. So this method makes a
 * logical guess by finding which columns have been quoted (if any) and
 * examining their data type. Most often, CSV files will only use quotes
 * around columns that contain special characters such as the delimiter,
 * the quoting character, newlines, etc. (we refer to this style as
 * QUOTE_MINIMAL), but some quote all columns that contain nonnumeric data
 * (QUOTE_NONNUMERIC). Then there are CSV files that quote all columns
 * (QUOTE_ALL) and those that quote none (QUOTE_NONE).
 *
 * @param string $delim The character used as the column delimiter
 * @param string $eol   The character used for newlines
 *
 * @return string One of the four Flavor::QUOTE_* constants--see this
 *                method's description for more info.
 *
 * @todo Refactor this method--It needs more thorough testing against a wider
 *     variety of CSV data to be sure it works reliably. And I'm sure there
 *     are many performance and logic improvements that could be made. This
 *     is essentially a first draft.
 */
protected function lickQuotingStyle($delim, $eol)
{
    // every style starts out possible and gets ruled out along the way
    $styles = collect([
        Flavor::QUOTE_ALL        => true,
        Flavor::QUOTE_NONE       => true,
        Flavor::QUOTE_MINIMAL    => true,
        Flavor::QUOTE_NONNUMERIC => true,
    ]);

    $lines = collect(explode($eol, $this->replaceQuotedSpecialChars($this->sample, $delim)));
    // data types seen in quoted fields and in unquoted fields
    $freq = collect()
        ->set('quoted', collect())
        ->set('unquoted', collect());

    // walk through each line from the data sample to determine which fields
    // are quoted and which aren't
    $lines->walk(function ($line) use ($styles, $freq, $eol, $delim) {
        $line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line);

        collect(explode($delim, $line))->walk(function ($field) use ($styles, $freq, $delim) {
            $field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field);

            // an unquoted field rules out QUOTE_ALL...
            $bucket = 'unquoted';
            $ruledOut = Flavor::QUOTE_ALL;
            if ($this->isQuoted($field)) {
                // ...and a quoted one rules out QUOTE_NONE
                $field = $this->unQuote($field);
                $bucket = 'quoted';
                $ruledOut = Flavor::QUOTE_NONE;
            }

            $freq->get($bucket)->push($this->lickDataType($field));
            $styles->set($ruledOut, false);
        });
    });

    $types = $freq->get('quoted')->unique();
    $styles = $styles->filter(function ($stillPossible) {
        return (bool) $stillPossible;
    });

    // if QUOTE_ALL or QUOTE_NONE survived, return whichever of them it is,
    // we don't need to do anything else
    if ($styles->has(Flavor::QUOTE_ALL)) {
        return Flavor::QUOTE_ALL;
    }
    if ($styles->has(Flavor::QUOTE_NONE)) {
        return Flavor::QUOTE_NONE;
    }

    if (count($types) == 1) {
        $style = $types->getValueAtPosition(0);
        if ($styles->has($style)) {
            return $style;
        }
    } elseif ($types->contains(self::DATA_NONNUMERIC)) {
        // allow for a SMALL amount of error here
        $counts = collect([self::DATA_SPECIAL => 0, self::DATA_NONNUMERIC => 0]);
        $freq->get('quoted')->walk(function ($type) use ($counts) {
            $counts->increment($type);
        });
        // @todo is all this even necessary? seems unnecessary to me...
        $most = $counts->max();
        if ($most && ($counts->min() / $most) < 1) {
            return Flavor::QUOTE_NONNUMERIC;
        }
    }

    return Flavor::QUOTE_MINIMAL;
}
696
|
|
|
|
697
|
|
|
/**
 * Strip one pair of matching quotes from around a piece of text.
 *
 * Only text that starts and ends with the same quote character (single or
 * double) is changed; anything else is handed back untouched. Note that the
 * pattern's "." does not match a line feed, so text with a line feed between
 * the quotes is also returned unchanged.
 *
 * @param string $data The data to "unquote"
 *
 * @return string The data passed in, only with quotes stripped (off the edges)
 */
protected function unQuote($data)
{
    // group 1 is the quote character, group 2 is the text between the quotes
    $wrappedInQuotes = '/^(["\'])(.*)\1$/';
    $textBetween     = '\2';

    return preg_replace($wrappedInQuotes, $textBetween, $data);
}
708
|
|
|
|
709
|
|
|
/**
 * Determine whether a particular string of data has quotes around it.
 *
 * Data counts as quoted when it begins and ends with the same quote character
 * (single or double). A lone quote character is not considered quoted.
 *
 * @param string $data The data to check
 *
 * @return bool Whether the data is quoted or not
 */
protected function isQuoted($data)
{
    // The "s" modifier lets "." span line breaks inside the quotes. The
    // result is cast because preg_match() returns 1, 0 or false, not a bool.
    return (bool) preg_match('/^([\'"]).*\1$/s', $data);
}
720
|
|
|
|
721
|
|
|
/**
 * Determine what type of data is contained within a variable
 * Possible types:
 *     - special    - contains characters that could potentially need to be
 *                    quoted (quotes, possible delimiter characters, line breaks)
 *     - nonnumeric - contains at least one character that is not a digit
 *     - unknown    - everything else (digits only, or an empty string)
 * This method is really only used within the "lickQuotingStyle" method to
 * help determine whether a particular column has been quoted due to it being
 * nonnumeric or because it has some special character in it such as a delimiter
 * or newline or quote.
 *
 * @param string $data The data to determine the type of
 *
 * @return string The type of data (one of the "DATA_" constants above)
 *
 * @todo I could probably eliminate this method and use an anonymous function
 *     instead. It isn't used anywhere else and its name could be misleading.
 *     Especially since I also have a lickType method that is used within the
 *     lickHeader method.
 */
protected function lickDataType($data)
{
    // @todo make this check for only the quote and delim that are actually being used
    // that will make the guess more accurate
    // Carriage return and line feed are part of the set because a field that
    // contains a line break has to be quoted, just like one that contains a
    // delimiter or a quote.
    if (preg_match('/[\'",\t\|:;\r\n-]/', $data)) {
        return self::DATA_SPECIAL;
    }
    if (preg_match('/[^0-9]/', $data)) {
        return self::DATA_NONNUMERIC;
    }

    return self::DATA_UNKNOWN;
}
753
|
|
|
|
754
|
|
|
/**
 * Replace all instances of newlines and whatever character you specify (as
 * the delimiter) that are contained within quoted text. The replacements are
 * simply a special placeholder string. This is done so that I can use the
 * very unsmart "explode" function and not have to worry about it exploding
 * on delimiters or newlines within quotes. Once I have exploded, I typically
 * sub back in the real characters before doing anything else. Although
 * currently there is no dedicated method for doing so I just use str_replace.
 *
 * Quoted text is anything between a single or double quote and the next
 * occurrence of that same quote character. A "\r\n" pair is replaced by one
 * newline placeholder so that substituting the line terminator back in does
 * not double the line break.
 *
 * @param string $data  The string to do the replacements on
 * @param string $delim The delimiter character to replace
 *
 * @return string The data with replacements performed (or the data as it was
 *     passed in, should the regular expression engine fail)
 *
 * @todo I could probably pass in (maybe optionally) the newline character I
 *     want to replace as well. I'll do that if I need to.
 */
protected function replaceQuotedSpecialChars($data, $delim)
{
    // "s" lets quoted text span lines, "U" makes the match stop at the
    // nearest closing quote rather than the last one in the data
    $masked = preg_replace_callback('/([\'"])(.*)\1/sU', function ($matches) use ($delim) {
        $ret = preg_replace('/\r\n|\r|\n/', self::PLACEHOLDER_NEWLINE, $matches[0]);

        return str_replace($delim, self::PLACEHOLDER_DELIM, $ret);
    }, $data);

    // preg_replace_callback() returns null on failure; callers split the
    // result as a string, so fall back to the data without any masking
    return $masked === null ? $data : $masked;
}
780
|
|
|
|
781
|
|
|
/**
 * Determine the "type" of a particular string of data. Used for the lickHeader
 * method to assign a type to each column to try to determine whether the
 * first row is different than a consistent column type.
 *
 * The checks run in this order and the first one that matches wins: number,
 * double, currency, alpha, then (only for values DateTime is able to parse)
 * time, date and datetime, then alphanumeric. Anything else is a string.
 *
 * @todo As I'm writing this method I'm beginning to realize how expensive
 *     the lickHeader method is going to end up being since it has to apply all
 *     these regexes (potentially) to every column. I may end up writing a much
 *     simpler type-checking method than this if it proves to be too expensive
 *     to be practical.
 *
 * @param string $data The string of data to check the type of
 *
 * @return string One of the TYPE_ string constants above
 */
protected function lickType($data)
{
    if (preg_match('/^[+-]?[\d\.]+$/', $data)) {
        return self::TYPE_NUMBER;
    } elseif (preg_match('/^[+-]?[\d]+\.[\d]+$/', $data)) {
        // NOTE(review): this branch cannot be reached, the number pattern
        // above already matches every value this pattern matches. It is left
        // in place because telling the two apart may change which columns
        // lickHeader sees as consistently typed - confirm before reordering.
        return self::TYPE_DOUBLE;
    } elseif (preg_match('/^[+-]?(?:\xC2\xA5|\xC2\xA3|\xE2\x82\xAC|[¥£€$])\d+(\.\d+)$/', $data)) {
        // The pattern is matched byte by byte, so the UTF-8 encoded yen, pound
        // and euro signs are spelled out as byte sequences; inside a character
        // class they would only ever match one of their bytes. The class is
        // kept so "$" and single-byte symbols keep matching (assumes this
        // file is saved as UTF-8).
        return self::TYPE_CURRENCY;
    } elseif (preg_match('/^[a-zA-Z]+$/', $data)) {
        return self::TYPE_ALPHA;
    }

    // building blocks for the date and time patterns
    $year  = '([0-9]{2})?[0-9]{2}'; // two or four digits
    $month = '([01]?[0-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
    $day   = '[0-3]?[0-9]';
    $sep   = '[\/\.\-]?';
    $time  = '([0-2]?[0-9](:[0-5][0-9]){1,2}(am|pm)?|[01]?[0-9](am|pm))';
    // month-day-year, day-month-year or year-month-day
    $date  = '(' . $month . $sep . $day . $sep . $year . '|' . $day . $sep . $month . $sep . $year . '|' . $year . $sep . $month . $sep . $day . ')';

    try {
        // DateTime acts as the validity check, it throws for values it
        // cannot parse
        $dt = new DateTime($data);
        $dt->setTime(0, 0, 0);
        $now = new DateTime();
        $now->setTime(0, 0, 0);
        $diffDays = (int) $dt->diff($now)->format('%R%a');
        // a value that resolves to today's date is most likely a time string
        if ($diffDays === 0 && preg_match("/^{$time}$/i", $data)) {
            return self::TYPE_TIME;
        }
        if (preg_match("/^{$date}$/i", $data)) {
            return self::TYPE_DATE;
        }
        if (preg_match("/^{$date} {$time}$/i", $data)) {
            return self::TYPE_DATETIME;
        }
    } catch (\Exception $e) {
        // not a date or time, go on checking the remaining types
    }

    // checked whether or not DateTime could parse the value, so that the
    // result does not depend on how lenient the date parser happens to be
    if (preg_match('/^\w+$/', $data)) {
        return self::TYPE_ALNUM;
    }

    return self::TYPE_STRING;
}
840
|
|
|
} |
841
|
|
|
|