|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
|
4
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
|
5
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
|
6
|
|
|
* suck out of working with CSV. |
|
7
|
|
|
* |
|
8
|
|
|
* @version v0.2 |
|
9
|
|
|
* @copyright Copyright (c) 2016 Luke Visinoni <[email protected]> |
|
10
|
|
|
* @author Luke Visinoni <[email protected]> |
|
11
|
|
|
* @license https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT) |
|
12
|
|
|
*/ |
|
13
|
|
|
namespace CSVelte; |
|
14
|
|
|
|
|
15
|
|
|
use Carbon\Carbon; |
|
16
|
|
|
use CSVelte\Contract\Readable; |
|
17
|
|
|
use CSVelte\Exception\TasterException; |
|
18
|
|
|
|
|
19
|
|
|
/** |
|
20
|
|
|
* CSVelte\Taster |
|
21
|
|
|
* Given CSV data, Taster will "taste" the data and provide its best guess at |
|
22
|
|
|
* its "flavor". In other words, this class inspects CSV data and attempts to |
|
23
|
|
|
* auto-detect various CSV attributes such as line endings, quote characters, etc.. |
|
24
|
|
|
* |
|
25
|
|
|
* @package CSVelte |
|
26
|
|
|
* @copyright (c) 2016, Luke Visinoni <[email protected]> |
|
27
|
|
|
* @author Luke Visinoni <[email protected]> |
|
28
|
|
|
* @todo There are a ton of improvements that could be made to this class. |
|
29
|
|
|
* I'll do a refactor on this fella once I get at least one test |
|
30
|
|
|
* passing for each of its public methods. |
|
31
|
|
|
* @todo Should I have a lickEscapeChar method? The python version doesn't |
|
32
|
|
|
* have one. But then why does it even bother including one in its |
|
33
|
|
|
* flavor class? |
|
34
|
|
|
* @todo Examine each of the public methods in this class and determine |
|
35
|
|
|
* whether it makes sense to ask for the data as a param rather than |
|
36
|
|
|
* just pulling it from source. I don't think it makes sense... it |
|
37
|
|
|
* was just easier to write the methods that way during testing. |
|
38
|
|
|
* @todo There are at least portions of this class that could use the |
|
39
|
|
|
* Reader class rather than working directly with data. |
|
40
|
|
|
*/ |
|
41
|
|
|
class Taster |
|
42
|
|
|
{ |
|
43
|
|
|
/** |
|
44
|
|
|
* End-of-line constants |
|
45
|
|
|
*/ |
|
46
|
|
|
const EOL_UNIX = 'lf'; |
|
47
|
|
|
const EOL_TRS80 = 'cr'; |
|
48
|
|
|
const EOL_WINDOWS = 'crlf'; |
|
49
|
|
|
|
|
50
|
|
|
/** |
|
51
|
|
|
* ASCII character codes for "invisibles" |
|
52
|
|
|
*/ |
|
53
|
|
|
const HORIZONTAL_TAB = 9; |
|
54
|
|
|
const LINE_FEED = 10; |
|
55
|
|
|
const CARRIAGE_RETURN = 13; |
|
56
|
|
|
const SPACE = 32; |
|
57
|
|
|
|
|
58
|
|
|
/** |
|
59
|
|
|
* Data types -- Used within the lickQuotingStyle method |
|
60
|
|
|
*/ |
|
61
|
|
|
const DATA_NONNUMERIC = 'nonnumeric'; |
|
62
|
|
|
const DATA_SPECIAL = 'special'; |
|
63
|
|
|
const DATA_UNKNOWN = 'unknown'; |
|
64
|
|
|
|
|
65
|
|
|
/** |
|
66
|
|
|
* Placeholder strings -- hold the place of newlines and delimiters contained |
|
67
|
|
|
* within quoted text so that the explode method doesn't split incorrectly |
|
68
|
|
|
*/ |
|
69
|
|
|
const PLACEHOLDER_NEWLINE = '[__NEWLINE__]'; |
|
70
|
|
|
const PLACEHOLDER_DELIM = '[__DELIM__]'; |
|
71
|
|
|
|
|
72
|
|
|
/** |
|
73
|
|
|
* Recommended data sample size |
|
74
|
|
|
*/ |
|
75
|
|
|
const SAMPLE_SIZE = 2500; |
|
76
|
|
|
|
|
77
|
|
|
/** |
|
78
|
|
|
* Column data types -- used within the lickHeader method to determine |
|
79
|
|
|
* whether the first row contains different types of data than the rest of |
|
80
|
|
|
* the rows (and thus, is likely a header row) |
|
81
|
|
|
*/ |
|
82
|
|
|
// +-987 |
|
83
|
|
|
const TYPE_NUMBER = 'number'; |
|
84
|
|
|
// +-12.387 |
|
85
|
|
|
const TYPE_DOUBLE = 'double'; |
|
86
|
|
|
// I am a string. I can contain all kinds of stuff. |
|
87
|
|
|
const TYPE_STRING = 'string'; |
|
88
|
|
|
// 10-Jul-15, 9/1/2007, April 1st, 2006, etc. |
|
89
|
|
|
const TYPE_DATE = 'date'; |
|
90
|
|
|
// 10:00pm, 5pm, 13:08, etc. |
|
91
|
|
|
const TYPE_TIME = 'time'; |
|
92
|
|
|
// $98.96, ¥12389, £6.08, €87.00 |
|
93
|
|
|
const TYPE_CURRENCY = 'currency'; |
|
94
|
|
|
// 12ab44m1n2_asdf |
|
95
|
|
|
const TYPE_ALNUM = 'alnum'; |
|
96
|
|
|
// abababab |
|
97
|
|
|
const TYPE_ALPHA = 'alpha'; |
|
98
|
|
|
|
|
99
|
|
|
/** |
|
100
|
|
|
* @var CSVelte\Contract\Readable The source of data to examine |
|
101
|
|
|
* @access protected |
|
102
|
|
|
*/ |
|
103
|
|
|
protected $input; |
|
104
|
|
|
|
|
105
|
|
|
/** |
|
106
|
|
|
* Sample of CSV data to use for tasting (determining CSV flavor) |
|
107
|
|
|
* @var string |
|
108
|
|
|
*/ |
|
109
|
|
|
protected $sample; |
|
110
|
|
|
|
|
111
|
|
|
/**
 * Class constructor--accepts a CSV input source.
 *
 * Keeps a reference to the input and immediately pulls a sample from it by
 * calling read() with self::SAMPLE_SIZE. The sample is what every lick*()
 * method in this class examines.
 *
 * @param \CSVelte\Contract\Readable $input The source of CSV data
 * @access public
 * @todo It may be a good idea to skip the first line or two for the sample
 *     so that the header line(s) don't throw things off (with the exception
 *     of lickHeader() obviously)
 */
public function __construct(Readable $input)
{
    $sample = $input->read(self::SAMPLE_SIZE);

    $this->input  = $input;
    $this->sample = $sample;
}
|
126
|
|
|
|
|
127
|
|
|
/**
 * Examine the input source and determine what "Flavor" of CSV it contains.
 *
 * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),
 * doesn't necessarily always conform to it. And it doesn't provide metadata
 * such as the delimiting character, quote character, or what types of data
 * are quoted. This method guesses each of those attributes from the sample
 * and bundles them into a Flavor.
 *
 * When the quote character and delimiter cannot be detected together, the
 * quote character defaults to a double quote and the delimiter is guessed
 * separately by lickDelimiter(). Any other TasterException is re-thrown.
 *
 * @return \CSVelte\Flavor The metadata that the CSV format doesn't provide
 * @throws \CSVelte\Exception\TasterException
 * @access public
 * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails
 * @todo Should there be a lickEscapeChar method? The python module that
 *     inspired this library doesn't include one...
 * @todo This should cache the results and only regenerate if $this->sample
 *     changes (or $this->input)
 */
public function lick()
{
    /**
     * @todo Should this be null? Because doubleQuote = true means this = null
     */
    $escapeChar = '\\';
    $lineTerminator = $this->lickLineEndings();

    try {
        $detected  = $this->lickQuoteAndDelim();
        $quoteChar = $detected[0];
        $delimiter = $detected[1];
    } catch (TasterException $e) {
        if ($e->getCode() !== TasterException::ERR_QUOTE_AND_DELIM) {
            throw $e;
        }
        // No quoted columns to learn from: assume double quotes and fall
        // back to the frequency-based delimiter guess.
        $quoteChar = '"';
        $delimiter = $this->lickDelimiter($lineTerminator);
    }

    $quoteStyle = $this->lickQuotingStyle($quoteChar, $delimiter, $lineTerminator);
    $header     = $this->lickHeader($quoteChar, $delimiter, $lineTerminator);

    return new Flavor(array(
        'quoteChar'      => $quoteChar,
        'escapeChar'     => $escapeChar,
        'delimiter'      => $delimiter,
        'lineTerminator' => $lineTerminator,
        'quoteStyle'     => $quoteStyle,
        'header'         => $header,
    ));
}
|
160
|
|
|
|
|
161
|
|
|
/**
 * Strip every quoted string (single- or double-quoted, quotes included) out
 * of the given text.
 *
 * Used so that explode() does not split on delimiters or newlines that sit
 * inside quotes. A backslash inside the quoted text is consumed together
 * with the character after it, so a backslash-escaped quote does not end
 * the string.
 *
 * @param string $data The string to remove quoted strings from
 * @return string The input string with quoted strings removed
 * @access protected
 * @todo Replace code that uses this method with the replaceQuotedSpecialChars
 *     method instead. I think it's cleaner.
 */
protected function removeQuotedStrings($data)
{
    $quotedString = '/(["\'])(?:(?=(\\\\?))\2.)*?\1/sm';

    return preg_replace($quotedString, '', $data);
}
|
177
|
|
|
|
|
178
|
|
|
/**
 * Examine the sample to determine which character(s) are being used as the
 * end-of-line sequence.
 *
 * Quoted strings are removed first so that line breaks embedded in quoted
 * columns are not counted. Each "\r\n" pair is counted once as a Windows
 * line ending and is NOT counted again as a bare "\n" or a bare "\r";
 * otherwise a single stray "\n" in a CRLF file would make "\n" win.
 *
 * @return string The end-of-line sequence ("\r\n", "\n" or "\r"); PHP_EOL
 *     when the sample contains no line break at all
 * @access protected
 * @credit pulled from stackoverflow thread *tips hat to username "Harm"*
 * @todo If there is any way for this method to fail (for instance if a file
 *     is totally empty or contains no line breaks), then it needs to throw
 *     a relevant TasterException
 * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings()
 */
protected function lickLineEndings()
{
    $str = $this->removeQuotedStrings($this->sample);
    if (!is_string($str)) {
        // preg_replace() yields null on a PCRE runtime error; fall back to
        // the raw sample rather than counting inside a non-string.
        $str = (string) $this->sample;
    }

    $crlf = substr_count($str, "\r\n");
    // Candidates in priority order; on a tie the earlier entry wins.
    $counts = array(
        "\r\n" => $crlf,                           // Windows, DOS, OS/2
        "\n"   => substr_count($str, "\n") - $crlf, // Unix, OSX (bare LF only)
        "\r"   => substr_count($str, "\r") - $crlf, // Apple ][, TRS80 (bare CR only)
    );

    $curCount = 0;
    $curEol = PHP_EOL;
    foreach ($counts as $eol => $count) {
        if ($count > $curCount) {
            $curCount = $count;
            $curEol = $eol;
        }
    }

    return $curEol;
}
|
212
|
|
|
|
|
213
|
|
|
/**
 * Detect the quote character and the delimiter together.
 *
 * The best way to determine quote and delimiter characters is when columns
 * are quoted: often you can seek out a pattern of delim, quote, stuff,
 * quote, delim. This only works if the sample has quoted columns. If it
 * doesn't, the characters have to be determined some other way (see
 * lickDelimiter).
 *
 * Three patterns are tried in order and the first one that matches is used:
 *   1. ,"something",   quoted column between two identical delimiters
 *   2. "something",    quoted column at the start of a line
 *   3. ,"something"    quoted column at the end of a line
 * A line break may be "\n", "\r" or "\r\n". A quoted column that fills an
 * entire line is deliberately not matched: it carries no delimiter, so the
 * pair cannot be determined from it.
 *
 * @return array A two-element array: [quote character, delimiter character]
 * @throws \CSVelte\Exception\TasterException With code ERR_QUOTE_AND_DELIM
 *     when no pattern matches the sample
 * @access protected
 */
protected function lickQuoteAndDelim()
{
    // A delimiter can be any single character except line breaks, word
    // characters (letters, digits, underscore), single/double quotes and
    // the space character.
    $antidelims = implode(array(
        "\r",
        "\n",
        '\w',
        preg_quote('"', '/'),
        preg_quote("'", '/'),
        preg_quote(chr(self::SPACE), '/'),
    ));
    $delim = '(?P<delim>[^' . $antidelims . '])';
    // @todo I think MS Excel uses some strange encoding for fancy open/close quotes
    $quote = '(?P<quoteChar>"|\'|`)';
    $lineStart = '(?:^|\r|\n)';
    $lineEnd = '(?:$|\r|\n)';

    $patterns = array(
        // ,"something",  (group 1 = delim, group 2 = quote)
        '/' . $delim . ' ?' . $quote . '.*?\2\1/ms',
        // "something",   (group 1 = quote, group 2 = delim)
        '/' . $lineStart . $quote . '.*?\1' . $delim . ' ?/ms',
        // ,"something"   (group 1 = delim, group 2 = quote)
        '/' . $delim . ' ?' . $quote . '.*?\2' . $lineEnd . '/ms',
    );

    // Returns the value that occurs most often in a list of strings.
    $mostFrequent = function (array $values) {
        $counts = array_count_values($values);
        arsort($counts);
        reset($counts);
        return (string) key($counts);
    };

    foreach ($patterns as $pattern) {
        $matches = array();
        // preg_match_all() gives 0 for "no match" and false on a PCRE error;
        // either way this pattern tells us nothing, so try the next one.
        if (!preg_match_all($pattern, $this->sample, $matches)) {
            continue;
        }
        return array(
            $mostFrequent($matches['quoteChar']),
            $mostFrequent($matches['delim']),
        );
    }

    throw new TasterException("quoteChar and delimiter cannot be determined", TasterException::ERR_QUOTE_AND_DELIM);
}
|
263
|
|
|
|
|
264
|
|
|
/**
 * Take a list of likely delimiter characters and pick one based on how
 * often each occurs per line of the sample.
 *
 * Quoted strings are removed first, then for every non-blank line the
 * number of occurrences of each candidate is recorded. Each candidate is
 * scored as (average occurrences / mode of occurrences), or 0 when the mode
 * is falsy, and the candidate with the highest score is returned.
 *
 * @param string $eol The character(s) used for newlines
 * @return string The delimiter character that was chosen
 * @throws \CSVelte\Exception\TasterException With code ERR_DELIMITER when
 *     no candidate could be scored
 * @access protected
 * @todo Refactor this method--It needs more thorough testing against a wider
 *     variety of CSV data to be sure it works reliably. And I'm sure there
 *     are many performance and logic improvements that could be made. This
 *     is essentially a first draft.
 * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings
 */
protected function lickDelimiter($eol = "\n")
{
    $delimiters = array(",", "\t", "|", ":", ";", "/", '\\');
    $lines = explode($eol, $this->removeQuotedStrings($this->sample));

    // $charFrequency[candidate][line index] = occurrences on that line.
    // One pass is enough: the counts depend only on the line itself.
    $charFrequency = array();
    foreach ($lines as $key => $line) {
        if (!trim($line)) {
            continue;
        }
        foreach ($delimiters as $char) {
            $charFrequency[$char][$key] = substr_count($line, $char);
        }
    }

    // NOTE(review): both helpers appear to return one value per candidate,
    // keyed by the candidate character -- confirm against Utils.
    $averages = Utils::array_average($charFrequency);
    $modes = Utils::array_mode($charFrequency);

    $consistencies = array();
    foreach ($averages as $char => $avg) {
        if (!array_key_exists($char, $modes)) {
            continue;
        }
        $mode = $modes[$char];
        $consistencies[$char] = $mode ? $avg / $mode : 0;
    }

    if (empty($consistencies)) {
        throw new TasterException('Cannot determine delimiter character', TasterException::ERR_DELIMITER);
    }

    arsort($consistencies);
    reset($consistencies);

    return key($consistencies);
}
|
316
|
|
|
|
|
317
|
|
|
/**
 * Determine the "style" of data quoting.
 *
 * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),
 * doesn't necessarily always conform to it, and it doesn't say which columns
 * get quoted. This method makes a logical guess by finding which columns
 * have been quoted (if any) and examining their data type:
 *   - every column quoted                          => QUOTE_ALL
 *   - no column quoted                             => QUOTE_NONE
 *   - quoted columns are all "nonnumeric"          => QUOTE_NONNUMERIC
 *   - quoted columns are all "special"             => QUOTE_MINIMAL
 *   - a mix that includes "nonnumeric", where the
 *     type counts are not all equal                => QUOTE_NONNUMERIC
 *   - anything else                                => QUOTE_MINIMAL
 *
 * @param string $quote The type of quote character being used (currently
 *     not consulted; isQuoted()/unQuote() accept either quote)
 * @param string $delim The character used as the column delimiter
 * @param string $eol The character(s) used for newlines
 * @return string One of the Flavor::QUOTE_* constants
 * @access protected
 * @todo Refactor this method--It needs more thorough testing against a wider
 *     variety of CSV data to be sure it works reliably. This is essentially
 *     a first draft.
 */
protected function lickQuotingStyle($quote, $delim, $eol)
{
    // Delimiters and newlines inside quotes are swapped for placeholders so
    // the explode() calls below only split on real row/column boundaries.
    $data = $this->replaceQuotedSpecialChars($this->sample, $delim);

    $sawQuoted = false;
    $sawUnquoted = false;
    $quotedTypes = array();

    foreach (explode($eol, $data) as $line) {
        // now we can sub back in the correct newlines
        $line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line);
        foreach (explode($delim, $line) as $col) {
            // now we can sub back in the correct delim characters
            $col = str_replace(self::PLACEHOLDER_DELIM, $delim, $col);
            if ($this->isQuoted($col)) {
                $sawQuoted = true;
                $quotedTypes[] = $this->lickDataType($this->unQuote($col));
            } else {
                $sawUnquoted = true;
            }
        }
    }

    if (!$sawUnquoted) {
        return Flavor::QUOTE_ALL;
    }
    if (!$sawQuoted) {
        return Flavor::QUOTE_NONE;
    }

    $types = array_unique($quotedTypes);
    if (count($types) == 1) {
        $onlyType = current($types);
        if ($onlyType == self::DATA_SPECIAL) {
            return Flavor::QUOTE_MINIMAL;
        }
        if ($onlyType == self::DATA_NONNUMERIC) {
            return Flavor::QUOTE_NONNUMERIC;
        }
    } elseif (in_array(self::DATA_NONNUMERIC, $types, true)) {
        // allow for a SMALL amount of error here
        $counts = array(self::DATA_SPECIAL => 0, self::DATA_NONNUMERIC => 0);
        foreach ($quotedTypes as $type) {
            // DATA_UNKNOWN (digits-only) columns can be quoted too; give
            // them a slot instead of incrementing an undefined key.
            if (!isset($counts[$type])) {
                $counts[$type] = 0;
            }
            $counts[$type]++;
        }
        arsort($counts);
        $most = reset($counts);
        $least = end($counts);
        // $most is at least 1 here because a nonnumeric column was seen.
        if ($least / $most < 1) {
            return Flavor::QUOTE_NONNUMERIC;
        }
    }

    return Flavor::QUOTE_MINIMAL;
}
|
403
|
|
|
|
|
404
|
|
|
/**
 * Remove quotes around a piece of text (if there are any).
 *
 * Only a matching pair of single or double quotes at the very start and
 * end of the text is stripped; text that is not wrapped that way is
 * returned unchanged.
 *
 * @param string $data The data to "unquote"
 * @return string The data passed in, only with quotes stripped (off the edges)
 * @access protected
 */
protected function unQuote($data)
{
    $wrappedInQuotes = '/^(["\'])(.*)\1$/';
    $innerText = '\2';

    return preg_replace($wrappedInQuotes, $innerText, $data);
}
|
415
|
|
|
|
|
416
|
|
|
/**
 * Determine whether a particular string of data has quotes around it.
 *
 * The data counts as quoted when it starts with a single or double quote
 * and ends with the same quote character. The text in between may contain
 * anything, including newlines and further quote characters (for example
 * doubled quotes used as an escape).
 *
 * @param string $data The data to check
 * @return boolean Whether the data is quoted or not
 * @access protected
 */
protected function isQuoted($data)
{
    // "\1" is only a backreference outside a character class, so the body
    // is matched with a plain dot-all wildcard.
    return (bool) preg_match('/^([\'"]).*\1$/s', $data);
}
|
427
|
|
|
|
|
428
|
|
|
/**
 * Determine what type of data is contained within a variable.
 *
 * Possible types, checked in this order:
 *   - special    - contains a character that could require quoting: a
 *                  single or double quote, comma, tab, pipe, colon,
 *                  semicolon or hyphen
 *   - nonnumeric - contains at least one character that is not a digit 0-9
 *   - unknown    - everything else (digits only, or an empty string)
 *
 * This method is really only used within the "lickQuotingStyle" method to
 * help determine whether a particular column has been quoted due to it
 * being nonnumeric or because it has some special character in it.
 *
 * @param string $data The data to determine the type of
 * @return string The type of data (one of the "DATA_" constants)
 * @access protected
 * @todo Make this check for only the quote and delim that are actually
 *     being used; that will make the guess more accurate.
 * @todo I could probably eliminate this method and use an anonymous function
 *     instead. It isn't used anywhere else and its name could be misleading,
 *     especially since there is also a lickType method.
 */
protected function lickDataType($data)
{
    $hasSpecialChar = preg_match('/[\'",\t\|:;-]/', $data);
    if ($hasSpecialChar) {
        return self::DATA_SPECIAL;
    }

    $hasNonDigit = preg_match('/[^0-9]/', $data);
    if ($hasNonDigit) {
        return self::DATA_NONNUMERIC;
    }

    return self::DATA_UNKNOWN;
}
|
458
|
|
|
|
|
459
|
|
|
/**
 * Replace all newlines and all occurrences of the given delimiter that are
 * contained within quoted text with placeholder strings.
 *
 * This is done so that the very unsmart "explode" function can be used
 * without splitting on delimiters or newlines inside quotes. Callers
 * typically substitute the real characters back in (with str_replace) once
 * they have exploded.
 *
 * A "\r\n" pair inside quotes becomes ONE newline placeholder, so it is
 * restored as a single line break rather than two.
 *
 * @param string $data The string to do the replacements on
 * @param string $delim The delimiter character to replace
 * @return string The data with replacements performed
 * @access protected
 * @todo I could probably pass in (maybe optionally) the newline character I
 *     want to replace as well. I'll do that if I need to.
 */
protected function replaceQuotedSpecialChars($data, $delim)
{
    // The U modifier makes ".*" lazy, so each match stops at the first
    // closing quote that equals the opening one.
    return preg_replace_callback('/([\'"])(.*)\1/imsU', function ($matches) use ($delim) {
        $ret = preg_replace('/\r\n|\r|\n/', self::PLACEHOLDER_NEWLINE, $matches[0]);
        return str_replace($delim, self::PLACEHOLDER_DELIM, $ret);
    }, $data);
}
|
483
|
|
|
|
|
484
|
|
|
/**
 * Determine the "type" of a particular string of data. Used by the
 * lickHeader method to assign a type to each column, to try to determine
 * whether the first row is different from a consistent column type.
 *
 * Checks run in this order: number, double, currency, alpha, then (only if
 * Carbon can parse the value) time, date and date-time. A value Carbon
 * cannot parse is "alnum" when it consists solely of word characters.
 * Everything else is a plain string.
 *
 * @param string $data The string of data to check the type of
 * @return string One of the TYPE_ string constants, or the literal
 *     'datetime' for a date followed by a time
 * @access protected
 * @uses \Carbon\Carbon date/time library/class
 * @todo This applies (potentially) every regex to every column, which makes
 *     lickHeader expensive. A simpler check may be needed if this proves
 *     too slow to be practical.
 */
protected function lickType($data)
{
    // NOTE(review): this pattern also accepts decimals such as "12.5", so
    // they are reported as TYPE_NUMBER and the TYPE_DOUBLE check below is
    // never reached. Left as-is because lickHeader compares these types and
    // splitting them would change its header guess.
    if (preg_match('/^[+-]?[\d\.]+$/', $data)) {
        return self::TYPE_NUMBER;
    }
    if (preg_match('/^[+-]?[\d]+\.[\d]+$/', $data)) {
        return self::TYPE_DOUBLE;
    }
    // Currency symbol followed by digits, decimals optional ("¥12389").
    // The alternation matches the multibyte UTF-8 symbols; the bracketed
    // class still matches "$" and single-byte encodings of the symbols.
    if (preg_match('/^[+-]?(?:¥|£|€|[¥£€$])\d+(\.\d+)?$/', $data)) {
        return self::TYPE_CURRENCY;
    }
    if (preg_match('/^[a-zA-Z]+$/', $data)) {
        return self::TYPE_ALPHA;
    }

    $year = '([01][0-9])?[0-9]{2}';
    $month = '([01]?[0-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
    $day = '[0-3]?[0-9]';
    $sep = '[\/\.\-]?';
    $time = '([0-2]?[0-9](:[0-5][0-9]){1,2}(am|pm)?|[01]?[0-9](am|pm))';
    $date = '(' . $month . $sep . $day . $sep . $year . '|' . $day . $sep . $month . $sep . $year . '|' . $year . $sep . $month . $sep . $day . ')';

    try {
        // Carbon is only a gate: values it cannot parse are not treated as
        // dates or times at all.
        Carbon::parse($data);
    } catch (\Exception $e) {
        return preg_match('/^\w+$/', $data) ? self::TYPE_ALNUM : self::TYPE_STRING;
    }

    // The pattern is anchored at both ends, so a match means the value is a
    // time and nothing else.
    if (preg_match("/^{$time}$/i", $data)) {
        return self::TYPE_TIME;
    }
    if (preg_match("/^{$date}$/i", $data)) {
        return self::TYPE_DATE;
    }
    if (preg_match("/^{$date} {$time}$/i", $data)) {
        // The class declares no TYPE_DATETIME constant, so the value is
        // spelled out here.
        return 'datetime';
    }

    return self::TYPE_STRING;
}
|
539
|
|
|
|
|
540
|
|
|
/**
 * Examines the sample to make a determination of whether or not it contains
 * a header row.
 *
 * Every column of every row is reduced to a data type (see lickType) and a
 * length. The first row is then compared against each following row,
 * column by column: when the first row's column is a plain string the
 * lengths are compared, otherwise the types are. Each difference adds one
 * to a score and each match subtracts one; a positive score means the first
 * row looks unlike the rest and is most likely a header.
 *
 * This is only a guess. There is no programmatic way to determine 100%
 * whether a CSV file has a header; the format does not provide metadata
 * such as that.
 *
 * @param string $quote The CSV data's quoting char (currently not consulted)
 * @param string $delim The CSV data's delimiting char (can be a variety of
 *     chars but typically is either a comma or a tab, sometimes a pipe)
 * @param string $eol The CSV data's end-of-line char(s) (\n \r or \r\n)
 * @return boolean True if the data (most likely) contains a header row
 * @access public
 * @todo This method needs a total refactor. One loop would be enough, and
 *     would allow stopping as soon as the score reaches a particular
 *     threshold (+-100 for instance).
 * @todo Also, stop after a certain (perhaps configurable) number of lines;
 *     this is an expensive method.
 * @todo Because the header isn't actually part of the "flavor", the quote,
 *     delim and eol arguments could be "licked" here instead, or a Reader
 *     object could be used to read the data.
 */
public function lickHeader($quote, $delim, $eol)
{
    $data = $this->replaceQuotedSpecialChars($this->sample, $delim);

    // $rows[line][column] = ['type' => ..., 'length' => ...]
    $rows = array();
    foreach (explode($eol, $data) as $lineNo => $line) {
        // now we can sub back in the correct newlines
        $line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line);
        foreach (explode($delim, $line) as $colNo => $col) {
            // now we can sub back in the correct delim characters
            $col = str_replace(self::PLACEHOLDER_DELIM, $delim, $col);
            $rows[$lineNo][$colNo] = array(
                'type'   => $this->lickType($this->unQuote($col)),
                'length' => strlen($col),
            );
        }
    }

    $candidate = array_shift($rows);
    $score = 0;
    foreach ($rows as $cols) {
        foreach ($cols as $colNo => $cell) {
            if (!array_key_exists($colNo, $candidate)) {
                continue;
            }
            $headerCell = $candidate[$colNo];
            if ($headerCell['type'] == self::TYPE_STRING) {
                // plain strings all share one type, so use length
                $differs = $cell['length'] != $headerCell['length'];
            } else {
                $differs = $cell['type'] != $headerCell['type'];
            }
            $score += $differs ? 1 : -1;
        }
    }

    return $score > 0;
}
|
607
|
|
|
} |
|
608
|
|
|
|
Adding a `@return` annotation to a constructor is not recommended, since a constructor does not have a meaningful return value. Please refer to the PHP core documentation on constructors.