These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | /** |
||
3 | * CSVelte: Slender, elegant CSV for PHP |
||
4 | * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
||
5 | * standardization efforts, CSVelte was written in an effort to take all the |
||
6 | * suck out of working with CSV. |
||
7 | * |
||
8 | * @version v0.2 |
||
9 | * @copyright Copyright (c) 2016 Luke Visinoni <[email protected]> |
||
10 | * @author Luke Visinoni <[email protected]> |
||
11 | * @license https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT) |
||
12 | */ |
||
13 | namespace CSVelte; |
||
14 | |||
15 | use Carbon\Carbon; |
||
16 | use CSVelte\Contract\Readable; |
||
17 | use CSVelte\Exception\TasterException; |
||
18 | |||
19 | /** |
||
20 | * CSVelte\Taster |
||
21 | * Given CSV data, Taster will "taste" the data and provide its best guess at |
||
22 | * its "flavor". In other words, this class inspects CSV data and attempts to |
||
23 | * auto-detect various CSV attributes such as line endings, quote characters, etc.. |
||
24 | * |
||
25 | * @package CSVelte |
||
26 | * @copyright (c) 2016, Luke Visinoni <[email protected]> |
||
27 | * @author Luke Visinoni <[email protected]> |
||
28 | * @todo There are a ton of improvements that could be made to this class. |
||
29 | * I'll do a refactor on this fella once I get at least one test |
||
30 | * passing for each of its public methods. |
||
31 | * @todo Should I have a lickEscapeChar method? The python version doesn't |
||
32 | * have one. But then why does it even bother including one in its |
||
33 | * flavor class? |
||
34 | * @todo Examine each of the public methods in this class and determine |
||
35 | * whether it makes sense to ask for the data as a param rather than |
||
36 | * just pulling it from source. I don't think it makes sense... it |
||
37 | * was just easier to write the methods that way during testing. |
||
38 | * @todo There are at least portions of this class that could use the |
||
39 | * Reader class rather than working directly with data. |
||
40 | */ |
||
41 | class Taster |
||
42 | { |
||
43 | /** |
||
44 | * End-of-line constants |
||
45 | */ |
||
46 | const EOL_UNIX = 'lf'; |
||
47 | const EOL_TRS80 = 'cr'; |
||
48 | const EOL_WINDOWS = 'crlf'; |
||
49 | |||
50 | /** |
||
51 | * ASCII character codes for "invisibles" |
||
52 | */ |
||
53 | const HORIZONTAL_TAB = 9; |
||
54 | const LINE_FEED = 10; |
||
55 | const CARRIAGE_RETURN = 13; |
||
56 | const SPACE = 32; |
||
57 | |||
58 | /** |
||
59 | * Data types -- Used within the lickQuotingStyle method |
||
60 | */ |
||
61 | const DATA_NONNUMERIC = 'nonnumeric'; |
||
62 | const DATA_SPECIAL = 'special'; |
||
63 | const DATA_UNKNOWN = 'unknown'; |
||
64 | |||
65 | /** |
||
66 | * Placeholder strings -- hold the place of newlines and delimiters contained |
||
67 | * within quoted text so that the explode method doesn't split incorrectly |
||
68 | */ |
||
69 | const PLACEHOLDER_NEWLINE = '[__NEWLINE__]'; |
||
70 | const PLACEHOLDER_DELIM = '[__DELIM__]'; |
||
71 | |||
72 | /** |
||
73 | * Recommended data sample size |
||
74 | */ |
||
75 | const SAMPLE_SIZE = 2500; |
||
76 | |||
77 | /** |
||
78 | * Column data types -- used within the lickHeader method to determine |
||
79 | * whether the first row contains different types of data than the rest of |
||
80 | * the rows (and thus, is likely a header row) |
||
81 | */ |
||
82 | // +-987 |
||
83 | const TYPE_NUMBER = 'number'; |
||
84 | // +-12.387 |
||
85 | const TYPE_DOUBLE = 'double'; |
||
86 | // I am a string. I can contain all kinds of stuff. |
||
87 | const TYPE_STRING = 'string'; |
||
88 | // 10-Jul-15, 9/1/2007, April 1st, 2006, etc. |
||
1 ignored issue
–
show
|
|||
89 | const TYPE_DATE = 'date'; |
||
90 | // 10:00pm, 5pm, 13:08, etc. |
||
1 ignored issue
–
show
Unused Code
Comprehensibility
introduced
by
56% of this comment could be valid code. Did you maybe forget this after debugging?
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it. The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production. This check looks for comments that seem to be mostly valid code and reports them.
Loading history...
|
|||
91 | const TYPE_TIME = 'time'; |
||
92 | // $98.96, ¥12389, £6.08, €87.00 |
||
1 ignored issue
–
show
Unused Code
Comprehensibility
introduced
by
50% of this comment could be valid code. Did you maybe forget this after debugging?
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it. The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production. This check looks for comments that seem to be mostly valid code and reports them.
Loading history...
|
|||
93 | const TYPE_CURRENCY = 'currency'; |
||
94 | // 12ab44m1n2_asdf |
||
95 | const TYPE_ALNUM = 'alnum'; |
||
96 | // abababab |
||
97 | const TYPE_ALPHA = 'alpha'; |
||
98 | |||
99 | /** |
||
100 | * @var \CSVelte\Contract\Readable The source of data to examine |
||
101 | * @access protected |
||
102 | */ |
||
103 | protected $input; |
||
104 | |||
105 | /** |
||
106 | * Sample of CSV data to use for tasting (determining CSV flavor) |
||
107 | * @var string |
||
108 | */ |
||
109 | protected $sample; |
||
110 | |||
/**
 * Class constructor -- wraps a CSV input source and pulls a sample from it.
 *
 * The source is stored on the instance and a sample is requested from it
 * right away via read(self::SAMPLE_SIZE). Every lick*() method in this class
 * works on that sample.
 *
 * @param \CSVelte\Contract\Readable $input The source of CSV data
 * @throws \CSVelte\Exception\TasterException With code ERR_INVALID_SAMPLE when
 *     the value returned by read() is falsy
 * @todo It may be a good idea to skip the first line or two for the sample
 *     so that the header line(s) don't throw things off (with the exception
 *     of lickHeader() obviously)
 */
public function __construct(Readable $input)
{
    $this->input = $input;
    $this->sample = $input->read(self::SAMPLE_SIZE);

    if ($this->sample) {
        return;
    }

    throw new TasterException("Invalid input, cannot read sample.", TasterException::ERR_INVALID_SAMPLE);
}
|
126 | |||
/**
 * Examine the sample and determine what "Flavor" of CSV it contains.
 *
 * The CSV format has an RFC (https://tools.ietf.org/html/rfc4180) but real
 * files don't always conform to it, and the format carries no metadata such
 * as the delimiting character, the quote character, or which kinds of data
 * are quoted. This method guesses each of those in turn:
 *
 *  1. line terminator      -- lickLineEndings()
 *  2. quote char/delimiter -- lickQuoteAndDelim(); when that fails with
 *     ERR_QUOTE_AND_DELIM the quote char defaults to a double quote and the
 *     delimiter comes from lickDelimiter()
 *  3. quoting style        -- lickQuotingStyle()
 *  4. header row           -- lickHeader()
 *
 * The escape character is not detected; it is always a backslash.
 *
 * @return \CSVelte\Flavor The metadata that the CSV format doesn't provide
 * @throws \CSVelte\Exception\TasterException Any taster failure other than
 *     ERR_QUOTE_AND_DELIM is re-thrown untouched
 * @access public
 * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails
 * @todo Should there be a lickEscapeChar method? The python module that
 *     inspired this library doesn't include one...
 * @todo This should cache the results and only regenerate if $this->sample
 *     changes (or $this->input)
 */
public function lick()
{
    $lineTerminator = $this->lickLineEndings();

    try {
        $quoteAndDelim = $this->lickQuoteAndDelim();
        $quoteChar = $quoteAndDelim[0];
        $delimiter = $quoteAndDelim[1];
    } catch (TasterException $e) {
        if ($e->getCode() !== TasterException::ERR_QUOTE_AND_DELIM) {
            throw $e;
        }
        // No quoted columns to learn from: assume the conventional quote
        // character and infer the delimiter from character frequencies.
        $quoteChar = '"';
        $delimiter = $this->lickDelimiter($lineTerminator);
    }

    // @todo Should this be null? Because doubleQuote = true means this = null
    $escapeChar = '\\';
    $quoteStyle = $this->lickQuotingStyle($delimiter, $lineTerminator);
    $header = $this->lickHeader($delimiter, $lineTerminator);

    return new Flavor(array(
        'quoteChar' => $quoteChar,
        'escapeChar' => $escapeChar,
        'delimiter' => $delimiter,
        'lineTerminator' => $lineTerminator,
        'quoteStyle' => $quoteStyle,
        'header' => $header,
    ));
}
||
160 | |||
/**
 * Strip every quoted string (single- or double-quoted, quotes included) out
 * of the given text.
 *
 * Used so that explode() does not split on delimiters or newlines that sit
 * inside quotes. A backslash immediately before a character inside the
 * quotes is consumed together with that character, so a backslash-escaped
 * quote does not end the quoted string.
 *
 * @param string $data The string to remove quoted strings from
 * @return string The input with all quoted strings replaced by nothing
 * @access protected
 * @todo Replace code that uses this method with the replaceQuotedSpecialChars
 *     method instead. I think it's cleaner.
 */
protected function removeQuotedStrings($data)
{
    $quotedString = '/(["\'])(?:(?=(\\\\?))\2.)*?\1/sm';

    return preg_replace($quotedString, '', $data);
}
||
177 | |||
/**
 * Guess which character sequence the sample uses to end its lines.
 *
 * Quoted strings are removed first so that line breaks inside quoted values
 * are not counted. Each candidate sequence is then counted in what remains,
 * and a candidate only replaces the current pick when it occurs strictly
 * more often. Because "\r\n" is tried first, a lone "\n" or "\r" has to
 * out-number it to win. When no candidate occurs at all, PHP_EOL is returned.
 *
 * @return string The end-of-line sequence ("\r\n", "\n" or "\r"), or PHP_EOL
 *     when the sample contains no line breaks outside quotes
 * @access protected
 * @credit pulled from stackoverflow thread *tips hat to username "Harm"*
 * @todo This should throw a relevant TasterException if it cannot determine
 *     the line ending (e.g. the sample contains no line breaks)
 * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings()
 */
protected function lickLineEndings()
{
    $unquoted = $this->removeQuotedStrings($this->sample);

    // Order matters: CRLF must be tried before its two component characters.
    $candidates = [
        self::EOL_WINDOWS => "\r\n", // 0x0D 0x0A - Windows, DOS, OS/2
        self::EOL_UNIX    => "\n",   // 0x0A      - Unix, OSX
        self::EOL_TRS80   => "\r",   // 0x0D      - Apple ][, TRS80
    ];

    $winner = PHP_EOL;
    $highest = 0;
    foreach ($candidates as $sequence) {
        $occurrences = substr_count($unquoted, $sequence);
        if ($occurrences <= $highest) {
            continue;
        }
        $highest = $occurrences;
        $winner = $sequence;
    }

    return $winner;
}
||
212 | |||
/**
 * Detect the quote character and the delimiter from quoted columns.
 *
 * When columns are quoted you can look for a pattern of delimiter, quote,
 * stuff, quote, delimiter. Four patterns are tried in order against the
 * sample and the first one that matches anything decides the result:
 *
 *  1. ,"something",  -- quoted column with a delimiter on both sides
 *  2. "something",   -- quoted column at the start of a line
 *  3. ,"something"   -- quoted column at the end of a line
 *  4. "something"    -- quoted column that is a whole line by itself
 *
 * The most frequently captured quote character and delimiter are returned.
 * Pattern 4 can only reveal the quote character; since callers need both
 * values, a match on pattern 4 alone is still reported as a failure so that
 * the caller can fall back to another strategy (see lickDelimiter).
 *
 * @return array A two-element array: array(quoteChar, delimiter)
 * @throws \CSVelte\Exception\TasterException With code ERR_QUOTE_AND_DELIM
 *     when the pair cannot be determined this way
 * @access protected
 * @todo This should check for any line endings not just \n
 */
protected function lickQuoteAndDelim()
{
    // A delimiter can be any single character except a line break, a "word"
    // character (letter, digit, underscore), a single/double quote or a space.
    $antidelims = implode(array(
        "\r",
        "\n",
        '\w',
        preg_quote('"', '/'),
        preg_quote("'", '/'),
        preg_quote(chr(self::SPACE), '/'),
    ));
    $delim = '(?P<delim>[^' . $antidelims . '])';
    // @todo I think MS Excel uses some strange encoding for fancy open/close quotes
    $quote = '(?P<quoteChar>"|\'|`)';

    // Named groups are numbered too, so the back-references below depend on
    // which of $delim / $quote appears first in each pattern.
    $patterns = array(
        // delim is group 1, quote is group 2
        '/' . $delim . ' ?' . $quote . '.*?\2\1/ms',
        // quote is group 1, delim is group 2
        '/(?:^|\n)' . $quote . '.*?\1' . $delim . ' ?/ms',
        // delim is group 1, quote is group 2; must be followed by end of line
        '/' . $delim . ' ?' . $quote . '.*?\2(?:$|\n)/ms',
        // quote is the only group
        '/(?:^|\n)' . $quote . '.*?\1(?:$|\n)/ms',
    );

    foreach ($patterns as $pattern) {
        $matches = array();
        if (!preg_match_all($pattern, $this->sample, $matches)) {
            continue;
        }
        if (empty($matches['quoteChar']) || empty($matches['delim'])) {
            // Only the quote-only pattern matched: the delimiter is unknown.
            break;
        }

        $quotes = array_count_values($matches['quoteChar']);
        arsort($quotes);
        reset($quotes);

        $delims = array_count_values($matches['delim']);
        arsort($delims);
        reset($delims);

        return array((string) key($quotes), (string) key($delims));
    }

    throw new TasterException("quoteChar and delimiter cannot be determined", TasterException::ERR_QUOTE_AND_DELIM);
}
||
263 | |||
/**
 * Take a list of likely delimiter characters and find the one that occurs
 * the most consistent amount of times within the sample.
 *
 * Quoted strings are removed from the sample, the remainder is split into
 * lines on $eol, and for every non-blank line the number of occurrences of
 * each candidate character is recorded. Each candidate then gets a score of
 * (average occurrences per line) / (mode of occurrences per line), or 0 when
 * its mode is falsy. The candidate with the highest score is returned.
 *
 * NOTE(review): Utils::array_average() and Utils::array_mode() are defined
 * elsewhere; this method assumes both return arrays keyed by the same
 * candidate characters as the array passed in -- confirm.
 *
 * @param string $eol The character(s) used for newlines
 * @return string The character judged most likely to be the delimiter
 * @throws \CSVelte\Exception\TasterException With code ERR_DELIMITER when no
 *     candidate could be scored (e.g. every line is blank)
 * @access protected
 * @todo Refactor this method--It needs more thorough testing against a wider
 *     variety of CSV data to be sure it works reliably. This is essentially
 *     a first draft.
 * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings
 */
protected function lickDelimiter($eol = "\n")
{
    $delimiters = array(",", "\t", "|", ":", ";", "/", '\\');
    $lines = explode($eol, $this->removeQuotedStrings($this->sample));

    // $charFrequency[char][line index] = occurrences of char on that line.
    // A single pass is enough: the tally for a line never changes.
    $charFrequency = array();
    foreach ($lines as $key => $line) {
        if (!trim($line)) {
            continue;
        }
        foreach ($delimiters as $char) {
            $charFrequency[$char][$key] = substr_count($line, $char);
        }
    }

    $averages = Utils::array_average($charFrequency);
    $modes = Utils::array_mode($charFrequency);

    $consistencies = array();
    foreach ($averages as $char => $avg) {
        if (!array_key_exists($char, $modes)) {
            continue;
        }
        $mode = $modes[$char];
        // A falsy mode (typically 0: most lines lack the character) scores 0
        // and also avoids a division by zero.
        $consistencies[$char] = $mode ? $avg / $mode : 0;
    }

    if (empty($consistencies)) {
        throw new TasterException('Cannot determine delimiter character', TasterException::ERR_DELIMITER);
    }

    arsort($consistencies);
    reset($consistencies);

    return key($consistencies);
}
||
315 | |||
/**
 * Determine the "style" of data quoting.
 *
 * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),
 * doesn't necessarily always conform to it, and it doesn't provide metadata
 * such as which types of data are quoted. So this method makes a logical
 * guess by finding which columns have been quoted (if any) and examining the
 * kind of data they hold:
 *
 *  - every column quoted                      => Flavor::QUOTE_ALL
 *  - no column quoted                         => Flavor::QUOTE_NONE
 *  - quoted columns hold only "special" data  => Flavor::QUOTE_MINIMAL
 *  - quoted columns hold only nonnumeric data => Flavor::QUOTE_NONNUMERIC
 *  - a mix that includes nonnumeric data      => Flavor::QUOTE_NONNUMERIC,
 *    unless the most and least frequent kinds occur equally often
 *  - anything else                            => Flavor::QUOTE_MINIMAL
 *
 * "Kind of data" is whatever lickDataType() reports for the unquoted value.
 *
 * @param string $delim The character used as the column delimiter
 * @param string $eol The character(s) used for newlines
 * @return string One of the Flavor::QUOTE_* constants listed above
 * @access protected
 * @todo Refactor this method--It needs more thorough testing against a wider
 *     variety of CSV data to be sure it works reliably. This is essentially
 *     a first draft.
 */
protected function lickQuotingStyle($delim, $eol)
{
    // Mask newlines/delimiters inside quotes so explode() can be used safely.
    $data = $this->replaceQuotedSpecialChars($this->sample, $delim);

    $sawQuoted = false;
    $sawUnquoted = false;
    $quotedTypes = array();

    foreach (explode($eol, $data) as $line) {
        // now we can sub back in the correct newlines
        $line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line);
        foreach (explode($delim, $line) as $col) {
            // now we can sub back in the correct delim characters
            $col = str_replace(self::PLACEHOLDER_DELIM, $delim, $col);
            if ($this->isQuoted($col)) {
                $sawQuoted = true;
                $quotedTypes[] = $this->lickDataType($this->unQuote($col));
            } else {
                $sawUnquoted = true;
            }
        }
    }

    if (!$sawUnquoted) {
        return Flavor::QUOTE_ALL;
    }
    if (!$sawQuoted) {
        return Flavor::QUOTE_NONE;
    }

    $types = array_unique($quotedTypes);
    if (count($types) == 1) {
        $onlyType = reset($types);
        if ($onlyType == self::DATA_SPECIAL) {
            return Flavor::QUOTE_MINIMAL;
        }
        if ($onlyType == self::DATA_NONNUMERIC) {
            return Flavor::QUOTE_NONNUMERIC;
        }
    } elseif (in_array(self::DATA_NONNUMERIC, $types, true)) {
        // allow for a SMALL amount of error here
        $counts = array(self::DATA_SPECIAL => 0, self::DATA_NONNUMERIC => 0);
        foreach ($quotedTypes as $type) {
            // DATA_UNKNOWN (or any other kind) is not pre-seeded above
            if (!isset($counts[$type])) {
                $counts[$type] = 0;
            }
            $counts[$type]++;
        }
        arsort($counts);
        $most = reset($counts);
        $least = end($counts);
        // $most is at least 1 here because DATA_NONNUMERIC occurred
        if ($least / $most < 1) {
            return Flavor::QUOTE_NONNUMERIC;
        }
    }

    return Flavor::QUOTE_MINIMAL;
}
||
400 | |||
/**
 * Remove quotes around a piece of text (if there are any).
 *
 * Only a matching pair of single or double quotes at the very edges of the
 * string is removed; quotes inside the value are left alone. The "s"
 * modifier lets the value span several lines, so quoted values that contain
 * line breaks are unquoted as well.
 *
 * @param string $data The data to "unquote"
 * @return string The data passed in, only with quotes stripped (off the edges)
 * @access protected
 */
protected function unQuote($data)
{
    return preg_replace('/^(["\'])(.*)\1$/s', '\2', $data);
}
||
412 | |||
/**
 * Determine whether a particular string of data has quotes around it.
 *
 * The string counts as quoted when it starts with a single or double quote
 * and ends with that same quote character. Anything, including line breaks
 * and further quote characters, may appear in between.
 *
 * @param string $data The data to check
 * @return boolean Whether the data is quoted or not
 * @access protected
 */
protected function isQuoted($data)
{
    // Note: a back-reference cannot be used inside a character class (there
    // "\1" means the character with octal code 1), hence the plain ".*".
    return preg_match('/^([\'"]).*\1$/s', $data) === 1;
}
||
424 | |||
/**
 * Classify a value for the purposes of quoting-style detection.
 *
 * Possible results, checked in this order:
 *  - special    - contains a character that could potentially need to be
 *                 quoted: single or double quote, comma, tab, pipe, colon,
 *                 semicolon or hyphen
 *  - nonnumeric - contains at least one character that is not a digit 0-9
 *  - unknown    - everything else (digits only, or an empty string)
 *
 * This method is really only used within the "lickQuotingStyle" method to
 * help determine whether a particular column has been quoted due to it being
 * nonnumeric or because it has some special character in it such as a
 * delimiter or quote.
 *
 * @param string $data The data to determine the type of
 * @return string The type of data (one of the "DATA_" class constants)
 * @access protected
 * @todo Make this check for only the quote and delim that are actually being
 *     used; that will make the guess more accurate
 * @todo I could probably eliminate this method and use an anonymous function
 *     instead. It isn't used anywhere else and its name could be misleading,
 *     especially since there is also a lickType method (used by lickHeader).
 */
protected function lickDataType($data)
{
    $hasSpecialChar = preg_match('/[\'",\t\|:;-]/', $data);
    if ($hasSpecialChar) {
        return self::DATA_SPECIAL;
    }

    $hasNonDigit = preg_match('/[^0-9]/', $data);
    if ($hasNonDigit) {
        return self::DATA_NONNUMERIC;
    }

    return self::DATA_UNKNOWN;
}
||
455 | |||
/**
 * Mask newlines and delimiters that appear inside quoted text.
 *
 * Within every single- or double-quoted stretch of $data, each carriage
 * return or line feed is replaced by self::PLACEHOLDER_NEWLINE and each
 * occurrence of $delim by self::PLACEHOLDER_DELIM. Text outside quotes is
 * left untouched. This makes it safe to use the very unsmart explode()
 * function afterwards without it splitting on delimiters or newlines within
 * quotes. Callers sub the real characters back in with str_replace() once
 * they have exploded; there is no dedicated method for that.
 *
 * @param string $data The string to do the replacements on
 * @param string $delim The delimiter character to replace
 * @return string The data with replacements performed
 * @access protected
 * @todo I could probably pass in (maybe optionally) the newline character I
 *     want to replace as well. I'll do that if I need to.
 */
protected function replaceQuotedSpecialChars($data, $delim)
{
    $maskSpecials = function ($match) use ($delim) {
        // $match[0] is the whole quoted string, quotes included.
        // Newlines are masked before delimiters, in that order.
        $masked = preg_replace("/([\r\n])/", self::PLACEHOLDER_NEWLINE, $match[0]);

        return str_replace($delim, self::PLACEHOLDER_DELIM, $masked);
    };

    return preg_replace_callback('/([\'"])(.*)\1/imsU', $maskSpecials, $data);
}
||
480 | |||
/**
 * Determine the "type" of a particular string of data. Used by the lickHeader
 * method to assign a type to each column, to try to determine whether the
 * first row is different from an otherwise consistent column type.
 *
 * Checks are made in this order and the first hit wins:
 *  - TYPE_NUMBER   optionally signed integer, e.g. +-987
 *  - TYPE_DOUBLE   optionally signed decimal, e.g. +-12.387
 *  - TYPE_CURRENCY optionally signed amount led by one of the symbols
 *                  listed in the pattern below, e.g. $98.96
 *  - TYPE_ALPHA    letters a-z / A-Z only
 *  - TYPE_TIME, TYPE_DATE, 'datetime' -- only when Carbon can parse the value
 *                  AND it matches the corresponding pattern below
 *  - TYPE_ALNUM    "word" characters only (letters, digits, underscore)
 *  - TYPE_STRING   everything else
 *
 * NOTE(review): this class declares no TYPE_DATETIME constant, so the literal
 * string 'datetime' is returned for date+time values. Consider adding the
 * constant next to the other TYPE_ constants.
 *
 * @todo This is expensive: lickHeader applies these regexes (potentially) to
 *     every column. A much simpler type check may be needed if this proves
 *     too expensive to be practical.
 *
 * @param string $data The string of data to check the type of
 * @return string One of the TYPE_ string constants, or 'datetime'
 * @access protected
 * @uses \Carbon\Carbon date/time library/class
 */
protected function lickType($data)
{
    if (preg_match('/^[+-]?\d+$/', $data)) {
        return self::TYPE_NUMBER;
    }
    if (preg_match('/^[+-]?(?:\d+\.\d*|\.\d+)$/', $data)) {
        return self::TYPE_DOUBLE;
    }
    // "u" modifier: the currency symbols are multi-byte in UTF-8 and must be
    // matched as whole characters. The decimal part is optional.
    if (preg_match('/^[+-]?[¥£€$]\d+(\.\d+)?$/u', $data)) {
        return self::TYPE_CURRENCY;
    }
    if (preg_match('/^[a-zA-Z]+$/', $data)) {
        return self::TYPE_ALPHA;
    }

    try {
        // Used purely as a gate: Carbon throws when it cannot make sense of
        // the value, which skips the date/time patterns below.
        Carbon::parse($data);

        $year = '([01][0-9])?[0-9]{2}';
        $month = '([01]?[0-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
        $day = '[0-3]?[0-9]';
        $sep = '[\/\.\-]?';
        $time = '([0-2]?[0-9](:[0-5][0-9]){1,2}(am|pm)?|[01]?[0-9](am|pm))';
        $date = '(' . $month . $sep . $day . $sep . $year . '|' . $day . $sep . $month . $sep . $year . '|' . $year . $sep . $month . $sep . $day . ')';

        if (preg_match("/^{$time}$/i", $data)) {
            return self::TYPE_TIME;
        }
        if (preg_match("/^{$date}$/i", $data)) {
            return self::TYPE_DATE;
        }
        if (preg_match("/^{$date} {$time}$/i", $data)) {
            return 'datetime';
        }
    } catch (\Exception $e) {
        // Not a date or time; go on checking the remaining types.
    }

    if (preg_match('/^\w+$/', $data)) {
        return self::TYPE_ALNUM;
    }

    return self::TYPE_STRING;
}
||
536 | |||
/**
 * Guess whether the sample's first row is a header row.
 *
 * Every column of every row is profiled with its data type (see lickType)
 * and its string length. The first row is then set aside and each remaining
 * row is compared with it, column by column:
 *
 *  - when the first row's column is TYPE_STRING, lengths are compared
 *  - otherwise, types are compared
 *
 * Each comparison that differs adds one to a running score and each one that
 * agrees subtracts one. Columns that the first row does not have are
 * skipped. A positive final score means the first row looks unlike the rest
 * and is reported as a header. This is only a guess: the CSV format provides
 * no metadata that would settle the question.
 *
 * @param string $delim The CSV data's delimiting char (typically a comma or
 *     a tab, sometimes a pipe)
 * @param string $eol The CSV data's end-of-line char(s) (\n, \r or \r\n)
 * @return boolean True if the data (most likely) contains a header row
 * @access public
 * @todo This method needs a total refactor. One loop would do, and would
 *     allow stopping as soon as the score reaches a particular threshold
 *     (+-100 for instance).
 * @todo Also, stop profiling after a certain (perhaps configurable) number
 *     of lines; this is an expensive method.
 * @todo Because the header isn't actually part of the "flavor", the need for
 *     delim and eol could be removed by "licking" the sample here, or by
 *     using a Reader object to read the data.
 */
public function lickHeader($delim, $eol)
{
    // Mask newlines/delimiters inside quotes so explode() can be used safely.
    $masked = $this->replaceQuotedSpecialChars($this->sample, $delim);

    $profile = array();
    foreach (explode($eol, $masked) as $rowIndex => $row) {
        // now we can sub back in the correct newlines
        $row = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $row);
        foreach (explode($delim, $row) as $colIndex => $cell) {
            // now we can sub back in the correct delim characters
            $cell = str_replace(self::PLACEHOLDER_DELIM, $delim, $cell);
            $profile[$rowIndex][$colIndex] = array(
                'type' => $this->lickType($this->unQuote($cell)),
                'length' => strlen($cell),
            );
        }
    }

    $firstRow = array_shift($profile);
    $score = 0;
    foreach ($profile as $cells) {
        foreach ($cells as $colIndex => $info) {
            if (!array_key_exists($colIndex, $firstRow)) {
                continue;
            }
            $head = $firstRow[$colIndex];
            if ($head['type'] == self::TYPE_STRING) {
                // free-form text: fall back to comparing lengths
                $differs = ($info['length'] != $head['length']);
            } else {
                $differs = ($info['type'] != $head['type']);
            }
            $score += $differs ? 1 : -1;
        }
    }

    return $score > 0;
}
||
600 | } |
||
601 |
Sometimes obsolete code just ends up commented out instead of removed. In this case it is better to remove the code once you have checked you do not need it.
The code might also have been commented out for debugging purposes. In this case it is vital that someone uncomments it again or your project may behave in very unexpected ways in production.
This check looks for comments that seem to be mostly valid code and reports them.