Completed
Push — master ( 5c1aea...9ef1c4 )
by Luke
03:03
created

Taster::lick()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 18
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 3

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 18
ccs 11
cts 11
cp 1
rs 9.4285
cc 3
eloc 12
nc 3
nop 0
crap 3
1
<?php
2
/**
3
 * CSVelte: Slender, elegant CSV for PHP
4
 * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV
5
 * standardization efforts, CSVelte was written in an effort to take all the
6
 * suck out of working with CSV.
7
 *
8
 * @version   v0.2.1
9
 * @copyright Copyright (c) 2016 Luke Visinoni <[email protected]>
10
 * @author    Luke Visinoni <[email protected]>
11
 * @license   https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT)
12
 */
13
namespace CSVelte;
14
15
use \DateTime;
16
use CSVelte\Contract\Streamable;
17
18
use \Exception;
19
use \OutOfBoundsException;
20
use CSVelte\Exception\TasterException;
21
22
use function CSVelte\collect;
23
24
/**
25
 * CSVelte\Taster
26
 * Given CSV data, Taster will "taste" the data and provide its buest guess at
27
 * its "flavor". In other words, this class inspects CSV data and attempts to
28
 * auto-detect various CSV attributes such as line endings, quote characters, etc..
29
 *
30
 * @package   CSVelte
31
 * @copyright (c) 2016, Luke Visinoni <[email protected]>
32
 * @author    Luke Visinoni <[email protected]>
33
 * @todo      There are a ton of improvements that could be made to this class.
34
 *            I'll do a refactor on this fella once I get at least one test
35
 *            passing for each of its public methods.
36
 * @todo      Should I have a lickEscapeChar method? The python version doesn't
37
 *            have one. But then why does it even bother including one in its
38
 *            flavor class?
39
 * @todo      Examine each of the public methods in this class and determine
40
 *            whether it makes sense to ask for the data as a param rather than
41
 *            just pulling it from source. I don't think it makes sense... it
42
 *            was just easier to write the methods that way during testing.
43
 * @todo      There are at least portions of this class that could use the
44
 *            Reader class rather than working directly with data.
45
 * @todo      Refactor all of the anonymous functions used as callbacks. Rather
46
 *            than passing $this all over, use $closure->bindTo() instead...
47
 *            Actually, write a method called getBoundClosure() or something...
48
 *            maybe even make it a trait I don't know yet. But here it would
49
 *            allow me to bind any anon function to $this and give me a certain
50
 *            set of commonly needed values ($delim, $eol, etc.)
51
 */
52
class Taster
53
{
54
    /**
55
     * End-of-line constants
56
     */
57
    const EOL_UNIX    = 'lf';
58
    const EOL_TRS80   = 'cr';
59
    const EOL_WINDOWS = 'crlf';
60
61
    /**
62
     * ASCII character codes for "invisibles"
63
     */
64
    const HORIZONTAL_TAB = 9;
65
    const LINE_FEED = 10;
66
    const CARRIAGE_RETURN = 13;
67
    const SPACE = 32;
68
69
    /**
70
     * Data types -- Used within the lickQuotingStyle method
71
     */
72
    const DATA_NONNUMERIC = 'nonnumeric';
73
    const DATA_SPECIAL = 'special';
74
    const DATA_UNKNOWN = 'unknown';
75
76
    /**
77
     * Placeholder strings -- hold the place of newlines and delimiters contained
78
     * within quoted text so that the explode method doesn't split incorrectly
79
     */
80
    const PLACEHOLDER_NEWLINE = '[__NEWLINE__]';
81
    const PLACEHOLDER_DELIM = '[__DELIM__]';
82
83
    /**
84
     * Recommended data sample size
85
     */
86
    const SAMPLE_SIZE = 2500;
87
88
    /**
89
     * Column data types -- used within the lickHeader method to determine
90
     * whether the first row contains different types of data than the rest of
91
     * the rows (and thus, is likely a header row)
92
     */
93
    // +-987
94
    const TYPE_NUMBER = 'number';
95
    // +-12.387
96
    const TYPE_DOUBLE = 'double';
97
    // I am a string. I can contain all kinds of stuff.
98
    const TYPE_STRING = 'string';
99
    // 2010-04-23 04:23:00
100
    const TYPE_DATETIME = 'datetime';
101
    // 10-Jul-15, 9/1/2007, April 1st, 2006, etc.
102
    const TYPE_DATE = 'date';
103
    // 10:00pm, 5pm, 13:08, etc.
104
    const TYPE_TIME = 'time';
105
    // $98.96, ¥12389, £6.08, €87.00
106
    const TYPE_CURRENCY = 'currency';
107
    // 12ab44m1n2_asdf
108
    const TYPE_ALNUM = 'alnum';
109
    // abababab
110
    const TYPE_ALPHA = 'alpha';
111
112
    /** @var Contract\Streamable The source of data to examine */
113
    protected $input;
114
115
    /** @var string Sample of CSV data to use for tasting (determining CSV flavor) */
116
    protected $sample;
117
118
    /** @var array Possible delimiter characters in (roughly) the order of likelihood */
119
    protected $delims = [",", "\t", ";", "|", ":", "-", "_", "#", "/", '\\', '$', '+', '=', '&', '@'];
120
121
    /**
122
     * Class constructor--accepts a CSV input source
123
     *
124
     * @param Contract\Streamable The source of CSV data
125
     * @throws TasterException
126
     * @todo It may be a good idea to skip the first line or two for the sample
127
     *     so that the header line(s) don't throw things off (with the exception
128
     *     of lickHeader() obviously)
129
     */
130 22
    public function __construct(Streamable $input)
    {
        // Keep the stream itself as well as the sample read from it.
        $this->input = $input;

        // Read a fixed-size sample up front; it is stored on the instance for
        // the lick*() methods to examine.
        $this->sample = $input->read(self::SAMPLE_SIZE);

        // Reject an unreadable or empty source. The string "0" is falsy in PHP
        // but is a perfectly valid one-byte sample, so it must not be rejected.
        if (!$this->sample && $this->sample !== '0') {
            throw new TasterException("Invalid input, cannot read sample.", TasterException::ERR_INVALID_SAMPLE);
        }
    }
137
138
    /**
139
     * "Invoke" magic method.
140
     *
141
     * Called when an object is invoked as if it were a function. So, for instance,
142
     * This is simply an alias to the lick method.
143
     *
144
     * @return Flavor A flavor object
145
     * @throws TasterException
146
     */
147 20
    public function __invoke()
    {
        // Invoking the object as a function is shorthand for lick(); the
        // result (and any exception) is passed straight through.
        $flavor = $this->lick();

        return $flavor;
    }
151
152
    /**
153
     * Examine the input source and determine what "Flavor" of CSV it contains.
154
     * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),
155
     * doesn't necessarily always conform to it. And it doesn't provide meta such as the delimiting character, quote character, or what types of data are quoted.
156
     * such as the delimiting character, quote character, or what types of data are quoted.
157
     * are quoted.
158
     *
159
     * @return Flavor The metadata that the CSV format doesn't provide
160
     * @throws TasterException
161
     * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails
162
     * @todo Should there bea lickEscapeChar method? the python module that inspired
163
     *     this library doesn't include one...
164
     * @todo This should cache the results and only regenerate if $this->sample
165
     *     changes (or $this->input)
166
     */
167 21
    public function lick()
    {
        // Line endings come first: the fallback delimiter detection and the
        // quoting-style/header detection all need to split the sample by line.
        $lineTerminator = $this->lickLineEndings();

        try {
            // Preferred route: infer quote and delimiter together from quoted fields.
            $detected = $this->lickQuoteAndDelim();
            $quoteChar = $detected[0];
            $delimiter = $detected[1];
        } catch (TasterException $tasteError) {
            // Only the "could not determine quote and delimiter" failure is
            // recoverable here; anything else is passed on to the caller.
            if ($tasteError->getCode() !== TasterException::ERR_QUOTE_AND_DELIM) {
                throw $tasteError;
            }
            // Fall back to a double quote and a frequency-based delimiter guess.
            $quoteChar = '"';
            $delimiter = $this->lickDelimiter($lineTerminator);
        }

        $quoteStyle = $this->lickQuotingStyle($delimiter, $lineTerminator);
        $header = $this->lickHeader($delimiter, $lineTerminator);

        return new Flavor([
            'quoteChar'      => $quoteChar,
            /**
             * @todo Should this be null? Because doubleQuote = true means this = null
             */
            'escapeChar'     => '\\',
            'delimiter'      => $delimiter,
            'lineTerminator' => $lineTerminator,
            'quoteStyle'     => $quoteStyle,
            'header'         => $header,
        ]);
    }
185
186
    /**
187
     * Replaces all quoted columns with a blank string. I was using this method
188
     * to prevent explode() from incorrectly splitting at delimiters and newlines
189
     * within quotes when parsing a file. But this was before I wrote the
190
     * replaceQuotedSpecialChars method which (at least to me) makes more sense.
191
     *
192
     * @param string $data The string to replace quoted strings within
193
     * @return string The input string with quoted strings removed
194
     * @todo Replace code that uses this method with the replaceQuotedSpecialChars
195
     *     method instead. I think it's cleaner.
196
     */
197 21
    protected function removeQuotedStrings($data)
    {
        // An opening single or double quote, then (lazily) any run of
        // characters -- each optionally preceded by a backslash -- up to the
        // next occurrence of that same quote character.
        $quotedSpan = '/(["\'])(?:(?=(\\\\?))\2.)*?\1/sm';

        // Every quoted span, quotes included, is replaced with nothing.
        return preg_replace($quotedSpan, '', $data);
    }
201
202
    /**
203
     * Examine the input source to determine which character(s) are being used
204
     * as the end-of-line character
205
     *
206
     * @return string The end-of-line char for the input data
207
     * @credit pulled from stackoverflow thread *tips hat to username "Harm"*
208
     * @todo This should throw an exception if it cannot determine the line ending
209
     * @todo I probably will make this method protected when I'm done with testing...
210
     * @todo If there is any way for this method to fail (for instance if a file )
211
     *       is totally empty or contains no line breaks), then it needs to throw
212
     *       a relevant TasterException
213
     * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings()
214
     */
215 21
    protected function lickLineEndings()
    {
        // Line breaks inside quoted fields are data, not row terminators, so
        // quoted text is stripped before counting.
        $unquoted = $this->removeQuotedStrings($this->sample);

        // Candidates in the order they are tried:
        //   "\r\n"  0x0D 0x0A  Windows, DOS, OS/2
        //   "\n"    0x0A       Unix, OSX
        //   "\r"    0x0D       Apple ][, TRS80
        // A later candidate only wins when it occurs strictly more often than
        // the best one seen so far, so ties keep the earlier candidate.
        $candidates = ["\r\n", "\n", "\r"];

        // @todo This should return a default maybe?
        $winner = PHP_EOL;
        $highest = 0;
        foreach ($candidates as $candidate) {
            $occurrences = substr_count($unquoted, $candidate);
            if ($occurrences <= $highest) {
                continue;
            }
            $highest = $occurrences;
            $winner = $candidate;
        }

        return $winner;
    }
235
236
    /**
237
     * The best way to determine quote and delimiter characters is when columns
238
     * are quoted, often you can seek out a pattern of delim, quote, stuff, quote, delim
239
     * but this only works if you have quoted columns. If you don't you have to
240
     * determine these characters some other way... (see lickDelimiter)
241
     *
242
     * @return array A two-row array containing quotechar, delimchar
243
     * @throws TasterException
244
     * @todo make protected
245
     * @todo This should throw an exception if it cannot determine the delimiter
246
     *     this way.
247
     * @todo This should check for any line endings not just \n
248
     */
249 21
    protected function lickQuoteAndDelim()
    {
        // A delimiter can be anything but line breaks, word characters
        // (alphanumerics and underscore), quotes, or a space.
        $antidelims = implode(array("\r", "\n", "\w", preg_quote('"', '/'), preg_quote("'", '/'), preg_quote(chr(self::SPACE), '/')));
        $delim = '(?P<delim>[^' . $antidelims . '])';
        // @todo I think MS Excel uses some strange encoding for fancy open/close quotes
        $quote = '(?P<quoteChar>"|\'|`)';

        // Patterns are tried in order and the first one that matches wins.
        //
        // A fourth, quote-only pattern ('something' alone on a line) used to be
        // listed here. It back-referenced a second capture group that the
        // pattern did not define, so it never compiled and the resulting
        // warning was hidden with "@". A quote-only match cannot supply a
        // delimiter anyway, so it has been dropped along with the suppression.
        $patterns = [
            // ,"something", -- delimiter, optional space, quote, anything,
            // same quote, same delimiter
            '/' . $delim . ' ?' . $quote . '.*?\2\1/ms',
            // 'something', -- start of line or line break, quote, anything,
            // same quote, delimiter, optional space
            '/(?:^|\n)' . $quote . '.*?\1' . $delim . ' ?/ms',
            // ,'something' -- delimiter, optional space, quote, anything, same
            // quote, then end of line. The end anchor was "^", which can never
            // match directly after a quote character; "$" also lets the very
            // last field of the sample match.
            '/' . $delim . ' ?' . $quote . '.*?\2(?:$|\n)/ms',
        ];

        /**
         * @var array|null Result of the last preg_match_all() call
         */
        $matches = null;
        foreach ($patterns as $pattern) {
            if (preg_match_all($pattern, $this->sample, $matches)) {
                break;
            }
        }

        if ($matches) {
            try {
                // For both named groups, take the most frequently captured value.
                return [
                    collect($matches)
                        ->frequency()
                        ->get('quoteChar')
                        ->sort()
                        ->reverse()
                        ->getKeyAtPosition(0),
                    collect($matches)
                        ->frequency()
                        ->get('delim')
                        ->sort()
                        ->reverse()
                        ->getKeyAtPosition(0)
                ];
            } catch (OutOfBoundsException $e) {
                // eat this exception and let the taster exception below be thrown instead...
            }
        }

        throw new TasterException("quoteChar and delimiter cannot be determined", TasterException::ERR_QUOTE_AND_DELIM);
    }
294
295
     /**
296
      * Take a list of likely delimiter characters and find the one that occurs
297
      * the most consistent amount of times within the provided data.
298
      *
299
      * @param string $eol The character(s) used for newlines
300
      * @return string One of four Flavor::QUOTING_* constants
301
      * @see Flavor for possible quote style constants
302
      * @todo Refactor this method--It needs more thorough testing against a wider
303
      *     variety of CSV data to be sure it works reliably. And I'm sure there
304
      *     are many performance and logic improvements that could be made. This
305
      *     is essentially a first draft.
306
      * @todo Can't use replaceQuotedSpecialChars rather than removeQuotedStrings
307
      *     because the former requires u to know the delimiter
308
      */
309 6
    protected function lickDelimiter($eol = "\n")
    {
        $frequencies = [];
        $consistencies = [];

        // Build a table of candidate characters and their frequencies for each
        // line. We will use this frequency table to then build a table of
        // frequencies of each frequency (in 10 lines, "tab" occurred 5 times on
        // 7 of those lines, 6 times on 2 lines, and 7 times on 1 line).
        // Quoted text is stripped first so delimiters inside quotes don't count.
        collect(explode($eol, $this->removeQuotedStrings($this->sample)))
            ->walk(function($line, $line_no) use (&$frequencies) {
                collect(str_split($line))
                    // only characters from the candidate list are considered
                    ->filter(function($c) { return collect($this->delims)->contains($c); })
                    ->frequency()
                    ->sort()
                    ->reverse()
                    ->walk(function($count, $char) use (&$frequencies, $line_no) {
                        $frequencies[$char][$line_no] = $count;
                    });
            })
            // the above only finds frequencies for characters if they exist in
            // a given line. This will go back and fill in zeroes where a char
            // didn't occur at all in a given line (needed to determine mode)
            ->walk(function($line, $line_no) use (&$frequencies) {
                collect($frequencies)
                    ->walk(function($counts, $char) use ($line_no, &$frequencies) {
                        if (!isset($frequencies[$char][$line_no])) {
                            $frequencies[$char][$line_no] = 0;
                        }
                    });
            });

        // now determine the mode for each char to decide the "expected" amount
        // of times a char (possible delim) will occur on each line...
        $freqs = collect($frequencies);
        $modes = $freqs->mode();
        // ...and count, per character, on how many lines it hit that expected
        // amount. A mode of zero is falsy and therefore never counted.
        $freqs->walk(function($f, $chr) use ($modes, &$consistencies) {
            collect($f)->walk(function($num) use ($modes, $chr, &$consistencies) {
                if ($expected = $modes->get($chr)) {
                    if ($num == $expected) {
                        // met the goal, yay!
                        if (!isset($consistencies[$chr])) {
                            $consistencies[$chr] = 0;
                        }
                        $consistencies[$chr]++;
                    }
                }
            });
        });

        $delims = collect($consistencies);
        $max = $delims->max();
        $dups = $delims->duplicates();
        if ($dups->has($max, false)) {
            // More than one candidate shares the top score. Dates have a
            // consistent format (e.g. 04-23-1986) and so may easily provide a
            // false positive for delimiter, but the dash will be focused in
            // one area of the line whereas a real delimiter is spread out.
            // guessDelimByDistribution() breaks the tie on that basis.
            $decision = $dups->get($max);
            try {
                return $this->guessDelimByDistribution($decision, $eol);
            } catch (TasterException $e) {
                // if somehow we STILL can't come to a consensus, then fall back
                // to the "preferred delimiters" list: the first preferred
                // character that is among the tied candidates wins. This used
                // to return the array_search() index instead of the character,
                // and skipped the candidate at index 0 because 0 is falsy.
                foreach ($this->delims as $preferred) {
                    if (in_array($preferred, $decision, true)) {
                        return $preferred;
                    }
                }
            }
        }

        // NOTE(review): lickQuoteAndDelim() uses sort()->reverse() to put the
        // highest count first; here there is no reverse(). Confirm against the
        // collection's sort order that this picks the most consistent character.
        return $delims
            ->sort()
            ->getKeyAtPosition(0);
    }
400
401
    /**
402
     * Compare positional consistency of several characters to determine the
403
     * probable delimiter character. The idea behind this is that the delimiter
404
     * character is likely more consistently distributed than false-positive
405
     * delimiter characters produced by lickDelimiter(). For instance, consider
406
     * a series of rows similar to the following:
407
     *
408
     * 1,luke,visinoni,[email protected],(530) 413-3076,04-23-1986
409
     *
410
     * The lickDelimiter() method will often not be able to determine whether the
411
     * delimiter is a comma or a dash because they occur the same number of times
412
     * on just about every line (5 for comma, 3 for dash). The difference is
413
     * obvious to you, no doubt. But us humans are pattern-recognition machines!
414
     * The difference between the comma and the dash are that the comma is dist-
415
     * ributed almost evenly throughout the line. The dash characters occur
416
     * entirely at the end of the line. This method accepts any number of possible
417
     * delimiter characters and returns the one that is distributed
418
     *
419
     * If delim character cannot be determined by lickQuoteAndDelim(), taster
420
     * tries lickDelimiter(). When that method runs into a tie, it will use this
421
     * as a tie-breaker.
422
     *
423
     * @param array $delims Possible delimiter characters (method chooses from
424
     *     this array of characters)
425
     * @param string $eol The end-of-line character (or set of characters)
426
     * @return string The probable delimiter character
427
     * @throws TasterException
428
     */
429
    protected function guessDelimByDistribution(array $delims, $eol = "\n")
    {
        try {
            // @todo Write a method that does this...
            // Split the sample into lines, ignoring anything inside quotes.
            $lines = collect(explode($eol, $this->removeQuotedStrings($this->sample)));

            // For every candidate: cut each line into sections, work out what
            // share of the sections contain the candidate, and average that
            // share over all lines. The candidate with the highest average
            // wins; its key in $delims is used to look the character back up.
            return $delims[collect($delims)->map(function($delim) use ($lines) {
                $linedist = collect();
                $lines->walk(function($line, $line_no) use (&$linedist, $delim) {
                    // empty lines say nothing about distribution
                    if (!strlen($line)) return;
                    $sectstot = 10;
                    // str_split() rejects a chunk length below 1, which is what
                    // a line shorter than $sectstot bytes used to produce. Clamp
                    // to 1 so short lines are split into single characters.
                    $sectlen = max(1, (int) (strlen($line) / $sectstot));
                    $sections = collect(str_split($line, $sectlen))
                        ->map(function($section) use($delim) {
                            return substr_count($section, $delim);
                        })
                        // keep only the sections that contain the candidate
                        ->filter(function($count) { return (bool) $count; });
                    if (is_numeric($count = $sections->count())) {
                        $linedist->set($line_no, $count / $sectstot);
                    }
                });
                return $linedist;
            })->map(function($dists) {
                return $dists->average();
            })->sort()
              ->reverse()
              ->getKeyAtPosition(0)];
        } catch (Exception $e) {
            // Any failure along the way is reported as "cannot decide".
            throw new TasterException("delimiter cannot be determined by distribution", TasterException::ERR_DELIMITER);
        }
    }
459
460
    /**
461
     * Determine the "style" of data quoting. The CSV format, while having an RFC
462
     * (https://tools.ietf.org/html/rfc4180), doesn't necessarily always conform
463
     * to it. And it doesn't provide metadata such as the delimiting character,
464
     * quote character, or what types of data are quoted. So this method makes a
465
     * logical guess by finding which columns have been quoted (if any) and
466
     * examining their data type. Most often, CSV files will only use quotes
467
     * around columns that contain special characters such as the dilimiter,
468
     * the quoting character, newlines, etc. (we refer to this style as )
469
     * QUOTE_MINIMAL), but some quote all columns that contain nonnumeric data
470
     * (QUOTE_NONNUMERIC). Then there are CSV files that quote all columns
471
     * (QUOTE_ALL) and those that quote none (QUOTE_NONE).
472
     *
473
     * @param string $delim The character used as the column delimiter
474
     * @param string $eol The character used for newlines
475
     * @return string One of four "QUOTING_" constants defined above--see this
476
     *     method's description for more info.
477
     * @todo Refactor this method--It needs more thorough testing against a wider
478
     *     variety of CSV data to be sure it works reliably. And I'm sure there
479
     *     are many performance and logic improvements that could be made. This
480
     *     is essentially a first draft.
481
     */
482 21
    protected function lickQuotingStyle($delim, $eol)
    {
        // Start with every style marked as possible; styles are switched off
        // as evidence against them turns up.
        $candidates = collect([
            Flavor::QUOTE_ALL => true,
            Flavor::QUOTE_NONE => true,
            Flavor::QUOTE_MINIMAL => true,
            Flavor::QUOTE_NONNUMERIC => true,
        ]);

        // Delimiters and newlines inside quotes are swapped for placeholders
        // so that plain explode() calls split rows and fields correctly.
        $rows = collect(explode($eol, $this->replaceQuotedSpecialChars($this->sample, $delim)));

        // Data types seen in quoted fields and in unquoted fields.
        $seen = collect()
            ->set('quoted', collect())
            ->set('unquoted', collect());

        // walk through each line from the data sample to determine which fields
        // are quoted and which aren't
        $rows->walk(function($row) use (&$candidates, &$seen, $eol, $delim) {
            // put the real newlines back before splitting into fields
            $row = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $row);

            collect(explode($delim, $row))
                ->walk(function($cell) use (&$candidates, &$seen, $delim) {
                    // put the real delimiters back inside this field
                    $cell = str_replace(self::PLACEHOLDER_DELIM, $delim, $cell);

                    if (!$this->isQuoted($cell)) {
                        $seen->get('unquoted')->push($this->lickDataType($cell));
                        // since we know there's at least one unquoted field,
                        // QUOTE_ALL can be ruled out
                        $candidates->set(Flavor::QUOTE_ALL, false);
                        return;
                    }

                    $cell = $this->unQuote($cell);
                    $seen->get('quoted')->push($this->lickDataType($cell));
                    // since we know there's at least one quoted field,
                    // QUOTE_NONE can be ruled out
                    $candidates->set(Flavor::QUOTE_NONE, false);
                });
        });

        $quotedTypes = $seen->get('quoted')->unique();

        // drop every style that has been ruled out
        $candidates = $candidates->filter(function($stillPossible) { return (bool) $stillPossible; });

        // if QUOTE_ALL or QUOTE_NONE survived, then return whichever of them
        // it is, we don't need to do anything else
        if ($candidates->has(Flavor::QUOTE_ALL)) {
            return Flavor::QUOTE_ALL;
        }
        if ($candidates->has(Flavor::QUOTE_NONE)) {
            return Flavor::QUOTE_NONE;
        }

        if (count($quotedTypes) == 1) {
            // every quoted field had the same data type
            $onlyType = $quotedTypes->getValueAtPosition(0);
            if ($candidates->has($onlyType)) {
                return $onlyType;
            }
            return Flavor::QUOTE_MINIMAL;
        }

        if (!$quotedTypes->contains(self::DATA_NONNUMERIC)) {
            return Flavor::QUOTE_MINIMAL;
        }

        // allow for a SMALL amount of error here
        $tally = collect([self::DATA_SPECIAL => 0, self::DATA_NONNUMERIC => 0]);
        $seen->get('quoted')->walk(function ($type) use (&$tally) {
            $tally->increment($type);
        });

        // @todo is all this even necessary? seems unnecessary to me...
        $largest = $tally->max();
        if ($largest) {
            $smallest = $tally->min();
            $ratio = $smallest / $largest;
            if ($ratio < 1) {
                return Flavor::QUOTE_NONNUMERIC;
            }
        }

        return Flavor::QUOTE_MINIMAL;
    }
549
550
    /**
551
     * Remove quotes around a piece of text (if there are any)
552
     *
553
     * @param string $data The data to "unquote"
554
     * @return string The data passed in, only with quotes stripped (off the edges)
555
     */
556 21
    protected function unQuote($data)
    {
        // Strip one pair of matching single or double quotes from the edges;
        // anything else is returned as it came in. The "s" modifier lets "."
        // span line breaks -- without it a quoted field containing a newline
        // was never unquoted, even though isQuoted() reports it as quoted.
        return preg_replace('/^(["\'])(.*)\1$/s', '\2', $data);
    }
560
561
    /**
562
     * Determine whether a particular string of data has quotes around it.
563
     *
564
     * @param string $data The data to check
565
     * @return boolean Whether the data is quoted or not
566
     */
567 21
    protected function isQuoted($data)
    {
        // Quoted means: starts with a single or double quote and ends with the
        // same one. The body used to be written "[^\1]*", but inside a
        // character class "\1" is the octal escape for byte 0x01 rather than a
        // back-reference, so that only excluded fields containing that byte.
        // ".*" with the "s" modifier accepts any content, line breaks included.
        // The result is cast so the method returns the boolean it documents.
        return (bool) preg_match('/^([\'"]).*\1$/s', $data);
    }
571
572
    /**
573
     * Determine what type of data is contained within a variable
574
     * Possible types:
575
     *     - nonnumeric - only numbers
576
     *     - special - contains characters that could potentially need to be quoted (possible delimiter characters)
577
     *     - unknown - everything else
578
     * This method is really only used within the "lickQuotingStyle" method to
579
     * help determine whether a particular column has been quoted due to it being
580
     * nonnumeric or because it has some special character in it such as a delimiter
581
     * or newline or quote.
582
     *
583
     * @param string $data The data to determine the type of
584
     * @return string The type of data (one of the "DATA_" constants above)
585
     * @todo I could probably eliminate this method and use an anonymous function
586
     *     instead. It isn't used anywhere else and its name could be misleading.
587
     *     Especially since I also have a lickType method that is used within the
588
     *     lickHeader method.
589
     */
590 21
    protected function lickDataType($data)
    {
        // @todo make this check for only the quote and delim that are actually being used
        // that will make the guess more accurate

        // Any quote or common delimiter character marks the value as "special".
        $hasSpecialChar = preg_match('/[\'",\t\|:;-]/', $data);
        if ($hasSpecialChar) {
            return self::DATA_SPECIAL;
        }

        // Otherwise a single non-digit character is enough for "nonnumeric".
        $hasNonDigit = preg_match('/[^0-9]/', $data);
        if ($hasNonDigit) {
            return self::DATA_NONNUMERIC;
        }

        // What is left is digits only, or an empty value.
        return self::DATA_UNKNOWN;
    }
601
602
    /**
603
     * Replace all instances of newlines and whatever character you specify (as
604
     * the delimiter) that are contained within quoted text. The replacements are
605
     * simply a special placeholder string. This is done so that I can use the
606
     * very unsmart "explode" function and not have to worry about it exploding
607
     * on delimiters or newlines within quotes. Once I have exploded, I typically
608
     * sub back in the real characters before doing anything else. Although
609
     * currently there is no dedicated method for doing so I just use str_replace
610
     *
611
     * @param string $data The string to do the replacements on
612
     * @param string $delim The delimiter character to replace
613
     * @return string The data with replacements performed
614
     * @todo I could probably pass in (maybe optionally) the newline character I
615
     *     want to replace as well. I'll do that if I need to.
616
     */
617 21
    protected function replaceQuotedSpecialChars($data, $delim)
    {
        // Applied to each quoted span (quotes included): every CR or LF becomes
        // the newline placeholder, then every delimiter becomes the delimiter
        // placeholder.
        $maskSpecials = function($quoted) use ($delim) {
            $masked = preg_replace("/([\r\n])/", self::PLACEHOLDER_NEWLINE, $quoted[0]);

            return str_replace($delim, self::PLACEHOLDER_DELIM, $masked);
        };

        // Ungreedy match of a quote, anything, and the same quote again.
        return preg_replace_callback('/([\'"])(.*)\1/imsU', $maskSpecials, $data);
    }
625
626
    /**
627
     * Determine the "type" of a particular string of data. Used for the lickHeader
628
     * method to assign a type to each column to try to determine whether the
629
     * first row is different than a consistent column type.
630
     *
631
     * @todo As I'm writing this method I'm beginning to realize how expensive
632
     * the lickHeader method is going to end up being since it has to apply all
633
     * these regexes (potentially) to every column. I may end up writing a much
634
     * simpler type-checking method than this if it proves to be too expensive
635
     * to be practical.
636
     *
637
     * @param string $data The string of data to check the type of
638
     * @return string One of the TYPE_ string constants above
639
     */
640 21
    protected function lickType($data)
    {
        // Any run of digits and dots (optionally signed) counts as a number.
        // Because this pattern already accepts decimals such as "1.5", the
        // separate TYPE_DOUBLE check that used to follow it could never match
        // and has been removed; decimals are reported as TYPE_NUMBER, exactly
        // as before.
        if (preg_match('/^[+-]?[\d\.]+$/', $data)) {
            return self::TYPE_NUMBER;
        }

        // Currency: optional sign, one symbol out of yen (U+00A5), pound
        // (U+00A3), euro (U+20AC) or dollar, digits, and an optional decimal
        // part. The "u" modifier is required: without it the multi-byte symbols
        // are compared byte-by-byte inside the character class and never match.
        // If $data is not valid UTF-8, preg_match() returns false here and we
        // simply fall through to the remaining checks.
        if (preg_match('/^[+-]?[\x{A5}\x{A3}\x{20AC}$]\d+(\.\d+)?$/u', $data)) {
            return self::TYPE_CURRENCY;
        }

        if (preg_match('/^[a-zA-Z]+$/', $data)) {
            return self::TYPE_ALPHA;
        }

        // Building blocks for the date / time patterns used below.
        $year = '([01][0-9])?[0-9]{2}';
        $month = '([01]?[0-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
        $day = '[0-3]?[0-9]';
        $sep = '[\/\.\-]?';
        $time = '([0-2]?[0-9](:[0-5][0-9]){1,2}(am|pm)?|[01]?[0-9](am|pm))';
        // month-day-year | day-month-year | year-month-day
        $date = '(' . $month . $sep . $day . $sep . $year
            . '|' . $day . $sep . $month . $sep . $year
            . '|' . $year . $sep . $month . $sep . $day . ')';

        try {
            // DateTime acts as a sanity filter: if it cannot parse the string
            // at all, the string is not treated as a date or time.
            $parsed = new DateTime($data);
            $parsed->setTime(0, 0, 0);
            $today = new DateTime();
            $today->setTime(0, 0, 0);
            // A string that resolves to today's date is most likely a bare time.
            $isToday = ((int) $parsed->diff($today)->format('%R%a')) === 0;
        } catch (\Exception $e) {
            // Not parseable as a date/time: go on checking remaining types.
            return preg_match('/^\w+$/', $data) ? self::TYPE_ALNUM : self::TYPE_STRING;
        }

        if ($isToday && preg_match("/^{$time}$/i", $data)) {
            return self::TYPE_TIME;
        }
        if (preg_match("/^{$date}$/i", $data)) {
            return self::TYPE_DATE;
        }
        // The subject argument was missing from this call originally, so
        // TYPE_DATETIME could never be returned (and PHP 8+ raises an
        // ArgumentCountError for the one-argument call).
        if (preg_match("/^{$date} {$time}$/i", $data)) {
            return self::TYPE_DATETIME;
        }

        return self::TYPE_STRING;
    }
684
685
    /**
686
     * Examines the contents of the CSV data to make a determination of whether
687
     * or not it contains a header row. To make this determination, it creates
688
     * an array of each column's (in each row)'s data type and length and then
689
     * compares them. If all of the rows except the header look similar, it will
690
     * return true. This is only a guess though. There is no programmatic way to
691
     * determine 100% whether a CSV file has a header. The format does not
692
     * provide metadata such as that.
693
     *
694
     * @param string $delim The CSV data's delimiting char (can be a variety of chars but
695
     *     typically is either a comma or a tab, sometimes a pipe)
696
     * @param string $eol The CSV data's end-of-line char(s) (\n \r or \r\n)
697
     * @return boolean True if the data (most likely) contains a header row
698
     * @todo This method needs a total refactor. It's not necessary to loop twice
699
     *     You could get away with one loop and that would allow for me to do
700
     *     something like only examining enough rows to get to a particular
701
     *     "hasHeader" score (+-100 for instance) & then just return true|false
702
     * @todo Also, break out of the first loop after a certain (perhaps even a
703
     *     configurable) amount of lines (you only need to examine so much data
704
     *     to reliably make a determination and this is an expensive method)
705
     * @todo I could remove the need for quote, delim, and eol by "licking" the
706
     *     data sample provided in the first argument. Also, I could actually
707
     *     create a Reader object to read the data here.
708
     */
709 22
    public function lickHeader($delim, $eol)
    {
        // Pass 1: build a table of line number => column position =>
        // ['type' => ..., 'length' => ...] for every field in the sample.
        $types = collect();
        $buildTypes = function ($line, $line_no) use ($types, $delim, $eol) {
            // Restore line breaks that were masked inside quoted text.
            $line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line);
            $getType = function ($field, $colpos) use ($types, $line_no, $delim) {
                // Restore delimiters that were masked inside quoted text.
                $field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field);
                // @todo Need a Collection::setTableField($x, $y) method
                //       See notes in green binder about refactoring Collection
                if (!$types->has($line_no)) {
                    $types->set($line_no, collect());
                }
                $types->get($line_no)->set($colpos, [
                    // The type is judged on the unquoted value; the length is
                    // the length of the field as it appears (quotes included).
                    'type'   => $this->lickType($this->unQuote($field)),
                    'length' => strlen($field),
                ]);
            };
            collect(explode($delim, $line))->walk($getType);
        };
        // Quoted delimiters / line breaks are masked first so that the plain
        // explode() calls cannot split inside quoted text.
        $maskedSample = $this->replaceQuotedSpecialChars($this->sample, $delim);
        collect(explode($eol, $maskedSample))->walk($buildTypes);

        // Pass 2: compare every remaining row against the first row. Each field
        // that differs from the first row's field in the same column votes for
        // "has header" (+1); each field that agrees votes against it (-1).
        $hasHeader = 0;
        $possibleHeader = $types->shift();
        $types->walk(function ($row) use (&$hasHeader, $possibleHeader) {
            $row->walk(function ($field_info, $col_no) use (&$hasHeader, $possibleHeader) {
                try {
                    // NOTE(review): the third argument presumably makes get()
                    // throw when the first row has no such column -- confirm
                    // against the Collection class.
                    $header = $possibleHeader->get($col_no, null, true);
                } catch (OutOfBoundsException $e) {
                    // No counterpart in the first row; this field casts no vote.
                    return;
                }

                if ($header['type'] === self::TYPE_STRING) {
                    // Free-form strings carry no type signal; compare lengths.
                    $differs = $field_info['length'] !== $header['length'];
                } else {
                    $differs = $field_info['type'] !== $header['type'];
                }
                $hasHeader += $differs ? 1 : -1;
            });
        });

        return $hasHeader > 0;
    }
757
}
758