Completed
Push — releases/v0.2.1 ( 18bcd0...74e7c5 )
by Luke
03:29
created

Taster::lickDelimiter()   C

Complexity

Conditions 9
Paths 4

Size

Total Lines 91
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 35
CRAP Score 9.3752

Importance

Changes 0
Metric Value
cc 9
eloc 39
c 0
b 0
f 0
nc 4
nop 1
dl 0
loc 91
ccs 35
cts 42
cp 0.8333
crap 9.3752
rs 5.1434

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/**
3
 * CSVelte: Slender, elegant CSV for PHP
4
 * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV
5
 * standardization efforts, CSVelte was written in an effort to take all the
6
 * suck out of working with CSV.
7
 *
8
 * @version   v0.2.1
9
 * @copyright Copyright (c) 2016 Luke Visinoni <[email protected]>
10
 * @author    Luke Visinoni <[email protected]>
11
 * @license   https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT)
12
 */
13
namespace CSVelte;
14
15
use \DateTime;
16
use CSVelte\Contract\Streamable;
17
18
use \Exception;
19
use \OutOfBoundsException;
20
use CSVelte\Exception\TasterException;
21
22
use function CSVelte\collect;
23
24
/**
25
 * CSVelte\Taster
26
 * Given CSV data, Taster will "taste" the data and provide its best guess at
27
 * its "flavor". In other words, this class inspects CSV data and attempts to
28
 * auto-detect various CSV attributes such as line endings, quote characters, etc..
29
 *
30
 * @package   CSVelte
31
 * @copyright (c) 2016, Luke Visinoni <[email protected]>
32
 * @author    Luke Visinoni <[email protected]>
33
 * @todo      There are a ton of improvements that could be made to this class.
34
 *            I'll do a refactor on this fella once I get at least one test
35
 *            passing for each of its public methods.
36
 * @todo      Should I have a lickEscapeChar method? The python version doesn't
37
 *            have one. But then why does it even bother including one in its
38
 *            flavor class?
39
 * @todo      Examine each of the public methods in this class and determine
40
 *            whether it makes sense to ask for the data as a param rather than
41
 *            just pulling it from source. I don't think it makes sense... it
42
 *            was just easier to write the methods that way during testing.
43
 * @todo      There are at least portions of this class that could use the
44
 *            Reader class rather than working directly with data.
45
 * @todo      Refactor all of the anonymous functions used as callbacks. Rather
46
 *            than passing $this all over, use $closure->bindTo() instead...
47
 *            Actually, write a method called getBoundClosure() or something...
48
 *            maybe even make it a trait I don't know yet. But here it would
49
 *            allow me to bind any anon function to $this and give me a certain
50
 *            set of commonly needed values ($delim, $eol, etc.)
51
 */
52
class Taster
53
{
54
    /**
55
     * End-of-line constants
56
     */
57
    const EOL_UNIX    = 'lf';
58
    const EOL_TRS80   = 'cr';
59
    const EOL_WINDOWS = 'crlf';
60
61
    /**
62
     * ASCII character codes for "invisibles"
63
     */
64
    const HORIZONTAL_TAB = 9;
65
    const LINE_FEED = 10;
66
    const CARRIAGE_RETURN = 13;
67
    const SPACE = 32;
68
69
    /**
70
     * Data types -- Used within the lickQuotingStyle method
71
     */
72
    const DATA_NONNUMERIC = 'nonnumeric';
73
    const DATA_SPECIAL = 'special';
74
    const DATA_UNKNOWN = 'unknown';
75
76
    /**
77
     * Placeholder strings -- hold the place of newlines and delimiters contained
78
     * within quoted text so that the explode method doesn't split incorrectly
79
     */
80
    const PLACEHOLDER_NEWLINE = '[__NEWLINE__]';
81
    const PLACEHOLDER_DELIM = '[__DELIM__]';
82
83
    /**
84
     * Recommended data sample size
85
     */
86
    const SAMPLE_SIZE = 2500;
87
88
    /**
89
     * Column data types -- used within the lickHeader method to determine
90
     * whether the first row contains different types of data than the rest of
91
     * the rows (and thus, is likely a header row)
92
     */
93
    // +-987
94
    const TYPE_NUMBER = 'number';
95
    // +-12.387
96
    const TYPE_DOUBLE = 'double';
97
    // I am a string. I can contain all kinds of stuff.
98
    const TYPE_STRING = 'string';
99
    // 10-Jul-15, 9/1/2007, April 1st, 2006, etc.
100
    const TYPE_DATE = 'date';
101
    // 10:00pm, 5pm, 13:08, etc.
102
    const TYPE_TIME = 'time';
103
    // $98.96, ¥12389, £6.08, €87.00
104
    const TYPE_CURRENCY = 'currency';
105
    // 12ab44m1n2_asdf
106
    const TYPE_ALNUM = 'alnum';
107
    // abababab
108
    const TYPE_ALPHA = 'alpha';
109
110
    /** @var \CSVelte\Contract\Streamable The source of data to examine */
111
    protected $input;
112
113
    /** @var string Sample of CSV data to use for tasting (determining CSV flavor) */
114
    protected $sample;
115
116
    /** @var array Possible delimiter characters in (roughly) the order of likelihood */
117
    protected $delims = [",", "\t", ";", "|", ":", "-", "_", "#", "/", '\\', '$', '+', '=', '&', '@'];
118
119
    /**
120
     * Class constructor--accepts a CSV input source
121
     *
122
     * @param \CSVelte\Contract\Streamable The source of CSV data
123
     * @todo It may be a good idea to skip the first line or two for the sample
124
     *     so that the header line(s) don't throw things off (with the exception
125
     *     of lickHeader() obviously)
126
     */
127 22
    public function __construct(Streamable $input)
    {
        $this->input = $input;

        // Grab the sample right away; the lick* methods all read from
        // $this->sample rather than going back to the input.
        $this->sample = $input->read(self::SAMPLE_SIZE);
        if ($this->sample) {
            return;
        }

        // Any falsy read result is treated as "no usable sample".
        throw new TasterException("Invalid input, cannot read sample.", TasterException::ERR_INVALID_SAMPLE);
    }
134
135
    /**
136
     * "Invoke" magic method.
137
     *
138
     * Called when an object is invoked as if it were a function.
139
     * This is simply an alias to the lick method.
140
     *
141
     * @return \CSVelte\Flavor A flavor object
142
     * @throws \CSVelte\Exception\TasterException
143
     */
144 20
    public function __invoke()
    {
        // Invoking the taster object is shorthand for calling lick().
        $flavor = $this->lick();

        return $flavor;
    }
148
149
    /**
150
     * Examine the input source and determine what "Flavor" of CSV it contains.
151
     * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),
152
     * doesn't necessarily always conform to it. And it doesn't provide metadata
153
     * such as the delimiting character, quote character, or what types of data
154
     * are quoted.
155
     *
156
     * @return \CSVelte\Flavor The metadata that the CSV format doesn't provide
157
     * @throws \CSVelte\Exception\TasterException
158
     * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails
159
     * @todo Should there be a lickEscapeChar method? The Python module that inspired
160
     *     this library doesn't include one...
161
     * @todo This should cache the results and only regenerate if $this->sample
162
     *     changes (or $this->input)
163
     */
164 21
    public function lick()
    {
        $lineTerminator = $this->lickLineEndings();

        try {
            $quoteAndDelim = $this->lickQuoteAndDelim();
            $quoteChar = $quoteAndDelim[0];
            $delimiter = $quoteAndDelim[1];
        } catch (TasterException $e) {
            // Only the "quote and delimiter could not be determined" failure
            // has a fallback; any other taster error is passed on untouched.
            if ($e->getCode() !== TasterException::ERR_QUOTE_AND_DELIM) {
                throw $e;
            }
            // Assume a double quote and work the delimiter out by per-line
            // frequency instead.
            $quoteChar = '"';
            $delimiter = $this->lickDelimiter($lineTerminator);
        }

        /**
         * @todo Should this be null? Because doubleQuote = true means this = null
         */
        $escapeChar = '\\';
        $quoteStyle = $this->lickQuotingStyle($delimiter, $lineTerminator);
        $header = $this->lickHeader($delimiter, $lineTerminator);

        return new Flavor([
            'quoteChar' => $quoteChar,
            'escapeChar' => $escapeChar,
            'delimiter' => $delimiter,
            'lineTerminator' => $lineTerminator,
            'quoteStyle' => $quoteStyle,
            'header' => $header,
        ]);
    }
182
183
    /**
184
     * Replaces all quoted columns with a blank string. I was using this method
185
     * to prevent explode() from incorrectly splitting at delimiters and newlines
186
     * within quotes when parsing a file. But this was before I wrote the
187
     * replaceQuotedSpecialChars method which (at least to me) makes more sense.
188
     *
189
     * @param string The string to replace quoted strings within
190
     * @return string The input string with quoted strings removed
191
     * @todo Replace code that uses this method with the replaceQuotedSpecialChars
192
     *     method instead. I think it's cleaner.
193
     */
194 21
    protected function removeQuotedStrings($data)
    {
        // An opening single or double quote, then (lazily) any characters,
        // each optionally preceded by a backslash, up to the same quote
        // character again. The "s" modifier lets a quoted run span line breaks.
        $quotedRun = '/(["\'])(?:(?=(\\\\?))\2.)*?\1/sm';

        // Every quoted run, quotes included, is replaced by nothing.
        return preg_replace($quotedRun, '', $data);
    }
198
199
    /**
200
     * Examine the input source to determine which character(s) are being used
201
     * as the end-of-line character
202
     *
203
     * @return string The end-of-line char for the input data
204
     * @credit pulled from stackoverflow thread *tips hat to username "Harm"*
205
     * @todo This should throw an exception if it cannot determine the line ending
206
     * @todo I probably will make this method protected when I'm done with testing...
207
     * @todo If there is any way for this method to fail (for instance if a file
208
     *       is totally empty or contains no line breaks), then it needs to throw
209
     *       a relevant TasterException
210
     * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings()
211
     */
212 21
    protected function lickLineEndings()
    {
        // Quoted text is dropped first so that line breaks embedded in quoted
        // fields do not take part in the count.
        $unquoted = $this->removeQuotedStrings($this->sample);

        // Tried in this order. A later candidate only wins with a strictly
        // higher count, so CRLF keeps the win over its component characters
        // when all three counts are equal.
        $candidates = [
            "\r\n", // 0x0D 0x0A - Windows, DOS, OS/2
            "\n",   // 0x0A      - Unix, OSX
            "\r",   // 0x0D      - Apple ][, TRS80
        ];

        // @todo This should return a default maybe?
        // PHP_EOL is what comes back when none of the candidates occurs.
        $winner = PHP_EOL;
        $highest = 0;
        foreach ($candidates as $candidate) {
            $occurrences = substr_count($unquoted, $candidate);
            if ($occurrences > $highest) {
                $highest = $occurrences;
                $winner = $candidate;
            }
        }

        return $winner;
    }
232
233
    /**
234
     * The best way to determine quote and delimiter characters is when columns
235
     * are quoted, often you can seek out a pattern of delim, quote, stuff, quote, delim
236
     * but this only works if you have quoted columns. If you don't you have to
237
     * determine these characters some other way... (see lickDelimiter)
238
     *
239
     * @return array A two-row array containing quotechar, delimchar
240
     * @todo make protected
241
     * @todo This should throw an exception if it cannot determine the delimiter
242
     *     this way.
243
     * @todo This should check for any line endings not just \n
244
     */
245 21
    protected function lickQuoteAndDelim()
    {
        /**
         * @var array|null Pattern matches from the most recent preg_match_all()
         */
        $matches = null;

        // A delimiter may be any single character other than CR, LF, a word
        // character (alphanumerics and underscore), a double quote, a single
        // quote or a space.
        $antidelims = implode(["\r", "\n", "\w", preg_quote('"', '/'), preg_quote("'", '/'), preg_quote(chr(self::SPACE), '/')]);
        $delim = '(?P<delim>[^' . $antidelims . '])';
        $quote = '(?P<quoteChar>"|\'|`)'; // @todo I think MS Excel uses some strange encoding for fancy open/close quotes

        /**
         * @var array Patterns (regex), tried in order until one of them matches
         */
        $patterns = [
            // ,"something", - delim, optional space, quote, anything, same quote, same delim
            '/' . $delim . ' ?' . $quote . '.*?\2\1/ms',
            // 'something', - start of line, quote, anything, same quote, delim, optional space
            '/(?:^|\n)' . $quote . '.*?\1' . $delim . ' ?/ms',
            // ,'something' - delim, optional space, quote, anything, same quote, end of line.
            // The tail used to be "(?:^|\n)"; "^" can never match directly
            // after the closing quote, so a quoted last field at the very end
            // of the sample was missed.
            '/' . $delim . ' ?' . $quote . '.*?\2(?:$|\n)/ms',
            // 'something' - a quoted field alone on its line. The quote is
            // group 1 here (there is no delim group); the previous "\2"
            // referenced a group that does not exist, so this pattern never
            // compiled.
            '/(?:^|\n)' . $quote . '.*?\1(?:$|\n)/ms',
        ];

        foreach ($patterns as $pattern) {
            // @todo The error suppression was added because certain data sets
            //     caused undefined offset errors. It is kept so that nothing
            //     which used to be silenced starts surfacing.
            if (@preg_match_all($pattern, $this->sample, $matches) && $matches) {
                break;
            }
        }

        // Both groups are required to produce an answer. The quote-only
        // pattern has no "delim" group, and a pattern that found nothing
        // leaves empty groups behind; in either case there is nothing to rank
        // and the exception at the bottom is the result.
        if (!empty($matches['quoteChar']) && !empty($matches['delim'])) {
            // Ranks the captured values of one group and yields the most
            // frequently captured one.
            $mostFrequent = function ($group) use ($matches) {
                return collect($matches)
                    ->frequency()
                    ->get($group)
                    ->sort()
                    ->reverse()
                    ->getKeyAtPosition(0);
            };
            try {
                return [
                    $mostFrequent('quoteChar'),
                    $mostFrequent('delim'),
                ];
            } catch (OutOfBoundsException $e) {
                // eat this exception and let the taster exception below be thrown instead...
            }
        }

        throw new TasterException("quoteChar and delimiter cannot be determined", TasterException::ERR_QUOTE_AND_DELIM);
    }
290
291
     /**
292
      * Take a list of likely delimiter characters and find the one that occurs
293
      * the most consistent amount of times within the provided data.
294
      *
295
      * @param string The character(s) used for newlines
296
      * @return string The character judged most likely to be the delimiter
297
      * @see \CSVelte\Flavor for possible quote style constants
298
      * @todo Refactor this method--It needs more thorough testing against a wider
299
      *     variety of CSV data to be sure it works reliably. And I'm sure there
300
      *     are many performance and logic improvements that could be made. This
301
      *     is essentially a first draft.
302
      * @todo Can't use replaceQuotedSpecialChars rather than removeQuotedStrings
303
      *     because the former requires u to know the delimiter
304
      */
305 6
    protected function lickDelimiter($eol = "\n")
    {
        $frequencies = [];
        $consistencies = [];

        // build a table of characters and their frequencies for each line. We
        // will use this frequency table to then build a table of frequencies of
        // each frequency (in 10 lines, "tab" occurred 5 times on 7 of those
        // lines, 6 times on 2 lines, and 7 times on 1 line)
        collect(explode($eol, $this->removeQuotedStrings($this->sample)))
            ->walk(function($line, $line_no) use (&$frequencies) {
                collect(str_split($line))
                    // only characters from the candidate delimiter list count
                    ->filter(function($c) { return collect($this->delims)->contains($c); })
                    ->frequency()
                    ->sort()
                    ->reverse()
                    ->walk(function($count, $char) use (&$frequencies, $line_no) {
                        $frequencies[$char][$line_no] = $count;
                    });
            })
            // the above only finds frequencies for characters if they exist in
            // a given line. This will go back and fill in zeroes where a char
            // didn't occur at all in a given line (needed to determine mode)
            ->walk(function($line, $line_no) use (&$frequencies) {
                collect($frequencies)
                    ->walk(function($counts, $char) use ($line_no, &$frequencies) {
                        if (!isset($frequencies[$char][$line_no])) {
                            $frequencies[$char][$line_no] = 0;
                        }
                    });
            });

        // now determine the mode for each char to decide the "expected" amount
        // of times a char (possible delim) will occur on each line...
        $freqs = collect($frequencies);
        $modes = $freqs->mode();
        $freqs->walk(function($f, $chr) use ($modes, &$consistencies) {
            collect($f)->walk(function($num) use ($modes, $chr, &$consistencies) {
                // a falsy mode (e.g. zero) means the character has no
                // expected count and is skipped altogether
                if ($expected = $modes->get($chr)) {
                    if ($num == $expected) {
                        // met the goal, yay!
                        if (!isset($consistencies[$chr])) {
                            $consistencies[$chr] = 0;
                        }
                        $consistencies[$chr]++;
                    }
                }
            });
        });

        $delims = collect($consistencies);
        $max = $delims->max();
        $dups = $delims->duplicates();
        if ($dups->has($max, false)) {
            // if more than one candidate, then look at where the character appeared
            // in the data. Was it relatively evenly distributed or was there a
            // specific area that the character tended to appear? Dates will have a
            // consistent format (e.g. 04-23-1986) and so may easily provide a false
            // positive for delimiter. But the dash will be focused in that one area,
            // whereas the comma character is spread out. You can determine this by
            // finding out the number of chars between each occurrence and getting
            // the average. If the average is wildly different than any given distance
            // then bingo you probably aren't working with a delimiter there...

            // another option to find the delimiter if there is a tie, is to build
            // a table of character position within each line. Then use that to
            // determine if one character is consistently in the same position or
            // at least the same general area. Use the delimiter that is the most
            // consistent in that way...

            /**
             * @todo Add a method here to figure out where duplicate best-match
             *     delimiter(s) fall within each line and then, depending on
             *     which one has the best distribution, return that one.
             */

            // the characters that are tied for the highest consistency
            $decision = $dups->get($max);
            try {
                return $this->guessDelimByDistribution($decision, $eol);
            } catch (TasterException $e) {
                // if somehow we STILL can't come to a consensus, then fall back to a
                // "preferred delimiters" list...
                foreach ($this->delims as $val) {
                    // Return the character itself. The previous array_search()
                    // call yielded the character's index within $decision,
                    // which is not a delimiter and is falsy for the first entry.
                    if (in_array($val, $decision, true)) {
                        return $val;
                    }
                }
            }
        }

        // Highest consistency first, matching the sort()->reverse() chains used
        // for every other "pick the top entry" decision in this class.
        // NOTE(review): assumes sort() orders ascending, as those chains imply.
        return $delims
            ->sort()
            ->reverse()
            ->getKeyAtPosition(0);
    }
396
397
    /**
398
     * Compare positional consistency of several characters to determine the
399
     * probable delimiter character. The idea behind this is that the delimiter
400
     * character is likely more consistently distributed than false-positive
401
     * delimiter characters produced by lickDelimiter(). For instance, consider
402
     * a series of rows similar to the following:
403
     *
404
     * 1,luke,visinoni,[email protected],(530) 413-3076,04-23-1986
405
     *
406
     * The lickDelimiter() method will often not be able to determine whether the
407
     * delimiter is a comma or a dash because they occur the same number of times
408
     * on just about every line (5 for comma, 3 for dash). The difference is
409
     * obvious to you, no doubt. But us humans are pattern-recognition machines!
410
     * The difference between the comma and the dash are that the comma is dist-
411
     * ributed almost evenly throughout the line. The dash characters occur
412
     * entirely at the end of the line. This method accepts any number of possible
413
     * delimiter characters and returns the one that is most evenly distributed.
414
     *
415
     * If delim character cannot be determined by lickQuoteAndDelim(), taster
416
     * tries lickDelimiter(). When that method runs into a tie, it will use this
417
     * as a tie-breaker.
418
     *
419
     * @param array $delims Possible delimiter characters (method chooses from
420
     *     this array of characters)
421
     * @return string The probable delimiter character
422
     */
423
    protected function guessDelimByDistribution(array $delims, $eol = "\n")
    {
        try {
            // @todo Write a method that does this...
            $lines = collect(explode($eol, $this->removeQuotedStrings($this->sample)));

            // For every candidate, average (over all non-empty lines) the share
            // of line sections that contain it, then hand back the candidate
            // with the highest average.
            return $delims[collect($delims)->map(function($delim) use ($lines) {
                $linedist = collect();
                $lines->walk(function($line, $line_no) use (&$linedist, $delim) {
                    if (!strlen($line)) return;
                    $sectstot = 10;
                    // A line shorter than $sectstot bytes would give a section
                    // length of 0, which str_split() rejects; fall back to
                    // one-byte sections for such lines.
                    $sectlen = max(1, (int) (strlen($line) / $sectstot));
                    $sections = collect(str_split($line, $sectlen))
                        ->map(function($section) use($delim) {
                            return substr_count($section, $delim);
                        })
                        // keep only the sections in which the candidate occurs
                        ->filter(function($count) { return (bool) $count; });
                    if (is_numeric($count = $sections->count())) {
                        $linedist->set($line_no, $count / $sectstot);
                    }
                });
                return $linedist;
            })->map(function($dists) {
                return $dists->average();
            })->sort()
              ->reverse()
              ->getKeyAtPosition(0)];
        } catch (Exception $e) {
            throw new TasterException("delimiter cannot be determined by distribution", TasterException::ERR_DELIMITER);
        }
    }
453
454
    /**
455
     * Determine the "style" of data quoting. The CSV format, while having an RFC
456
     * (https://tools.ietf.org/html/rfc4180), doesn't necessarily always conform
457
     * to it. And it doesn't provide metadata such as the delimiting character,
458
     * quote character, or what types of data are quoted. So this method makes a
459
     * logical guess by finding which columns have been quoted (if any) and
460
     * examining their data type. Most often, CSV files will only use quotes
461
     * around columns that contain special characters such as the delimiter,
462
     * the quoting character, newlines, etc. (we refer to this style as
463
     * QUOTE_MINIMAL), but some quote all columns that contain nonnumeric data
464
     * (QUOTE_NONNUMERIC). Then there are CSV files that quote all columns
465
     * (QUOTE_ALL) and those that quote none (QUOTE_NONE).
466
     *
467
     * @param string $delim The character used as the column delimiter
468
     * @param string $eol The character used for newlines
469
     * @return string One of four "QUOTING_" constants defined above--see this
470
     *     method's description for more info.
471
     * @todo Refactor this method--It needs more thorough testing against a wider
472
     *     variety of CSV data to be sure it works reliably. And I'm sure there
473
     *     are many performance and logic improvements that could be made. This
474
     *     is essentially a first draft.
475
     */
476 21
    protected function lickQuotingStyle($delim, $eol)
    {
        // Every style starts out as a candidate; the fields examined below
        // switch off the ones they contradict.
        $candidates = collect([
            Flavor::QUOTE_ALL => true,
            Flavor::QUOTE_NONE => true,
            Flavor::QUOTE_MINIMAL => true,
            Flavor::QUOTE_NONNUMERIC => true,
        ]);

        // Delimiters and line breaks inside quotes are masked with
        // placeholders first, so the explode() calls only split on real ones.
        $rows = collect(explode($eol, $this->replaceQuotedSpecialChars($this->sample, $delim)));
        $seen = collect()
            ->set('quoted', collect())
            ->set('unquoted', collect());

        // Records the data type of one field under "quoted" or "unquoted" and
        // rules out the style that the field makes impossible.
        $classifyField = function ($field) use ($candidates, $seen, $delim) {
            $field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field);
            if (!$this->isQuoted($field)) {
                $seen->get('unquoted')->push($this->lickDataType($field));
                // at least one unquoted field, so QUOTE_ALL is out
                $candidates->set(Flavor::QUOTE_ALL, false);
                return;
            }
            $bare = $this->unQuote($field);
            $seen->get('quoted')->push($this->lickDataType($bare));
            // at least one quoted field, so QUOTE_NONE is out
            $candidates->set(Flavor::QUOTE_NONE, false);
        };

        // Walk every line of the sample, restoring masked line breaks before
        // the line is split into fields.
        $rows->walk(function ($row) use ($classifyField, $eol, $delim) {
            $row = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $row);
            collect(explode($delim, $row))->walk($classifyField);
        });

        $quotedTypes = $seen->get('quoted')->unique();
        $candidates = $candidates->filter(function ($stillPossible) { return (bool) $stillPossible; });

        // If QUOTE_ALL or QUOTE_NONE survived, that is the answer and nothing
        // else needs to be looked at.
        if ($candidates->has(Flavor::QUOTE_ALL)) {
            return Flavor::QUOTE_ALL;
        }
        if ($candidates->has(Flavor::QUOTE_NONE)) {
            return Flavor::QUOTE_NONE;
        }

        if (count($quotedTypes) == 1) {
            $style = $quotedTypes->getValueAtPosition(0);
            if ($candidates->has($style)) {
                return $style;
            }
            return Flavor::QUOTE_MINIMAL;
        }

        if (!$quotedTypes->contains(self::DATA_NONNUMERIC)) {
            return Flavor::QUOTE_MINIMAL;
        }

        // allow for a SMALL amount of error here
        $counts = collect([self::DATA_SPECIAL => 0, self::DATA_NONNUMERIC => 0]);
        $seen->get('quoted')->walk(function ($type) use ($counts) {
            $counts->increment($type);
        });

        // @todo is all this even necessary? seems unnecessary to me...
        $most = $counts->max();
        if ($most && ($counts->min() / $most) < 1) {
            return Flavor::QUOTE_NONNUMERIC;
        }

        return Flavor::QUOTE_MINIMAL;
    }
543
544
    /**
545
     * Remove quotes around a piece of text (if there are any)
546
     *
547
     * @param string The data to "unquote"
548
     * @return string The data passed in, only with quotes stripped (off the edges)
549
     */
550 21
    protected function unQuote($data)
    {
        // The "s" modifier lets "." match line breaks. Without it a quoted
        // value that spans several lines came back with its quotes still on.
        // Values that are not wrapped in a matching pair of single or double
        // quotes are returned unchanged.
        return preg_replace('/^(["\'])(.*)\1$/s', '\2', $data);
    }
554
555
    /**
556
     * Determine whether a particular string of data has quotes around it.
557
     *
558
     * @param string The data to check
559
     * @return boolean Whether the data is quoted or not
560
     */
561 21
    protected function isQuoted($data)
    {
        // The value must start and end with the same single or double quote.
        // The previous "[^\1]*" was not a back-reference: inside a character
        // class "\1" is the octal escape for byte 0x01, so the class meant
        // "anything except 0x01". ".*" with the "s" modifier states the actual
        // intent (any content, line breaks included).
        //
        // preg_match() yields 1, 0 or false; comparing against 1 gives the
        // boolean this method is documented to return.
        return preg_match('/^([\'"]).*\1$/s', $data) === 1;
    }
565
566
    /**
567
     * Determine what type of data is contained within a variable
568
     * Possible types:
569
     *     - nonnumeric - contains at least one character that is not a digit
570
     *     - special - contains characters that could potentially need to be quoted (possible delimiter characters)
571
     *     - unknown - everything else
572
     * This method is really only used within the "lickQuotingStyle" method to
573
     * help determine whether a particular column has been quoted due to it being
574
     * nonnumeric or because it has some special character in it such as a delimiter
575
     * or newline or quote.
576
     *
577
     * @param string The data to determine the type of
578
     * @return string The type of data (one of the "DATA_" constants above)
579
     * @todo I could probably eliminate this method and use an anonymous function
580
     *     instead. It isn't used anywhere else and its name could be misleading.
581
     *     Especially since I also have a lickType method that is used within the
582
     *     lickHeader method.
583
     */
584 21
    protected function lickDataType($data)
    {
        // @todo make this check for only the quote and delim that are actually being used
        // that will make the guess more accurate
        $specialChars = '/[\'",\t\|:;-]/';
        $nonDigit = '/[^0-9]/';

        // A quote character or one of the common delimiter characters
        // anywhere in the value makes it "special", whatever else it holds.
        if (preg_match($specialChars, $data)) {
            return self::DATA_SPECIAL;
        }

        // Otherwise any character that is not a digit makes it nonnumeric.
        if (preg_match($nonDigit, $data)) {
            return self::DATA_NONNUMERIC;
        }

        // What is left consists of digits only, or is empty.
        return self::DATA_UNKNOWN;
    }
595
596
    /**
597
     * Replace all instances of newlines and whatever character you specify (as
598
     * the delimiter) that are contained within quoted text. The replacements are
599
     * simply a special placeholder string. This is done so that I can use the
600
     * very unsmart "explode" function and not have to worry about it exploding
601
     * on delimiters or newlines within quotes. Once I have exploded, I typically
602
     * sub back in the real characters before doing anything else. Although
603
     * currently there is no dedicated method for doing so I just use str_replace
604
     *
605
     * @param string The string to do the replacements on
606
     * @param string The delimiter character to replace
607
     * @return string The data with replacements performed
608
     * @todo I could probably pass in (maybe optionally) the newline character I
609
     *     want to replace as well. I'll do that if I need to.
610
     */
611 21
    protected function replaceQuotedSpecialChars($data, $delim)
    {
        // Works on one quoted run (quotes included): each CR and each LF is
        // swapped for the newline placeholder, then every occurrence of the
        // delimiter for the delimiter placeholder.
        $maskQuotedRun = function ($matches) use ($delim) {
            $masked = preg_replace("/([\r\n])/", self::PLACEHOLDER_NEWLINE, $matches[0]);

            return str_replace($delim, self::PLACEHOLDER_DELIM, $masked);
        };

        // The "U" modifier makes ".*" ungreedy, so a match ends at the next
        // occurrence of the opening quote character; "s" lets a quoted run
        // span several lines.
        return preg_replace_callback('/([\'"])(.*)\1/imsU', $maskQuotedRun, $data);
    }
619
620
    /**
621
     * Determine the "type" of a particular string of data. Used for the lickHeader
622
     * method to assign a type to each column to try to determine whether the
623
     * first row is different than a consistent column type.
624
     *
625
     * @todo As I'm writing this method I'm beginning to realize how expensive
626
     * the lickHeader method is going to end up being since it has to apply all
627
     * these regexes (potentially) to every column. I may end up writing a much
628
     * simpler type-checking method than this if it proves to be too expensive
629
     * to be practical.
630
     *
631
     * @param string $data The string of data to check the type of
632
     * @return string One of the TYPE_ string constants above
633
     */
634 21
    protected function lickType($data)
    {
        // Cheap, purely lexical checks first.
        //
        // NOTE: the TYPE_NUMBER pattern also accepts decimal points, so any
        // value that would match the TYPE_DOUBLE pattern is already claimed
        // by TYPE_NUMBER. The order is kept as-is on purpose: lickHeader()
        // compares these type labels between rows, and changing which label
        // a decimal gets would change its header guess.
        if (preg_match('/^[+-]?[\d\.]+$/', $data)) {
            return self::TYPE_NUMBER;
        } elseif (preg_match('/^[+-]?[\d]+\.[\d]+$/', $data)) {
            return self::TYPE_DOUBLE;
        } elseif (preg_match('/^[+-]?[¥£€$]\d+(\.\d+)$/', $data)) {
            return self::TYPE_CURRENCY;
        } elseif (preg_match('/^[a-zA-Z]+$/', $data)) {
            return self::TYPE_ALPHA;
        }

        // Building blocks for the date/time patterns. These are plain string
        // concatenations and cannot throw, so they live outside the try.
        $year = '([01][0-9])?[0-9]{2}';
        $month = '([01]?[0-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
        $day = '[0-3]?[0-9]';
        $sep = '[\/\.\-]?';
        $time = '([0-2]?[0-9](:[0-5][0-9]){1,2}(am|pm)?|[01]?[0-9](am|pm))';
        $date = '(' . $month . $sep . $day . $sep . $year
            . '|' . $day . $sep . $month . $sep . $year
            . '|' . $year . $sep . $month . $sep . $day . ')';

        try {
            // DateTime throws when it cannot parse the string; that is the
            // signal to fall through to the non-date checks in the catch.
            $dt = new DateTime($data);
            $dt->setTime(0, 0, 0);
            $now = new DateTime();
            $now->setTime(0, 0, 0);
            $diffDays = (int) $dt->diff($now)->format('%R%a');

            // A bare time string is parsed as "today at <time>", so a zero
            // day difference means this is most likely a time string.
            if ($diffDays === 0 && preg_match("/^{$time}$/i", $data)) {
                return self::TYPE_TIME;
            }

            if (preg_match("/^{$date}$/i", $data)) {
                return self::TYPE_DATE;
            } elseif (preg_match("/^{$date} {$time}$/i", $data)) {
                // Fixed: the subject argument was missing from this call, so
                // the date-time check could never match (and raised an
                // argument-count warning/error instead).
                return self::TYPE_DATETIME;
            }
        } catch (\Exception $e) {
            // Not parseable as a date; go on checking the remaining types.
            if (preg_match('/^\w+$/', $data)) {
                return self::TYPE_ALNUM;
            }
        }

        return self::TYPE_STRING;
    }
678
679
    /**
680
     * Examines the contents of the CSV data to make a determination of whether
681
     * or not it contains a header row. To make this determination, it creates
682
     * an array of each column's (in each row) data type and length and then
683
     * compares them. If all of the rows except the header look similar, it will
684
     * return true. This is only a guess though. There is no programmatic way to
685
     * determine 100% whether a CSV file has a header. The format does not
686
     * provide metadata such as that.
687
     *
688
     * @param string $delim The CSV data's delimiting char (can be a variety of chars but
689
     *     typically is either a comma or a tab, sometimes a pipe)
690
     * @param string $eol The CSV data's end-of-line char(s) (\n \r or \r\n)
691
     * @return boolean True if the data (most likely) contains a header row
692
     * @todo This method needs a total refactor. It's not necessary to loop twice
693
     *     You could get away with one loop and that would allow for me to do
694
     *     something like only examining enough rows to get to a particular
695
     *     "hasHeader" score (+-100 for instance) & then just return true|false
696
     * @todo Also, break out of the first loop after a certain (perhaps even a
697
     *     configurable) amount of lines (you only need to examine so much data
698
     *     to reliably make a determination and this is an expensive method)
699
     * @todo I could remove the need for quote, delim, and eol by "licking" the
700
     *     data sample provided in the first argument. Also, I could actually
701
     *     create a Reader object to read the data here.
702
     */
703 22
    public function lickHeader($delim, $eol)
    {
        // Pass 1: record the type and length of every field, keyed by line
        // number and then by column position.
        $types = collect();

        // Delimiters and newlines inside quotes are masked with placeholders
        // first, so that a plain explode() cannot split in the wrong place.
        $masked = $this->replaceQuotedSpecialChars($this->sample, $delim);

        foreach (explode($eol, $masked) as $line_no => $line) {
            // The line is isolated now; put the real newlines back.
            $line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line);

            foreach (explode($delim, $line) as $colpos => $field) {
                // Likewise restore the real delimiter inside the field.
                $field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field);

                // @todo Need a Collection::setTableField($x, $y) method
                //       See notes in green binder about refactoring Collection
                if (!$types->has($line_no)) {
                    $types->set($line_no, collect());
                }
                $types->get($line_no)->set($colpos, [
                    'type' => $this->lickType($this->unQuote($field)),
                    'length' => strlen($field)
                ]);
            }
        }

        // Pass 2: treat the first row as the candidate header and score every
        // remaining field against the header field in the same column.
        $score = 0;
        $headerRow = $types->shift();

        $types->walk(function($row) use (&$score, $headerRow) {
            $row->walk(function($fieldInfo, $col_no) use (&$score, $headerRow) {
                try {
                    $headerInfo = $headerRow->get($col_no, null, true);
                } catch (OutOfBoundsException $e) {
                    // The candidate header has no such column; skip the field.
                    return;
                }

                if ($headerInfo['type'] == self::TYPE_STRING) {
                    // Header is a generic string, so compare on length.
                    $looksDifferent = ($fieldInfo['length'] != $headerInfo['length']);
                } else {
                    // Otherwise compare on the detected data type.
                    $looksDifferent = ($fieldInfo['type'] != $headerInfo['type']);
                }

                // A field unlike the first row votes for "has header";
                // a field that looks the same votes against.
                $score += $looksDifferent ? 1 : -1;
            });
        });

        return $score > 0;
    }
751
}
752