Completed
Push — master ( 5c1aea...9ef1c4 )
by Luke
03:03
created

Taster::lickDelimiter()   C

Complexity

Conditions 9
Paths 4

Size

Total Lines 91
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 35
CRAP Score 9.3752

Importance

Changes 0
Metric Value
cc 9
eloc 39
c 0
b 0
f 0
nc 4
nop 1
dl 0
loc 91
ccs 35
cts 42
cp 0.8333
crap 9.3752
rs 5.1434

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/**
3
 * CSVelte: Slender, elegant CSV for PHP
4
 * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV
5
 * standardization efforts, CSVelte was written in an effort to take all the
6
 * suck out of working with CSV.
7
 *
8
 * @version   v0.2.1
9
 * @copyright Copyright (c) 2016 Luke Visinoni <[email protected]>
10
 * @author    Luke Visinoni <[email protected]>
11
 * @license   https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT)
12
 */
13
namespace CSVelte;
14
15
use \DateTime;
16
use CSVelte\Contract\Streamable;
17
18
use \Exception;
19
use \OutOfBoundsException;
20
use CSVelte\Exception\TasterException;
21
22
use function CSVelte\collect;
23
24
/**
25
 * CSVelte\Taster
26
 * Given CSV data, Taster will "taste" the data and provide its best guess at
27
 * its "flavor". In other words, this class inspects CSV data and attempts to
28
 * auto-detect various CSV attributes such as line endings, quote characters, etc..
29
 *
30
 * @package   CSVelte
31
 * @copyright (c) 2016, Luke Visinoni <[email protected]>
32
 * @author    Luke Visinoni <[email protected]>
33
 * @todo      There are a ton of improvements that could be made to this class.
34
 *            I'll do a refactor on this fella once I get at least one test
35
 *            passing for each of its public methods.
36
 * @todo      Should I have a lickEscapeChar method? The python version doesn't
37
 *            have one. But then why does it even bother including one in its
38
 *            flavor class?
39
 * @todo      Examine each of the public methods in this class and determine
40
 *            whether it makes sense to ask for the data as a param rather than
41
 *            just pulling it from source. I don't think it makes sense... it
42
 *            was just easier to write the methods that way during testing.
43
 * @todo      There are at least portions of this class that could use the
44
 *            Reader class rather than working directly with data.
45
 * @todo      Refactor all of the anonymous functions used as callbacks. Rather
46
 *            than passing $this all over, use $closure->bindTo() instead...
47
 *            Actually, write a method called getBoundClosure() or something...
48
 *            maybe even make it a trait I don't know yet. But here it would
49
 *            allow me to bind any anon function to $this and give me a certain
50
 *            set of commonly needed values ($delim, $eol, etc.)
51
 */
52
class Taster
53
{
54
    /**
55
     * End-of-line constants
56
     */
57
    const EOL_UNIX    = 'lf';
58
    const EOL_TRS80   = 'cr';
59
    const EOL_WINDOWS = 'crlf';
60
61
    /**
62
     * ASCII character codes for "invisibles"
63
     */
64
    const HORIZONTAL_TAB = 9;
65
    const LINE_FEED = 10;
66
    const CARRIAGE_RETURN = 13;
67
    const SPACE = 32;
68
69
    /**
70
     * Data types -- Used within the lickQuotingStyle method
71
     */
72
    const DATA_NONNUMERIC = 'nonnumeric';
73
    const DATA_SPECIAL = 'special';
74
    const DATA_UNKNOWN = 'unknown';
75
76
    /**
77
     * Placeholder strings -- hold the place of newlines and delimiters contained
78
     * within quoted text so that the explode method doesn't split incorrectly
79
     */
80
    const PLACEHOLDER_NEWLINE = '[__NEWLINE__]';
81
    const PLACEHOLDER_DELIM = '[__DELIM__]';
82
83
    /**
84
     * Recommended data sample size
85
     */
86
    const SAMPLE_SIZE = 2500;
87
88
    /**
89
     * Column data types -- used within the lickHeader method to determine
90
     * whether the first row contains different types of data than the rest of
91
     * the rows (and thus, is likely a header row)
92
     */
93
    // +-987
94
    const TYPE_NUMBER = 'number';
95
    // +-12.387
96
    const TYPE_DOUBLE = 'double';
97
    // I am a string. I can contain all kinds of stuff.
98
    const TYPE_STRING = 'string';
99
    // 2010-04-23 04:23:00
100
    const TYPE_DATETIME = 'datetime';
101
    // 10-Jul-15, 9/1/2007, April 1st, 2006, etc.
102
    const TYPE_DATE = 'date';
103
    // 10:00pm, 5pm, 13:08, etc.
104
    const TYPE_TIME = 'time';
105
    // $98.96, ¥12389, £6.08, €87.00
106
    const TYPE_CURRENCY = 'currency';
107
    // 12ab44m1n2_asdf
108
    const TYPE_ALNUM = 'alnum';
109
    // abababab
110
    const TYPE_ALPHA = 'alpha';
111
112
    /** @var Contract\Streamable The source of data to examine */
113
    protected $input;
114
115
    /** @var string Sample of CSV data to use for tasting (determining CSV flavor) */
116
    protected $sample;
117
118
    /** @var array Possible delimiter characters in (roughly) the order of likelihood */
119
    protected $delims = [",", "\t", ";", "|", ":", "-", "_", "#", "/", '\\', '$', '+', '=', '&', '@'];
120
121
    /**
122
     * Class constructor--accepts a CSV input source
123
     *
124
     * @param Contract\Streamable The source of CSV data
125
     * @throws TasterException
126
     * @todo It may be a good idea to skip the first line or two for the sample
127
     *     so that the header line(s) don't throw things off (with the exception
128
     *     of lickHeader() obviously)
129
     */
130 22
    public function __construct(Streamable $input)
    {
        // Keep the stream around and pull a leading sample from it; every
        // lick* method works against this sample rather than the stream.
        $this->input = $input;
        $this->sample = $input->read(self::SAMPLE_SIZE);

        if ($this->sample) {
            return;
        }

        // A falsy read (nothing could be sampled) leaves us nothing to taste.
        throw new TasterException("Invalid input, cannot read sample.", TasterException::ERR_INVALID_SAMPLE);
    }
137
138
    /**
139
     * "Invoke" magic method.
140
     *
141
     * Called when an object is invoked as if it were a function, e.g. `$taster()`.
142
     * This is simply an alias to the lick method.
143
     *
144
     * @return Flavor A flavor object
145
     * @throws TasterException
146
     */
147 20
    public function __invoke()
    {
        // Invoking the taster is shorthand for calling lick().
        $flavor = $this->lick();

        return $flavor;
    }
151
152
    /**
153
     * Examine the input source and determine what "Flavor" of CSV it contains.
154
     * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),
155
     * doesn't necessarily always conform to it. And it doesn't provide metadata
156
     * such as the delimiting character, quote character, or what types of data
157
     * are quoted.
158
     *
159
     * @return Flavor The metadata that the CSV format doesn't provide
160
     * @throws TasterException
161
     * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails
162
     * @todo Should there be a lickEscapeChar method? The python module that inspired
163
     *     this library doesn't include one...
164
     * @todo This should cache the results and only regenerate if $this->sample
165
     *     changes (or $this->input)
166
     */
167 21
    public function lick()
    {
        $lineTerminator = $this->lickLineEndings();

        try {
            // preferred route: infer both characters from quoted columns
            $quoteAndDelim = $this->lickQuoteAndDelim();
            $quoteChar = $quoteAndDelim[0];
            $delimiter = $quoteAndDelim[1];
        } catch (TasterException $e) {
            // only the "cannot determine quote and delim" failure is
            // recoverable here; anything else is passed on untouched
            if ($e->getCode() !== TasterException::ERR_QUOTE_AND_DELIM) {
                throw $e;
            }
            // fall back to the conventional quote character and work the
            // delimiter out from per-line character frequencies
            $quoteChar = '"';
            $delimiter = $this->lickDelimiter($lineTerminator);
        }

        $quoteStyle = $this->lickQuotingStyle($delimiter, $lineTerminator);
        $header = $this->lickHeader($delimiter, $lineTerminator);

        /**
         * @todo Should escapeChar be null? Because doubleQuote = true means this = null
         */
        return new Flavor([
            'quoteChar'      => $quoteChar,
            'escapeChar'     => '\\',
            'delimiter'      => $delimiter,
            'lineTerminator' => $lineTerminator,
            'quoteStyle'     => $quoteStyle,
            'header'         => $header,
        ]);
    }
185
186
    /**
187
     * Replaces all quoted columns with a blank string. I was using this method
188
     * to prevent explode() from incorrectly splitting at delimiters and newlines
189
     * within quotes when parsing a file. But this was before I wrote the
190
     * replaceQuotedSpecialChars method which (at least to me) makes more sense.
191
     *
192
     * @param string $data The string to replace quoted strings within
193
     * @return string The input string with quoted strings removed
194
     * @todo Replace code that uses this method with the replaceQuotedSpecialChars
195
     *     method instead. I think it's cleaner.
196
     */
197 21
    protected function removeQuotedStrings($data)
    {
        // A single- or double-quoted span, closed by the same quote that
        // opened it; a backslash before a character keeps that character
        // (including a quote) inside the span.
        $quotedSpan = '/(["\'])(?:(?=(\\\\?))\2.)*?\1/sm';

        // every such span, quotes included, is dropped from the text
        return preg_replace($quotedSpan, '', $data);
    }
201
202
    /**
203
     * Examine the input source to determine which character(s) are being used
204
     * as the end-of-line character
205
     *
206
     * @return string The end-of-line char for the input data
207
     * @credit pulled from stackoverflow thread *tips hat to username "Harm"*
208
     * @todo This should throw an exception if it cannot determine the line ending
209
     * @todo I probably will make this method protected when I'm done with testing...
210
     * @todo If there is any way for this method to fail (for instance if a file
211
     *       is totally empty or contains no line breaks), then it needs to throw
212
     *       a relevant TasterException
213
     * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings()
214
     */
215 21
    protected function lickLineEndings()
    {
        // line breaks inside quoted columns are data, not row terminators,
        // so quoted text is stripped before counting
        $haystack = $this->removeQuotedStrings($this->sample);

        // Order matters: CRLF is tested first, and a later candidate only
        // wins with a strictly higher count, so a pure-CRLF sample is not
        // reported as LF or CR.
        $candidates = [
            self::EOL_WINDOWS => "\r\n",  // 0x0D - 0x0A - Windows, DOS OS/2
            self::EOL_UNIX    => "\n",    // 0x0A -      - Unix, OSX
            self::EOL_TRS80   => "\r",    // 0x0D -      - Apple ][, TRS80
        ];

        // @todo This should return a default maybe?
        $winner = PHP_EOL;
        $highest = 0;
        foreach ($candidates as $sequence) {
            $occurrences = substr_count($haystack, $sequence);
            if ($occurrences <= $highest) {
                continue;
            }
            $highest = $occurrences;
            $winner = $sequence;
        }

        return $winner;
    }
235
236
    /**
237
     * The best way to determine quote and delimiter characters is when columns
238
     * are quoted, often you can seek out a pattern of delim, quote, stuff, quote, delim
239
     * but this only works if you have quoted columns. If you don't you have to
240
     * determine these characters some other way... (see lickDelimiter)
241
     *
242
     * @return array A two-element array containing quotechar, delimchar
243
     * @throws TasterException
244
     * @todo make protected
245
     * @todo This should throw an exception if it cannot determine the delimiter
246
     *     this way.
247
     * @todo This should check for any line endings not just \n
248
     */
249 21
    protected function lickQuoteAndDelim()
    {
        /**
         * @var array|null Matches produced by the most recently tried pattern
         */
        $matches = null;

        // a delimiter can be anything except line breaks, word characters
        // (alphanumerics and underscore), single/double quotes or a space
        $antidelims = implode(array("\r", "\n", "\w", preg_quote('"', '/'), preg_quote("'", '/'), preg_quote(chr(self::SPACE), '/')));
        $delim = '(?P<delim>[^' . $antidelims . '])';
        $quote = '(?P<quoteChar>"|\'|`)'; // @todo I think MS Excel uses some strange encoding for fancy open/close quotes

        /**
         * @var array Patterns (regex), tried in order until one matches
         */
        $patterns = [
            // ,"something", - delim, optional space, quote, anything, same quote, same delim
            '/' . $delim . ' ?' . $quote . '.*?\2\1/ms',
            // 'something', - start of line/line break, quote, anything, same quote, delim
            '/(?:^|\n)' . $quote . '.*?\1' . $delim . ' ?/ms',
            // ,'something' - delim, optional space, quote, anything, same quote, line break
            '/' . $delim . ' ?' . $quote . '.*?\2(?:^|\n)/ms',
            // 'something' - a quoted value alone on its line. The quote is
            // the only capture group here, so the back-reference must be \1;
            // \2 made the pattern fail to compile, which had to be hidden
            // with the error-suppression operator.
            '/(?:^|\n)' . $quote . '.*?\1(?:$|\n)/ms',
        ];

        foreach ($patterns as $pattern) {
            if (preg_match_all($pattern, $this->sample, $matches) && $matches) {
                break;
            }
        }

        // The quote-only pattern has no "delim" group, so its matches cannot
        // yield a delimiter; in that case fall through to the exception and
        // let the caller use its fallback, exactly as when nothing matched.
        if ($matches && isset($matches['delim'])) {
            // most frequently captured value of a named group
            $mostFrequent = function($group) use ($matches) {
                return collect($matches)
                    ->frequency()
                    ->get($group)
                    ->sort()
                    ->reverse()
                    ->getKeyAtPosition(0);
            };
            try {
                return [
                    $mostFrequent('quoteChar'),
                    $mostFrequent('delim'),
                ];
            } catch (OutOfBoundsException $e) {
                // eat this exception and let the taster exception below be thrown instead...
            }
        }
        throw new TasterException("quoteChar and delimiter cannot be determined", TasterException::ERR_QUOTE_AND_DELIM);
    }
294
295
     /**
296
      * Take a list of likely delimiter characters and find the one that occurs
297
      * the most consistent amount of times within the provided data.
298
      *
299
      * @param string $eol The character(s) used for newlines
300
      * @return string The most likely delimiter character
301
      * @see Flavor for possible quote style constants
302
      * @todo Refactor this method--It needs more thorough testing against a wider
303
      *     variety of CSV data to be sure it works reliably. And I'm sure there
304
      *     are many performance and logic improvements that could be made. This
305
      *     is essentially a first draft.
306
      * @todo Can't use replaceQuotedSpecialChars rather than removeQuotedStrings
307
      *     because the former requires u to know the delimiter
308
      */
309 6
    protected function lickDelimiter($eol = "\n")
    {
        $frequencies = [];
        $consistencies = [];

        // build a table of characters and their frequencies for each line. We
        // will use this frequency table to then build a table of frequencies of
        // each frequency (in 10 lines, "tab" occurred 5 times on 7 of those
        // lines, 6 times on 2 lines, and 7 times on 1 line)
        collect(explode($eol, $this->removeQuotedStrings($this->sample)))
            ->walk(function($line, $line_no) use (&$frequencies) {
                collect(str_split($line))
                    ->filter(function($c) { return collect($this->delims)->contains($c); })
                    ->frequency()
                    ->sort()
                    ->reverse()
                    ->walk(function($count, $char) use (&$frequencies, $line_no) {
                        $frequencies[$char][$line_no] = $count;
                    });
            })
            // the above only finds frequencies for characters if they exist in
            // a given line. This will go back and fill in zeroes where a char
            // didn't occur at all in a given line (needed to determine mode)
            ->walk(function($line, $line_no) use (&$frequencies) {
                collect($frequencies)
                    ->walk(function($counts, $char) use ($line_no, &$frequencies) {
                        if (!isset($frequencies[$char][$line_no])) {
                            $frequencies[$char][$line_no] = 0;
                        }
                    });
            });

        // now determine the mode for each char to decide the "expected" amount
        // of times a char (possible delim) will occur on each line...
        $freqs = collect($frequencies);
        $modes = $freqs->mode();
        $freqs->walk(function($f, $chr) use ($modes, &$consistencies) {
            collect($f)->walk(function($num) use ($modes, $chr, &$consistencies) {
                // a mode of zero is falsy, so characters that are absent from
                // most lines never become candidates
                if ($expected = $modes->get($chr)) {
                    if ($num == $expected) {
                        // met the goal, yay!
                        if (!isset($consistencies[$chr])) {
                            $consistencies[$chr] = 0;
                        }
                        $consistencies[$chr]++;
                    }
                }
            });
        });

        $delims = collect($consistencies);
        $max = $delims->max();
        $dups = $delims->duplicates();
        if ($dups->has($max, false)) {
            // More than one candidate shares the best score. Break the tie by
            // looking at where each character appears: a real delimiter is
            // spread across the line, whereas e.g. the dashes of a date
            // (04-23-1986) cluster in one area.
            $decision = $dups->get($max);
            try {
                return $this->guessDelimByDistribution($decision, $eol);
            } catch (TasterException $e) {
                // if somehow we STILL can't come to a consensus, then fall back to a
                // "preferred delimiters" list: the first preferred character that is
                // among the tied candidates wins. The character itself is returned
                // (array_search() would yield its index, and index 0 is falsy).
                foreach ($this->delims as $val) {
                    if (in_array($val, $decision, true)) {
                        return $val;
                    }
                }
            }
        }

        // No tie (or the tie could not be broken): pick the candidate whose
        // score equals the maximum. Looking it up directly avoids depending on
        // the collection's sort direction, which previously selected the entry
        // at the wrong end of the ordering.
        $best = array_search($max, $consistencies, true);
        if ($best !== false) {
            return $best;
        }

        // nothing could be looked up; defer to the collection as before
        return $delims
            ->sort()
            ->getKeyAtPosition(0);
    }
400
401
    /**
402
     * Compare positional consistency of several characters to determine the
403
     * probable delimiter character. The idea behind this is that the delimiter
404
     * character is likely more consistently distributed than false-positive
405
     * delimiter characters produced by lickDelimiter(). For instance, consider
406
     * a series of rows similar to the following:
407
     *
408
     * 1,luke,visinoni,[email protected],(530) 413-3076,04-23-1986
409
     *
410
     * The lickDelimiter() method will often not be able to determine whether the
411
     * delimiter is a comma or a dash because they occur the same number of times
412
     * on just about every line (5 for comma, 3 for dash). The difference is
413
     * obvious to you, no doubt. But us humans are pattern-recognition machines!
414
     * The difference between the comma and the dash are that the comma is dist-
415
     * ributed almost evenly throughout the line. The dash characters occur
416
     * entirely at the end of the line. This method accepts any number of possible
417
     * delimiter characters and returns the one that is distributed most evenly.
418
     *
419
     * If delim character cannot be determined by lickQuoteAndDelim(), taster
420
     * tries lickDelimiter(). When that method runs into a tie, it will use this
421
     * as a tie-breaker.
422
     *
423
     * @param array $delims Possible delimiter characters (method chooses from
424
     *     this array of characters)
425
     * @param string $eol The end-of-line character (or set of characters)
426
     * @return string The probable delimiter character
427
     * @throws TasterException
428
     */
429
    protected function guessDelimByDistribution(array $delims, $eol = "\n")
    {
        try {
            // @todo Write a method that does this...
            $lines = collect(explode($eol, $this->removeQuotedStrings($this->sample)));

            // For every candidate, score each line by the share of its
            // sections that contain the candidate, average those scores over
            // all lines, and return the candidate with the highest average.
            return $delims[collect($delims)->map(function($delim) use ($lines) {
                $linedist = collect();
                $lines->walk(function($line, $line_no) use (&$linedist, $delim) {
                    if (!strlen($line)) return;
                    $sectstot = 10;
                    // Lines shorter than $sectstot characters would give a
                    // section length of 0, which str_split() rejects (with an
                    // error on PHP 8 that the catch below does not handle),
                    // so never go below one character per section.
                    $sectlen = max(1, (int) (strlen($line) / $sectstot));
                    $sections = collect(str_split($line, $sectlen))
                        ->map(function($section) use($delim) {
                            return substr_count($section, $delim);
                        })
                        ->filter(function($count) { return (bool) $count; });
                    if (is_numeric($count = $sections->count())) {
                        $linedist->set($line_no, $count / $sectstot);
                    }
                });
                return $linedist;
            })->map(function($dists) {
                return $dists->average();
            })->sort()
              ->reverse()
              ->getKeyAtPosition(0)];
        } catch (Exception $e) {
            throw new TasterException("delimiter cannot be determined by distribution", TasterException::ERR_DELIMITER);
        }
    }
459
460
    /**
461
     * Determine the "style" of data quoting. The CSV format, while having an RFC
462
     * (https://tools.ietf.org/html/rfc4180), doesn't necessarily always conform
463
     * to it. And it doesn't provide metadata such as the delimiting character,
464
     * quote character, or what types of data are quoted. So this method makes a
465
     * logical guess by finding which columns have been quoted (if any) and
466
     * examining their data type. Most often, CSV files will only use quotes
467
     * around columns that contain special characters such as the delimiter,
468
     * the quoting character, newlines, etc. (we refer to this style as
469
     * QUOTE_MINIMAL), but some quote all columns that contain nonnumeric data
470
     * (QUOTE_NONNUMERIC). Then there are CSV files that quote all columns
471
     * (QUOTE_ALL) and those that quote none (QUOTE_NONE).
472
     *
473
     * @param string $delim The character used as the column delimiter
474
     * @param string $eol The character used for newlines
475
     * @return string One of four "QUOTING_" constants defined above--see this
476
     *     method's description for more info.
477
     * @todo Refactor this method--It needs more thorough testing against a wider
478
     *     variety of CSV data to be sure it works reliably. And I'm sure there
479
     *     are many performance and logic improvements that could be made. This
480
     *     is essentially a first draft.
481
     */
482 21
    protected function lickQuotingStyle($delim, $eol)
    {
        // every style starts out as a candidate; the sample rules some out
        $candidates = collect([
            Flavor::QUOTE_ALL => true,
            Flavor::QUOTE_NONE => true,
            Flavor::QUOTE_MINIMAL => true,
            Flavor::QUOTE_NONNUMERIC => true,
        ]);

        // delimiters and newlines inside quotes are swapped for placeholders
        // so that the plain explode() calls below split only on real ones
        $rows = collect(explode($eol, $this->replaceQuotedSpecialChars($this->sample, $delim)));
        $seen = collect()
            ->set('quoted', collect())
            ->set('unquoted', collect());

        // records the data type of one field under "quoted" or "unquoted"
        $inspectField = function($field) use ($candidates, $seen, $delim) {
            $field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field);
            if (!$this->isQuoted($field)) {
                $seen->get('unquoted')->push($this->lickDataType($field));
                // since we know there's at least one unquoted field,
                // QUOTE_ALL can be ruled out
                $candidates->set(Flavor::QUOTE_ALL, false);
                return;
            }
            $field = $this->unQuote($field);
            $seen->get('quoted')->push($this->lickDataType($field));
            // since we know there's at least one quoted field,
            // QUOTE_NONE can be ruled out
            $candidates->set(Flavor::QUOTE_NONE, false);
        };

        // walk through each line from the data sample to determine which
        // fields are quoted and which aren't
        $rows->walk(function($row) use ($inspectField, $eol, $delim) {
            $row = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $row);
            collect(explode($delim, $row))->walk($inspectField);
        });

        $quotedTypes = $seen->get('quoted')->unique();
        $candidates = $candidates->filter(function($val) { return (bool) $val; });

        // if QUOTE_ALL or QUOTE_NONE survived, there is nothing more to decide
        if ($candidates->has(Flavor::QUOTE_ALL)) {
            return Flavor::QUOTE_ALL;
        }
        if ($candidates->has(Flavor::QUOTE_NONE)) {
            return Flavor::QUOTE_NONE;
        }

        if (count($quotedTypes) == 1) {
            $style = $quotedTypes->getValueAtPosition(0);
            if ($candidates->has($style)) {
                return $style;
            }
            return Flavor::QUOTE_MINIMAL;
        }

        if (!$quotedTypes->contains(self::DATA_NONNUMERIC)) {
            return Flavor::QUOTE_MINIMAL;
        }

        // allow for a SMALL amount of error here
        $counts = collect([self::DATA_SPECIAL => 0, self::DATA_NONNUMERIC => 0]);
        $seen->get('quoted')->walk(function ($type) use (&$counts) {
            $counts->increment($type);
        });

        // @todo is all this even necessary? seems unnecessary to me...
        $most = $counts->max();
        if ($most) {
            $least = $counts->min();
            if ($least / $most < 1) {
                return Flavor::QUOTE_NONNUMERIC;
            }
        }

        return Flavor::QUOTE_MINIMAL;
    }
549
550
    /**
551
     * Remove quotes around a piece of text (if there are any)
552
     *
553
     * @param string $data The data to "unquote"
554
     * @return string The data passed in, only with quotes stripped (off the edges)
555
     */
556 21
    protected function unQuote($data)
    {
        // the same quote character at both ends; only what sits between
        // them is kept. Anything else comes back unchanged.
        $wrapped = '/^(["\'])(.*)\1$/';

        return preg_replace($wrapped, '\2', $data);
    }
560
561
    /**
562
     * Determine whether a particular string of data has quotes around it.
563
     *
564
     * @param string $data The data to check
565
     * @return boolean Whether the data is quoted or not
566
     */
567 21
    protected function isQuoted($data)
    {
        // The value must open and close with the same quote character.
        // A back-reference is not recognised inside a character class
        // ("[^\1]" merely excluded the byte 0x01), so the body is matched
        // with a plain ".*" in dot-all mode instead. The result is
        // normalised to a real boolean, as documented; a PCRE failure
        // (false) is reported as "not quoted".
        return preg_match('/^([\'"]).*\1$/s', $data) === 1;
    }
571
572
    /**
573
     * Determine what type of data is contained within a variable
574
     * Possible types:
575
     *     - nonnumeric - contains at least one non-digit character (and no special characters)
576
     *     - special - contains characters that could potentially need to be quoted (possible delimiter characters)
577
     *     - unknown - everything else
578
     * This method is really only used within the "lickQuotingStyle" method to
579
     * help determine whether a particular column has been quoted due to it being
580
     * nonnumeric or because it has some special character in it such as a delimiter
581
     * or newline or quote.
582
     *
583
     * @param string $data The data to determine the type of
584
     * @return string The type of data (one of the "DATA_" constants above)
585
     * @todo I could probably eliminate this method and use an anonymous function
586
     *     instead. It isn't used anywhere else and its name could be misleading.
587
     *     Especially since I also have a lickType method that is used within the
588
     *     lickHeader method.
589
     */
590 21
    protected function lickDataType($data)
    {
        // @todo make this check for only the quote and delim that are actually being used
        // that will make the guess more accurate

        // checked in order; the first pattern that matches decides the type
        $checks = [
            // quotes or a common delimiter character
            '/[\'",\t\|:;-]/' => self::DATA_SPECIAL,
            // anything that is not a digit
            '/[^0-9]/' => self::DATA_NONNUMERIC,
        ];

        foreach ($checks as $pattern => $type) {
            if (preg_match($pattern, $data)) {
                return $type;
            }
        }

        // digits only (or nothing at all)
        return self::DATA_UNKNOWN;
    }
601
602
    /**
603
     * Replace all instances of newlines and whatever character you specify (as
604
     * the delimiter) that are contained within quoted text. The replacements are
605
     * simply a special placeholder string. This is done so that I can use the
606
     * very unsmart "explode" function and not have to worry about it exploding
607
     * on delimiters or newlines within quotes. Once I have exploded, I typically
608
     * sub back in the real characters before doing anything else. Although
609
     * currently there is no dedicated method for doing so I just use str_replace
610
     *
611
     * @param string $data The string to do the replacements on
612
     * @param string $delim The delimiter character to replace
613
     * @return string The data with replacements performed
614
     * @todo I could probably pass in (maybe optionally) the newline character I
615
     *     want to replace as well. I'll do that if I need to.
616
     */
617 21
    protected function replaceQuotedSpecialChars($data, $delim)
    {
        // Receives the match array for one quoted span (index 0 is the whole
        // span, quotes included) and masks every CR/LF and every occurrence
        // of the delimiter inside it with the placeholder strings.
        $maskQuotedSpan = function($quoted) use ($delim) {
            $masked = preg_replace("/([\r\n])/", self::PLACEHOLDER_NEWLINE, $quoted[0]);
            return str_replace($delim, self::PLACEHOLDER_DELIM, $masked);
        };

        // Ungreedy match of anything between two identical quote characters
        // (single or double); the "s" modifier lets the span cross line breaks.
        return preg_replace_callback('/([\'"])(.*)\1/imsU', $maskQuotedSpan, $data);
    }
625
626
    /**
627
     * Determine the "type" of a particular string of data. Used for the lickHeader
628
     * method to assign a type to each column to try to determine whether the
629
     * first row is different than a consistent column type.
630
     *
631
     * @todo As I'm writing this method I'm beginning to realize how expensive
632
     * the lickHeader method is going to end up being since it has to apply all
633
     * these regexes (potentially) to every column. I may end up writing a much
634
     * simpler type-checking method than this if it proves to be too expensive
635
     * to be practical.
636
     *
637
     * @param string $data The string of data to check the type of
638
     * @return string One of the TYPE_ string constants above
639
     */
640 21
    protected function lickType($data)
    {
        // Cheap scalar checks first; each one returns as soon as it matches.
        if (preg_match('/^[+-]?[\d\.]+$/', $data)) {
            return self::TYPE_NUMBER;
        }
        if (preg_match('/^[+-]?[\d]+\.[\d]+$/', $data)) {
            // NOTE(review): every string this pattern accepts is also accepted
            // by the TYPE_NUMBER pattern above, so this branch is currently
            // shadowed. The order is left untouched on purpose because
            // lickHeader compares these type labels and reordering would
            // change its scoring.
            return self::TYPE_DOUBLE;
        }
        // The currency symbols are written as an alternation rather than a
        // character class: without the "u" modifier a class matches a single
        // byte, so multi-byte symbols could never match. The decimal part is
        // optional so that whole amounts such as "$100" are recognised too.
        if (preg_match('/^[+-]?(?:¥|£|€|\$)\d+(\.\d+)?$/', $data)) {
            return self::TYPE_CURRENCY;
        }
        if (preg_match('/^[a-zA-Z]+$/', $data)) {
            return self::TYPE_ALPHA;
        }

        // Building blocks for the date/time patterns used below.
        $year = '([01][0-9])?[0-9]{2}';
        $month = '([01]?[0-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
        $day = '[0-3]?[0-9]';
        $sep = '[\/\.\-]?';
        $time = '([0-2]?[0-9](:[0-5][0-9]){1,2}(am|pm)?|[01]?[0-9](am|pm))';
        // month-day-year, day-month-year or year-month-day
        $date = '(' . $month . $sep . $day . $sep . $year
            . '|' . $day . $sep . $month . $sep . $year
            . '|' . $year . $sep . $month . $sep . $day . ')';

        try {
            // DateTime throws when it cannot parse the string at all; that
            // case is handled in the catch block below.
            $dt = new DateTime($data);
            $dt->setTime(0, 0, 0);
            $now = new DateTime();
            $now->setTime(0, 0, 0);
            $diffDays = (int) $dt->diff($now)->format('%R%a');

            // A parsed value that lands on today's date is most likely a
            // bare time string.
            if ($diffDays === 0 && preg_match("/^{$time}$/i", $data)) {
                return self::TYPE_TIME;
            }
            if (preg_match("/^{$date}$/i", $data)) {
                return self::TYPE_DATE;
            }
            // The subject argument was missing from this call, which made
            // TYPE_DATETIME unreachable (and is an ArgumentCountError on
            // PHP 8, which the catch below would not intercept).
            if (preg_match("/^{$date} {$time}$/i", $data)) {
                return self::TYPE_DATETIME;
            }
        } catch (\Exception $e) {
            // Not parseable as a date: go on checking the remaining types.
            if (preg_match('/^\w+$/', $data)) {
                return self::TYPE_ALNUM;
            }
        }

        return self::TYPE_STRING;
    }
684
685
    /**
686
     * Examines the contents of the CSV data to make a determination of whether
687
     * or not it contains a header row. To make this determination, it creates
688
     * an array of each column's (in each row)'s data type and length and then
689
     * compares them. If all of the rows except the header look similar, it will
690
     * return true. This is only a guess though. There is no programmatic way to
691
     * determine 100% whether a CSV file has a header. The format does not
692
     * provide metadata such as that.
693
     *
694
     * @param string $delim The CSV data's delimiting char (can be a variety of chars but
695
     *     typically is either a comma or a tab, sometimes a pipe)
696
     * @param string $eol The CSV data's end-of-line char(s) (\n \r or \r\n)
697
     * @return boolean True if the data (most likely) contains a header row
698
     * @todo This method needs a total refactor. It's not necessary to loop twice
699
     *     You could get away with one loop and that would allow for me to do
700
     *     something like only examining enough rows to get to a particular
701
     *     "hasHeader" score (+-100 for instance) & then just return true|false
702
     * @todo Also, break out of the first loop after a certain (perhaps even a
703
     *     configurable) amount of lines (you only need to examine so much data
704
     *     to reliably make a determination and this is an expensive method)
705
     * @todo I could remove the need for quote, delim, and eol by "licking" the
706
     *     data sample provided in the first argument. Also, I could actually
707
     *     create a Reader object to read the data here.
708
     */
709 22
    public function lickHeader($delim, $eol)
    {
        // Per-field metadata, keyed by line number and then column position.
        $fieldInfo = collect();

        // Records the type and length of every field found on one line.
        $recordLine = function($line, $line_no) use (&$fieldInfo, $delim, $eol) {
            // Put real line endings back where they were masked inside quotes.
            $line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line);

            $recordField = function($field, $colpos) use (&$fieldInfo, $line_no, $delim) {
                // Put real delimiters back where they were masked inside quotes.
                $field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field);
                // @todo Need a Collection::setTableField($x, $y) method
                //       See notes in green binder about refactoring Collection
                if (!$fieldInfo->has($line_no)) {
                    $fieldInfo->set($line_no, collect());
                }
                $rowInfo = $fieldInfo->get($line_no);
                $rowInfo->set($colpos, [
                    'type' => $this->lickType($this->unQuote($field)),
                    'length' => strlen($field)
                ]);
            };

            $fields = collect(explode($delim, $line));
            $fields->walk($recordField->bindTo($this));
        };

        // Mask delimiters/newlines inside quotes so that a plain explode()
        // splits the sample on real line endings only.
        $maskedSample = $this->replaceQuotedSpecialChars($this->sample, $delim);
        $lines = collect(explode($eol, $maskedSample));
        $lines->walk($recordLine->bindTo($this));

        // The first line is the header candidate; every remaining field votes
        // for (+1) or against (-1) it being a header.
        $score = 0;
        $headerRow = $fieldInfo->shift();

        $scoreField = function($info, $col_no) use (&$score, $headerRow) {
            try {
                $headerInfo = $headerRow->get($col_no, null, true);
            } catch (OutOfBoundsException $e) {
                // The candidate header has no such column: this field casts no vote.
                return;
            }

            if ($headerInfo['type'] == self::TYPE_STRING) {
                // A plain-string header says nothing by its type, so compare lengths.
                $differsFromHeader = ($info['length'] != $headerInfo['length']);
            } else {
                // Otherwise compare data types.
                $differsFromHeader = ($info['type'] != $headerInfo['type']);
            }
            $score += $differsFromHeader ? 1 : -1;
        };

        $fieldInfo->walk(function($row) use ($scoreField) {
            $row->walk($scoreField);
        });

        return $score > 0;
    }
757
}
758