Passed
Branch refactor/164-removeoldcollecti... (61eb5d)
by Luke
02:41
created

Taster::lickQuoteAndDelim()   C

Complexity

Conditions 8
Paths 12

Size

Total Lines 40
Code Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 22
nc 12
nop 0
dl 0
loc 40
rs 5.3846
c 0
b 0
f 0
1
<?php
2
3
/*
4
 * CSVelte: Slender, elegant CSV for PHP
5
 * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV
6
 * standardization efforts, CSVelte was written in an effort to take all the
7
 * suck out of working with CSV.
8
 *
9
 * @version   {version}
10
 * @copyright Copyright (c) 2016 Luke Visinoni <[email protected]>
11
 * @author    Luke Visinoni <[email protected]>
12
 * @license   https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT)
13
 */
14
namespace CSVelte;
15
16
use CSVelte\Collection\AbstractCollection;
17
use CSVelte\Collection\Collection;
18
use CSVelte\Collection\MultiCollection;
19
use CSVelte\Collection\NumericCollection;
20
use CSVelte\Collection\TabularCollection;
21
use CSVelte\Contract\Streamable;
22
use CSVelte\Exception\TasterException;
23
24
use DateTime;
25
use Exception;
26
use OutOfBoundsException;
27
28
use function CSVelte\collect;
29
30
/**
31
 * CSVelte\Taster
32
 * Given CSV data, Taster will "taste" the data and provide its best guess at
33
 * its "flavor". In other words, this class inspects CSV data and attempts to
34
 * auto-detect various CSV attributes such as line endings, quote characters, etc..
35
 *
36
 * @package   CSVelte
37
 *
38
 * @copyright (c) 2016, Luke Visinoni <[email protected]>
39
 * @author    Luke Visinoni <[email protected]>
40
 *
41
 * @todo      There are a ton of improvements that could be made to this class.
42
 *            I'll do a refactor on this fella once I get at least one test
43
 *            passing for each of its public methods.
44
 * @todo      Should I have a lickEscapeChar method? The python version doesn't
45
 *            have one. But then why does it even bother including one in its
46
 *            flavor class?
47
 * @todo      Examine each of the public methods in this class and determine
48
 *            whether it makes sense to ask for the data as a param rather than
49
 *            just pulling it from source. I don't think it makes sense... it
50
 *            was just easier to write the methods that way during testing.
51
 * @todo      There are at least portions of this class that could use the
52
 *            Reader class rather than working directly with data.
53
 * @todo      Refactor all of the anonymous functions used as callbacks. Rather
54
 *            than passing $this all over, use $closure->bindTo() instead...
55
 *            Actually, write a method called getBoundClosure() or something...
56
 *            maybe even make it a trait I don't know yet. But here it would
57
 *            allow me to bind any anon function to $this and give me a certain
58
 *            set of commonly needed values ($delim, $eol, etc.)
59
 */
60
class Taster
61
{
62
    /**
63
     * End-of-line constants.
64
     */
65
    const EOL_UNIX    = 'lf';
66
    const EOL_TRS80   = 'cr';
67
    const EOL_WINDOWS = 'crlf';
68
69
    /**
70
     * ASCII character codes for "invisibles".
71
     */
72
    const HORIZONTAL_TAB  = 9;
73
    const LINE_FEED       = 10;
74
    const CARRIAGE_RETURN = 13;
75
    const SPACE           = 32;
76
77
    /**
78
     * Data types -- Used within the lickQuotingStyle method.
79
     */
80
    const DATA_NONNUMERIC = 'nonnumeric';
81
    const DATA_SPECIAL    = 'special';
82
    const DATA_UNKNOWN    = 'unknown';
83
84
    /**
85
     * Placeholder strings -- hold the place of newlines and delimiters contained
86
     * within quoted text so that the explode method doesn't split incorrectly.
87
     */
88
    const PLACEHOLDER_NEWLINE = '[__NEWLINE__]';
89
    const PLACEHOLDER_DELIM   = '[__DELIM__]';
90
91
    /**
92
     * Recommended data sample size.
93
     */
94
    const SAMPLE_SIZE = 2500;
95
96
    /**
97
     * Column data types -- used within the lickHeader method to determine
98
     * whether the first row contains different types of data than the rest of
99
     * the rows (and thus, is likely a header row).
100
     */
101
    // +-987
102
    const TYPE_NUMBER = 'number';
103
    // +-12.387
104
    const TYPE_DOUBLE = 'double';
105
    // I am a string. I can contain all kinds of stuff.
106
    const TYPE_STRING = 'string';
107
    // 2010-04-23 04:23:00
108
    const TYPE_DATETIME = 'datetime';
109
    // 10-Jul-15, 9/1/2007, April 1st, 2006, etc.
110
    const TYPE_DATE = 'date';
111
    // 10:00pm, 5pm, 13:08, etc.
112
    const TYPE_TIME = 'time';
113
    // $98.96, ¥12389, £6.08, €87.00
114
    const TYPE_CURRENCY = 'currency';
115
    // 12ab44m1n2_asdf
116
    const TYPE_ALNUM = 'alnum';
117
    // abababab
118
    const TYPE_ALPHA = 'alpha';
119
120
    /** @var Contract\Streamable The source of data to examine */
121
    protected $input;
122
123
    /** @var string Sample of CSV data to use for tasting (determining CSV flavor) */
124
    protected $sample;
125
126
    /** @var array|AbstractCollection Possible delimiter characters in (roughly) the order of likelihood; the constructor replaces the array with a collection */
127
    protected $delims = [',', "\t", ';', '|', ':', '-', '_', '#', '/', '\\', '$', '+', '=', '&', '@'];
128
129
    /**
130
     * Class constructor--accepts a CSV input source.
131
     *
132
     * @param Contract\Streamable The source of CSV data
133
     *
134
     * @throws TasterException
135
     *
136
     * @todo It may be a good idea to skip the first line or two for the sample
137
     *     so that the header line(s) don't throw things off (with the exception
138
     *     of lickHeader() obviously)
139
     */
140
    public function __construct(Streamable $input)
    {
        // Keep the stream itself; the methods visible here only taste the sample.
        $this->input = $input;

        // NOTE: the property is declared with a plain array, but from this
        // point on it holds whatever collect() returns (a collection object).
        $this->delims = collect($this->delims);

        // Ask the stream for SAMPLE_SIZE worth of data. Any falsy result
        // (false, null, empty string, "0") is treated as "nothing to taste".
        $this->sample = $input->read(self::SAMPLE_SIZE);
        if (!$this->sample) {
            throw new TasterException('Invalid input, cannot read sample.', TasterException::ERR_INVALID_SAMPLE);
        }
    }
148
149
    /**
150
     * "Invoke" magic method.
151
     *
152
     * Called when an object is invoked as if it were a function, e.g. $taster().
153
     * This is simply an alias to the lick method.
154
     *
155
     * @throws TasterException
156
     *
157
     * @return Flavor A flavor object
158
     */
159
    public function __invoke()
    {
        // Calling the object like a function is shorthand for lick().
        $flavor = $this->lick();

        return $flavor;
    }
163
164
    /**
165
     * Examine the input source and determine what "Flavor" of CSV it contains.
166
     * The CSV format, while having an RFC (https://tools.ietf.org/html/rfc4180),
167
     * doesn't necessarily always conform to it. And it doesn't provide metadata
167
     * such as the delimiting character, quote character, or what types of data
168
     * are quoted.
170
     *
171
     * @throws TasterException
172
     *
173
     * @return Flavor The metadata that the CSV format doesn't provide
174
     *
175
     * @todo Implement a lickQuote method for when lickQuoteAndDelim method fails
176
     * @todo Should there be a lickEscapeChar method? The Python module that inspired
177
     *     this library doesn't include one...
178
     * @todo This should cache the results and only regenerate if $this->sample
179
     *     changes (or $this->input)
180
     */
181
    public function lick()
    {
        // Line endings come first: the fallback delimiter detection and the
        // quoting-style/header checks below all need them.
        $lineTerminator = $this->lickLineEndings();

        try {
            // Preferred route: infer quote char and delimiter together from
            // quoted fields in the sample.
            list($quoteChar, $delimiter) = $this->lickQuoteAndDelim();
        } catch (TasterException $failure) {
            // Only the "could not determine quote and delimiter" failure is
            // recoverable here; every other TasterException propagates.
            if ($failure->getCode() !== TasterException::ERR_QUOTE_AND_DELIM) {
                throw $failure;
            }
            // Fall back to a double quote and a frequency-based delimiter guess.
            $quoteChar = '"';
            $delimiter = $this->lickDelimiter($lineTerminator);
        }

        /**
         * @todo Should this be null? Because doubleQuote = true means this = null
         */
        $escapeChar = '\\';

        $quoteStyle = $this->lickQuotingStyle($delimiter, $lineTerminator);
        $header     = $this->lickHeader($delimiter, $lineTerminator);

        return new Flavor([
            'quoteChar'      => $quoteChar,
            'escapeChar'     => $escapeChar,
            'delimiter'      => $delimiter,
            'lineTerminator' => $lineTerminator,
            'quoteStyle'     => $quoteStyle,
            'header'         => $header,
        ]);
    }
202
203
    /**
204
     * Examines the contents of the CSV data to make a determination of whether
205
     * or not it contains a header row. To make this determination, it creates
206
     * an array of each column's (in each row)'s data type and length and then
207
     * compares them. If all of the rows except the header look similar, it will
208
     * return true. This is only a guess though. There is no programmatic way to
209
     * determine 100% whether a CSV file has a header. The format does not
210
     * provide metadata such as that.
211
     *
212
     * @param string $delim The CSV data's delimiting char (can be a variety of chars but
213
     *                      typically is either a comma or a tab, sometimes a pipe)
214
     * @param string $eol   The CSV data's end-of-line char(s) (\n \r or \r\n)
215
     *
216
     * @return bool True if the data (most likely) contains a header row
217
     *
218
     * @todo This method needs a total refactor. It's not necessary to loop twice
219
     *     You could get away with one loop and that would allow for me to do
220
     *     something like only examining enough rows to get to a particular
221
     *     "hasHeader" score (+-100 for instance) & then just return true|false
222
     * @todo Also, break out of the first loop after a certain (perhaps even a
223
     *     configurable) amount of lines (you only need to examine so much data
224
     *     to reliably make a determination and this is an expensive method)
225
     * @todo I could remove the need for quote, delim, and eol by "licking" the
226
     *     data sample provided in the first argument. Also, I could actually
227
     *     create a Reader object to read the data here.
228
     */
229
    public function lickHeader($delim, $eol)
    {
        // Per-line, per-column metadata (value, type, length) for the sampled rows.
        $types = new TabularCollection();

        // Split the sample into lines. replaceQuotedSpecialChars() is not
        // visible here; presumably it swaps newlines/delimiters inside quoted
        // fields for the PLACEHOLDER_* strings that are restored below.
        $sampleLines = explode($eol, $this->replaceQuotedSpecialChars($this->sample, $delim));

        collect($sampleLines)->walk(function ($rawLine, $lineNo) use ($types, $delim, $eol) {
            // Only lines keyed 0, 1 and 2 are inspected.
            if ($lineNo > 2) {
                return;
            }

            // Put real newlines back before splitting the line into fields.
            $restoredLine = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $rawLine);

            collect(explode($delim, $restoredLine))->walk(function ($cell, $colNo) use ($types, $lineNo, $delim) {
                // Put real delimiters back now that the field has been isolated.
                $cell = str_replace(self::PLACEHOLDER_DELIM, $delim, $cell);

                // @todo TabularCollection should have a way to set a value using [row,column]
                try {
                    $row = $types->get($lineNo);
                } catch (OutOfBoundsException $e) {
                    // first field seen for this line
                    $row = [];
                }

                $row[$colNo] = [
                    'value'  => $cell,
                    'type'   => $this->lickType($this->unQuote($cell)),
                    'length' => strlen($cell),
                ];
                $types->set($lineNo, $row);
            });
        });

        // Each remaining cell casts a vote: +1 when it differs from the first
        // row's cell in the same column, -1 when it looks the same.
        $votes    = new NumericCollection();
        $firstRow = collect($types->shift());

        $types->walk(function (AbstractCollection $row) use ($votes, $firstRow) {
            $row->walk(function (AbstractCollection $meta, $colNo) use ($votes, $firstRow) {
                try {
                    $headerCell = collect($firstRow->get($colNo, null, true));

                    // String cells are compared by length, everything else by type.
                    $compareOn = ($meta->get('type') == self::TYPE_STRING) ? 'length' : 'type';

                    $votes->push($meta->get($compareOn) != $headerCell->get($compareOn) ? 1 : -1);
                } catch (OutOfBoundsException $e) {
                    // no matching column in the first row: this cell casts no vote
                    return;
                }
            });
        });

        // A positive total means the first row looks unlike the rows after it.
        return $votes->sum() > 0;
    }
295
296
    /**
297
     * Replaces all quoted columns with a blank string. I was using this method
298
     * to prevent explode() from incorrectly splitting at delimiters and newlines
299
     * within quotes when parsing a file. But this was before I wrote the
300
     * replaceQuotedSpecialChars method which (at least to me) makes more sense.
301
     *
302
     * @param string $data The string to replace quoted strings within
303
     *
304
     * @return string The input string with quoted strings removed
305
     *
306
     * @todo Replace code that uses this method with the replaceQuotedSpecialChars
307
     *     method instead. I think it's cleaner.
308
     */
309
    protected function removeQuotedStrings($data)
    {
        // One quoted run: an opening " or ', then (lazily) any characters,
        // each optionally preceded by a backslash, up to the same quote
        // character that opened the run. The "s" flag lets runs span lines.
        $quotedRun = '/(["\'])(?:(?=(\\\\?))\2.)*?\1/sm';

        return preg_replace($quotedRun, '', $data);
    }
313
314
    /**
315
     * Examine the input source to determine which character(s) are being used
316
     * as the end-of-line character.
317
     *
318
     * @return string The end-of-line char for the input data
319
     * @credit pulled from stackoverflow thread *tips hat to username "Harm"*
320
     *
321
     * @todo This should throw an exception if it cannot determine the line ending
322
     * @todo I probably will make this method protected when I'm done with testing...
323
     * @todo If there is any way for this method to fail (for instance if a file
324
     *       is totally empty or contains no line breaks), then it needs to throw
325
     *       a relevant TasterException
326
     * @todo Use replaceQuotedSpecialChars rather than removeQuotedStrings()
327
     */
328
    protected function lickLineEndings()
    {
        // Line breaks inside quoted fields are data, not row terminators.
        $unquoted = $this->removeQuotedStrings($this->sample);

        // Candidates in the order they are tried. "\r\n" goes first: every
        // "\r\n" also counts as one "\n" and one "\r", and because a later
        // candidate must be strictly more frequent to win, a tie keeps "\r\n".
        $candidates = [
            "\r\n", // 0x0D 0x0A - Windows, DOS, OS/2
            "\n",   // 0x0A      - Unix, OSX
            "\r",   // 0x0D      - Apple ][, TRS80
        ];

        // @todo This should return a default maybe?
        $winner      = PHP_EOL;
        $winnerCount = 0;
        foreach ($candidates as $candidate) {
            $occurrences = substr_count($unquoted, $candidate);
            if ($occurrences > $winnerCount) {
                $winnerCount = $occurrences;
                $winner      = $candidate;
            }
        }

        return $winner;
    }
349
350
    /**
351
     * The best way to determine quote and delimiter characters is when columns
352
     * are quoted, often you can seek out a pattern of delim, quote, stuff, quote, delim
353
     * but this only works if you have quoted columns. If you don't you have to
354
     * determine these characters some other way... (see lickDelimiter).
355
     *
356
     * @throws TasterException
357
     *
358
     * @return array A two-element array containing the quote character and the delimiter character, in that order
359
     *
360
     * @todo make protected
361
     * @todo This should throw an exception if it cannot determine the delimiter
362
     *     this way.
363
     * @todo This should check for any line endings not just \n
364
     */
365
    protected function lickQuoteAndDelim()
    {
        // A delimiter may be any single character except CR, LF, a "word"
        // character (\w: letters, digits, underscore), a double or single
        // quote, or a space.
        $antidelims = implode('', [
            "\r",
            "\n",
            '\w',
            preg_quote('"', '/'),
            preg_quote("'", '/'),
            preg_quote(chr(self::SPACE), '/'),
        ]);
        $delim = '(?P<delim>[^' . $antidelims . '])';
        $quote = '(?P<quoteChar>"|\'|`)'; // @todo I think MS Excel uses some strange encoding for fancy open/close quotes

        // Tried in order; the first pattern that matches anything wins.
        // Back-reference numbers depend on which named group comes first in
        // each pattern, so they differ from pattern to pattern.
        $patterns = [
            // ,"something",  - delim, optional space, quote, anything, same quote, same delim
            '/' . $delim . ' ?' . $quote . '.*?\2\1/ms',
            // "something",   - start of line, quote, anything, same quote, delim, optional space
            '/(?:^|\n)' . $quote . '.*?\1' . $delim . ' ?/ms',
            // ,"something"   - delim, optional space, quote, anything, same quote, END of line
            // (was "(?:^|\n)", which could never match at the end of the sample)
            '/' . $delim . ' ?' . $quote . '.*?\2(?:$|\n)/ms',
            // "something"    - a lone quoted field on its own line. The quote is
            // the only capture group here, so it must be referenced as \1; the
            // previous \2 pointed at a group that does not exist, which makes
            // PCRE refuse to compile the pattern.
            '/(?:^|\n)' . $quote . '.*?\1(?:$|\n)/ms',
        ];

        $matches = [];
        foreach ($patterns as $pattern) {
            // preg_match_all() yields the match count (0 = none) or false on
            // error; either way move on to the next pattern. No error
            // suppression is needed now that every pattern compiles.
            if (preg_match_all($pattern, $this->sample, $matches)) {
                break;
            }
        }

        // Both named groups must have captured something. The last pattern has
        // no "delim" group at all, so a match from it alone still ends up at
        // the exception below.
        if (!empty($matches['quoteChar']) && !empty($matches['delim'])) {
            try {
                // Pick the most frequently captured quote char and delimiter.
                return [
                    collect($matches['quoteChar'])->frequency()->sort()->reverse()->getKeyAtPosition(0),
                    collect($matches['delim'])->frequency()->sort()->reverse()->getKeyAtPosition(0),
                ];
            } catch (OutOfBoundsException $e) {
                // eat this exception and let the taster exception below be thrown instead...
            }
        }

        throw new TasterException('quoteChar and delimiter cannot be determined', TasterException::ERR_QUOTE_AND_DELIM);
    }
405
406
    /**
407
     * Take a list of likely delimiter characters and find the one that occurs
408
     * the most consistent amount of times within the provided data.
409
     *
410
     * @param string $eol The character(s) used for newlines
411
     *
412
     * @return string The character judged most likely to be the delimiter
413
     *
414
     * @see Flavor for possible quote style constants
415
     *
416
     * @todo Refactor this method--It needs more thorough testing against a wider
417
     *     variety of CSV data to be sure it works reliably. And I'm sure there
418
     *     are many performance and logic improvements that could be made. This
419
     *     is essentially a first draft.
420
     * @todo Can't use replaceQuotedSpecialChars rather than removeQuotedStrings
421
     *     because the former requires u to know the delimiter
422
     */
423
    protected function lickDelimiter($eol = "\n")
    {
        // candidate char => [line number => occurrences of the char on that line]
        $frequencies   = collect();
        // candidate char => number of lines on which it occurred its modal number of times
        $consistencies = new NumericCollection();

        // build a table of characters and their frequencies for each line. We
        // will use this frequency table to then build a table of frequencies of
        // each frequency (in 10 lines, "tab" occurred 5 times on 7 of those
        // lines, 6 times on 2 lines, and 7 times on 1 line)
        collect(explode($eol, $this->removeQuotedStrings($this->sample)))
            ->walk(function ($line, $line_no) use ($frequencies) {
                collect(str_split($line))
                    ->filter(function ($c) {
                        // only characters from the candidate delimiter list count
                        return collect($this->delims)->contains($c);
                    })
                    ->frequency()
                    ->sort()
                    ->reverse()
                    ->walk(function ($count, $char) use ($frequencies, $line_no) {
                        try {
                            $char_counts = $frequencies->get($char, null, true);
                        } catch (OutOfBoundsException $e) {
                            $char_counts = [];
                        }
                        $char_counts[$line_no] = $count;
                        $frequencies->set($char, $char_counts);
                    });
            })
            // the above only finds frequencies for characters if they exist in
            // a given line. This will go back and fill in zeroes where a char
            // didn't occur at all in a given line (needed to determine mode)
            ->walk(function ($line, $line_no) use ($frequencies) {
                $frequencies->walk(function ($counts, $char) use ($line_no, $frequencies) {
                    try {
                        $char_counts = $frequencies->get($char, null, true);
                    } catch (OutOfBoundsException $e) {
                        $char_counts = [];
                    }
                    if (!array_key_exists($line_no, $char_counts)) {
                        $char_counts[$line_no] = 0;
                    }
                    $frequencies->set($char, $char_counts);
                });
            });

        // now determine the mode for each char to decide the "expected" amount
        // of times a char (possible delim) will occur on each line...
        $modes = new NumericCollection([]);
        foreach ($frequencies as $char => $freq) {
            // NOTE(review): static analysis reports that collect() is only
            // typed as returning an AbstractCollection while mode() exists on
            // NumericCollection only - confirm collect() hands back a
            // NumericCollection for these per-line counts.
            $modes->set($char, collect($freq)->mode());
        }

        // count, per char, the lines on which it occurred exactly the expected
        // number of times. A mode of 0 is falsy and is never counted.
        $frequencies->walk(function ($f, $chr) use ($modes, $consistencies) {
            collect($f)->walk(function ($num) use ($modes, $chr, $consistencies) {
                $expected = $modes->get($chr);
                if ($expected && $num == $expected) {
                    // met the goal, yay!
                    $consistencies->set($chr, $consistencies->get($chr, 0) + 1);
                }
            });
        });

        $delims = $consistencies;
        $max    = $delims->max();
        $dups   = $delims->duplicates();
        if ($dups->has($max)) {
            // if more than one candidate, then look at where the character appeared
            // in the data. Was it relatively evenly distributed or was there a
            // specific area that the character tended to appear? Dates will have a
            // consistent format (e.g. 04-23-1986) and so may easily provide a false
            // positive for delimiter. But the dash will be focused in that one area,
            // whereas the comma character is spread out. You can determine this by
            // finding out the number of chars between each occurrence and getting
            // the average. If the average is wildly different than any given distance
            // then bingo you probably aren't working with a delimiter there...

            // another option to find the delimiter if there is a tie, is to build
            // a table of character position within each line. Then use that to
            // determine if one character is consistently in the same position or
            // at least the same general area. Use the delimiter that is the most
            // consistent in that way...

            /**
             * @todo Add a method here to figure out where duplicate best-match
             *     delimiter(s) fall within each line and then, depending on
             *     which one has the best distribution, return that one.
             */
            $decision = $dups->get($max);
            try {
                return $this->guessDelimByDistribution($decision, $eol);
            } catch (TasterException $e) {
                // if somehow we STILL can't come to a consensus, then fall back to a
                // "preferred delimiters" list: the first preferred character that
                // is among the tied candidates wins. The character itself is
                // returned; the previous array_search() version returned the
                // candidate's array key and skipped a match stored at key 0.
                foreach ($this->delims as $preferred) {
                    if (in_array($preferred, $decision, true)) {
                        return $preferred;
                    }
                }
            }
        }

        // no tie (or the tie could not be broken): take the most consistent char
        return $delims
            ->sort()
            ->reverse()
            ->getKeyAtPosition(0);
    }
531
532
    /**
533
     * Compare positional consistency of several characters to determine the
534
     * probable delimiter character. The idea behind this is that the delimiter
535
     * character is likely more consistently distributed than false-positive
536
     * delimiter characters produced by lickDelimiter(). For instance, consider
537
     * a series of rows similar to the following:
538
     *
539
     * 1,luke,visinoni,[email protected],(530) 413-3076,04-23-1986
540
     *
541
     * The lickDelimiter() method will often not be able to determine whether the
542
     * delimiter is a comma or a dash because they occur the same number of times
543
     * on just about every line (5 for comma, 3 for dash). The difference is
544
     * obvious to you, no doubt. But us humans are pattern-recognition machines!
545
     * The difference between the comma and the dash are that the comma is dist-
546
     * ributed almost evenly throughout the line. The dash characters occur
547
     * entirely at the end of the line. This method accepts any number of possible
548
     * delimiter characters and returns the one that is distributed most evenly.
549
     *
550
     * If delim character cannot be determined by lickQuoteAndDelim(), taster
551
     * tries lickDelimiter(). When that method runs into a tie, it will use this
552
     * as a tie-breaker.
553
     *
554
     * @param array  $delims Possible delimiter characters (method chooses from
555
     *                       this array of characters)
556
     * @param string $eol    The end-of-line character (or set of characters)
557
     *
558
     * @throws TasterException
559
     *
560
     * @return string The probable delimiter character
561
     */
562
    protected function guessDelimByDistribution(array $delims, $eol = "\n")
    {
        try {
            // @todo Write a method that does this...
            $lines = collect(explode($eol, $this->removeQuotedStrings($this->sample)));

            // For every candidate, work out (per line) which fraction of the
            // line's sections contain it, average that over all lines, and
            // return the candidate with the highest average.
            $bestKey = collect($delims)
                ->map(function ($delim) use ($lines) {
                    $linedist = collect();
                    $lines->walk(function ($line, $line_no) use ($linedist, $delim) {
                        if (!strlen($line)) {
                            // empty lines say nothing about distribution
                            return;
                        }
                        $sectstot = 10;
                        // Lines shorter than $sectstot characters would give a
                        // section length of 0, which str_split() rejects (a
                        // warning and false before PHP 8, a ValueError since -
                        // and that is not an Exception, so the catch below
                        // would not see it). Clamp to one character per section.
                        $sectlen  = max(1, (int) (strlen($line) / $sectstot));
                        $sections = collect(str_split($line, $sectlen))
                            ->map(function ($section) use ($delim) {
                                return substr_count($section, $delim);
                            })
                            ->filter(function ($count) {
                                // keep only sections that contain the candidate
                                return (bool) $count;
                            });
                        if (is_numeric($count = $sections->count())) {
                            $linedist->set($line_no, $count / $sectstot);
                        }
                    });

                    return $linedist;
                })
                ->map(function ($dists) {
                    return $dists->average();
                })
                ->sort()
                ->reverse()
                ->getKeyAtPosition(0);

            return $delims[$bestKey];
        } catch (Exception $e) {
            // any failure along the way is reported as "cannot decide"
            throw new TasterException('delimiter cannot be determined by distribution', TasterException::ERR_DELIMITER);
        }
    }
598
599
    /**
600
     * Determine the "style" of data quoting. The CSV format, while having an RFC
601
     * (https://tools.ietf.org/html/rfc4180), doesn't necessarily always conform
602
     * to it. And it doesn't provide metadata such as the delimiting character,
603
     * quote character, or what types of data are quoted. So this method makes a
604
     * logical guess by finding which columns have been quoted (if any) and
605
     * examining their data type. Most often, CSV files will only use quotes
606
     * around columns that contain special characters such as the delimiter,
607
     * the quoting character, newlines, etc. (we refer to this style as
608
     * QUOTE_MINIMAL), but some quote all columns that contain nonnumeric data
609
     * (QUOTE_NONNUMERIC). Then there are CSV files that quote all columns
610
     * (QUOTE_ALL) and those that quote none (QUOTE_NONE).
611
     *
612
     * @param string $delim The character used as the column delimiter
613
     * @param string $eol   The character used for newlines
614
     *
615
     * @return string One of four "QUOTING_" constants defined above--see this
616
     *                method's description for more info.
617
     *
618
     * @todo Refactor this method--It needs more thorough testing against a wider
619
     *     variety of CSV data to be sure it works reliably. And I'm sure there
620
     *     are many performance and logic improvements that could be made. This
621
     *     is essentially a first draft.
622
     */
623
    protected function lickQuotingStyle($delim, $eol)
    {
        // Every style starts out as a candidate; evidence gathered from the
        // sample knocks candidates out by flipping their flag to false.
        $candidates = collect([
            Flavor::QUOTE_ALL        => true,
            Flavor::QUOTE_NONE       => true,
            Flavor::QUOTE_MINIMAL    => true,
            Flavor::QUOTE_NONNUMERIC => true,
        ]);

        $lines = collect(explode($eol, $this->replaceQuotedSpecialChars($this->sample, $delim)));
        $freq  = collect()
            ->set('quoted', collect())
            ->set('unquoted', collect());

        // Records the data type of a single field under "quoted" or
        // "unquoted" and rules out the style that field contradicts.
        $classifyField = function ($field) use ($candidates, $freq, $delim) {
            $field = str_replace(self::PLACEHOLDER_DELIM, $delim, $field);
            if (!$this->isQuoted($field)) {
                $freq->get('unquoted')->push($this->lickDataType($field));
                // at least one unquoted field means it can't be QUOTE_ALL
                $candidates->set(Flavor::QUOTE_ALL, false);

                return;
            }
            $field = $this->unQuote($field);
            $freq->get('quoted')->push($this->lickDataType($field));
            // at least one quoted field means it can't be QUOTE_NONE
            $candidates->set(Flavor::QUOTE_NONE, false);
        };

        // Restores the real newlines inside a line, splits it into fields and
        // classifies each of them.
        $classifyLine = function ($line) use ($classifyField, $eol, $delim) {
            $line = str_replace(self::PLACEHOLDER_NEWLINE, $eol, $line);
            collect(explode($delim, $line))->walk($classifyField);
        };
        $lines->walk($classifyLine);

        $types      = $freq->get('quoted')->unique();
        $candidates = $candidates->filter(function ($stillPossible) {
            return (bool) $stillPossible;
        });

        // QUOTE_ALL / QUOTE_NONE surviving the walk is conclusive on its own
        if ($candidates->has(Flavor::QUOTE_ALL)) {
            return Flavor::QUOTE_ALL;
        }
        if ($candidates->has(Flavor::QUOTE_NONE)) {
            return Flavor::QUOTE_NONE;
        }

        if (count($types) == 1) {
            // only one kind of data was ever quoted
            $style = $types->getValueAtPosition(0);
            if ($candidates->has($style)) {
                return $style;
            }
        } elseif ($types->contains(self::DATA_NONNUMERIC)) {
            // allow for a SMALL amount of error here
            $counts = collect([self::DATA_SPECIAL => 0, self::DATA_NONNUMERIC => 0]);
            $freq->get('quoted')->walk(function ($type) use ($counts) {
                $counts->increment($type);
            });
            // @todo is all this even necessary? seems unnecessary to me...
            $most = $counts->max();
            if ($most) {
                $least = $counts->min();
                if ($least / $most < 1) {
                    return Flavor::QUOTE_NONNUMERIC;
                }
            }
        }

        return Flavor::QUOTE_MINIMAL;
    }
698
699
    /**
700
     * Remove quotes around a piece of text (if there are any).
701
     *
702
     * @param string $data The data to "unquote"
703
     *
704
     * @return string The data passed in, only with quotes stripped (off the edges)
705
     */
706
    protected function unQuote($data)
    {
        // Strips one pair of matching single or double quotes from the edges.
        // The "s" modifier lets "." span line breaks, so a quoted value that
        // contains embedded newlines is unwrapped as well; without it such
        // values were returned with their quotes still attached.
        return preg_replace('/^(["\'])(.*)\1$/s', '\2', $data);
    }
710
711
    /**
712
     * Determine whether a particular string of data has quotes around it.
713
     *
714
     * @param string $data The data to check
715
     *
716
     * @return bool Whether the data is quoted or not
717
     */
718
    protected function isQuoted($data)
    {
        // The value counts as quoted when it starts and ends with the same
        // quote character (single or double). Inside a character class "\1"
        // is an octal escape, not a backreference, so the former "[^\1]*" was
        // really "anything but byte 0x01"; ".*" with the "s" modifier states
        // the intent (any content, newlines included) directly.
        // preg_match() yields 1/0/false, hence the strict comparison to
        // return the boolean the docblock promises.
        return preg_match('/^([\'"]).*\1$/s', $data) === 1;
    }
722
723
    /**
724
     * Determine what type of data is contained within a variable
725
     * Possible types:
726
     *     - nonnumeric - contains at least one character that is not a digit (and no special characters)
727
     *     - special - contains characters that could potentially need to be quoted (possible delimiter characters)
728
     *     - unknown - everything else (digits only, or an empty string)
729
     * This method is really only used within the "lickQuotingStyle" method to
730
     * help determine whether a particular column has been quoted due to it being
731
     * nonnumeric or because it has some special character in it such as a delimiter
732
     * or newline or quote.
733
     *
734
     * @param string $data The data to determine the type of
735
     *
736
     * @return string The type of data (one of the "DATA_" constants above)
737
     *
738
     * @todo I could probably eliminate this method and use an anonymous function
739
     *     instead. It isn't used anywhere else and its name could be misleading.
740
     *     Especially since I also have a lickType method that is used within the
741
     *     lickHeader method.
742
     */
743
    protected function lickDataType($data)
    {
        // @todo make this check for only the quote and delim that are actually being used
        // that will make the guess more accurate

        // Any quote or potential delimiter character makes the value "special"
        $hasSpecialChar = preg_match('/[\'",\t\|:;-]/', $data);
        if ($hasSpecialChar) {
            return self::DATA_SPECIAL;
        }

        // Otherwise a single non-digit character is enough to be "nonnumeric"
        $hasNonDigit = preg_match('/[^0-9]/', $data);
        if ($hasNonDigit) {
            return self::DATA_NONNUMERIC;
        }

        // Digits only (or nothing at all)
        return self::DATA_UNKNOWN;
    }
755
756
    /**
757
     * Replace all instances of newlines and whatever character you specify (as
758
     * the delimiter) that are contained within quoted text. The replacements are
759
     * simply a special placeholder string. This is done so that I can use the
760
     * very unsmart "explode" function and not have to worry about it exploding
761
     * on delimiters or newlines within quotes. Once I have exploded, I typically
762
     * sub back in the real characters before doing anything else. Although
763
     * currently there is no dedicated method for doing so I just use str_replace.
764
     *
765
     * @param string $data  The string to do the replacements on
766
     * @param string $delim The delimiter character to replace
767
     *
768
     * @return string The data with replacements performed
769
     *
770
     * @todo I could probably pass in (maybe optionally) the newline character I
771
     *     want to replace as well. I'll do that if I need to.
772
     */
773
    protected function replaceQuotedSpecialChars($data, $delim)
    {
        // The pattern lazily ("U") matches each run of text enclosed in a pair
        // of identical quote characters, across line breaks ("s").
        return preg_replace_callback('/([\'"])(.*)\1/imsU', function ($matches) use ($delim) {
            // A CRLF pair is ONE newline: match it as a unit (before the lone
            // CR / LF alternatives) so it becomes a single placeholder rather
            // than two, which would come back as two line breaks when the
            // placeholder is later swapped for the real line ending.
            $ret = preg_replace("/\r\n|\r|\n/", self::PLACEHOLDER_NEWLINE, $matches[0]);

            return str_replace($delim, self::PLACEHOLDER_DELIM, $ret);
        }, $data);
    }
782
783
    /**
784
     * Determine the "type" of a particular string of data. Used for the lickHeader
785
     * method to assign a type to each column to try to determine whether the
786
     * first row is different than a consistent column type.
787
     *
788
     * @todo As I'm writing this method I'm beginning to realize how expensive
789
     * the lickHeader method is going to end up being since it has to apply all
790
     * these regexes (potentially) to every column. I may end up writing a much
791
     * simpler type-checking method than this if it proves to be too expensive
792
     * to be practical.
793
     *
794
     * @param string $data The string of data to check the type of
795
     *
796
     * @return string One of the TYPE_ string constants above
797
     */
798
    protected function lickType($data)
    {
        // NOTE(review): this first pattern also accepts decimals such as
        // "1.5", so the TYPE_DOUBLE check below is effectively shadowed. It is
        // left untouched here so that results for existing callers don't shift.
        if (preg_match('/^[+-]?[\d\.]+$/', $data)) {
            return self::TYPE_NUMBER;
        }
        if (preg_match('/^[+-]?[\d]+\.[\d]+$/', $data)) {
            return self::TYPE_DOUBLE;
        }
        // The "u" modifier makes PCRE treat the multi-byte symbols in the
        // class as whole characters; without it only "$" could ever match.
        if (preg_match('/^[+-]?[¥£€$]\d+(\.\d+)$/u', $data)) {
            return self::TYPE_CURRENCY;
        }
        if (preg_match('/^[a-zA-Z]+$/', $data)) {
            return self::TYPE_ALPHA;
        }

        // Building blocks for the date / time patterns used below
        $year  = '([01][0-9])?[0-9]{2}';
        $month = '([01]?[0-9]|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)';
        $day   = '[0-3]?[0-9]';
        $sep   = '[\/\.\-]?';
        $time  = '([0-2]?[0-9](:[0-5][0-9]){1,2}(am|pm)?|[01]?[0-9](am|pm))';
        $date  = '(' . $month . $sep . $day . $sep . $year . '|' . $day . $sep . $month . $sep . $year . '|' . $year . $sep . $month . $sep . $day . ')';

        try {
            // DateTime throws when it cannot make sense of the string at all
            $dt = new DateTime($data);
            $dt->setTime(0, 0, 0);
            $now = new DateTime();
            $now->setTime(0, 0, 0);
            $diffDays = (int) $dt->diff($now)->format('%R%a');

            // A value that resolves to today's date is most likely a bare
            // time string
            if ($diffDays === 0 && preg_match("/^{$time}$/i", $data)) {
                return self::TYPE_TIME;
            }
            if (preg_match("/^{$date}$/i", $data)) {
                return self::TYPE_DATE;
            }
            // The subject argument was missing from this call, so the
            // date-time check could never succeed
            if (preg_match("/^{$date} {$time}$/i", $data)) {
                return self::TYPE_DATETIME;
            }
        } catch (\Exception $e) {
            // not a date/time: go on checking the remaining types
            if (preg_match('/^\w+$/', $data)) {
                return self::TYPE_ALNUM;
            }
        }

        return self::TYPE_STRING;
    }
842
}
843