Passed
Pull Request — master (#183)
by Luke
03:12
created

Sniffer   A

Complexity

Total Complexity 13

Size/Duplication

Total Lines 160
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 11

Test Coverage

Coverage 31.37%

Importance

Changes 0
Metric Value
dl 0
loc 160
ccs 16
cts 51
cp 0.3137
rs 10
c 0
b 0
f 0
wmc 13
lcom 1
cbo 11

9 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 7 2
A setPossibleDelimiters() 0 11 1
A getPossibleDelimiters() 0 4 1
A sniff() 0 23 3
A sniffLineTerminator() 0 5 1
A sniffQuoteAndDelim() 0 5 1
A sniffDelimiter() 0 12 2
A sniffQuotingStyle() 0 5 1
A sniffHasHeader() 0 5 1
1
<?php
2
/**
3
 * CSVelte: Slender, elegant CSV for PHP
4
 *
5
 * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV
6
 * standardization efforts, CSVelte was written in an effort to take all the
7
 * suck out of working with CSV.
8
 *
9
 * @copyright Copyright (c) 2018 Luke Visinoni
10
 * @author    Luke Visinoni <[email protected]>
11
 * @license   See LICENSE file (MIT license)
12
 */
13
namespace CSVelte;
14
15
use CSVelte\Contract\Streamable;
16
17
use CSVelte\Exception\SnifferException;
18
use CSVelte\Sniffer\SniffDelimiterByConsistency;
19
use CSVelte\Sniffer\SniffDelimiterByDistribution;
20
use CSVelte\Sniffer\SniffHeaderByDataType;
21
use CSVelte\Sniffer\SniffLineTerminatorByCount;
22
use CSVelte\Sniffer\SniffQuoteAndDelimByAdjacency;
23
use CSVelte\Sniffer\SniffQuoteStyle;
24
use Noz\Collection\Collection;
25
use function Noz\to_array;
26
use RuntimeException;
27
28
use function Noz\collect;
29
use function Stringy\create as s;
30
31
class Sniffer
32
{
33
    /** CSV data sample size - sniffer will use this many bytes to make its determinations */
34
    const SAMPLE_SIZE = 2500;
35
36
    /**
37
     * ASCII character codes for "invisibles".
38
     */
39
    const HORIZONTAL_TAB  = 9;
40
    const LINE_FEED       = 10;
41
    const CARRIAGE_RETURN = 13;
42
    const SPACE           = 32;
43
44
    /**
45
     * @var array A list of possible delimiters to check for (in order of preference)
46
     */
47
    protected $delims = [',', "\t", ';', '|', ':', '-', '_', '#', '/', '\\', '$', '+', '=', '&', '@'];
48
49
    /**
50
     * @var Streamable A stream of the sample data
51
     */
52
    protected $stream;
53
54
    /**
     * Create a sniffer for the given sample stream.
     *
     * @param Streamable $stream The data to sniff
     * @param array|null $delims Candidate delimiter characters in order of preference;
     *                           when null, the default list in $delims is left untouched
     */
    public function __construct(Streamable $stream, $delims = null)
    {
        $this->stream = $stream;
        if ($delims === null) {
            return;
        }
        $this->setPossibleDelimiters($delims);
    }
67
68
    /**
     * Replace the list of candidate delimiter characters.
     *
     * Only entries that are exactly one character long are kept; the surviving
     * entries are re-indexed before being stored.
     *
     * @param array $delims A list of possible delimiter characters
     *
     * @return self
     */
    public function setPossibleDelimiters(array $delims)
    {
        // A delimiter candidate is only usable when it is a single character
        $isSingleChar = function($candidate) {
            return s($candidate)->length() === 1;
        };
        $singleChars = collect($delims)->filter($isSingleChar)->values();
        $this->delims = $singleChars->toArray();

        return $this;
    }
86
87
    /**
     * Return the candidate delimiter characters currently configured on this sniffer.
     *
     * @return array The stored list, in order of preference
     */
    public function getPossibleDelimiters()
    {
        return $this->delims;
    }
96
97
    /**
     * Sniff CSV data (determine its dialect)
     *
     * Since CSV is less a format than a collection of similar formats, you can never be certain how a particular CSV
     * file is formatted. This method inspects CSV data and returns its "dialect", an object that can be passed to
     * either a `CSVelte\Reader` or `CSVelte\Writer` object to tell it what "dialect" of CSV to use.
     *
     * Up to SAMPLE_SIZE bytes are read from the stream and every determination is made from that sample alone.
     *
     * @todo look into which other Dialect attributes you can sniff for
     *
     * @throws RuntimeException If the sample could not be read from the stream
     * @throws SnifferException Rethrown when quote/delimiter sniffing fails with a code other than
     *                          SnifferException::ERR_QUOTE_AND_DELIM
     *
     * @return Dialect
     */
    public function sniff()
    {
        $sample = $this->stream->read(static::SAMPLE_SIZE);
        // read() may signal failure by returning false, whereas every sniffer below expects
        // a string; fail loudly here rather than sniffing a boolean.
        if ($sample === false) {
            throw new RuntimeException('Unable to read sample data from stream.');
        }

        $lineTerminator = $this->sniffLineTerminator($sample);
        try {
            list($quoteChar, $delimiter) = $this->sniffQuoteAndDelim($sample, $lineTerminator);
        } catch (SnifferException $e) {
            // Only the "could not determine quote and delimiter" failure has a fallback
            if ($e->getCode() !== SnifferException::ERR_QUOTE_AND_DELIM) {
                throw $e;
            }
            // Fall back to the default quote character and sniff the delimiter on its own
            $quoteChar = '"';
            $delimiter = $this->sniffDelimiter($sample, $lineTerminator);
        }
        /**
         * @todo Should this be null? Because doubleQuote = true means this = null
         */
        $escapeChar = '\\';
        $quoteStyle = $this->sniffQuotingStyle($sample, $delimiter, $lineTerminator);
        $header     = $this->sniffHasHeader($sample, $delimiter, $lineTerminator);
        $encoding   = s($sample)->getEncoding();

        return new Dialect(compact('quoteChar', 'escapeChar', 'delimiter', 'lineTerminator', 'quoteStyle', 'header', 'encoding'));
    }
131
132
    /**
     * Determine the line terminator used in the sample data.
     *
     * Delegates to SniffLineTerminatorByCount.
     *
     * @param string $data The sample data
     *
     * @return string
     */
    protected function sniffLineTerminator($data)
    {
        return (new SniffLineTerminatorByCount())->sniff($data);
    }
144
145
    /**
     * Sniff quote and delimiter chars
     *
     * When columns are quoted, the quote and delimiter characters can often be
     * found together by looking for the pattern delim, quote, stuff, quote, delim.
     * That only works if the data has quoted columns; when it does not, the
     * delimiter has to be determined some other way (see sniffDelimiter).
     *
     * Delegates to SniffQuoteAndDelimByAdjacency.
     *
     * @throws SnifferException
     *
     * @param string $data The data to analyze
     * @param string $lineTerminator The line terminator char/sequence
     *
     * @return array A two-element array containing quotechar, delimchar
     */
    protected function sniffQuoteAndDelim($data, $lineTerminator)
    {
        $options = ['lineTerminator' => $lineTerminator];

        return (new SniffQuoteAndDelimByAdjacency($options))->sniff($data);
    }
165
166
    /**
     * Sniff the delimiter character on its own.
     *
     * The configured candidate delimiters are first run through
     * SniffDelimiterByConsistency. If that yields more than one winner, the
     * winners are passed to SniffDelimiterByDistribution to break the tie.
     *
     * @param string $data The data to analyze
     * @param string $lineTerminator The line terminator char/sequence
     *
     * @return mixed The chosen delimiter; presumably a single-character string (confirm against the
     *               sniffer classes). With no winners this is whatever current() yields for the result.
     */
    protected function sniffDelimiter($data, $lineTerminator)
    {
        $byConsistency = new SniffDelimiterByConsistency([
            'lineTerminator' => $lineTerminator,
            'delimiters'     => $this->getPossibleDelimiters(),
        ]);
        $winners = $byConsistency->sniff($data);

        // Zero or one winner: nothing to tie-break
        if (count($winners) <= 1) {
            return current($winners);
        }

        $byDistribution = new SniffDelimiterByDistribution([
            'lineTerminator' => $lineTerminator,
            'delimiters'     => $winners,
        ]);

        return $byDistribution->sniff($data);
    }
178
179
    /**
     * Sniff the quoting style used in the sample data.
     *
     * Delegates to SniffQuoteStyle.
     *
     * @param string $data The data to analyze
     * @param string $delimiter The delimiter character
     * @param string $lineTerminator The line terminator char/sequence
     *
     * @return mixed Whatever SniffQuoteStyle::sniff() reports for the sample
     */
    protected function sniffQuotingStyle($data, $delimiter, $lineTerminator)
    {
        $options = [
            'lineTerminator' => $lineTerminator,
            'delimiter'      => $delimiter,
        ];

        return (new SniffQuoteStyle($options))->sniff($data);
    }
184
185
    /**
     * Sniff whether the sample data appears to contain a header row.
     *
     * Delegates to SniffHeaderByDataType.
     *
     * @param string $data The data to analyze
     * @param string $delimiter The delimiter character
     * @param string $lineTerminator The line terminator char/sequence
     *
     * @return mixed Whatever SniffHeaderByDataType::sniff() reports; presumably a boolean (confirm)
     */
    protected function sniffHasHeader($data, $delimiter, $lineTerminator)
    {
        $options = [
            'lineTerminator' => $lineTerminator,
            'delimiter'      => $delimiter,
        ];

        return (new SniffHeaderByDataType($options))->sniff($data);
    }
190
}
191