Completed
Push — master ( 0a5b37...5c1aea )
by Luke
02:18
created

Reader::inQuotedString()   B

Complexity

Conditions 6
Paths 7

Size

Total Lines 16
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 6.1666

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 16
ccs 10
cts 12
cp 0.8333
rs 8.8571
cc 6
eloc 12
nc 7
nop 3
crap 6.1666
1
<?php
2
/**
 * CSVelte: Slender, elegant CSV for PHP
 *
 * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV
 * standardization efforts, CSVelte was written in an effort to take all the
 * suck out of working with CSV.
 *
 * @version   v0.2
 * @copyright Copyright (c) 2016 Luke Visinoni <[email protected]>
 * @author    Luke Visinoni <[email protected]>
 * @license   https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT)
 */
14
namespace CSVelte;
15
16
use \InvalidArgumentException;
17
use \FilterIterator;
18
use CSVelte\IO\Stream;
19
use CSVelte\Contract\Readable;
20
use CSVelte\Table\Row;
21
use CSVelte\Table\HeaderRow;
22
use CSVelte\Exception\EndOfFileException;
23
use CSVelte\Reader\FilteredIterator as FilteredReader;
24
25
/**
 * CSV Reader
 *
 * Reads CSV data from any object that implements CSVelte\Contract\Readable.
 * The reader is an iterator: each iteration yields one row, keyed by the
 * number of the record it was read from.
 *
 * @package CSVelte
 * @subpackage Reader
 * @since v0.1
 * @todo Also, is there any way to do some kind of caching or something? Probably
 *     not but if you could that would be a cool feature...
 */
class Reader implements \Iterator
{
    /**
     * Placeholder temporarily substituted for delimiters found inside quotes.
     */
    const PLACEHOLDER_DELIM   = '[=[__DLIM__]=]';

    /**
     * Placeholder temporarily substituted for line terminators found inside quotes.
     */
    const PLACEHOLDER_NEWLINE = '[=[__NWLN__]=]';

    /**
     * This class supports any sources of input that implements this interface.
     * This way I can read from local files, streams, FTP, any class that implements
     * the "Readable" interface
     * @var \CSVelte\Contract\Readable
     */
    protected $source;

    /**
     * @var \CSVelte\Flavor The "flavor" or format of the CSV being read
     */
    protected $flavor;

    /**
     * @var \CSVelte\Table\Row|boolean|null Row currently loaded into memory;
     *     null when nothing is loaded yet, false once the source is exhausted
     */
    protected $current;

    /**
     * @var integer The current line being read (from input source)
     */
    protected $line = 0;

    /**
     * @var \CSVelte\Table\HeaderRow The header row (if any)
     */
    protected $header;

    /**
     * @var array An array of callback functions
     */
    protected $filters = array();

    /**
     * @var bool True if current line ended while inside a quoted string
     */
    protected $open = false;

    /**
     * @var bool True if last character read was the escape character
     */
    protected $escape = false;

    /**
     * Reader Constructor.
     * Initializes a reader object using an input source and optionally a flavor,
     * then rewinds so that the first data row is already loaded.
     *
     * @param \CSVelte\Contract\Readable|object|string $input The source of our CSV data
     * @param \CSVelte\Flavor|array|null $flavor The "flavor" or format specification object
     */
    public function __construct($input, $flavor = null)
    {
        $this->setSource($input)
             ->setFlavor($flavor)
             ->rewind();
    }

    /**
     * Set the flavor.
     *
     * Set the ``CSVelte\Flavor`` object, used to determine CSV format. An array
     * is converted to a flavor object; null asks the taster to work the flavor
     * out from the source. If the resulting flavor does not say whether there
     * is a header row, the taster is asked for that as well.
     *
     * @param \CSVelte\Flavor|array|null $flavor Either an array or a flavor object
     * @return $this
     */
    protected function setFlavor($flavor = null)
    {
        if (is_array($flavor)) {
            $flavor = new Flavor($flavor);
        }
        $taster = new Taster($this->source);
        // @todo put this inside a try/catch
        if (is_null($flavor)) {
            $flavor = $taster->lick();
        }
        if (is_null($flavor->header)) {
            // Flavor is immutable, give me a new one with header set to lickHeader return val
            $flavor = $flavor->copy(array(
                'header' => $taster->lickHeader($flavor->delimiter, $flavor->lineTerminator),
            ));
        }
        $this->flavor = $flavor;

        return $this;
    }

    /**
     * Set the reader source.
     *
     * The reader can accept anything that implements Readable and is actually
     * readable (can be read). This will make sure that whatever is passed to
     * the reader meets these expectations and set $this->source. It can also
     * accept any string (or any object with a __toString() method), or an
     * SplFileObject, so long as it represents a file rather than a directory.
     *
     * @param \CSVelte\Contract\Readable|object|string|\SplFileObject $input See description
     * @return $this
     */
    protected function setSource($input)
    {
        if ($input instanceof Readable && $input->isReadable()) {
            $this->source = $input;
        } elseif (file_exists((string) $input)) {
            // the input names an existing file: open it as a stream
            $this->source = new Stream($input);
        } else {
            // anything else is handed to the stream factory
            $this->source = Stream::streamize($input);
        }

        return $this;
    }

    /**
     * Load a line into memory
     *
     * Does nothing if a row is already loaded. Otherwise reads and parses the
     * next record. The first record becomes the header row when the flavor
     * says there is one (leaving the current row unset); every other record
     * becomes the current row. Reaching the end of the source sets the
     * current row to false.
     *
     * @return void
     * @access protected
     */
    protected function load()
    {
        if (!is_null($this->current)) {
            return;
        }
        try {
            $line = $this->readLine();
            $this->line++;
            $parsed = $this->parse($line);
            if ($this->hasHeader() && $this->line === 1) {
                $this->header = new HeaderRow($parsed);
            } else {
                $this->current = new Row($parsed);
                if ($this->header) {
                    $this->current->setHeaderRow($this->header);
                }
            }
        } catch (EndOfFileException $e) {
            $this->current = false;
        }
    }

    /**
     * Read single line from CSV data source (stream, file, etc.), taking into
     * account CSV's de-facto quoting rules with respect to designated line
     * terminator character when they fall within quoted strings.
     *
     * @return string A CSV row (could possibly span multiple lines depending on
     *     quoting and escaping)
     * @throws \CSVelte\Exception\EndOfFileException when eof has been reached
     *     and the read buffer has all been returned
     */
    protected function readLine()
    {
        $f = $this->getFlavor();
        $eol = $f->lineTerminator;
        $lines = array();

        // A record always starts outside of a quoted string and with no pending
        // escape. Resetting here keeps state left over from a previous record
        // (one ending in the escape character, or a file that ended inside an
        // unterminated quote before a rewind) from corrupting this one.
        $this->open = false;
        $this->escape = false;

        try {
            do {
                $line = $this->source->readLine($eol);
                if (false === $line) {
                    throw new EndOfFileException("End of file reached: " . $this->source->getName());
                }
                $lines[] = rtrim($line, $eol);
                // keep reading physical lines while the record is still inside quotes
            } while ($this->inQuotedString(end($lines), $f->quoteChar, $f->escapeChar));
        } catch (EndOfFileException $e) {
            // only throw the exception if we don't already have lines in the buffer
            if (!count($lines)) {
                throw $e;
            }
        }

        return rtrim(implode($eol, $lines), $eol);
    }

    /**
     * Determine whether last line ended while a quoted string was still "open"
     *
     * This method is used in a loop to determine if each line being read ends
     * while a quoted string is still "open". The open/escape state is kept on
     * the object so that it carries over from one physical line to the next.
     *
     * @param string $line Line of csv to analyze
     * @param string $quoteChar The quote/enclosure character to use
     * @param string $escapeChar The escape char/sequence to use
     * @return bool True if currently within a quoted string
     */
    protected function inQuotedString($line, $quoteChar, $escapeChar)
    {
        $line = (string) $line;

        // If the escape character IS the quote character (doubled-quote style),
        // simply toggling on every quote already gives the right answer, and
        // treating it as an escape as well would swallow the following char.
        $usesEscape = is_string($escapeChar)
            && $escapeChar !== ''
            && $escapeChar !== $quoteChar;

        // strlen() rather than empty(): the line "0" is a perfectly valid line
        for ($i = 0, $length = strlen($line); $i < $length; $i++) {
            $c = $line[$i];
            if ($this->escape) {
                // previous char was the escape char; this one is taken literally
                $this->escape = false;
                continue;
            }
            $this->escape = $usesEscape && ($c === $escapeChar);
            if ($c === $quoteChar) {
                $this->open = !$this->open;
            }
        }

        return $this->open;
    }

    /**
     * Flavor Getter.
     *
     * Retrieve the "flavor" object being used by the reader
     *
     * @return \CSVelte\Flavor
     * @access public
     */
    public function getFlavor()
    {
        return $this->flavor;
    }

    /**
     * Check if flavor object defines header.
     *
     * Determine whether or not the input source's CSV data contains a header
     * row or not. Unless you explicitly specify so within your Flavor object,
     * this method is a logical best guess. The CSV format does not
     * provide metadata of any kind and therefore does not provide this info.
     *
     * @return boolean True if the input source has a header row (or, to be more
     *     accurate, if the flavor SAYS it has a header row)
     * @todo Rather than always reading in Taster::SAMPLE_SIZE, read in ten lines at a time until
     *     whatever method it is has enough data to make a reliable decision/guess
     */
    public function hasHeader()
    {
        return $this->getFlavor()->header;
    }

    /**
     * Temporarily replace special characters within a quoted string
     *
     * Replace all instances of newlines and whatever character you specify (as
     * the delimiter) that are contained within quoted text. The replacements are
     * simply a special placeholder string. This is done so that I can use the
     * very unsmart "explode" function and not have to worry about it exploding
     * on delimiters or newlines within quotes. Once I have exploded, I typically
     * sub back in the real characters before doing anything else.
     *
     * @param string $data The string to do the replacements on
     * @param string $delim The delimiter character to replace
     * @param string $quo The quote character
     * @param string $eol Line terminator character/sequence
     * @return string The data with replacements performed
     * @access protected
     * @internal
     * @todo I could probably pass in (maybe optionally) the newline character I
     *     want to replace as well. I'll do that if I need to.
     * @todo Create a regex class so you can do $regex->escape() rather than
     *     preg_quote
     */
    protected function replaceQuotedSpecialChars($data, $delim, $quo, $eol)
    {
        // ungreedy: each match runs from one quote character to the next one
        $pattern = '/(['. preg_quote($quo, '/') . '])(.*)\1/imsU';

        return preg_replace_callback($pattern, function ($matches) use ($delim, $eol) {
            // newline first, then delimiter (str_replace applies pairs in order)
            return str_replace(
                array($eol, $delim),
                array(self::PLACEHOLDER_NEWLINE, self::PLACEHOLDER_DELIM),
                $matches[0]
            );
        }, $data);
    }

    /**
     * Undo temporary special char replacements
     *
     * Replace the special character placeholders with the characters they
     * originally substituted.
     *
     * @param string $data The data to undo replacements in
     * @param string $delim The delimiter character
     * @param string $eol The character or string of characters used to terminate lines
     * @return string The data with placeholders replaced with original characters
     * @internal
     */
    protected function undoReplaceQuotedSpecialChars($data, $delim, $eol)
    {
        return str_replace(
            array(self::PLACEHOLDER_DELIM, self::PLACEHOLDER_NEWLINE),
            array($delim, $eol),
            $data
        );
    }

    /**
     * Remove quotes wrapping text.
     *
     * Strips one pair of the flavor's quote character from around the value
     * (if present) and then collapses escaped quotes. The enclosing quotes are
     * removed BEFORE unescaping so that they can never be mistaken for half of
     * an escaped quote (an empty quoted field, `""`, must yield an empty string).
     *
     * @param string $data The data to unquote
     * @return string The data with quotes stripped from the outside of it
     * @internal
     */
    protected function unQuote($data)
    {
        $flavor = $this->getFlavor();
        $quoteChar = (string) $flavor->quoteChar;
        $escapeChar = $flavor->doubleQuote ? $quoteChar : $flavor->escapeChar;
        $data = (string) $data;

        $quoteLen = strlen($quoteChar);
        if (
            $quoteLen > 0
            && strlen($data) >= 2 * $quoteLen
            && substr($data, 0, $quoteLen) === $quoteChar
            && substr($data, -$quoteLen) === $quoteChar
        ) {
            // cast: older PHP versions return false rather than '' for an empty slice
            $data = (string) substr($data, $quoteLen, -$quoteLen);
        }

        return $this->unEscape($data, $escapeChar, $quoteChar);
    }

    /**
     * Collapse escaped quote characters.
     *
     * Replaces every occurrence of the escape sequence followed by the quote
     * character with the quote character alone.
     *
     * @param string $str The string to unescape
     * @param string $esc The escape character/sequence
     * @param string $quo The quote character
     * @return string The unescaped string
     * @internal
     * @todo This actually shouldn't even be necessary. Characters should be read
     *     in one at a time and a quote that follows another should just be ignored
     *     deeming this unnecessary.
     */
    protected function unEscape($str, $esc, $quo)
    {
        return str_replace($esc . $quo, $quo, $str);
    }

    /**
     * Parse a line of CSV data into an array of columns
     *
     * @param string $line A line of CSV data to parse
     * @return array An array of columns
     * @access protected
     * @internal
     */
    protected function parse($line)
    {
        $f = $this->getFlavor();
        // hide delimiters/newlines that sit inside quotes so explode() is safe
        $replaced = $this->replaceQuotedSpecialChars($line, $f->delimiter, $f->quoteChar, $f->lineTerminator);
        $columns = explode($f->delimiter, $replaced);

        return array_map(function ($val) use ($f) {
            $undone = $this->undoReplaceQuotedSpecialChars($val, $f->delimiter, $f->lineTerminator);
            return $this->unQuote($undone);
        }, $columns);
    }

    /**
     * Retrieve current row.
     *
     * @return \CSVelte\Table\Row|boolean The current row, or false when the
     *     source is exhausted
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        return $this->current;
    }

    /**
     * Advance to the next row
     *
     * @return \CSVelte\Table\Row|boolean|null The current row (if there is one)
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        $this->current = null;
        $this->load();

        return $this->current;
    }

    /**
     * Determine if current position has valid row.
     *
     * @return boolean True if current row is valid
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return (bool) $this->current;
    }

    /**
     * Retrieve current row key (line number).
     *
     * @return int The current line number
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->line;
    }

    /**
     * Rewind to the beginning of the dataset.
     *
     * Rewinds the source and loads the first record. When the flavor says
     * there is a header row, that first record is the header, so the reader
     * advances once more to land on the first data row.
     *
     * @return \CSVelte\Table\Row|boolean|null The current row
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->line = 0;
        $this->source->rewind();
        $this->current = null;
        $this->load();
        if ($this->hasHeader()) {
            $this->next();
        }

        return $this->current();
    }

    /**
     * Retrieve header row.
     *
     * @return \CSVelte\Table\HeaderRow|null The header row if there is one
     */
    public function header()
    {
        return $this->header;
    }

    /**
     * Add anonymous function as filter.
     *
     * Add an anonymous function that accepts the current row as its only argument.
     * Return true from the function to keep that row, false otherwise.
     *
     * @param callable $filter An anonymous function to filter out row by certain criteria
     * @return $this
     */
    public function addFilter(callable $filter)
    {
        $this->filters[] = $filter;

        return $this;
    }

    /**
     * Add multiple filters at once.
     *
     * Add an array of anonymous functions to filter out certain rows.
     *
     * @param array $filters An array of anonymous functions
     * @return $this
     */
    public function addFilters(array $filters)
    {
        foreach ($filters as $filter) {
            $this->addFilter($filter);
        }

        return $this;
    }

    /**
     * Returns an iterator with rows from user-supplied filter functions removed
     *
     * @return \CSVelte\Reader\FilteredIterator An iterator with filtered rows
     */
    public function filter()
    {
        return new FilteredReader($this, $this->filters);
    }

    /**
     * Retrieve the contents of the dataset as an array of arrays.
     *
     * @return array An array of arrays of CSV content
     */
    public function toArray()
    {
        return array_map(function ($row) {
            return $row->toArray();
        }, iterator_to_array($this));
    }
}
481