Completed
Push — master ( 5c1aea...9ef1c4 )
by Luke
03:03
created

src/CSVelte/Reader.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
/**
3
 * CSVelte: Slender, elegant CSV for PHP
4
 *
5
 * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV
6
 * standardization efforts, CSVelte was written in an effort to take all the
7
 * suck out of working with CSV.
8
 *
9
 * @version   v0.2.1
10
 * @copyright Copyright (c) 2016 Luke Visinoni <[email protected]>
11
 * @author    Luke Visinoni <[email protected]>
12
 * @license   https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT)
13
 */
14
namespace CSVelte;
15
16
use CSVelte\Contract\Streamable;
17
18
use CSVelte\Table\Row;
19
use CSVelte\Table\HeaderRow;
20
use CSVelte\Reader\FilteredIterator as FilteredReader;
21
22
use CSVelte\Exception\EndOfFileException;
23
24
use function
25
    CSVelte\streamize,
26
    CSVelte\taste,
27
    CSVelte\taste_has_header,
28
    CSVelte\collect;
29
30
/**
31
 * CSV Reader
32
 *
33
 * Reads CSV data from any object that implements CSVelte\Contract\Readable.
34
 *
35
 * @package CSVelte
36
 * @subpackage Reader
37
 * @since v0.1
38
 * @todo Also, is there any way to do some kind of caching or something? Probably
39
 *     not but if you could that would be a cool feature...
40
 */
41
class Reader implements \Iterator
42
{
43
    const PLACEHOLDER_DELIM   = '[=[__DLIM__]=]';
44
    const PLACEHOLDER_NEWLINE = '[=[__NWLN__]=]';
45
46
    /**
47
     * This class supports any sources of input that implements this interface.
48
     * This way I can read from local files, streams, FTP, any class that implements
49
     * the "Streamable" interface
50
     * @var Contract\Streamable
51
     */
52
    protected $source;
53
54
    /**
55
     * @var Flavor The "flavor" or format of the CSV being read
56
     */
57
    protected $flavor;
58
59
    /**
60
     * @var Table\Row|false|null Row currently loaded into memory (null while nothing is loaded, false once the end of input is reached)
61
     */
62
    protected $current;
63
64
    /**
65
     * @var integer The current line being read (from input source)
66
     */
67
    protected $line = 0;
68
69
    /**
70
     * @var Table\HeaderRow The header row (if any)
71
     */
72
    protected $header;
73
74
    /**
75
     * @var array An array of callback functions
76
     */
77
    protected $filters = array();
78
79
    /**
80
     * @var bool True if current line ended while inside a quoted string
81
     */
82
    protected $open = false;
83
84
    /**
85
     * @var bool True if last character read was the escape character
86
     */
87
    protected $escape = false;
88
89
    /**
90
     * Reader Constructor.
91
     * Initializes a reader object using an input source and optionally a flavor
92
     *
93
     * @param mixed $input The source of our CSV data
94
     * @param Flavor|array|null $flavor The "flavor" or format specification object
95
     */
96 21
    public function __construct($input, $flavor = null)
97
    {
98 21
        $this->setSource($input)
99 21
             ->setFlavor($flavor)
100 21
             ->rewind();
101 21
    }
102
103
    /**
     * Resolve and store the flavor used by this reader.
     *
     * An array is turned into a ``CSVelte\Flavor`` object. When no flavor is
     * given at all, one is obtained by calling taste() on the input source.
     * When the resulting flavor has a null ``header`` attribute, a copy is
     * made with ``header`` set to the result of taste_has_header().
     *
     * @param Flavor|array|null $flavor Either an array or a flavor object
     * @return $this
     */
    protected function setFlavor($flavor = null)
    {
        if (is_array($flavor)) {
            $flavor = new Flavor($flavor);
        }

        // @todo put this inside a try/catch
        if (null === $flavor) {
            $flavor = taste($this->source);
        }

        if (null === $flavor->header) {
            // Flavor is immutable, so ask for a copy carrying the detected
            // header setting instead of modifying it in place.
            $hasHeader = taste_has_header($this->source);
            $flavor = $flavor->copy(['header' => $hasHeader]);
        }

        $this->flavor = $flavor;

        return $this;
    }
125
126
    /**
     * Set the reader source.
     *
     * Input that already implements ``CSVelte\Contract\Streamable`` is stored
     * as-is. Anything else is handed to streamize() and the result is stored
     * instead. The original documentation lists strings, objects with a
     * __toString() method and SplFileObject (files, not directories) as
     * accepted input -- NOTE(review): confirm against streamize().
     *
     * @param mixed $input See description
     * @return $this
     */
    protected function setSource($input)
    {
        $this->source = ($input instanceof Streamable)
            ? $input
            : streamize($input);

        return $this;
    }
146
147
    /**
     * Load a line into memory.
     *
     * Does nothing when a row (or the end-of-input marker) is already held in
     * $this->current. Otherwise one CSV line is read and parsed. If the flavor
     * declares a header and this is line 1, the line becomes the header row
     * and $this->current is left untouched; any other line becomes the
     * current row, with the header row attached when there is one.
     */
    protected function load()
    {
        if (!is_null($this->current)) {
            return;
        }

        try {
            $raw = $this->readLine();
            $this->line++;
            $fields = $this->parse($raw);

            $isHeaderLine = $this->hasHeader() && $this->line === 1;
            if ($isHeaderLine) {
                $this->header = new HeaderRow($fields);
                return;
            }

            $this->current = new Row($fields);
            if ($this->header) {
                $this->current->setHeaderRow($this->header);
            }
        } catch (EndOfFileException $e) {
            // false (not null) marks the end of input: valid() casts it to
            // false and the is_null() guard above stops further reads.
            $this->current = false;
        }
    }
168
169
    /**
     * Read a single CSV row from the data source.
     *
     * Physical lines are read from the source until inQuotedString() reports
     * that no quoted string is left open, so a row may span several physical
     * lines. The pieces are re-joined with the flavor's line terminator and
     * trailing terminator characters are trimmed.
     *
     * @return string A CSV row (could possibly span multiple lines depending on
     *     quoting and escaping)
     * @throws Exception\EndOfFileException when eof has been reached
     *     and the read buffer has all been returned
     */
    protected function readLine()
    {
        $flavor = $this->getFlavor();
        $eol = $flavor->lineTerminator;
        $buffer = [];

        try {
            do {
                $chunk = $this->source->readLine($eol);
                if ($chunk === false) {
                    throw new EndOfFileException("End of file reached");
                }
                $buffer[] = rtrim($chunk, $eol);
                $stillOpen = $this->inQuotedString(end($buffer), $flavor->quoteChar, $flavor->escapeChar);
            } while ($stillOpen);
        } catch (EndOfFileException $e) {
            // Running out of input is only an error when nothing was
            // buffered; otherwise whatever was collected is returned.
            if (count($buffer) === 0) {
                throw $e;
            }
        }

        return rtrim(implode($eol, $buffer), $eol);
    }
197
198
    /**
     * Determine whether last line ended while a quoted string was still "open"
     *
     * Walks the line one character at a time, updating two pieces of state
     * kept on the reader between calls: $this->escape (the previous character
     * was the escape character, so the current one is skipped) and
     * $this->open (an odd number of unescaped quote characters has been seen).
     * Because the state persists, calling this once per physical line tells
     * the caller whether a row continues on the next line.
     *
     * @param string $line Line of csv to analyze
     * @param string $quoteChar The quote/enclosure character to use
     * @param string $escapeChar The escape char/sequence to use
     * @return bool True if currently within a quoted string
     */
    protected function inQuotedString($line, $quoteChar, $escapeChar)
    {
        // Measure the length rather than using empty(): empty('0') is true,
        // which would skip a line consisting solely of "0" and leave a
        // pending escape flag unconsumed.
        $line = (string) $line;
        $length = strlen($line);
        $quote = (string) $quoteChar;
        $escape = (string) $escapeChar;

        for ($i = 0; $i < $length; $i++) {
            $char = $line[$i];

            if ($this->escape) {
                // This character was escaped by the previous one; ignore it.
                $this->escape = false;
                continue;
            }

            $this->escape = ($char === $escape);
            if ($char === $quote) {
                $this->open = !$this->open;
            }
        }

        return $this->open;
    }
225
226
    /**
     * Flavor getter.
     *
     * Retrieve the "flavor" object currently stored on the reader.
     *
     * @return Flavor
     */
    public function getFlavor()
    {
        return $this->flavor;
    }
237
238
    /**
     * Check if flavor object defines header.
     *
     * Returns the ``header`` attribute of the reader's flavor unchanged.
     * Unless the flavor was given an explicit header setting, setFlavor()
     * fills that attribute from taste_has_header(), so the answer may be a
     * best guess: the CSV format carries no metadata and therefore cannot
     * provide this information itself.
     *
     * @return boolean True if the flavor says the input source has a header row
     * @todo Rather than always reading in Taster::SAMPLE_SIZE, read in ten lines at a time until
     *     whatever method it is has enough data to make a reliable decision/guess
     */
    public function hasHeader()
    {
        $flavor = $this->getFlavor();

        return $flavor->header;
    }
255
256
    /**
     * Temporarily replace special characters within a quoted string
     *
     * Every quoted section of the data has its line terminators and delimiter
     * characters swapped for placeholder strings (PLACEHOLDER_NEWLINE and
     * PLACEHOLDER_DELIM). This lets the caller split the data with a plain
     * explode() without tripping over delimiters or newlines that sit inside
     * quotes; the placeholders are meant to be swapped back afterwards.
     *
     * @param string $data The string to do the replacements on
     * @param string $delim The delimiter character to replace
     * @param string $quo The quote character
     * @param string $eol Line terminator character/sequence
     * @return string The data with replacements performed
     * @internal
     * @todo I could probably pass in (maybe optionally) the newline character I
     *     want to replace as well. I'll do that if I need to.
     * @todo Create a regex class so you can do $regex->escape() rather than
     *     preg_quote
     */
    protected function replaceQuotedSpecialChars($data, $delim, $quo, $eol)
    {
        $pattern = '/([' . preg_quote($quo, '/') . '])(.*)\1/imsU';

        // str_replace() applies array replacements in order, so the line
        // terminator is handled before the delimiter.
        $search = [$eol, $delim];
        $placeholders = [self::PLACEHOLDER_NEWLINE, self::PLACEHOLDER_DELIM];

        return preg_replace_callback(
            $pattern,
            function ($quoted) use ($search, $placeholders) {
                return str_replace($search, $placeholders, $quoted[0]);
            },
            $data
        );
    }
285
286
    /**
     * Undo temporary special char replacements
     *
     * Replace the special character placeholders with the characters they
     * originally substituted: PLACEHOLDER_DELIM becomes the delimiter and
     * PLACEHOLDER_NEWLINE becomes the line terminator, in that order.
     *
     * @param string $data The data to undo replacements in
     * @param string $delim The delimiter character
     * @param string $eol The character or string of characters used to terminate lines
     * @return string The data with placeholders replaced with original characters
     * @internal
     */
    protected function undoReplaceQuotedSpecialChars($data, $delim, $eol)
    {
        // A single array-form str_replace() performs the same ordered
        // replacements as walking a map by reference, and always returns.
        return str_replace(
            [self::PLACEHOLDER_DELIM, self::PLACEHOLDER_NEWLINE],
            [$delim, $eol],
            $data
        );
    }
307
308
    /**
     * Remove quotes wrapping text.
     *
     * Strips one pair of the flavor's quote character from the outside of the
     * value (when the value both starts and ends with it) and then unescapes
     * what is left. When the flavor uses double-quote escaping, the quote
     * character itself acts as the escape character; otherwise the flavor's
     * escape character is used.
     *
     * The outer quotes are removed before unescaping. Doing it the other way
     * around collapses an empty quoted field ("") into a single quote
     * character, which then no longer looks like a quoted value.
     *
     * @param string $data The data to unquote
     * @return string The data with quotes stripped from the outside of it
     * @internal
     */
    protected function unQuote($data)
    {
        $flavor = $this->getFlavor();
        $quoteChar = (string) $flavor->quoteChar;
        $escapeChar = $flavor->doubleQuote ? $quoteChar : $flavor->escapeChar;

        $data = (string) $data;
        $quoteLen = strlen($quoteChar);
        $isWrapped = $quoteLen > 0
            && strlen($data) >= 2 * $quoteLen
            && substr($data, 0, $quoteLen) === $quoteChar
            && substr($data, -$quoteLen) === $quoteChar;

        if ($isWrapped) {
            // Cast: before PHP 8, substr() returns false instead of '' when
            // nothing remains between the quotes.
            $data = (string) substr($data, $quoteLen, -$quoteLen);
        }

        return $this->unEscape($data, $escapeChar, $quoteChar);
    }
322
323
    /**
     * "Unescape" a string.
     *
     * Replaces every occurrence of the escape character immediately followed
     * by the quote character with the quote character alone.
     *
     * @internal
     * @param string $str The string to unescape
     * @param string $esc The escape character used
     * @param string $quo The quote character used
     * @return mixed The string with characters unescaped
     * @todo This actually shouldn't even be necessary. Characters should be read
     *     in one at a time and a quote that follows another should just be ignored
     *     deeming this unnecessary.
     */
    protected function unEscape($str, $esc, $quo)
    {
        $escapedQuote = $esc . $quo;

        return str_replace($escapedQuote, $quo, $str);
    }
341
342
    /**
     * Parse a line of CSV data into an array of columns
     *
     * Delimiters and line terminators inside quoted sections are first masked
     * with placeholders so that the line can be split with explode(). Each
     * resulting column then has its placeholders restored and its wrapping
     * quotes removed.
     *
     * @param string $line A line of CSV data to parse
     * @return array An array of columns
     * @internal
     */
    protected function parse($line)
    {
        $f = $this->getFlavor();
        $delimiter = $f->delimiter;
        $eol = $f->lineTerminator;

        $masked = $this->replaceQuotedSpecialChars($line, $delimiter, $f->quoteChar, $eol);

        // The closure is bound to $this, so no "$that" alias is needed to
        // reach the protected helpers.
        return array_map(function ($column) use ($delimiter, $eol) {
            return $this->unQuote(
                $this->undoReplaceQuotedSpecialChars($column, $delimiter, $eol)
            );
        }, explode($delimiter, $masked));
    }
360
361
    /**
     * Retrieve current row.
     *
     * Returns whatever is held in $this->current: a row, null when nothing is
     * loaded, or false once load() has reached the end of the input.
     *
     * @return Table\Row|false|null The current row
     */
    public function current()
    {
        return $this->current;
    }
370
371
    /**
     * Advance to the next row
     *
     * @return Table\Row|false|null The newly loaded row (false when the end of
     *     the input has been reached)
     */
    public function next()
    {
        // load() only reads when the current slot is null, so clear it first.
        $this->current = null;
        $this->load();

        return $this->current;
    }
383
384
    /**
     * Determine if current position has valid row.
     *
     * The position is valid when $this->current holds a truthy value; both
     * null (nothing loaded) and false (end of input) count as invalid.
     *
     * @return boolean True if current row is valid
     */
    public function valid()
    {
        $row = $this->current;

        return $row ? true : false;
    }
393
394
    /**
     * Retrieve current row key (line number).
     *
     * The key is the reader's line counter, which load() increments for
     * every line it reads (including a header line).
     *
     * @return int The current line number
     */
    public function key()
    {
        return $this->line;
    }
403
404
    /**
     * Rewind to the beginning of the dataset.
     *
     * Resets the line counter and the quote/escape tracking state, rewinds
     * the underlying source and loads the first line. When the flavor
     * declares a header, that first line becomes the header row, so the
     * reader advances once more to make the first data row current.
     *
     * @return Table\Row|false|null The current row
     */
    public function rewind()
    {
        $this->line = 0;

        // Quote/escape state is kept on the reader between lines. Clear it
        // so that a previous pass ending inside an unterminated quoted
        // string (or on an escape character) does not affect this pass.
        $this->open = false;
        $this->escape = false;

        $this->source->rewind();
        $this->current = null;
        $this->load();

        if ($this->hasHeader()) {
            // load() consumed line 1 as the header; move on to the data.
            $this->next();
        }

        return $this->current();
    }
420
421
    /**
     * Retrieve header row.
     *
     * @return Table\HeaderRow|null The header row, or null when load() has
     *     not set one
     */
    public function header()
    {
        return $this->header;
    }
430
431
    /**
     * Add anonymous function as filter.
     *
     * Appends the callback to the reader's list of filters. The callback is
     * expected to accept the current row as its only argument and to return
     * true to keep that row, false otherwise.
     *
     * @param Callable $filter An anonymous function to filter out row by certain criteria
     * @return $this
     */
    public function addFilter(Callable $filter)
    {
        $this->filters[] = $filter;

        return $this;
    }
445
446
    /**
     * Add multiple filters at once.
     *
     * Each element of the array is registered through addFilter(), in the
     * order given.
     *
     * @param array $filters An array of anonymous functions
     * @return $this
     */
    public function addFilters(array $filters)
    {
        foreach ($filters as $callback) {
            $this->addFilter($callback);
        }

        return $this;
    }
461
462
    /**
     * Returns an iterator with rows from user-supplied filter functions removed
     *
     * The iterator is built from this reader together with the filters
     * registered so far.
     *
     * @return FilteredReader An iterator with filtered rows
     */
    public function filter()
    {
        $filtered = new FilteredReader($this, $this->filters);

        return $filtered;
    }
471
472
    /**
     * Retrieve the contents of the dataset as an array of arrays.
     *
     * Iterates over the reader from the beginning and converts every row
     * with its toArray() method. The resulting array is keyed by the
     * reader's key() value for each row (the line number).
     *
     * @return array An array of arrays of CSV content
     */
    public function toArray()
    {
        $rows = [];
        foreach ($this as $lineNumber => $row) {
            $rows[$lineNumber] = $row->toArray();
        }

        return $rows;
    }
483
484
}
485