Completed
Branch refactor/142 (8a1d2c)
by Luke
02:46
created

Reader::undoReplaceQuotedSpecialChars()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2.0185

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 9
ccs 5
cts 6
cp 0.8333
rs 9.6666
cc 2
eloc 5
nc 2
nop 3
crap 2.0185
1
<?php
2
/**
3
 * CSVelte: Slender, elegant CSV for PHP
4
 *
5
 * Inspired by Python's CSV module and Frictionless Data and the W3C's CSV
6
 * standardization efforts, CSVelte was written in an effort to take all the
7
 * suck out of working with CSV.
8
 *
9
 * @version   v0.2.1
10
 * @copyright Copyright (c) 2016 Luke Visinoni <[email protected]>
11
 * @author    Luke Visinoni <[email protected]>
12
 * @license   https://github.com/deni-zen/csvelte/blob/master/LICENSE The MIT License (MIT)
13
 */
14
namespace CSVelte;
15
16
use CSVelte\Contract\Streamable;
17
18
use \FilterIterator;
19
use CSVelte\IO\Stream;
20
use CSVelte\Table\Row;
21
use CSVelte\Table\HeaderRow;
22
use CSVelte\Reader\FilteredIterator as FilteredReader;
23
24
use \InvalidArgumentException;
25
use CSVelte\Exception\EndOfFileException;
26
27
use function
28
    CSVelte\streamize,
29
    CSVelte\taste,
30
    CSVelte\taste_has_header,
31
    CSVelte\collect;
32
33
/**
 * CSV Reader
 *
 * Reads CSV data from any object that implements CSVelte\Contract\Streamable.
 * Anything else handed to the constructor is first passed through
 * CSVelte\streamize() in an attempt to turn it into such an object.
 *
 * @package CSVelte
 * @subpackage Reader
 * @since v0.1
 * @todo Also, is there any way to do some kind of caching or something? Probably
 *     not but if you could that would be a cool feature...
 */
class Reader implements \Iterator
{
    /**
     * Placeholder temporarily substituted for delimiters found inside quotes.
     */
    const PLACEHOLDER_DELIM   = '[=[__DLIM__]=]';

    /**
     * Placeholder temporarily substituted for line terminators found inside quotes.
     */
    const PLACEHOLDER_NEWLINE = '[=[__NWLN__]=]';

    /**
     * The input source CSV data is read from.
     *
     * @var \CSVelte\Contract\Streamable
     */
    protected $source;

    /**
     * @var \CSVelte\Flavor The "flavor" or format of the CSV being read
     */
    protected $flavor;

    /**
     * Row currently loaded into memory. Null means "nothing loaded yet",
     * false means the end of the input has been reached.
     *
     * @var \CSVelte\Table\Row|bool|null
     */
    protected $current;

    /**
     * @var int The current line being read (from input source)
     */
    protected $line = 0;

    /**
     * @var \CSVelte\Table\HeaderRow|null The header row (if any)
     */
    protected $header;

    /**
     * @var array An array of callback functions
     */
    protected $filters = array();

    /**
     * @var bool True if current line ended while inside a quoted string
     */
    protected $open = false;

    /**
     * @var bool True if last character read was the escape character
     */
    protected $escape = false;

    /**
     * Reader Constructor.
     *
     * Initializes a reader object using an input source and optionally a
     * flavor, then rewinds so that the first row is loaded and ready.
     *
     * @param mixed $input The source of our CSV data
     * @param \CSVelte\Flavor|array|null $flavor The "flavor" or format specification object
     */
    public function __construct($input, $flavor = null)
    {
        $this->setSource($input)
             ->setFlavor($flavor)
             ->rewind();
    }

    /**
     * Set the flavor.
     *
     * Set the ``CSVelte\Flavor`` object, used to determine CSV format. An
     * array is converted to a Flavor; null causes the flavor to be detected
     * from the source via taste(). If the resulting flavor's header attribute
     * is null, it is filled in using taste_has_header().
     *
     * @param \CSVelte\Flavor|array|null $flavor Either an array or a flavor object
     * @return $this
     */
    protected function setFlavor($flavor = null)
    {
        if (is_array($flavor)) {
            $flavor = new Flavor($flavor);
        }
        // @todo put this inside a try/catch
        if (is_null($flavor)) {
            $flavor = taste($this->source);
        }
        if (is_null($flavor->header)) {
            // Flavor is immutable, so ask for a copy with the header attribute
            // set to whatever taste_has_header() reports for the source
            $flavor = $flavor->copy(['header' => taste_has_header($this->source)]);
        }
        $this->flavor = $flavor;
        return $this;
    }

    /**
     * Set the reader source.
     *
     * Input that already implements Streamable is used as-is; anything else
     * is handed to streamize() and the result is used as the source.
     *
     * @param mixed $input See description
     * @return $this
     */
    protected function setSource($input)
    {
        if (!($input instanceof Streamable)) {
            $input = streamize($input);
        }
        $this->source = $input;
        return $this;
    }

    /**
     * Load a line into memory
     *
     * Does nothing if a row is already loaded. Otherwise reads and parses the
     * next row. When the flavor says there is a header and this is the first
     * line, the row is stored as the header and $current is left untouched.
     * On end of file $current is set to false.
     *
     * @return void
     * @access protected
     */
    protected function load()
    {
        if (!is_null($this->current)) {
            return;
        }
        try {
            $line = $this->readLine();
            $this->line++;
            $parsed = $this->parse($line);
            if ($this->hasHeader() && $this->line === 1) {
                $this->header = new HeaderRow($parsed);
            } else {
                $this->current = new Row($parsed);
                if ($this->header) {
                    $this->current->setHeaderRow($this->header);
                }
            }
        } catch (EndOfFileException $e) {
            $this->current = false;
        }
    }

    /**
     * Read single line from CSV data source (stream, file, etc.), taking into
     * account CSV's de-facto quoting rules with respect to designated line
     * terminator character when they fall within quoted strings.
     *
     * @return string A CSV row (could possibly span multiple lines depending on
     *     quoting and escaping)
     * @throws \CSVelte\Exception\EndOfFileException when eof has been reached
     *     and the read buffer has all been returned
     */
    protected function readLine()
    {
        $f = $this->getFlavor();
        $eol = $f->lineTerminator;
        // A logical row always begins outside of a quoted string. Without this
        // reset, state left over from a previous row (an unterminated quote at
        // end of file, or a trailing escape character) would corrupt this one,
        // including after a rewind().
        $this->open = false;
        $this->escape = false;
        $lines = [];
        try {
            do {
                if (false === ($line = $this->source->readLine($eol))) {
                    throw new EndOfFileException("End of file reached");
                }
                $lines[] = rtrim($line, $eol);
            } while ($this->inQuotedString(end($lines), $f->quoteChar, $f->escapeChar));
        } catch (EndOfFileException $e) {
            // only throw the exception if we don't already have lines in the buffer
            if (!count($lines)) {
                throw $e;
            }
        }
        return rtrim(implode($eol, $lines), $eol);
    }

    /**
     * Determine whether last line ended while a quoted string was still "open"
     *
     * This method is used in a loop to determine if each line being read ends
     * while a quoted string is still "open". It updates the $open and $escape
     * state flags as it walks the line one character at a time.
     *
     * @param string $line Line of csv to analyze
     * @param string $quoteChar The quote/enclosure character to use
     * @param string $escapeChar The escape char/sequence to use
     * @return bool True if currently within a quoted string
     */
    protected function inQuotedString($line, $quoteChar, $escapeChar)
    {
        $line = (string) $line;
        $length = strlen($line);
        for ($i = 0; $i < $length; $i++) {
            if ($this->escape) {
                // previous character was the escape character: skip this one
                $this->escape = false;
                continue;
            }
            $c = $line[$i];
            // strict comparison so a null/false escape character never matches
            $this->escape = ($c === $escapeChar);
            if ($c === $quoteChar) {
                $this->open = !$this->open;
            }
        }
        return $this->open;
    }

    /**
     * Flavor Getter.
     *
     * Retrieve the "flavor" object being used by the reader
     *
     * @return \CSVelte\Flavor
     * @access public
     */
    public function getFlavor()
    {
        return $this->flavor;
    }

    /**
     * Check if flavor object defines header.
     *
     * Determine whether or not the input source's CSV data contains a header
     * row or not. Unless you explicitly specify so within your Flavor object,
     * this method is a logical best guess. The CSV format does not
     * provide metadata of any kind and therefore does not provide this info.
     *
     * @return bool True if the input source has a header row (or, to be more
     *     accurate, if the flavor SAYS it has a header row)
     * @todo Rather than always reading in Taster::SAMPLE_SIZE, read in ten lines at a time until
     *     whatever method it is has enough data to make a reliable decision/guess
     */
    public function hasHeader()
    {
        return $this->getFlavor()->header;
    }

    /**
     * Temporarily replace special characters within a quoted string
     *
     * Replace all instances of the line terminator and the delimiter that are
     * contained within quoted text with special placeholder strings. This is
     * done so that the very unsmart "explode" function can be used without
     * worrying about it exploding on delimiters or newlines within quotes.
     * The real characters are substituted back in afterwards by
     * undoReplaceQuotedSpecialChars().
     *
     * @param string $data The string to do the replacements on
     * @param string $delim The delimiter character to replace
     * @param string $quo The quote character
     * @param string $eol Line terminator character/sequence
     * @return string The data with replacements performed
     * @access protected
     * @internal
     * @todo Create a regex class so you can do $regex->escape() rather than
     *     preg_quote
     */
    protected function replaceQuotedSpecialChars($data, $delim, $quo, $eol)
    {
        $pattern = '/(['. preg_quote($quo, '/') . '])(.*)\1/imsU';
        return preg_replace_callback($pattern, function ($matches) use ($delim, $eol) {
            // line terminators first, then delimiters, all within the quoted span
            $ret = str_replace($eol, self::PLACEHOLDER_NEWLINE, $matches[0]);
            return str_replace($delim, self::PLACEHOLDER_DELIM, $ret);
        }, $data);
    }

    /**
     * Undo temporary special char replacements
     *
     * Replace the special character placeholders with the characters they
     * originally substituted.
     *
     * @param string $data The data to undo replacements in
     * @param string $delim The delimiter character
     * @param string $eol The character or string of characters used to terminate lines
     * @return string The data with placeholders replaced with original characters
     * @internal
     */
    protected function undoReplaceQuotedSpecialChars($data, $delim, $eol)
    {
        // array arguments are applied in order: delimiter first, then newline
        return str_replace(
            [self::PLACEHOLDER_DELIM, self::PLACEHOLDER_NEWLINE],
            [$delim, $eol],
            $data
        );
    }

    /**
     * Remove quotes wrapping text.
     *
     * The wrapping quotes are stripped BEFORE escaped quotes are collapsed.
     * Doing it the other way around turns an empty quoted field ("") into a
     * lone quote character when quotes are escaped by doubling.
     *
     * @param string $data The data to unquote
     * @return string The data with quotes stripped from the outside of it and
     *     escaped quotes within it un-escaped
     * @internal
     */
    protected function unQuote($data)
    {
        $flavor = $this->getFlavor();
        $quoteChar = $flavor->quoteChar;
        // with doubleQuote on, a quote is escaped by another quote
        $escapeChar = $flavor->doubleQuote ? $quoteChar : $flavor->escapeChar;
        $stripped = preg_replace('/^(["\'])(.*)\1$/ms', '\2', $data);
        return $this->unEscape($stripped, $escapeChar, $quoteChar);
    }

    /**
     * Collapse escaped quote characters into plain quote characters.
     *
     * @param string $str The string to un-escape
     * @param string $esc The escape character
     * @param string $quo The quote character
     * @return string The string with every "$esc$quo" sequence replaced by "$quo"
     * @internal
     * @todo This actually shouldn't even be necessary. Characters should be read
     *     in one at a time and a quote that follows another should just be ignored
     *     deeming this unnecessary.
     */
    protected function unEscape($str, $esc, $quo)
    {
        return str_replace($esc . $quo, $quo, $str);
    }

    /**
     * Parse a line of CSV data into an array of columns
     *
     * @param string $line A line of CSV data to parse
     * @return array An array of columns
     * @access protected
     * @internal
     */
    protected function parse($line)
    {
        $f = $this->getFlavor();
        // hide delimiters/newlines that live inside quotes so explode() is safe
        $replaced = $this->replaceQuotedSpecialChars($line, $f->delimiter, $f->quoteChar, $f->lineTerminator);
        $columns = explode($f->delimiter, $replaced);
        return array_map(function ($val) use ($f) {
            $undone = $this->undoReplaceQuotedSpecialChars($val, $f->delimiter, $f->lineTerminator);
            return $this->unQuote($undone);
        }, $columns);
    }

    /**
     * Retrieve current row.
     *
     * @return \CSVelte\Table\Row|bool|null The current row (false once the end
     *     of the input has been reached)
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        return $this->current;
    }

    /**
     * Advance to the next row
     *
     * @return \CSVelte\Table\Row|bool|null The current row (if there is one)
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        $this->current = null;
        $this->load();
        return $this->current;
    }

    /**
     * Determine if current position has valid row.
     *
     * @return bool True if current row is valid
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return (bool) $this->current;
    }

    /**
     * Retrieve current row key (line number).
     *
     * @return int The current line number
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->line;
    }

    /**
     * Rewind to the beginning of the dataset.
     *
     * Rewinds the source and loads the first line. When the flavor says there
     * is a header, that first line becomes the header and the reader advances
     * once more so that the current row is the first data row.
     *
     * @return \CSVelte\Table\Row|bool|null The current row
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->line = 0;
        $this->source->rewind();
        $this->current = null;
        $this->load();
        if ($this->hasHeader()) {
            $this->next();
        }
        return $this->current();
    }

    /**
     * Retrieve header row.
     *
     * @return \CSVelte\Table\HeaderRow|null The header row if there is one
     */
    public function header()
    {
        return $this->header;
    }

    /**
     * Add anonymous function as filter.
     *
     * Add an anonymous function that accepts the current row as its only argument.
     * Return true from the function to keep that row, false otherwise.
     *
     * @param callable $filter An anonymous function to filter out row by certain criteria
     * @return $this
     */
    public function addFilter(callable $filter)
    {
        $this->filters[] = $filter;
        return $this;
    }

    /**
     * Add multiple filters at once.
     *
     * Add an array of anonymous functions to filter out certain rows.
     *
     * @param array $filters An array of anonymous functions
     * @return $this
     */
    public function addFilters(array $filters)
    {
        foreach ($filters as $filter) {
            $this->addFilter($filter);
        }
        return $this;
    }

    /**
     * Returns an iterator with rows from user-supplied filter functions removed
     *
     * @return \CSVelte\Reader\FilteredIterator An iterator with filtered rows
     */
    public function filter()
    {
        return new FilteredReader($this, $this->filters);
    }

    /**
     * Retrieve the contents of the dataset as an array of arrays.
     *
     * @return array An array of arrays of CSV content, keyed by line number
     */
    public function toArray()
    {
        return array_map(function ($row) {
            return $row->toArray();
        }, iterator_to_array($this));
    }
}
485