|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
|
4
|
|
|
* |
|
5
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
|
6
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
|
7
|
|
|
* suck out of working with CSV. |
|
8
|
|
|
* |
|
9
|
|
|
* @copyright Copyright (c) 2018 Luke Visinoni |
|
10
|
|
|
* @author Luke Visinoni <[email protected]> |
|
11
|
|
|
* @license See LICENSE file (MIT license) |
|
12
|
|
|
*/ |
|
13
|
|
|
namespace CSVelte\Sniffer; |
|
14
|
|
|
|
|
15
|
|
|
use CSVelte\Sniffer; |
|
16
|
|
|
use CSVelte\Exception\SnifferException; |
|
17
|
|
|
use RuntimeException; |
|
18
|
|
|
|
|
19
|
|
|
use function Noz\collect; |
|
20
|
|
|
|
|
21
|
|
|
class SniffQuoteAndDelimByAdjacency extends AbstractSniffer
{
    /**
     * Guess quote and delimiter character(s)
     *
     * If there are quoted values within the data, it is often easiest to guess the quote and delimiter characters at
     * the same time by analyzing their adjacency to one another. That is to say, in cases where certain values are
     * wrapped in quotes, it can often be determined not only what that quote character is, but also the delimiter,
     * because it is often on either side of the quote character.
     *
     * A set of regular expressions is tried in order; the first one that produces at least one match supplies the
     * candidate characters.
     *
     * @param string $data The data to analyze
     *
     * @throws SnifferException If no pattern yields both a quote character and a delimiter
     *
     * @return string[] The quote character followed by the delimiter
     */
    public function sniff($data)
    {
        /**
         * @var array|null Captures of the first pattern that matched, or null if none did
         */
        $matches = null;

        // Escape the terminator so a custom value containing regex metacharacters is matched literally.
        $lineTerminator = preg_quote($this->getOption('lineTerminator') ?: PHP_EOL, '/');

        // delim can be anything but line breaks, quotes, alphanumeric, underscore, or the space character
        $antidelims = implode(["\r", "\n", "\w", preg_quote('"', '/'), preg_quote("'", '/'), preg_quote(chr(Sniffer::SPACE), '/')]);
        $delim = "(?P<delim>[^{$antidelims}])";
        $quote = "(?P<quoteChar>\"|'|`)"; // @todo I think MS Excel uses some strange encoding for fancy open/close quotes

        /**
         * @var string[] Patterns (regex), ordered from most to least specific
         */
        // @todo something happened when I changed to double quotes that causes this to match things like ,"0.8"\n"2", as one when it should be two
        $patterns = [
            // ,"something", - delimiter, optional space, quote, anything, same quote, same delimiter
            "/{$delim} ?{$quote}.*?\\2\\1/ms",
            // 'something', - beginning of data or line break, quote, anything, same quote, delimiter, optional space
            "/(?:^|{$lineTerminator}){$quote}.*?\\1{$delim} ?/ms",
            // ,'something' - delimiter, optional space, quote, anything, same quote, end of line
            "/{$delim} ?{$quote}.*?\\2(?:$|{$lineTerminator})/ms",
            // 'something' - beginning of line, quote, anything, same quote, end of line.
            // The quote is the ONLY capture group here, so the back-reference must be \1 (a \2 reference
            // does not compile and makes preg_match_all() emit a warning and return false).
            "/(?:^|{$lineTerminator}){$quote}.*?\\1(?:$|{$lineTerminator})/ms",
        ];

        foreach ($patterns as $pattern) {
            $found = [];
            // preg_match_all() populates $found with empty sub-arrays even when nothing matches, so rely on the
            // returned match count rather than on the truthiness of the captures array.
            if (preg_match_all($pattern, $data, $found) > 0) {
                $matches = $found;
                break;
            }
        }

        // Both named groups are required to honour the return contract. The quote-only pattern has no "delim"
        // group, so a match from it alone falls through to the exception below.
        if (!empty($matches['quoteChar']) && !empty($matches['delim'])) {
            try {
                return collect($matches)
                    // keep only the two named capture groups
                    ->kintersect(array_flip(['quoteChar', 'delim']))
                    ->map(function ($val) {
                        // NOTE(review): presumably picks the most frequent candidate; this assumes getKeyAt()
                        // positions are 1-based - confirm against the Noz collection library.
                        return collect($val)->frequency()->sort()->reverse()->getKeyAt(1);
                    })
                    // order keys as delim, quoteChar and then flip, giving [quoteChar, delim]
                    ->ksort()
                    ->reverse()
                    ->values()
                    ->toArray();
            } catch (RuntimeException $e) {
                // eat this exception and let the sniffer exception below be thrown instead...
            }
        }

        throw new SnifferException('quoteChar and delimiter cannot be determined', SnifferException::ERR_QUOTE_AND_DELIM);
    }
}
This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.
Consider making the comparison explicit by using `empty(...)` or `!empty(...)` instead.