1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
4
|
|
|
* |
5
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
6
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
7
|
|
|
* suck out of working with CSV. |
8
|
|
|
* |
9
|
|
|
* @copyright Copyright (c) 2018 Luke Visinoni |
10
|
|
|
* @author Luke Visinoni <[email protected]> |
11
|
|
|
* @license See LICENSE file (MIT license) |
12
|
|
|
*/ |
13
|
|
|
namespace CSVelte\Sniffer; |
14
|
|
|
|
15
|
|
|
use CSVelte\Sniffer; |
16
|
|
|
use CSVelte\Exception\SnifferException; |
17
|
|
|
use RuntimeException; |
18
|
|
|
|
19
|
|
|
use function Noz\collect; |
20
|
|
|
|
21
|
|
|
class SniffQuoteAndDelimByAdjacency extends AbstractSniffer
{
    /**
     * Guess quote and delimiter character(s)
     *
     * If there are quoted values within the data, it is often easiest to guess the quote and delimiter characters at
     * the same time by analyzing their adjacency to one another. That is to say, in cases where certain values are
     * wrapped in quotes, it can often be determined not only what that quote character is, but also the delimiter,
     * because it is often on either side of the quote character.
     *
     * Several patterns are tried in order and the first one that matches anything is used. The most frequently
     * captured quote character and delimiter from that pattern are returned.
     *
     * @param string $data The data to analyze
     *
     * @throws SnifferException If a quote character and a delimiter cannot both be determined
     *
     * @return string[] The quote character followed by the delimiter
     */
    public function sniff($data)
    {
        // The line terminator is embedded in the patterns as literal text, so it is escaped the same way as the
        // other literal characters below.
        $lineTerminator = preg_quote($this->getOption('lineTerminator') ?: PHP_EOL, '/');

        // delim can be anything but line breaks, word characters (alphanumeric or underscore), single/double quotes,
        // or the character identified by Sniffer::SPACE
        $antidelims = implode([
            "\r",
            "\n",
            "\w",
            preg_quote('"', '/'),
            preg_quote("'", '/'),
            preg_quote(chr(Sniffer::SPACE), '/'),
        ]);
        $delim = "(?P<delim>[^{$antidelims}])";
        $quote = "(?P<quoteChar>\"|'|`)"; // @todo I think MS Excel uses some strange encoding for fancy open/close quotes

        // @todo something happened when I changed to double quotes that causes this to match things like
        //       ,"0.8"\n"2", as one when it should be two
        $patterns = [
            // ,"something", - delimiter, optional space, quote, anything, same quote, same delimiter
            "/{$delim} ?{$quote}.*?\\2\\1/ms",
            // 'something', - start of data or line break, quote, anything, same quote, delimiter, optional space
            "/(?:^|{$lineTerminator}){$quote}.*?\\1{$delim} ?/ms",
            // ,'something' - delimiter, optional space, quote, anything, same quote, end of data or line break
            "/{$delim} ?{$quote}.*?\\2(?:$|{$lineTerminator})/ms",
            // 'something' - start of data or line break, quote, anything, same quote, end of data or line break.
            // The quote is the only capture group here, so the backreference must be \1; a \2 reference points at
            // a group that does not exist and makes the pattern fail to compile.
            "/(?:^|{$lineTerminator}){$quote}.*?\\1(?:$|{$lineTerminator})/ms",
        ];

        $matches = [];
        $matched = false;
        foreach ($patterns as $pattern) {
            // preg_match_all() fills $matches with (possibly empty) sub-arrays even when nothing matches, so the
            // match count is the only reliable signal; false (error) also compares as "no match" here.
            if (preg_match_all($pattern, $data, $matches) > 0) {
                $matched = true;
                break;
            }
        }

        // Both named groups must have captured something. The quote-only pattern above has no "delim" group, so a
        // match from it alone cannot satisfy the [quoteChar, delim] return shape and falls through to the exception.
        if ($matched && !empty($matches['quoteChar']) && !empty($matches['delim'])) {
            try {
                return collect($matches)
                    ->kintersect(array_flip(['quoteChar', 'delim']))
                    ->map(function ($val) {
                        // most frequently captured character for this group
                        return collect($val)->frequency()->sort()->reverse()->getKeyAt(1);
                    })
                    // keys sorted then reversed, so "quoteChar" comes before "delim"
                    ->ksort()
                    ->reverse()
                    ->values()
                    ->toArray();
            } catch (RuntimeException $e) {
                // eat this exception and let the sniffer exception below be thrown instead...
            }
        }

        throw new SnifferException('quoteChar and delimiter cannot be determined', SnifferException::ERR_QUOTE_AND_DELIM);
    }
}
This check flags implicit conversions of arrays to boolean values in a condition. In PHP an empty array is loosely equal (but not identical) to false, which is not always apparent to the reader.
Consider making the check explicit by using `empty(...)` or `!empty(...)` instead.