1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
4
|
|
|
* |
5
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
6
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
7
|
|
|
* suck out of working with CSV. |
8
|
|
|
* |
9
|
|
|
* @copyright Copyright (c) 2018 Luke Visinoni |
10
|
|
|
* @author Luke Visinoni <[email protected]> |
11
|
|
|
* @license See LICENSE file (MIT license) |
12
|
|
|
*/ |
13
|
|
|
namespace CSVelte\Sniffer; |
14
|
|
|
|
15
|
|
|
use function Noz\collect; |
16
|
|
|
use function Stringy\create as s; |
17
|
|
|
use Stringy\Stringy; |
18
|
|
|
|
19
|
|
|
class SniffHeaderByDataType extends AbstractSniffer
{
    /**
     * Guess whether there is a header row
     *
     * The first line of the sample is treated as the candidate header. Each of
     * its cells is reduced to a type label (see getType()) and a character
     * length, and those are compared, column by column, with the cells of the
     * lines that follow it. Every compared cell casts a vote of +1 or -1 and
     * the method reports a header when the votes add up to more than zero.
     *
     * The last line of the sample is discarded before comparing because it may
     * have been cut off mid-record. The comparison set is `slice(0, 10)` of the
     * remaining lines, i.e. the lines directly after the header (presumably at
     * most ten of them -- confirm against the collection's slice() signature).
     *
     * NOTE(review): a cell votes +1 when its type label EQUALS the header's
     * label, or when its length equals the header's length, and -1 otherwise.
     * That is the opposite direction from Python's csv.Sniffer.has_header,
     * which this class says it is modelled on. The voting rule is left exactly
     * as it was; confirm whether it is intentional.
     *
     * @param string $data The data to analyze
     *
     * @return bool True when the votes favour a header row, false otherwise
     */
    public function sniff($data)
    {
        $delimiter = $this->getOption('delimiter');

        // Break the sample into lines and pass each one through
        // replaceQuotedSpecialChars() (inherited; presumably it masks delimiters
        // that sit inside quoted cells -- confirm) so the split below is safe.
        $lines = collect(s($data)->lines())
            ->map(function ($line) use ($delimiter) {
                return s($this->replaceQuotedSpecialChars($line, $delimiter));
            });

        // The first line is the candidate header: record type + length per cell.
        $header = collect($lines->shift()->split($delimiter))
            ->map(function ($val) {
                return $this->unQuote($val);
            })
            ->map(function ($val) {
                return [
                    'type'   => $this->getType($val),
                    'length' => s($val)->length(),
                ];
            });

        $lines->pop(); // get rid of the last line because it may be incomplete

        $comparison = $lines->slice(0, 10)
            ->map(function ($line) use ($header, $delimiter) {
                /** @var Stringy $line */
                $values = collect($line->split($delimiter));

                return $values->map(function ($str, $pos) use ($header) {
                    $comp = $header->get($pos);
                    if ($comp === null) {
                        // This row has more cells than the header. Use a
                        // placeholder instead of indexing into null: the type
                        // can never match and the length difference is simply
                        // the negated cell length, which is what the previous
                        // null arithmetic produced, minus the runtime notice.
                        $comp = ['type' => null, 'length' => 0];
                    }

                    // NOTE(review): unlike the header cells, $str is not passed
                    // through unQuote() and is handed to getType() exactly as
                    // split() returned it. If split() yields Stringy objects,
                    // getType() falls through to 'unknown' for every data cell.
                    // Confirm whether that is intended before relying on 'type'.
                    $type = $this->getType($str);

                    return [
                        // true if same, false otherwise
                        'type'   => $comp['type'] === $type,
                        // difference in length (header minus cell)
                        'length' => $comp['length'] - s($str)->length(),
                    ];
                });
            });

        // Collect one vote per compared cell, then sum them.
        $hasHeader = collect();
        $comparison->each(function ($line) use ($hasHeader) {
            foreach ($line as $val) {
                if ($val['type'] || $val['length'] === 0) {
                    $hasHeader->add(1);
                } else {
                    $hasHeader->add(-1);
                }
            }
        });

        return $hasHeader->sum() > 0;
    }

    /**
     * Classify a single cell value.
     *
     * Checks run in a fixed order and the first match wins: numeric, datetime
     * (anything strtotime() can parse), currency, alpha, alnum, blank, json.
     * Values that are neither numeric nor native strings, and strings that
     * match none of the checks, are reported as 'unknown'.
     *
     * @param mixed $value The cell value to classify
     *
     * @return string One of 'numeric', 'datetime', 'currency', 'alpha',
     *                'alnum', 'blank', 'json' or 'unknown'
     */
    protected function getType($value)
    {
        // Stringy wrapper used for the character-class checks further down.
        $str = s($value);

        if (is_numeric($value)) {
            return 'numeric';
        }

        if (!is_string($value)) {
            // Only native strings are inspected any further.
            return 'unknown';
        }

        if (strtotime($value) !== false) {
            return 'datetime';
        }

        // The /u modifier is required: without it the multi-byte symbols in the
        // character class are matched one byte at a time, so only "$" could
        // ever be recognised as a currency sign.
        if (preg_match('/^[+-]?[¥£€$]\d+(\.\d+)$/u', $value)) {
            return 'currency';
        }

        if ($str->isAlpha()) {
            return 'alpha';
        }

        if ($str->isAlphanumeric()) {
            return 'alnum';
        }

        if ($str->isBlank()) {
            return 'blank';
        }

        if ($str->isJson()) {
            return 'json';
        }

        return 'unknown';
    }
}
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.