1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* CSVelte: Slender, elegant CSV for PHP |
4
|
|
|
* |
5
|
|
|
* Inspired by Python's CSV module and Frictionless Data and the W3C's CSV |
6
|
|
|
* standardization efforts, CSVelte was written in an effort to take all the |
7
|
|
|
* suck out of working with CSV. |
8
|
|
|
* |
9
|
|
|
* @copyright Copyright (c) 2018 Luke Visinoni |
10
|
|
|
* @author Luke Visinoni <[email protected]> |
11
|
|
|
* @license See LICENSE file (MIT license) |
12
|
|
|
*/ |
13
|
|
|
namespace CSVelte\Sniffer; |
14
|
|
|
|
15
|
|
|
use CSVelte\Dialect; |
16
|
|
|
use CSVelte\Reader; |
17
|
|
|
use Noz\Collection\Collection; |
18
|
|
|
|
19
|
|
|
use function CSVelte\to_stream; |
20
|
|
|
use function Noz\collect; |
21
|
|
|
use function Stringy\create as s; |
22
|
|
|
|
23
|
|
|
/**
 * Sniffer that guesses whether CSV data starts with a header row.
 *
 * The guess is made by classifying every field into a coarse "type" (see getType()) and checking whether the
 * first row looks different from the rows that follow it.
 */
class SniffHeaderByDataType extends AbstractSniffer
{
    /**
     * Guess whether there is a header row
     *
     * Parses $data as CSV using the "delimiter" option (with no header), then compares each field of the first row
     * with the same column in up to ten of the rows that follow it. A column counts as a vote for "has header" when:
     *
     *  - every compared row has the same type in that column and the first row's type differs from it, or
     *  - otherwise, the first row's field length is more than one standard deviation away from the mean field
     *    length of that column.
     *
     * The result is true when more columns vote for a header than against one.
     *
     * @note Unlike the original version of this method, this one will be used to ALSO determine HOW MANY header rows
     *       there likely are. So, compare the header to rows at the END of the sample.
     *
     * NOTE(review): the note above talks about rows at the END of the sample, but the code compares against the
     * FIRST ten rows after the header (slice(0, 10)). Confirm which of the two is intended.
     *
     * @param string $data The data to analyze
     *
     * @return bool
     */
    public function sniff($data)
    {
        $delimiter = $this->getOption('delimiter');

        // Describes a single field: its raw value, its coarse type and its string length.
        $describe = function($field) {
            return [
                'value'  => $field,
                'type'   => $this->getType($field),
                'length' => s($field)->length()
            ];
        };

        $reader = new Reader(to_stream($data), new Dialect(['delimiter' => $delimiter, 'header' => false]));
        $rows   = collect($reader->toArray());

        // The first row is the header candidate; it is removed from $rows here.
        $candidate = collect($rows->shift())->map($describe);

        // get rid of the last line because it may be incomplete
        $rows->pop();

        // At most ten rows are used as the comparison sample.
        $sample = $rows->slice(0, 10)->map(function($fields) use ($describe) {
            return array_map($describe, $fields);
        });

        // One vote (1 = looks like a header field, 0 = does not) per column of the candidate row.
        $votes = $candidate->map(function($field, $column) use ($sample) {
            $columnInfo = $sample->getColumn($column);

            // Type test: the sample agrees on a single type and the candidate's type is different.
            $distinctTypes = $columnInfo->getColumn('type')->distinct();
            if ($distinctTypes && $distinctTypes->count() == 1) {
                if ($distinctTypes->getValueAt(1) != $field['type']) {
                    return 1;
                }
            }

            // Length test: use standard deviation to determine if header is wildly different length than others
            $lengths = $columnInfo->getColumn('length');
            $mean    = $lengths->average();
            $variance = $lengths->map(function ($len) use ($mean) {
                return pow($len - $mean, 2);
            })->average();
            $deviation = sqrt($variance);

            if (abs($field['length'] - $mean) > $deviation) {
                return 1;
            }

            return 0;
        });

        /**
         * @var Collection $forHeader
         * @var Collection $againstHeader
         */
        list($forHeader, $againstHeader) = $votes->partition(function($vote) {
            return (bool) $vote;
        });

        return $forHeader->count() > $againstHeader->count();
    }

    /**
     * Get string's type
     *
     * Returns one of a handful of "types": numeric, blank, email, url, datetime, currency, identifier, text_short,
     * text_long, alnum, json or other. The first matching check wins, so the order of the checks matters.
     *
     * @param string $value A string to get the type of
     *
     * @return string
     */
    protected function getType($value)
    {
        $str = s($value);
        switch (true) {
            case is_numeric($value):
                return 'numeric';
            // note - the order of these is important, do not change unless you know what you're doing
            case is_string($value):
                // Blank is tested first: a blank value would otherwise be claimed by one of the checks below
                // (e.g. the text_short pattern accepts spaces) and 'blank' could never be returned.
                if ($str->isBlank()) {
                    return 'blank';
                }
                if (preg_match('/^([a-z0-9_\.-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$/i', $value)) {
                    return 'email';
                }
                // The path part is a single repeated character class; nesting it in a second quantifier
                // matches exactly the same strings but can backtrack exponentially on non-matching input.
                if (preg_match('/^(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})[\/\w \.-]*\/?$/i', $value)) {
                    return 'url';
                }
                if (strtotime($value) !== false) {
                    return 'datetime';
                }
                // The "u" modifier is required so that the multibyte symbols inside the character class are
                // matched as whole characters instead of single bytes. The fractional part is optional so that
                // whole amounts such as "$5" are recognised too.
                if (preg_match('/^[+-]?[¥£€$]\d+(\.\d+)?$/u', $value)) {
                    return 'currency';
                }
                if (preg_match('/^[a-z0-9_-]{1,35}$/i', $value)) {
                    return 'identifier';
                }
                if (preg_match('/^[a-z0-9 _\/&\(\),\.?\'!-]{1,50}$/i', $value)) {
                    return 'text_short';
                }
                // NOTE(review): strings of 51-99 characters match neither text_short nor text_long and fall
                // through to the checks below - confirm whether this gap is intentional.
                if (preg_match('/^[a-z0-9 _\/&\(\),\.?\'!-]{100,}$/i', $value)) {
                    return 'text_long';
                }
                if ($str->isAlphanumeric()) {
                    return 'alnum';
                }
                if ($str->isJson()) {
                    return 'json';
                }
        }

        return 'other';
    }
}
146
|
|
|
|
If you return a value from a function or method, it should be a sub-type of the type declared by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle is also one of the SOLID principles of object-oriented design.
Let’s take a look at an example:
Our function `my_function` expects a `Post` object, and outputs the author of the post. The base class `Post` returns a simple string, and outputting a simple string will work just fine. However, the child class `BlogPost`, which is a sub-type of `Post`, instead returns an `object`, and is therefore violating the SOLID principles. If a `BlogPost` were passed to `my_function`, PHP would not complain, but would ultimately fail when executing the `strtoupper` call in its body.