1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace frictionlessdata\tableschema; |
4
|
|
|
use frictionlessdata\tableschema\Exceptions\DataSourceException; |
5
|
|
|
|
6
|
|
|
/**
 * Class for working with Csv Dialect / RFC4180 conforming csv files.
 *
 * It doesn't handle all the functionality - but validates the dialect construct
 * and parses csv rows according to the dialect.
 *
 * The following requirements should be handled externally by the calling code;
 * currently this class is focused on parsing a single row, so anything involving
 * first / last rows is not handled.
 *
 * RFC4180:
 *  - The last record in the file may or may not have an ending line break.
 *  - There may be an optional header line appearing as the first line of the file
 *    with the same format as normal record lines. This header will contain names
 *    corresponding to the fields in the file and should contain the same number of
 *    fields as the records in the rest of the file (the presence or absence of the
 *    header line should be indicated via the optional "header" parameter of this
 *    MIME type).
 *  - Each line should contain the same number of fields throughout the file.
 *
 * Tabular Data requirements:
 *  - File encoding must be either UTF-8 (the default) or include encoding property.
 *  - If the CSV differs from this or the RFC in any other way regarding dialect
 *    (e.g. line terminators, quote characters, field delimiters), the Tabular Data
 *    Resource MUST contain a dialect property describing its dialect.
 *    The dialect property MUST follow the CSV Dialect specification.
 */
class CsvDialect
{
    /**
     * The effective dialect: the caller supplied options merged over the defaults.
     *
     * @var array
     */
    public $dialect;

    /**
     * Builds the effective dialect and validates the options this class relies on.
     *
     * @param array|object|null $dialect dialect options; anything that is not null is cast
     *                                   to an array and overrides the defaults key by key
     *
     * @throws \Exception when lineTerminator is not one of "\r\n", "\n\r", "\n", "\r",
     *                    when delimiter is not exactly one character,
     *                    or when a nullSequence is given (not supported)
     */
    public function __construct($dialect = null)
    {
        $defaultDialect = [
            // specifies the character sequence which should separate fields (aka columns). Default = ,
            "delimiter" => ",",
            // specifies the character sequence which should terminate rows. Default = \r\n
            "lineTerminator" => "\r\n",
            // specifies a one-character string to use as the quoting character. Default = "
            "quoteChar" => '"',
            // controls the handling of quotes inside fields. If true, two consecutive quotes should be interpreted as one.
            // Default = true
            "doubleQuote" => true,
            // specifies a one-character string to use for escaping (for example, \), mutually exclusive with quoteChar.
            // Not set by default
            "escapeChar" => null,
            // specifies the null sequence (for example \N). Not set by default
            "nullSequence" => null,
            // specifies how to interpret whitespace which immediately follows a delimiter;
            // if false, it means that whitespace immediately after a delimiter should be treated as part of the following field.
            // Default = true
            "skipInitialSpace" => true,
            // indicates whether the file includes a header row. If true the first row in the file is a header row, not data.
            // Default = true
            "header" => true,
            // indicates that case in the header is meaningful. For example, columns CAT and Cat should not be equated.
            // Default = false
            "caseSensitiveHeader" => false,
            // a number, in n.n format, e.g., 1.0. If not present, consumers should assume latest schema version.
            "csvddfVersion" => null,
        ];
        $overrides = ($dialect === null) ? [] : (array) $dialect;
        $this->dialect = array_merge($defaultDialect, $overrides);

        // strict comparison: a loose in_array() would let values such as true or 0 slip through
        if (!in_array($this->dialect["lineTerminator"], ["\r\n", "\n\r", "\n", "\r"], true)) {
            // we rely on PHP stream functions which make it a bit harder to support other line terminators
            // TODO: support custom lineTerminator
            throw new \Exception("custom lineTerminator is not supported");
        }
        // parseRow() compares multibyte characters, so the delimiter length is measured
        // in characters as well (a byte count would reject a single multibyte delimiter)
        if (mb_strlen((string) $this->dialect["delimiter"]) !== 1) {
            throw new \Exception("delimiter must be a single char");
        }
        if ($this->dialect["nullSequence"] !== null) {
            throw new \Exception("custom nullSequence is not supported");
        }
    }

    /**
     * Parses a single csv line into a list of field values according to the dialect.
     *
     * RFC4180 rules applied here:
     *  - Each record is located on a separate line, delimited by a line break;
     *    a trailing "\r" / "\n" sequence is stripped before parsing.
     *  - Fields are separated by the delimiter; the last field in the record
     *    must not be followed by a delimiter.
     *  - Each field may or may not be enclosed in quotes. If a field is not enclosed
     *    with quotes, then quotes may not appear inside the field.
     *  - Fields containing quotes or delimiters should be enclosed in quotes.
     *  - With doubleQuote enabled, a quote inside an enclosed field is written as two quotes.
     *  - With an escapeChar configured, the character following it inside an enclosed
     *    field is taken literally.
     *
     * A quoted field that is still open at the end of the line is returned as-is
     * (fields spanning several lines are not supported by this single-row parser).
     *
     * When skipInitialSpace is enabled, leading whitespace is removed from every
     * returned field after parsing.
     *
     * @param string $line a single csv line, with or without its line terminator
     *
     * @return string[] the field values, in order; an empty line yields an empty array
     *
     * @throws DataSourceException when the escape char is the last character of the line
     * @throws \Exception          when a quote appears inside a non-enclosed field, when an
     *                             enclosed field is followed by anything but a delimiter,
     *                             or when the line ends with a delimiter
     */
    public function parseRow($line)
    {
        $line = rtrim($line, "\r\n");
        $chars = $this->splitChars($line);
        $lastPos = count($chars) - 1;

        // normalise the dialect options once, so the loop can use strict comparisons
        $delimiter = (string) $this->dialect["delimiter"];
        $quoteChar = (string) $this->dialect["quoteChar"];
        if ($quoteChar === "") {
            $quoteChar = null; // no quote char configured - never matches a character
        }
        $escapeChar = (string) $this->dialect["escapeChar"];
        if ($escapeChar === "") {
            $escapeChar = null; // no escape char configured
        }
        $doubleQuote = (bool) $this->dialect["doubleQuote"];

        // parser states:
        //   start    - at the beginning of the line or right after a delimiter
        //   quoted   - inside a field enclosed in quotes
        //   unquoted - inside a field that is not enclosed in quotes
        //   closed   - right after the closing quote of an enclosed field
        $state = "start";
        $fields = [];
        $current = -1;
        // true while the last consumed character was a delimiter
        $afterDelimiter = false;

        for ($pos = 0; $pos <= $lastPos; $pos++) {
            $char = $chars[$pos];
            $next = ($pos < $lastPos) ? $chars[$pos + 1] : null;

            switch ($state) {
                case "start":
                    // every character seen here opens a new field, possibly an empty one
                    $current++;
                    $fields[$current] = "";
                    $afterDelimiter = false;
                    if ($char === $delimiter) {
                        // empty field (leading delimiter or two consecutive delimiters)
                        $afterDelimiter = true;
                    } elseif ($char === $quoteChar) {
                        $state = "quoted";
                    } else {
                        $fields[$current] .= $char;
                        $state = "unquoted";
                    }
                    break;

                case "quoted":
                    if ($doubleQuote && $char === $quoteChar && $next === $quoteChar) {
                        // doubled quote - emit one quote and skip the second one,
                        // so it can not be mistaken for a closing quote later on
                        $fields[$current] .= $quoteChar;
                        $pos++;
                    } elseif ($escapeChar !== null && $char === $escapeChar) {
                        if ($next === null) {
                            throw new DataSourceException("Encountered escape char at end of line");
                        }
                        // take the escaped character literally and skip over it
                        $fields[$current] .= $next;
                        $pos++;
                    } elseif ($char === $quoteChar) {
                        // a quote signifying the end of the enclosed field
                        $state = "closed";
                    } else {
                        $fields[$current] .= $char;
                    }
                    break;

                case "unquoted":
                    if ($char === $quoteChar) {
                        // non enclosed field - cannot have quotes
                        throw new \Exception("Invalid csv file - if field is not enclosed with double quotes - then double quotes may not appear inside the field");
                    }
                    if ($char === $delimiter) {
                        $state = "start";
                        $afterDelimiter = true;
                    } else {
                        $fields[$current] .= $char;
                    }
                    break;

                default:
                    // "closed": only a delimiter may follow the closing quote
                    if ($char !== $delimiter) {
                        throw new \Exception("Invalid csv file - enclosed field must be followed by a delimiter or the end of the line");
                    }
                    $state = "start";
                    $afterDelimiter = true;
                    break;
            }
        }

        // only a real trailing delimiter is an error; an explicitly quoted empty last field is fine
        if ($afterDelimiter) {
            throw new \Exception("Invalid csv file - line must not end with a comma");
        }

        if ($this->dialect["skipInitialSpace"]) {
            return array_map(static function ($value) {
                return ltrim($value);
            }, $fields);
        }

        return $fields;
    }

    /**
     * Splits a string into a list of (multibyte) characters in a single pass.
     *
     * Uses mb_str_split() where the runtime provides it and falls back to
     * mb_strlen() / mb_substr() otherwise.
     *
     * @param string $line
     *
     * @return string[]
     */
    private function splitChars($line)
    {
        if ($line === "") {
            return [];
        }
        if (function_exists("mb_str_split")) {
            return mb_str_split($line);
        }
        $chars = [];
        $length = mb_strlen($line);
        for ($i = 0; $i < $length; $i++) {
            $chars[] = mb_substr($line, $i, 1);
        }

        return $chars;
    }
}
207
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.