# SPDX-FileCopyrightText: Copyright 2020-2023, Contributors to typed-dfs
# SPDX-PackageHomePage: https://github.com/dmyersturnbull/typed-dfs
# SPDX-License-Identifier: Apache-2.0
4
|
|
|
from io import StringIO |
5
|
|
|
|
6
|
|
|
import numpy as np |
7
|
|
|
import pandas as pd |
8
|
|
|
import pytest |
9
|
|
|
from lxml.etree import XMLSyntaxError # nosec |
10
|
|
|
|
11
|
|
|
from typeddfs.df_errors import NoValueError |
12
|
|
|
from typeddfs.untyped_dfs import UntypedDf |
13
|
|
|
|
14
|
|
|
from . import Ind1NonStrict as Ind1 |
15
|
|
|
from . import Ind2Col2NonStrict as Ind2Col2 |
16
|
|
|
from . import Ind2NonStrict as Ind2 |
17
|
|
|
from . import ( |
18
|
|
|
Trivial, |
19
|
|
|
logger, |
20
|
|
|
sample_data, |
21
|
|
|
sample_data_ind2_col2, |
22
|
|
|
sample_data_ind2_col2_pd_na, |
23
|
|
|
tmpfile, |
24
|
|
|
) |
25
|
|
|
|
26
|
|
|
|
27
|
|
|
class TestReadWrite:
    """Round-trip tests that write DataFrames to a format and read them back.

    Each test checks that index names and column names come back as expected
    after a write followed by the matching read.
    """

    def test_feather_lz4(self):
        """Feather with lz4 compression keeps the two index levels and the column."""
        with tmpfile(".feather") as path:
            df = Ind2.convert(Ind2(sample_data()))
            df.to_feather(path, compression="lz4")
            df2 = Ind2.read_feather(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]

    def test_feather_zstd(self):
        """Feather with zstd compression keeps the two index levels and the column."""
        with tmpfile(".feather") as path:
            df = Ind2.convert(Ind2(sample_data()))
            df.to_feather(path, compression="zstd")
            df2 = Ind2.read_feather(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]

    def test_csv_gz(self):
        """An untyped frame round-trips through a ``.csv.gz`` path with no named index."""
        with tmpfile(".csv.gz") as path:
            df = UntypedDf(sample_data())
            df.to_csv(path)
            df2 = UntypedDf.read_csv(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_untyped_read_write_csv(self):
        """Untyped CSV reads always yield plain columns, whatever index was set on write."""
        with tmpfile(".csv") as path:
            # Cover no index, a single index column, and a two-level index.
            for indices in [None, "abc", ["abc", "xyz"]]:
                df = UntypedDf(sample_data())
                if indices is not None:
                    df = df.set_index(indices)
                df.to_csv(path)
                df2 = UntypedDf.read_csv(path)
                assert list(df2.index.names) == [None]
                assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_write_passing_index(self):
        """Passing an explicit ``index`` argument to ``to_csv`` must not raise."""
        with tmpfile(".csv") as path:
            df = Trivial(sample_data())
            df.to_csv(path, index=["abc"])  # fine
            df = UntypedDf(sample_data())
            df.to_csv(path, index=["abc"])  # calls super immediately

    def test_typed_read_write_csv_noindex(self):
        """A typed frame without index columns round-trips through CSV."""
        with tmpfile(".csv") as path:
            df = Trivial(sample_data())
            df.to_csv(path)
            df2 = Trivial.read_csv(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_typed_read_write_csv_singleindex(self):
        """A typed frame with one index column round-trips through CSV."""
        with tmpfile(".csv") as path:
            df = Ind1.convert(Ind1(sample_data()))
            df.to_csv(path)
            assert df.index_names() == ["abc"]
            assert df.column_names() == ["123", "xyz"]
            df2 = Ind1.read_csv(path)
            assert df2.index_names() == ["abc"]
            assert df2.column_names() == ["123", "xyz"]

    def test_typed_read_write_csv_multiindex(self):
        """A typed frame with two index columns round-trips through CSV."""
        with tmpfile(".csv") as path:
            df = Ind2.convert(Ind2(sample_data()))
            df.to_csv(path)
            assert df.index_names() == ["abc", "xyz"]
            assert df.column_names() == ["123"]
            df2 = Ind2.read_csv(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]

    def test_parquet(self):
        """An untyped frame round-trips through Parquet with no named index."""
        with tmpfile(".parquet") as path:
            df = UntypedDf(sample_data())
            df.to_parquet(path)
            df2 = UntypedDf.read_parquet(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_records(self):
        """``from_records`` on the output of ``to_records`` returns an ``UntypedDf``."""
        df = UntypedDf(sample_data())
        records = df.to_records()
        df2 = UntypedDf.from_records(records)
        assert isinstance(df2, UntypedDf)

    def test_numeric_dtypes(self):
        """Round-trip every listed dtype through every listed format.

        On any failure the offending path and dtype are logged before the
        exception is re-raised, so the failing combination is identifiable.
        """
        dtypes = [
            bool,
            np.byte,
            np.ubyte,
            np.short,
            np.ushort,
            np.single,
            np.int32,
            np.intc,
            np.half,
            np.float16,
            np.double,
            np.float64,
            pd.StringDtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
        ]
        for suffix, fn in [
            (".parquet", "parquet"),
            (".feather", "feather"),
            (".xml", "xml"),
            (".csv", "csv"),
            (".tsv", "tsv"),
            (".json", "json"),
            (".xlsx", "xlsx"),
            (".xls", "xls"),
            (".xlsb", "xlsb"),
            (".ods", "ods"),
            (".pickle", "pickle"),
        ]:
            # One temporary path per format, reused for every dtype.
            with tmpfile(suffix) as path:
                for dtype in dtypes:
                    try:
                        df = Ind2Col2.convert(Ind2Col2(sample_data_ind2_col2())).astype(dtype)
                        assert list(df.index.names) == ["qqq", "rrr"]
                        assert list(df.columns) == ["abc", "xyz"]
                        # Resolve the writer/reader pair by name, e.g. to_csv / read_csv.
                        getattr(df, "to_" + fn)(path)
                        df2 = getattr(Ind2Col2, "read_" + fn)(path)
                        assert list(df2.index.names) == ["qqq", "rrr"]
                        assert list(df2.columns) == ["abc", "xyz"]
                    except Exception:
                        logger.error(f"Failed on path {path}, dtype {dtype}")
                        raise

    def test_numeric_nullable_dtypes(self):
        """Round-trip pandas nullable dtypes through every listed format.

        Uses the ``pd.NA``-containing sample data. On any failure the offending
        path and dtype are logged before the exception is re-raised.
        """
        # Each dtype is listed once; repeating an entry only repeats the same check.
        dtypes = [
            pd.StringDtype(),
            pd.BooleanDtype(),
            pd.Float64Dtype(),
            pd.Float32Dtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
        ]
        for suffix, fn in [
            (".parquet", "parquet"),
            (".feather", "feather"),
            (".csv", "csv"),
            (".tsv", "tsv"),
            (".json", "json"),
            (".xlsx", "xlsx"),
            (".xls", "xls"),
            (".xlsb", "xlsb"),
            (".ods", "ods"),
            (".pickle", "pickle"),
            (".xml", "xml"),
        ]:
            for dtype in dtypes:
                # A fresh temporary path for every (format, dtype) pair.
                with tmpfile(suffix) as path:
                    try:
                        df = Ind2Col2.convert(Ind2Col2(sample_data_ind2_col2_pd_na())).astype(dtype)
                        assert list(df.index.names) == ["qqq", "rrr"]
                        assert list(df.columns) == ["abc", "xyz"]
                        getattr(df, "to_" + fn)(path)
                        df2 = getattr(Ind2Col2, "read_" + fn)(path)
                        assert list(df2.index.names) == ["qqq", "rrr"]
                        assert list(df2.columns) == ["abc", "xyz"]
                    except Exception:
                        logger.error(f"Failed on path {path}, dtype {dtype}")
                        raise

    """
    # TODO: waiting for upstream: https://github.com/dmyersturnbull/typed-dfs/issues/46
    def test_raw_to_xml(self):
        dtypes = [
            pd.StringDtype(),
            pd.BooleanDtype(),
            pd.Float64Dtype(),
            pd.Float32Dtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
        ]
        data = [
            pd.Series({"abc": 1, "xyz": pd.NA}),
            pd.Series({"abc": pd.NA, "xyz": 0}),
        ]
        failed = {}
        for dtype in dtypes:
            df = pd.DataFrame(data).astype(dtype)
            try:
                df.to_xml()
            except TypeError as e:
                logger.error(dtype, exc_info=True)
                failed[str(dtype)] = str(e)
        # `failed` is a dict, so compare against an empty dict (not a list).
        assert failed == {}, f"Failed on dtypes: {failed}"
    """

    def test_xml(self):
        """Round-trip an untyped frame through a ``.xml.gz`` path.

        NOTE(review): despite the name and suffix, this calls ``to_csv`` and
        ``read_csv``, so it exercises CSV I/O rather than XML. Confirm whether
        ``to_xml``/``read_xml`` was intended before changing it; the sample
        column ``"123"`` may not be usable as an XML element name.
        """
        with tmpfile(".xml.gz") as path:
            df = UntypedDf(sample_data())
            df.to_csv(path)
            df2 = UntypedDf.read_csv(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_html_untyped(self):
        """An untyped frame round-trips through HTML with no named index."""
        with tmpfile(".html") as path:
            df = UntypedDf(sample_data())
            df.to_html(path)
            df2 = UntypedDf.read_html(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_html_singleindex(self):
        """A typed frame with one index column round-trips through HTML."""
        with tmpfile(".html") as path:
            df = Ind1.convert(Ind1(sample_data()))
            df.to_html(path)
            assert df.index_names() == ["abc"]
            assert df.column_names() == ["123", "xyz"]
            df2 = Ind1.read_html(path)
            assert df2.index_names() == ["abc"]
            assert df2.column_names() == ["123", "xyz"]

    def test_html_multiindex(self):
        """A typed frame with two index columns round-trips through HTML."""
        with tmpfile(".html") as path:
            df = Ind2.convert(Ind2(sample_data()))
            df.to_html(path)
            assert df.index_names() == ["abc", "xyz"]
            assert df.column_names() == ["123"]
            df2 = Ind2.read_html(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]

    def test_html_invalid(self):
        """Reading an empty file as HTML raises ``XMLSyntaxError``."""
        with tmpfile(".html") as path:
            path.write_text("", encoding="utf-8")
            with pytest.raises(XMLSyntaxError):
                UntypedDf.read_html(path)

    def test_html_empty(self):
        """Reading HTML that contains no table raises ``NoValueError``."""
        with tmpfile(".html") as path:
            path.write_text("<html></html>", encoding="utf-8")
            with pytest.raises(NoValueError):
                UntypedDf.read_html(path)

    def test_read_toml(self):
        """A single TOML array-of-tables entry becomes one row; comments are ignored."""
        data = "\n".join(
            [
                "",
                "[[row]]",
                "# a comment",
                'key = "value"',
                "",
            ]
        )
        s = StringIO(data)
        df = UntypedDf.read_toml(s)
        assert df.column_names() == ["key"]
        assert df.values.tolist() == [["value"]]

    def test_read_toml_jagged(self):
        """Rows with different key sets are combined; missing cells are fillable."""
        data = "\n".join(
            [
                "",
                "[[row]]",
                'key = "value1"',
                "[[row]]",
                'key = "value2"',
                'kitten = "elephant"',
                "cuteness = 10.3",
                "",
            ]
        )
        s = StringIO(data)
        df = UntypedDf.read_toml(s)
        assert df.column_names() == ["key", "kitten", "cuteness"]
        # Replace the missing cells of the first row so the lists compare cleanly.
        xx = df.fillna(0).values.tolist()
        assert xx == [["value1", 0, 0], ["value2", "elephant", 10.3]]

    def test_read_ini(self):
        """INI text is read as key/value rows, with keys prefixed by their section."""
        data = "\n".join(
            [
                "",
                "[section]",
                "; a comment",
                "key = value",
                "",
            ]
        )
        s = StringIO(data)
        df = UntypedDf.read_ini(s)
        assert df.column_names() == ["key", "value"]
        assert df.values.tolist() == [["section.key", "value"]]

    def test_read_properties(self):
        """Properties text round-trips, including escaped ``:`` and ``\\`` characters."""
        # Raw strings keep the backslash escapes exactly as they appear in the file format.
        data = "\n".join(
            [
                "",
                "[section]",
                "# a comment",
                "! another comment",
                r"k\:e\\y = v:a\\lue",
                "",
            ]
        )
        s = StringIO(data)
        df = UntypedDf.read_properties(s)
        assert df.column_names() == ["key", "value"]
        assert df.values.tolist() == [[r"section.k:e\y", r"v:a\lue"]]
        # Write back out and check that the escapes are re-applied.
        written: str = df.to_properties()
        lines = [line.strip() for line in written.splitlines()]
        assert "[section]" in lines
        assert r"k\:e\\y = v:a\\lue" in lines
        # Reading the written text must reproduce the same rows.
        df2 = UntypedDf.read_properties(StringIO(written))
        assert df2.values.tolist() == df.values.tolist()

    """
    # TODO re-enable when we get tables 3.9 wheels on Windows
    # NOTE(review): TypedMultiIndex is not imported in this module; Ind2 looks
    # like the equivalent two-index type used by the other tests -- confirm
    # before re-enabling.
    def test_hdf(self):
        with tmpfile(".h5") as path:
            df = TypedMultiIndex.convert(TypedMultiIndex(sample_data()))
            df.to_hdf(path)
            df2 = TypedMultiIndex.read_hdf(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]
    """
353
|
|
|
|
354
|
|
|
|
355
|
|
|
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly returns
    # a non-zero status when tests fail (the return value was previously dropped).
    raise SystemExit(pytest.main())
357
|
|
|
|