tests.test_io   B

Complexity

Total Complexity 49

Size/Duplication

Total Lines 357
Duplicated Lines 0%

Importance

Changes 0

Metric Value
eloc 258
dl 0
loc 357
rs 8.48
c 0
b 0
f 0
wmc 49

22 Methods

Rating   Name   Duplication   Size   Complexity  
A TestReadWrite.test_feather_zstd() 0 7 2
A TestReadWrite.test_parquet() 0 7 2
A TestReadWrite.test_typed_read_write_csv_multiindex() 0 9 2
B TestReadWrite.test_numeric_dtypes() 0 50 5
A TestReadWrite.test_csv_gz() 0 7 2
A TestReadWrite.test_feather_lz4() 0 7 2
A TestReadWrite.test_typed_read_write_csv_noindex() 0 7 2
A TestReadWrite.test_records() 0 5 1
A TestReadWrite.test_typed_read_write_csv_singleindex() 0 9 2
A TestReadWrite.test_write_passing_index() 0 6 2
A TestReadWrite.test_untyped_read_write_csv() 0 10 4
A TestReadWrite.test_html_empty() 0 5 3
A TestReadWrite.test_html_multiindex() 0 9 2
A TestReadWrite.test_html_invalid() 0 5 3
A TestReadWrite.test_html_singleindex() 0 9 2
A TestReadWrite.test_html_untyped() 0 7 2
B TestReadWrite.test_numeric_nullable_dtypes() 0 42 5
A TestReadWrite.test_read_properties() 0 18 1
A TestReadWrite.test_read_toml_jagged() 0 14 1
A TestReadWrite.test_read_ini() 0 10 1
A TestReadWrite.test_read_toml() 0 10 1
A TestReadWrite.test_xml() 0 7 2

How to fix Complexity

Complex classes like tests.test_io often do many different things. To break such a class down, identify a cohesive component within it. A common way to find such a component is to look for fields or methods that share the same prefix or suffix.

Once you have determined which fields and methods belong together, apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster. A sketch of such a split is shown below.
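In tests.test_io, the method names already suggest cohesive groups: the test_html_* methods and the test_read_toml / test_read_ini / test_read_properties methods each share a prefix. The following sketch is illustrative only; the class names TestHtml and TestConfigFormats are hypothetical, and it assumes the same helpers (UntypedDf, sample_data, tmpfile) that tests/test_io.py already imports.

# Hypothetical split of TestReadWrite along method-name prefixes (Extract Class).
# Class names are illustrative; the helpers mirror those imported by tests/test_io.py.
from io import StringIO

from typeddfs.untyped_dfs import UntypedDf

from . import sample_data, tmpfile


class TestHtml:
    """HTML round-trip tests moved out of TestReadWrite (the test_html_* prefix)."""

    def test_html_untyped(self):
        with tmpfile(".html") as path:
            df = UntypedDf(sample_data())
            df.to_html(path)
            df2 = UntypedDf.read_html(path)
            assert set(df2.columns) == {"abc", "123", "xyz"}


class TestConfigFormats:
    """Config-format parsing tests moved out of TestReadWrite (the test_read_* prefix)."""

    def test_read_toml(self):
        data = '[[row]]\nkey = "value"\n'
        df = UntypedDf.read_toml(StringIO(data))
        assert df.column_names() == ["key"]
        assert df.values.tolist() == [["value"]]

Splitting along these prefixes lowers the per-class wmc without changing test behavior, since the methods only share module-level fixtures.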

# SPDX-FileCopyrightText: Copyright 2020-2023, Contributors to typed-dfs
# SPDX-PackageHomePage: https://github.com/dmyersturnbull/typed-dfs
# SPDX-License-Identifier: Apache-2.0
from io import StringIO

import numpy as np
import pandas as pd
import pytest
from lxml.etree import XMLSyntaxError  # nosec

from typeddfs.df_errors import NoValueError
from typeddfs.untyped_dfs import UntypedDf

from . import Ind1NonStrict as Ind1
from . import Ind2Col2NonStrict as Ind2Col2
from . import Ind2NonStrict as Ind2
from . import (
    Trivial,
    logger,
    sample_data,
    sample_data_ind2_col2,
    sample_data_ind2_col2_pd_na,
    tmpfile,
)


class TestReadWrite:
    def test_feather_lz4(self):
        with tmpfile(".feather") as path:
            df = Ind2.convert(Ind2(sample_data()))
            df.to_feather(path, compression="lz4")
            df2 = Ind2.read_feather(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]

    def test_feather_zstd(self):
        with tmpfile(".feather") as path:
            df = Ind2.convert(Ind2(sample_data()))
            df.to_feather(path, compression="zstd")
            df2 = Ind2.read_feather(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]

    def test_csv_gz(self):
        with tmpfile(".csv.gz") as path:
            df = UntypedDf(sample_data())
            df.to_csv(path)
            df2 = UntypedDf.read_csv(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_untyped_read_write_csv(self):
        with tmpfile(".csv") as path:
            for indices in [None, "abc", ["abc", "xyz"]]:
                df = UntypedDf(sample_data())
                if indices is not None:
                    df = df.set_index(indices)
                df.to_csv(path)
                df2 = UntypedDf.read_csv(path)
                assert list(df2.index.names) == [None]
                assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_write_passing_index(self):
        with tmpfile(".csv") as path:
            df = Trivial(sample_data())
            df.to_csv(path, index=["abc"])  # fine
            df = UntypedDf(sample_data())
            df.to_csv(path, index=["abc"])  # calls super immediately

    def test_typed_read_write_csv_noindex(self):
        with tmpfile(".csv") as path:
            df = Trivial(sample_data())
            df.to_csv(path)
            df2 = Trivial.read_csv(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_typed_read_write_csv_singleindex(self):
        with tmpfile(".csv") as path:
            df = Ind1.convert(Ind1(sample_data()))
            df.to_csv(path)
            assert df.index_names() == ["abc"]
            assert df.column_names() == ["123", "xyz"]
            df2 = Ind1.read_csv(path)
            assert df2.index_names() == ["abc"]
            assert df2.column_names() == ["123", "xyz"]

    def test_typed_read_write_csv_multiindex(self):
        with tmpfile(".csv") as path:
            df = Ind2.convert(Ind2(sample_data()))
            df.to_csv(path)
            assert df.index_names() == ["abc", "xyz"]
            assert df.column_names() == ["123"]
            df2 = Ind2.read_csv(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]

    def test_parquet(self):
        with tmpfile(".parquet") as path:
            df = UntypedDf(sample_data())
            df.to_parquet(path)
            df2 = UntypedDf.read_parquet(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_records(self):
        df = UntypedDf(sample_data())
        records = df.to_records()
        df2 = UntypedDf.from_records(records)
        assert isinstance(df2, UntypedDf)

    def test_numeric_dtypes(self):
        dtypes = [
            bool,
            np.byte,
            np.ubyte,
            np.short,
            np.ushort,
            np.single,
            np.int32,
            np.intc,
            np.half,
            np.float16,
            np.double,
            np.float64,
            pd.StringDtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
        ]
        for suffix, fn in [
            (".parquet", "parquet"),
            (".feather", "feather"),
            (".xml", "xml"),
            (".csv", "csv"),
            (".tsv", "tsv"),
            (".json", "json"),
            (".xlsx", "xlsx"),
            (".xls", "xls"),
            (".xlsb", "xlsb"),
            (".ods", "ods"),
            (".pickle", "pickle"),
        ]:
            with tmpfile(suffix) as path:
                for dtype in dtypes:
                    try:
                        df = Ind2Col2.convert(Ind2Col2(sample_data_ind2_col2())).astype(dtype)
                        assert list(df.index.names) == ["qqq", "rrr"]
                        assert list(df.columns) == ["abc", "xyz"]
                        getattr(df, "to_" + fn)(path)
                        df2 = getattr(Ind2Col2, "read_" + fn)(path)
                        assert list(df2.index.names) == ["qqq", "rrr"]
                        assert list(df2.columns) == ["abc", "xyz"]
                    except Exception:
                        logger.error(f"Failed on path {path}, dtype {dtype}")
                        raise

    def test_numeric_nullable_dtypes(self):
        dtypes = [
            pd.StringDtype(),
            pd.BooleanDtype(),
            pd.Float64Dtype(),
            pd.Float32Dtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
            pd.StringDtype(),
        ]
        for suffix, fn in [
            (".parquet", "parquet"),
            (".feather", "feather"),
            (".csv", "csv"),
            (".tsv", "tsv"),
            (".json", "json"),
            (".xlsx", "xlsx"),
            (".xls", "xls"),
            (".xlsb", "xlsb"),
            (".ods", "ods"),
            (".pickle", "pickle"),
            (".xml", "xml"),
        ]:
            for dtype in dtypes:
                with tmpfile(suffix) as path:
                    try:
                        df = Ind2Col2.convert(Ind2Col2(sample_data_ind2_col2_pd_na())).astype(dtype)
                        assert list(df.index.names) == ["qqq", "rrr"]
                        assert list(df.columns) == ["abc", "xyz"]
                        getattr(df, "to_" + fn)(path)
                        df2 = getattr(Ind2Col2, "read_" + fn)(path)
                        assert list(df2.index.names) == ["qqq", "rrr"]
                        assert list(df2.columns) == ["abc", "xyz"]
                    except Exception:
                        logger.error(f"Failed on path {path}, dtype {dtype}")
                        raise
    """
207
    # TODO: waiting for upstream: https://github.com/dmyersturnbull/typed-dfs/issues/46
208
    def test_raw_to_xml(self):
209
        dtypes = [
210
            pd.StringDtype(),
211
            pd.BooleanDtype(),
212
            pd.Float64Dtype(),
213
            pd.Float32Dtype(),
214
            pd.Int64Dtype(),
215
            pd.UInt64Dtype(),
216
            pd.Int32Dtype(),
217
            pd.UInt32Dtype(),
218
            pd.Int16Dtype(),
219
            pd.UInt16Dtype(),
220
            pd.Int8Dtype(),
221
            pd.UInt8Dtype(),
222
            pd.StringDtype(),
223
        ]
224
        data = [
225
            pd.Series({"abc": 1, "xyz": pd.NA}),
226
            pd.Series({"abc": pd.NA, "xyz": 0}),
227
        ]
228
        failed = {}
229
        for dtype in dtypes:
230
            df = pd.DataFrame(data).astype(dtype)
231
            try:
232
                df.to_xml()
233
            except TypeError as e:
234
                logger.error(dtype, exc_info=True)
235
                failed[str(dtype)] = str(e)
236
        assert failed == [], f"Failed on dtypes: {failed}"
237
    """
238
239

    def test_xml(self):
        # note: this currently exercises the CSV round-trip through a gzipped .xml.gz path
        with tmpfile(".xml.gz") as path:
            df = UntypedDf(sample_data())
            df.to_csv(path)
            df2 = UntypedDf.read_csv(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_html_untyped(self):
        with tmpfile(".html") as path:
            df = UntypedDf(sample_data())
            df.to_html(path)
            df2 = UntypedDf.read_html(path)
            assert list(df2.index.names) == [None]
            assert set(df2.columns) == {"abc", "123", "xyz"}

    def test_html_singleindex(self):
        with tmpfile(".html") as path:
            df = Ind1.convert(Ind1(sample_data()))
            df.to_html(path)
            assert df.index_names() == ["abc"]
            assert df.column_names() == ["123", "xyz"]
            df2 = Ind1.read_html(path)
            assert df2.index_names() == ["abc"]
            assert df2.column_names() == ["123", "xyz"]

    def test_html_multiindex(self):
        with tmpfile(".html") as path:
            df = Ind2.convert(Ind2(sample_data()))
            df.to_html(path)
            assert df.index_names() == ["abc", "xyz"]
            assert df.column_names() == ["123"]
            df2 = Ind2.read_html(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]

    def test_html_invalid(self):
        with tmpfile(".html") as path:
            path.write_text("", encoding="utf-8")
            with pytest.raises(XMLSyntaxError):
                UntypedDf.read_html(path)

    def test_html_empty(self):
        with tmpfile(".html") as path:
            path.write_text("<html></html>", encoding="utf-8")
            with pytest.raises(NoValueError):
                UntypedDf.read_html(path)

    def test_read_toml(self):
        data = """
        [[row]]
        # a comment
        key = "value"
        """
        s = StringIO(data)
        df = UntypedDf.read_toml(s)
        assert df.column_names() == ["key"]
        assert df.values.tolist() == [["value"]]

    def test_read_toml_jagged(self):
        data = """
        [[row]]
        key = "value1"
        [[row]]
        key = "value2"
        kitten = "elephant"
        cuteness = 10.3
        """
        s = StringIO(data)
        df = UntypedDf.read_toml(s)
        assert df.column_names() == ["key", "kitten", "cuteness"]
        xx = df.fillna(0).values.tolist()
        assert xx == [["value1", 0, 0], ["value2", "elephant", 10.3]]

    def test_read_ini(self):
        data = """
        [section]
        ; a comment
        key = value
        """
        s = StringIO(data)
        df = UntypedDf.read_ini(s)
        assert df.column_names() == ["key", "value"]
        assert df.values.tolist() == [["section.key", "value"]]

    def test_read_properties(self):
        data = r"""
        [section]
        # a comment
        ! another comment
        k\:e\\y = v:a\\lue
        """
        s = StringIO(data)
        df = UntypedDf.read_properties(s)
        assert df.column_names() == ["key", "value"]
        assert df.values.tolist() == [[r"section.k:e\y", r"v:a\lue"]]
        data: str = df.to_properties()
        lines = [s.strip() for s in data.splitlines()]
        assert "[section]" in lines
        assert r"k\:e\\y = v:a\\lue" in lines
        s = StringIO(data)
        df2 = UntypedDf.read_properties(s)
        assert df2.values.tolist() == df.values.tolist()

    """
    # TODO: re-enable when we get tables 3.9 wheels on Windows
    def test_hdf(self):
        with tmpfile(".h5") as path:
            df = TypedMultiIndex.convert(TypedMultiIndex(sample_data()))
            df.to_hdf(path)
            df2 = TypedMultiIndex.read_hdf(path)
            assert df2.index_names() == ["abc", "xyz"]
            assert df2.column_names() == ["123"]
    """


if __name__ == "__main__":
    pytest.main()