Passed
Push — master ( 96da92...a1b572 )
by Konstantinos
37s queued 14s
created

conftest.test_json_data()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
import os
2
import pytest
3
4
5
# Directory containing this conftest module, computed from the real (symlink-resolved) path of __file__.
my_dir = os.path.dirname(os.path.realpath(__file__))
6
7
####### Files and folders
8
@pytest.fixture
def tests_root_dir():
    """Return the directory that holds this conftest module."""
    root_dir = my_dir
    return root_dir
11
12
@pytest.fixture
def tests_data_root(tests_root_dir):
    """Return the path of the 'dts' directory located directly under the tests root directory."""
    data_dir_name = 'dts'
    return os.path.join(tests_root_dir, data_dir_name)
15
16
# Test data
17
@pytest.fixture
def sample_json(tests_data_root):
    """Return the path of the 'sample-data.jsonlines' file inside the tests data directory."""
    file_name = 'sample-data.jsonlines'
    return os.path.join(tests_data_root, file_name)
20
21
@pytest.fixture
def sample_collaped_json(tests_data_root):
    """Return the path of the 'sample-data-collapsed.jsonlines' file inside the tests data directory."""
    file_name = 'sample-data-collapsed.jsonlines'
    return os.path.join(tests_data_root, file_name)
24
25
26
@pytest.fixture()
def test_json_data(sample_json):
    """Return a dict describing the sample json lines file.

    The dict carries the file path ('file_path'), a line count of 100
    ('nb_lines') and a set of attribute names ('attributes').
    """
    attribute_names = {
        '_id',
        'description',
        'effects',
        'flavors',
        'grow_info',
        'image_paths',
        'image_urls',
        'medical',
        'name',
        'negatives',
        'parents',
        'type',
    }
    json_data = {}
    json_data['file_path'] = sample_json
    json_data['nb_lines'] = 100
    json_data['attributes'] = attribute_names
    return json_data
33
34
35
@pytest.fixture
def somagic():
    """Return the object produced by calling ``so_magic.init_so_magic()``."""
    # the import stays inside the fixture, so it only happens when the fixture is requested
    from so_magic import init_so_magic
    return init_so_magic()
40
41
42
@pytest.fixture
def data_manager():
    """Return a factory callable; each call runs ``init_data_manager`` on an engine of type 'pd'."""
    def build_data_manager():
        # imports are deferred until the factory is actually invoked
        from so_magic.data import init_data_manager
        from so_magic.data.backend import init_engine
        pd_engine = init_engine(engine_type='pd')
        return init_data_manager(pd_engine)
    return build_data_manager
50
51
52
@pytest.fixture
def read_observations():
    """Return a callable that executes the 'observations' command on a json lines formatted file."""
    def run_observations_command(so_master, json_lines_formatted_file_path):
        """Execute the observations command of the given master object on a data file.

        Args:
            so_master (so_magic.so_master.SoMaster): an instance of SoMaster
            json_lines_formatted_file_path (str): path to a json lines formatted file with the observations data
        """
        # fetch the command once, then configure and run that same object
        observations_command = so_master.command.observations_command
        observations_command.args = [json_lines_formatted_file_path]
        observations_command.execute()
    return run_observations_command
66
67
68
@pytest.fixture
def test_datapoints(read_observations, sample_collaped_json, somagic):
    """Load the collapsed json lines test file through `somagic` and return its `datapoints` attribute."""
    read_observations(somagic, sample_collaped_json)
    datapoints = somagic.datapoints
    return datapoints
73
74
75
@pytest.fixture
def test_dataset(somagic, read_observations, sample_collaped_json):
    """Build a dataset whose feature vectors encode the 'type' and 'flavors' variables.

    Reads the observations from the collapsed json lines test file, then runs
    the select-variables, one-hot-encoding ('type') and one-hot-encoding-list
    ('flavors') commands. The resulting encoded columns are stored as a numpy
    array in ``somagic.dataset.feature_vectors``.

    Returns:
        tuple: ``(somagic.dataset, type_values, unique_flavors,
        max_flavors_per_datapoint, nb_columns_before)`` where
        ``unique_flavors`` is the set of all flavors found in the observations,
        ``max_flavors_per_datapoint`` is the length of the longest 'flavors'
        list and ``nb_columns_before`` is the number of columns counted right
        before the 'flavors' variable got encoded.
    """
    read_observations(somagic, sample_collaped_json)

    type_values = ['hybrid', 'indica', 'sativa']
    type_columns = [f'type_{x}' for x in type_values]

    # Union of all flavor lists. ``set().union(*lists)`` always yields a set,
    # even when there is a single list (a reduce without initializer would
    # return that raw list) or no list at all (reduce would raise TypeError).
    flavor_lists = [flavors for flavors in somagic._data_manager.datapoints.observations['flavors']
                    if flavors is not None]
    unique_flavors = set().union(*flavor_lists)

    cmd = somagic._data_manager.command.select_variables_command
    # current limitations (they hold for both the 'type' and the 'flavors' variable):
    # 1. client code has to know the number of distinct values of the nominal variable
    # 2. client code has to provide the column names that will result after encoding the variable
    cmd.args = [[
        {'variable': 'type', 'columns': type_columns},
        {'variable': 'flavors', 'columns': list(unique_flavors)},
    ]]
    cmd.execute()

    cmd = somagic._data_manager.command.one_hot_encoding_command
    cmd.args = [somagic._data_manager.datapoints, 'type']
    cmd.execute()

    # every 'flavors' value must be either a list or None at this point
    assert {type(x) for x in somagic._data_manager.datapoints.observations['flavors']} == {list, type(None)}

    nb_columns_before = len(somagic._data_manager.datapoints.observations.columns)

    cmd = somagic._data_manager.command.one_hot_encoding_list_command
    cmd.args = [somagic._data_manager.datapoints, 'flavors']
    cmd.execute()

    # encoding 'flavors' must have added exactly one column per unique flavor
    assert nb_columns_before + len(unique_flavors) == len(somagic._data_manager.datapoints.observations.columns)

    import numpy as np
    somagic.dataset.feature_vectors = np.array(
        somagic._data_manager.datapoints.observations[type_columns + list(unique_flavors)])

    max_flavors_per_datapoint = max(
        len(flavors) for flavors in somagic._data_manager.datapoints.observations['flavors']
        if type(flavors) is list)
    return somagic.dataset, type_values, unique_flavors, max_flavors_per_datapoint, nb_columns_before
122
123
124
@pytest.fixture
def built_in_backends():
    """Return the result of calling ``magic_backends()`` from the pandas ``df_backend`` module."""
    from so_magic.data.backend.panda_handling.df_backend import magic_backends
    return magic_backends()
129
130
131
@pytest.fixture
def tabular_operators(built_in_backends):
    """Describe the 'pd' tabular operators: their classes and their method signatures.

    Returns:
        dict: with the keys
            'operators': maps each operator interface name ('retriever',
                'iterator', 'mutator') to a dict holding the 'class' registered
                under 'pd' and an 'interface' dict (method name -> signature string)
            'reverse_dict': maps each operator class back to its interface name
            'get_nb_args': callable ``(operator_interface_name, method_name)``
                returning the number of comma-separated arguments in the
                signature string, with a trailing ', **kwargs' not counted
            'required_methods': one-shot iterator over
                ``(operator interface name, method names)`` pairs
    """
    def pd_class(interface_name):
        """Look up the class registered under 'pd' for the given operator interface."""
        return built_in_backends.backend_interfaces[interface_name]['class_registry'].subclasses['pd']

    retriever_interface = {
        'column': '(identifier, data)',
        'row': '(identifier, data)',
        'nb_columns': '(data)',
        'nb_rows': '(data)',
        'get_numerical_attributes': '(data)',
    }
    iterator_interface = {
        'columnnames': '(data)',
        'itercolumns': '(data)',
        'iterrows': '(data)',
    }
    mutator_interface = {
        'add_column': '(datapoints, values, new_attribute, **kwargs)',
    }
    operators = {
        'retriever': {'class': pd_class('retriever'), 'interface': retriever_interface},
        'iterator': {'class': pd_class('iterator'), 'interface': iterator_interface},
        'mutator': {'class': pd_class('mutator'), 'interface': mutator_interface},
    }

    def get_nb_args(operator_interface_name, method_name):
        """Count the arguments listed in the signature string of the given method."""
        signature = operators[operator_interface_name]['interface'][method_name]
        return len(signature.replace(', **kwargs', '').split(','))

    return {
        'operators': operators,
        'reverse_dict': {spec['class']: name for name, spec in operators.items()},
        'get_nb_args': get_nb_args,
        # operator_name_2_required_methods
        'required_methods': ((name, spec['interface'].keys()) for name, spec in operators.items()),
    }
167
168
169
@pytest.fixture
def assert_different_objects():
    """Return a callable asserting that the given objects all have distinct identities (``id``)."""
    def _assert_different_objects(objects):
        distinct_ids = {id(obj) for obj in objects}
        assert len(distinct_ids) == len(objects)
    return _assert_different_objects
174