Passed
Pull Request — dev (#32) by Konstantinos
03:05 (queued 01:33) · created

conftest.test_dataset()   A

Complexity:  Conditions 2
Size:        Total Lines 47 · Code Lines 28
Duplication: Lines 0 · Ratio 0 %
Importance:  Changes 0
Metric   Value
cc       2
eloc     28
nop      3
dl       0
loc      47
rs       9.208
c        0
b        0
f        0
import os
import pytest


my_dir = os.path.dirname(os.path.realpath(__file__))

####### Files and folders
@pytest.fixture
def tests_root_dir():
    return my_dir

@pytest.fixture
def tests_data_root(tests_root_dir):
    return os.path.join(tests_root_dir, 'dts')

# Test data
@pytest.fixture
def sample_json(tests_data_root):
    return os.path.join(tests_data_root, 'sample-data.jsonlines')

@pytest.fixture
def sample_collaped_json(tests_data_root):
    return os.path.join(tests_data_root, 'sample-data-collapsed.jsonlines')


@pytest.fixture()
def test_json_data(sample_json):
    return {
        'file_path': sample_json,
        'nb_lines': 100,
        'attributes': {'flavors', 'name', 'medical', 'description', 'image_urls', 'parents', 'negatives', 'grow_info', '_id', 'type', 'image_paths', 'effects'},
    }

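# Illustrative sketch (hypothetical; not part of this conftest): a test module could
# cross-check the sample file against the expectations bundled in test_json_data.
# The test name and the record-parsing loop below are assumptions for illustration.
#
# def test_sample_file_matches_expectations(test_json_data):
#     import json
#     with open(test_json_data['file_path']) as file_handle:
#         records = [json.loads(line) for line in file_handle if line.strip()]
#     assert len(records) == test_json_data['nb_lines']
#     assert set(records[0].keys()) <= test_json_data['attributes']
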
@pytest.fixture
def somagic():
    from so_magic import init_so_magic
    _ = init_so_magic()
    return _


@pytest.fixture
def data_manager():
    def getter():
        from so_magic.data import init_data_manager
        from so_magic.data.backend import init_engine

        data_manager = init_data_manager(init_engine(engine_type='pd'))

        datapoints_fact = data_manager.engine.backend.datapoints_factory
        cmd_fact = data_manager.engine.backend.command_factory

        # test 1: the factories are of the expected types and the observer wiring is correct
        from so_magic.data.datapoints.datapoints import DatapointsFactory
        from so_magic.data.backend.engine_command_factory import MagicCommandFactory

        assert isinstance(datapoints_fact, DatapointsFactory)
        assert isinstance(cmd_fact, MagicCommandFactory)

        subjects = [datapoints_fact.subject, cmd_fact.subject, data_manager.phi_class.subject]
        assert len(set([id(x._observers) for x in subjects])) == len(subjects)

        assert datapoints_fact.subject._observers[0] == data_manager.engine.datapoints_manager
        assert cmd_fact.subject._observers[0] == data_manager.commands_manager.command.accumulator
        assert id(data_manager.phi_class.subject._observers[0]) == id(data_manager.built_phis)
        assert data_manager.phi_class.subject._observers[0] == data_manager.built_phis

        print(f"DTP FCT OBS: [{', '.join(str(_) for _ in datapoints_fact.subject._observers)}]")
        print(f"CMD FCT OBS: [{', '.join(str(_) for _ in cmd_fact.subject._observers)}]")
        print(f"PHIFUNC class OBS: [{', '.join(str(_) for _ in data_manager.phi_class.subject._observers)}]")
        assert all([len(x._observers) == 1 for x in subjects])
        return data_manager

    return getter

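# Illustrative sketch (hypothetical; not part of this conftest): a test would call the
# returned getter to build a fresh, pre-verified data manager. Only attributes already
# exercised inside the fixture above are used here.
#
# def test_fresh_data_manager(data_manager):
#     manager = data_manager()
#     from so_magic.data.datapoints.datapoints import DatapointsFactory
#     assert isinstance(manager.engine.backend.datapoints_factory, DatapointsFactory)
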
@pytest.fixture
def read_observations():
    """Read a json lines formatted file and create the observations object (see Datapoints class)."""
    def load_data(so_master, json_lines_formatted_file_path):
        """Create the observations object for a Datapoints instance, given a data file.

        Args:
            so_master (so_magic.so_master.SoMaster): an instance of SoMaster
            json_lines_formatted_file_path (str): path to a json lines formatted file with the observations data
        """
        cmd = so_master.command.observations_command
        cmd.args = [json_lines_formatted_file_path]
        cmd.execute()
    return load_data


@pytest.fixture
def test_datapoints(read_observations, sample_collaped_json, somagic):
    """Read the designated json lines 'test file' (which contains the 'test observations') as a Datapoints instance."""
    read_observations(somagic, sample_collaped_json)
    return somagic.datapoints

@pytest.fixture
def test_dataset(somagic, read_observations, sample_collaped_json):
    """Dataset ready to be fed into a training/inference algorithm; feature vectors have been computed."""
    read_observations(somagic, sample_collaped_json)

    type_values = ['hybrid', 'indica', 'sativa']
    ATTRS2 = [f'type_{x}' for x in type_values]
    from functools import reduce
    UNIQUE_FLAVORS = reduce(lambda i, j: set(i).union(set(j)),
                            [_ for _ in somagic._data_manager.datapoints.observations['flavors'] if _ is not None])

    cmd = somagic._data_manager.command.select_variables_command
    cmd.args = [[
        # current limitations:
        # 1. client code has to know the number of distinct values for the nominal variable 'type'
        # 2. client code has to provide the column names that will result after encoding the 'type' variable
        {'variable': 'type', 'columns': ATTRS2},
        # current limitations:
        # 1. client code has to know the number of distinct values for the nominal variable 'flavors'
        # 2. client code has to provide the column names that will result after encoding the 'flavors' variable
        {'variable': 'flavors', 'columns': list(UNIQUE_FLAVORS)}]]
    cmd.execute()

    cmd = somagic._data_manager.command.one_hot_encoding_command
    cmd.args = [somagic._data_manager.datapoints, 'type']
    cmd.execute()

    assert set([type(x) for x in somagic._data_manager.datapoints.observations['flavors']]) == {list, type(None)}

    nb_columns_before = len(somagic._data_manager.datapoints.observations.columns)

    cmd = somagic._data_manager.command.one_hot_encoding_list_command
    cmd.args = [somagic._data_manager.datapoints, 'flavors']
    cmd.execute()

    assert nb_columns_before + len(UNIQUE_FLAVORS) == len(somagic._data_manager.datapoints.observations.columns)

    import numpy as np
    setattr(somagic.dataset, 'feature_vectors',
            np.array(somagic._data_manager.datapoints.observations[ATTRS2 + list(UNIQUE_FLAVORS)]))

    MAX_FLAVORS_PER_DAATPOINT = max(
        [len(x) for x in [_ for _ in somagic._data_manager.datapoints.observations['flavors'] if type(_) is list]])
    return somagic.dataset, type_values, UNIQUE_FLAVORS, MAX_FLAVORS_PER_DAATPOINT, nb_columns_before

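# Illustrative sketch (hypothetical; not part of this conftest): a test consuming the
# tuple returned by test_dataset. The expected feature-vector width follows from the
# columns selected above: the 3 one-hot 'type' columns plus one column per unique flavor.
#
# def test_feature_vectors_shape(test_dataset):
#     dataset, type_values, unique_flavors, max_flavors, nb_columns_before = test_dataset
#     assert dataset.feature_vectors.shape[1] == len(type_values) + len(unique_flavors)
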
@pytest.fixture
def built_in_backends():
    from so_magic.data.backend.panda_handling.df_backend import magic_backends
    engine_backends = magic_backends()
    return engine_backends


@pytest.fixture
def tabular_operators(built_in_backends):
    operators = {
        'retriever': {
            'class': built_in_backends.backend_interfaces['retriever']['class_registry'].subclasses['pd'],
            'interface': {
                'column': '(identifier, data)',
                'row': '(identifier, data)',
                'nb_columns': '(data)',
                'nb_rows': '(data)',
                'get_numerical_attributes': '(data)',
            }
        },
        'iterator': {
            'class': built_in_backends.backend_interfaces['iterator']['class_registry'].subclasses['pd'],
            'interface': {
                'columnnames': '(data)',
                'itercolumns': '(data)',
                'iterrows': '(data)',
            },
        },
        'mutator': {
            'class': built_in_backends.backend_interfaces['mutator']['class_registry'].subclasses['pd'],
            'interface': {
                'add_column': '(datapoints, values, new_attribute, **kwargs)',
            },
        },
    }

    return {
        'operators': operators,
        'reverse_dict': {operator_dict['class']: key for key, operator_dict in operators.items()},
        'get_nb_args': lambda operator_interface_name, method_name: len(operators[operator_interface_name]['interface'][method_name].replace(', **kwargs', '').split(',')),
        # pairs of (operator interface name, names of the methods that interface requires)
        'required_methods': iter(((operator_interface_name, v['interface'].keys())
                                  for operator_interface_name, v in operators.items()))
    }

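# Illustrative sketch (hypothetical; not part of this conftest): a test that walks the
# declared interfaces and verifies each pandas-backed operator class provides them.
#
# def test_backends_implement_declared_interfaces(tabular_operators):
#     for interface_name, method_names in tabular_operators['required_methods']:
#         operator_class = tabular_operators['operators'][interface_name]['class']
#         for method_name in method_names:
#             assert hasattr(operator_class, method_name)
#             assert tabular_operators['get_nb_args'](interface_name, method_name) >= 1
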
@pytest.fixture
def assert_different_objects():
    def _assert_different_objects(objects):
        assert len(set([id(x) for x in objects])) == len(objects)
    return _assert_different_objects
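
# Illustrative sketch (hypothetical; not part of this conftest): the helper passes when
# every element is a distinct object and raises AssertionError when any object repeats.
#
# def test_assert_different_objects(assert_different_objects):
#     assert_different_objects([object(), object()])   # two distinct instances: passes
#     import pytest
#     with pytest.raises(AssertionError):
#         shared = object()
#         assert_different_objects([shared, shared])    # same instance twice: fails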