Passed
Push — mpeta ( 1841cb...62640f )
by Konstantinos
03:46
created

conftest.test_dataset()   A

Complexity

Conditions 2

Size

Total Lines 47
Code Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 28
nop 3
dl 0
loc 47
rs 9.208
c 0
b 0
f 0
1
from glob import glob
2
import pytest
3
4
5
def file_path_to_module_path(string: str) -> str:
    """Convert the path of a Python file into a dotted module path.

    Only a trailing '.py' extension is removed, so path components that merely
    start with 'py' (eg 'tests/fixtures/pytest_helpers.py') are kept intact.
    Both forward and backward slashes are treated as path separators.

    Args:
        string (str): file path, such as 'tests/fixtures/foo.py'

    Returns:
        str: the dotted module path, such as 'tests.fixtures.foo'
    """
    extension = ".py"
    if string.endswith(extension):
        # strip the extension before the separators turn into dots, so that
        # a '.py' appearing in the middle of the dotted path is never touched
        string = string[:-len(extension)]
    return string.replace("/", ".").replace("\\", ".")
7
8
9
# Register every python file found under tests/fixtures as a pytest plugin;
# paths containing a double underscore (such as __init__.py) are skipped.
pytest_plugins = list(
    map(
        file_path_to_module_path,
        (path for path in glob("tests/fixtures/*.py") if "__" not in path),
    )
)
12
13
14
@pytest.fixture
def somagic():
    """Provide the object returned by a call to so_magic.init_so_magic."""
    from so_magic import init_so_magic
    return init_so_magic()
19
20
21
@pytest.fixture
def data_manager():
    """Provide a factory that creates a data manager on top of an engine of type 'pd'."""
    def build_data_manager():
        """Initialize an engine of type 'pd' and return a data manager created from it."""
        from so_magic.data import init_data_manager
        from so_magic.data.backend import init_engine
        engine = init_engine(engine_type='pd')
        return init_data_manager(engine)
    return build_data_manager
29
30
31
@pytest.fixture
def read_observations():
    """Provide a callable that reads a json lines formatted file through the 'observations' command."""
    def load_data(so_master, json_lines_formatted_file_path):
        """Execute the 'observations' command of the given SoMaster on a data file.

        Args:
            so_master (so_magic.so_master.SoMaster): an instance of SoMaster
            json_lines_formatted_file_path (str): path to a json lines formatted file with the observations data
        """
        observations_command = so_master.command.observations_command
        # the command receives the file path as its single argument
        observations_command.args = [json_lines_formatted_file_path]
        observations_command.execute()
    return load_data
45
46
47
@pytest.fixture
def test_datapoints(read_observations, sample_collaped_json, somagic):
    """Load the json lines 'test file' (holding the 'test observations') and provide somagic's datapoints."""
    load_observations = read_observations
    load_observations(somagic, sample_collaped_json)
    datapoints = somagic.datapoints
    return datapoints
52
53
54
@pytest.fixture
def test_dataset(somagic, read_observations, sample_collaped_json):
    """Dataset ready to be fed into a training/inference algorithm; feature vectors have been computed.

    Reads the test observations, selects and one-hot encodes the 'type' and 'flavors' variables and stores
    the encoded columns, as a numpy array, in the 'feature_vectors' attribute of the dataset.

    Returns:
        tuple: the dataset, the list of 'type' values, the set of unique flavors, the maximum number of
            flavors found on a single datapoint and the number of columns before encoding 'flavors'
    """
    import numpy as np

    read_observations(somagic, sample_collaped_json)

    type_values = ['hybrid', 'indica', 'sativa']
    type_columns = [f'type_{x}' for x in type_values]

    # Union of the flavors of all datapoints that have any. Starting from an empty set guarantees a set
    # (without duplicates) even when only one datapoint, or none at all, has flavors.
    unique_flavors = set().union(
        *(flavors for flavors in somagic._data_manager.datapoints.observations['flavors'] if flavors is not None))
    # fix the order of the flavor columns once, so that every use below agrees on it
    flavor_columns = list(unique_flavors)

    # current limitations:
    # 1. client code has to know the number of distinct values of the nominal variables 'type' and 'flavors'
    # 2. client code has to provide the column names that will result after encoding each of these variables
    cmd = somagic._data_manager.command.select_variables_command
    cmd.args = [[
        {'variable': 'type', 'columns': type_columns},
        # a copy is passed, so the command cannot alter the column order used below
        {'variable': 'flavors', 'columns': list(flavor_columns)},
    ]]
    cmd.execute()

    cmd = somagic._data_manager.command.one_hot_encoding_command
    cmd.args = [somagic._data_manager.datapoints, 'type']
    cmd.execute()

    # sanity check: each 'flavors' value is either a list or None and both kinds are present
    assert {type(x) for x in somagic._data_manager.datapoints.observations['flavors']} == {list, type(None)}

    nb_columns_before = len(somagic._data_manager.datapoints.observations.columns)

    cmd = somagic._data_manager.command.one_hot_encoding_list_command
    cmd.args = [somagic._data_manager.datapoints, 'flavors']
    cmd.execute()

    # encoding 'flavors' is expected to add exactly one column per unique flavor
    assert nb_columns_before + len(unique_flavors) == len(somagic._data_manager.datapoints.observations.columns)

    somagic.dataset.feature_vectors = np.array(
        somagic._data_manager.datapoints.observations[type_columns + flavor_columns])

    max_flavors_per_datapoint = max(
        len(x) for x in somagic._data_manager.datapoints.observations['flavors'] if isinstance(x, list))
    return somagic.dataset, type_values, unique_flavors, max_flavors_per_datapoint, nb_columns_before
114
115
116
@pytest.fixture
def built_in_backends():
    """Provide the object returned by the magic_backends function of the pandas 'df_backend' module."""
    from so_magic.data.backend.panda_handling.df_backend import magic_backends
    return magic_backends()
121
122
123
@pytest.fixture
def tabular_operators(built_in_backends):
    """Describe the 'pd' implementations of the 'retriever', 'iterator' and 'mutator' operator interfaces.

    Returns:
        dict: with keys
            'operators': maps each interface name to its 'class' (looked up under 'pd' in the interface's
                class registry) and its 'interface' (method name -> signature string)
            'reverse_dict': maps each operator class back to its interface name
            'get_nb_args': callable taking (operator_interface_name, method_name) and returning the number
                of arguments listed in the method's signature string, not counting '**kwargs'
            'required_methods': iterator over (interface name, method names) pairs
    """
    def pd_class(interface_name):
        """Return the class stored under 'pd' in the class registry of the given interface."""
        return built_in_backends.backend_interfaces[interface_name]['class_registry'].subclasses['pd']

    # method name -> signature string, per operator interface
    signatures = {
        'retriever': {
            'column': '(identifier, data)',
            'row': '(identifier, data)',
            'nb_columns': '(data)',
            'nb_rows': '(data)',
            'get_numerical_attributes': '(data)',
        },
        'iterator': {
            'columnnames': '(data)',
            'itercolumns': '(data)',
            'iterrows': '(data)',
        },
        'mutator': {
            'add_column': '(datapoints, values, new_attribute, **kwargs)',
        },
    }
    operators = {
        name: {'class': pd_class(name), 'interface': interface}
        for name, interface in signatures.items()
    }

    def count_arguments(operator_interface_name, method_name):
        """Count the comma separated arguments of a signature string, ignoring a trailing '**kwargs'."""
        signature = operators[operator_interface_name]['interface'][method_name]
        return len(signature.replace(', **kwargs', '').split(','))

    class_to_interface_name = {spec['class']: name for name, spec in operators.items()}
    # operator_name_2_required_methods
    required_methods = ((name, spec['interface'].keys()) for name, spec in operators.items())

    return {
        'operators': operators,
        'reverse_dict': class_to_interface_name,
        'get_nb_args': count_arguments,
        'required_methods': required_methods,
    }
159
160
161
@pytest.fixture
def assert_different_objects():
    """Provide a checker asserting that no object occurs twice (by identity) in a collection."""
    def _assert_different_objects(objects):
        """Assert that the number of distinct object ids equals the number of given objects."""
        distinct_ids = {id(obj) for obj in objects}
        assert len(distinct_ids) == len(objects)
    return _assert_different_objects
166