Passed
Push — mpeta ( 1841cb...62640f )
by Konstantinos
03:46
created

test_dataset   A

Complexity

Total Complexity 4

Size/Duplication

Total Lines 62
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 37
dl 0
loc 62
rs 10
c 0
b 0
f 0
wmc 4
1
import pytest
2
3
4
@pytest.fixture
def assert_selected_variables_are(somagic):
    """Provide a callable that asserts which variables are present in the feature configuration.

    The returned callable collects the ``'variable'`` entry of every item found in
    ``somagic._data_manager.feature_manager.feature_configuration.variables`` and asserts that the resulting set
    is equal to the given ``variables`` set.
    """
    def _assert_selected_variables_are(variables: set):
        feature_configuration = somagic._data_manager.feature_manager.feature_configuration
        # Looked up on every call, so the assertion sees the configuration as it is at call time.
        selected_variables = {entry['variable'] for entry in feature_configuration.variables}
        assert selected_variables == variables
    return _assert_selected_variables_are
9
10
11
@pytest.fixture
def assert_column_values(test_dataset):
    """Assert that the set of some tabular data column's values is equal to the given set.

    The returned callable takes the column name (``attribute``) and an iterable of ``expected_values``; the column
    is read from ``test_dataset[0].datapoints.observations``.
    """
    import pandas as pd

    def _assert_column_values_are(attribute, expected_values):
        column = test_dataset[0].datapoints.observations[attribute]
        expected = set(expected_values)
        # Check the distinct values both through plain iteration and through pandas' own de-duplication.
        assert set(column) == expected
        assert set(pd.unique(column)) == expected
    return _assert_column_values_are
20
21
22
@pytest.fixture
def assert_correct_nominal_variable_encoding(test_dataset):
    """Test a column with each row having a string representing one of the possible values of an Attribute.

    Useful when an Attribute corresponds to a discrete Variable of type Nominal (ordering does not matter) and its
    observation (row) can have only one of the possible values.

    The returned callable takes the names of the encoded feature columns and asserts that, in every row of
    ``test_dataset[0].datapoints.observations``, exactly one of those columns holds 1 and all the others hold 0.
    """
    from collections import Counter

    def _assert_nominal_variable_encoded_as_expected(expected_feature_columns):
        # Unary plus drops zero (and negative) counts. Without it, a single expected column yields an explicit
        # ``0: 0`` entry, which compares unequal to a row's counter on Python < 3.10 (plain dict equality).
        expected_counts = +Counter({0: len(expected_feature_columns) - 1, 1: 1})
        encoded_columns = test_dataset[0].datapoints.observations[expected_feature_columns]
        assert all(Counter(datarow[column] for column in expected_feature_columns) == expected_counts
                   for _, datarow in encoded_columns.iterrows())
    return _assert_nominal_variable_encoded_as_expected
36
37
38
def test_sanity_checks_on_dataset(test_dataset, assert_selected_variables_are, assert_column_values,
                                  assert_correct_nominal_variable_encoding):
    """Run sanity checks on the dataset provided by the ``test_dataset`` fixture.

    As used below, ``test_dataset`` is indexed as follows: ``[0]`` is the dataset object, ``[1]`` holds the expected
    values of the 'type' column, ``[2]`` holds column names whose values are summed per row and ``[3]`` is the upper
    bound of that per-row sum.
    """
    dataset = test_dataset[0]
    expected_types = test_dataset[1]
    summed_columns = test_dataset[2]
    max_row_sum = test_dataset[3]
    type_feature_columns = [f'type_{x}' for x in expected_types]

    datapoints = dataset.datapoints
    assert_selected_variables_are({'type', 'flavors'})

    assert all(isinstance(x, str) for x in datapoints.observations['type'])

    assert_column_values('type', expected_values=expected_types)

    assert_correct_nominal_variable_encoding(type_feature_columns)

    # the below is expected because test_dataset invokes the 'one_hot_encoding_list_command' command which unfortunately
    # at the moment has a side effect on the attribute it operates on.
    # side effect: _data_manager.datapoints.observations[_attribute].fillna(value=np.nan, inplace=True)
    assert {type(x) for x in datapoints.observations['flavors']} == {list, float}

    assert len(summed_columns) > 5

    assert all(x in datapoints.observations.columns for x in summed_columns)
    # Each row's sum over these columns must lie within [0, max_row_sum].
    assert all(0 <= sum(datarow[column] for column in summed_columns) <= max_row_sum
               for _, datarow in datapoints.observations[list(summed_columns)].iterrows())

    assert hasattr(dataset, 'feature_vectors')
63