Sample   A
last analyzed

Complexity

Total Complexity 17

Size/Duplication

Total Lines 56
Duplicated Lines 0 %

Importance

Changes 4
Bugs 0 Features 1
Metric Value
c 4
b 0
f 1
dl 0
loc 56
rs 10
wmc 17

11 Methods

Rating   Name   Duplication   Size   Complexity  
A is_categorical() 0 2 1
A get() 0 2 1
A count_by_group_id() 0 2 4
A add() 0 2 1
A add_category() 0 5 1
A is_numerical() 0 2 1
A __init__() 0 2 1
A add_xy() 0 6 1
A size() 0 2 1
A split_by_group_id() 0 13 4
A add_numeric() 0 5 1
1
import math
2
3
from pysie.dsl.set import TernarySearchTrie
4
5
6
class Observation(object):
    """A single data point held by a sample.

    An observation may carry a numerical value ``x`` (optionally together
    with ``y``), a categorical ``label``, and a ``group_id`` tag. Every
    attribute reads as ``None`` until it is set.
    """

    # Class-level defaults: attributes not supplied to the constructor fall
    # back to these values.
    x = None
    y = None
    group_id = None
    label = None

    def __init__(self, x=None, label=None, group_id=None, y=None):
        """Create an observation.

        Only arguments that are not ``None`` are stored on the instance; the
        rest keep the class-level default.

        :param x: numerical value of the observation.
        :param label: categorical value of the observation.
        :param group_id: identifier of the group the observation belongs to.
        :param y: second numerical value, stored alongside ``x``.
        """
        if x is not None:
            self.x = x
        if y is not None:
            self.y = y

        if label is not None:
            self.label = label

        if group_id is not None:
            self.group_id = group_id

    def is_categorical(self):
        """Return True when the observation has a ``label``."""
        return self.label is not None

    def is_numerical(self):
        """Return True when the observation has an ``x`` value.

        The check is against ``None``, so ``x == 0`` still counts as numerical.
        """
        return self.x is not None

    def __repr__(self):
        return 'Observation(x=%r, label=%r, group_id=%r, y=%r)' % (
            self.x, self.label, self.group_id, self.y)
29
30
31
class Sample(object):
    """An ordered collection of :class:`Observation` objects."""

    # Replaced by a fresh list per instance in __init__.
    observations = None

    def __init__(self):
        """Create an empty sample."""
        self.observations = []

    def add(self, observation):
        """Append ``observation`` to the sample."""
        self.observations.append(observation)

    def add_numeric(self, x, group_id=None):
        """Append a numerical observation with value ``x``."""
        self.add(Observation(x=x, group_id=group_id))

    def add_category(self, label, group_id=None):
        """Append a categorical observation with the given ``label``."""
        self.add(Observation(label=label, group_id=group_id))

    def add_xy(self, x, y, group_id=None):
        """Append an observation carrying both ``x`` and ``y``."""
        self.add(Observation(x=x, y=y, group_id=group_id))

    def size(self):
        """Return the number of observations in the sample."""
        return len(self.observations)

    def get(self, index):
        """Return the observation stored at position ``index``."""
        return self.observations[index]

    def is_categorical(self):
        """Return whether the first observation is categorical.

        Only the first observation is inspected. An empty sample is reported
        as not categorical instead of raising ``IndexError``.
        """
        if not self.observations:
            return False
        return self.observations[0].is_categorical()

    def is_numerical(self):
        """Return whether the first observation is numerical.

        Only the first observation is inspected. An empty sample is reported
        as not numerical instead of raising ``IndexError``.
        """
        if not self.observations:
            return False
        return self.observations[0].is_numerical()

    def count_by_group_id(self, group_id):
        """Count the observations belonging to ``group_id``.

        When ``group_id`` is ``None`` every observation is counted.
        """
        if group_id is None:
            return len(self.observations)
        return sum(1 for ob in self.observations if ob.group_id == group_id)

    def split_by_group_id(self):
        """Partition the observations into one sample per group id.

        Observations whose ``group_id`` is ``None`` are skipped. Observation
        order is preserved inside each group.

        :return: a ``TernarySearchTrie`` in which each group id is stored with
            the :class:`Sample` holding that group's observations.
        """
        # NOTE(review): the trie presumably requires string keys -- confirm
        # against pysie.dsl.set before using non-string group ids.
        result = TernarySearchTrie()
        for ob in self.observations:
            group_id = ob.group_id
            if group_id is None:
                continue
            if result.contains_key(group_id):
                result.get(group_id).observations.append(ob)
            else:
                group_sample = Sample()
                group_sample.observations.append(ob)
                result.put(group_id, group_sample)
        return result
87
88
89
class SampleDistribution(object):
    """Summary statistics of a sample, optionally restricted to one group.

    The statistics are either supplied directly to the constructor (``mean``,
    ``sd``, ``sample_size``, ``proportion``) or computed from a sample by
    :meth:`build`.
    """

    sample = None
    group_id = None

    categorical_value = None
    is_categorical = False
    is_numerical = False

    sd = None
    sample_size = None
    mean = None
    variance = None
    sum_of_squares = None

    proportion = None

    def __init__(self, sample=None, group_id=None, categorical_value=None, mean=None, sd=None, sample_size=None,
                 proportion=None):
        """Create a distribution from given statistics and/or a sample.

        :param sample: sample to compute the statistics from; processed last,
            so values computed from it overwrite the ones passed explicitly.
        :param group_id: when not ``None``, only observations of this group
            are taken from ``sample``.
        :param categorical_value: label whose proportion is tracked for a
            categorical sample.
        :param mean: known mean; marks the distribution as numerical.
        :param sd: known standard deviation.
        :param sample_size: known number of observations.
        :param proportion: known proportion; marks the distribution as
            categorical.
        """
        if group_id is not None:
            self.group_id = group_id

        self.track_categorical(categorical_value, proportion)
        self.track_numerical(mean, sd)

        if sample_size is not None:
            self.sample_size = sample_size

        # Derive variance and sum of squares when sd and size are both known.
        if self.sd is not None and self.sample_size is not None:
            self.variance = self.sd * self.sd
            self.sum_of_squares = self.variance * (self.sample_size - 1)

        if sample is not None:
            self.build(sample)

    def track_categorical(self, categorical_value, proportion):
        """Store the categorical parameters that are not ``None``.

        Supplying ``proportion`` flags the distribution as categorical.
        """
        if categorical_value is not None:
            self.categorical_value = categorical_value

        if proportion is not None:
            self.proportion = proportion
            self.is_categorical = True

    def track_numerical(self, mean, sd):
        """Store the numerical parameters that are not ``None``.

        Supplying ``mean`` flags the distribution as numerical.
        """
        if mean is not None:
            self.mean = mean
            self.is_numerical = True

        if sd is not None:
            self.sd = sd

    def build(self, sample):
        """Compute the statistics from ``sample`` (restricted to ``group_id``).

        For a numerical sample this sets ``mean``, ``sum_of_squares``,
        ``sample_size``, ``variance`` (sum of squares over ``n - 1``) and
        ``sd``. With fewer than two selected observations no spread can be
        estimated, so ``variance`` and ``sd`` are set to ``0.0`` instead of
        dividing by zero.

        For a categorical sample with a ``categorical_value`` this sets
        ``proportion``, ``sample_size``, ``mean`` (``p * n``) and ``variance``
        (``p * (1 - p) * n``); ``sd`` is left untouched.
        """
        self.sample = sample
        if sample.is_numerical():
            self.mean = SampleDistribution.calculate_mean(sample, self.group_id)
            self.sum_of_squares = SampleDistribution.calculate_sum_of_squares(sample, self.mean, self.group_id)
            self.sample_size = sample.count_by_group_id(self.group_id)
            degrees_of_freedom = self.sample_size - 1
            if degrees_of_freedom > 0:
                # float() keeps the division exact under Python 2 as well.
                self.variance = float(self.sum_of_squares) / degrees_of_freedom
            else:
                self.variance = 0.0
            self.sd = math.sqrt(self.variance)
            self.is_numerical = True
        elif sample.is_categorical() and self.categorical_value is not None:
            self.proportion = SampleDistribution.calculate_proportion(sample, self.categorical_value, self.group_id)
            self.sample_size = sample.count_by_group_id(self.group_id)
            self.mean = self.proportion * self.sample_size
            self.variance = self.proportion * (1.0 - self.proportion) * self.sample_size
            self.is_categorical = True

    @staticmethod
    def _observations_in_group(sample, group_id):
        """Yield the observations of ``sample`` that belong to ``group_id``.

        Every observation is yielded when ``group_id`` is ``None``.
        """
        for i in range(sample.size()):
            observation = sample.get(i)
            if group_id is None or observation.group_id == group_id:
                yield observation

    @staticmethod
    def calculate_mean(sample, group_id):
        """Return the mean of ``x`` over the selected observations.

        Returns ``0.0`` when no observation is selected, matching
        :meth:`calculate_proportion`. The result is always a float.
        """
        count = 0
        the_sum = 0
        for observation in SampleDistribution._observations_in_group(sample, group_id):
            the_sum += observation.x
            count += 1
        if count == 0:
            return 0.0
        return float(the_sum) / count

    @staticmethod
    def calculate_sum_of_squares(sample, mean, group_id):
        """Return the sum of squared deviations of ``x`` from ``mean``."""
        the_sum = 0
        for observation in SampleDistribution._observations_in_group(sample, group_id):
            deviation = observation.x - mean
            the_sum += deviation * deviation
        return the_sum

    @staticmethod
    def calculate_proportion(sample, categorical_value, group_id):
        """Return the fraction of selected observations labelled ``categorical_value``.

        Returns ``0.0`` when no observation is selected.
        """
        matches = 0
        total = 0
        for observation in SampleDistribution._observations_in_group(sample, group_id):
            total += 1
            if observation.label == categorical_value:
                matches += 1
        if total == 0:
            return 0.0
        return float(matches) / total
191