Completed
Push — master ( a9e9bf...00ce36 )
by Xianshun
59s
created

SampleDistribution.build()   A

Complexity

Conditions 4

Size

Total Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 4
c 1
b 0
f 0
dl 0
loc 15
rs 9.2
1
import math
2
3
4
class Observation(object):
    """A single data point of a sample.

    An observation carries a numeric value ``x`` (optionally paired with
    ``y``), a categorical ``label``, or both, plus an optional ``group_id``.
    Fields that were not supplied keep the class-level default ``None``.
    """

    # Class-level defaults: an instance only shadows these when a non-None
    # value is passed to the constructor or assigned afterwards.
    x = None
    y = None
    group_id = None
    label = None

    def __init__(self, x=None, label=None, group_id=None, y=None):
        """Store every argument that is not ``None`` on the instance.

        :param x: numeric value of the observation.
        :param label: categorical value of the observation.
        :param group_id: identifier of the group the observation belongs to.
        :param y: second numeric value, stored next to ``x``.
        """
        if x is not None:
            self.x = x
        if y is not None:
            self.y = y

        if label is not None:
            self.label = label

        if group_id is not None:
            self.group_id = group_id

    def __repr__(self):
        return "Observation(x=%r, label=%r, group_id=%r, y=%r)" % (
            self.x, self.label, self.group_id, self.y)

    def is_categorical(self):
        """Return True when the observation has a ``label``."""
        return self.label is not None

    def is_numerical(self):
        """Return True when the observation has an ``x`` value (0 counts)."""
        return self.x is not None
27
28
29
class Sample(object):
    """An ordered collection of observations.

    Whether the sample counts as numerical or categorical is decided by its
    first observation only.
    """

    observations = None

    def __init__(self):
        """Create an empty sample."""
        self.observations = []

    def add(self, observation):
        """Append an already-built observation to the sample."""
        self.observations.append(observation)

    def add_numeric(self, numeric, group_id=None):
        """Append an observation whose ``x`` is ``numeric``."""
        self.add(Observation(x=numeric, group_id=group_id))

    def add_category(self, category, group_id=None):
        """Append an observation whose ``label`` is ``category``."""
        self.add(Observation(label=category, group_id=group_id))

    def add_xy(self, x, y, group_id=None):
        """Append an observation carrying both ``x`` and ``y``."""
        self.add(Observation(x=x, y=y, group_id=group_id))

    def size(self):
        """Return the number of observations, across all groups."""
        return len(self.observations)

    def get(self, index):
        """Return the observation stored at position ``index``."""
        return self.observations[index]

    def is_categorical(self):
        """Return True when the first observation is categorical.

        An empty sample has no first observation and is reported as not
        categorical instead of raising ``IndexError``.
        """
        if not self.observations:
            return False
        return self.observations[0].is_categorical()

    def is_numerical(self):
        """Return True when the first observation is numerical.

        An empty sample has no first observation and is reported as not
        numerical instead of raising ``IndexError``.
        """
        if not self.observations:
            return False
        return self.observations[0].is_numerical()

    def count_by_group_id(self, group_id):
        """Count the observations that belong to ``group_id``.

        ``group_id=None`` disables the filter, so every observation counts.
        """
        if group_id is None:
            return len(self.observations)
        return sum(1 for ob in self.observations if ob.group_id == group_id)
71
72
73
class SampleDistribution(object):
    """Summary statistics of a sample, optionally restricted to one group.

    The statistics are either derived from a sample (see :meth:`build`) or
    taken from the values passed to the constructor. Anything that was
    neither supplied nor computed stays ``None``.
    """

    sample = None
    group_id = None

    categorical_value = None
    is_categorical = False
    is_numerical = False

    sd = None
    sample_size = None
    mean = None
    variance = None
    sum_of_squares = None

    proportion = None

    def __init__(self, sample=None, group_id=None, categorical_value=None, mean=None, sd=None, sample_size=None,
                 proportion=None):
        """Initialise from summary values and/or from a sample.

        :param sample: object offering ``size()``, ``get(i)``,
            ``is_numerical()``, ``is_categorical()`` and
            ``count_by_group_id(group_id)``; when given, :meth:`build` is
            called last and overwrites the statistics it computes.
        :param group_id: only observations with this group id are used;
            ``None`` means all observations.
        :param categorical_value: label whose proportion is computed for a
            categorical sample.
        :param mean: known mean; marks the distribution as numerical.
        :param sd: known standard deviation.
        :param sample_size: known number of observations.
        :param proportion: known proportion; marks the distribution as
            categorical.
        """
        if group_id is not None:
            self.group_id = group_id
        if categorical_value is not None:
            self.categorical_value = categorical_value

        if mean is not None:
            self.mean = mean
            self.is_numerical = True

        if proportion is not None:
            self.proportion = proportion
            self.is_categorical = True

        if sample_size is not None:
            self.sample_size = sample_size

        if sd is not None:
            self.sd = sd

        # With both sd and n known, the variance and the sum of squared
        # deviations follow directly: variance = sd^2, ss = variance * (n - 1).
        if self.sd is not None and self.sample_size is not None:
            self.variance = self.sd * self.sd
            self.sum_of_squares = self.variance * (self.sample_size - 1)

        if sample is not None:
            self.build(sample)

    def build(self, sample):
        """Compute the statistics from ``sample`` for ``self.group_id``.

        Numerical sample: sets mean, sum of squares, sample size, variance
        (sum of squares divided by ``n - 1``) and sd. With fewer than two
        matching observations the ``n - 1`` divisor is zero, so variance and
        sd are set to ``None`` instead of raising ``ZeroDivisionError``.

        Categorical sample with ``categorical_value`` set: sets proportion
        ``p``, sample size ``n``, mean ``n * p`` and variance
        ``n * p * (1 - p)``; ``sd`` is not touched.

        A sample matching neither case leaves all statistics unchanged.

        :raises ZeroDivisionError: numerical sample in which no observation
            matches ``self.group_id`` (raised by :meth:`calculate_mean`).
        """
        self.sample = sample
        if sample.is_numerical():
            self.mean = SampleDistribution.calculate_mean(sample, self.group_id)
            self.sum_of_squares = SampleDistribution.calculate_sum_of_squares(sample, self.mean, self.group_id)
            self.sample_size = sample.count_by_group_id(self.group_id)
            if self.sample_size > 1:
                self.variance = self.sum_of_squares / (self.sample_size - 1)
                self.sd = math.sqrt(self.variance)
            else:
                # Sample variance is undefined for n < 2; also clears any
                # sd/variance supplied to the constructor so they cannot be
                # mistaken for statistics of this sample.
                self.variance = None
                self.sd = None
            self.is_numerical = True
        elif sample.is_categorical() and self.categorical_value is not None:
            self.proportion = SampleDistribution.calculate_proportion(sample, self.categorical_value, self.group_id)
            self.sample_size = sample.count_by_group_id(self.group_id)
            self.mean = self.proportion * self.sample_size
            self.variance = self.proportion * (1.0 - self.proportion) * self.sample_size
            self.is_categorical = True

    @staticmethod
    def _observations_in_group(sample, group_id):
        """Yield the observations of ``sample`` matching ``group_id`` (all when None)."""
        for i in range(sample.size()):
            observation = sample.get(i)
            if group_id is None or observation.group_id == group_id:
                yield observation

    @staticmethod
    def calculate_mean(sample, group_id):
        """Return the arithmetic mean of ``x`` over the matching observations.

        :raises ZeroDivisionError: when no observation matches ``group_id``.
        """
        count = 0
        the_sum = 0
        for observation in SampleDistribution._observations_in_group(sample, group_id):
            the_sum += observation.x
            count += 1
        return the_sum / count

    @staticmethod
    def calculate_sum_of_squares(sample, mean, group_id):
        """Return the sum of squared deviations of ``x`` from ``mean``.

        Returns 0 when no observation matches ``group_id``.
        """
        return sum(
            (observation.x - mean) * (observation.x - mean)
            for observation in SampleDistribution._observations_in_group(sample, group_id)
        )

    @staticmethod
    def calculate_proportion(sample, categorical_value, group_id):
        """Return the share of matching observations labelled ``categorical_value``.

        Returns 0 when no observation matches ``group_id``.
        """
        matching = 0
        total = 0
        for observation in SampleDistribution._observations_in_group(sample, group_id):
            total += 1
            if observation.label == categorical_value:
                matching += 1
        if total == 0:
            return 0
        return matching / total
168
169