Completed
Push — master ( b2a338...b187e9 )
by Xianshun
01:16
created

SampleDistribution   A

Complexity

Total Complexity 27

Size/Duplication

Total Lines 84
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
dl 0
loc 84
rs 10
c 1
b 0
f 0
wmc 27

4 Methods

Rating   Name   Duplication   Size   Complexity  
B calculate_proportion() 0 14 6
A calculate_mean() 0 11 4
A calculate_sum_of_squares() 0 9 4
F __init__() 0 39 13
1
import math
2
3
4
class Observation(object):
    """A single data point that can be collected into a sample.

    An observation may carry a numerical value ``x``, a categorical
    ``label``, a second value ``y`` and a ``group_id``. Any field that is
    not supplied keeps the class-level default of ``None``.
    """

    # Class-level defaults; __init__ shadows them per instance when a
    # non-None value is supplied.
    x = None
    y = None
    group_id = None
    label = None

    def __init__(self, x=None, label=None, group_id=None, y=None):
        """Store every argument that is not ``None`` on the instance."""
        provided = (("x", x), ("y", y), ("label", label), ("group_id", group_id))
        for field_name, field_value in provided:
            if field_value is None:
                # Leave the class-level default in place.
                continue
            setattr(self, field_name, field_value)

    def is_categorical(self):
        """Return True when this observation has a label."""
        return not (self.label is None)

    def is_numerical(self):
        """Return True when this observation has an ``x`` value."""
        return not (self.x is None)
27
28
29
class Sample(object):
    """An ordered, growable collection of observations."""

    # Replaced by a fresh per-instance list in __init__.
    observations = None

    def __init__(self):
        """Start with an empty list of observations."""
        self.observations = list()

    def add(self, observation):
        """Append ``observation`` to the end of the sample."""
        members = self.observations
        members.append(observation)

    def size(self):
        """Return the number of observations held."""
        members = self.observations
        return len(members)

    def get(self, index):
        """Return the observation stored at position ``index``."""
        members = self.observations
        return members[index]

    def is_categorical(self):
        """Report whether the first observation is categorical.

        Only the first observation is inspected; an empty sample raises
        ``IndexError``.
        """
        first_observation = self.observations[0]
        return first_observation.is_categorical()

    def is_numerical(self):
        """Report whether the first observation is numerical.

        Only the first observation is inspected; an empty sample raises
        ``IndexError``.
        """
        first_observation = self.observations[0]
        return first_observation.is_numerical()

    def count_by_group_id(self, group_id):
        """Count the observations in ``group_id``.

        A ``group_id`` of ``None`` counts every observation.
        """
        if group_id is None:
            return len(self.observations)
        in_group = [member for member in self.observations if member.group_id == group_id]
        return len(in_group)
52
53
54
class SampleDistribution(object):
    """Summary statistics for a sample, or for one group within a sample.

    An instance can be built in two ways, which may be combined:

    * from pre-computed summary values (``mean``, ``sd``, ``sample_size``,
      ``proportion``), or
    * from a ``sample`` object, in which case the statistics are computed
      from its observations and overwrite any summary values passed in.

    ``sample`` is expected to provide ``size()``, ``get(index)``,
    ``is_numerical()``, ``is_categorical()`` and
    ``count_by_group_id(group_id)``; its observations are read through
    their ``x``, ``label`` and ``group_id`` attributes.
    """

    sample = None
    group_id = None

    categorical_value = None
    is_categorical = False
    is_numerical = False

    # Statistics default to None at class level so every attribute can be
    # read on any instance. Without these defaults, __init__ itself raised
    # AttributeError on ``self.sd`` / ``self.sample_size`` whenever they
    # were not passed in explicitly (e.g. when built from a sample only).
    mean = None
    sd = None
    sample_size = None
    proportion = None
    variance = None
    sum_of_squares = None

    def __init__(self, sample=None, group_id=None, categorical_value=None, mean=None, sd=None, sample_size=None, proportion=None):
        """Populate the distribution from summary values and/or a sample.

        Args:
            sample: Optional sample to compute statistics from.
            group_id: Restrict computation to observations whose
                ``group_id`` equals this value; ``None`` uses them all.
            categorical_value: Label whose proportion is computed when the
                sample is categorical.
            mean: Pre-computed mean; marks the distribution as numerical.
            sd: Pre-computed standard deviation.
            sample_size: Pre-computed number of observations.
            proportion: Pre-computed proportion; marks the distribution as
                categorical.

        Raises:
            ZeroDivisionError: If a numerical sample has no observation in
                the requested group, or exactly one (the variance divides
                by ``sample_size - 1``).
        """
        if group_id is not None:
            self.group_id = group_id
        if categorical_value is not None:
            self.categorical_value = categorical_value

        self._apply_summary(mean, sd, sample_size, proportion)

        if sample is not None:
            self.sample = sample
            # Values computed from the sample take precedence over any
            # summary values applied above.
            self._apply_sample(sample, categorical_value, group_id)

    def _apply_summary(self, mean, sd, sample_size, proportion):
        """Store the supplied summary values and derive variance from sd."""
        if mean is not None:
            self.mean = mean
            self.is_numerical = True

        if proportion is not None:
            self.proportion = proportion
            self.is_categorical = True

        if sample_size is not None:
            self.sample_size = sample_size

        if sd is not None:
            self.sd = sd

        if self.sd is not None and self.sample_size is not None:
            self.variance = self.sd * self.sd
            self.sum_of_squares = self.variance * (self.sample_size - 1)

    def _apply_sample(self, sample, categorical_value, group_id):
        """Compute the statistics from the observations of ``sample``."""
        if sample.is_numerical():
            self.mean = SampleDistribution.calculate_mean(sample, group_id)
            self.sum_of_squares = SampleDistribution.calculate_sum_of_squares(sample, self.mean, group_id)
            self.sample_size = sample.count_by_group_id(group_id)
            # Divides by n - 1, so one observation raises ZeroDivisionError.
            self.variance = self.sum_of_squares / (self.sample_size - 1)
            self.sd = math.sqrt(self.variance)
            self.is_numerical = True
        elif sample.is_categorical() and categorical_value is not None:
            self.proportion = SampleDistribution.calculate_proportion(sample, categorical_value, group_id)
            self.sample_size = sample.count_by_group_id(group_id)
            # Mean and variance of the count: n * p and n * p * (1 - p).
            # ``sd`` is not computed on this branch.
            self.mean = self.proportion * self.sample_size
            self.variance = self.proportion * (1.0 - self.proportion) * self.sample_size
            self.is_categorical = True

    @staticmethod
    def _group_members(sample, group_id):
        """Yield the observations of ``sample`` in ``group_id`` (all if None)."""
        for position in range(sample.size()):
            observation = sample.get(position)
            if group_id is None or observation.group_id == group_id:
                yield observation

    @staticmethod
    def calculate_mean(sample, group_id):
        """Return the arithmetic mean of ``x`` over the selected observations.

        Raises:
            ZeroDivisionError: If no observation is selected.
        """
        values = [member.x for member in SampleDistribution._group_members(sample, group_id)]
        return sum(values) / len(values)

    @staticmethod
    def calculate_sum_of_squares(sample, mean, group_id):
        """Return the sum of squared deviations of ``x`` from ``mean``."""
        return sum(
            (member.x - mean) * (member.x - mean)
            for member in SampleDistribution._group_members(sample, group_id)
        )

    @staticmethod
    def calculate_proportion(sample, categorical_value, group_id):
        """Return the share of selected observations labelled ``categorical_value``.

        Returns ``0`` when no observation is selected.
        """
        labels = [member.label for member in SampleDistribution._group_members(sample, group_id)]
        if not labels:
            return 0
        matches = sum(1 for label in labels if label == categorical_value)
        return matches / len(labels)
138
139