ProportionSamplingDistribution   A
last analyzed

Complexity

Total Complexity 14

Size/Duplication

Total Lines 55
Duplicated Lines 18.18 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
dl 10
loc 55
rs 10
c 1
b 0
f 0
wmc 14

4 Methods

Rating   Name   Duplication   Size   Complexity  
A simulate() 0 9 4
A confidence_interval() 10 10 2
B __init__() 0 19 7
A build() 0 5 1

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems, and corresponding solutions are:

1
import math
2
import random
3
4
from enum import Enum
5
6
from scipy.stats import norm, t
7
8
9
class DistributionFamily(Enum):
10
    normal = 1
11
    student_t = 2
12
    fisher = 3
13
    chi_square = 4
14
    simulation = 5
15
16
17
class MeanSamplingDistribution(object):
18
    sample_distribution = None
19
    point_estimate = None
20
    distribution_family = None
21
    df = None
22
    sample_sd = None
23
    sample_size = None
24
25
    def __init__(self, sample_distribution=None, sample_mean=None, sample_sd=None, sample_size=None):
26
        if sample_mean is not None:
27
            self.point_estimate = sample_mean
28
29
        if sample_sd is not None:
30
            self.sample_sd = sample_sd
31
32
        if sample_size is not None:
33
            self.sample_size = sample_size
34
35
        if sample_distribution is not None:
36
            self.sample_distribution = sample_distribution
37
            self.point_estimate = sample_distribution.mean
38
            self.sample_sd = sample_distribution.sd
39
            self.sample_size = sample_distribution.sample_size
40
41
        self.standard_error = MeanSamplingDistribution.calculate_standard_error(self.sample_sd, self.sample_size)
42
43
        self.df = self.sample_size - 1.0
44
        if self.sample_size < 30:
45
            self.distribution_family = DistributionFamily.student_t
46
        else:
47
            self.distribution_family = DistributionFamily.normal
48
49
    @staticmethod
50
    def calculate_standard_error(sample_sd, sample_size):
51
        return sample_sd / math.sqrt(sample_size)
52
53
    def confidence_interval(self, confidence_level):
54
        q = 1 - (1 - confidence_level) / 2
55
        if self.distribution_family == DistributionFamily.normal:
56
            z = norm.ppf(q)
57
            pf = z * self.standard_error
58
            return self.point_estimate - pf, self.point_estimate + pf
59
        else:
60
            t_df = t.ppf(q, self.df)
61
            pf = t_df * self.standard_error + self.point_estimate
62
            return self.point_estimate - pf, self.point_estimate + pf
63
64
65
class MeanDiffSamplingDistribution(object):
66
    grp1_sample_distribution = None
67
    grp2_sample_distribution = None
68
    grp1_point_estimate = None
69
    grp2_point_estimate = None
70
    grp1_sample_sd = None
71
    grp2_sample_sd = None
72
    grp1_sample_size = None
73
    grp2_sample_size = None
74
    distribution_family = None
75
    df = None
76
    point_estimate = None
77
78
    def __init__(self, grp1_sample_distribution=None, grp1_sample_mean=None, grp1_sample_sd=None, grp1_sample_size=None,
79
                 grp2_sample_distribution=None, grp2_sample_mean=None, grp2_sample_sd=None, grp2_sample_size=None):
80
        self.build_grp1(grp1_sample_distribution, grp1_sample_mean, grp1_sample_sd, grp1_sample_size)
81
        self.build_grp2(grp2_sample_distribution, grp2_sample_mean, grp2_sample_sd, grp2_sample_size)
82
83
        self.standard_error = self.calculate_standard_error()
84
85
        self.df = min(self.grp1_sample_size - 1.0, self.grp2_sample_size - 1.0)
86
        self.point_estimate = self.grp1_point_estimate - self.grp2_point_estimate
87
88
        if self.grp1_sample_size < 30 or self.grp2_sample_size < 30:
89
            self.distribution_family = DistributionFamily.student_t
90
        else:
91
            self.distribution_family = DistributionFamily.normal
92
93
    def build_grp1(self, grp1_sample_distribution=None, grp1_sample_mean=None, grp1_sample_sd=None, grp1_sample_size=None):
94
        if grp1_sample_mean is not None:
95
            self.grp1_point_estimate = grp1_sample_mean
96
97
        if grp1_sample_sd is not None:
98
            self.grp1_sample_sd = grp1_sample_sd
99
100
        if grp1_sample_size is not None:
101
            self.grp1_sample_size = grp1_sample_size
102
103
        if grp1_sample_distribution is not None:
104
            self.grp1_sample_distribution = grp1_sample_distribution
105
            self.grp1_point_estimate = grp1_sample_distribution.mean
106
            self.grp1_sample_sd = grp1_sample_distribution.sd
107
            self.grp1_sample_size = grp1_sample_distribution.sample_size
108
            
109
    def build_grp2(self, grp2_sample_distribution=None, grp2_sample_mean=None, grp2_sample_sd=None, grp2_sample_size=None):
110
        if grp2_sample_mean is not None:
111
            self.grp2_point_estimate = grp2_sample_mean
112
113
        if grp2_sample_sd is not None:
114
            self.grp2_sample_sd = grp2_sample_sd
115
116
        if grp2_sample_size is not None:
117
            self.grp2_sample_size = grp2_sample_size
118
119
        if grp2_sample_distribution is not None:
120
            self.grp2_sample_distribution = grp2_sample_distribution
121
            self.grp2_point_estimate = grp2_sample_distribution.mean
122
            self.grp2_sample_sd = grp2_sample_distribution.sd
123
            self.grp2_sample_size = grp2_sample_distribution.sample_size
124
125
    def calculate_standard_error(self):
126
        return math.sqrt(self.grp1_sample_sd * self.grp1_sample_sd / self.grp1_sample_size + 
127
                         self.grp2_sample_sd * self.grp2_sample_sd / self.grp2_sample_size)
128
129
    def confidence_interval(self, confidence_level):
130
        q = 1 - (1 - confidence_level) / 2
131
        if self.distribution_family == DistributionFamily.normal:
132
            z = norm.ppf(q)
133
            pf = z * self.standard_error
134
            
135
            return self.point_estimate - pf, self.point_estimate + pf
136
        else:
137
            t_df = t.ppf(q, self.df)
138
            pf = t_df * self.standard_error + self.point_estimate
139
            return self.point_estimate - pf, self.point_estimate + pf
140
141
142
class ProportionSamplingDistribution(object):
143
    sample_distribution = None
144
    point_estimate = None
145
    distribution_family = None
146
    sample_size = None
147
    categorical_value = None
148
    standard_error = None
149
    simulated_proportions = None
150
151
    def __init__(self, sample_distribution=None, categorical_value=None, sample_proportion=None, sample_size=None):
152
        if sample_proportion is not None:
153
            self.point_estimate = sample_proportion
154
155
        if sample_size is not None:
156
            self.sample_size = sample_size
157
158
        if categorical_value is not None:
159
            self.categorical_value = categorical_value
160
161
        if sample_distribution is not None:
162
            self.build(sample_distribution)
163
164
        if self.sample_size * self.point_estimate < 10 or self.sample_size * (1 - self.point_estimate) < 10:
165
            self.distribution_family = DistributionFamily.simulation
166
            self.simulate()
167
        else:
168
            self.distribution_family = DistributionFamily.normal
169
            self.standard_error = math.sqrt(self.point_estimate * (1 - self.point_estimate) / self.sample_size)
170
171
    def build(self, sample_distribution):
172
        self.sample_distribution = sample_distribution
173
        self.point_estimate = sample_distribution.proportion
174
        self.categorical_value = sample_distribution.categorical_value
175
        self.sample_size = sample_distribution.sample_size
176
177
    def simulate(self):
178
        self.simulated_proportions = [0] * 1000
179
        for i in range(1000):
180
            count = 0
181
            for trials in range(self.sample_size):
182
                if random.random() <= self.point_estimate:
183
                    count += 1
184
            self.simulated_proportions[i] = float(count) / self.sample_size
185
        self.simulated_proportions = sorted(self.simulated_proportions)
186
187 View Code Duplication
    def confidence_interval(self, confidence_level):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
188
        q = 1 - (1 - confidence_level) / 2
189
        if self.distribution_family == DistributionFamily.normal:
190
            z = norm.ppf(q)
191
            pf = z * self.standard_error
192
            return self.point_estimate - pf, self.point_estimate + pf
193
        else:
194
            threshold1 = int(1000 * (1 - confidence_level) / 2)
195
            threshold2 = int(1000 * q)
196
            return self.simulated_proportions[threshold1], self.simulated_proportions[threshold2]
197
        
198
        
199
class ProportionDiffSamplingDistribution(object):
200
    grp1_sample_distribution = None
201
    grp2_sample_distribution = None
202
    grp1_point_estimate = None
203
    grp2_point_estimate = None
204
    distribution_family = None
205
    grp1_sample_size = None
206
    grp2_sample_size = None
207
    categorical_value = None
208
    standard_error = None
209
    grp1_simulated_proportions = None
210
    grp2_simulated_proportions = None
211
    diff_simulated_proportions = None
212
    point_estimate = None
213
214
    def __init__(self, categorical_value=None,
215
                 grp1_sample_distribution=None, grp1_sample_proportion=None, grp1_sample_size=None,
216
                 grp2_sample_distribution=None, grp2_sample_proportion=None, grp2_sample_size=None):
217
        if categorical_value is not None:
218
            self.categorical_value = categorical_value
219
            
220
        self.build_grp1(grp1_sample_distribution, grp1_sample_proportion, grp1_sample_size)
221
        self.build_grp2(grp2_sample_distribution, grp2_sample_proportion, grp2_sample_size)
222
223
        if not self.is_clt_held():
224
            self.distribution_family = DistributionFamily.simulation
225
            self.simulate()
226
        else:
227
            self.distribution_family = DistributionFamily.normal
228
            self.standard_error = self.calculate_standard_error()
229
230
        self.point_estimate = self.grp1_point_estimate - self.grp2_point_estimate
231
            
232
    def calculate_standard_error(self):
233
        return math.sqrt(self.grp1_point_estimate * (1 - self.grp1_point_estimate) / self.grp2_sample_size +
234
                         self.grp2_point_estimate * (1 - self.grp2_point_estimate) / self.grp2_sample_size)
235
236
    def is_clt_held(self):
237
        if self.grp1_sample_size * self.grp1_point_estimate < 10:
238
            return False
239
        if self.grp1_sample_size * (1 - self.grp1_point_estimate) < 10:
240
            return False
241
        if self.grp2_sample_size * self.grp2_point_estimate < 10:
242
            return False
243
        if self.grp2_sample_size * (1 - self.grp2_point_estimate) < 10:
244
            return False
245
        return True
246
247
    def build_grp1(self, grp1_sample_distribution=None, grp1_sample_proportion=None, grp1_sample_size=None):
248
        if grp1_sample_proportion is not None:
249
            self.grp1_point_estimate = grp1_sample_proportion
250
251
        if grp1_sample_size is not None:
252
            self.grp1_sample_size = grp1_sample_size
253
254
        if grp1_sample_distribution is not None:
255
            self.grp1_sample_distribution = grp1_sample_distribution
256
            self.grp1_point_estimate = grp1_sample_distribution.proportion
257
            self.categorical_value = grp1_sample_distribution.categorical_value
258
            self.grp1_sample_size = grp1_sample_distribution.sample_size
259
260
    def build_grp2(self, grp2_sample_distribution=None, grp2_sample_proportion=None, grp2_sample_size=None):
261
        if grp2_sample_proportion is not None:
262
            self.grp2_point_estimate = grp2_sample_proportion
263
264
        if grp2_sample_size is not None:
265
            self.grp2_sample_size = grp2_sample_size
266
267
        if grp2_sample_distribution is not None:
268
            self.grp2_sample_distribution = grp2_sample_distribution
269
            self.grp2_point_estimate = grp2_sample_distribution.proportion
270
            self.categorical_value = grp2_sample_distribution.categorical_value
271
            self.grp2_sample_size = grp2_sample_distribution.sample_size
272
            
273
    def simulate(self):
274
        self.grp1_simulated_proportions = ProportionDiffSamplingDistribution.simulate_grp(self.grp1_point_estimate,
275
                                                                                          self.grp1_sample_size)
276
        self.grp2_simulated_proportions = ProportionDiffSamplingDistribution.simulate_grp(self.grp2_point_estimate,
277
                                                                                          self.grp2_sample_size)
278
279
        self.diff_simulated_proportions = [0] * 1000;
280
        for i in range(1000):
281
            self.diff_simulated_proportions[i] = self.grp1_simulated_proportions[i] - self.grp2_simulated_proportions[i]
282
        
283
    @staticmethod
284
    def simulate_grp(proportion, sample_size):
285
        simulated_proportions = [0] * 1000
286
        for iter in range(1000):
287
            count = 0
288
            for trials in range(sample_size):
289
                if random.random() <= proportion:
290
                    count += 1
291
            simulated_proportions[iter] = float(count) / sample_size
292
293
        return sorted(simulated_proportions)
294
    
295 View Code Duplication
    def confidence_interval(self, confidence_level):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
296
        q = 1 - (1 - confidence_level) / 2
297
        if self.distribution_family == DistributionFamily.normal:
298
            z = norm.ppf(q)
299
            pf = z * self.standard_error
300
            return self.point_estimate - pf, self.point_estimate + pf
301
        else:
302
            threshold1 = int(1000 * (1 - confidence_level) / 2)
303
            threshold2 = int(1000 * q)
304
            return self.diff_simulated_proportions[threshold1], self.diff_simulated_proportions[threshold2]
305
306
307
308