1
|
|
|
import math |
2
|
|
|
import random |
3
|
|
|
|
4
|
|
|
from enum import Enum |
5
|
|
|
|
6
|
|
|
from scipy.stats import norm, t |
7
|
|
|
|
8
|
|
|
|
9
|
|
|
class DistributionFamily(Enum): |
10
|
|
|
normal = 1 |
11
|
|
|
student_t = 2 |
12
|
|
|
fisher = 3 |
13
|
|
|
chi_square = 4 |
14
|
|
|
simulation = 5 |
15
|
|
|
|
16
|
|
|
|
17
|
|
|
class MeanSamplingDistribution(object): |
18
|
|
|
sample_distribution = None |
19
|
|
|
point_estimate = None |
20
|
|
|
distribution_family = None |
21
|
|
|
df = None |
22
|
|
|
sample_sd = None |
23
|
|
|
sample_size = None |
24
|
|
|
|
25
|
|
|
def __init__(self, sample_distribution=None, sample_mean=None, sample_sd=None, sample_size=None): |
26
|
|
|
if sample_mean is not None: |
27
|
|
|
self.point_estimate = sample_mean |
28
|
|
|
|
29
|
|
|
if sample_sd is not None: |
30
|
|
|
self.sample_sd = sample_sd |
31
|
|
|
|
32
|
|
|
if sample_size is not None: |
33
|
|
|
self.sample_size = sample_size |
34
|
|
|
|
35
|
|
|
if sample_distribution is not None: |
36
|
|
|
self.sample_distribution = sample_distribution |
37
|
|
|
self.point_estimate = sample_distribution.mean |
38
|
|
|
self.sample_sd = sample_distribution.sd |
39
|
|
|
self.sample_size = sample_distribution.sample_size |
40
|
|
|
|
41
|
|
|
self.standard_error = MeanSamplingDistribution.calculate_standard_error(self.sample_sd, self.sample_size) |
42
|
|
|
|
43
|
|
|
self.df = self.sample_size - 1.0 |
44
|
|
|
if self.sample_size < 30: |
45
|
|
|
self.distribution_family = DistributionFamily.student_t |
46
|
|
|
else: |
47
|
|
|
self.distribution_family = DistributionFamily.normal |
48
|
|
|
|
49
|
|
|
@staticmethod |
50
|
|
|
def calculate_standard_error(sample_sd, sample_size): |
51
|
|
|
return sample_sd / math.sqrt(sample_size) |
52
|
|
|
|
53
|
|
|
def confidence_interval(self, confidence_level): |
54
|
|
|
q = 1 - (1 - confidence_level) / 2 |
55
|
|
|
if self.distribution_family == DistributionFamily.normal: |
56
|
|
|
z = norm.ppf(q) |
57
|
|
|
pf = z * self.standard_error |
58
|
|
|
return self.point_estimate - pf, self.point_estimate + pf |
59
|
|
|
else: |
60
|
|
|
t_df = t.ppf(q, self.df) |
61
|
|
|
pf = t_df * self.standard_error + self.point_estimate |
62
|
|
|
return self.point_estimate - pf, self.point_estimate + pf |
63
|
|
|
|
64
|
|
|
|
65
|
|
|
class MeanDiffSamplingDistribution(object): |
66
|
|
|
grp1_sample_distribution = None |
67
|
|
|
grp2_sample_distribution = None |
68
|
|
|
grp1_point_estimate = None |
69
|
|
|
grp2_point_estimate = None |
70
|
|
|
grp1_sample_sd = None |
71
|
|
|
grp2_sample_sd = None |
72
|
|
|
grp1_sample_size = None |
73
|
|
|
grp2_sample_size = None |
74
|
|
|
distribution_family = None |
75
|
|
|
df = None |
76
|
|
|
point_estimate = None |
77
|
|
|
|
78
|
|
|
def __init__(self, grp1_sample_distribution=None, grp1_sample_mean=None, grp1_sample_sd=None, grp1_sample_size=None, |
79
|
|
|
grp2_sample_distribution=None, grp2_sample_mean=None, grp2_sample_sd=None, grp2_sample_size=None): |
80
|
|
|
self.build_grp1(grp1_sample_distribution, grp1_sample_mean, grp1_sample_sd, grp1_sample_size) |
81
|
|
|
self.build_grp2(grp2_sample_distribution, grp2_sample_mean, grp2_sample_sd, grp2_sample_size) |
82
|
|
|
|
83
|
|
|
self.standard_error = self.calculate_standard_error() |
84
|
|
|
|
85
|
|
|
self.df = min(self.grp1_sample_size - 1.0, self.grp2_sample_size - 1.0) |
86
|
|
|
self.point_estimate = self.grp1_point_estimate - self.grp2_point_estimate |
87
|
|
|
|
88
|
|
|
if self.grp1_sample_size < 30 or self.grp2_sample_size < 30: |
89
|
|
|
self.distribution_family = DistributionFamily.student_t |
90
|
|
|
else: |
91
|
|
|
self.distribution_family = DistributionFamily.normal |
92
|
|
|
|
93
|
|
|
def build_grp1(self, grp1_sample_distribution=None, grp1_sample_mean=None, grp1_sample_sd=None, grp1_sample_size=None): |
94
|
|
|
if grp1_sample_mean is not None: |
95
|
|
|
self.grp1_point_estimate = grp1_sample_mean |
96
|
|
|
|
97
|
|
|
if grp1_sample_sd is not None: |
98
|
|
|
self.grp1_sample_sd = grp1_sample_sd |
99
|
|
|
|
100
|
|
|
if grp1_sample_size is not None: |
101
|
|
|
self.grp1_sample_size = grp1_sample_size |
102
|
|
|
|
103
|
|
|
if grp1_sample_distribution is not None: |
104
|
|
|
self.grp1_sample_distribution = grp1_sample_distribution |
105
|
|
|
self.grp1_point_estimate = grp1_sample_distribution.mean |
106
|
|
|
self.grp1_sample_sd = grp1_sample_distribution.sd |
107
|
|
|
self.grp1_sample_size = grp1_sample_distribution.sample_size |
108
|
|
|
|
109
|
|
|
def build_grp2(self, grp2_sample_distribution=None, grp2_sample_mean=None, grp2_sample_sd=None, grp2_sample_size=None): |
110
|
|
|
if grp2_sample_mean is not None: |
111
|
|
|
self.grp2_point_estimate = grp2_sample_mean |
112
|
|
|
|
113
|
|
|
if grp2_sample_sd is not None: |
114
|
|
|
self.grp2_sample_sd = grp2_sample_sd |
115
|
|
|
|
116
|
|
|
if grp2_sample_size is not None: |
117
|
|
|
self.grp2_sample_size = grp2_sample_size |
118
|
|
|
|
119
|
|
|
if grp2_sample_distribution is not None: |
120
|
|
|
self.grp2_sample_distribution = grp2_sample_distribution |
121
|
|
|
self.grp2_point_estimate = grp2_sample_distribution.mean |
122
|
|
|
self.grp2_sample_sd = grp2_sample_distribution.sd |
123
|
|
|
self.grp2_sample_size = grp2_sample_distribution.sample_size |
124
|
|
|
|
125
|
|
|
def calculate_standard_error(self): |
126
|
|
|
return math.sqrt(self.grp1_sample_sd * self.grp1_sample_sd / self.grp1_sample_size + |
127
|
|
|
self.grp2_sample_sd * self.grp2_sample_sd / self.grp2_sample_size) |
128
|
|
|
|
129
|
|
|
def confidence_interval(self, confidence_level): |
130
|
|
|
q = 1 - (1 - confidence_level) / 2 |
131
|
|
|
if self.distribution_family == DistributionFamily.normal: |
132
|
|
|
z = norm.ppf(q) |
133
|
|
|
pf = z * self.standard_error |
134
|
|
|
|
135
|
|
|
return self.point_estimate - pf, self.point_estimate + pf |
136
|
|
|
else: |
137
|
|
|
t_df = t.ppf(q, self.df) |
138
|
|
|
pf = t_df * self.standard_error + self.point_estimate |
139
|
|
|
return self.point_estimate - pf, self.point_estimate + pf |
140
|
|
|
|
141
|
|
|
|
142
|
|
|
class ProportionSamplingDistribution(object): |
143
|
|
|
sample_distribution = None |
144
|
|
|
point_estimate = None |
145
|
|
|
distribution_family = None |
146
|
|
|
sample_size = None |
147
|
|
|
categorical_value = None |
148
|
|
|
standard_error = None |
149
|
|
|
simulated_proportions = None |
150
|
|
|
|
151
|
|
|
def __init__(self, sample_distribution=None, categorical_value=None, sample_proportion=None, sample_size=None): |
152
|
|
|
if sample_proportion is not None: |
153
|
|
|
self.point_estimate = sample_proportion |
154
|
|
|
|
155
|
|
|
if sample_size is not None: |
156
|
|
|
self.sample_size = sample_size |
157
|
|
|
|
158
|
|
|
if categorical_value is not None: |
159
|
|
|
self.categorical_value = categorical_value |
160
|
|
|
|
161
|
|
|
if sample_distribution is not None: |
162
|
|
|
self.build(sample_distribution) |
163
|
|
|
|
164
|
|
|
if self.sample_size * self.point_estimate < 10 or self.sample_size * (1 - self.point_estimate) < 10: |
165
|
|
|
self.distribution_family = DistributionFamily.simulation |
166
|
|
|
self.simulate() |
167
|
|
|
else: |
168
|
|
|
self.distribution_family = DistributionFamily.normal |
169
|
|
|
self.standard_error = math.sqrt(self.point_estimate * (1 - self.point_estimate) / self.sample_size) |
170
|
|
|
|
171
|
|
|
def build(self, sample_distribution): |
172
|
|
|
self.sample_distribution = sample_distribution |
173
|
|
|
self.point_estimate = sample_distribution.proportion |
174
|
|
|
self.categorical_value = sample_distribution.categorical_value |
175
|
|
|
self.sample_size = sample_distribution.sample_size |
176
|
|
|
|
177
|
|
|
def simulate(self): |
178
|
|
|
self.simulated_proportions = [0] * 1000 |
179
|
|
|
for i in range(1000): |
180
|
|
|
count = 0 |
181
|
|
|
for trials in range(self.sample_size): |
182
|
|
|
if random.random() <= self.point_estimate: |
183
|
|
|
count += 1 |
184
|
|
|
self.simulated_proportions[i] = float(count) / self.sample_size |
185
|
|
|
self.simulated_proportions = sorted(self.simulated_proportions) |
186
|
|
|
|
187
|
|
View Code Duplication |
def confidence_interval(self, confidence_level): |
|
|
|
|
188
|
|
|
q = 1 - (1 - confidence_level) / 2 |
189
|
|
|
if self.distribution_family == DistributionFamily.normal: |
190
|
|
|
z = norm.ppf(q) |
191
|
|
|
pf = z * self.standard_error |
192
|
|
|
return self.point_estimate - pf, self.point_estimate + pf |
193
|
|
|
else: |
194
|
|
|
threshold1 = int(1000 * (1 - confidence_level) / 2) |
195
|
|
|
threshold2 = int(1000 * q) |
196
|
|
|
return self.simulated_proportions[threshold1], self.simulated_proportions[threshold2] |
197
|
|
|
|
198
|
|
|
|
199
|
|
|
class ProportionDiffSamplingDistribution(object): |
200
|
|
|
grp1_sample_distribution = None |
201
|
|
|
grp2_sample_distribution = None |
202
|
|
|
grp1_point_estimate = None |
203
|
|
|
grp2_point_estimate = None |
204
|
|
|
distribution_family = None |
205
|
|
|
grp1_sample_size = None |
206
|
|
|
grp2_sample_size = None |
207
|
|
|
categorical_value = None |
208
|
|
|
standard_error = None |
209
|
|
|
grp1_simulated_proportions = None |
210
|
|
|
grp2_simulated_proportions = None |
211
|
|
|
diff_simulated_proportions = None |
212
|
|
|
point_estimate = None |
213
|
|
|
|
214
|
|
|
def __init__(self, categorical_value=None, |
215
|
|
|
grp1_sample_distribution=None, grp1_sample_proportion=None, grp1_sample_size=None, |
216
|
|
|
grp2_sample_distribution=None, grp2_sample_proportion=None, grp2_sample_size=None): |
217
|
|
|
if categorical_value is not None: |
218
|
|
|
self.categorical_value = categorical_value |
219
|
|
|
|
220
|
|
|
self.build_grp1(grp1_sample_distribution, grp1_sample_proportion, grp1_sample_size) |
221
|
|
|
self.build_grp2(grp2_sample_distribution, grp2_sample_proportion, grp2_sample_size) |
222
|
|
|
|
223
|
|
|
if not self.is_clt_held(): |
224
|
|
|
self.distribution_family = DistributionFamily.simulation |
225
|
|
|
self.simulate() |
226
|
|
|
else: |
227
|
|
|
self.distribution_family = DistributionFamily.normal |
228
|
|
|
self.standard_error = self.calculate_standard_error() |
229
|
|
|
|
230
|
|
|
self.point_estimate = self.grp1_point_estimate - self.grp2_point_estimate |
231
|
|
|
|
232
|
|
|
def calculate_standard_error(self): |
233
|
|
|
return math.sqrt(self.grp1_point_estimate * (1 - self.grp1_point_estimate) / self.grp2_sample_size + |
234
|
|
|
self.grp2_point_estimate * (1 - self.grp2_point_estimate) / self.grp2_sample_size) |
235
|
|
|
|
236
|
|
|
def is_clt_held(self): |
237
|
|
|
if self.grp1_sample_size * self.grp1_point_estimate < 10: |
238
|
|
|
return False |
239
|
|
|
if self.grp1_sample_size * (1 - self.grp1_point_estimate) < 10: |
240
|
|
|
return False |
241
|
|
|
if self.grp2_sample_size * self.grp2_point_estimate < 10: |
242
|
|
|
return False |
243
|
|
|
if self.grp2_sample_size * (1 - self.grp2_point_estimate) < 10: |
244
|
|
|
return False |
245
|
|
|
return True |
246
|
|
|
|
247
|
|
|
def build_grp1(self, grp1_sample_distribution=None, grp1_sample_proportion=None, grp1_sample_size=None): |
248
|
|
|
if grp1_sample_proportion is not None: |
249
|
|
|
self.grp1_point_estimate = grp1_sample_proportion |
250
|
|
|
|
251
|
|
|
if grp1_sample_size is not None: |
252
|
|
|
self.grp1_sample_size = grp1_sample_size |
253
|
|
|
|
254
|
|
|
if grp1_sample_distribution is not None: |
255
|
|
|
self.grp1_sample_distribution = grp1_sample_distribution |
256
|
|
|
self.grp1_point_estimate = grp1_sample_distribution.proportion |
257
|
|
|
self.categorical_value = grp1_sample_distribution.categorical_value |
258
|
|
|
self.grp1_sample_size = grp1_sample_distribution.sample_size |
259
|
|
|
|
260
|
|
|
def build_grp2(self, grp2_sample_distribution=None, grp2_sample_proportion=None, grp2_sample_size=None): |
261
|
|
|
if grp2_sample_proportion is not None: |
262
|
|
|
self.grp2_point_estimate = grp2_sample_proportion |
263
|
|
|
|
264
|
|
|
if grp2_sample_size is not None: |
265
|
|
|
self.grp2_sample_size = grp2_sample_size |
266
|
|
|
|
267
|
|
|
if grp2_sample_distribution is not None: |
268
|
|
|
self.grp2_sample_distribution = grp2_sample_distribution |
269
|
|
|
self.grp2_point_estimate = grp2_sample_distribution.proportion |
270
|
|
|
self.categorical_value = grp2_sample_distribution.categorical_value |
271
|
|
|
self.grp2_sample_size = grp2_sample_distribution.sample_size |
272
|
|
|
|
273
|
|
|
def simulate(self): |
274
|
|
|
self.grp1_simulated_proportions = ProportionDiffSamplingDistribution.simulate_grp(self.grp1_point_estimate, |
275
|
|
|
self.grp1_sample_size) |
276
|
|
|
self.grp2_simulated_proportions = ProportionDiffSamplingDistribution.simulate_grp(self.grp2_point_estimate, |
277
|
|
|
self.grp2_sample_size) |
278
|
|
|
|
279
|
|
|
self.diff_simulated_proportions = [0] * 1000; |
280
|
|
|
for i in range(1000): |
281
|
|
|
self.diff_simulated_proportions[i] = self.grp1_simulated_proportions[i] - self.grp2_simulated_proportions[i] |
282
|
|
|
|
283
|
|
|
@staticmethod |
284
|
|
|
def simulate_grp(proportion, sample_size): |
285
|
|
|
simulated_proportions = [0] * 1000 |
286
|
|
|
for iter in range(1000): |
287
|
|
|
count = 0 |
288
|
|
|
for trials in range(sample_size): |
289
|
|
|
if random.random() <= proportion: |
290
|
|
|
count += 1 |
291
|
|
|
simulated_proportions[iter] = float(count) / sample_size |
292
|
|
|
|
293
|
|
|
return sorted(simulated_proportions) |
294
|
|
|
|
295
|
|
View Code Duplication |
def confidence_interval(self, confidence_level): |
|
|
|
|
296
|
|
|
q = 1 - (1 - confidence_level) / 2 |
297
|
|
|
if self.distribution_family == DistributionFamily.normal: |
298
|
|
|
z = norm.ppf(q) |
299
|
|
|
pf = z * self.standard_error |
300
|
|
|
return self.point_estimate - pf, self.point_estimate + pf |
301
|
|
|
else: |
302
|
|
|
threshold1 = int(1000 * (1 - confidence_level) / 2) |
303
|
|
|
threshold2 = int(1000 * q) |
304
|
|
|
return self.diff_simulated_proportions[threshold1], self.diff_simulated_proportions[threshold2] |
305
|
|
|
|
306
|
|
|
|
307
|
|
|
|
308
|
|
|
|