|
1
|
|
|
import math |
|
2
|
|
|
import random |
|
3
|
|
|
|
|
4
|
|
|
from enum import Enum |
|
5
|
|
|
|
|
6
|
|
|
from scipy.stats import norm, t |
|
7
|
|
|
|
|
8
|
|
|
|
|
9
|
|
|
class DistributionFamily(Enum):
    """Families of reference distribution a sampling distribution can use.

    The integer values are part of the interface and must stay stable.
    """

    # Normal (z) approximation.
    normal = 1
    # Student's t distribution.
    student_t = 2
    # Fisher (F) distribution -- presumably; not used in the visible code.
    fisher = 3
    # Chi-square distribution -- not used in the visible code.
    chi_square = 4
    # Empirical distribution obtained by random simulation.
    simulation = 5
|
15
|
|
|
|
|
16
|
|
|
|
|
17
|
|
|
class MeanSamplingDistribution(object):
    """Sampling distribution of a sample mean.

    An instance is built either from summary statistics (``sample_mean``,
    ``sample_sd``, ``sample_size``) or from a ``sample_distribution`` object
    exposing ``mean``, ``sd`` and ``sample_size`` attributes. When both are
    supplied, the values read from ``sample_distribution`` take precedence.

    Samples with fewer than 30 observations are modelled with Student's t
    using ``sample_size - 1`` degrees of freedom; larger samples use the
    normal distribution.
    """

    sample_distribution = None
    point_estimate = None
    distribution_family = None
    df = None
    # Declared at class level so a missing statistic shows up as None rather
    # than as an AttributeError on first use.
    sample_sd = None
    sample_size = None
    standard_error = None

    def __init__(self, sample_distribution=None, sample_mean=None, sample_sd=None, sample_size=None):
        """Initialise the distribution and derive its standard error.

        Args:
            sample_distribution: optional object with ``mean``, ``sd`` and
                ``sample_size`` attributes; overrides the scalar arguments.
            sample_mean: sample mean, stored as ``point_estimate``.
            sample_sd: sample standard deviation.
            sample_size: number of observations in the sample.

        Raises:
            TypeError: if neither the scalar arguments nor
                ``sample_distribution`` provide a standard deviation and a
                sample size.
        """
        if sample_mean is not None:
            self.point_estimate = sample_mean
        if sample_sd is not None:
            self.sample_sd = sample_sd
        if sample_size is not None:
            self.sample_size = sample_size

        if sample_distribution is not None:
            self.sample_distribution = sample_distribution
            self.point_estimate = sample_distribution.mean
            self.sample_sd = sample_distribution.sd
            self.sample_size = sample_distribution.sample_size

        if self.sample_sd is None or self.sample_size is None:
            raise TypeError(
                "MeanSamplingDistribution requires sample_sd and sample_size, "
                "either directly or through sample_distribution"
            )

        self.standard_error = MeanSamplingDistribution.calculate_standard_error(
            self.sample_sd, self.sample_size
        )

        self.df = self.sample_size - 1.0
        if self.sample_size < 30:
            self.distribution_family = DistributionFamily.student_t
        else:
            self.distribution_family = DistributionFamily.normal

    @staticmethod
    def calculate_standard_error(sample_sd, sample_size):
        """Return ``sample_sd / sqrt(sample_size)``."""
        return sample_sd / math.sqrt(sample_size)

    def confidence_interval(self, confidence_level):
        """Return the two-sided confidence interval for the mean.

        Args:
            confidence_level: coverage as a fraction, e.g. ``0.95``.

        Returns:
            A ``(lower, upper)`` tuple centred on ``point_estimate``.
        """
        # Upper-tail quantile of a symmetric two-sided interval.
        q = 1 - (1 - confidence_level) / 2
        if self.distribution_family == DistributionFamily.normal:
            critical_value = norm.ppf(q)
        else:
            critical_value = t.ppf(q, self.df)
        # The margin is the critical value times the standard error only; the
        # point estimate must not be folded into it, otherwise the interval
        # is no longer centred on the estimate.
        margin = critical_value * self.standard_error
        return self.point_estimate - margin, self.point_estimate + margin
|
61
|
|
|
|
|
62
|
|
|
|
|
63
|
|
|
class ProportionSamplingDistribution(object):
    """Sampling distribution of a sample proportion.

    An instance is built either from ``sample_proportion`` and
    ``sample_size`` or from a ``sample_distribution`` object exposing
    ``proportion``, ``categorical_value`` and ``sample_size`` attributes.
    When both are supplied, the values read from ``sample_distribution``
    take precedence.

    If either ``n * p`` or ``n * (1 - p)`` is below 10 the distribution is
    approximated by random simulation; otherwise the normal approximation
    with standard error ``sqrt(p * (1 - p) / n)`` is used.
    """

    sample_distribution = None
    point_estimate = None
    distribution_family = None
    sample_size = None
    categorical_value = None
    standard_error = None
    simulated_proportions = None

    def __init__(self, sample_distribution=None, categorical_value=None, sample_proportion=None, sample_size=None):
        """Initialise the distribution and pick its approximation.

        Args:
            sample_distribution: optional object with ``proportion``,
                ``categorical_value`` and ``sample_size`` attributes;
                overrides the scalar arguments.
            categorical_value: stored as-is on the instance (presumably the
                category whose proportion is measured -- confirm with caller).
            sample_proportion: observed proportion, stored as
                ``point_estimate``.
            sample_size: number of observations in the sample.

        Raises:
            TypeError: if no proportion or no sample size is available.
        """
        if sample_proportion is not None:
            self.point_estimate = sample_proportion
        if sample_size is not None:
            self.sample_size = sample_size
        if categorical_value is not None:
            self.categorical_value = categorical_value

        if sample_distribution is not None:
            self.build(sample_distribution)

        if self.point_estimate is None or self.sample_size is None:
            raise TypeError(
                "ProportionSamplingDistribution requires sample_proportion and "
                "sample_size, either directly or through sample_distribution"
            )

        expected_successes = self.sample_size * self.point_estimate
        expected_failures = self.sample_size * (1 - self.point_estimate)
        if expected_successes < 10 or expected_failures < 10:
            # Too few expected successes or failures for the normal
            # approximation: fall back to simulation.
            self.distribution_family = DistributionFamily.simulation
            self.simulate()
        else:
            self.distribution_family = DistributionFamily.normal
            self.standard_error = math.sqrt(
                self.point_estimate * (1 - self.point_estimate) / self.sample_size
            )

    def build(self, sample_distribution):
        """Copy proportion, category and size from ``sample_distribution``."""
        self.sample_distribution = sample_distribution
        self.point_estimate = sample_distribution.proportion
        self.categorical_value = sample_distribution.categorical_value
        self.sample_size = sample_distribution.sample_size

    def simulate(self, iterations=1000):
        """Fill ``simulated_proportions`` with sorted simulated proportions.

        Each iteration draws ``sample_size`` Bernoulli trials with success
        probability ``point_estimate`` and records the share of successes.

        Args:
            iterations: number of simulated samples (default 1000).
        """
        probability = self.point_estimate
        trials = self.sample_size
        proportions = []
        for _ in range(iterations):
            # random.random() lies in [0, 1), so a strict ``<`` gives a
            # success probability of exactly ``probability`` and never counts
            # a success when it is 0.
            successes = sum(1 for _ in range(trials) if random.random() < probability)
            proportions.append(float(successes) / trials)
        proportions.sort()
        self.simulated_proportions = proportions

    def confidence_interval(self, confidence_level):
        """Return the two-sided confidence interval for the proportion.

        Args:
            confidence_level: coverage as a fraction, e.g. ``0.95``.

        Returns:
            A ``(lower, upper)`` tuple: ``point_estimate`` plus/minus the
            normal margin, or the matching percentiles of the simulated
            proportions when the simulation family is in use.
        """
        # Upper-tail quantile of a symmetric two-sided interval.
        q = 1 - (1 - confidence_level) / 2
        if self.distribution_family == DistributionFamily.normal:
            margin = norm.ppf(q) * self.standard_error
            return self.point_estimate - margin, self.point_estimate + margin

        draws = self.simulated_proportions
        count = len(draws)
        lower_index = int(count * (1 - confidence_level) / 2)
        # Clamp: at confidence_level == 1.0 the raw index equals ``count``,
        # which is one past the last element.
        upper_index = min(int(count * q), count - 1)
        return draws[lower_index], draws[upper_index]
|
118
|
|
|
|
|
119
|
|
|
|