1
|
|
|
import math |
2
|
|
|
import random |
3
|
|
|
|
4
|
|
|
from enum import Enum |
5
|
|
|
|
6
|
|
|
from scipy.stats import norm, t |
7
|
|
|
|
8
|
|
|
|
9
|
|
|
class DistributionFamily(Enum):
    """Identifies which reference distribution a sampling distribution uses.

    The sampling-distribution classes in this module store one of these
    members in their ``distribution_family`` attribute and branch on it when
    building a confidence interval.
    """

    # Selected by the sampling-distribution classes in this module.
    normal = 1
    student_t = 2

    # Declared here; not referenced by the classes visible in this module.
    fisher = 3
    chi_square = 4

    # Selected when an interval is read from simulated draws.
    simulation = 5
15
|
|
|
|
16
|
|
|
|
17
|
|
|
class MeanSamplingDistribution(object):
    """Sampling distribution of a sample mean.

    The distribution is described by a point estimate (the sample mean), a
    standard error (``sample_sd / sqrt(sample_size)``) and a distribution
    family: Student's t with ``sample_size - 1`` degrees of freedom when the
    sample has fewer than 30 observations, the normal distribution otherwise.
    """

    sample_distribution = None
    point_estimate = None
    distribution_family = None
    df = None

    def __init__(self, sample_distribution=None, sample_mean=None, sample_sd=None, sample_size=None):
        """Build the sampling distribution from summary values or a sample.

        :param sample_distribution: optional object exposing ``mean``, ``sd``
            and ``sample_size`` attributes; when given, its values take
            precedence over the individual summary arguments.
        :param sample_mean: sample mean, used as the point estimate.
        :param sample_sd: sample standard deviation.
        :param sample_size: number of observations in the sample.

        The standard deviation and the sample size must be available from
        one of the two sources, because the standard error is computed from
        them here.
        """
        if sample_mean is not None:
            self.point_estimate = sample_mean

        if sample_sd is not None:
            self.sample_sd = sample_sd

        if sample_size is not None:
            self.sample_size = sample_size

        # A full sample description overrides any individually passed values.
        if sample_distribution is not None:
            self.sample_distribution = sample_distribution
            self.point_estimate = sample_distribution.mean
            self.sample_sd = sample_distribution.sd
            self.sample_size = sample_distribution.sample_size

        self.standard_error = MeanSamplingDistribution.calculate_standard_error(self.sample_sd, self.sample_size)

        self.df = self.sample_size - 1.0
        # Samples with fewer than 30 observations use Student's t.
        if self.sample_size < 30:
            self.distribution_family = DistributionFamily.student_t
        else:
            self.distribution_family = DistributionFamily.normal

    @staticmethod
    def calculate_standard_error(sample_sd, sample_size):
        """Return the standard error of the mean, ``sample_sd / sqrt(sample_size)``."""
        return sample_sd / math.sqrt(sample_size)

    def confidence_interval(self, confidence_level):
        """Return the two-sided confidence interval for the mean.

        :param confidence_level: coverage as a fraction, e.g. ``0.95``.
        :returns: ``(lower, upper)`` tuple centred on the point estimate.
        """
        # Upper-tail quantile of a two-sided interval, e.g. 0.975 for 95 %.
        q = 1 - (1 - confidence_level) / 2
        if self.distribution_family == DistributionFamily.normal:
            critical_value = norm.ppf(q)
        else:
            critical_value = t.ppf(q, self.df)

        # The margin is critical value times standard error only; the point
        # estimate must not be folded into it, otherwise the interval is no
        # longer centred on the estimate.
        margin = critical_value * self.standard_error
        return self.point_estimate - margin, self.point_estimate + margin
61
|
|
|
|
62
|
|
|
|
63
|
|
|
class ProportionSamplingDistribution(object):
    """Sampling distribution of a sample proportion.

    When both ``n * p`` and ``n * (1 - p)`` are at least 10 the distribution
    is treated as normal with standard error ``sqrt(p * (1 - p) / n)``.
    Otherwise it is approximated by simulation: ``NUM_SIMULATIONS`` samples
    of size ``n`` are drawn with success probability ``p`` and the sorted
    simulated proportions are kept for reading off percentile intervals.
    """

    # Number of simulated samples drawn by ``simulate``.
    NUM_SIMULATIONS = 1000

    sample_distribution = None
    point_estimate = None
    distribution_family = None
    sample_size = None
    categorical_value = None
    standard_error = None
    simulated_proportions = None

    def __init__(self, sample_distribution=None, categorical_value=None, sample_proportion=None, sample_size=None):
        """Build the sampling distribution from summary values or a sample.

        :param sample_distribution: optional object exposing ``proportion``,
            ``categorical_value`` and ``sample_size`` attributes; when given,
            its values take precedence over the individual arguments.
        :param categorical_value: the category whose proportion is estimated.
        :param sample_proportion: observed proportion, used as point estimate.
        :param sample_size: number of observations in the sample.
        """
        if sample_proportion is not None:
            self.point_estimate = sample_proportion

        if sample_size is not None:
            self.sample_size = sample_size

        if categorical_value is not None:
            self.categorical_value = categorical_value

        if sample_distribution is not None:
            self.build(sample_distribution)

        # Fewer than 10 expected successes or failures: fall back to
        # simulation instead of the normal approximation.
        expected_successes = self.sample_size * self.point_estimate
        expected_failures = self.sample_size * (1 - self.point_estimate)
        if expected_successes < 10 or expected_failures < 10:
            self.distribution_family = DistributionFamily.simulation
            self.simulate()
        else:
            self.distribution_family = DistributionFamily.normal
            self.standard_error = math.sqrt(self.point_estimate * (1 - self.point_estimate) / self.sample_size)

    def build(self, sample_distribution):
        """Copy proportion, category and sample size from ``sample_distribution``."""
        self.sample_distribution = sample_distribution
        self.point_estimate = sample_distribution.proportion
        self.categorical_value = sample_distribution.categorical_value
        self.sample_size = sample_distribution.sample_size

    def simulate(self):
        """Populate ``simulated_proportions`` with sorted simulated proportions.

        Each of the ``NUM_SIMULATIONS`` replicates draws ``sample_size``
        trials with success probability ``point_estimate`` and records the
        fraction of successes.
        """
        success_probability = self.point_estimate
        trials_per_sample = self.sample_size
        proportions = []
        for _replicate in range(self.NUM_SIMULATIONS):
            # random.random() lies in [0.0, 1.0), so a strict comparison
            # yields a success with probability exactly success_probability
            # (and never when it is 0).
            successes = sum(
                1 for _trial in range(trials_per_sample)
                if random.random() < success_probability
            )
            proportions.append(float(successes) / trials_per_sample)
        proportions.sort()
        self.simulated_proportions = proportions

    def confidence_interval(self, confidence_level):
        """Return the two-sided confidence interval for the proportion.

        :param confidence_level: coverage as a fraction, e.g. ``0.95``.
        :returns: ``(lower, upper)`` tuple; normal-theory bounds for the
            normal family, percentiles of the simulated proportions otherwise.
        """
        # Upper-tail quantile of a two-sided interval, e.g. 0.975 for 95 %.
        q = 1 - (1 - confidence_level) / 2
        if self.distribution_family == DistributionFamily.normal:
            margin = norm.ppf(q) * self.standard_error
            return self.point_estimate - margin, self.point_estimate + margin

        draws = self.simulated_proportions
        draw_count = len(draws)
        lower_index = int(draw_count * (1 - confidence_level) / 2)
        # Clamp so a confidence level of 1.0 selects the largest draw rather
        # than indexing one past the end of the list.
        upper_index = min(int(draw_count * q), draw_count - 1)
        return draws[lower_index], draws[upper_index]
118
|
|
|
|
119
|
|
|
|