Completed
Push — master ( 7b5ebf...f8d5ee )
by Xianshun
58s
created

Anova   A

Complexity

Total Complexity 6

Size/Duplication

Total Lines 65
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 65
rs 10
wmc 6

2 Methods

Rating   Name   Duplication   Size   Complexity  
A build() 0 21 3
A __init__() 0 17 3
1
import math
2
3
from pysie.dsl.set import TernarySearchSet, TernarySearchTrie
4
from pysie.stats.distributions import MeanSamplingDistribution
5
from pysie.stats.samples import SampleDistribution
6
7
from scipy.stats import f
8
9
10
class ContingencyTable(object):
    """Two-way table of values addressed by a (row name, column name) pair.

    Cell values live in a ``TernarySearchTrie`` keyed by the string built in
    ``make_key``; the row and column names seen so far are tracked in two
    ``TernarySearchSet`` instances.
    """

    values = None
    rows = None
    columns = None

    def __init__(self):
        """Create an empty table with no rows, columns or cells."""
        self.rows = TernarySearchSet()
        self.columns = TernarySearchSet()
        self.values = TernarySearchTrie()

    def set_cell(self, row_name, column_name, value):
        """Store ``value`` at (row_name, column_name) and register both names."""
        cell_key = self.make_key(row_name, column_name)
        self.values.put(cell_key, value)
        self.rows.add(row_name)
        self.columns.add(column_name)

    def get_cell(self, row_name, column_name):
        """Return the value at (row_name, column_name), or 0 if it was never set."""
        cell_key = self.make_key(row_name, column_name)
        if self.values.contains_key(cell_key):
            return self.values.get(cell_key)
        return 0

    def make_key(self, row_name, column_name):
        """Build the storage key ``'<row_name>-<column_name>'`` for a cell.

        NOTE: names that themselves contain '-' can map to the same key
        (e.g. ('a-b', 'c') and ('a', 'b-c')).
        """
        separator = '-'
        return row_name + separator + column_name

    def get_row_total(self, row_name):
        """Return the sum of the cells of ``row_name`` over every known column."""
        return sum(
            self.get_cell(row_name, column_name)
            for column_name in self.columns.to_array()
        )

    def get_column_total(self, column_name):
        """Return the sum of the cells of ``column_name`` over every known row."""
        return sum(
            self.get_cell(row_name, column_name)
            for row_name in self.rows.to_array()
        )

    def get_total(self):
        """Return the sum of every value stored in the table."""
        return sum(self.values.values())
55
56
57
class Anova(object):
    """One-way analysis of variance over a sample split into groups.

    The constructor splits ``sample`` by group id, builds a
    ``SampleDistribution`` and a ``MeanSamplingDistribution`` for each group
    and for the whole sample, then calls ``build()`` to fill in the sums of
    squares, degrees of freedom, mean squares, the F statistic and its
    p-value.
    """

    sample = None
    individual_samples = None
    individual_sample_distributions = None
    individual_sampling_distributions = None
    overall_sample_distribution = None
    overall_sampling_distribution = None

    sum_of_squares_total = None
    sum_of_squares_group = None
    sum_of_squares_error = None

    df_group = None
    df_error = None
    df_total = None

    mean_square_group = None
    mean_square_error = None

    F = None
    p_value = None

    significance_level = None
    reject_mean_same = None

    def __init__(self, sample, significance_level=None):
        """Run the analysis on ``sample``.

        :param sample: object providing ``split_by_group_id()`` and ``size()``.
        :param significance_level: optional threshold for the p-value; when
            given, ``reject_mean_same`` is set by ``build()``, otherwise it
            stays ``None``.
        """
        if significance_level is not None:
            self.significance_level = significance_level

        self.sample = sample
        self.individual_sampling_distributions = TernarySearchTrie()
        self.individual_sample_distributions = TernarySearchTrie()
        self.individual_samples = sample.split_by_group_id()
        for group_id in self.individual_samples.keys():
            sample_distribution = SampleDistribution(
                sample=self.individual_samples.get(group_id), group_id=group_id)
            sampling_distribution = MeanSamplingDistribution(
                sample_distribution=sample_distribution)
            self.individual_sample_distributions.put(group_id, sample_distribution)
            self.individual_sampling_distributions.put(group_id, sampling_distribution)

        self.overall_sample_distribution = SampleDistribution(sample=sample, group_id=None)
        self.overall_sampling_distribution = MeanSamplingDistribution(
            self.overall_sample_distribution)
        self.build()

    def build(self):
        """Compute the ANOVA quantities from the prepared distributions.

        Sets ``sum_of_squares_*``, ``df_*``, ``mean_square_*``, ``F`` and
        ``p_value``; when ``significance_level`` is set, also sets
        ``reject_mean_same`` to whether the p-value is below that level.

        NOTE: there is no guard against zero degrees of freedom (a single
        group, or no more observations than groups) or a zero error mean
        square; the divisions below fail or degenerate in those cases.
        """
        # Total variation is taken as-is from the overall distribution
        # (presumably squared deviations from the overall mean -- confirm
        # against SampleDistribution).
        self.sum_of_squares_total = self.overall_sample_distribution.sum_of_squares

        # Between-group variation: squared distance of each group mean from
        # the overall mean, weighted by the group's sample size.
        mean_overall = self.overall_sample_distribution.mean
        self.sum_of_squares_group = 0
        for sample_distribution_i in self.individual_sample_distributions.values():
            deviation = sample_distribution_i.mean - mean_overall
            self.sum_of_squares_group += (
                math.pow(deviation, 2.0) * sample_distribution_i.sample_size)

        # Within-group variation is whatever the groups do not explain.
        self.sum_of_squares_error = self.sum_of_squares_total - self.sum_of_squares_group

        self.df_total = self.sample.size() - 1
        self.df_group = self.individual_samples.size() - 1
        self.df_error = self.df_total - self.df_group

        self.mean_square_error = self.sum_of_squares_error / self.df_error
        self.mean_square_group = self.sum_of_squares_group / self.df_group

        self.F = self.mean_square_group / self.mean_square_error
        # Upper-tail probability of F; sf() avoids the cancellation that
        # 1 - cdf() suffers when the p-value is very small.
        self.p_value = f.sf(self.F, self.df_group, self.df_error)

        if self.significance_level is not None:
            # Reject "all group means are the same" only when the p-value is
            # below the significance level.
            self.reject_mean_same = self.p_value < self.significance_level
122
123
124
125
126
127
128