for_each_model()   Rating: F

Complexity
    Conditions: 10

Size
    Total Lines: 25

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric   Value
cc       10
dl       0
loc      25
rs       3.1304
c        0
b        0
f        0

3 Methods

Rating   Name                Duplication   Size   Complexity
A        test_all_models()   0             5      4
A        test_one_model()    0             10     4
C        filtered()          0             20     9

How to fix: Complexity

Complex functions like for_each_model() often do a lot of different things. To break such a function down, we need to identify a cohesive component within it. A common way to find such a component is to look for local variables and nested functions that share the same prefixes or suffixes.

Once you have determined which pieces belong together, you can apply the Extract Method refactoring and pull each piece into a named helper. If the extracted component carries its own state, Extract Class is also a candidate.
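As a concrete illustration, here is a minimal sketch of such an extraction, not the project's actual refactoring: it assumes the module-level names MODELS, SKIP_EXPENSIVE_TESTS, SAMPLE_COUNT, iter_examples, and seed_all from the listing below, keeps the listing's Python 2 style, and introduces two hypothetical helpers, _sample_count_for and _matching_models, so that filtered() itself carries fewer conditions.

import functools


def _sample_count_for(name):
    # Hypothetical helper: isolates the "expensive dbg models get fewer
    # samples" branch so the decorator body stays linear.
    if SKIP_EXPENSIVE_TESTS and name.startswith('dbg'):
        return SAMPLE_COUNT / 10
    return SAMPLE_COUNT


def _matching_models(filters):
    # Hypothetical helper: yields the (name, Model) pairs accepted by
    # every filter.
    for name, Model in sorted(MODELS.iteritems()):
        if all(f(Model) for f in filters):
            yield name, Model


def for_each_model(*filters):
    '''
    Run one test per Model, filtering out inappropriate Models for test.
    '''
    def filtered(test_fun):

        @functools.wraps(test_fun)
        def test_one_model(name):
            Model = MODELS[name]
            for EXAMPLE in iter_examples(Model):
                seed_all(0)
                test_fun(Model, EXAMPLE, _sample_count_for(name))

        @functools.wraps(test_fun)
        def test_all_models():
            for name, _ in _matching_models(filters):
                yield test_one_model, name

        return test_all_models
    return filtered

Each extracted helper then contributes its conditions to its own complexity count rather than to filtered()'s.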

# Copyright (c) 2014, Salesforce.com, Inc.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# - Neither the name of Salesforce.com nor the names of its contributors
#   may be used to endorse or promote products derived from this
#   software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import math
import functools
from collections import defaultdict
import numpy
import numpy.random
from nose import SkipTest
from nose.tools import (
    assert_true,
    assert_equal,
    assert_less,
    assert_greater,
    assert_is_instance,
)
from distributions.dbg.random import sample_discrete
from goftests import discrete_goodness_of_fit
from distributions.tests.util import (
    require_cython,
    seed_all,
    assert_hasattr,
    assert_close,
)
from distributions.dbg.random import scores_to_probs
import distributions.dbg.clustering
require_cython()
import distributions.lp.clustering
from distributions.lp.clustering import count_assignments
from distributions.lp.mixture import MixtureIdTracker

MODELS = {
    'dbg.LowEntropy': distributions.dbg.clustering.LowEntropy,
    'lp.PitmanYor': distributions.lp.clustering.PitmanYor,
    'lp.LowEntropy': distributions.lp.clustering.LowEntropy,
}

SKIP_EXPENSIVE_TESTS = False
SAMPLE_COUNT = 2000
MIN_GOODNESS_OF_FIT = 1e-3


def iter_examples(Model):
    assert_hasattr(Model, 'EXAMPLES')
    EXAMPLES = Model.EXAMPLES
    assert_is_instance(EXAMPLES, list)
    assert_true(EXAMPLES, 'no examples provided')
    for i, EXAMPLE in enumerate(EXAMPLES):
        print 'example {}/{}'.format(1 + i, len(Model.EXAMPLES))
        yield EXAMPLE


def for_each_model(*filters):
    '''
    Run one test per Model, filtering out inappropriate Models for test.
    '''
    def filtered(test_fun):

        @functools.wraps(test_fun)
        def test_one_model(name):
            Model = MODELS[name]
            for EXAMPLE in iter_examples(Model):
                seed_all(0)
                if SKIP_EXPENSIVE_TESTS and name.startswith('dbg'):
                    sample_count = SAMPLE_COUNT / 10
                else:
                    sample_count = SAMPLE_COUNT
                test_fun(Model, EXAMPLE, sample_count)

        @functools.wraps(test_fun)
        def test_all_models():
            for name, Model in sorted(MODELS.iteritems()):
                if all(f(Model) for f in filters):
                    yield test_one_model, name

        return test_all_models
    return filtered


def canonicalize(assignments):
    groups = defaultdict(lambda: [])
    for value, group in enumerate(assignments):
        groups[group].append(value)
    result = []
    for group in groups.itervalues():
        group.sort()
        result.append(tuple(group))
    result.sort()
    return tuple(result)


@for_each_model()
def test_load_and_dump(Model, EXAMPLE, *unused):
    model = Model()
    model.load(EXAMPLE)
    expected = EXAMPLE
    actual = model.dump()
    assert_close(expected, actual)


def iter_valid_sizes(example, max_size, min_size=2):
    max_size = 5
    dataset_size = example.get('dataset_size', float('inf'))
    sizes = [
        size
        for size in xrange(min_size, max_size + 1)
        if size <= dataset_size
    ]
    assert sizes, 'no valid sizes to test'
    for size in sizes:
        print 'sample_size = {}'.format(size)
        yield size


@for_each_model()
def test_sample_matches_score_counts(Model, EXAMPLE, sample_count):
    for size in iter_valid_sizes(EXAMPLE, max_size=10):
        model = Model()
        model.load(EXAMPLE)

        samples = []
        probs_dict = {}
        for _ in xrange(sample_count):
            value = model.sample_assignments(size)
            sample = canonicalize(value)
            samples.append(sample)
            if sample not in probs_dict:
                assignments = dict(enumerate(value))
                counts = count_assignments(assignments)
                prob = math.exp(model.score_counts(counts))
                probs_dict[sample] = prob

        # renormalize here; test normalization separately
        total = sum(probs_dict.values())
        for key in probs_dict:
            probs_dict[key] /= total

        gof = discrete_goodness_of_fit(samples, probs_dict, plot=True)
        print '{} gof = {:0.3g}'.format(Model.__name__, gof)
        assert_greater(gof, MIN_GOODNESS_OF_FIT)


@for_each_model()
def test_score_counts_is_normalized(Model, EXAMPLE, sample_count):

    for sample_size in iter_valid_sizes(EXAMPLE, max_size=10):
        model = Model()
        model.load(EXAMPLE)

        if Model.__name__ == 'LowEntropy' and sample_size < model.dataset_size:
            print 'WARNING LowEntropy.score_counts normalization is imprecise'
            print '  when sample_size < dataset_size'
            tol = 0.5
        else:
            tol = 0.01

        probs_dict = {}
        for _ in xrange(sample_count):
            value = model.sample_assignments(sample_size)
            sample = canonicalize(value)
            if sample not in probs_dict:
                assignments = dict(enumerate(value))
                counts = count_assignments(assignments)
                prob = math.exp(model.score_counts(counts))
                probs_dict[sample] = prob

        total = sum(probs_dict.values())
        assert_less(abs(total - 1), tol, 'not normalized: {}'.format(total))


def add_to_counts(counts, pos):
    counts = counts[:]
    counts[pos] += 1
    return counts


@for_each_model()
def test_score_add_value_matches_score_counts(Model, EXAMPLE, sample_count):
    for sample_size in iter_valid_sizes(EXAMPLE, min_size=2, max_size=10):
        model = Model()
        model.load(EXAMPLE)

        samples = set(
            canonicalize(model.sample_assignments(sample_size - 1))
            for _ in xrange(sample_count)
        )

        for sample in samples:
            nonempty_group_count = len(sample)
            counts = map(len, sample)
            actual = numpy.zeros(len(counts) + 1)
            expected = numpy.zeros(len(counts) + 1)

            # add to existing group
            for i, group in enumerate(sample):
                group_size = len(sample[i])
                expected[i] = model.score_counts(add_to_counts(counts, i))
                actual[i] = model.score_add_value(
                    group_size,
                    nonempty_group_count,
                    sample_size - 1)

            # add to new group
            i = len(counts)
            group_size = 0
            expected[i] = model.score_counts(counts + [1])
            actual[i] = model.score_add_value(
                group_size,
                nonempty_group_count,
                sample_size - 1)

            actual = scores_to_probs(actual)
            expected = scores_to_probs(expected)
            print actual, expected
            assert_close(actual, expected, tol=0.05)


@for_each_model(lambda Model: hasattr(Model, 'Mixture'))
def test_mixture_score_matches_score_add_value(Model, EXAMPLE, *unused):
    sample_count = 200
    model = Model()
    model.load(EXAMPLE)

    if Model.__name__ == 'LowEntropy' and sample_count > model.dataset_size:
        raise SkipTest('skipping trivial example')

    assignment_vector = model.sample_assignments(sample_count)
    assignments = dict(enumerate(assignment_vector))
    nonempty_counts = count_assignments(assignments)
    nonempty_group_count = len(nonempty_counts)
    assert_greater(nonempty_group_count, 1, "test is inaccurate")

    def check_counts(mixture, counts, empty_group_count):
        # print 'counts =', counts
        empty_groupids = frozenset(mixture.empty_groupids)
        assert_equal(len(empty_groupids), empty_group_count)
        for groupid in empty_groupids:
            assert_equal(counts[groupid], 0)

    def check_scores(mixture, counts, empty_group_count):
        sample_count = sum(counts)
        nonempty_group_count = len(counts) - empty_group_count
        expected = [
            model.score_add_value(
                group_size,
                nonempty_group_count,
                sample_count,
                empty_group_count)
            for group_size in counts
        ]
        noise = numpy.random.randn(len(counts))
        actual = numpy.zeros(len(counts), dtype=numpy.float32)
        actual[:] = noise
        mixture.score_value(model, actual)
        assert_close(actual, expected)
        return actual

    for empty_group_count in [1, 10]:
        print 'empty_group_count =', empty_group_count
        counts = nonempty_counts + [0] * empty_group_count
        numpy.random.shuffle(counts)
        mixture = Model.Mixture()
        id_tracker = MixtureIdTracker()

        print 'init'
        mixture.init(model, counts)
        id_tracker.init(len(counts))
        check_counts(mixture, counts, empty_group_count)
        check_scores(mixture, counts, empty_group_count)

        print 'adding'
        groupids = []
        for _ in xrange(sample_count):
            check_counts(mixture, counts, empty_group_count)
            scores = check_scores(mixture, counts, empty_group_count)
            probs = scores_to_probs(scores)
            groupid = sample_discrete(probs)
            expected_group_added = (counts[groupid] == 0)
            counts[groupid] += 1
            actual_group_added = mixture.add_value(model, groupid)
            assert_equal(actual_group_added, expected_group_added)
            groupids.append(groupid)
            if actual_group_added:
                id_tracker.add_group()
                counts.append(0)

        check_counts(mixture, counts, empty_group_count)
        check_scores(mixture, counts, empty_group_count)

        print 'removing'
        for global_groupid in groupids:
            groupid = id_tracker.global_to_packed(global_groupid)
            counts[groupid] -= 1
            expected_group_removed = (counts[groupid] == 0)
            actual_group_removed = mixture.remove_value(model, groupid)
            assert_equal(actual_group_removed, expected_group_removed)
            if expected_group_removed:
                id_tracker.remove_group(groupid)
                back = counts.pop()
                if groupid < len(counts):
                    counts[groupid] = back
            check_counts(mixture, counts, empty_group_count)
            check_scores(mixture, counts, empty_group_count)