# Copyright (c) 2014, Salesforce.com, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# - Neither the name of Salesforce.com nor the names of its contributors
#   may be used to endorse or promote products derived from this
#   software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import math
import functools
from collections import defaultdict
import numpy
import numpy.random
from nose import SkipTest
from nose.tools import (
    assert_true,
    assert_equal,
    assert_less,
    assert_greater,
    assert_is_instance,
)
from goftests import discrete_goodness_of_fit
from distributions.dbg.random import sample_discrete, scores_to_probs
from distributions.tests.util import (
    require_cython,
    seed_all,
    assert_hasattr,
    assert_close,
)
import distributions.dbg.clustering
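# the lp backends live in compiled cython extensions; require_cython() is
# expected to skip the rest of this module (rather than fail) when the
# extensions are unavailable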
require_cython()
import distributions.lp.clustering
from distributions.lp.clustering import count_assignments
from distributions.lp.mixture import MixtureIdTracker

MODELS = {
    'dbg.LowEntropy': distributions.dbg.clustering.LowEntropy,
    'lp.PitmanYor': distributions.lp.clustering.PitmanYor,
    'lp.LowEntropy': distributions.lp.clustering.LowEntropy,
}

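# tuning knobs: SAMPLE_COUNT is the number of assignment vectors drawn per
# statistical test below, and MIN_GOODNESS_OF_FIT is the threshold that the
# goodness-of-fit statistic must exceed in test_sample_matches_score_counts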
SKIP_EXPENSIVE_TESTS = False
SAMPLE_COUNT = 2000
MIN_GOODNESS_OF_FIT = 1e-3


def iter_examples(Model):
    assert_hasattr(Model, 'EXAMPLES')
    EXAMPLES = Model.EXAMPLES
    assert_is_instance(EXAMPLES, list)
    assert_true(EXAMPLES, 'no examples provided')
    for i, EXAMPLE in enumerate(EXAMPLES):
        print 'example {}/{}'.format(1 + i, len(EXAMPLES))
        yield EXAMPLE


def for_each_model(*filters):
    '''
    Run one test per Model, filtering out Models inappropriate for the test.
    '''
    def filtered(test_fun):

        @functools.wraps(test_fun)
        def test_one_model(name):
            Model = MODELS[name]
            for EXAMPLE in iter_examples(Model):
                seed_all(0)
                if SKIP_EXPENSIVE_TESTS and name.startswith('dbg'):
                    sample_count = SAMPLE_COUNT / 10
                else:
                    sample_count = SAMPLE_COUNT
                test_fun(Model, EXAMPLE, sample_count)

        @functools.wraps(test_fun)
        def test_all_models():
            for name, Model in sorted(MODELS.iteritems()):
                if all(f(Model) for f in filters):
                    yield test_one_model, name

        return test_all_models
    return filtered

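# A sketch of how the decorator above is consumed, assuming nose's
# generator-test protocol: a decorated test such as
#
#   @for_each_model(lambda Model: hasattr(Model, 'Mixture'))
#   def test_foo(Model, EXAMPLE, sample_count):
#       ...
#
# is collected as a generator yielding one (test_one_model, name) pair per
# entry of MODELS that passes every filter, so each model runs as its own
# test case with a fresh seed and example.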

def canonicalize(assignments):
    groups = defaultdict(list)
    for value, group in enumerate(assignments):
        groups[group].append(value)
    result = []
    for group in groups.itervalues():
        group.sort()
        result.append(tuple(group))
    result.sort()
    return tuple(result)

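# A worked example, derived from the definition above:
#
#   canonicalize([0, 0, 1, 0, 2]) == ((0, 1, 3), (2,), (4,))
#
# assignments reduce to a sorted tuple of sorted groups, so two assignment
# vectors compare equal iff they induce the same partition of the values.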

@for_each_model()
def test_load_and_dump(Model, EXAMPLE, *unused):
    model = Model()
    model.load(EXAMPLE)
    expected = EXAMPLE
    actual = model.dump()
    assert_close(expected, actual)


def iter_valid_sizes(example, max_size, min_size=2):
    # cap the size so that the number of distinct partitions stays small
    # enough to cover by sampling
    max_size = min(max_size, 5)
    dataset_size = example.get('dataset_size', float('inf'))
    sizes = [
        size
        for size in xrange(min_size, max_size + 1)
        if size <= dataset_size
    ]
    assert sizes, 'no valid sizes to test'
    for size in sizes:
        print 'sample_size = {}'.format(size)
        yield size


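# The test below draws sample_count partitions from sample_assignments() and
# checks, via a goodness-of-fit statistic on the empirical frequencies, that
# sampling agrees with the probabilities implied by score_counts.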
@for_each_model()
def test_sample_matches_score_counts(Model, EXAMPLE, sample_count):
    for size in iter_valid_sizes(EXAMPLE, max_size=10):
        model = Model()
        model.load(EXAMPLE)

        samples = []
        probs_dict = {}
        for _ in xrange(sample_count):
            value = model.sample_assignments(size)
            sample = canonicalize(value)
            samples.append(sample)
            if sample not in probs_dict:
                assignments = dict(enumerate(value))
                counts = count_assignments(assignments)
                prob = math.exp(model.score_counts(counts))
                probs_dict[sample] = prob

        # renormalize here; test normalization separately
        total = sum(probs_dict.values())
        for key in probs_dict:
            probs_dict[key] /= total

        gof = discrete_goodness_of_fit(samples, probs_dict, plot=True)
        print '{} gof = {:0.3g}'.format(Model.__name__, gof)
        assert_greater(gof, MIN_GOODNESS_OF_FIT)


@for_each_model()
def test_score_counts_is_normalized(Model, EXAMPLE, sample_count):
    for sample_size in iter_valid_sizes(EXAMPLE, max_size=10):
        model = Model()
        model.load(EXAMPLE)

        if Model.__name__ == 'LowEntropy' and sample_size < model.dataset_size:
            print 'WARNING LowEntropy.score_counts normalization is imprecise'
            print '  when sample_size < dataset_size'
            tol = 0.5
        else:
            tol = 0.01

        probs_dict = {}
        for _ in xrange(sample_count):
            value = model.sample_assignments(sample_size)
            sample = canonicalize(value)
            if sample not in probs_dict:
                assignments = dict(enumerate(value))
                counts = count_assignments(assignments)
                prob = math.exp(model.score_counts(counts))
                probs_dict[sample] = prob

        total = sum(probs_dict.values())
        assert_less(abs(total - 1), tol, 'not normalized: {}'.format(total))


def add_to_counts(counts, pos):
    counts = counts[:]
    counts[pos] += 1
    return counts

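# For example: add_to_counts([3, 1], 0) == [4, 1]; the [:] copy above keeps
# the caller's list unmodified.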
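# The test below encodes the consistency identity between the two scoring
# APIs: given a partial assignment with group sizes `counts`, the probability
# of seating the next point in group i should satisfy
#
#   P(add to group i)  ~  exp(score_counts(counts + e_i))
#
# up to normalization, where e_i increments the i-th count (a brand-new group
# is the extra final slot). Both sides are mapped through scores_to_probs
# before comparison, so only the ratios matter.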
@for_each_model()
def test_score_add_value_matches_score_counts(Model, EXAMPLE, sample_count):
    for sample_size in iter_valid_sizes(EXAMPLE, min_size=2, max_size=10):
        model = Model()
        model.load(EXAMPLE)

        samples = set(
            canonicalize(model.sample_assignments(sample_size - 1))
            for _ in xrange(sample_count)
        )

        for sample in samples:
            nonempty_group_count = len(sample)
            counts = map(len, sample)
            actual = numpy.zeros(len(counts) + 1)
            expected = numpy.zeros(len(counts) + 1)

            # add to existing group
            for i, group in enumerate(sample):
                group_size = len(group)
                expected[i] = model.score_counts(add_to_counts(counts, i))
                actual[i] = model.score_add_value(
                    group_size,
                    nonempty_group_count,
                    sample_size - 1)

            # add to new group
            i = len(counts)
            group_size = 0
            expected[i] = model.score_counts(counts + [1])
            actual[i] = model.score_add_value(
                group_size,
                nonempty_group_count,
                sample_size - 1)

            actual = scores_to_probs(actual)
            expected = scores_to_probs(expected)
            print actual, expected
            assert_close(actual, expected, tol=0.05)


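# The test below exercises the incremental Mixture interface: it initializes
# a mixture from known counts, repeatedly samples a group via score_value +
# sample_discrete and adds the value, then removes every added value, checking
# at each step that the mixture's empty-group bookkeeping and scores agree
# with the batch score_add_value computation.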
@for_each_model(lambda Model: hasattr(Model, 'Mixture'))
def test_mixture_score_matches_score_add_value(Model, EXAMPLE, *unused):
    sample_count = 200
    model = Model()
    model.load(EXAMPLE)

    if Model.__name__ == 'LowEntropy' and sample_count > model.dataset_size:
        raise SkipTest('skipping trivial example')

    assignment_vector = model.sample_assignments(sample_count)
    assignments = dict(enumerate(assignment_vector))
    nonempty_counts = count_assignments(assignments)
    nonempty_group_count = len(nonempty_counts)
    assert_greater(nonempty_group_count, 1, 'test is inaccurate')

    def check_counts(mixture, counts, empty_group_count):
        empty_groupids = frozenset(mixture.empty_groupids)
        assert_equal(len(empty_groupids), empty_group_count)
        for groupid in empty_groupids:
            assert_equal(counts[groupid], 0)

    def check_scores(mixture, counts, empty_group_count):
        sample_count = sum(counts)
        nonempty_group_count = len(counts) - empty_group_count
        expected = [
            model.score_add_value(
                group_size,
                nonempty_group_count,
                sample_count,
                empty_group_count)
            for group_size in counts
        ]
        # prefill with noise to verify that score_value overwrites every entry
        noise = numpy.random.randn(len(counts))
        actual = numpy.zeros(len(counts), dtype=numpy.float32)
        actual[:] = noise
        mixture.score_value(model, actual)
        assert_close(actual, expected)
        return actual

    for empty_group_count in [1, 10]:
        print 'empty_group_count =', empty_group_count
        counts = nonempty_counts + [0] * empty_group_count
        numpy.random.shuffle(counts)
        mixture = Model.Mixture()
        id_tracker = MixtureIdTracker()

        print 'init'
        mixture.init(model, counts)
        id_tracker.init(len(counts))
        check_counts(mixture, counts, empty_group_count)
        check_scores(mixture, counts, empty_group_count)

        print 'adding'
        groupids = []
        for _ in xrange(sample_count):
            check_counts(mixture, counts, empty_group_count)
            scores = check_scores(mixture, counts, empty_group_count)
            probs = scores_to_probs(scores)
            groupid = sample_discrete(probs)
            expected_group_added = (counts[groupid] == 0)
            counts[groupid] += 1
            actual_group_added = mixture.add_value(model, groupid)
            assert_equal(actual_group_added, expected_group_added)
            groupids.append(groupid)
            if actual_group_added:
                id_tracker.add_group()
                counts.append(0)

        check_counts(mixture, counts, empty_group_count)
        check_scores(mixture, counts, empty_group_count)

        print 'removing'
        for global_groupid in groupids:
            groupid = id_tracker.global_to_packed(global_groupid)
            counts[groupid] -= 1
            expected_group_removed = (counts[groupid] == 0)
            actual_group_removed = mixture.remove_value(model, groupid)
            assert_equal(actual_group_removed, expected_group_removed)
            if expected_group_removed:
                id_tracker.remove_group(groupid)
                # mirror the mixture's swap-with-last removal: the final
                # group's count moves into the freed slot
                back = counts.pop()
                if groupid < len(counts):
                    counts[groupid] = back
            check_counts(mixture, counts, empty_group_count)
            check_scores(mixture, counts, empty_group_count)