abydos.compression._arithmetic.ac_decode() - Code Metrics - Inspection of "78a222a9f7d8976f6744d263e3d6d01a2a991c27" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (78a222)

by Chris

created 2018-10-26 11:30 UTC

abydos.compression._arithmetic.ac_decode() A

↳ Parent: abydos.compression._arithmetic

Complexity

Conditions

Size

Total Lines	35
Code Lines	16

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	15
CRAP Score	5

Importance

Changes

Metric	Value
eloc	16
dl	0
loc	35
ccs	15
cts	15
cp	1
rs	9.1333
c	0
b	0
f	0
cc	5
nop	3
crap	5

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.compression._arithmetic.

arithmetic coding functions
"""

from __future__ import division, unicode_literals

from collections import Counter
from fractions import Fraction

from six import PY3, text_type


if PY3:
    long = int


__all__ = ['ac_decode', 'ac_encode', 'ac_train']


def ac_train(text):
    r"""Generate a probability dict from the provided text.

    Text -> 0-order probability statistics as a dict

    This is based on Andrew Dalke's public domain implementation
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.

    :param str text: The text data over which to calculate probability
        statistics. This must not contain the NUL (0x00) character because
        that's used to indicate the end of data.
    :returns: a probability dict
    :rtype: dict

    >>> ac_train('the quick brown fox jumped over the lazy dog')
    {' ': (Fraction(0, 1), Fraction(8, 45)),
     'o': (Fraction(8, 45), Fraction(4, 15)),
     'e': (Fraction(4, 15), Fraction(16, 45)),
     'u': (Fraction(16, 45), Fraction(2, 5)),
     't': (Fraction(2, 5), Fraction(4, 9)),
     'r': (Fraction(4, 9), Fraction(22, 45)),
     'h': (Fraction(22, 45), Fraction(8, 15)),
     'd': (Fraction(8, 15), Fraction(26, 45)),
     'z': (Fraction(26, 45), Fraction(3, 5)),
     'y': (Fraction(3, 5), Fraction(28, 45)),
     'x': (Fraction(28, 45), Fraction(29, 45)),
     'w': (Fraction(29, 45), Fraction(2, 3)),
     'v': (Fraction(2, 3), Fraction(31, 45)),
     'q': (Fraction(31, 45), Fraction(32, 45)),
     'p': (Fraction(32, 45), Fraction(11, 15)),
     'n': (Fraction(11, 15), Fraction(34, 45)),
     'm': (Fraction(34, 45), Fraction(7, 9)),
     'l': (Fraction(7, 9), Fraction(4, 5)),
     'k': (Fraction(4, 5), Fraction(37, 45)),
     'j': (Fraction(37, 45), Fraction(38, 45)),
     'i': (Fraction(38, 45), Fraction(13, 15)),
     'g': (Fraction(13, 15), Fraction(8, 9)),
     'f': (Fraction(8, 9), Fraction(41, 45)),
     'c': (Fraction(41, 45), Fraction(14, 15)),
     'b': (Fraction(14, 15), Fraction(43, 45)),
     'a': (Fraction(43, 45), Fraction(44, 45)),
     '\x00': (Fraction(44, 45), Fraction(1, 1))}
    """
    text = text_type(text)
    if '\x00' in text:
        text = text.replace('\x00', ' ')
    counts = Counter(text)
    counts['\x00'] = 1
    tot_letters = sum(counts.values())

    tot = 0
    prob_range = {}
    prev = Fraction(0)
    for char, count in sorted(
        counts.items(), key=lambda x: (x[1], x[0]), reverse=True

    ):
        follow = Fraction(tot + count, tot_letters)
        prob_range[char] = (prev, follow)
        prev = follow
        tot = tot + count
    # assert tot == tot_letters

    return prob_range


def ac_encode(text, probs):
    """Encode a text using arithmetic coding with the provided probabilities.

    Text and the 0-order probability statistics -> longval, nbits

    The encoded number is Fraction(longval, 2**nbits)

    This is based on Andrew Dalke's public domain implementation
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.

    :param str text: A string to encode
    :param dict probs: A probability statistics dictionary generated by
        ac_train
    :returns: The arithmetically coded text
    :rtype: tuple

    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
    >>> ac_encode('align', pr)
    (16720586181, 34)
    """
    text = text_type(text)
    if '\x00' in text:
        text = text.replace('\x00', ' ')
    minval = Fraction(0)
    maxval = Fraction(1)
    for char in text + '\x00':
        prob_range = probs[char]
        delta = maxval - minval
        maxval = minval + prob_range[1] * delta
        minval = minval + prob_range[0] * delta

    # I tried without the /2 just to check.  Doesn't work.
    # Keep scaling up until the error range is >= 1.  That
    # gives me the minimum number of bits needed to resolve
    # down to the end-of-data character.
    delta = (maxval - minval) / 2
    nbits = long(0)

    while delta < 1:
        nbits += 1
        delta *= 2
    # The below condition shouldn't ever be false
    if nbits == 0:  # pragma: no cover
        return 0, 0
    # using -1 instead of /2
    avg = (maxval + minval) * 2 ** (nbits - 1)
    # Could return a rational instead ...
    # the division truncation is deliberate
    return avg.numerator // avg.denominator, nbits


def ac_decode(longval, nbits, probs):
    """Decode the number to a string using the given statistics.

    This is based on Andrew Dalke's public domain implementation
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.

    :param int longval: The first part of an encoded tuple from ac_encode
    :param int nbits: The second part of an encoded tuple from ac_encode
    :param dict probs: A probability statistics dictionary generated by
        ac_train
    :returns: The arithmetically decoded text
    :rtype: str

    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
    >>> ac_decode(16720586181, 34, pr)
    'align'
    """
    val = Fraction(longval, long(1) << nbits)

    letters = []
    probs_items = [
        (char, minval, maxval) for (char, (minval, maxval)) in probs.items()
    ]

    char = '\x00'
    while True:
        for (char, minval, maxval) in probs_items:  # noqa: B007
            if minval <= val < maxval:
                break

        if char == '\x00':
            break
        letters.append(char)
        delta = maxval - minval

        val = (val - minval) / delta
    return ''.join(letters)


if __name__ == '__main__':
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.compression._arithmetic.
20
21		arithmetic coding functions
22		"""
23
24	1	from __future__ import division, unicode_literals
25
26	1	from collections import Counter
27	1	from fractions import Fraction
28
29	1	from six import PY3, text_type
30
31
32		if PY3:
33		long = int
		0 ignored issues – show Coding Style Naming introduced 2018-10-23 05:52 UTC by Report Bug Copy Issue Report The name `long` does not conform to the class naming conventions (`[A-Z_][a-zA-Z0-9]+$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
34
35	1	__all__ = ['ac_decode', 'ac_encode', 'ac_train']
36
37
38	1	def ac_train(text):
39		r"""Generate a probability dict from the provided text.
40
41		Text -> 0-order probability statistics as a dict
42
43		This is based on Andrew Dalke's public domain implementation
44		:cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
45
46		:param str text: The text data over which to calculate probability
47		statistics. This must not contain the NUL (0x00) character because
48		that's used to indicate the end of data.
49		:returns: a probability dict
50		:rtype: dict
51
52		>>> ac_train('the quick brown fox jumped over the lazy dog')
53		{' ': (Fraction(0, 1), Fraction(8, 45)),
54		'o': (Fraction(8, 45), Fraction(4, 15)),
55		'e': (Fraction(4, 15), Fraction(16, 45)),
56		'u': (Fraction(16, 45), Fraction(2, 5)),
57		't': (Fraction(2, 5), Fraction(4, 9)),
58		'r': (Fraction(4, 9), Fraction(22, 45)),
59		'h': (Fraction(22, 45), Fraction(8, 15)),
60		'd': (Fraction(8, 15), Fraction(26, 45)),
61		'z': (Fraction(26, 45), Fraction(3, 5)),
62		'y': (Fraction(3, 5), Fraction(28, 45)),
63		'x': (Fraction(28, 45), Fraction(29, 45)),
64		'w': (Fraction(29, 45), Fraction(2, 3)),
65		'v': (Fraction(2, 3), Fraction(31, 45)),
66		'q': (Fraction(31, 45), Fraction(32, 45)),
67		'p': (Fraction(32, 45), Fraction(11, 15)),
68		'n': (Fraction(11, 15), Fraction(34, 45)),
69		'm': (Fraction(34, 45), Fraction(7, 9)),
70		'l': (Fraction(7, 9), Fraction(4, 5)),
71		'k': (Fraction(4, 5), Fraction(37, 45)),
72		'j': (Fraction(37, 45), Fraction(38, 45)),
73		'i': (Fraction(38, 45), Fraction(13, 15)),
74		'g': (Fraction(13, 15), Fraction(8, 9)),
75		'f': (Fraction(8, 9), Fraction(41, 45)),
76		'c': (Fraction(41, 45), Fraction(14, 15)),
77		'b': (Fraction(14, 15), Fraction(43, 45)),
78		'a': (Fraction(43, 45), Fraction(44, 45)),
79		'\x00': (Fraction(44, 45), Fraction(1, 1))}
80		"""
81	1	text = text_type(text)
82	1	if '\x00' in text:
83	1	text = text.replace('\x00', ' ')
84	1	counts = Counter(text)
85	1	counts['\x00'] = 1
86	1	tot_letters = sum(counts.values())
87
88	1	tot = 0
89	1	prob_range = {}
90	1	prev = Fraction(0)
91	1	for char, count in sorted(
92		counts.items(), key=lambda x: (x[1], x[0]), reverse=True
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
93		):
94	1	follow = Fraction(tot + count, tot_letters)
95	1	prob_range[char] = (prev, follow)
96	1	prev = follow
97	1	tot = tot + count
98		# assert tot == tot_letters
99
100	1	return prob_range
101
102
103	1	def ac_encode(text, probs):
104		"""Encode a text using arithmetic coding with the provided probabilities.
105
106		Text and the 0-order probability statistics -> longval, nbits
107
108		The encoded number is Fraction(longval, 2**nbits)
109
110		This is based on Andrew Dalke's public domain implementation
111		:cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
112
113		:param str text: A string to encode
114		:param dict probs: A probability statistics dictionary generated by
115		ac_train
116		:returns: The arithmetically coded text
117		:rtype: tuple
118
119		>>> pr = ac_train('the quick brown fox jumped over the lazy dog')
120		>>> ac_encode('align', pr)
121		(16720586181, 34)
122		"""
123	1	text = text_type(text)
124	1	if '\x00' in text:
125	1	text = text.replace('\x00', ' ')
126	1	minval = Fraction(0)
127	1	maxval = Fraction(1)
128	1	for char in text + '\x00':
129	1	prob_range = probs[char]
130	1	delta = maxval - minval
131	1	maxval = minval + prob_range[1] * delta
132	1	minval = minval + prob_range[0] * delta
133
134		# I tried without the /2 just to check. Doesn't work.
135		# Keep scaling up until the error range is >= 1. That
136		# gives me the minimum number of bits needed to resolve
137		# down to the end-of-data character.
138	1	delta = (maxval - minval) / 2
139	1	nbits = long(0)
		0 ignored issues – show introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `long` does not seem to be defined in case `PY3` on line `32` is `False`. Are you sure this can never be the case? Loading history...
140	1	while delta < 1:
141	1	nbits += 1
142	1	delta *= 2
143		# The below condition shouldn't ever be false
144		if nbits == 0: # pragma: no cover
145		return 0, 0
146		# using -1 instead of /2
147	1	avg = (maxval + minval) * 2 ** (nbits - 1)
148		# Could return a rational instead ...
149		# the division truncation is deliberate
150	1	return avg.numerator // avg.denominator, nbits
151
152
153	1	def ac_decode(longval, nbits, probs):
154		"""Decode the number to a string using the given statistics.
155
156		This is based on Andrew Dalke's public domain implementation
157		:cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
158
159		:param int longval: The first part of an encoded tuple from ac_encode
160		:param int nbits: The second part of an encoded tuple from ac_encode
161		:param dict probs: A probability statistics dictionary generated by
162		ac_train
163		:returns: The arithmetically decoded text
164		:rtype: str
165
166		>>> pr = ac_train('the quick brown fox jumped over the lazy dog')
167		>>> ac_decode(16720586181, 34, pr)
168		'align'
169		"""
170	1	val = Fraction(longval, long(1) << nbits)
		0 ignored issues – show introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `long` does not seem to be defined in case `PY3` on line `32` is `False`. Are you sure this can never be the case? Loading history...
171	1	letters = []
172	1	probs_items = [
173		(char, minval, maxval) for (char, (minval, maxval)) in probs.items()
174		]
175
176	1	char = '\x00'
177	1	while True:
178	1	for (char, minval, maxval) in probs_items: # noqa: B007
179	1	if minval <= val < maxval:
180	1	break
181
182	1	if char == '\x00':
183	1	break
184	1	letters.append(char)
185	1	delta = maxval - minval
		0 ignored issues – show introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `maxval` does not seem to be defined for all execution paths. Loading history... introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `minval` does not seem to be defined for all execution paths. Loading history...
186	1	val = (val - minval) / delta
187	1	return ''.join(letters)
188
189
190		if __name__ == '__main__':
191		import doctest
192
193		doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
194

chrislit / abydos

Branch — master (78a222)

abydos.compression._arithmetic.ac_decode() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like