Completed
Branch master (78a222)
by Chris
14:36
created

abydos.compression._arithmetic.ac_decode()   A

Complexity

Conditions 5

Size

Total Lines 35
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 16
dl 0
loc 35
ccs 15
cts 15
cp 1
rs 9.1333
c 0
b 0
f 0
cc 5
nop 3
crap 5
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.compression._arithmetic.
20
21
arithmetic coding functions
22
"""
23
24 1
from __future__ import division, unicode_literals
25
26 1
from collections import Counter
27 1
from fractions import Fraction
28
29 1
from six import PY3, text_type
30
31
32
if PY3:
33
    long = int
0 ignored issues
show
Coding Style Naming introduced by
The name long does not conform to the class naming conventions ([A-Z_][a-zA-Z0-9]+$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
34
35 1
__all__ = ['ac_decode', 'ac_encode', 'ac_train']
36
37
38 1
def ac_train(text):
39
    r"""Generate a probability dict from the provided text.
40
41
    Text -> 0-order probability statistics as a dict
42
43
    This is based on Andrew Dalke's public domain implementation
44
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
45
46
    :param str text: The text data over which to calculate probability
47
        statistics. This must not contain the NUL (0x00) character because
48
        that's used to indicate the end of data.
49
    :returns: a probability dict
50
    :rtype: dict
51
52
    >>> ac_train('the quick brown fox jumped over the lazy dog')
53
    {' ': (Fraction(0, 1), Fraction(8, 45)),
54
     'o': (Fraction(8, 45), Fraction(4, 15)),
55
     'e': (Fraction(4, 15), Fraction(16, 45)),
56
     'u': (Fraction(16, 45), Fraction(2, 5)),
57
     't': (Fraction(2, 5), Fraction(4, 9)),
58
     'r': (Fraction(4, 9), Fraction(22, 45)),
59
     'h': (Fraction(22, 45), Fraction(8, 15)),
60
     'd': (Fraction(8, 15), Fraction(26, 45)),
61
     'z': (Fraction(26, 45), Fraction(3, 5)),
62
     'y': (Fraction(3, 5), Fraction(28, 45)),
63
     'x': (Fraction(28, 45), Fraction(29, 45)),
64
     'w': (Fraction(29, 45), Fraction(2, 3)),
65
     'v': (Fraction(2, 3), Fraction(31, 45)),
66
     'q': (Fraction(31, 45), Fraction(32, 45)),
67
     'p': (Fraction(32, 45), Fraction(11, 15)),
68
     'n': (Fraction(11, 15), Fraction(34, 45)),
69
     'm': (Fraction(34, 45), Fraction(7, 9)),
70
     'l': (Fraction(7, 9), Fraction(4, 5)),
71
     'k': (Fraction(4, 5), Fraction(37, 45)),
72
     'j': (Fraction(37, 45), Fraction(38, 45)),
73
     'i': (Fraction(38, 45), Fraction(13, 15)),
74
     'g': (Fraction(13, 15), Fraction(8, 9)),
75
     'f': (Fraction(8, 9), Fraction(41, 45)),
76
     'c': (Fraction(41, 45), Fraction(14, 15)),
77
     'b': (Fraction(14, 15), Fraction(43, 45)),
78
     'a': (Fraction(43, 45), Fraction(44, 45)),
79
     '\x00': (Fraction(44, 45), Fraction(1, 1))}
80
    """
81 1
    text = text_type(text)
82 1
    if '\x00' in text:
83 1
        text = text.replace('\x00', ' ')
84 1
    counts = Counter(text)
85 1
    counts['\x00'] = 1
86 1
    tot_letters = sum(counts.values())
87
88 1
    tot = 0
89 1
    prob_range = {}
90 1
    prev = Fraction(0)
91 1
    for char, count in sorted(
92
        counts.items(), key=lambda x: (x[1], x[0]), reverse=True
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
93
    ):
94 1
        follow = Fraction(tot + count, tot_letters)
95 1
        prob_range[char] = (prev, follow)
96 1
        prev = follow
97 1
        tot = tot + count
98
    # assert tot == tot_letters
99
100 1
    return prob_range
101
102
103 1
def ac_encode(text, probs):
104
    """Encode a text using arithmetic coding with the provided probabilities.
105
106
    Text and the 0-order probability statistics -> longval, nbits
107
108
    The encoded number is Fraction(longval, 2**nbits)
109
110
    This is based on Andrew Dalke's public domain implementation
111
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
112
113
    :param str text: A string to encode
114
    :param dict probs: A probability statistics dictionary generated by
115
        ac_train
116
    :returns: The arithmetically coded text
117
    :rtype: tuple
118
119
    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
120
    >>> ac_encode('align', pr)
121
    (16720586181, 34)
122
    """
123 1
    text = text_type(text)
124 1
    if '\x00' in text:
125 1
        text = text.replace('\x00', ' ')
126 1
    minval = Fraction(0)
127 1
    maxval = Fraction(1)
128 1
    for char in text + '\x00':
129 1
        prob_range = probs[char]
130 1
        delta = maxval - minval
131 1
        maxval = minval + prob_range[1] * delta
132 1
        minval = minval + prob_range[0] * delta
133
134
    # I tried without the /2 just to check.  Doesn't work.
135
    # Keep scaling up until the error range is >= 1.  That
136
    # gives me the minimum number of bits needed to resolve
137
    # down to the end-of-data character.
138 1
    delta = (maxval - minval) / 2
139 1
    nbits = long(0)
0 ignored issues
show
introduced by
The variable long does not seem to be defined in case PY3 on line 32 is False. Are you sure this can never be the case?
Loading history...
140 1
    while delta < 1:
141 1
        nbits += 1
142 1
        delta *= 2
143
    # The below condition shouldn't ever be false
144
    if nbits == 0:  # pragma: no cover
145
        return 0, 0
146
    # using -1 instead of /2
147 1
    avg = (maxval + minval) * 2 ** (nbits - 1)
148
    # Could return a rational instead ...
149
    # the division truncation is deliberate
150 1
    return avg.numerator // avg.denominator, nbits
151
152
153 1
def ac_decode(longval, nbits, probs):
154
    """Decode the number to a string using the given statistics.
155
156
    This is based on Andrew Dalke's public domain implementation
157
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
158
159
    :param int longval: The first part of an encoded tuple from ac_encode
160
    :param int nbits: The second part of an encoded tuple from ac_encode
161
    :param dict probs: A probability statistics dictionary generated by
162
        ac_train
163
    :returns: The arithmetically decoded text
164
    :rtype: str
165
166
    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
167
    >>> ac_decode(16720586181, 34, pr)
168
    'align'
169
    """
170 1
    val = Fraction(longval, long(1) << nbits)
0 ignored issues
show
introduced by
The variable long does not seem to be defined in case PY3 on line 32 is False. Are you sure this can never be the case?
Loading history...
171 1
    letters = []
172 1
    probs_items = [
173
        (char, minval, maxval) for (char, (minval, maxval)) in probs.items()
174
    ]
175
176 1
    char = '\x00'
177 1
    while True:
178 1
        for (char, minval, maxval) in probs_items:  # noqa: B007
179 1
            if minval <= val < maxval:
180 1
                break
181
182 1
        if char == '\x00':
183 1
            break
184 1
        letters.append(char)
185 1
        delta = maxval - minval
0 ignored issues
show
introduced by
The variable maxval does not seem to be defined for all execution paths.
Loading history...
introduced by
The variable minval does not seem to be defined for all execution paths.
Loading history...
186 1
        val = (val - minval) / delta
187 1
    return ''.join(letters)
188
189
190
if __name__ == '__main__':
191
    import doctest
192
193
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
194