Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.compression._arithmetic.Arithmetic.train()   A

Complexity

Conditions 4

Size

Total Lines 63
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 4

Importance

Changes 0
Metric Value
cc 4
eloc 16
nop 2
dl 0
loc 63
ccs 15
cts 15
cp 1
crap 4
rs 9.6
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.compression._arithmetic.
20
21
Arithmetic coder/decoder
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from collections import Counter
32 1
from fractions import Fraction
33
34 1
from six import PY3, text_type
35
36
37
if PY3:
38
    long = int
0 ignored issues
show
Coding Style Naming introduced by
The name long does not conform to the class naming conventions ([A-Z_][a-zA-Z0-9]+$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
39
40 1
__all__ = ['Arithmetic', 'ac_decode', 'ac_encode', 'ac_train']
41
42
43 1
class Arithmetic(object):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
44
    """Arithmetic Coder.
45
46
    This is based on Andrew Dalke's public domain implementation
47
    :cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
48
    """
49
50 1
    _probs = {}
51
52 1
    def __init__(self, text=None):
53
        """Initialize arithmetic coder object.
54
55
        Parameters
56
        ----------
57
        text : str
58
            The training text
59
60
        """
61 1
        if text is not None:
62 1
            self.train(text)
63
64 1
    def get_probs(self):
65
        """Return the probs dictionary.
66
67
        Returns
68
        -------
69
        dict
70
            The dictionary of probabilities
71
72
        """
73 1
        return self._probs
74
75 1
    def set_probs(self, probs):
76
        """Set the probs dictionary.
77
78
        Parameters
79
        ----------
80
        probs : dict
81
            The dictionary of probabilities
82
83
        """
84 1
        self._probs = probs
85
86 1
    def train(self, text):
87
        r"""Generate a probability dict from the provided text.
88
89
        Text to 0-order probability statistics as a dict
90
91
        Parameters
92
        ----------
93
        text : str
94
            The text data over which to calculate probability statistics. This
95
            must not contain the NUL (0x00) character because that is used to
96
            indicate the end of data.
97
98
        Example
99
        -------
100
        >>> ac = Arithmetic()
101
        >>> ac.train('the quick brown fox jumped over the lazy dog')
102
        >>> ac.get_probs()
103
        {' ': (Fraction(0, 1), Fraction(8, 45)),
104
         'o': (Fraction(8, 45), Fraction(4, 15)),
105
         'e': (Fraction(4, 15), Fraction(16, 45)),
106
         'u': (Fraction(16, 45), Fraction(2, 5)),
107
         't': (Fraction(2, 5), Fraction(4, 9)),
108
         'r': (Fraction(4, 9), Fraction(22, 45)),
109
         'h': (Fraction(22, 45), Fraction(8, 15)),
110
         'd': (Fraction(8, 15), Fraction(26, 45)),
111
         'z': (Fraction(26, 45), Fraction(3, 5)),
112
         'y': (Fraction(3, 5), Fraction(28, 45)),
113
         'x': (Fraction(28, 45), Fraction(29, 45)),
114
         'w': (Fraction(29, 45), Fraction(2, 3)),
115
         'v': (Fraction(2, 3), Fraction(31, 45)),
116
         'q': (Fraction(31, 45), Fraction(32, 45)),
117
         'p': (Fraction(32, 45), Fraction(11, 15)),
118
         'n': (Fraction(11, 15), Fraction(34, 45)),
119
         'm': (Fraction(34, 45), Fraction(7, 9)),
120
         'l': (Fraction(7, 9), Fraction(4, 5)),
121
         'k': (Fraction(4, 5), Fraction(37, 45)),
122
         'j': (Fraction(37, 45), Fraction(38, 45)),
123
         'i': (Fraction(38, 45), Fraction(13, 15)),
124
         'g': (Fraction(13, 15), Fraction(8, 9)),
125
         'f': (Fraction(8, 9), Fraction(41, 45)),
126
         'c': (Fraction(41, 45), Fraction(14, 15)),
127
         'b': (Fraction(14, 15), Fraction(43, 45)),
128
         'a': (Fraction(43, 45), Fraction(44, 45)),
129
         '\x00': (Fraction(44, 45), Fraction(1, 1))}
130
131
        """
132 1
        text = text_type(text)
133 1
        if '\x00' in text:
134 1
            text = text.replace('\x00', ' ')
135 1
        counts = Counter(text)
136 1
        counts['\x00'] = 1
137 1
        tot_letters = sum(counts.values())
138
139 1
        tot = 0
140 1
        self._probs = {}
141 1
        prev = Fraction(0)
142 1
        for char, count in sorted(
143
            counts.items(), key=lambda x: (x[1], x[0]), reverse=True
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
144
        ):
145 1
            follow = Fraction(tot + count, tot_letters)
146 1
            self._probs[char] = (prev, follow)
147 1
            prev = follow
148 1
            tot = tot + count
149
150 1
    def encode(self, text):
151
        """Encode a text using arithmetic coding.
152
153
        Text and the 0-order probability statistics -> longval, nbits
154
155
        The encoded number is Fraction(longval, 2**nbits)
156
157
        Parameters
158
        ----------
159
        text : str
160
            A string to encode
161
162
        Returns
163
        -------
164
        tuple
165
            The arithmetically coded text
166
167
        Example
168
        -------
169
        >>> ac = Arithmetic('the quick brown fox jumped over the lazy dog')
170
        >>> ac.encode('align')
171
        (16720586181, 34)
172
173
        """
174 1
        text = text_type(text)
175 1
        if '\x00' in text:
176 1
            text = text.replace('\x00', ' ')
177 1
        minval = Fraction(0)
178 1
        maxval = Fraction(1)
179
180 1
        for char in text + '\x00':
181 1
            prob_range = self._probs[char]
182 1
            delta = maxval - minval
183 1
            maxval = minval + prob_range[1] * delta
184 1
            minval = minval + prob_range[0] * delta
185
186
        # I tried without the /2 just to check.  Doesn't work.
187
        # Keep scaling up until the error range is >= 1.  That
188
        # gives me the minimum number of bits needed to resolve
189
        # down to the end-of-data character.
190 1
        delta = (maxval - minval) / 2
191 1
        nbits = long(0)
0 ignored issues
show
introduced by
The variable long does not seem to be defined in case PY3 on line 37 is False. Are you sure this can never be the case?
Loading history...
192 1
        while delta < 1:
193 1
            nbits += 1
194 1
            delta *= 2
195
        # The below condition shouldn't ever be false
196
        if nbits == 0:  # pragma: no cover
197
            return 0, 0
198
        # using -1 instead of /2
199 1
        avg = (maxval + minval) * 2 ** (nbits - 1)
200
        # Could return a rational instead ...
201
        # the division truncation is deliberate
202 1
        return avg.numerator // avg.denominator, nbits
203
204 1
    def decode(self, longval, nbits):
205
        """Decode the number to a string using the given statistics.
206
207
        Parameters
208
        ----------
209
        longval : int
210
            The first part of an encoded tuple from encode
211
        nbits : int
212
            The second part of an encoded tuple from encode
213
214
        Returns
215
        -------
216
        str
217
            The arithmetically decoded text
218
219
        Example
220
        -------
221
        >>> ac = Arithmetic('the quick brown fox jumped over the lazy dog')
222
        >>> ac.decode(16720586181, 34)
223
        'align'
224
225
        """
226 1
        val = Fraction(longval, long(1) << nbits)
0 ignored issues
show
introduced by
The variable long does not seem to be defined in case PY3 on line 37 is False. Are you sure this can never be the case?
Loading history...
227 1
        letters = []
228
229 1
        probs_items = [
230
            (char, minval, maxval)
231
            for (char, (minval, maxval)) in self._probs.items()
232
        ]
233
234 1
        char = '\x00'
235 1
        while True:
236 1
            for (char, minval, maxval) in probs_items:  # noqa: B007
237 1
                if minval <= val < maxval:
238 1
                    break
239
240 1
            if char == '\x00':
241 1
                break
242 1
            letters.append(char)
243 1
            delta = maxval - minval
0 ignored issues
show
introduced by
The variable maxval does not seem to be defined for all execution paths.
Loading history...
introduced by
The variable minval does not seem to be defined for all execution paths.
Loading history...
244 1
            val = (val - minval) / delta
245 1
        return ''.join(letters)
246
247
248 1
def ac_train(text):
249
    r"""Generate a probability dict from the provided text.
250
251
    This is a wrapper for :py:meth:`Arithmetic.train`.
252
253
    Parameters
254
    ----------
255
    text : str
256
        The text data over which to calculate probability statistics. This must
257
        not contain the NUL (0x00) character because that's used to indicate
258
        the end of data.
259
260
    Returns
261
    -------
262
    dict
263
        A probability dict
264
265
    Example
266
    -------
267
    >>> ac_train('the quick brown fox jumped over the lazy dog')
268
    {' ': (Fraction(0, 1), Fraction(8, 45)),
269
     'o': (Fraction(8, 45), Fraction(4, 15)),
270
     'e': (Fraction(4, 15), Fraction(16, 45)),
271
     'u': (Fraction(16, 45), Fraction(2, 5)),
272
     't': (Fraction(2, 5), Fraction(4, 9)),
273
     'r': (Fraction(4, 9), Fraction(22, 45)),
274
     'h': (Fraction(22, 45), Fraction(8, 15)),
275
     'd': (Fraction(8, 15), Fraction(26, 45)),
276
     'z': (Fraction(26, 45), Fraction(3, 5)),
277
     'y': (Fraction(3, 5), Fraction(28, 45)),
278
     'x': (Fraction(28, 45), Fraction(29, 45)),
279
     'w': (Fraction(29, 45), Fraction(2, 3)),
280
     'v': (Fraction(2, 3), Fraction(31, 45)),
281
     'q': (Fraction(31, 45), Fraction(32, 45)),
282
     'p': (Fraction(32, 45), Fraction(11, 15)),
283
     'n': (Fraction(11, 15), Fraction(34, 45)),
284
     'm': (Fraction(34, 45), Fraction(7, 9)),
285
     'l': (Fraction(7, 9), Fraction(4, 5)),
286
     'k': (Fraction(4, 5), Fraction(37, 45)),
287
     'j': (Fraction(37, 45), Fraction(38, 45)),
288
     'i': (Fraction(38, 45), Fraction(13, 15)),
289
     'g': (Fraction(13, 15), Fraction(8, 9)),
290
     'f': (Fraction(8, 9), Fraction(41, 45)),
291
     'c': (Fraction(41, 45), Fraction(14, 15)),
292
     'b': (Fraction(14, 15), Fraction(43, 45)),
293
     'a': (Fraction(43, 45), Fraction(44, 45)),
294
     '\x00': (Fraction(44, 45), Fraction(1, 1))}
295
296
    """
297 1
    return Arithmetic(text).get_probs()
298
299
300 1
def ac_encode(text, probs):
301
    """Encode a text using arithmetic coding with the provided probabilities.
302
303
    This is a wrapper for :py:meth:`Arithmetic.encode`.
304
305
    Parameters
306
    ----------
307
    text : str
308
        A string to encode
309
    probs : dict
310
        A probability statistics dictionary generated by
311
        :py:meth:`Arithmetic.train`
312
313
    Returns
314
    -------
315
    tuple
316
        The arithmetically coded text
317
318
    Example
319
    -------
320
    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
321
    >>> ac_encode('align', pr)
322
    (16720586181, 34)
323
324
    """
325 1
    coder = Arithmetic()
326 1
    coder.set_probs(probs)
327 1
    return coder.encode(text)
328
329
330 1
def ac_decode(longval, nbits, probs):
331
    """Decode the number to a string using the given statistics.
332
333
    This is a wrapper for :py:meth:`Arithmetic.decode`.
334
335
    Parameters
336
    ----------
337
    longval : int
338
        The first part of an encoded tuple from ac_encode
339
    nbits : int
340
        The second part of an encoded tuple from ac_encode
341
    probs : dict
342
        A probability statistics dictionary generated by
343
        :py:meth:`Arithmetic.train`
344
345
    Returns
346
    -------
347
    str
348
        The arithmetically decoded text
349
350
    Example
351
    -------
352
    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
353
    >>> ac_decode(16720586181, 34, pr)
354
    'align'
355
356
    """
357 1
    coder = Arithmetic()
358 1
    coder.set_probs(probs)
359 1
    return coder.decode(longval, nbits)
360
361
362
if __name__ == '__main__':
363
    import doctest
364
365
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
366