Completed
Pull Request — master (#138)
by Chris
14:20
created

abydos.distance._compression.dist_ncd_lzma()   A

Complexity

Conditions 1

Size

Total Lines 20
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1.125

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 20
ccs 1
cts 2
cp 0.5
rs 10
c 0
b 0
f 0
cc 1
nop 2
crap 1.125
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.compression.
20
21
The distance.compression module implements compression distance measures.
22
"""
23
24 1
from __future__ import division, unicode_literals
25
26 1
import bz2
27 1
import zlib
28
29 1
from ._distance import Distance
30 1
from ..compression import Arithmetic, BWT, RLE
31
32 1
try:
33 1
    import lzma
34
except ImportError:  # pragma: no cover
35
    # If the system lacks the lzma library, that's fine, but lzma compression
36
    # similarity won't be supported.
37
    lzma = None
38
39 1
# Public API of this module: for every supported compressor, the distance
# class followed by its functional ``dist_``/``sim_`` wrappers.
__all__ = [
    # zlib
    'NCDzlib',
    'dist_ncd_zlib',
    'sim_ncd_zlib',
    # bz2
    'NCDbz2',
    'dist_ncd_bz2',
    'sim_ncd_bz2',
    # lzma
    'NCDlzma',
    'dist_ncd_lzma',
    'sim_ncd_lzma',
    # Burrows-Wheeler transform followed by run-length encoding
    'NCDbwtrle',
    'dist_ncd_bwtrle',
    'sim_ncd_bwtrle',
    # run-length encoding
    'NCDrle',
    'dist_ncd_rle',
    'sim_ncd_rle',
    # arithmetic coding
    'NCDarith',
    'dist_ncd_arith',
    'sim_ncd_arith',
]
59
60
61 1
class NCDarith(Distance):
    """Normalized Compression Distance using Arithmetic Coding.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
    """

    # Replaced by an Arithmetic() instance in __init__.
    _coder = None

    def __init__(self):
        """Create the arithmetic coder used by :py:meth:`dist`."""
        self._coder = Arithmetic()

    def dist(self, src, tar, probs=None):
        """Compute the arithmetic-coding NCD of two strings.

        Equal strings short-circuit to 0.0. Otherwise the instance's coder
        is configured on every call (from ``probs`` when supplied, else by
        training on ``src + tar``) and the result is::

            (min(C(src + tar), C(tar + src)) - min(C(src), C(tar)))
                / max(C(src), C(tar))

        where ``C(x)`` is element ``[1]`` of the coder's ``encode(x)``
        result.

        :param str src: source string for comparison
        :param str tar: target string for comparison
        :param dict probs: a dictionary trained with
            :py:meth:`Arithmetic.train`; if None, the coder is trained on
            the two strings themselves
        :returns: compression distance
        :rtype: float

        >>> cmp = NCDarith()
        >>> cmp.dist('cat', 'hat')
        0.5454545454545454
        >>> cmp.dist('Niall', 'Neil')
        0.6875
        >>> cmp.dist('aluminum', 'Catalan')
        0.8275862068965517
        >>> cmp.dist('ATCG', 'TAGC')
        0.6923076923076923
        """
        if src == tar:
            return 0.0

        coder = self._coder
        if probs is not None:
            coder.set_probs(probs)
        else:
            # No model was supplied, so fall back to one trained on the
            # inputs themselves.
            coder.train(src + tar)

        src_size = coder.encode(src)[1]
        tar_size = coder.encode(tar)[1]
        forward_size = coder.encode(src + tar)[1]
        reverse_size = coder.encode(tar + src)[1]

        numerator = min(forward_size, reverse_size) - min(src_size, tar_size)
        return numerator / max(src_size, tar_size)
110
111
112 1
def dist_ncd_arith(src, tar, probs=None):
    """Compute the arithmetic-coding NCD of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDarith` and delegates to
    :py:meth:`NCDarith.dist`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param dict probs: a dictionary trained with :py:meth:`Arithmetic.train`
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_arith('cat', 'hat')
    0.5454545454545454
    >>> dist_ncd_arith('Niall', 'Neil')
    0.6875
    >>> dist_ncd_arith('aluminum', 'Catalan')
    0.8275862068965517
    >>> dist_ncd_arith('ATCG', 'TAGC')
    0.6923076923076923
    """
    measure = NCDarith()
    return measure.dist(src, tar, probs)
133
134
135 1
def sim_ncd_arith(src, tar, probs=None):
    """Compute the arithmetic-coding NCD similarity of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDarith` and delegates to
    :py:meth:`NCDarith.sim`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :param dict probs: a dictionary trained with :py:meth:`Arithmetic.train`
    :returns: compression similarity
    :rtype: float

    >>> sim_ncd_arith('cat', 'hat')
    0.4545454545454546
    >>> sim_ncd_arith('Niall', 'Neil')
    0.3125
    >>> sim_ncd_arith('aluminum', 'Catalan')
    0.1724137931034483
    >>> sim_ncd_arith('ATCG', 'TAGC')
    0.3076923076923077
    """
    measure = NCDarith()
    return measure.sim(src, tar, probs)
156
157
158 1
class NCDrle(Distance):
    """Normalized Compression Distance using RLE.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
    """

    # Class-level encoder, shared by all instances.
    _rle = RLE()

    def dist(self, src, tar):
        """Compute the run-length-encoding NCD of two strings.

        Equal strings short-circuit to 0.0. Otherwise, with ``C(x)`` the
        length of the RLE encoding of ``x``, the result is::

            (min(C(src + tar), C(tar + src)) - min(C(src), C(tar)))
                / max(C(src), C(tar))

        :param str src: source string for comparison
        :param str tar: target string for comparison
        :returns: compression distance
        :rtype: float

        >>> cmp = NCDrle()
        >>> cmp.dist('cat', 'hat')
        1.0
        >>> cmp.dist('Niall', 'Neil')
        1.0
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        1.0
        """
        if src == tar:
            return 0.0

        rle_encode = self._rle.encode
        src_len = len(rle_encode(src))
        tar_len = len(rle_encode(tar))
        forward_len = len(rle_encode(src + tar))
        reverse_len = len(rle_encode(tar + src))

        numerator = min(forward_len, reverse_len) - min(src_len, tar_len)
        return numerator / max(src_len, tar_len)
196
197
198 1
def dist_ncd_rle(src, tar):
    """Compute the run-length-encoding NCD of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDrle` and delegates to
    :py:meth:`NCDrle.dist`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_rle('cat', 'hat')
    1.0
    >>> dist_ncd_rle('Niall', 'Neil')
    1.0
    >>> dist_ncd_rle('aluminum', 'Catalan')
    1.0
    >>> dist_ncd_rle('ATCG', 'TAGC')
    1.0
    """
    measure = NCDrle()
    return measure.dist(src, tar)
218
219
220 1
def sim_ncd_rle(src, tar):
    """Compute the run-length-encoding NCD similarity of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDrle` and delegates to
    :py:meth:`NCDrle.sim`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression similarity
    :rtype: float

    >>> sim_ncd_rle('cat', 'hat')
    0.0
    >>> sim_ncd_rle('Niall', 'Neil')
    0.0
    >>> sim_ncd_rle('aluminum', 'Catalan')
    0.0
    >>> sim_ncd_rle('ATCG', 'TAGC')
    0.0
    """
    measure = NCDrle()
    return measure.sim(src, tar)
240
241
242 1
class NCDbwtrle(NCDrle):
    """Normalized Compression Distance using BWT plus RLE.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
    """

    # Class-level transform, shared by all instances; the RLE encoder comes
    # from the parent class.
    _bwt = BWT()

    def dist(self, src, tar):
        """Compute the BWT-plus-RLE NCD of two strings.

        Equal strings short-circuit to 0.0. Otherwise each input is passed
        through the BWT encoder and then the RLE encoder; with ``C(x)`` the
        length of that result, the distance is::

            (min(C(src + tar), C(tar + src)) - min(C(src), C(tar)))
                / max(C(src), C(tar))

        :param str src: source string for comparison
        :param str tar: target string for comparison
        :returns: compression distance
        :rtype: float

        >>> cmp = NCDbwtrle()
        >>> cmp.dist('cat', 'hat')
        0.75
        >>> cmp.dist('Niall', 'Neil')
        0.8333333333333334
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        0.8
        """
        if src == tar:
            return 0.0

        def encoded_len(text):
            # BWT first, then RLE on the transformed text.
            return len(self._rle.encode(self._bwt.encode(text)))

        src_len = encoded_len(src)
        tar_len = encoded_len(tar)
        forward_len = encoded_len(src + tar)
        reverse_len = encoded_len(tar + src)

        numerator = min(forward_len, reverse_len) - min(src_len, tar_len)
        return numerator / max(src_len, tar_len)
280
281
282 1
def dist_ncd_bwtrle(src, tar):
    """Compute the BWT-plus-RLE NCD of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDbwtrle` and delegates
    to :py:meth:`NCDbwtrle.dist`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_bwtrle('cat', 'hat')
    0.75
    >>> dist_ncd_bwtrle('Niall', 'Neil')
    0.8333333333333334
    >>> dist_ncd_bwtrle('aluminum', 'Catalan')
    1.0
    >>> dist_ncd_bwtrle('ATCG', 'TAGC')
    0.8
    """
    measure = NCDbwtrle()
    return measure.dist(src, tar)
302
303
304 1
def sim_ncd_bwtrle(src, tar):
    """Compute the BWT-plus-RLE NCD similarity of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDbwtrle` and delegates
    to :py:meth:`NCDbwtrle.sim`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression similarity
    :rtype: float

    >>> sim_ncd_bwtrle('cat', 'hat')
    0.25
    >>> sim_ncd_bwtrle('Niall', 'Neil')
    0.16666666666666663
    >>> sim_ncd_bwtrle('aluminum', 'Catalan')
    0.0
    >>> sim_ncd_bwtrle('ATCG', 'TAGC')
    0.19999999999999996
    """
    measure = NCDbwtrle()
    return measure.sim(src, tar)
324
325
326 1
class NCDzlib(Distance):
    """Normalized Compression Distance using zlib compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
    """

    # Pristine compressor created in __init__. It is never fed data itself;
    # each measurement works on a copy of it.
    _compressor = None

    # Size in bytes of the header that starts every zlib stream (RFC 1950,
    # no preset dictionary). It is constant overhead and is excluded from
    # the measured sizes.
    _header_len = 2

    def __init__(self, level=zlib.Z_DEFAULT_COMPRESSION):
        """Initialize zlib compressor.

        :param level: The compression level (0 to 9)
        """
        self._compressor = zlib.compressobj(level)

    def _compressed_len(self, data):
        """Return the compressed size of ``data``, excluding the header.

        ``Compress.compress`` may already return part of the compressed
        output (for large inputs in particular), so its return value is
        counted together with the output of the full flush. Working on a
        copy of the untouched compressor keeps every measurement independent
        of earlier ones.

        :param bytes data: the bytes to compress
        :returns: number of compressed bytes, without the stream header
        :rtype: int
        """
        compressor = self._compressor.copy()
        output = compressor.compress(data)
        output += compressor.flush(zlib.Z_FULL_FLUSH)
        return len(output) - self._header_len

    def dist(self, src, tar):
        """Return the NCD between two strings using zlib compression.

        Equal strings short-circuit to 0.0. Otherwise both strings are
        UTF-8 encoded and, with ``C(x)`` the header-less compressed size of
        ``x``, the result is::

            (min(C(src + tar), C(tar + src)) - min(C(src), C(tar)))
                / max(C(src), C(tar))

        :param str src: source string for comparison
        :param str tar: target string for comparison
        :returns: compression distance
        :rtype: float

        >>> cmp = NCDzlib()
        >>> cmp.dist('cat', 'hat')
        0.3333333333333333
        >>> cmp.dist('Niall', 'Neil')
        0.45454545454545453
        >>> cmp.dist('aluminum', 'Catalan')
        0.5714285714285714
        >>> cmp.dist('ATCG', 'TAGC')
        0.4
        """
        if src == tar:
            return 0.0

        src = src.encode('utf-8')
        tar = tar.encode('utf-8')

        src_len = self._compressed_len(src)
        tar_len = self._compressed_len(tar)
        concat_len = self._compressed_len(src + tar)
        concat_len2 = self._compressed_len(tar + src)

        return (
            min(concat_len, concat_len2) - min(src_len, tar_len)
        ) / max(src_len, tar_len)
378
379
380 1
def dist_ncd_zlib(src, tar):
    """Compute the zlib NCD of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDzlib` (default
    compression level) and delegates to :py:meth:`NCDzlib.dist`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_zlib('cat', 'hat')
    0.3333333333333333
    >>> dist_ncd_zlib('Niall', 'Neil')
    0.45454545454545453
    >>> dist_ncd_zlib('aluminum', 'Catalan')
    0.5714285714285714
    >>> dist_ncd_zlib('ATCG', 'TAGC')
    0.4
    """
    measure = NCDzlib()
    return measure.dist(src, tar)
400
401
402 1
def sim_ncd_zlib(src, tar):
    """Compute the zlib NCD similarity of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDzlib` (default
    compression level) and delegates to :py:meth:`NCDzlib.sim`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression similarity
    :rtype: float

    >>> sim_ncd_zlib('cat', 'hat')
    0.6666666666666667
    >>> sim_ncd_zlib('Niall', 'Neil')
    0.5454545454545454
    >>> sim_ncd_zlib('aluminum', 'Catalan')
    0.4285714285714286
    >>> sim_ncd_zlib('ATCG', 'TAGC')
    0.6
    """
    measure = NCDzlib()
    return measure.sim(src, tar)
422
423
424 1
class NCDbz2(Distance):
    """Normalized Compression Distance using bz2 compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
    """

    # Default compression level; overridden per instance in __init__.
    _level = 9

    def __init__(self, level=9):
        """Store the bz2 compression level.

        :param level: The compression level handed to ``bz2.compress``
            (bz2 accepts 1 to 9)
        """
        self._level = level

    def dist(self, src, tar):
        """Compute the bz2 NCD of two strings.

        Equal strings short-circuit to 0.0. Otherwise both strings are
        UTF-8 encoded and, with ``C(x)`` the length of ``bz2.compress(x)``
        minus its first 10 bytes, the result is::

            (min(C(src + tar), C(tar + src)) - min(C(src), C(tar)))
                / max(C(src), C(tar))

        :param str src: source string for comparison
        :param str tar: target string for comparison
        :returns: compression distance
        :rtype: float

        >>> cmp = NCDbz2()
        >>> cmp.dist('cat', 'hat')
        0.06666666666666667
        >>> cmp.dist('Niall', 'Neil')
        0.03125
        >>> cmp.dist('aluminum', 'Catalan')
        0.17647058823529413
        >>> cmp.dist('ATCG', 'TAGC')
        0.03125
        """
        if src == tar:
            return 0.0

        src_bytes = src.encode('utf-8')
        tar_bytes = tar.encode('utf-8')
        level = self._level

        def payload_len(data):
            # The leading 10 bytes are dropped before measuring; presumably
            # the fixed bz2 stream/block header (verify against the format).
            return len(bz2.compress(data, level)[10:])

        src_len = payload_len(src_bytes)
        tar_len = payload_len(tar_bytes)
        forward_len = payload_len(src_bytes + tar_bytes)
        reverse_len = payload_len(tar_bytes + src_bytes)

        numerator = min(forward_len, reverse_len) - min(src_len, tar_len)
        return numerator / max(src_len, tar_len)
472
473
474 1
def dist_ncd_bz2(src, tar):
    """Compute the bz2 NCD of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDbz2` (default
    compression level) and delegates to :py:meth:`NCDbz2.dist`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_bz2('cat', 'hat')
    0.06666666666666667
    >>> dist_ncd_bz2('Niall', 'Neil')
    0.03125
    >>> dist_ncd_bz2('aluminum', 'Catalan')
    0.17647058823529413
    >>> dist_ncd_bz2('ATCG', 'TAGC')
    0.03125
    """
    measure = NCDbz2()
    return measure.dist(src, tar)
494
495
496 1
def sim_ncd_bz2(src, tar):
    """Compute the bz2 NCD similarity of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDbz2` (default
    compression level) and delegates to :py:meth:`NCDbz2.sim`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression similarity
    :rtype: float

    >>> sim_ncd_bz2('cat', 'hat')
    0.9333333333333333
    >>> sim_ncd_bz2('Niall', 'Neil')
    0.96875
    >>> sim_ncd_bz2('aluminum', 'Catalan')
    0.8235294117647058
    >>> sim_ncd_bz2('ATCG', 'TAGC')
    0.96875
    """
    measure = NCDbz2()
    return measure.sim(src, tar)
516
517
518 1
class NCDlzma(Distance):
    """Normalized Compression Distance using lzma compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
    """

    def dist(self, src, tar):
        """Compute the lzma NCD of two strings.

        Equal strings short-circuit to 0.0. Otherwise both strings are
        UTF-8 encoded and, with ``C(x)`` the length of ``lzma.compress(x)``
        minus its first 14 bytes, the result is::

            (min(C(src + tar), C(tar + src)) - min(C(src), C(tar)))
                / max(C(src), C(tar))

        :param str src: source string for comparison
        :param str tar: target string for comparison
        :returns: compression distance
        :rtype: float
        :raises ValueError: if the lzma module could not be imported

        >>> cmp = NCDlzma()
        >>> cmp.dist('cat', 'hat')
        0.08695652173913043
        >>> cmp.dist('Niall', 'Neil')
        0.16
        >>> cmp.dist('aluminum', 'Catalan')
        0.16
        >>> cmp.dist('ATCG', 'TAGC')
        0.08695652173913043
        """
        if src == tar:
            return 0.0

        src_bytes = src.encode('utf-8')
        tar_bytes = tar.encode('utf-8')

        # The module-level import of lzma is optional; without it there is
        # nothing to measure with.
        if lzma is None:  # pragma: no cover
            raise ValueError(
                'Install the PylibLZMA module in order to use lzma'
            )

        def payload_len(data):
            # The leading 14 bytes are dropped before measuring; presumably
            # fixed container/header bytes (verify against the format).
            return len(lzma.compress(data)[14:])

        src_len = payload_len(src_bytes)
        tar_len = payload_len(tar_bytes)
        forward_len = payload_len(src_bytes + tar_bytes)
        reverse_len = payload_len(tar_bytes + src_bytes)

        numerator = min(forward_len, reverse_len) - min(src_len, tar_len)
        return numerator / max(src_len, tar_len)
562
563
564 1
def dist_ncd_lzma(src, tar):
    """Compute the lzma NCD of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDlzma` and delegates to
    :py:meth:`NCDlzma.dist`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_lzma('cat', 'hat')
    0.08695652173913043
    >>> dist_ncd_lzma('Niall', 'Neil')
    0.16
    >>> dist_ncd_lzma('aluminum', 'Catalan')
    0.16
    >>> dist_ncd_lzma('ATCG', 'TAGC')
    0.08695652173913043
    """
    measure = NCDlzma()
    return measure.dist(src, tar)
584
585
586 1
def sim_ncd_lzma(src, tar):
    """Compute the lzma NCD similarity of two strings.

    Functional wrapper: builds a fresh :py:class:`NCDlzma` and delegates to
    :py:meth:`NCDlzma.sim`.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression similarity
    :rtype: float

    >>> sim_ncd_lzma('cat', 'hat')
    0.9130434782608696
    >>> sim_ncd_lzma('Niall', 'Neil')
    0.84
    >>> sim_ncd_lzma('aluminum', 'Catalan')
    0.84
    >>> sim_ncd_lzma('ATCG', 'TAGC')
    0.9130434782608696
    """
    measure = NCDlzma()
    return measure.sim(src, tar)
606
607
608
if __name__ == '__main__':
    # When executed as a script, run the docstring examples of this module
    # as a self-test.
    from doctest import testmod

    testmod()
612