Completed
Pull Request — master (#141)
by Chris
13:03
created

abydos.distance._compression.sim_ncd_lzma()   A

Complexity

Conditions 1

Size

Total Lines 24
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1.125

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 24
ccs 1
cts 2
cp 0.5
rs 10
c 0
b 0
f 0
cc 1
nop 2
crap 1.125
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.compression.
20
21
The distance.compression module implements compression distance measures.
22
"""
23
24 1
from __future__ import division, unicode_literals
25
26 1
import bz2
27 1
import zlib
28
29 1
from ._distance import Distance
30 1
from ..compression import Arithmetic, BWT, RLE
31
32 1
try:
33 1
    import lzma
34
except ImportError:  # pragma: no cover
35
    # If the system lacks the lzma library, that's fine, but lzma compression
36
    # similarity won't be supported.
37
    lzma = None
38
39 1
# Public names exported by this module: for each compressor, the measure
# class followed by its dist_/sim_ convenience wrappers.
__all__ = [
    # zlib
    'NCDzlib',
    'dist_ncd_zlib',
    'sim_ncd_zlib',
    # bz2
    'NCDbz2',
    'dist_ncd_bz2',
    'sim_ncd_bz2',
    # lzma
    'NCDlzma',
    'dist_ncd_lzma',
    'sim_ncd_lzma',
    # BWT followed by RLE
    'NCDbwtrle',
    'dist_ncd_bwtrle',
    'sim_ncd_bwtrle',
    # RLE
    'NCDrle',
    'dist_ncd_rle',
    'sim_ncd_rle',
    # arithmetic coding
    'NCDarith',
    'dist_ncd_arith',
    'sim_ncd_arith',
]
59
60
61 1
class NCDarith(Distance):
    """Normalized Compression Distance using Arithmetic Coding.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, computed
    with an :py:class:`Arithmetic` coder as the compressor.
    """

    # Placeholder; __init__ replaces it with a per-instance Arithmetic coder.
    _coder = None

    def __init__(self):
        """Initialize the arithmetic coder object."""
        self._coder = Arithmetic()

    def dist(self, src, tar, probs=None):
        """Return the NCD between two strings using arithmetic coding.

        Identical strings short-circuit to 0.0 without touching the coder.
        Otherwise the coder is (re)configured on every call: with ``probs``
        when given, else by training on ``src + tar``.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison
            probs (dict): A dictionary trained with :py:meth:`Arithmetic.train`

        Returns:
            float: Compression distance

        Examples:
            >>> cmp = NCDarith()
            >>> cmp.dist('cat', 'hat')
            0.5454545454545454
            >>> cmp.dist('Niall', 'Neil')
            0.6875
            >>> cmp.dist('aluminum', 'Catalan')
            0.8275862068965517
            >>> cmp.dist('ATCG', 'TAGC')
            0.6923076923076923

        """
        if src == tar:
            return 0.0

        coder = self._coder
        if probs is not None:
            coder.set_probs(probs)
        else:
            # lacking a reasonable dictionary, train on the strings themselves
            coder.train(src + tar)

        # Only element [1] of each encode() result enters the formula.
        size_src, size_tar, size_src_tar, size_tar_src = (
            coder.encode(text)[1]
            for text in (src, tar, src + tar, tar + src)
        )

        # NCD: (best concatenated size - smaller single size) / larger single
        # size; both concatenation orders are tried and the smaller one kept.
        best_concat = min(size_src_tar, size_tar_src)
        return (best_concat - min(size_src, size_tar)) / max(
            size_src, size_tar
        )
113
114
115 1
def dist_ncd_arith(src, tar, probs=None):
    """Return the NCD between two strings using arithmetic coding.

    This is a wrapper for :py:meth:`NCDarith.dist`; a fresh
    :py:class:`NCDarith` instance is created on every call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        probs (dict): A dictionary trained with :py:meth:`Arithmetic.train`

    Returns:
        float: Compression distance

    Examples:
        >>> dist_ncd_arith('cat', 'hat')
        0.5454545454545454
        >>> dist_ncd_arith('Niall', 'Neil')
        0.6875
        >>> dist_ncd_arith('aluminum', 'Catalan')
        0.8275862068965517
        >>> dist_ncd_arith('ATCG', 'TAGC')
        0.6923076923076923

    """
    measure = NCDarith()
    return measure.dist(src, tar, probs)
140
141
142 1
def sim_ncd_arith(src, tar, probs=None):
    """Return the NCD similarity between two strings using arithmetic coding.

    This is a wrapper for :py:meth:`NCDarith.sim` (not defined on the class
    in this module, so presumably inherited from ``Distance``); a fresh
    :py:class:`NCDarith` instance is created on every call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison
        probs (dict): A dictionary trained with :py:meth:`Arithmetic.train`

    Returns:
        float: Compression similarity

    Examples:
        >>> sim_ncd_arith('cat', 'hat')
        0.4545454545454546
        >>> sim_ncd_arith('Niall', 'Neil')
        0.3125
        >>> sim_ncd_arith('aluminum', 'Catalan')
        0.1724137931034483
        >>> sim_ncd_arith('ATCG', 'TAGC')
        0.3076923076923077

    """
    measure = NCDarith()
    return measure.sim(src, tar, probs)
167
168
169 1
class NCDrle(Distance):
    """Normalized Compression Distance using RLE.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, computed
    with an :py:class:`RLE` encoder as the compressor.
    """

    # Class-level encoder: created once and shared by all instances
    # (and by subclasses that do not override it).
    _rle = RLE()

    def dist(self, src, tar):
        """Return the NCD between two strings using RLE.

        Identical strings short-circuit to 0.0; otherwise the lengths of the
        RLE encodings of ``src``, ``tar`` and both concatenations are
        combined into the NCD ratio.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison

        Returns:
            float: Compression distance

        Examples:
            >>> cmp = NCDrle()
            >>> cmp.dist('cat', 'hat')
            1.0
            >>> cmp.dist('Niall', 'Neil')
            1.0
            >>> cmp.dist('aluminum', 'Catalan')
            1.0
            >>> cmp.dist('ATCG', 'TAGC')
            1.0

        """
        if src == tar:
            return 0.0

        encode = self._rle.encode
        len_src = len(encode(src))
        len_tar = len(encode(tar))
        len_src_tar = len(encode(src + tar))
        len_tar_src = len(encode(tar + src))

        # Keep the shorter of the two concatenation orders.
        best_concat = min(len_src_tar, len_tar_src)
        return (best_concat - min(len_src, len_tar)) / max(len_src, len_tar)
211
212
213 1
def dist_ncd_rle(src, tar):
    """Return the NCD between two strings using RLE.

    This is a wrapper for :py:meth:`NCDrle.dist`; a fresh
    :py:class:`NCDrle` instance is created on every call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression distance

    Examples:
        >>> dist_ncd_rle('cat', 'hat')
        1.0
        >>> dist_ncd_rle('Niall', 'Neil')
        1.0
        >>> dist_ncd_rle('aluminum', 'Catalan')
        1.0
        >>> dist_ncd_rle('ATCG', 'TAGC')
        1.0

    """
    measure = NCDrle()
    return measure.dist(src, tar)
237
238
239 1
def sim_ncd_rle(src, tar):
    """Return the NCD similarity between two strings using RLE.

    This is a wrapper for :py:meth:`NCDrle.sim` (not defined on the class in
    this module, so presumably inherited from ``Distance``); a fresh
    :py:class:`NCDrle` instance is created on every call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression similarity

    Examples:
        >>> sim_ncd_rle('cat', 'hat')
        0.0
        >>> sim_ncd_rle('Niall', 'Neil')
        0.0
        >>> sim_ncd_rle('aluminum', 'Catalan')
        0.0
        >>> sim_ncd_rle('ATCG', 'TAGC')
        0.0

    """
    measure = NCDrle()
    return measure.sim(src, tar)
263
264
265 1
class NCDbwtrle(NCDrle):
    """Normalized Compression Distance using BWT plus RLE.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, where each
    string is passed through :py:class:`BWT` before being RLE-encoded.
    """

    # Class-level BWT transformer, shared by all instances; the RLE encoder
    # (``_rle``) comes from the NCDrle base class.
    _bwt = BWT()

    def dist(self, src, tar):
        """Return the NCD between two strings using BWT plus RLE.

        Identical strings short-circuit to 0.0; otherwise each of ``src``,
        ``tar`` and both concatenations is BWT-encoded, then RLE-encoded,
        and the resulting lengths are combined into the NCD ratio.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison

        Returns:
            float: Compression distance

        Examples:
            >>> cmp = NCDbwtrle()
            >>> cmp.dist('cat', 'hat')
            0.75
            >>> cmp.dist('Niall', 'Neil')
            0.8333333333333334
            >>> cmp.dist('aluminum', 'Catalan')
            1.0
            >>> cmp.dist('ATCG', 'TAGC')
            0.8

        """
        if src == tar:
            return 0.0

        def packed_len(text):
            # Length of the RLE encoding of the BWT-transformed text.
            return len(self._rle.encode(self._bwt.encode(text)))

        len_src = packed_len(src)
        len_tar = packed_len(tar)
        len_src_tar = packed_len(src + tar)
        len_tar_src = packed_len(tar + src)

        # Keep the shorter of the two concatenation orders.
        best_concat = min(len_src_tar, len_tar_src)
        return (best_concat - min(len_src, len_tar)) / max(len_src, len_tar)
307
308
309 1
def dist_ncd_bwtrle(src, tar):
    """Return the NCD between two strings using BWT plus RLE.

    This is a wrapper for :py:meth:`NCDbwtrle.dist`; a fresh
    :py:class:`NCDbwtrle` instance is created on every call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression distance

    Examples:
        >>> dist_ncd_bwtrle('cat', 'hat')
        0.75
        >>> dist_ncd_bwtrle('Niall', 'Neil')
        0.8333333333333334
        >>> dist_ncd_bwtrle('aluminum', 'Catalan')
        1.0
        >>> dist_ncd_bwtrle('ATCG', 'TAGC')
        0.8

    """
    measure = NCDbwtrle()
    return measure.dist(src, tar)
333
334
335 1
def sim_ncd_bwtrle(src, tar):
    """Return the NCD similarity between two strings using BWT plus RLE.

    This is a wrapper for :py:meth:`NCDbwtrle.sim` (not defined on the class
    in this module, so presumably inherited from ``Distance``); a fresh
    :py:class:`NCDbwtrle` instance is created on every call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression similarity

    Examples:
        >>> sim_ncd_bwtrle('cat', 'hat')
        0.25
        >>> sim_ncd_bwtrle('Niall', 'Neil')
        0.16666666666666663
        >>> sim_ncd_bwtrle('aluminum', 'Catalan')
        0.0
        >>> sim_ncd_bwtrle('ATCG', 'TAGC')
        0.19999999999999996

    """
    measure = NCDbwtrle()
    return measure.sim(src, tar)
359
360
361 1
class NCDzlib(Distance):
    """Normalized Compression Distance using zlib compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, computed
    with a zlib compression object that is created once per instance.
    """

    # Placeholder; __init__ replaces it with a zlib compression object.
    _compressor = None

    def __init__(self, level=zlib.Z_DEFAULT_COMPRESSION):
        """Initialize zlib compressor.

        Args:
            level (int): The compression level (0 to 9)

        """
        self._compressor = zlib.compressobj(level)

    def dist(self, src, tar):
        """Return the NCD between two strings using zlib compression.

        Identical strings short-circuit to 0.0. Otherwise both strings are
        UTF-8 encoded and fed, one payload at a time, through the instance's
        compressor, with a full flush after each payload.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison

        Returns:
            float: Compression distance

        Examples:
            >>> cmp = NCDzlib()
            >>> cmp.dist('cat', 'hat')
            0.3333333333333333
            >>> cmp.dist('Niall', 'Neil')
            0.45454545454545453
            >>> cmp.dist('aluminum', 'Catalan')
            0.5714285714285714
            >>> cmp.dist('ATCG', 'TAGC')
            0.4

        """
        if src == tar:
            return 0.0

        src_bytes = src.encode('utf-8')
        tar_bytes = tar.encode('utf-8')
        compressor = self._compressor

        payloads = (
            src_bytes,
            tar_bytes,
            src_bytes + tar_bytes,
            tar_bytes + src_bytes,
        )
        flushed_sizes = []
        for payload in payloads:
            # NOTE(review): whatever compress() itself returns is discarded;
            # only the bytes produced by the full flush are measured.
            # Confirm this is intended for inputs large enough that
            # compress() emits data before the flush.
            compressor.compress(payload)
            flushed_sizes.append(len(compressor.flush(zlib.Z_FULL_FLUSH)))

        len_src, len_tar, len_src_tar, len_tar_src = flushed_sizes

        # Keep the shorter of the two concatenation orders.
        best_concat = min(len_src_tar, len_tar_src)
        return (best_concat - min(len_src, len_tar)) / max(len_src, len_tar)
418
419
420 1
def dist_ncd_zlib(src, tar):
    """Return the NCD between two strings using zlib compression.

    This is a wrapper for :py:meth:`NCDzlib.dist`; a fresh
    :py:class:`NCDzlib` instance with its default level is created on every
    call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression distance

    Examples:
        >>> dist_ncd_zlib('cat', 'hat')
        0.3333333333333333
        >>> dist_ncd_zlib('Niall', 'Neil')
        0.45454545454545453
        >>> dist_ncd_zlib('aluminum', 'Catalan')
        0.5714285714285714
        >>> dist_ncd_zlib('ATCG', 'TAGC')
        0.4

    """
    measure = NCDzlib()
    return measure.dist(src, tar)
444
445
446 1
def sim_ncd_zlib(src, tar):
    """Return the NCD similarity between two strings using zlib compression.

    This is a wrapper for :py:meth:`NCDzlib.sim` (not defined on the class in
    this module, so presumably inherited from ``Distance``); a fresh
    :py:class:`NCDzlib` instance with its default level is created on every
    call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression similarity

    Examples:
        >>> sim_ncd_zlib('cat', 'hat')
        0.6666666666666667
        >>> sim_ncd_zlib('Niall', 'Neil')
        0.5454545454545454
        >>> sim_ncd_zlib('aluminum', 'Catalan')
        0.4285714285714286
        >>> sim_ncd_zlib('ATCG', 'TAGC')
        0.6

    """
    measure = NCDzlib()
    return measure.sim(src, tar)
470
471
472 1
class NCDbz2(Distance):
    """Normalized Compression Distance using bz2 compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, computed
    with :py:func:`bz2.compress` as the compressor.
    """

    # Default compression level; overridden per instance by __init__.
    _level = 9

    def __init__(self, level=9):
        """Initialize bz2 compressor settings.

        Args:
            level (int): The compression level (1 to 9)

        """
        # The level is stored unvalidated and handed to bz2.compress() on
        # each dist() call.
        self._level = level

    def dist(self, src, tar):
        """Return the NCD between two strings using bz2 compression.

        Identical strings short-circuit to 0.0. Otherwise both strings are
        UTF-8 encoded, and ``src``, ``tar`` and both concatenations are
        compressed independently at the configured level.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison

        Returns:
            float: Compression distance

        Examples:
            >>> cmp = NCDbz2()
            >>> cmp.dist('cat', 'hat')
            0.06666666666666667
            >>> cmp.dist('Niall', 'Neil')
            0.03125
            >>> cmp.dist('aluminum', 'Catalan')
            0.17647058823529413
            >>> cmp.dist('ATCG', 'TAGC')
            0.03125

        """
        if src == tar:
            return 0.0

        src = src.encode('utf-8')
        tar = tar.encode('utf-8')

        # The first 10 bytes of every compressed stream are dropped before
        # measuring (presumably the fixed bz2 stream/block header -- confirm).
        src_comp = bz2.compress(src, self._level)[10:]
        tar_comp = bz2.compress(tar, self._level)[10:]
        concat_comp = bz2.compress(src + tar, self._level)[10:]
        concat_comp2 = bz2.compress(tar + src, self._level)[10:]

        # NCD: (best concatenated size - smaller single size) / larger single
        # size; both concatenation orders are tried and the smaller one kept.
        return (
            min(len(concat_comp), len(concat_comp2))
            - min(len(src_comp), len(tar_comp))
        ) / max(len(src_comp), len(tar_comp))
525
526
527 1
def dist_ncd_bz2(src, tar):
    """Return the NCD between two strings using bz2 compression.

    This is a wrapper for :py:meth:`NCDbz2.dist`; a fresh
    :py:class:`NCDbz2` instance with its default level is created on every
    call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression distance

    Examples:
        >>> dist_ncd_bz2('cat', 'hat')
        0.06666666666666667
        >>> dist_ncd_bz2('Niall', 'Neil')
        0.03125
        >>> dist_ncd_bz2('aluminum', 'Catalan')
        0.17647058823529413
        >>> dist_ncd_bz2('ATCG', 'TAGC')
        0.03125

    """
    measure = NCDbz2()
    return measure.dist(src, tar)
551
552
553 1
def sim_ncd_bz2(src, tar):
    """Return the NCD similarity between two strings using bz2 compression.

    This is a wrapper for :py:meth:`NCDbz2.sim` (not defined on the class in
    this module, so presumably inherited from ``Distance``); a fresh
    :py:class:`NCDbz2` instance with its default level is created on every
    call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression similarity

    Examples:
        >>> sim_ncd_bz2('cat', 'hat')
        0.9333333333333333
        >>> sim_ncd_bz2('Niall', 'Neil')
        0.96875
        >>> sim_ncd_bz2('aluminum', 'Catalan')
        0.8235294117647058
        >>> sim_ncd_bz2('ATCG', 'TAGC')
        0.96875

    """
    measure = NCDbz2()
    return measure.sim(src, tar)
577
578
579 1
class NCDlzma(Distance):
    """Normalized Compression Distance using lzma compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, computed
    with :py:func:`lzma.compress` when the ``lzma`` module could be imported.
    """

    def dist(self, src, tar):
        """Return the NCD between two strings using lzma compression.

        Identical strings short-circuit to 0.0 (even when lzma is
        unavailable). Otherwise both strings are UTF-8 encoded, and ``src``,
        ``tar`` and both concatenations are compressed independently.

        Args:
            src (str): Source string for comparison
            tar (str): Target string for comparison

        Returns:
            float: Compression distance

        Raises:
            ValueError: Install the PylibLZMA module in order to use lzma

        Examples:
            >>> cmp = NCDlzma()
            >>> cmp.dist('cat', 'hat')
            0.08695652173913043
            >>> cmp.dist('Niall', 'Neil')
            0.16
            >>> cmp.dist('aluminum', 'Catalan')
            0.16
            >>> cmp.dist('ATCG', 'TAGC')
            0.08695652173913043

        """
        if src == tar:
            return 0.0

        src_bytes = src.encode('utf-8')
        tar_bytes = tar.encode('utf-8')

        # The module-level import falls back to None when lzma is missing.
        if lzma is None:  # pragma: no cover
            raise ValueError(
                'Install the PylibLZMA module in order to use lzma'
            )

        def packed_len(data):
            # The first 14 bytes of each compressed stream are dropped before
            # measuring (presumably fixed container header bytes -- confirm).
            return len(lzma.compress(data)[14:])

        len_src = packed_len(src_bytes)
        len_tar = packed_len(tar_bytes)
        len_src_tar = packed_len(src_bytes + tar_bytes)
        len_tar_src = packed_len(tar_bytes + src_bytes)

        # Keep the shorter of the two concatenation orders.
        best_concat = min(len_src_tar, len_tar_src)
        return (best_concat - min(len_src, len_tar)) / max(len_src, len_tar)
630
631
632 1
def dist_ncd_lzma(src, tar):
    """Return the NCD between two strings using lzma compression.

    This is a wrapper for :py:meth:`NCDlzma.dist`; a fresh
    :py:class:`NCDlzma` instance is created on every call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression distance

    Examples:
        >>> dist_ncd_lzma('cat', 'hat')
        0.08695652173913043
        >>> dist_ncd_lzma('Niall', 'Neil')
        0.16
        >>> dist_ncd_lzma('aluminum', 'Catalan')
        0.16
        >>> dist_ncd_lzma('ATCG', 'TAGC')
        0.08695652173913043

    """
    measure = NCDlzma()
    return measure.dist(src, tar)
656
657
658 1
def sim_ncd_lzma(src, tar):
    """Return the NCD similarity between two strings using lzma compression.

    This is a wrapper for :py:meth:`NCDlzma.sim` (not defined on the class in
    this module, so presumably inherited from ``Distance``); a fresh
    :py:class:`NCDlzma` instance is created on every call.

    Args:
        src (str): Source string for comparison
        tar (str): Target string for comparison

    Returns:
        float: Compression similarity

    Examples:
        >>> sim_ncd_lzma('cat', 'hat')
        0.9130434782608696
        >>> sim_ncd_lzma('Niall', 'Neil')
        0.84
        >>> sim_ncd_lzma('aluminum', 'Catalan')
        0.84
        >>> sim_ncd_lzma('ATCG', 'TAGC')
        0.9130434782608696

    """
    measure = NCDlzma()
    return measure.sim(src, tar)
682
683
684
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()
688