Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._ncd_bz2.NCDbz2.dist()   A

Complexity

Conditions 2

Size

Total Lines 43
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 13
nop 3
dl 0
loc 43
ccs 10
cts 10
cp 1
crap 2
rs 9.75
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._ncd_bz2.
20
21
NCD using bzip2
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
import bz2
32
33 1
from ._distance import _Distance
34
35 1
__all__ = ['NCDbz2', 'dist_ncd_bz2', 'sim_ncd_bz2']
36
37
38 1
class NCDbz2(_Distance):
    """Normalized Compression Distance computed with the bzip2 compressor.

    Cf. https://en.wikipedia.org/wiki/Bzip2

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
    """

    # Class-level default; each instance overrides it in ``__init__``.
    _level = 9

    def __init__(self, level=9):
        """Initialize bzip2 compressor.

        Parameters
        ----------
        level : int
            The compression level handed to ``bz2.compress`` on every call
            to :py:meth:`dist` (the bz2 documentation specifies an integer
            from 1 to 9)

        """
        self._level = level

    def dist(self, src, tar):
        """Return the NCD between two strings using bzip2 compression.

        Both strings are encoded as UTF-8 and compressed four ways: each one
        alone, and the two concatenated in either order. The result is the
        smaller concatenated size minus the smaller individual size, divided
        by the larger individual size.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Compression distance (0.0 whenever the two strings are equal)

        Examples
        --------
        >>> cmp = NCDbz2()
        >>> cmp.dist('cat', 'hat')
        0.06666666666666667
        >>> cmp.dist('Niall', 'Neil')
        0.03125
        >>> cmp.dist('aluminum', 'Catalan')
        0.17647058823529413
        >>> cmp.dist('ATCG', 'TAGC')
        0.03125

        """
        # Equal inputs are defined to be at distance zero; nothing is
        # compressed in that case.
        if src == tar:
            return 0.0

        src_bytes = src.encode('utf-8')
        tar_bytes = tar.encode('utf-8')
        level = self._level

        def _packed_size(data):
            # The first 10 bytes of every compressed stream are discarded
            # before measuring -- presumably the fixed bzip2 header, which
            # carries no information about the payload.
            return len(bz2.compress(data, level)[10:])

        src_size = _packed_size(src_bytes)
        tar_size = _packed_size(tar_bytes)
        forward_size = _packed_size(src_bytes + tar_bytes)
        backward_size = _packed_size(tar_bytes + src_bytes)

        # Take the better of the two concatenation orders.
        joint_size = min(forward_size, backward_size)
        smaller = min(src_size, tar_size)
        larger = max(src_size, tar_size)

        return (joint_size - smaller) / larger
102
103
104 1
def dist_ncd_bz2(src, tar):
    """Return the NCD between two strings using bzip2 compression.

    This is a wrapper for :py:meth:`NCDbz2.dist`, called on an instance
    created with the default compression level.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Compression distance

    Examples
    --------
    >>> dist_ncd_bz2('cat', 'hat')
    0.06666666666666667
    >>> dist_ncd_bz2('Niall', 'Neil')
    0.03125
    >>> dist_ncd_bz2('aluminum', 'Catalan')
    0.17647058823529413
    >>> dist_ncd_bz2('ATCG', 'TAGC')
    0.03125

    """
    compressor = NCDbz2()
    return compressor.dist(src, tar)
134
135
136 1
def sim_ncd_bz2(src, tar):
    """Return the NCD similarity between two strings using bzip2 compression.

    This is a wrapper for :py:meth:`NCDbz2.sim`, called on an instance
    created with the default compression level.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Compression similarity

    Examples
    --------
    >>> sim_ncd_bz2('cat', 'hat')
    0.9333333333333333
    >>> sim_ncd_bz2('Niall', 'Neil')
    0.96875
    >>> sim_ncd_bz2('aluminum', 'Catalan')
    0.8235294117647058
    >>> sim_ncd_bz2('ATCG', 'TAGC')
    0.96875

    """
    compressor = NCDbz2()
    return compressor.sim(src, tar)
166
167
168
if __name__ == '__main__':
    # When executed as a script, run the examples embedded in this module's
    # docstrings as doctests.
    from doctest import testmod

    testmod()
172