Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.distance._ncd_lzma.dist_ncd_lzma()   A

Complexity

Conditions 1

Size

Total Lines 30
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1.125

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 2
dl 0
loc 30
ccs 1
cts 2
cp 0.5
crap 1.125
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._ncd_lzma.
20
21
NCD using LZMA
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._distance import _Distance
32
33 1
try:
34 1
    import lzma
35
except ImportError:  # pragma: no cover
36
    # If the system lacks the lzma library, that's fine, but lzma compression
37
    # similarity won't be supported.
38
    lzma = None
39
40 1
__all__ = ['NCDlzma', 'dist_ncd_lzma', 'sim_ncd_lzma']
41
42
43 1
class NCDlzma(_Distance):
    """Normalized Compression Distance measured with the LZMA compressor.

    Cf. https://en.wikipedia.org/wiki/Lempel-Ziv-Markov_chain_algorithm

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
    """

    def dist(self, src, tar):
        """Compute the LZMA-based NCD between two strings.

        Both strings are UTF-8 encoded, then compressed on their own and in
        both concatenation orders. The distance is the smaller concatenated
        size minus the smaller individual size, divided by the larger
        individual size.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Compression distance (0.0 when ``src`` equals ``tar``)

        Raises
        ------
        ValueError
            The ``lzma`` module is unavailable (module-level ``lzma`` is
            None), so LZMA compression cannot be performed

        Examples
        --------
        >>> cmp = NCDlzma()
        >>> cmp.dist('cat', 'hat')
        0.08695652173913043
        >>> cmp.dist('Niall', 'Neil')
        0.16
        >>> cmp.dist('aluminum', 'Catalan')
        0.16
        >>> cmp.dist('ATCG', 'TAGC')
        0.08695652173913043

        """
        # Identical inputs short-circuit to 0.0 before anything is encoded
        # or compressed.
        if src == tar:
            return 0.0

        src_bytes = src.encode('utf-8')
        tar_bytes = tar.encode('utf-8')

        if lzma is None:  # pragma: no cover
            raise ValueError(
                'Install the PylibLZMA module in order to use LZMA'
            )

        def _packed_size(blob):
            # The first 14 bytes of each compressed stream are dropped
            # before measuring (presumably fixed container overhead --
            # TODO confirm).
            return len(lzma.compress(blob)[14:])

        src_size = _packed_size(src_bytes)
        tar_size = _packed_size(tar_bytes)
        forward_size = _packed_size(src_bytes + tar_bytes)
        reverse_size = _packed_size(tar_bytes + src_bytes)

        smaller_single = min(src_size, tar_size)
        larger_single = max(src_size, tar_size)
        best_joint = min(forward_size, reverse_size)

        return (best_joint - smaller_single) / larger_single
104
105
106 1
def dist_ncd_lzma(src, tar):
    """Return the LZMA-based NCD between two strings.

    Convenience wrapper: builds a fresh :py:class:`NCDlzma` and delegates to
    :py:meth:`NCDlzma.dist`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Compression distance

    Examples
    --------
    >>> dist_ncd_lzma('cat', 'hat')
    0.08695652173913043
    >>> dist_ncd_lzma('Niall', 'Neil')
    0.16
    >>> dist_ncd_lzma('aluminum', 'Catalan')
    0.16
    >>> dist_ncd_lzma('ATCG', 'TAGC')
    0.08695652173913043

    """
    comparator = NCDlzma()
    return comparator.dist(src, tar)
136
137
138 1
def sim_ncd_lzma(src, tar):
    """Return the LZMA-based NCD similarity between two strings.

    Convenience wrapper: builds a fresh :py:class:`NCDlzma` and delegates to
    :py:meth:`NCDlzma.sim`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Compression similarity

    Examples
    --------
    >>> sim_ncd_lzma('cat', 'hat')
    0.9130434782608696
    >>> sim_ncd_lzma('Niall', 'Neil')
    0.84
    >>> sim_ncd_lzma('aluminum', 'Catalan')
    0.84
    >>> sim_ncd_lzma('ATCG', 'TAGC')
    0.9130434782608696

    """
    comparator = NCDlzma()
    return comparator.sim(src, tar)
168
169
170
if __name__ == '__main__':
    # When executed directly as a script, run the doctest examples embedded
    # in this module's docstrings.
    import doctest

    doctest.testmod()
174