Completed
Pull Request — master (#141)
by Chris
16:23
created

abydos.distance._ncd_arith.dist_ncd_arith()   A

Complexity

Conditions 1

Size

Total Lines 32
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 32
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._ncd_arith.
20
21
NCD using Arithmetic Coding
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._distance import _Distance
32 1
from ..compression import Arithmetic
33
34 1
# Public names exported by a star-import of this module.
__all__ = ['NCDarith', 'dist_ncd_arith', 'sim_ncd_arith']
35
36
37 1
class NCDarith(_Distance):
    """Normalized Compression Distance using Arithmetic Coding.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, computed
    from the sizes that an :py:class:`Arithmetic` coder reports for the two
    strings and for their concatenations.

    """

    # Class-level placeholder; every instance gets its own coder in __init__.
    _coder = None

    def __init__(self):
        """Create the arithmetic coder used by this instance."""
        self._coder = Arithmetic()

    # NOTE(review): static analysis reports that these parameters differ from
    # the overridden ``dist`` method. The base class is not visible here, so
    # confirm the extra ``probs`` argument is compatible with it.
    def dist(self, src, tar, probs=None):
        """Return the NCD between two strings using arithmetic coding.

        Equal strings yield 0.0 without touching the coder. Otherwise the
        coder is either given ``probs`` or, when ``probs`` is None, trained
        on ``src + tar``, and the result is::

            (min(C(src+tar), C(tar+src)) - min(C(src), C(tar)))
                / max(C(src), C(tar))

        where ``C(s)`` is element ``[1]`` of ``self._coder.encode(s)``.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        probs : dict
            A dictionary trained with :py:meth:`Arithmetic.train`

        Returns
        -------
        float
            Compression distance

        Examples
        --------
        >>> cmp = NCDarith()
        >>> cmp.dist('cat', 'hat')
        0.5454545454545454
        >>> cmp.dist('Niall', 'Neil')
        0.6875
        >>> cmp.dist('aluminum', 'Catalan')
        0.8275862068965517
        >>> cmp.dist('ATCG', 'TAGC')
        0.6923076923076923

        """
        if src == tar:
            return 0.0

        coder = self._coder
        if probs is not None:
            coder.set_probs(probs)
        else:
            # no dictionary supplied: train on the two strings themselves
            coder.train(src + tar)

        # Element [1] of encode()'s result is used as the compressed size
        # (presumably a bit count -- confirm against Arithmetic.encode).
        src_size = coder.encode(src)[1]
        tar_size = coder.encode(tar)[1]
        forward_size = coder.encode(src + tar)[1]
        reverse_size = coder.encode(tar + src)[1]

        joint_size = min(forward_size, reverse_size)
        smaller_size = min(src_size, tar_size)
        larger_size = max(src_size, tar_size)
        return (joint_size - smaller_size) / larger_size
96
97
98 1
def dist_ncd_arith(src, tar, probs=None):
    """Return the NCD between two strings using arithmetic coding.

    This is a wrapper for :py:meth:`NCDarith.dist`; a fresh
    :py:class:`NCDarith` instance is created on every call.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    probs : dict
        A dictionary trained with :py:meth:`Arithmetic.train`

    Returns
    -------
    float
        Compression distance

    Examples
    --------
    >>> dist_ncd_arith('cat', 'hat')
    0.5454545454545454
    >>> dist_ncd_arith('Niall', 'Neil')
    0.6875
    >>> dist_ncd_arith('aluminum', 'Catalan')
    0.8275862068965517
    >>> dist_ncd_arith('ATCG', 'TAGC')
    0.6923076923076923

    """
    comparator = NCDarith()
    return comparator.dist(src, tar, probs)
130
131
132 1
def sim_ncd_arith(src, tar, probs=None):
    """Return the NCD similarity between two strings using arithmetic coding.

    This is a wrapper for :py:meth:`NCDarith.sim`; a fresh
    :py:class:`NCDarith` instance is created on every call. ``sim`` is not
    defined in the class body in this module, so it is presumably inherited
    from ``_Distance``.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    probs : dict
        A dictionary trained with :py:meth:`Arithmetic.train`

    Returns
    -------
    float
        Compression similarity

    Examples
    --------
    >>> sim_ncd_arith('cat', 'hat')
    0.4545454545454546
    >>> sim_ncd_arith('Niall', 'Neil')
    0.3125
    >>> sim_ncd_arith('aluminum', 'Catalan')
    0.1724137931034483
    >>> sim_ncd_arith('ATCG', 'TAGC')
    0.3076923076923077

    """
    comparator = NCDarith()
    return comparator.sim(src, tar, probs)
164
165
166
if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()
170