abydos.distance._ncd_lzma   A
last analyzed

Complexity

Total Complexity 3

Size/Duplication

Total Lines 113
Duplicated Lines 0 %

Test Coverage

Coverage 50%

Importance

Changes 0
Metric Value
wmc 3
eloc 26
dl 0
loc 113
ccs 10
cts 20
cp 0.5
rs 10
c 0
b 0
f 0

2 Methods

Rating   Name   Duplication   Size   Complexity  
A NCDlzma.dist() 0 48 2
A NCDlzma.__init__() 0 14 1
1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._ncd_lzma.

Normalized compression distance (NCD) using LZMA compression
"""
21
22
import lzma
23
24 1
from typing import Any
25
26
from ._distance import _Distance
27
28
29
# Names exported by ``from ... import *``; only the distance class is public.
__all__ = ['NCDlzma']
30
31 1
32
class NCDlzma(_Distance):
    """Normalized Compression Distance using LZMA compression.

    Cf. https://en.wikipedia.org/wiki/Lempel-Ziv-Markov_chain_algorithm

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    .. versionadded:: 0.3.6
    """

    # Class-level fallback for the preset; __init__ overwrites it per instance.
    _level = 6

    def __init__(self, level: int = 6, **kwargs: Any) -> None:
        """Initialize LZMA compressor.

        Parameters
        ----------
        level : int
            The compression level (0 to 9), passed to ``lzma.compress`` as
            its ``preset`` argument. Defaults to 6.
        **kwargs
            Arbitrary keyword arguments, forwarded unchanged to the parent
            class initializer.


        .. versionadded:: 0.5.0

        """
        super().__init__(**kwargs)
        self._level = level

    def dist(self, src: str, tar: str) -> float:
        """Return the NCD between two strings using LZMA compression.

        With ``C(x)`` denoting the measured compressed size of ``x``, the
        value returned is
        ``(min(C(src+tar), C(tar+src)) - min(C(src), C(tar)))
        / max(C(src), C(tar))``. Identical inputs short-circuit to 0.0
        without compressing anything.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Compression distance

        Examples
        --------
        >>> cmp = NCDlzma()
        >>> cmp.dist('cat', 'hat')
        0.08695652173913043
        >>> cmp.dist('Niall', 'Neil')
        0.16
        >>> cmp.dist('aluminum', 'Catalan')
        0.16
        >>> cmp.dist('ATCG', 'TAGC')
        0.08695652173913043


        .. versionadded:: 0.3.5
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if src == tar:
            return 0.0

        def _packed_size(payload: bytes) -> int:
            # The first 14 bytes of every compressed stream are discarded
            # before measuring.
            # NOTE(review): presumably this drops fixed container-header
            # overhead shared by all outputs -- confirm against the XZ format.
            return len(lzma.compress(payload, preset=self._level)[14:])

        src_bytes = src.encode('utf-8')
        tar_bytes = tar.encode('utf-8')

        # Sizes of each input compressed on its own.
        src_size = _packed_size(src_bytes)
        tar_size = _packed_size(tar_bytes)

        # Both concatenation orders are tried and the smaller result kept,
        # so the outcome does not depend on argument order.
        forward_size = _packed_size(src_bytes + tar_bytes)
        backward_size = _packed_size(tar_bytes + src_bytes)
        joint_size = min(forward_size, backward_size)

        smaller, larger = sorted((src_size, tar_size))
        return (joint_size - smaller) / larger
107
108
109
if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()
113