Completed
Branch master (78a222)
by Chris
14:36
created

abydos.distance._compression.dist_ncd_rle()   A

Complexity

Conditions 2

Size

Total Lines 33
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 33
ccs 8
cts 8
cp 1
rs 9.85
c 0
b 0
f 0
cc 2
nop 3
crap 2
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance.compression.
20
21
The distance.compression module implements compression distance measures.
22
"""
23
24 1
from __future__ import division, unicode_literals
25
26 1
from codecs import encode
27
28 1
from ..compression import ac_encode, ac_train, rle_encode
29
30 1
try:
31 1
    import lzma
32
except ImportError:  # pragma: no cover
33
    # If the system lacks the lzma library, that's fine, but lzma compression
34
    # similarity won't be supported.
35
    lzma = None
36
37 1
__all__ = [
38
    'dist_ncd_arith',
39
    'dist_ncd_bwtrle',
40
    'dist_ncd_bz2',
41
    'dist_ncd_lzma',
42
    'dist_ncd_rle',
43
    'dist_ncd_zlib',
44
    'sim_ncd_arith',
45
    'sim_ncd_bwtrle',
46
    'sim_ncd_bz2',
47
    'sim_ncd_lzma',
48
    'sim_ncd_rle',
49
    'sim_ncd_zlib',
50
]
51
52
53 1
def dist_ncd_arith(src, tar, probs=None):
54
    """Return the NCD between two strings using arithmetic coding.
55
56
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
57
58
    :param str src: source string for comparison
59
    :param str tar: target string for comparison
60
    :param dict probs: a dictionary trained with arithmetic.train (for the
61
        arith compressor only)
62
    :returns: compression distance
63
    :rtype: float
64
65
    >>> dist_ncd_arith('cat', 'hat')
66
    0.5454545454545454
67
    >>> dist_ncd_arith('Niall', 'Neil')
68
    0.6875
69
    >>> dist_ncd_arith('aluminum', 'Catalan')
70
    0.8275862068965517
71
    >>> dist_ncd_arith('ATCG', 'TAGC')
72
    0.6923076923076923
73
    """
74 1
    if src == tar:
75 1
        return 0.0
76
77 1
    if probs is None:
78
        # lacking a reasonable dictionary, train on the strings themselves
79 1
        probs = ac_train(src + tar)
80 1
    src_comp = ac_encode(src, probs)[1]
81 1
    tar_comp = ac_encode(tar, probs)[1]
82 1
    concat_comp = ac_encode(src + tar, probs)[1]
83 1
    concat_comp2 = ac_encode(tar + src, probs)[1]
84
85 1
    return (min(concat_comp, concat_comp2) - min(src_comp, tar_comp)) / max(
86
        src_comp, tar_comp
87
    )
88
89
90 1
def sim_ncd_arith(src, tar, probs=None):
91
    """Return the NCD similarity between two strings using arithmetic coding.
92
93
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
94
95
    :param str src: source string for comparison
96
    :param str tar: target string for comparison
97
    :param dict probs: a dictionary trained with ac_train (for the
98
        arith compressor only)
99
    :returns: compression similarity
100
    :rtype: float
101
102
    >>> sim_ncd_arith('cat', 'hat')
103
    0.4545454545454546
104
    >>> sim_ncd_arith('Niall', 'Neil')
105
    0.3125
106
    >>> sim_ncd_arith('aluminum', 'Catalan')
107
    0.1724137931034483
108
    >>> sim_ncd_arith('ATCG', 'TAGC')
109
    0.3076923076923077
110
    """
111 1
    return 1 - dist_ncd_arith(src, tar, probs)
112
113
114 1
def dist_ncd_rle(src, tar, use_bwt=False):
115
    """Return the NCD between two strings using RLE.
116
117
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
118
119
    :param str src: source string for comparison
120
    :param str tar: target string for comparison
121
    :param bool use_bwt: boolean indicating whether to perform BWT encoding
122
        before RLE encoding
123
    :returns: compression distance
124
    :rtype: float
125
126
    >>> dist_ncd_rle('cat', 'hat')
127
    1.0
128
    >>> dist_ncd_rle('Niall', 'Neil')
129
    1.0
130
    >>> dist_ncd_rle('aluminum', 'Catalan')
131
    1.0
132
    >>> dist_ncd_rle('ATCG', 'TAGC')
133
    1.0
134
    """
135 1
    if src == tar:
136 1
        return 0.0
137
138 1
    src_comp = rle_encode(src, use_bwt)
139 1
    tar_comp = rle_encode(tar, use_bwt)
140 1
    concat_comp = rle_encode(src + tar, use_bwt)
141 1
    concat_comp2 = rle_encode(tar + src, use_bwt)
142
143 1
    return (
144
        min(len(concat_comp), len(concat_comp2))
145
        - min(len(src_comp), len(tar_comp))
146
    ) / max(len(src_comp), len(tar_comp))
147
148
149 1
def sim_ncd_rle(src, tar, use_bwt=False):
150
    """Return the NCD similarity between two strings using RLE.
151
152
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
153
154
    :param str src: source string for comparison
155
    :param str tar: target string for comparison
156
    :param bool use_bwt: boolean indicating whether to perform BWT encoding
157
        before RLE encoding
158
    :returns: compression similarity
159
    :rtype: float
160
161
    >>> sim_ncd_rle('cat', 'hat')
162
    0.0
163
    >>> sim_ncd_rle('Niall', 'Neil')
164
    0.0
165
    >>> sim_ncd_rle('aluminum', 'Catalan')
166
    0.0
167
    >>> sim_ncd_rle('ATCG', 'TAGC')
168
    0.0
169
    """
170 1
    return 1 - dist_ncd_rle(src, tar, use_bwt)
171
172
173 1
def dist_ncd_bwtrle(src, tar):
174
    """Return the NCD between two strings using BWT plus RLE.
175
176
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
177
178
    :param str src: source string for comparison
179
    :param str tar: target string for comparison
180
    :returns: compression distance
181
    :rtype: float
182
183
    >>> dist_ncd_bwtrle('cat', 'hat')
184
    0.75
185
    >>> dist_ncd_bwtrle('Niall', 'Neil')
186
    0.8333333333333334
187
    >>> dist_ncd_bwtrle('aluminum', 'Catalan')
188
    1.0
189
    >>> dist_ncd_bwtrle('ATCG', 'TAGC')
190
    0.8
191
    """
192 1
    return dist_ncd_rle(src, tar, True)
193
194
195 1
def sim_ncd_bwtrle(src, tar):
196
    """Return the NCD similarity between two strings using BWT plus RLE.
197
198
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
199
200
    :param str src: source string for comparison
201
    :param str tar: target string for comparison
202
    :returns: compression similarity
203
    :rtype: float
204
205
    >>> sim_ncd_bwtrle('cat', 'hat')
206
    0.25
207
    >>> sim_ncd_bwtrle('Niall', 'Neil')
208
    0.16666666666666663
209
    >>> sim_ncd_bwtrle('aluminum', 'Catalan')
210
    0.0
211
    >>> sim_ncd_bwtrle('ATCG', 'TAGC')
212
    0.19999999999999996
213
    """
214 1
    return 1 - dist_ncd_bwtrle(src, tar)
215
216
217 1 View Code Duplication
def dist_ncd_zlib(src, tar):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
218
    """Return the NCD between two strings using zlib compression.
219
220
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
221
222
    :param str src: source string for comparison
223
    :param str tar: target string for comparison
224
    :returns: compression distance
225
    :rtype: float
226
227
    >>> dist_ncd_zlib('cat', 'hat')
228
    0.3333333333333333
229
    >>> dist_ncd_zlib('Niall', 'Neil')
230
    0.45454545454545453
231
    >>> dist_ncd_zlib('aluminum', 'Catalan')
232
    0.5714285714285714
233
    >>> dist_ncd_zlib('ATCG', 'TAGC')
234
    0.4
235
    """
236 1
    if src == tar:
237 1
        return 0.0
238
239 1
    src = src.encode('utf-8')
240 1
    tar = tar.encode('utf-8')
241
242 1
    src_comp = encode(src, 'zlib_codec')[2:]
243 1
    tar_comp = encode(tar, 'zlib_codec')[2:]
244 1
    concat_comp = encode(src + tar, 'zlib_codec')[2:]
245 1
    concat_comp2 = encode(tar + src, 'zlib_codec')[2:]
246
247 1
    return (
248
        min(len(concat_comp), len(concat_comp2))
249
        - min(len(src_comp), len(tar_comp))
250
    ) / max(len(src_comp), len(tar_comp))
251
252
253 1
def sim_ncd_zlib(src, tar):
254
    """Return the NCD similarity between two strings using zlib compression.
255
256
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
257
258
    :param str src: source string for comparison
259
    :param str tar: target string for comparison
260
    :returns: compression similarity
261
    :rtype: float
262
263
    >>> sim_ncd_zlib('cat', 'hat')
264
    0.6666666666666667
265
    >>> sim_ncd_zlib('Niall', 'Neil')
266
    0.5454545454545454
267
    >>> sim_ncd_zlib('aluminum', 'Catalan')
268
    0.4285714285714286
269
    >>> sim_ncd_zlib('ATCG', 'TAGC')
270
    0.6
271
    """
272 1
    return 1 - dist_ncd_zlib(src, tar)
273
274
275 1 View Code Duplication
def dist_ncd_bz2(src, tar):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
276
    """Return the NCD between two strings using bz2 compression.
277
278
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
279
280
    :param str src: source string for comparison
281
    :param str tar: target string for comparison
282
    :returns: compression distance
283
    :rtype: float
284
285
    >>> dist_ncd_bz2('cat', 'hat')
286
    0.08
287
    >>> dist_ncd_bz2('Niall', 'Neil')
288
    0.037037037037037035
289
    >>> dist_ncd_bz2('aluminum', 'Catalan')
290
    0.20689655172413793
291
    >>> dist_ncd_bz2('ATCG', 'TAGC')
292
    0.037037037037037035
293
    """
294 1
    if src == tar:
295 1
        return 0.0
296
297 1
    src = src.encode('utf-8')
298 1
    tar = tar.encode('utf-8')
299
300 1
    src_comp = encode(src, 'bz2_codec')[15:]
301 1
    tar_comp = encode(tar, 'bz2_codec')[15:]
302 1
    concat_comp = encode(src + tar, 'bz2_codec')[15:]
303 1
    concat_comp2 = encode(tar + src, 'bz2_codec')[15:]
304
305 1
    return (
306
        min(len(concat_comp), len(concat_comp2))
307
        - min(len(src_comp), len(tar_comp))
308
    ) / max(len(src_comp), len(tar_comp))
309
310
311 1
def sim_ncd_bz2(src, tar):
312
    """Return the NCD similarity between two strings using bz2 compression.
313
314
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
315
316
    :param str src: source string for comparison
317
    :param str tar: target string for comparison
318
    :returns: compression similarity
319
    :rtype: float
320
321
    >>> sim_ncd_bz2('cat', 'hat')
322
    0.92
323
    >>> sim_ncd_bz2('Niall', 'Neil')
324
    0.962962962962963
325
    >>> sim_ncd_bz2('aluminum', 'Catalan')
326
    0.7931034482758621
327
    >>> sim_ncd_bz2('ATCG', 'TAGC')
328
    0.962962962962963
329
    """
330 1
    return 1 - dist_ncd_bz2(src, tar)
331
332
333 1
def dist_ncd_lzma(src, tar):
334
    """Return the NCD between two strings using lzma compression.
335
336
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
337
338
    :param str src: source string for comparison
339
    :param str tar: target string for comparison
340
    :returns: compression distance
341
    :rtype: float
342
343
    >>> dist_ncd_lzma('cat', 'hat')
344
    0.08695652173913043
345
    >>> dist_ncd_lzma('Niall', 'Neil')
346
    0.16
347
    >>> dist_ncd_lzma('aluminum', 'Catalan')
348
    0.16
349
    >>> dist_ncd_lzma('ATCG', 'TAGC')
350
    0.08695652173913043
351
    """
352
    if src == tar:
353
        return 0.0
354
355
    src = src.encode('utf-8')
356
    tar = tar.encode('utf-8')
357
358
    if lzma is not None:
359
        src_comp = lzma.compress(src)[14:]
360
        tar_comp = lzma.compress(tar)[14:]
361
        concat_comp = lzma.compress(src + tar)[14:]
362
        concat_comp2 = lzma.compress(tar + src)[14:]
363
    else:  # pragma: no cover
364
        raise ValueError(
365
            'Install the PylibLZMA module in order to use lzma '
366
            + 'compression similarity'
367
        )
368
369
    return (
370
        min(len(concat_comp), len(concat_comp2))
371
        - min(len(src_comp), len(tar_comp))
372
    ) / max(len(src_comp), len(tar_comp))
373
374
375 1
def sim_ncd_lzma(src, tar):
376
    """Return the NCD similarity between two strings using lzma compression.
377
378
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
379
380
    :param str src: source string for comparison
381
    :param str tar: target string for comparison
382
    :returns: compression similarity
383
    :rtype: float
384
385
    >>> sim_ncd_lzma('cat', 'hat')
386
    0.9130434782608696
387
    >>> sim_ncd_lzma('Niall', 'Neil')
388
    0.84
389
    >>> sim_ncd_lzma('aluminum', 'Catalan')
390
    0.84
391
    >>> sim_ncd_lzma('ATCG', 'TAGC')
392
    0.9130434782608696
393
    """
394
    return 1 - dist_ncd_lzma(src, tar)
395
396
397
if __name__ == '__main__':
398
    import doctest
399
400
    doctest.testmod()
401