Completed
Branch master (87ccc1)
by Chris
08:42
created

abydos.distance.compression.dist_ncd_rle()   A

Complexity

Conditions 2

Size

Total Lines 32
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 10
nop 3
dl 0
loc 32
rs 9.9
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.distance.compression.
20
21
The distance.compression module implements compression distance measures.
22
"""
23
24
from __future__ import division, unicode_literals
25
26
from codecs import encode
27
from sys import modules
28
29
from ..compression import arithmetic, rle
30
31
try:
32
    import lzma
33
except ImportError:  # pragma: no cover
34
    # If the system lacks the lzma library, that's fine, but lzma compression
35
    # similarity won't be supported.
36
    lzma = None
37
38
__all__ = ['dist_ncd_arith', 'dist_ncd_bwtrle', 'dist_ncd_bz2',
39
           'dist_ncd_lzma', 'dist_ncd_rle', 'dist_ncd_zlib', 'sim_ncd_arith',
40
           'sim_ncd_bwtrle', 'sim_ncd_bz2', 'sim_ncd_lzma', 'sim_ncd_rle',
41
           'sim_ncd_zlib']
42
43
44
def dist_ncd_arith(src, tar, probs=None):
45
    """Return the NCD between two strings using arithmetic coding.
46
47
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
48
49
    :param str src: source string for comparison
50
    :param str tar: target string for comparison
51
    :param dict probs: a dictionary trained with arithmetic.train (for the
52
        arith compressor only)
53
    :returns: compression distance
54
    :rtype: float
55
56
    >>> dist_ncd_arith('cat', 'hat')
57
    0.5454545454545454
58
    >>> dist_ncd_arith('Niall', 'Neil')
59
    0.6875
60
    >>> dist_ncd_arith('aluminum', 'Catalan')
61
    0.8275862068965517
62
    >>> dist_ncd_arith('ATCG', 'TAGC')
63
    0.6923076923076923
64
    """
65
    if src == tar:
66
        return 0.0
67
68
    if probs is None:
69
        # lacking a reasonable dictionary, train on the strings themselves
70
        probs = arithmetic.train(src + tar)
71
    src_comp = arithmetic.encode(src, probs)[1]
72
    tar_comp = arithmetic.encode(tar, probs)[1]
73
    concat_comp = arithmetic.encode(src + tar, probs)[1]
74
    concat_comp2 = arithmetic.encode(tar + src, probs)[1]
75
76
    return ((min(concat_comp, concat_comp2) - min(src_comp, tar_comp)) /
77
            max(src_comp, tar_comp))
78
79
80
def sim_ncd_arith(src, tar, probs=None):
81
    """Return the NCD similarity between two strings using arithmetic coding.
82
83
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
84
85
    :param str src: source string for comparison
86
    :param str tar: target string for comparison
87
    :param dict probs: a dictionary trained with arithmetic.train (for the
88
        arith compressor only)
89
    :returns: compression similarity
90
    :rtype: float
91
92
    >>> sim_ncd_arith('cat', 'hat')
93
    0.4545454545454546
94
    >>> sim_ncd_arith('Niall', 'Neil')
95
    0.3125
96
    >>> sim_ncd_arith('aluminum', 'Catalan')
97
    0.1724137931034483
98
    >>> sim_ncd_arith('ATCG', 'TAGC')
99
    0.3076923076923077
100
    """
101
    return 1-dist_ncd_arith(src, tar, probs)
102
103
104
def dist_ncd_rle(src, tar, use_bwt=False):
105
    """Return the NCD between two strings using RLE.
106
107
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
108
109
    :param str src: source string for comparison
110
    :param str tar: target string for comparison
111
    :param bool use_bwt: boolean indicating whether to perform BWT encoding
112
        before RLE encoding
113
    :returns: compression distance
114
    :rtype: float
115
116
    >>> dist_ncd_rle('cat', 'hat')
117
    1.0
118
    >>> dist_ncd_rle('Niall', 'Neil')
119
    1.0
120
    >>> dist_ncd_rle('aluminum', 'Catalan')
121
    1.0
122
    >>> dist_ncd_rle('ATCG', 'TAGC')
123
    1.0
124
    """
125
    if src == tar:
126
        return 0.0
127
128
    src_comp = rle.encode(src, use_bwt)
129
    tar_comp = rle.encode(tar, use_bwt)
130
    concat_comp = rle.encode(src + tar, use_bwt)
131
    concat_comp2 = rle.encode(tar + src, use_bwt)
132
133
    return ((min(len(concat_comp), len(concat_comp2)) -
134
             min(len(src_comp), len(tar_comp))) /
135
            max(len(src_comp), len(tar_comp)))
136
137
138
def sim_ncd_rle(src, tar, use_bwt=False):
139
    """Return the NCD similarity between two strings using RLE.
140
141
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
142
143
    :param str src: source string for comparison
144
    :param str tar: target string for comparison
145
    :param bool use_bwt: boolean indicating whether to perform BWT encoding
146
        before RLE encoding
147
    :returns: compression similarity
148
    :rtype: float
149
150
    >>> sim_ncd_rle('cat', 'hat')
151
    0.0
152
    >>> sim_ncd_rle('Niall', 'Neil')
153
    0.0
154
    >>> sim_ncd_rle('aluminum', 'Catalan')
155
    0.0
156
    >>> sim_ncd_rle('ATCG', 'TAGC')
157
    0.0
158
    """
159
    return 1 - dist_ncd_rle(src, tar, use_bwt)
160
161
162
def dist_ncd_bwtrle(src, tar):
163
    """Return the NCD between two strings using BWT plus RLE.
164
165
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
166
167
    :param str src: source string for comparison
168
    :param str tar: target string for comparison
169
    :returns: compression distance
170
    :rtype: float
171
172
    >>> dist_ncd_bwtrle('cat', 'hat')
173
    0.75
174
    >>> dist_ncd_bwtrle('Niall', 'Neil')
175
    0.8333333333333334
176
    >>> dist_ncd_bwtrle('aluminum', 'Catalan')
177
    1.0
178
    >>> dist_ncd_bwtrle('ATCG', 'TAGC')
179
    0.8
180
    """
181
    return dist_ncd_rle(src, tar, True)
182
183
184
def sim_ncd_bwtrle(src, tar):
185
    """Return the NCD similarity between two strings using BWT plus RLE.
186
187
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
188
189
    :param str src: source string for comparison
190
    :param str tar: target string for comparison
191
    :returns: compression similarity
192
    :rtype: float
193
194
    >>> sim_ncd_bwtrle('cat', 'hat')
195
    0.25
196
    >>> sim_ncd_bwtrle('Niall', 'Neil')
197
    0.16666666666666663
198
    >>> sim_ncd_bwtrle('aluminum', 'Catalan')
199
    0.0
200
    >>> sim_ncd_bwtrle('ATCG', 'TAGC')
201
    0.19999999999999996
202
    """
203
    return 1 - dist_ncd_bwtrle(src, tar)
204
205
206 View Code Duplication
def dist_ncd_zlib(src, tar):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
207
    """Return the NCD between two strings using zlib compression.
208
209
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
210
211
    :param str src: source string for comparison
212
    :param str tar: target string for comparison
213
    :returns: compression distance
214
    :rtype: float
215
216
    >>> dist_ncd_zlib('cat', 'hat')
217
    0.3333333333333333
218
    >>> dist_ncd_zlib('Niall', 'Neil')
219
    0.45454545454545453
220
    >>> dist_ncd_zlib('aluminum', 'Catalan')
221
    0.5714285714285714
222
    >>> dist_ncd_zlib('ATCG', 'TAGC')
223
    0.4
224
    """
225
    if src == tar:
226
        return 0.0
227
228
    src = src.encode('utf-8')
229
    tar = tar.encode('utf-8')
230
231
    src_comp = encode(src, 'zlib_codec')[2:]
232
    tar_comp = encode(tar, 'zlib_codec')[2:]
233
    concat_comp = encode(src + tar, 'zlib_codec')[2:]
234
    concat_comp2 = encode(tar + src, 'zlib_codec')[2:]
235
236
    return ((min(len(concat_comp), len(concat_comp2)) -
237
             min(len(src_comp), len(tar_comp))) /
238
            max(len(src_comp), len(tar_comp)))
239
240
241
def sim_ncd_zlib(src, tar):
242
    """Return the NCD similarity between two strings using zlib compression.
243
244
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
245
246
    :param str src: source string for comparison
247
    :param str tar: target string for comparison
248
    :returns: compression similarity
249
    :rtype: float
250
251
    >>> sim_ncd_zlib('cat', 'hat')
252
    0.6666666666666667
253
    >>> sim_ncd_zlib('Niall', 'Neil')
254
    0.5454545454545454
255
    >>> sim_ncd_zlib('aluminum', 'Catalan')
256
    0.4285714285714286
257
    >>> sim_ncd_zlib('ATCG', 'TAGC')
258
    0.6
259
    """
260
    return 1 - dist_ncd_zlib(src, tar)
261
262
263 View Code Duplication
def dist_ncd_bz2(src, tar):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
264
    """Return the NCD between two strings using bz2 compression.
265
266
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
267
268
    :param str src: source string for comparison
269
    :param str tar: target string for comparison
270
    :returns: compression distance
271
    :rtype: float
272
273
    >>> dist_ncd_bz2('cat', 'hat')
274
    0.08
275
    >>> dist_ncd_bz2('Niall', 'Neil')
276
    0.037037037037037035
277
    >>> dist_ncd_bz2('aluminum', 'Catalan')
278
    0.20689655172413793
279
    >>> dist_ncd_bz2('ATCG', 'TAGC')
280
    0.037037037037037035
281
    """
282
    if src == tar:
283
        return 0.0
284
285
    src = src.encode('utf-8')
286
    tar = tar.encode('utf-8')
287
288
    src_comp = encode(src, 'bz2_codec')[15:]
289
    tar_comp = encode(tar, 'bz2_codec')[15:]
290
    concat_comp = encode(src + tar, 'bz2_codec')[15:]
291
    concat_comp2 = encode(tar + src, 'bz2_codec')[15:]
292
293
    return ((min(len(concat_comp), len(concat_comp2)) -
294
             min(len(src_comp), len(tar_comp))) /
295
            max(len(src_comp), len(tar_comp)))
296
297
298
def sim_ncd_bz2(src, tar):
299
    """Return the NCD similarity between two strings using bz2 compression.
300
301
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
302
303
    :param str src: source string for comparison
304
    :param str tar: target string for comparison
305
    :returns: compression similarity
306
    :rtype: float
307
308
    >>> sim_ncd_bz2('cat', 'hat')
309
    0.92
310
    >>> sim_ncd_bz2('Niall', 'Neil')
311
    0.962962962962963
312
    >>> sim_ncd_bz2('aluminum', 'Catalan')
313
    0.7931034482758621
314
    >>> sim_ncd_bz2('ATCG', 'TAGC')
315
    0.962962962962963
316
    """
317
    return 1 - dist_ncd_bz2(src, tar)
318
319
320
def dist_ncd_lzma(src, tar):
321
    """Return the NCD between two strings using lzma compression.
322
323
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
324
325
    :param str src: source string for comparison
326
    :param str tar: target string for comparison
327
    :returns: compression distance
328
    :rtype: float
329
330
    >>> dist_ncd_lzma('cat', 'hat')
331
    0.08695652173913043
332
    >>> dist_ncd_lzma('Niall', 'Neil')
333
    0.16
334
    >>> dist_ncd_lzma('aluminum', 'Catalan')
335
    0.16
336
    >>> dist_ncd_lzma('ATCG', 'TAGC')
337
    0.08695652173913043
338
    """
339
    if src == tar:
340
        return 0.0
341
342
    src = src.encode('utf-8')
343
    tar = tar.encode('utf-8')
344
345
    if 'lzma' in modules:
346
        src_comp = lzma.compress(src)[14:]
347
        tar_comp = lzma.compress(tar)[14:]
348
        concat_comp = lzma.compress(src + tar)[14:]
349
        concat_comp2 = lzma.compress(tar + src)[14:]
350
    else:
351
        raise ValueError('Install the PylibLZMA module in order to use lzma ' +
352
                         'compression similarity')
353
354
    return ((min(len(concat_comp), len(concat_comp2)) -
355
             min(len(src_comp), len(tar_comp))) /
356
            max(len(src_comp), len(tar_comp)))
357
358
359
def sim_ncd_lzma(src, tar):
360
    """Return the NCD similarity between two strings using lzma compression.
361
362
    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.
363
364
    :param str src: source string for comparison
365
    :param str tar: target string for comparison
366
    :returns: compression similarity
367
    :rtype: float
368
369
    >>> sim_ncd_lzma('cat', 'hat')
370
    0.9130434782608696
371
    >>> sim_ncd_lzma('Niall', 'Neil')
372
    0.84
373
    >>> sim_ncd_lzma('aluminum', 'Catalan')
374
    0.84
375
    >>> sim_ncd_lzma('ATCG', 'TAGC')
376
    0.9130434782608696
377
    """
378
    return 1 - dist_ncd_lzma(src, tar)
379
380
381
if __name__ == '__main__':
382
    import doctest
383
    doctest.testmod()
384