Code Duplication    Length = 34-34 lines in 2 locations

abydos/distance/_compression.py 2 locations

@@ 275-308 (lines=34) @@
272
    return 1 - dist_ncd_zlib(src, tar)
273
274
275
def dist_ncd_bz2(src, tar):
    """Return the NCD between two strings using bz2 compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    Both strings are UTF-8 encoded and then compressed with the
    ``bz2_codec`` codec: each on its own, and concatenated in both orders.
    The first 15 bytes of every compressed result are discarded before its
    length is measured. Identical inputs short-circuit to ``0.0`` without
    compressing anything.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_bz2('cat', 'hat')
    0.08
    >>> dist_ncd_bz2('Niall', 'Neil')
    0.037037037037037035
    >>> dist_ncd_bz2('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_ncd_bz2('ATCG', 'TAGC')
    0.037037037037037035
    """
    if src == tar:
        return 0.0

    def _comp_len(data):
        # Length of the bz2 output once its leading 15 bytes are dropped.
        return len(encode(data, 'bz2_codec')[15:])

    src = src.encode('utf-8')
    tar = tar.encode('utf-8')

    src_len = _comp_len(src)
    tar_len = _comp_len(tar)
    # The compressed size depends on concatenation order, so both orders
    # are tried and the smaller result is kept.
    concat_len = min(_comp_len(src + tar), _comp_len(tar + src))

    return (concat_len - min(src_len, tar_len)) / max(src_len, tar_len)
309
310
311
def sim_ncd_bz2(src, tar):
@@ 217-250 (lines=34) @@
214
    return 1 - dist_ncd_bwtrle(src, tar)
215
216
217
def dist_ncd_zlib(src, tar):
    """Return the NCD between two strings using zlib compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    Both strings are UTF-8 encoded and then compressed with the
    ``zlib_codec`` codec: each on its own, and concatenated in both orders.
    The first 2 bytes of every compressed result are discarded before its
    length is measured. Identical inputs short-circuit to ``0.0`` without
    compressing anything.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_zlib('cat', 'hat')
    0.3333333333333333
    >>> dist_ncd_zlib('Niall', 'Neil')
    0.45454545454545453
    >>> dist_ncd_zlib('aluminum', 'Catalan')
    0.5714285714285714
    >>> dist_ncd_zlib('ATCG', 'TAGC')
    0.4
    """
    if src == tar:
        return 0.0

    def _comp_len(data):
        # Length of the zlib output once its leading 2 bytes are dropped.
        return len(encode(data, 'zlib_codec')[2:])

    src = src.encode('utf-8')
    tar = tar.encode('utf-8')

    src_len = _comp_len(src)
    tar_len = _comp_len(tar)
    # The compressed size depends on concatenation order, so both orders
    # are tried and the smaller result is kept.
    concat_len = min(_comp_len(src + tar), _comp_len(tar + src))

    return (concat_len - min(src_len, tar_len)) / max(src_len, tar_len)
251
252
253
def sim_ncd_zlib(src, tar):