@@ 275-308 (lines=34) @@ | ||
272 | return 1 - dist_ncd_zlib(src, tar) |
|
273 | ||
274 | ||
def dist_ncd_bz2(src, tar):
    """Return the NCD between two strings using bz2 compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    Both strings are UTF-8 encoded and compressed with the ``bz2_codec``
    codec, individually and concatenated in both orders. The distance is
    the smaller concatenated size minus the smaller individual size,
    divided by the larger individual size. Every size is measured after
    discarding the first 15 bytes of the compressed output.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_bz2('cat', 'hat')
    0.08
    >>> dist_ncd_bz2('Niall', 'Neil')
    0.037037037037037035
    >>> dist_ncd_bz2('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_ncd_bz2('ATCG', 'TAGC')
    0.037037037037037035
    """
    # Equal inputs are at distance zero by definition; nothing is compressed.
    if src == tar:
        return 0.0

    src_bytes = src.encode('utf-8')
    tar_bytes = tar.encode('utf-8')

    def _packed_size(data):
        # Size of the bz2 output once its first 15 bytes are dropped
        # (presumably the fixed stream/block header -- confirm).
        return len(encode(data, 'bz2_codec')[15:])

    src_size = _packed_size(src_bytes)
    tar_size = _packed_size(tar_bytes)
    # Concatenation order can affect the compressed size, so try both
    # orders and keep the smaller result.
    forward_size = _packed_size(src_bytes + tar_bytes)
    reverse_size = _packed_size(tar_bytes + src_bytes)

    smaller, larger = sorted((src_size, tar_size))
    return (min(forward_size, reverse_size) - smaller) / larger
|
309 | ||
310 | ||
311 | def sim_ncd_bz2(src, tar): |
|
@@ 217-250 (lines=34) @@ | ||
214 | return 1 - dist_ncd_bwtrle(src, tar) |
|
215 | ||
216 | ||
def dist_ncd_zlib(src, tar):
    """Return the NCD between two strings using zlib compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    Both strings are UTF-8 encoded and compressed with the ``zlib_codec``
    codec, individually and concatenated in both orders. The distance is
    the smaller concatenated size minus the smaller individual size,
    divided by the larger individual size. Every size is measured after
    discarding the first 2 bytes of the compressed output.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_zlib('cat', 'hat')
    0.3333333333333333
    >>> dist_ncd_zlib('Niall', 'Neil')
    0.45454545454545453
    >>> dist_ncd_zlib('aluminum', 'Catalan')
    0.5714285714285714
    >>> dist_ncd_zlib('ATCG', 'TAGC')
    0.4
    """
    # Equal inputs are at distance zero by definition; nothing is compressed.
    if src == tar:
        return 0.0

    raw_src = src.encode('utf-8')
    raw_tar = tar.encode('utf-8')

    # Compressed sizes, each taken after dropping the first 2 bytes of the
    # zlib output (presumably the stream header -- confirm). Order: the two
    # inputs alone, then the two concatenation orders.
    sizes = [
        len(encode(payload, 'zlib_codec')[2:])
        for payload in (raw_src, raw_tar, raw_src + raw_tar, raw_tar + raw_src)
    ]
    single_sizes = sizes[:2]
    joint_sizes = sizes[2:]

    numerator = min(joint_sizes) - min(single_sizes)
    return numerator / max(single_sizes)
|
251 | ||
252 | ||
253 | def sim_ncd_zlib(src, tar): |