|
@@ 275-308 (lines=34) @@
|
| 272 |
|
return 1 - dist_ncd_zlib(src, tar) |
| 273 |
|
|
| 274 |
|
|
| 275 |
|
def dist_ncd_bz2(src, tar):
    """Compute the bz2-based normalized compression distance of two strings.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, using the
    ``bz2_codec`` transform as the compressor. Both strings are UTF-8
    encoded; each string and both concatenation orders are compressed, and
    the first 15 bytes of every compressed stream are dropped before its
    length is measured.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_bz2('cat', 'hat')
    0.08
    >>> dist_ncd_bz2('Niall', 'Neil')
    0.037037037037037035
    >>> dist_ncd_bz2('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_ncd_bz2('ATCG', 'TAGC')
    0.037037037037037035
    """
    # Equal inputs short-circuit to a distance of exactly zero.
    if src == tar:
        return 0.0

    def _stripped_size(payload):
        # Length of the compressed payload without its first 15 bytes
        # (presumably the fixed leading part of the bz2 stream).
        return len(encode(payload, 'bz2_codec')[15:])

    src_bytes = src.encode('utf-8')
    tar_bytes = tar.encode('utf-8')

    src_size = _stripped_size(src_bytes)
    tar_size = _stripped_size(tar_bytes)
    forward_size = _stripped_size(src_bytes + tar_bytes)
    backward_size = _stripped_size(tar_bytes + src_bytes)

    # Use whichever concatenation order compresses better.
    best_joint = min(forward_size, backward_size)
    smaller = min(src_size, tar_size)
    larger = max(src_size, tar_size)

    return (best_joint - smaller) / larger
| 309 |
|
|
| 310 |
|
|
| 311 |
|
def sim_ncd_bz2(src, tar): |
|
@@ 217-250 (lines=34) @@
|
| 214 |
|
return 1 - dist_ncd_bwtrle(src, tar) |
| 215 |
|
|
| 216 |
|
|
| 217 |
|
def dist_ncd_zlib(src, tar):
    """Compute the zlib-based normalized compression distance of two strings.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`, using the
    ``zlib_codec`` transform as the compressor. Both strings are UTF-8
    encoded; each string and both concatenation orders are compressed, and
    the first 2 bytes of every compressed stream are dropped before its
    length is measured.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_zlib('cat', 'hat')
    0.3333333333333333
    >>> dist_ncd_zlib('Niall', 'Neil')
    0.45454545454545453
    >>> dist_ncd_zlib('aluminum', 'Catalan')
    0.5714285714285714
    >>> dist_ncd_zlib('ATCG', 'TAGC')
    0.4
    """
    # Equal inputs short-circuit to a distance of exactly zero.
    if src == tar:
        return 0.0

    def _stripped_size(payload):
        # Length of the compressed payload without its first 2 bytes
        # (presumably the zlib stream header).
        return len(encode(payload, 'zlib_codec')[2:])

    src_bytes = src.encode('utf-8')
    tar_bytes = tar.encode('utf-8')

    src_size = _stripped_size(src_bytes)
    tar_size = _stripped_size(tar_bytes)
    forward_size = _stripped_size(src_bytes + tar_bytes)
    backward_size = _stripped_size(tar_bytes + src_bytes)

    # Use whichever concatenation order compresses better.
    best_joint = min(forward_size, backward_size)
    smaller = min(src_size, tar_size)
    larger = max(src_size, tar_size)

    return (best_joint - smaller) / larger
| 251 |
|
|
| 252 |
|
|
| 253 |
|
def sim_ncd_zlib(src, tar): |