|
@@ 263-295 (lines=33) @@
|
| 260 |
|
return 1 - dist_ncd_zlib(src, tar) |
| 261 |
|
|
| 262 |
|
|
| 263 |
|
def dist_ncd_bz2(src, tar):
    """Return the NCD between two strings using bz2 compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    Both strings are UTF-8 encoded, compressed individually and in both
    concatenation orders, and the compressed lengths are combined as
    ``(min(C(xy), C(yx)) - min(C(x), C(y))) / max(C(x), C(y))``.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_bz2('cat', 'hat')
    0.08
    >>> dist_ncd_bz2('Niall', 'Neil')
    0.037037037037037035
    >>> dist_ncd_bz2('aluminum', 'Catalan')
    0.20689655172413793
    >>> dist_ncd_bz2('ATCG', 'TAGC')
    0.037037037037037035
    """
    # Identical inputs short-circuit to zero without compressing anything.
    if src == tar:
        return 0.0

    src = src.encode('utf-8')
    tar = tar.encode('utf-8')

    # The first 15 bytes of every compressed stream are dropped before the
    # lengths are measured (presumably to discard the leading stream/block
    # header bytes -- confirm against the bz2 format if this is changed).
    src_comp = encode(src, 'bz2_codec')[15:]
    tar_comp = encode(tar, 'bz2_codec')[15:]
    concat_comp = encode(src + tar, 'bz2_codec')[15:]
    concat_comp2 = encode(tar + src, 'bz2_codec')[15:]

    # Taking the smaller of the two concatenation orders keeps the result
    # symmetric in src and tar.
    return ((min(len(concat_comp), len(concat_comp2)) -
             min(len(src_comp), len(tar_comp))) /
            max(len(src_comp), len(tar_comp)))
| 296 |
|
|
| 297 |
|
|
| 298 |
|
def sim_ncd_bz2(src, tar): |
|
@@ 206-238 (lines=33) @@
|
| 203 |
|
return 1 - dist_ncd_bwtrle(src, tar) |
| 204 |
|
|
| 205 |
|
|
| 206 |
|
def dist_ncd_zlib(src, tar):
    """Return the NCD between two strings using zlib compression.

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    Both strings are UTF-8 encoded, compressed individually and in both
    concatenation orders, and the compressed lengths are combined as
    ``(min(C(xy), C(yx)) - min(C(x), C(y))) / max(C(x), C(y))``.

    :param str src: source string for comparison
    :param str tar: target string for comparison
    :returns: compression distance
    :rtype: float

    >>> dist_ncd_zlib('cat', 'hat')
    0.3333333333333333
    >>> dist_ncd_zlib('Niall', 'Neil')
    0.45454545454545453
    >>> dist_ncd_zlib('aluminum', 'Catalan')
    0.5714285714285714
    >>> dist_ncd_zlib('ATCG', 'TAGC')
    0.4
    """
    # Identical inputs short-circuit to zero without compressing anything.
    if src == tar:
        return 0.0

    src = src.encode('utf-8')
    tar = tar.encode('utf-8')

    # The first 2 bytes of every compressed stream are dropped before the
    # lengths are measured (presumably the zlib stream header -- confirm
    # against the zlib format if this is changed).
    src_comp = encode(src, 'zlib_codec')[2:]
    tar_comp = encode(tar, 'zlib_codec')[2:]
    concat_comp = encode(src + tar, 'zlib_codec')[2:]
    concat_comp2 = encode(tar + src, 'zlib_codec')[2:]

    # Taking the smaller of the two concatenation orders keeps the result
    # symmetric in src and tar.
    return ((min(len(concat_comp), len(concat_comp2)) -
             min(len(src_comp), len(tar_comp))) /
            max(len(src_comp), len(tar_comp)))
| 239 |
|
|
| 240 |
|
|
| 241 |
|
def sim_ncd_zlib(src, tar): |