| @@ 34-94 (lines=61) @@ | ||
| 31 | __all__ = ['NCDlzss'] |
|
| 32 | ||
| 33 | ||
class NCDlzss(_Distance):
    """Normalized Compression Distance using LZSS compression.

    Cf. https://en.wikipedia.org/wiki/Lempel-Ziv-Storer-Szymanski

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    .. versionadded:: 0.4.0
    """

    def dist(self, src: str, tar: str) -> float:
        """Return the LZSS-based NCD between two strings.

        Both strings, and both orders of their concatenation, are encoded
        with LZSS. The result is the smaller concatenation length minus the
        smaller single-string length, divided by the larger single-string
        length. Equal strings yield 0.0 before any encoding is attempted.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Compression distance

        Raises
        ------
        ValueError
            Install the PyLZSS module in order to use LZSS (raised only
            when the strings differ and ``lzss`` is ``None``)

        Examples
        --------
        >>> cmp = NCDlzss()
        >>> cmp.dist('cat', 'hat')
        0.75
        >>> cmp.dist('Niall', 'Neil')
        1.0
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        0.8


        .. versionadded:: 0.4.0

        """
        # Identical inputs are at distance zero; no compressor needed.
        if src == tar:
            return 0.0

        # Without the optional LZSS backend there is nothing to measure with.
        if lzss is None:  # pragma: no cover
            raise ValueError('Install the PyLZSS module in order to use LZSS')

        src_len = len(lzss.encode(src))
        tar_len = len(lzss.encode(tar))
        forward_len = len(lzss.encode(src + tar))
        reverse_len = len(lzss.encode(tar + src))

        # Concatenation order can change the encoded size, so keep the
        # smaller of the two.
        joint_len = min(forward_len, reverse_len)
        return (joint_len - min(src_len, tar_len)) / max(src_len, tar_len)
|
| 95 | ||
| 96 | ||
| 97 | if __name__ == '__main__': |
|
| @@ 28-84 (lines=57) @@ | ||
| 25 | __all__ = ['NCDrle'] |
|
| 26 | ||
| 27 | ||
class NCDrle(_Distance):
    """Normalized Compression Distance using RLE.

    Cf. https://en.wikipedia.org/wiki/Run-length_encoding

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    .. versionadded:: 0.3.6
    """

    # One encoder instance, created at class definition and shared by
    # every NCDrle object.
    _rle = RLE()

    def dist(self, src: str, tar: str) -> float:
        """Return the RLE-based NCD between two strings.

        Both strings, and both orders of their concatenation, are run-length
        encoded. The result is the smaller concatenation length minus the
        smaller single-string length, divided by the larger single-string
        length. Equal strings yield 0.0 before any encoding is attempted.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Compression distance

        Examples
        --------
        >>> cmp = NCDrle()
        >>> cmp.dist('cat', 'hat')
        1.0
        >>> cmp.dist('Niall', 'Neil')
        1.0
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.3.5
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Identical inputs are at distance zero; no encoding needed.
        if src == tar:
            return 0.0

        # Encoded sizes of: src, tar, src followed by tar, tar followed by src.
        src_len, tar_len, forward_len, reverse_len = [
            len(self._rle.encode(text))
            for text in (src, tar, src + tar, tar + src)
        ]

        smaller_single = min(src_len, tar_len)
        larger_single = max(src_len, tar_len)
        # Concatenation order can change the encoded size, so keep the
        # smaller of the two.
        joint_len = min(forward_len, reverse_len)

        return (joint_len - smaller_single) / larger_single
|
| 85 | ||
| 86 | ||
| 87 | if __name__ == '__main__': |
|