@@ 34-94 (lines=61) @@ | ||
31 | __all__ = ['NCDlzss'] |
|
32 | ||
33 | ||
class NCDlzss(_Distance):
    """Normalized Compression Distance computed with LZSS compression.

    Cf. https://en.wikipedia.org/wiki/Lempel-Ziv-Storer-Szymanski

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    .. versionadded:: 0.4.0
    """

    def dist(self, src: str, tar: str) -> float:
        """Return the LZSS-based NCD between two strings.

        With ``C(x)`` denoting the length of the LZSS encoding of ``x``,
        the result is::

            (min(C(src + tar), C(tar + src)) - min(C(src), C(tar)))
            / max(C(src), C(tar))

        Equal inputs yield 0.0 before any encoding is attempted.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Compression distance

        Raises
        ------
        ValueError
            Install the PyLZSS module in order to use LZSS

        Examples
        --------
        >>> cmp = NCDlzss()
        >>> cmp.dist('cat', 'hat')
        0.75
        >>> cmp.dist('Niall', 'Neil')
        1.0
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        0.8


        .. versionadded:: 0.4.0

        """
        # Identical strings are at distance zero by definition.
        if src == tar:
            return 0.0

        # The optional dependency is only required once real work is needed.
        if lzss is None:  # pragma: no cover
            raise ValueError('Install the PyLZSS module in order to use LZSS')

        src_len = len(lzss.encode(src))
        tar_len = len(lzss.encode(tar))
        # Try both concatenation orders and keep the shorter encoding.
        joined_len = min(
            len(lzss.encode(src + tar)), len(lzss.encode(tar + src))
        )

        return (joined_len - min(src_len, tar_len)) / max(src_len, tar_len)
|
95 | ||
96 | ||
97 | if __name__ == '__main__': |
@@ 28-84 (lines=57) @@ | ||
25 | __all__ = ['NCDrle'] |
|
26 | ||
27 | ||
class NCDrle(_Distance):
    """Normalized Compression Distance computed with run-length encoding.

    Cf. https://en.wikipedia.org/wiki/Run-length_encoding

    Normalized compression distance (NCD) :cite:`Cilibrasi:2005`.

    .. versionadded:: 0.3.6
    """

    # Encoder instance created once at class level and used by dist().
    _rle = RLE()

    def dist(self, src: str, tar: str) -> float:
        """Return the RLE-based NCD between two strings.

        With ``C(x)`` denoting the length of the RLE encoding of ``x``,
        the result is::

            (min(C(src + tar), C(tar + src)) - min(C(src), C(tar)))
            / max(C(src), C(tar))

        Equal inputs yield 0.0 before any encoding is attempted.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Compression distance

        Examples
        --------
        >>> cmp = NCDrle()
        >>> cmp.dist('cat', 'hat')
        1.0
        >>> cmp.dist('Niall', 'Neil')
        1.0
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.3.5
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Identical strings are at distance zero by definition.
        if src == tar:
            return 0.0

        encode = self._rle.encode
        src_len = len(encode(src))
        tar_len = len(encode(tar))
        # Try both concatenation orders and keep the shorter encoding.
        joined_len = min(len(encode(src + tar)), len(encode(tar + src)))

        return (joined_len - min(src_len, tar_len)) / max(src_len, tar_len)
|
85 | ||
86 | ||
87 | if __name__ == '__main__': |