| @@ 36-78 (lines=43) @@ | ||
| 33 | __all__ = ['Overlap', 'dist_overlap', 'sim_overlap'] |
|
| 34 | ||
| 35 | ||
| 36 | class Overlap(_TokenDistance): |
|
| 37 | r"""Overlap coefficient. |
|
| 38 | ||
| 39 | For two sets X and Y, the overlap coefficient |
|
| 40 | :cite:`Szymkiewicz:1934,Simpson:1949`, also called the |
|
| 41 | Szymkiewicz-Simpson coefficient, is |
|
| 42 | :math:`sim_{overlap}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)}`. |
|
| 43 | """ |
|
| 44 | ||
| 45 | def sim(self, src, tar, qval=2): |
|
| 46 | r"""Return the overlap coefficient of two strings. |
|
| 47 | ||
| 48 | Parameters |
|
| 49 | ---------- |
|
| 50 | src : str |
|
| 51 | Source string (or QGrams/Counter objects) for comparison |
|
| 52 | tar : str |
|
| 53 | Target string (or QGrams/Counter objects) for comparison |
|
| 54 | qval : int |
|
| 55 | The length of each q-gram; 0 for non-q-gram version |
|
| 56 | ||
| 57 | Returns |
|
| 58 | ------- |
|
| 59 | float |
|
| 60 | Overlap similarity |
|
| 61 | ||
| 62 | Examples |
|
| 63 | -------- |
|
| 64 | >>> cmp = Overlap() |
|
| 65 | >>> cmp.sim('cat', 'hat') |
|
| 66 | 0.5 |
|
| 67 | >>> cmp.sim('Niall', 'Neil') |
|
| 68 | 0.4 |
|
| 69 | >>> cmp.sim('aluminum', 'Catalan') |
|
| 70 | 0.125 |
|
| 71 | >>> cmp.sim('ATCG', 'TAGC') |
|
| 72 | 0.0 |
|
| 73 | ||
| 74 | """ |
|
| 75 | if src == tar: |
|
| 76 | return 1.0 |
|
| 77 | elif not src or not tar: |
|
| 78 | return 0.0 |
|
| 79 | ||
| 80 | q_src, q_tar = self._get_qgrams(src, tar, qval) |
|
| 81 | q_src_mag = sum(q_src.values()) |
|
| @@ 38-79 (lines=42) @@ | ||
| 35 | __all__ = ['Cosine', 'dist_cosine', 'sim_cosine'] |
|
| 36 | ||
| 37 | ||
| 38 | class Cosine(_TokenDistance): |
|
| 39 | r"""Cosine similarity. |
|
| 40 | ||
| 41 | For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or |
|
| 42 | Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is: |
|
| 43 | :math:`sim_{cosine}(X, Y) = \frac{|X \cap Y|}{\sqrt{|X| \cdot |Y|}}`. |
|
| 44 | """ |
|
| 45 | ||
| 46 | def sim(self, src, tar, qval=2): |
|
| 47 | r"""Return the cosine similarity of two strings. |
|
| 48 | ||
| 49 | Parameters |
|
| 50 | ---------- |
|
| 51 | src : str |
|
| 52 | Source string (or QGrams/Counter objects) for comparison |
|
| 53 | tar : str |
|
| 54 | Target string (or QGrams/Counter objects) for comparison |
|
| 55 | qval : int |
|
| 56 | The length of each q-gram; 0 for non-q-gram version |
|
| 57 | ||
| 58 | Returns |
|
| 59 | ------- |
|
| 60 | float |
|
| 61 | Cosine similarity |
|
| 62 | ||
| 63 | Examples |
|
| 64 | -------- |
|
| 65 | >>> cmp = Cosine() |
|
| 66 | >>> cmp.sim('cat', 'hat') |
|
| 67 | 0.5 |
|
| 68 | >>> cmp.sim('Niall', 'Neil') |
|
| 69 | 0.3651483716701107 |
|
| 70 | >>> cmp.sim('aluminum', 'Catalan') |
|
| 71 | 0.11785113019775793 |
|
| 72 | >>> cmp.sim('ATCG', 'TAGC') |
|
| 73 | 0.0 |
|
| 74 | ||
| 75 | """ |
|
| 76 | if src == tar: |
|
| 77 | return 1.0 |
|
| 78 | if not src or not tar: |
|
| 79 | return 0.0 |
|
| 80 | ||
| 81 | q_src, q_tar = self._get_qgrams(src, tar, qval) |
|
| 82 | q_src_mag = sum(q_src.values()) |
|