@@ 36-78 (lines=43) @@ | ||
33 | __all__ = ['Overlap', 'dist_overlap', 'sim_overlap'] |
|
34 | ||
35 | ||
36 | class Overlap(_TokenDistance): |
|
37 | r"""Overlap coefficient. |
|
38 | ||
39 | For two sets X and Y, the overlap coefficient |
|
40 | :cite:`Szymkiewicz:1934,Simpson:1949`, also called the |
|
41 | Szymkiewicz-Simpson coefficient, is |
|
42 | :math:`sim_{overlap}(X, Y) = \frac{|X \cap Y|}{\min(|X|, |Y|)}`. |
|
43 | """ |
|
44 | ||
45 | def sim(self, src, tar, qval=2): |
|
46 | r"""Return the overlap coefficient of two strings. |
|
47 | ||
48 | Parameters |
|
49 | ---------- |
|
50 | src : str |
|
51 | Source string (or QGrams/Counter objects) for comparison |
|
52 | tar : str |
|
53 | Target string (or QGrams/Counter objects) for comparison |
|
54 | qval : int |
|
55 | The length of each q-gram; 0 for non-q-gram version |
|
56 | ||
57 | Returns |
|
58 | ------- |
|
59 | float |
|
60 | Overlap similarity |
|
61 | ||
62 | Examples |
|
63 | -------- |
|
64 | >>> cmp = Overlap() |
|
65 | >>> cmp.sim('cat', 'hat') |
|
66 | 0.5 |
|
67 | >>> cmp.sim('Niall', 'Neil') |
|
68 | 0.4 |
|
69 | >>> cmp.sim('aluminum', 'Catalan') |
|
70 | 0.125 |
|
71 | >>> cmp.sim('ATCG', 'TAGC') |
|
72 | 0.0 |
|
73 | ||
74 | """ |
|
75 | if src == tar: |
|
76 | return 1.0 |
|
77 | elif not src or not tar: |
|
78 | return 0.0 |
|
79 | ||
80 | q_src, q_tar = self._get_qgrams(src, tar, qval) |
|
81 | q_src_mag = sum(q_src.values()) |
@@ 38-79 (lines=42) @@ | ||
35 | __all__ = ['Cosine', 'dist_cosine', 'sim_cosine'] |
|
36 | ||
37 | ||
38 | class Cosine(_TokenDistance): |
|
39 | r"""Cosine similarity. |
|
40 | ||
41 | For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or |
|
42 | Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is: |
|
43 | :math:`sim_{cosine}(X, Y) = \frac{|X \cap Y|}{\sqrt{|X| \cdot |Y|}}`. |
|
44 | """ |
|
45 | ||
46 | def sim(self, src, tar, qval=2): |
|
47 | r"""Return the cosine similarity of two strings. |
|
48 | ||
49 | Parameters |
|
50 | ---------- |
|
51 | src : str |
|
52 | Source string (or QGrams/Counter objects) for comparison |
|
53 | tar : str |
|
54 | Target string (or QGrams/Counter objects) for comparison |
|
55 | qval : int |
|
56 | The length of each q-gram; 0 for non-q-gram version |
|
57 | ||
58 | Returns |
|
59 | ------- |
|
60 | float |
|
61 | Cosine similarity |
|
62 | ||
63 | Examples |
|
64 | -------- |
|
65 | >>> cmp = Cosine() |
|
66 | >>> cmp.sim('cat', 'hat') |
|
67 | 0.5 |
|
68 | >>> cmp.sim('Niall', 'Neil') |
|
69 | 0.3651483716701107 |
|
70 | >>> cmp.sim('aluminum', 'Catalan') |
|
71 | 0.11785113019775793 |
|
72 | >>> cmp.sim('ATCG', 'TAGC') |
|
73 | 0.0 |
|
74 | ||
75 | """ |
|
76 | if src == tar: |
|
77 | return 1.0 |
|
78 | if not src or not tar: |
|
79 | return 0.0 |
|
80 | ||
81 | q_src, q_tar = self._get_qgrams(src, tar, qval) |
|
82 | q_src_mag = sum(q_src.values()) |