@@ 30-186 (lines=157) @@ | ||
27 | __all__ = ['Peirce'] |
|
28 | ||
29 | ||
class Peirce(_TokenDistance):
    r"""Peirce correlation.

    For two sets X and Y and a population N, the Peirce correlation
    :cite:`Peirce:1884` is

        .. math::

            corr_{Peirce}(X, Y) = \frac{|X \cap Y| \cdot
            |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|X| \cdot |N \setminus X|}

    Both :cite:`Choi:2010` and :cite:`Hubalek:1982` present a different formula
    and incorrectly attribute it to Peirce. Likewise, :cite:`Doolittle:1884`
    presents a different formula and incorrectly attributes it to Peirce. This
    is distinct from the formula he presents and attributes to himself.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Peirce} =
            \frac{ad-bc}{(a+b)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Peirce instance.

        All arguments are forwarded unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Peirce correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Peirce correlation; 1.0 when ``src == tar`` and 0.0 when the
            numerator ``ad-bc`` is zero

        Examples
        --------
        >>> cmp = Peirce()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.32947729220222793
        >>> cmp.corr('aluminum', 'Catalan')
        0.10209049255441008
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit to a perfect correlation before any
        # tokenization is done.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # The four cells a, b, c, d of the 2x2 table from the class docstring.
        inter = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()

        numerator = inter * complement - src_only * tar_only
        # Skip the division for a zero numerator; this also sidesteps a zero
        # denominator when one marginal is empty (assuming non-negative
        # cell counts).
        if not numerator:
            return 0.0
        return numerator / ((inter + src_only) * (tar_only + complement))

    def sim(self, src: str, tar: str) -> float:
        """Return the Peirce similarity of two strings.

        The similarity is the correlation shifted and halved:
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Peirce similarity

        Examples
        --------
        >>> cmp = Peirce()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.664738646101114
        >>> cmp.sim('aluminum', 'Catalan')
        0.5510452462772051
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
|
187 | ||
188 | ||
189 | if __name__ == '__main__': |
@@ 30-183 (lines=154) @@ | ||
27 | __all__ = ['UnknownA'] |
|
28 | ||
29 | ||
class UnknownA(_TokenDistance):
    r"""Unknown A correlation.

    For two sets X and Y and a population N, Unknown A correlation
    is sometimes attributed to :cite:`Peirce:1884`. It differs from
    :py:class:`Peirce` in that the denominator is the product of the opposite
    pair of marginals:

        .. math::

            corr_{UnknownA}(X, Y) = \frac{|X \cap Y| \cdot
            |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|Y| \cdot |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{UnknownA} =
            \frac{ad-bc}{(a+c)(b+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize UnknownA instance.

        All arguments are forwarded unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Unknown A correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Unknown A correlation; 1.0 when ``src == tar`` and 0.0 when the
            numerator ``ad-bc`` is zero

        Examples
        --------
        >>> cmp = UnknownA()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.39486521181001283
        >>> cmp.corr('aluminum', 'Catalan')
        0.1147039897039897
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit to a perfect correlation before any
        # tokenization is done.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # The four cells a, b, c, d of the 2x2 table from the class docstring.
        inter = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()

        numerator = inter * complement - src_only * tar_only
        # Skip the division for a zero numerator; this also sidesteps a zero
        # denominator when one marginal is empty (assuming non-negative
        # cell counts).
        if not numerator:
            return 0.0
        return numerator / ((inter + tar_only) * (src_only + complement))

    def sim(self, src: str, tar: str) -> float:
        """Return the Unknown A similarity of two strings.

        The similarity is the correlation shifted and halved:
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Unknown A similarity

        Examples
        --------
        >>> cmp = UnknownA()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6974326059050064
        >>> cmp.sim('aluminum', 'Catalan')
        0.5573519948519948
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
|
184 | ||
185 | ||
186 | if __name__ == '__main__': |
@@ 30-183 (lines=154) @@ | ||
27 | __all__ = ['AndresMarzoDelta'] |
|
28 | ||
29 | ||
class AndresMarzoDelta(_TokenDistance):
    r"""Andres & Marzo's Delta correlation.

    For two sets X and Y and a population N, Andres & Marzo's :math:`\Delta`
    correlation :cite:`Andres:2004` is

        .. math::

            corr_{AndresMarzo_\Delta}(X, Y) = \Delta =
            \frac{|X \cap Y| + |(N \setminus X) \setminus Y| -
            2\sqrt{|X \setminus Y| \cdot |Y \setminus X|}}{|N|}


    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{AndresMarzo_\Delta} = \Delta =
            \frac{a+d-2\sqrt{b \cdot c}}{n}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize AndresMarzoDelta instance.

        All arguments are forwarded unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta correlation; 1.0 when ``src == tar``
            and 0.0 when the numerator is zero

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.corr('cat', 'hat')
        0.9897959183673469
        >>> cmp.corr('Niall', 'Neil')
        0.9822344346552608
        >>> cmp.corr('aluminum', 'Catalan')
        0.9618259496215341
        >>> cmp.corr('ATCG', 'TAGC')
        0.9744897959183674


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit before any tokenization is done.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells a, b, c, d and the total n from the class docstring.
        inter = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()
        population = self._population_unique_card()

        numerator = inter + complement - 2 * (src_only * tar_only) ** 0.5

        # A zero numerator returns 0.0 directly, so no division happens.
        if numerator == 0.0:
            return 0.0
        return numerator / population

    def sim(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta similarity of two strings.

        The similarity is the correlation shifted and halved:
        ``(corr + 1) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta similarity

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.sim('cat', 'hat')
        0.9948979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.9911172173276304
        >>> cmp.sim('aluminum', 'Catalan')
        0.980912974810767
        >>> cmp.sim('ATCG', 'TAGC')
        0.9872448979591837


        .. versionadded:: 0.4.0

        """
        delta = self.corr(src, tar)
        return (delta + 1) / 2
|
184 | ||
185 | ||
186 | if __name__ == '__main__': |
@@ 30-181 (lines=152) @@ | ||
27 | __all__ = ['BeniniI'] |
|
28 | ||
29 | ||
class BeniniI(_TokenDistance):
    r"""BeniniI correlation.

    For two sets X and Y and a population N, Benini I correlation, Benini's
    Index of Attraction, :cite:`Benini:1901` is

        .. math::

            corr_{BeniniI}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}{|Y| \cdot |N \setminus X|}


    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{BeniniI} = \frac{ad-bc}{(a+c)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BeniniI instance.

        All arguments are forwarded unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Benini I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I correlation; 1.0 when ``src == tar`` and 0.0 when the
            numerator ``ad-bc`` is zero

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3953727506426735
        >>> cmp.corr('aluminum', 'Catalan')
        0.11485180412371133
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit before any tokenization is done.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # The four cells a, b, c, d of the 2x2 table from the class docstring.
        inter = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()

        numerator = inter * complement - src_only * tar_only

        # Only divide when the numerator is non-zero.
        if numerator:
            return numerator / ((inter + tar_only) * (tar_only + complement))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Benini I similarity of two strings.

        The similarity is the correlation shifted and halved:
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I similarity

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6976863753213367
        >>> cmp.sim('aluminum', 'Catalan')
        0.5574259020618557
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
|
182 | ||
183 | ||
184 | if __name__ == '__main__': |
@@ 30-179 (lines=150) @@ | ||
27 | __all__ = ['Dispersion'] |
|
28 | ||
29 | ||
class Dispersion(_TokenDistance):
    r"""Dispersion correlation.

    For two sets X and Y and a population N, the dispersion
    correlation :cite:`IBM:2017` is

        .. math::

            corr_{dispersion}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|N|^2}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{dispersion} =
            \frac{ad-bc}{n^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Dispersion instance.

        All arguments are forwarded unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Dispersion correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion correlation; 0.0 when the numerator ``ad-bc`` is zero

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.corr('cat', 'hat')
        0.002524989587671803
        >>> cmp.corr('Niall', 'Neil')
        0.002502212619741774
        >>> cmp.corr('aluminum', 'Catalan')
        0.0011570449105440383
        >>> cmp.corr('ATCG', 'TAGC')
        -4.06731570179092e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Cells a, b, c, d and the total n from the class docstring.
        inter = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()
        population = self._population_unique_card()

        numerator = inter * complement - src_only * tar_only
        # Only divide when the numerator is non-zero.
        if numerator:
            return numerator / population ** 2
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Dispersion similarity of two strings.

        The similarity is the correlation shifted and halved:
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion similarity

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.sim('cat', 'hat')
        0.5012624947938359
        >>> cmp.sim('Niall', 'Neil')
        0.5012511063098709
        >>> cmp.sim('aluminum', 'Catalan')
        0.500578522455272
        >>> cmp.sim('ATCG', 'TAGC')
        0.499979663421491


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
|
180 | ||
181 | ||
182 | if __name__ == '__main__': |
@@ 30-179 (lines=150) @@ | ||
27 | __all__ = ['WarrensIII'] |
|
28 | ||
29 | ||
class WarrensIII(_TokenDistance):
    r"""Warrens III correlation.

    For two sets X and Y and a population N, Warrens III correlation
    :math:`S_{NS3}` :cite:`Warrens:2008` is

        .. math::

            corr_{WarrensIII}(X, Y) =
            \frac{2|(N \setminus X) \setminus Y| - |X \setminus Y| -
            |Y \setminus X|}{|N \setminus X| + |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{WarrensIII} =
            \frac{2d-b-c}{2d+b+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize WarrensIII instance.

        All arguments are forwarded unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Warrens III correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Warrens III correlation; 1.0 when ``src == tar`` and 0.0 when
            the numerator ``2d-b-c`` is zero

        Examples
        --------
        >>> cmp = WarrensIII()
        >>> cmp.corr('cat', 'hat')
        0.9948717948717949
        >>> cmp.corr('Niall', 'Neil')
        0.9910083493898523
        >>> cmp.corr('aluminum', 'Catalan')
        0.9806825499034127
        >>> cmp.corr('ATCG', 'TAGC')
        0.9871630295250321


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit before any tokenization is done.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells b, c and d of the 2x2 table; cell a is not used here.
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()

        numerator = 2 * complement - src_only - tar_only
        # Skip the division entirely for a zero numerator.
        if not numerator:
            return 0.0
        return numerator / (2 * complement + src_only + tar_only)

    def sim(self, src: str, tar: str) -> float:
        """Return the Warrens III similarity of two strings.

        The similarity is the correlation shifted and halved:
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Warrens III similarity

        Examples
        --------
        >>> cmp = WarrensIII()
        >>> cmp.sim('cat', 'hat')
        0.9974358974358974
        >>> cmp.sim('Niall', 'Neil')
        0.9955041746949261
        >>> cmp.sim('aluminum', 'Catalan')
        0.9903412749517064
        >>> cmp.sim('ATCG', 'TAGC')
        0.993581514762516


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
|
180 | ||
181 | ||
182 | if __name__ == '__main__': |