@@ 111-163 (lines=53) @@ | ||
108 | **kwargs |
|
109 | ) |
|
110 | ||
def corr(self, src: str, tar: str) -> float:
    """Return the normalized mean squared contingency corr. of two strings.

    Two shortcuts are taken before any tokenization happens: inputs that
    compare equal give 1.0, and a falsy (e.g. empty) input on either side
    gives -1.0. Otherwise the inputs are tokenized and the result is::

        sqrt(2) * (a*d - b*c)
        / sqrt((a*d - b*c)**2 + ab * ac * (b + d) * (c + d))

    where ``a``, ``b``, ``c`` and ``d`` are the values returned by
    ``_intersection_card()``, ``_src_only_card()``, ``_tar_only_card()``
    and ``_total_complement_card()``, and ``ab`` and ``ac`` are the values
    returned by ``_src_card()`` and ``_tar_card()``. When ``a*d - b*c`` is
    zero the result is 0.0.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Mean squared contingency correlation

    Examples
    --------
    >>> cmp = MSContingency()
    >>> cmp.corr('cat', 'hat')
    0.6298568508557214
    >>> cmp.corr('Niall', 'Neil')
    0.4798371954796814
    >>> cmp.corr('aluminum', 'Catalan')
    0.15214891090821628
    >>> cmp.corr('ATCG', 'TAGC')
    -0.009076921903905553


    .. versionadded:: 0.4.0

    """
    # Trivial cases are answered without touching the tokenizer.
    if src == tar:
        return 1.0
    if not (src and tar):
        return -1.0

    self._tokenize(src, tar)

    shared = self._intersection_card()
    src_only = self._src_only_card()
    tar_only = self._tar_only_card()
    neither = self._total_complement_card()
    src_total = self._src_card()
    tar_total = self._tar_card()

    # Cross-product difference (a*d - b*c) of the 2x2 table.
    cross_diff = shared * neither - src_only * tar_only
    if not cross_diff:
        return 0.0

    # Product of the four marginal totals, multiplied left to right.
    marginals = (
        src_total * tar_total * (src_only + neither) * (tar_only + neither)
    )
    return 2 ** 0.5 * cross_diff / (cross_diff ** 2 + marginals) ** 0.5
|
164 | ||
165 | def sim(self, src: str, tar: str) -> float: |
|
166 | """Return the normalized ms contingency similarity of two strings. |
@@ 107-155 (lines=49) @@ | ||
104 | **kwargs |
|
105 | ) |
|
106 | ||
def sim_score(self, src: str, tar: str) -> float:
    """Return Pearson's Chi-Squared similarity of two strings.

    The inputs are always tokenized first, because the value returned for
    identical inputs is the population size taken from the tokenized
    state. After that:

    - inputs that compare equal give ``float(n)``;
    - a falsy (e.g. empty) input on either side gives 0.0;
    - otherwise the result is
      ``n * (a*d - b*c)**2 / (ab * ac * (b + d) * (c + d))``, or 0.0 when
      the numerator is zero.

    Here ``a``, ``b``, ``c`` and ``d`` are the values returned by
    ``_intersection_card()``, ``_src_only_card()``, ``_tar_only_card()``
    and ``_total_complement_card()``; ``n`` is the value returned by
    ``_population_unique_card()``; ``ab`` and ``ac`` are the values
    returned by ``_src_card()`` and ``_tar_card()``.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Pearson's Chi-Squared similarity

    Examples
    --------
    >>> cmp = PearsonChiSquared()
    >>> cmp.sim_score('cat', 'hat')
    193.99489809335964
    >>> cmp.sim_score('Niall', 'Neil')
    101.99771068526542
    >>> cmp.sim_score('aluminum', 'Catalan')
    9.19249664336649
    >>> cmp.sim_score('ATCG', 'TAGC')
    0.032298410951138765


    .. versionadded:: 0.4.0

    """
    self._tokenize(src, tar)

    # All cardinalities are read up front, in a fixed order, before the
    # trivial cases are examined.
    shared = self._intersection_card()
    src_only = self._src_only_card()
    tar_only = self._tar_only_card()
    neither = self._total_complement_card()
    population = self._population_unique_card()
    src_total = self._src_card()
    tar_total = self._tar_card()

    if src == tar:
        return float(population)
    if not (src and tar):
        return 0.0

    numerator = population * (shared * neither - src_only * tar_only) ** 2
    if not numerator:
        return 0.0

    # Product of the four marginal totals, multiplied left to right.
    denominator = (
        src_total * tar_total * (src_only + neither) * (tar_only + neither)
    )
    return numerator / denominator
|
156 | ||
157 | def corr(self, src: str, tar: str) -> float: |
|
158 | """Return Pearson's Chi-Squared correlation of two strings. |