| @@ 115-167 (lines=53) @@ | ||
| 112 | **kwargs |
|
| 113 | ) |
|
| 114 | ||
def corr(self, src, tar):
    """Return the normalized mean squared contingency correlation.

    The score is built from the 2x2 contingency counts reported by the
    token-cardinality helpers after ``src`` and ``tar`` are tokenized.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Mean squared contingency correlation: ``1.0`` when the inputs
        compare equal, ``-1.0`` when exactly one of them is empty, and
        ``0.0`` when ``a*d - b*c`` vanishes.

    Examples
    --------
    >>> cmp = MSContingency()
    >>> cmp.corr('cat', 'hat')
    0.6298568508557214
    >>> cmp.corr('Niall', 'Neil')
    0.4798371954796814
    >>> cmp.corr('aluminum', 'Catalan')
    0.15214891090821628
    >>> cmp.corr('ATCG', 'TAGC')
    -0.009076921903905553


    .. versionadded:: 0.4.0

    """
    # Trivial cases are settled before any tokenization happens.
    if src == tar:
        return 1.0
    if not (src and tar):
        return -1.0

    self._tokenize(src, tar)

    # Cells and marginals of the contingency table, read in a fixed order.
    both = self._intersection_card()
    src_only = self._src_only_card()
    tar_only = self._tar_only_card()
    neither = self._total_complement_card()
    src_total = self._src_card()
    tar_total = self._tar_card()

    determinant = both * neither - src_only * tar_only
    if not determinant:
        return 0.0

    marginal_product = (
        src_total * tar_total * (src_only + neither) * (tar_only + neither)
    )
    # A non-zero determinant makes the radicand strictly positive.
    denominator = (determinant ** 2 + marginal_product) ** 0.5
    return 2 ** 0.5 * determinant / denominator
|
| 168 | ||
| 169 | def sim(self, src, tar): |
|
| 170 | """Return the normalized ms contingency similarity of two strings. |
|
| @@ 112-160 (lines=49) @@ | ||
| 109 | **kwargs |
|
| 110 | ) |
|
| 111 | ||
def sim_score(self, src, tar):
    """Return Pearson's Chi-Squared similarity of two strings.

    With ``a``, ``b``, ``c``, ``d`` the intersection, source-only,
    target-only and total-complement cardinalities, ``n`` the unique
    population cardinality, and ``ab``/``ac`` the source/target
    cardinalities, the score is
    ``n * (a*d - b*c)**2 / (ab * ac * (b + d) * (c + d))``.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Pearson's Chi-Squared similarity: ``float(n)`` when the inputs
        compare equal, ``0.0`` when exactly one of them is empty or the
        numerator vanishes.

    Examples
    --------
    >>> cmp = PearsonChiSquared()
    >>> cmp.sim_score('cat', 'hat')
    193.99489809335964
    >>> cmp.sim_score('Niall', 'Neil')
    101.99771068526542
    >>> cmp.sim_score('aluminum', 'Catalan')
    9.19249664336649
    >>> cmp.sim_score('ATCG', 'TAGC')
    0.032298410951138765


    .. versionadded:: 0.4.0

    """
    # Tokenize unconditionally: the identical-input shortcut below still
    # reports the population cardinality, which is read after tokenizing.
    self._tokenize(src, tar)

    # Resolve the trivial cases before touching the remaining cardinality
    # helpers, whose results would otherwise be computed and discarded.
    # NOTE(review): assumes the cardinality accessors are side-effect
    # free -- confirm against the base class.
    if src == tar:
        return float(self._population_unique_card())
    if not src or not tar:
        return 0.0

    a = self._intersection_card()
    b = self._src_only_card()
    c = self._tar_only_card()
    d = self._total_complement_card()
    n = self._population_unique_card()
    ab = self._src_card()
    ac = self._tar_card()

    num = n * (a * d - b * c) ** 2
    # Only divide for a non-zero numerator; a zero numerator scores 0.0
    # without ever evaluating the denominator.
    if num:
        return num / (ab * ac * (b + d) * (c + d))
    return 0.0
|
| 161 | ||
| 162 | def corr(self, src, tar): |
|
| 163 | """Return Pearson's Chi-Squared correlation of two strings. |
|