| @@ 111-163 (lines=53) @@ | ||
| 108 | **kwargs |
|
| 109 | ) |
|
| 110 | ||
def corr(self, src: str, tar: str) -> float:
    """Return the normalized mean squared contingency corr. of two strings.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Mean squared contingency correlation

    Notes
    -----
    Equal inputs short-circuit to 1.0, and a pair in which either input
    is empty yields -1.0; neither shortcut tokenizes the inputs.
    Otherwise, with ``a``, ``b``, ``c`` and ``d`` taken from the
    intersection, source-only, target-only and total-complement
    cardinalities, the result is ``sqrt(2) * (ad - bc)`` divided by
    ``sqrt((ad - bc)**2 + |src| * |tar| * (b + d) * (c + d))``, or 0.0
    when ``ad - bc`` is zero.

    Examples
    --------
    >>> cmp = MSContingency()
    >>> cmp.corr('cat', 'hat')
    0.6298568508557214
    >>> cmp.corr('Niall', 'Neil')
    0.4798371954796814
    >>> cmp.corr('aluminum', 'Catalan')
    0.15214891090821628
    >>> cmp.corr('ATCG', 'TAGC')
    -0.009076921903905553


    .. versionadded:: 0.4.0

    """
    # Trivial cases are answered before any tokenization happens.
    if src == tar:
        return 1.0
    if not (src and tar):
        return -1.0

    self._tokenize(src, tar)

    shared = self._intersection_card()
    src_only = self._src_only_card()
    tar_only = self._tar_only_card()
    complement = self._total_complement_card()
    src_total = self._src_card()
    tar_total = self._tar_card()

    # Cross-product difference (ad - bc) of the 2x2 contingency table.
    cross_diff = shared * complement - src_only * tar_only
    if not cross_diff:
        return 0.0

    marginals = (
        src_total
        * tar_total
        * (src_only + complement)
        * (tar_only + complement)
    )
    return 2 ** 0.5 * cross_diff / (cross_diff ** 2 + marginals) ** 0.5
|
| 164 | ||
| 165 | def sim(self, src: str, tar: str) -> float: |
|
| 166 | """Return the normalized ms contingency similarity of two strings. |
|
| @@ 107-155 (lines=49) @@ | ||
| 104 | **kwargs |
|
| 105 | ) |
|
| 106 | ||
def sim_score(self, src: str, tar: str) -> float:
    """Return Pearson's Chi-Squared similarity of two strings.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Pearson's Chi-Squared similarity

    Notes
    -----
    The inputs are always tokenized first. Equal inputs score the unique
    population cardinality ``n`` (as a float), and a pair in which either
    input is empty scores 0.0. Otherwise, with ``a``, ``b``, ``c`` and
    ``d`` taken from the intersection, source-only, target-only and
    total-complement cardinalities, the score is ``n * (ad - bc)**2``
    divided by ``|src| * |tar| * (b + d) * (c + d)``, or 0.0 when the
    numerator is zero.

    Examples
    --------
    >>> cmp = PearsonChiSquared()
    >>> cmp.sim_score('cat', 'hat')
    193.99489809335964
    >>> cmp.sim_score('Niall', 'Neil')
    101.99771068526542
    >>> cmp.sim_score('aluminum', 'Catalan')
    9.19249664336649
    >>> cmp.sim_score('ATCG', 'TAGC')
    0.032298410951138765


    .. versionadded:: 0.4.0

    """
    # Tokenization precedes the shortcuts because the equal-input case
    # needs the population cardinality.
    self._tokenize(src, tar)

    shared = self._intersection_card()
    src_only = self._src_only_card()
    tar_only = self._tar_only_card()
    complement = self._total_complement_card()
    population = self._population_unique_card()
    src_total = self._src_card()
    tar_total = self._tar_card()

    if src == tar:
        return float(population)
    if not (src and tar):
        return 0.0

    numerator = (
        population * (shared * complement - src_only * tar_only) ** 2
    )
    if not numerator:
        return 0.0

    denominator = (
        src_total
        * tar_total
        * (src_only + complement)
        * (tar_only + complement)
    )
    return numerator / denominator
|
| 156 | ||
| 157 | def corr(self, src: str, tar: str) -> float: |
|
| 158 | """Return Pearson's Chi-Squared correlation of two strings. |
|