Code Duplication    Length = 150-157 lines in 6 locations

abydos/distance/_peirce.py 1 location

@@ 30-186 (lines=157) @@
27
__all__ = ['Peirce']
28
29
30
class Peirce(_TokenDistance):
    r"""Peirce correlation.

    For two sets X and Y and a population N, the Peirce correlation
    :cite:`Peirce:1884` is

        .. math::

            corr_{Peirce}(X, Y) = \frac{|X \cap Y| \cdot
            |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|X| \cdot |N \setminus X|}

    Both :cite:`Choi:2010` and :cite:`Hubalek:1982` present a different formula
    and incorrectly attribute it to Peirce. Likewise, :cite:`Doolittle:1884`
    presents a different formula and incorrectly attributes it to Peirce. This
    is distinct from the formula he presents and attributes to himself.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Peirce} =
            \frac{ad-bc}{(a+b)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Peirce instance.

        All arguments are forwarded unchanged to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Peirce correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Peirce correlation; 1.0 when ``src == tar`` and 0.0 when the
            numerator ``ad - bc`` is zero

        Examples
        --------
        >>> cmp = Peirce()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.32947729220222793
        >>> cmp.corr('aluminum', 'Catalan')
        0.10209049255441008
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit to perfect correlation without
        # tokenizing.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table, named as in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        num = a * d - b * c
        # Only divide when the numerator is non-zero; a zero numerator is
        # reported as 0.0 without ever evaluating the denominator.
        if num:
            return num / ((a + b) * (c + d))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Peirce similarity of two strings.

        The similarity is the correlation rescaled as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Peirce similarity

        Examples
        --------
        >>> cmp = Peirce()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.664738646101114
        >>> cmp.sim('aluminum', 'Catalan')
        0.5510452462772051
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0
187
188
189
if __name__ == '__main__':

abydos/distance/_unknown_a.py 1 location

@@ 30-183 (lines=154) @@
27
__all__ = ['UnknownA']
28
29
30
class UnknownA(_TokenDistance):
    r"""Unknown A correlation.

    For two sets X and Y and a population N, Unknown A correlation
    is sometimes attributed to :cite:`Peirce:1884`. It differs from
    :py:class:`Peirce` in that the denominator is the product of the opposite
    pair of marginals:

        .. math::

            corr_{UnknownA}(X, Y) = \frac{|X \cap Y| \cdot
            |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|Y| \cdot |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{UnknownA} =
            \frac{ad-bc}{(a+c)(b+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize UnknownA instance.

        All arguments are forwarded unchanged to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Unknown A correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Unknown A correlation; 1.0 when ``src == tar`` and 0.0 when the
            numerator ``ad - bc`` is zero

        Examples
        --------
        >>> cmp = UnknownA()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.39486521181001283
        >>> cmp.corr('aluminum', 'Catalan')
        0.1147039897039897
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit to perfect correlation without
        # tokenizing.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table, named as in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        num = a * d - b * c
        # Only divide when the numerator is non-zero; a zero numerator is
        # reported as 0.0 without ever evaluating the denominator.
        if num:
            return num / ((a + c) * (b + d))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Unknown A similarity of two strings.

        The similarity is the correlation rescaled as ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Unknown A similarity

        Examples
        --------
        >>> cmp = UnknownA()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6974326059050064
        >>> cmp.sim('aluminum', 'Catalan')
        0.5573519948519948
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0
184
185
186
if __name__ == '__main__':

abydos/distance/_andres_marzo_delta.py 1 location

@@ 30-183 (lines=154) @@
27
__all__ = ['AndresMarzoDelta']
28
29
30
class AndresMarzoDelta(_TokenDistance):
    r"""Andres & Marzo's Delta correlation.

    Given two sets X and Y drawn from a population N, the :math:`\Delta`
    correlation of Andres & Marzo :cite:`Andres:2004` is defined as

        .. math::

            corr_{AndresMarzo_\Delta}(X, Y) = \Delta =
            \frac{|X \cap Y| + |(N \setminus X) \setminus Y| -
            2\sqrt{|X \setminus Y| \cdot |Y \setminus X|}}{|N|}


    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, the same quantity is

        .. math::

            corr_{AndresMarzo_\Delta} = \Delta =
            \frac{a+d-2\sqrt{b \cdot c}}{n}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize AndresMarzoDelta instance.

        Every argument is passed straight through to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, as a result, the set type); see the
            :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta correlation

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.corr('cat', 'hat')
        0.9897959183673469
        >>> cmp.corr('Niall', 'Neil')
        0.9822344346552608
        >>> cmp.corr('aluminum', 'Catalan')
        0.9618259496215341
        >>> cmp.corr('ATCG', 'TAGC')
        0.9744897959183674


        .. versionadded:: 0.4.0

        """
        # Equal inputs are reported as perfectly correlated, untokenized.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # 2x2 confusion table cells (a, b, c, d) and the population size n.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        delta_num = a + d - 2 * (b * c) ** 0.5

        # A zero numerator yields 0.0 directly, skipping the division.
        return delta_num / n if delta_num != 0.0 else 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta similarity of two strings.

        The correlation is shifted and halved: ``(corr + 1) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta similarity

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.sim('cat', 'hat')
        0.9948979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.9911172173276304
        >>> cmp.sim('aluminum', 'Catalan')
        0.980912974810767
        >>> cmp.sim('ATCG', 'TAGC')
        0.9872448979591837


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (correlation + 1) / 2
184
185
186
if __name__ == '__main__':

abydos/distance/_benini_i.py 1 location

@@ 30-181 (lines=152) @@
27
__all__ = ['BeniniI']
28
29
30
class BeniniI(_TokenDistance):
    r"""Benini I correlation.

    Given two sets X and Y drawn from a population N, the Benini I
    correlation, also called Benini's Index of Attraction,
    :cite:`Benini:1901` is

        .. math::

            corr_{BeniniI}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}{|Y| \cdot |N \setminus X|}


    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, the same quantity is

        .. math::

            corr_{BeniniI} = \frac{ad-bc}{(a+c)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BeniniI instance.

        Every argument is passed straight through to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, as a result, the set type); see the
            :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Benini I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I correlation

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3953727506426735
        >>> cmp.corr('aluminum', 'Catalan')
        0.11485180412371133
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        # Equal inputs are reported as perfectly correlated, untokenized.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # 2x2 confusion table cells, named as in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        ad_minus_bc = a * d - b * c

        # Divide only for a non-zero numerator; otherwise report 0.0.
        if ad_minus_bc != 0.0:
            return ad_minus_bc / ((a + c) * (c + d))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Benini I similarity of two strings.

        The correlation is shifted and halved: ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I similarity

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6976863753213367
        >>> cmp.sim('aluminum', 'Catalan')
        0.5574259020618557
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
182
183
184
if __name__ == '__main__':

abydos/distance/_dispersion.py 1 location

@@ 30-179 (lines=150) @@
27
__all__ = ['Dispersion']
28
29
30
class Dispersion(_TokenDistance):
    r"""Dispersion correlation.

    Given two sets X and Y drawn from a population N, the dispersion
    correlation :cite:`IBM:2017` is defined as

        .. math::

            corr_{dispersion}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|N|^2}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, the same quantity is

        .. math::

            corr_{dispersion} =
            \frac{ad-bc}{n^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Dispersion instance.

        Every argument is passed straight through to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, as a result, the set type); see the
            :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Dispersion correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion correlation

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.corr('cat', 'hat')
        0.002524989587671803
        >>> cmp.corr('Niall', 'Neil')
        0.002502212619741774
        >>> cmp.corr('aluminum', 'Catalan')
        0.0011570449105440383
        >>> cmp.corr('ATCG', 'TAGC')
        -4.06731570179092e-05


        .. versionadded:: 0.4.0

        """
        # No equal-input shortcut here: the inputs are always tokenized.
        self._tokenize(src, tar)

        # 2x2 confusion table cells (a, b, c, d) and the population size n.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        numerator = a * d - b * c

        # A zero numerator yields 0.0 directly, skipping the division.
        return numerator / n ** 2 if numerator != 0.0 else 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Dispersion similarity of two strings.

        The correlation is shifted and halved: ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion similarity

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.sim('cat', 'hat')
        0.5012624947938359
        >>> cmp.sim('Niall', 'Neil')
        0.5012511063098709
        >>> cmp.sim('aluminum', 'Catalan')
        0.500578522455272
        >>> cmp.sim('ATCG', 'TAGC')
        0.499979663421491


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
180
181
182
if __name__ == '__main__':

abydos/distance/_warrens_iii.py 1 location

@@ 30-179 (lines=150) @@
27
__all__ = ['WarrensIII']
28
29
30
class WarrensIII(_TokenDistance):
    r"""Warrens III correlation.

    Given two sets X and Y drawn from a population N, the Warrens III
    correlation :math:`S_{NS3}` :cite:`Warrens:2008` is defined as

        .. math::

            corr_{WarrensIII}(X, Y) =
            \frac{2|(N \setminus X) \setminus Y| - |X \setminus Y| -
            |Y \setminus X|}{|N \setminus X| + |N \setminus Y|}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, the same quantity is

        .. math::

            corr_{WarrensIII} =
            \frac{2d-b-c}{2d+b+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize WarrensIII instance.

        Every argument is passed straight through to the
        :py:class:`_TokenDistance` initializer.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, as a result, the set type); see the
            :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Warrens III correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Warrens III correlation

        Examples
        --------
        >>> cmp = WarrensIII()
        >>> cmp.corr('cat', 'hat')
        0.9948717948717949
        >>> cmp.corr('Niall', 'Neil')
        0.9910083493898523
        >>> cmp.corr('aluminum', 'Catalan')
        0.9806825499034127
        >>> cmp.corr('ATCG', 'TAGC')
        0.9871630295250321


        .. versionadded:: 0.4.0

        """
        # Equal inputs are reported as perfectly correlated, untokenized.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Only the b, c and d cells of the 2x2 confusion table are needed.
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        numerator = 2 * d - b - c

        # A zero numerator yields 0.0 directly, skipping the division.
        if not numerator:
            return 0.0
        return numerator / (2 * d + b + c)

    def sim(self, src: str, tar: str) -> float:
        """Return the Warrens III similarity of two strings.

        The correlation is shifted and halved: ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Warrens III similarity

        Examples
        --------
        >>> cmp = WarrensIII()
        >>> cmp.sim('cat', 'hat')
        0.9974358974358974
        >>> cmp.sim('Niall', 'Neil')
        0.9955041746949261
        >>> cmp.sim('aluminum', 'Catalan')
        0.9903412749517064
        >>> cmp.sim('ATCG', 'TAGC')
        0.993581514762516


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
180
181
182
if __name__ == '__main__':