Code Duplication - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Code Duplication Length = 150-157 lines in 6 locations

abydos/distance/_peirce.py 1 location


__all__ = ['Peirce']


class Peirce(_TokenDistance):
    r"""Peirce correlation.

    For two sets X and Y and a population N, the Peirce correlation
    :cite:`Peirce:1884` is

        .. math::

            corr_{Peirce}(X, Y) = \frac{|X \cap Y| \cdot
            |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|X| \cdot |N \setminus X|}

    Both :cite:`Choi:2010` and :cite:`Hubalek:1982` present a different formula
    and incorrectly attribute it to Peirce. Likewise, :cite:`Doolittle:1884`
    presents a different formula and incorrectly attributes it to Peirce. This
    is distinct from the formula he presents and attributes to himself.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Peirce} =
            \frac{ad-bc}{(a+b)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Peirce instance.

        All arguments are passed through unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Peirce correlation of two strings.

        Equal inputs return 1.0 immediately, without tokenizing. A zero
        numerator (ad-bc) returns 0.0 without evaluating the denominator.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Peirce correlation

        Examples
        --------
        >>> cmp = Peirce()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.32947729220222793
        >>> cmp.corr('aluminum', 'Catalan')
        0.10209049255441008
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table used in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        num = a * d - b * c
        # Only divide when the numerator is non-zero; presumably this also
        # keeps an empty marginal from causing a division by zero.
        if num:
            return num / ((a + b) * (c + d))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Peirce similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` mapped by
        (1 + corr) / 2.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Peirce similarity

        Examples
        --------
        >>> cmp = Peirce()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.664738646101114
        >>> cmp.sim('aluminum', 'Catalan')
        0.5510452462772051
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0


if __name__ == '__main__':

abydos/distance/_unknown_a.py 1 location


__all__ = ['UnknownA']


class UnknownA(_TokenDistance):
    r"""Unknown A correlation.

    For two sets X and Y and a population N, Unknown A correlation
    is sometimes attributed to :cite:`Peirce:1884`. It differs from
    :py:class:`Peirce` in that the denominator is the product of the opposite
    pair of marginals:

        .. math::

            corr_{UnknownA}(X, Y) = \frac{|X \cap Y| \cdot
            |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|Y| \cdot |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{UnknownA} =
            \frac{ad-bc}{(a+c)(b+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize UnknownA instance.

        All arguments are passed through unchanged to
        :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Unknown A correlation of two strings.

        Equal inputs return 1.0 immediately, without tokenizing. A zero
        numerator (ad-bc) returns 0.0 without evaluating the denominator.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Unknown A correlation

        Examples
        --------
        >>> cmp = UnknownA()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.39486521181001283
        >>> cmp.corr('aluminum', 'Catalan')
        0.1147039897039897
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cells of the 2x2 confusion table used in the class docstring.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        num = a * d - b * c
        # Only divide when the numerator is non-zero; presumably this also
        # keeps an empty marginal from causing a division by zero.
        if num:
            return num / ((a + c) * (b + d))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Unknown A similarity of two strings.

        The similarity is the correlation from :py:meth:`corr` mapped by
        (1 + corr) / 2.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Unknown A similarity

        Examples
        --------
        >>> cmp = UnknownA()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6974326059050064
        >>> cmp.sim('aluminum', 'Catalan')
        0.5573519948519948
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0


if __name__ == '__main__':

abydos/distance/_andres_marzo_delta.py 1 location


__all__ = ['AndresMarzoDelta']


class AndresMarzoDelta(_TokenDistance):
    r"""Andres & Marzo's Delta correlation.

    Given two sets X and Y drawn from a population N, Andres & Marzo's
    :math:`\Delta` correlation :cite:`Andres:2004` is defined as

        .. math::

            corr_{AndresMarzo_\Delta}(X, Y) = \Delta =
            \frac{|X \cap Y| + |(N \setminus X) \setminus Y| -
            2\sqrt{|X \setminus Y| \cdot |Y \setminus X|}}{|N|}


    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, this becomes

        .. math::

            corr_{AndresMarzo_\Delta} = \Delta =
            \frac{a+d-2\sqrt{b \cdot c}}{n}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize AndresMarzoDelta instance.

        All arguments are forwarded unchanged to :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the :ref:`alphabet
            <alphabet>` description in :py:class:`_TokenDistance`.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type (and, as a result, the set type);
            see the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance`.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. With tokenizer=None, this makes the
            instance use the QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta correlation of two strings.

        Equal inputs yield 1.0 without tokenizing; a zero numerator yields
        0.0 without dividing.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta correlation

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.corr('cat', 'hat')
        0.9897959183673469
        >>> cmp.corr('Niall', 'Neil')
        0.9822344346552608
        >>> cmp.corr('aluminum', 'Catalan')
        0.9618259496215341
        >>> cmp.corr('ATCG', 'TAGC')
        0.9744897959183674


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # a, b, c, d and n of the confusion-table formula, in that order.
        shared, src_only, tar_only, neither, population = (
            self._intersection_card(),
            self._src_only_card(),
            self._tar_only_card(),
            self._total_complement_card(),
            self._population_unique_card(),
        )

        delta_num = shared + neither - 2 * (src_only * tar_only) ** 0.5

        return delta_num / population if delta_num != 0.0 else 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta similarity of two strings.

        Computed from :py:meth:`corr` as (corr + 1) / 2.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta similarity

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.sim('cat', 'hat')
        0.9948979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.9911172173276304
        >>> cmp.sim('aluminum', 'Catalan')
        0.980912974810767
        >>> cmp.sim('ATCG', 'TAGC')
        0.9872448979591837


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (correlation + 1) / 2


if __name__ == '__main__':

abydos/distance/_benini_i.py 1 location


__all__ = ['BeniniI']


class BeniniI(_TokenDistance):
    r"""Benini I correlation.

    Given two sets X and Y drawn from a population N, Benini I correlation,
    also called Benini's Index of Attraction, :cite:`Benini:1901` is

        .. math::

            corr_{BeniniI}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}{|Y| \cdot |N \setminus X|}


    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, this becomes

        .. math::

            corr_{BeniniI} = \frac{ad-bc}{(a+c)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BeniniI instance.

        All arguments are forwarded unchanged to :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the :ref:`alphabet
            <alphabet>` description in :py:class:`_TokenDistance`.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type (and, as a result, the set type);
            see the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance`.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. With tokenizer=None, this makes the
            instance use the QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Benini I correlation of two strings.

        Equal inputs yield 1.0 without tokenizing; a zero numerator yields
        0.0 without dividing.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I correlation

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3953727506426735
        >>> cmp.corr('aluminum', 'Catalan')
        0.11485180412371133
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # a, b, c and d of the confusion-table formula, in that order.
        shared, src_only, tar_only, neither = (
            self._intersection_card(),
            self._src_only_card(),
            self._tar_only_card(),
            self._total_complement_card(),
        )

        cross_diff = shared * neither - src_only * tar_only

        if cross_diff:
            denominator = (shared + tar_only) * (tar_only + neither)
            return cross_diff / denominator
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Benini I similarity of two strings.

        Computed from :py:meth:`corr` as (1 + corr) / 2.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I similarity

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6976863753213367
        >>> cmp.sim('aluminum', 'Catalan')
        0.5574259020618557
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2


if __name__ == '__main__':

abydos/distance/_dispersion.py 1 location


__all__ = ['Dispersion']


class Dispersion(_TokenDistance):
    r"""Dispersion correlation.

    Given two sets X and Y drawn from a population N, the dispersion
    correlation :cite:`IBM:2017` is

        .. math::

            corr_{dispersion}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|N|^2}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, this becomes

        .. math::

            corr_{dispersion} =
            \frac{ad-bc}{n^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Dispersion instance.

        All arguments are forwarded unchanged to :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the :ref:`alphabet
            <alphabet>` description in :py:class:`_TokenDistance`.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type (and, as a result, the set type);
            see the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance`.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. With tokenizer=None, this makes the
            instance use the QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Dispersion correlation of two strings.

        The inputs are always tokenized; there is no shortcut for equal
        strings. A zero numerator yields 0.0 without dividing.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion correlation

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.corr('cat', 'hat')
        0.002524989587671803
        >>> cmp.corr('Niall', 'Neil')
        0.002502212619741774
        >>> cmp.corr('aluminum', 'Catalan')
        0.0011570449105440383
        >>> cmp.corr('ATCG', 'TAGC')
        -4.06731570179092e-05


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # a, b, c, d and n of the confusion-table formula, in that order.
        shared, src_only, tar_only, neither, population = (
            self._intersection_card(),
            self._src_only_card(),
            self._tar_only_card(),
            self._total_complement_card(),
            self._population_unique_card(),
        )

        cross_diff = shared * neither - src_only * tar_only
        if cross_diff != 0.0:
            return cross_diff / population ** 2
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Dispersion similarity of two strings.

        Computed from :py:meth:`corr` as (1 + corr) / 2.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion similarity

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.sim('cat', 'hat')
        0.5012624947938359
        >>> cmp.sim('Niall', 'Neil')
        0.5012511063098709
        >>> cmp.sim('aluminum', 'Catalan')
        0.500578522455272
        >>> cmp.sim('ATCG', 'TAGC')
        0.499979663421491


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2


if __name__ == '__main__':

abydos/distance/_warrens_iii.py 1 location


__all__ = ['WarrensIII']


class WarrensIII(_TokenDistance):
    r"""Warrens III correlation.

    Given two sets X and Y drawn from a population N, Warrens III correlation
    :math:`S_{NS3}` :cite:`Warrens:2008` is

        .. math::

            corr_{WarrensIII}(X, Y) =
            \frac{2|(N \setminus X) \setminus Y| - |X \setminus Y| -
            |Y \setminus X|}{|N \setminus X| + |N \setminus Y|}

    Expressed in :ref:`2x2 confusion table terms <confusion_table>`, with
    a+b+c+d=n, this becomes

        .. math::

            corr_{WarrensIII} =
            \frac{2d-b-c}{2d+b+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize WarrensIII instance.

        All arguments are forwarded unchanged to :py:class:`_TokenDistance`.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the :ref:`alphabet
            <alphabet>` description in :py:class:`_TokenDistance`.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type (and, as a result, the set type);
            see the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance`.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. With tokenizer=None, this makes the
            instance use the QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Warrens III correlation of two strings.

        Equal inputs yield 1.0 without tokenizing; a zero numerator yields
        0.0 without dividing.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Warrens III correlation

        Examples
        --------
        >>> cmp = WarrensIII()
        >>> cmp.corr('cat', 'hat')
        0.9948717948717949
        >>> cmp.corr('Niall', 'Neil')
        0.9910083493898523
        >>> cmp.corr('aluminum', 'Catalan')
        0.9806825499034127
        >>> cmp.corr('ATCG', 'TAGC')
        0.9871630295250321


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # b, c and d of the confusion-table formula, in that order.
        src_only, tar_only, neither = (
            self._src_only_card(),
            self._tar_only_card(),
            self._total_complement_card(),
        )

        numerator = 2 * neither - src_only - tar_only
        if not numerator:
            return 0.0
        return numerator / (2 * neither + src_only + tar_only)

    def sim(self, src: str, tar: str) -> float:
        """Return the Warrens III similarity of two strings.

        Computed from :py:meth:`corr` as (1 + corr) / 2.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Warrens III similarity

        Examples
        --------
        >>> cmp = WarrensIII()
        >>> cmp.sim('cat', 'hat')
        0.9974358974358974
        >>> cmp.sim('Niall', 'Neil')
        0.9955041746949261
        >>> cmp.sim('aluminum', 'Catalan')
        0.9903412749517064
        >>> cmp.sim('ATCG', 'TAGC')
        0.993581514762516


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0


if __name__ == '__main__':

		@@ 30-186 (lines=157) @@
27		__all__ = ['Peirce']
28
29
30		class Peirce(_TokenDistance):
31		r"""Peirce correlation.
32
33		For two sets X and Y and a population N, the Peirce correlation
34		:cite:`Peirce:1884` is
35
36		.. math::
37
38		corr_{Peirce}(X, Y) = \frac{|X \cap Y| \cdot
39		|(N \setminus X) \setminus Y| -
40		|X \setminus Y| \cdot |Y \setminus X|}
41		{|X| \cdot |N \setminus X|}
42
43		Both :cite:`Choi:2010` and :cite:`Hubalek:1982` present a different formula
44		and incorrectly attribute it to Peirce. Likewise, :cite:`Doolittle:1884`
45		presents a different formula and incorrectly attributes it to Peirce. This
46		is distinct from the formula he presents and attributes to himself.
47
48		In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
49		this is
50
51		.. math::
52
53		corr_{Peirce} =
54		\frac{ad-bc}{(a+b)(c+d)}
55
56		.. versionadded:: 0.4.0
57		"""
58
59		def __init__(
60		self,
61		alphabet: Optional[
62		Union[TCounter[str], Sequence[str], Set[str], int]
63		] = None,
64		tokenizer: Optional[_Tokenizer] = None,
65		intersection_type: str = 'crisp',
66		**kwargs: Any
67		) -> None:
68		"""Initialize Peirce instance.
69
70		Parameters
71		----------
72		alphabet : Counter, collection, int, or None
73		This represents the alphabet of possible tokens.
74		See :ref:`alphabet <alphabet>` description in
75		:py:class:`_TokenDistance` for details.
76		tokenizer : _Tokenizer
77		A tokenizer instance from the :py:mod:`abydos.tokenizer` package
78		intersection_type : str
79		Specifies the intersection type, and set type as a result:
80		See :ref:`intersection_type <intersection_type>` description in
81		:py:class:`_TokenDistance` for details.
82		**kwargs
83		Arbitrary keyword arguments
84
85		Other Parameters
86		----------------
87		qval : int
88		The length of each q-gram. Using this parameter and tokenizer=None
89		will cause the instance to use the QGram tokenizer with this
90		q value.
91		metric : _Distance
92		A string distance measure class for use in the ``soft`` and
93		``fuzzy`` variants.
94		threshold : float
95		A threshold value, similarities above which are counted as
96		members of the intersection for the ``fuzzy`` variant.
97
98
99		.. versionadded:: 0.4.0
100
101		"""
102		super(Peirce, self).__init__(
103		alphabet=alphabet,
104		tokenizer=tokenizer,
105		intersection_type=intersection_type,
106		**kwargs
107		)
108
109		def corr(self, src: str, tar: str) -> float:
110		"""Return the Peirce correlation of two strings.
111
112		Parameters
113		----------
114		src : str
115		Source string (or QGrams/Counter objects) for comparison
116		tar : str
117		Target string (or QGrams/Counter objects) for comparison
118
119		Returns
120		-------
121		float
122		Peirce correlation
123
124		Examples
125		--------
126		>>> cmp = Peirce()
127		>>> cmp.corr('cat', 'hat')
128		0.49743589743589745
129		>>> cmp.corr('Niall', 'Neil')
130		0.32947729220222793
131		>>> cmp.corr('aluminum', 'Catalan')
132		0.10209049255441008
133		>>> cmp.corr('ATCG', 'TAGC')
134		-0.006418485237483954
135
136
137		.. versionadded:: 0.4.0
138
139		"""
140		if src == tar:
141		return 1.0
142
143		self._tokenize(src, tar)
144
145		a = self._intersection_card()
146		b = self._src_only_card()
147		c = self._tar_only_card()
148		d = self._total_complement_card()
149
150		num = a * d - b * c
151		if num:
152		return num / ((a + b) * (c + d))
153		return 0.0
154
155		def sim(self, src: str, tar: str) -> float:
156		"""Return the Peirce similarity of two strings.
157
158		Parameters
159		----------
160		src : str
161		Source string (or QGrams/Counter objects) for comparison
162		tar : str
163		Target string (or QGrams/Counter objects) for comparison
164
165		Returns
166		-------
167		float
168		Peirce similarity
169
170		Examples
171		--------
172		>>> cmp = Peirce()
173		>>> cmp.sim('cat', 'hat')
174		0.7487179487179487
175		>>> cmp.sim('Niall', 'Neil')
176		0.664738646101114
177		>>> cmp.sim('aluminum', 'Catalan')
178		0.5510452462772051
179		>>> cmp.sim('ATCG', 'TAGC')
180		0.496790757381258
181
182
183		.. versionadded:: 0.4.0
184
185		"""
186		return (1.0 + self.corr(src, tar)) / 2.0
187
188
189		if __name__ == '__main__':

		@@ 30-183 (lines=154) @@
27		__all__ = ['UnknownA']
28
29
30		class UnknownA(_TokenDistance):
31		r"""Unknown A correlation.
32
33		For two sets X and Y and a population N, Unknown A correlation
34		is sometimes attributed to :cite:`Peirce:1884`. It differs from
35		:py:class:`Peirce` in that the denominator is the product of the opposite
36		pair of marginals:
37
38		.. math::
39
40		corr_{UnknownA}(X, Y) = \frac{|X \cap Y| \cdot
41		|(N \setminus X) \setminus Y| -
42		|X \setminus Y| \cdot |Y \setminus X|}
43		{|Y| \cdot |N \setminus Y|}
44
45		In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
46		this is
47
48		.. math::
49
50		corr_{UnknownA} =
51		\frac{ad-bc}{(a+c)(b+d)}
52
53		.. versionadded:: 0.4.0
54		"""
55
56		def __init__(
57		self,
58		alphabet: Optional[
59		Union[TCounter[str], Sequence[str], Set[str], int]
60		] = None,
61		tokenizer: Optional[_Tokenizer] = None,
62		intersection_type: str = 'crisp',
63		**kwargs: Any
64		) -> None:
65		"""Initialize UnknownA instance.
66
67		Parameters
68		----------
69		alphabet : Counter, collection, int, or None
70		This represents the alphabet of possible tokens.
71		See :ref:`alphabet <alphabet>` description in
72		:py:class:`_TokenDistance` for details.
73		tokenizer : _Tokenizer
74		A tokenizer instance from the :py:mod:`abydos.tokenizer` package
75		intersection_type : str
76		Specifies the intersection type, and set type as a result:
77		See :ref:`intersection_type <intersection_type>` description in
78		:py:class:`_TokenDistance` for details.
79		**kwargs
80		Arbitrary keyword arguments
81
82		Other Parameters
83		----------------
84		qval : int
85		The length of each q-gram. Using this parameter and tokenizer=None
86		will cause the instance to use the QGram tokenizer with this
87		q value.
88		metric : _Distance
89		A string distance measure class for use in the ``soft`` and
90		``fuzzy`` variants.
91		threshold : float
92		A threshold value, similarities above which are counted as
93		members of the intersection for the ``fuzzy`` variant.
94
95
96		.. versionadded:: 0.4.0
97
98		"""
99		super(UnknownA, self).__init__(
100		alphabet=alphabet,
101		tokenizer=tokenizer,
102		intersection_type=intersection_type,
103		**kwargs
104		)
105
106		def corr(self, src: str, tar: str) -> float:
107		"""Return the Unknown A correlation of two strings.
108
109		Parameters
110		----------
111		src : str
112		Source string (or QGrams/Counter objects) for comparison
113		tar : str
114		Target string (or QGrams/Counter objects) for comparison
115
116		Returns
117		-------
118		float
119		Unknown A correlation
120
121		Examples
122		--------
123		>>> cmp = UnknownA()
124		>>> cmp.corr('cat', 'hat')
125		0.49743589743589745
126		>>> cmp.corr('Niall', 'Neil')
127		0.39486521181001283
128		>>> cmp.corr('aluminum', 'Catalan')
129		0.1147039897039897
130		>>> cmp.corr('ATCG', 'TAGC')
131		-0.006418485237483954
132
133
134		.. versionadded:: 0.4.0
135
136		"""
137		if src == tar:
138		return 1.0
139
140		self._tokenize(src, tar)
141
142		a = self._intersection_card()
143		b = self._src_only_card()
144		c = self._tar_only_card()
145		d = self._total_complement_card()
146
147		num = a * d - b * c
148		if num:
149		return num / ((a + c) * (b + d))
150		return 0.0
151
152		def sim(self, src: str, tar: str) -> float:
153		"""Return the Unknown A similarity of two strings.
154
155		Parameters
156		----------
157		src : str
158		Source string (or QGrams/Counter objects) for comparison
159		tar : str
160		Target string (or QGrams/Counter objects) for comparison
161
162		Returns
163		-------
164		float
165		Unknown A similarity
166
167		Examples
168		--------
169		>>> cmp = UnknownA()
170		>>> cmp.sim('cat', 'hat')
171		0.7487179487179487
172		>>> cmp.sim('Niall', 'Neil')
173		0.6974326059050064
174		>>> cmp.sim('aluminum', 'Catalan')
175		0.5573519948519948
176		>>> cmp.sim('ATCG', 'TAGC')
177		0.496790757381258
178
179
180		.. versionadded:: 0.4.0
181
182		"""
183		return (1.0 + self.corr(src, tar)) / 2.0
184
185
186		if __name__ == '__main__':

		@@ 30-183 (lines=154) @@
27		__all__ = ['AndresMarzoDelta']
28
29
class AndresMarzoDelta(_TokenDistance):
    r"""Andres & Marzo's Delta correlation.

    For two sets X and Y and a population N, Andres & Marzo's :math:`\Delta`
    correlation :cite:`Andres:2004` is

        .. math::

            corr_{AndresMarzo_\Delta}(X, Y) = \Delta =
            \frac{|X \cap Y| + |(N \setminus X) \setminus Y| -
            2\sqrt{|X \setminus Y| \cdot |Y \setminus X|}}{|N|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{AndresMarzo_\Delta} = \Delta =
            \frac{a+d-2\sqrt{b \cdot c}}{n}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize AndresMarzoDelta instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens. See the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type and, as a result, the set type.
            See the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta correlation

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.corr('cat', 'hat')
        0.9897959183673469
        >>> cmp.corr('Niall', 'Neil')
        0.9822344346552608
        >>> cmp.corr('aluminum', 'Catalan')
        0.9618259496215341
        >>> cmp.corr('ATCG', 'TAGC')
        0.9744897959183674


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit to a perfect correlation.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Confusion-table cells (a, b, c, d) and population size (n).
        shared = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()
        population = self._population_unique_card()

        delta_num = shared + neither - 2 * (src_only * tar_only) ** 0.5

        # Only divide when the numerator is non-zero.
        if delta_num != 0.0:
            return delta_num / population
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta similarity of two strings.

        The similarity is computed from the correlation as
        ``(corr + 1) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta similarity

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.sim('cat', 'hat')
        0.9948979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.9911172173276304
        >>> cmp.sim('aluminum', 'Catalan')
        0.980912974810767
        >>> cmp.sim('ATCG', 'TAGC')
        0.9872448979591837


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (correlation + 1) / 2
184
185
186		if __name__ == '__main__':

		@@ 30-181 (lines=152) @@
27		__all__ = ['BeniniI']
28
29
class BeniniI(_TokenDistance):
    r"""Benini I correlation.

    For two sets X and Y and a population N, Benini I correlation, Benini's
    Index of Attraction, :cite:`Benini:1901` is

        .. math::

            corr_{BeniniI}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}{|Y| \cdot |N \setminus X|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{BeniniI} = \frac{ad-bc}{(a+c)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BeniniI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens. See the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type and, as a result, the set type.
            See the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Benini I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I correlation

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3953727506426735
        >>> cmp.corr('aluminum', 'Catalan')
        0.11485180412371133
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit to a perfect correlation.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Confusion-table cells a, b, c and d respectively.
        shared = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        cross_diff = shared * neither - src_only * tar_only

        # Only divide when the numerator is non-zero.
        if cross_diff != 0.0:
            return cross_diff / ((shared + tar_only) * (tar_only + neither))
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Benini I similarity of two strings.

        The similarity is computed from the correlation as
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I similarity

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6976863753213367
        >>> cmp.sim('aluminum', 'Catalan')
        0.5574259020618557
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
182
183
184		if __name__ == '__main__':

		@@ 30-179 (lines=150) @@
27		__all__ = ['Dispersion']
28
29
class Dispersion(_TokenDistance):
    r"""Dispersion correlation.

    For two sets X and Y and a population N, the dispersion
    correlation :cite:`IBM:2017` is

        .. math::

            corr_{dispersion}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|N|^2}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{dispersion} =
            \frac{ad-bc}{n^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Dispersion instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens. See the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type and, as a result, the set type.
            See the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Dispersion correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion correlation

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.corr('cat', 'hat')
        0.002524989587671803
        >>> cmp.corr('Niall', 'Neil')
        0.002502212619741774
        >>> cmp.corr('aluminum', 'Catalan')
        0.0011570449105440383
        >>> cmp.corr('ATCG', 'TAGC')
        -4.06731570179092e-05


        .. versionadded:: 0.4.0

        """
        # There is no equal-input shortcut here; the value always comes from
        # the token cardinalities.
        self._tokenize(src, tar)

        # Confusion-table cells (a, b, c, d) and population size (n).
        shared = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()
        population = self._population_unique_card()

        cross_diff = shared * neither - src_only * tar_only

        # Only divide when the numerator is non-zero.
        if cross_diff != 0.0:
            return cross_diff / population ** 2
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Dispersion similarity of two strings.

        The similarity is computed from the correlation as
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion similarity

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.sim('cat', 'hat')
        0.5012624947938359
        >>> cmp.sim('Niall', 'Neil')
        0.5012511063098709
        >>> cmp.sim('aluminum', 'Catalan')
        0.500578522455272
        >>> cmp.sim('ATCG', 'TAGC')
        0.499979663421491


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
180
181
182		if __name__ == '__main__':

		@@ 30-179 (lines=150) @@
27		__all__ = ['WarrensIII']
28
29
class WarrensIII(_TokenDistance):
    r"""Warrens III correlation.

    For two sets X and Y and a population N, Warrens III correlation
    :math:`S_{NS3}` :cite:`Warrens:2008` is

        .. math::

            corr_{WarrensIII}(X, Y) =
            \frac{2|(N \setminus X) \setminus Y| - |X \setminus Y| -
            |Y \setminus X|}{|N \setminus X| + |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{WarrensIII} =
            \frac{2d-b-c}{2d+b+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize WarrensIII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens. See the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Selects the intersection type and, as a result, the set type.
            See the :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Warrens III correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Warrens III correlation

        Examples
        --------
        >>> cmp = WarrensIII()
        >>> cmp.corr('cat', 'hat')
        0.9948717948717949
        >>> cmp.corr('Niall', 'Neil')
        0.9910083493898523
        >>> cmp.corr('aluminum', 'Catalan')
        0.9806825499034127
        >>> cmp.corr('ATCG', 'TAGC')
        0.9871630295250321


        .. versionadded:: 0.4.0

        """
        # Equal inputs short-circuit to a perfect correlation.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Confusion-table cells b, c and d; cell a is not used by this measure.
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        numerator = 2 * neither - src_only - tar_only

        # Only divide when the numerator is non-zero.
        if not numerator:
            return 0.0
        return numerator / (2 * neither + src_only + tar_only)

    def sim(self, src: str, tar: str) -> float:
        """Return the Warrens III similarity of two strings.

        The similarity is computed from the correlation as
        ``(1 + corr) / 2``.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Warrens III similarity

        Examples
        --------
        >>> cmp = WarrensIII()
        >>> cmp.sim('cat', 'hat')
        0.9974358974358974
        >>> cmp.sim('Niall', 'Neil')
        0.9955041746949261
        >>> cmp.sim('aluminum', 'Catalan')
        0.9903412749517064
        >>> cmp.sim('ATCG', 'TAGC')
        0.993581514762516


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
180
181
182		if __name__ == '__main__':

chrislit / abydos

Code Duplication Length = 150-157 lines in 6 locations

abydos/distance/_peirce.py 1 location

abydos/distance/_unknown_a.py 1 location

abydos/distance/_andres_marzo_delta.py 1 location

abydos/distance/_benini_i.py 1 location

abydos/distance/_dispersion.py 1 location

abydos/distance/_warrens_iii.py 1 location