Completed
Pull Request — master (#225)
by Chris
09:15
created

abydos.distance._inclusion   A

Complexity

Total Complexity 13

Size/Duplication

Total Lines 128
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 13
eloc 43
dl 0
loc 128
ccs 36
cts 36
cp 1
rs 10
c 0
b 0
f 0

1 Method

Rating   Name   Duplication   Size   Complexity  
D Inclusion.dist() 0 63 13
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._inclusion.
20
21
Bouchard & Pouyez's INClusion Programme
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._distance import _Distance
32 1
from ._levenshtein import Levenshtein
33
34 1
__all__ = ['Inclusion']
35
36
37 1
class Inclusion(_Distance):
    """Inclusion distance.

    The INC Programme, developed by :cite:`Bouchard:1980` designates two
    terms as being "included" when:

        - One name is shorter than the other
        - There are at least 3 common characters
        - There is at most one difference, disregarding unmatching
          prefixes and suffixes

    In addition to these rules, this implementation considers two terms
    as being "included" if they are identical.

    The return value, though a float, can only take one of two values:
    0.0, indicating inclusion, or 1.0, indicating non-inclusion.

    .. versionadded:: 0.4.1
    """

    # Aligner shared by all instances; dist() only uses its alignment()
    # method.
    _lev = Levenshtein()

    def dist(self, src, tar):
        """Return the INClusion Programme value of two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The INC Programme distance: 0.0 if the terms are identical or
            one is "included" in the other, 1.0 otherwise

        Examples
        --------
        >>> cmp = Inclusion()
        >>> round(cmp.dist('cat', 'hat'), 12)
        1.0
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        1.0
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.1

        """
        # Identical terms are treated as included.
        if src == tar:
            return 0.0
        # Inclusion requires one term to be shorter than the other.
        if len(src) == len(tar):
            return 1.0

        # alignment() is unpacked as (difference count, aligned src,
        # aligned tar).
        # NOTE(review): '-' is presumably the gap marker emitted by
        # Levenshtein.alignment; a literal '-' in the input would be
        # indistinguishable from a gap here -- confirm against the aligner.
        diff, src, tar = self._lev.alignment(src, tar)

        src = list(src)
        tar = list(tar)

        # Disregard unmatched prefixes: drop leading columns in which
        # either side is a gap, discounting one difference per column.
        while src and src[0] == '-':
            src.pop(0)
            tar.pop(0)
            diff -= 1
        while tar and tar[0] == '-':
            src.pop(0)
            tar.pop(0)
            diff -= 1

        # Disregard unmatched suffixes: drop trailing gap columns.  These
        # must be removed from the END of the lists; removing from the
        # front would never clear the trailing gap, so the loop would
        # consume the whole alignment and push diff below zero.
        while src and src[-1] == '-':
            src.pop()
            tar.pop()
            diff -= 1
        while tar and tar[-1] == '-':
            src.pop()
            tar.pop()
            diff -= 1

        # At most one difference may remain once the unmatched prefix and
        # suffix have been disregarded.
        if diff > 1:
            return 1.0
        # Remaining columns minus remaining differences is the number of
        # matching characters; at least 3 are required.
        if len(src) - diff < 3:
            return 1.0
        return 0.0
122
123
124
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this
    # module's docstrings.
    from doctest import testmod

    testmod()
128