Completed
Branch master (78a222)
by Chris
14:36
created

abydos.phonetic._alpha_sis   A

Complexity

Total Complexity 14

Size/Duplication

Total Lines 260
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 14
eloc 183
dl 0
loc 260
ccs 50
cts 50
cp 1
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
F alpha_sis() 0 219 14
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._alpha_sis.

The phonetic._alpha_sis module implements IBM's Alpha Search Inquiry System
coding.
"""
24
25 1
from __future__ import unicode_literals
26
27 1
from unicodedata import normalize as unicode_normalize
28
29 1
from six import text_type
30 1
from six.moves import range
31
32 1
__all__ = ['alpha_sis']
33
34
35 1
# Codes for special word-initial letter groups, listed in match priority
# order (digraphs are tried before single letters).
_ALPHA_SIS_INITIALS = (
    ('GF', '08'),
    ('GM', '03'),
    ('GN', '02'),
    ('KN', '02'),
    ('PF', '08'),
    ('PN', '02'),
    ('PS', '00'),
    ('WR', '04'),
    ('A', '1'),
    ('E', '1'),
    ('H', '2'),
    ('I', '1'),
    ('J', '3'),
    ('O', '1'),
    ('U', '1'),
    ('W', '4'),
    ('Y', '5'),
)

# Codes for letter groups anywhere after the initial, listed in match
# priority order (longest patterns first). Each pattern maps to a tuple of
# alternative codes; the first alternative yields the primary coding.
_ALPHA_SIS_BASIC = (
    ('SCH', ('6',)),
    ('CZ', ('70', '6', '0')),
    ('CH', ('6', '70', '0')),
    ('CK', ('7', '6')),
    ('DS', ('0', '10')),
    ('DZ', ('0', '10')),
    ('TS', ('0', '10')),
    ('TZ', ('0', '10')),
    ('CI', ('0',)),
    ('CY', ('0',)),
    ('CE', ('0',)),
    ('SH', ('6',)),
    ('DG', ('7',)),
    ('PH', ('8',)),
    ('C', ('7', '6')),
    ('K', ('7', '6')),
    ('Z', ('0',)),
    ('S', ('0',)),
    ('D', ('1',)),
    ('T', ('1',)),
    ('N', ('2',)),
    ('M', ('3',)),
    ('R', ('4',)),
    ('L', ('5',)),
    ('J', ('6',)),
    ('G', ('7',)),
    ('Q', ('7',)),
    ('X', ('7',)),
    ('F', ('8',)),
    ('V', ('8',)),
    ('B', ('9',)),
    ('P', ('9',)),
)

# Only these characters take part in the encoding; everything else
# (combining marks left over from NFKD, digits, punctuation) is dropped.
_ALPHA_SIS_LETTERS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')


def _alpha_sis_collapse(code):
    """Return code with each run of identical adjacent characters cut to one."""
    return ''.join(
        char
        for idx, char in enumerate(code)
        if idx == 0 or char != code[idx - 1]
    )


def alpha_sis(word, max_length=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z
    before encoding. Letters that have no code (e.g. non-initial vowels)
    act as separators: identical codes on either side of them are both
    kept, whereas directly adjacent identical codes are merged into one.

    :param str word: the word to transform (a byte string is decoded as
        ASCII)
    :param int max_length: the length of the code returned (defaults to
        14); values are clamped to the range 4 to 64, and -1 selects 64
    :returns: the Alpha SIS value(s), each zero-padded or truncated to
        max_length characters
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    word = word.upper()
    if isinstance(word, bytes):
        # Byte strings (the native str on Python 2) are decoded as ASCII so
        # that the Unicode normalization below always receives text.
        word = word.decode('ascii')
    word = unicode_normalize('NFKD', word)
    word = word.replace('ß', 'SS')
    word = ''.join(char for char in word if char in _ALPHA_SIS_LETTERS)

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    # Do special processing for initial substrings; a word without a
    # special initial starts its code with '0'
    codes = ['0']
    pos = 0
    for prefix, initial_code in _ALPHA_SIS_INITIALS:
        if word.startswith(prefix):
            codes = [initial_code]
            pos = len(prefix)
            break

    # Whether or not any special initial codes were encoded, iterate
    # through the rest of the word in the main encoding loop
    while pos < len(word):
        for pattern, alternatives in _ALPHA_SIS_BASIC:
            if word.startswith(pattern, pos):
                # Every alternative forks every code built so far. The
                # alternatives form the outer loop so that the primary
                # coding stays first in the result.
                codes = [
                    code + alternative
                    for alternative in alternatives
                    for code in codes
                ]
                pos += len(pattern)
                break
        else:
            # No pattern matched: record a placeholder so that equal codes
            # on either side of this letter are not merged
            codes = [code + '_' for code in codes]
            pos += 1

    # Merge adjacent duplicates (runs of any length), drop the
    # placeholders, then pad with zeros and trim to max_length
    padding = '0' * max_length
    return tuple(
        (_alpha_sis_collapse(code).replace('_', '') + padding)[:max_length]
        for code in codes
    )
254
255
256
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    import doctest

    doctest.testmod()
260