Completed
Push — master ( 6ed6e1...91db7a )
by Chris
13:26
created

abydos.phonetic.roger_root   A

Complexity

Total Complexity 7

Size/Duplication

Total Lines 219
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 147
dl 0
loc 219
ccs 31
cts 31
cp 1
rs 10
c 0
b 0
f 0
wmc 7

1 Function

Rating   Name   Duplication   Size   Complexity  
C roger_root() 0 177 7
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic.roger_root.
20
21
The phonetic.roger_root module implements the Roger Root phonetic algorithm.
22
"""
23
24 1
from __future__ import unicode_literals
25
26 1
from unicodedata import normalize as unicode_normalize
27
28 1
from six import text_type
29 1
from six.moves import range
30
31 1
from . import _delete_consecutive_repeats
32
33 1
__all__ = ['roger_root']
34
35
36 1
def roger_root(word, max_length=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, described in :cite:`Moore:1977`.

    The word is uppercased, NFKD-normalized and reduced to the letters A-Z.
    The longest matching prefix (up to 4 letters) is coded with the initial
    pattern table; the remainder is then coded left to right, always taking
    the longest matching pattern from the medial pattern table. Finally the
    code is passed through ``_delete_consecutive_repeats``, the ``*``
    placeholders are removed, and the result is optionally zero-padded and
    truncated to ``max_length`` characters.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    # uppercase, normalize, decompose, and filter non-A-Z out
    uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in uc_letters)

    # Both tables are keyed first by pattern length, then by the pattern.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    _init_patterns = {
        4: {'TSCH': '06'},
        3: {'TSH': '06', 'SCH': '06'},
        2: {
            'CE': '0*0',
            'CH': '06',
            'CI': '0*0',
            'CY': '0*0',
            'DG': '07',
            'GF': '08',
            'GM': '03',
            'GN': '02',
            'KN': '02',
            'PF': '08',
            'PH': '08',
            'PN': '02',
            'SH': '06',
            'TS': '0*0',
            'WR': '04',
        },
        1: {
            'A': '1',
            'B': '09',
            'C': '07',
            'D': '01',
            'E': '1',
            'F': '08',
            'G': '07',
            'H': '2',
            'I': '1',
            'J': '3',
            'K': '07',
            'L': '05',
            'M': '03',
            'N': '02',
            'O': '1',
            'P': '09',
            'Q': '07',
            'R': '04',
            'S': '0*0',
            'T': '01',
            'U': '1',
            'V': '08',
            'W': '4',
            'X': '07',
            'Y': '5',
            'Z': '0*0',
        },
    }

    _med_patterns = {
        4: {'TSCH': '6'},
        3: {'TSH': '6', 'SCH': '6'},
        2: {
            'CE': '0',
            'CH': '6',
            'CI': '0',
            'CY': '0',
            'DG': '7',
            'PH': '8',
            'SH': '6',
            'TS': '0',
        },
        1: {
            'B': '9',
            'C': '7',
            'D': '1',
            'F': '8',
            'G': '7',
            'J': '6',
            'K': '7',
            'L': '5',
            'M': '3',
            'N': '2',
            'P': '9',
            'Q': '7',
            'R': '4',
            'S': '0',
            'T': '1',
            'V': '8',
            'X': '7',
            'Z': '0',
            'A': '*',
            'E': '*',
            'H': '*',
            'I': '*',
            'O': '*',
            'U': '*',
            'W': '*',
            'Y': '*',
        },
    }

    # Code fragments are collected here and joined once at the end.
    pieces = []
    pos = 0

    # Do first digit(s) first: longest initial pattern wins. Every letter
    # A-Z has a 1-length entry, so only an empty word matches nothing.
    for num in range(4, 0, -1):
        prefix = word[:num]
        if prefix in _init_patterns[num]:
            pieces.append(_init_patterns[num][prefix])
            pos += num
            break

    # Then code subsequent digits. Every letter A-Z has a 1-length entry in
    # _med_patterns, so the inner loop always matches and pos always
    # advances; the while loop therefore terminates.
    while pos < len(word):
        for num in range(4, 0, -1):  # pragma: no branch
            chunk = word[pos : pos + num]
            if chunk in _med_patterns[num]:
                pieces.append(_med_patterns[num][chunk])
                pos += num
                break

    code = _delete_consecutive_repeats(''.join(pieces))
    # The '*' placeholders have done their job as separators; drop them.
    code = code.replace('*', '')

    if zero_pad:
        # Over-pad, then let the slice below trim to max_length.
        code += '0' * max_length

    return code[:max_length]
213
214
215
# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
219