Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._haase.haase_phonetik()   A

Complexity

Conditions 1

Size

Total Lines 32
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 2
dl 0
loc 32
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._haase.
20
21
Haase Phonetik
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from itertools import product
32 1
from unicodedata import normalize as unicode_normalize
33
34 1
from six import text_type
35 1
from six.moves import range
36
37 1
from ._phonetic import _Phonetic
38
39 1
__all__ = ['Haase', 'haase_phonetik']
40
41
42 1
class Haase(_Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.
    """

    # Letters that are encoded as vowels (digit 9).
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, primary_only=False):
        """Return the Haase Phonetik (numeric output) code(s) for a word.

        While each output code is numeric, it is nevertheless a str.

        Parameters
        ----------
        word : str
            The word to transform
        primary_only : bool
            If True, only the primary code is returned

        Returns
        -------
        tuple
            The Haase Phonetik code(s) as numeric strings. The code of the
            word as spelled comes first, followed by the distinct codes of
            its variant spellings (none when ``primary_only`` is True).

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode('Joachim')
        ('9496',)
        >>> pe.encode('Christoph')
        ('4798293', '8798293')
        >>> pe.encode('Jörg')
        ('974',)
        >>> pe.encode('Smith')
        ('8692',)
        >>> pe.encode('Schmidt')
        ('8692', '4692')

        """

        def _after(word, pos, letters):
            """Return True if word[pos] follows one of the supplied letters.

            Parameters
            ----------
            word : str
                Word to examine
            pos : int
                Position to examine
            letters : set
                Letters to check for

            Returns
            -------
            bool
                True if word[pos] follows one of letters

            """
            return pos > 0 and word[pos - 1] in letters

        def _before(word, pos, letters):
            """Return True if word[pos] precedes one of the supplied letters.

            Parameters
            ----------
            word : str
                Word to examine
            pos : int
                Position to examine
            letters : set
                Letters to check for

            Returns
            -------
            bool
                True if word[pos] precedes one of letters

            """
            return pos + 1 < len(word) and word[pos + 1] in letters

        # Compose first (NFC) so that precomposed umlauts are present when
        # they are transliterated. Decomposing first (NFKD) would split each
        # umlaut into a base letter plus a combining diaeresis, and the
        # replacements below could then never match.
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('\u00df', 'SS')  # ß, if upper() left it in place
        word = word.replace('\u00c4', 'AE')  # Ä
        word = word.replace('\u00d6', 'OE')  # Ö
        word = word.replace('\u00dc', 'UE')  # Ü
        # Decompose whatever is left so that base letters survive the filter
        # below while their diacritics are dropped with every other
        # character that is not in self._uc_set.
        word = unicode_normalize('NFKD', word)
        word = ''.join(c for c in word if c in self._uc_set)

        variants = []
        if primary_only:
            variants = [word]
        else:
            # Split the word into segments. Each segment is a tuple holding
            # the original spelling first and, where a rule applies, one
            # alternative spelling second.
            pos = 0
            if word[:2] == 'CH':
                variants.append(('CH', 'SCH'))
                pos += 2
            len_3_vars = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }
            while pos < len(word):
                if word[pos : pos + 4] == 'ILLE':
                    variants.append(('ILLE', 'I'))
                    pos += 4
                elif word[pos : pos + 3] in len_3_vars:
                    variants.append(
                        (word[pos : pos + 3], len_3_vars[word[pos : pos + 3]])
                    )
                    pos += 3
                elif word[pos : pos + 2] == 'RB':
                    variants.append(('RB', 'RW'))
                    pos += 2
                elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
                    # Only a word-final EAU gets an alternative spelling.
                    variants.append(('EAU', 'O'))
                    pos += 3
                elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
                    # Only a word-final A or O gets an alternative spelling.
                    if word[pos:] == 'O':
                        variants.append(('O', 'OW'))
                    else:
                        variants.append(('A', 'AR'))
                    pos += 1
                else:
                    variants.append((word[pos],))
                    pos += 1

            # Every combination of segment spellings is one variant; the
            # first combination produced is the word as originally spelled.
            variants = [''.join(letters) for letters in product(*variants)]

        def _haase_code(word):
            """Return the numeric Haase code of a single spelling.

            Parameters
            ----------
            word : str
                The (already normalized) spelling to encode

            Returns
            -------
            str
                The code, with consecutive repeated digits collapsed by
                ``self._delete_consecutive_repeats``

            """
            digits = []
            for i, char in enumerate(word):
                if char in self._uc_v_set:
                    digits.append('9')
                elif char == 'B':
                    digits.append('1')
                elif char == 'P':
                    # PH is coded like F; any other P like B.
                    if _before(word, i, {'H'}):
                        digits.append('3')
                    else:
                        digits.append('1')
                elif char in {'D', 'T'}:
                    if _before(word, i, {'C', 'S', 'Z'}):
                        digits.append('8')
                    else:
                        digits.append('2')
                elif char in {'F', 'V', 'W'}:
                    digits.append('3')
                elif char in {'G', 'K', 'Q'}:
                    digits.append('4')
                elif char == 'C':
                    # A word-initial C is coded 4 before a slightly larger
                    # set of letters (additionally L and R) than a C
                    # elsewhere in the word.
                    if i == 0:
                        hard_followers = {
                            'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X',
                        }
                    else:
                        hard_followers = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
                    if _after(word, i, {'S', 'Z'}):
                        digits.append('8')
                    elif _before(word, i, hard_followers):
                        digits.append('4')
                    else:
                        digits.append('8')
                elif char == 'X':
                    if _after(word, i, {'C', 'K', 'Q'}):
                        digits.append('8')
                    else:
                        digits.append('48')
                elif char == 'L':
                    digits.append('5')
                elif char in {'M', 'N'}:
                    digits.append('6')
                elif char == 'R':
                    digits.append('7')
                elif char in {'S', 'Z'}:
                    digits.append('8')
                # Any other letter (e.g. H) contributes nothing to the code.

            return self._delete_consecutive_repeats(''.join(digits))

        # Encode every variant, keeping only the first occurrence of each
        # distinct code so that the original order is preserved.
        seen = set()
        codes = []
        for variant in variants:
            code = _haase_code(variant)
            if code not in seen:
                seen.add(code)
                codes.append(code)
        return tuple(codes)
246
247
248 1
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code(s) for a word.

    This is a wrapper for :py:meth:`Haase.encode`: it creates a fresh
    :py:class:`Haase` instance and delegates to it.

    Parameters
    ----------
    word : str
        The word to transform
    primary_only : bool
        If True, only the primary code is returned

    Returns
    -------
    tuple
        The Haase Phonetik code(s), each of them a numeric string

    Examples
    --------
    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')

    """
    encoder = Haase()
    return encoder.encode(word, primary_only)
280
281
282
if __name__ == '__main__':
    # When executed as a script, run the examples embedded in this module's
    # docstrings as doctests.
    from doctest import testmod

    testmod()
286