Completed
Pull Request — master (#141)
by Chris
11:04
created

abydos.phonetic._phonix   A

Complexity

Total Complexity 26

Size/Duplication

Total Lines 365
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 194
dl 0
loc 365
ccs 73
cts 73
cp 1
rs 10
c 0
b 0
f 0
wmc 26

2 Methods

Rating   Name   Duplication   Size   Complexity  
B Phonix.__init__() 0 118 1
F Phonix.encode() 0 149 24

1 Function

Rating   Name   Duplication   Size   Complexity  
A phonix() 0 26 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._phonix.
20
21
Phonix
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ._phonetic import Phonetic
36
37 1
__all__ = ['Phonix', 'phonix']
38
39
40 1
class Phonix(Phonetic):
    """Phonix code.

    Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.

    This implementation is based on:
    - :cite:`Pfeifer:2000`
    - :cite:`Christen:2011`
    - :cite:`Kollar:2007`

    A word is upper-cased and stripped to the letters A-Z, rewritten by an
    ordered list of spelling substitutions, and then reduced to its first
    letter (or ``v`` for an initial vowel or Y) followed by digit codes.
    """

    # Upper-case consonants; computed per instance in __init__.
    _uc_c_set = None

    # Ordered substitution rules; built per instance in __init__.
    _substitutions = None

    # Maps the code point of each upper-case letter to its Phonix digit.
    _trans = dict(
        zip(
            (ord(letter) for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230720022455012683070808',
        )
    )

    def __init__(self):
        """Initialize Phonix.

        Builds the consonant set and the ordered substitution table. Each
        rule is a tuple ``(position, src, tar[, context...])`` where
        ``position`` selects where ``src`` may be replaced by ``tar``:

        - 0: at the start of the word (optional context: following letters)
        - 1: at the end of the word (optional context: preceding letters)
        - 2: in the middle of the word (optional contexts: preceding
          letters, then following letters)
        - 3: anywhere in the word

        The rules are applied strictly in the order listed.
        """
        base = super(Phonix, self)
        v_set = base._uc_v_set
        self._uc_c_set = base._uc_set - v_set
        c_set = self._uc_c_set

        self._substitutions = (
            (3, 'DG', 'G'),
            (3, 'CO', 'KO'),
            (3, 'CA', 'KA'),
            (3, 'CU', 'KU'),
            (3, 'CY', 'SI'),
            (3, 'CI', 'SI'),
            (3, 'CE', 'SE'),
            (0, 'CL', 'KL', v_set),
            (3, 'CK', 'K'),
            (1, 'GC', 'K'),
            (1, 'JC', 'K'),
            (0, 'CHR', 'KR', v_set),
            (0, 'CR', 'KR', v_set),
            (0, 'WR', 'R'),
            (3, 'NC', 'NK'),
            (3, 'CT', 'KT'),
            (3, 'PH', 'F'),
            (3, 'AA', 'AR'),
            (3, 'SCH', 'SH'),
            (3, 'BTL', 'TL'),
            (3, 'GHT', 'T'),
            (3, 'AUGH', 'ARF'),
            (2, 'LJ', 'LD', v_set, v_set),
            (3, 'LOUGH', 'LOW'),
            (0, 'Q', 'KW'),
            (0, 'KN', 'N'),
            (1, 'GN', 'N'),
            (3, 'GHN', 'N'),
            (1, 'GNE', 'N'),
            (3, 'GHNE', 'NE'),
            (1, 'GNES', 'NS'),
            (0, 'GN', 'N'),
            (2, 'GN', 'N', None, c_set),
            (1, 'GN', 'N'),
            (0, 'PS', 'S'),
            (0, 'PT', 'T'),
            (0, 'CZ', 'C'),
            (2, 'WZ', 'Z', v_set),
            (2, 'CZ', 'CH'),
            (3, 'LZ', 'LSH'),
            (3, 'RZ', 'RSH'),
            (2, 'Z', 'S', None, v_set),
            (3, 'ZZ', 'TS'),
            (2, 'Z', 'TS', c_set),
            (3, 'HROUG', 'REW'),
            (3, 'OUGH', 'OF'),
            (2, 'Q', 'KW', v_set, v_set),
            (2, 'J', 'Y', v_set, v_set),
            (0, 'YJ', 'Y', v_set),
            (0, 'GH', 'G'),
            (1, 'GH', 'E', v_set),
            (0, 'CY', 'S'),
            (3, 'NX', 'NKS'),
            (0, 'PF', 'F'),
            (1, 'DT', 'T'),
            (1, 'TL', 'TIL'),
            (1, 'DL', 'DIL'),
            (3, 'YTH', 'ITH'),
            (0, 'TJ', 'CH', v_set),
            (0, 'TSJ', 'CH', v_set),
            (0, 'TS', 'T', v_set),
            (3, 'TCH', 'CH'),
            (2, 'WSK', 'VSKIE', v_set),
            (1, 'WSK', 'VSKIE', v_set),
            (0, 'MN', 'N', v_set),
            (0, 'PN', 'N', v_set),
            (2, 'STL', 'SL', v_set),
            (1, 'STL', 'SL', v_set),
            (1, 'TNT', 'ENT'),
            (1, 'EAUX', 'OH'),
            (3, 'EXCI', 'ECS'),
            (3, 'X', 'ECS'),
            (1, 'NED', 'ND'),
            (3, 'JR', 'DR'),
            (1, 'EE', 'EA'),
            (3, 'ZS', 'S'),
            (2, 'R', 'AH', v_set, c_set),
            (1, 'R', 'AH', v_set),
            (2, 'HR', 'AH', v_set, c_set),
            (1, 'HR', 'AH', v_set),
            (1, 'HR', 'AH', v_set),
            (1, 'RE', 'AR'),
            (1, 'R', 'AH', v_set),
            (3, 'LLE', 'LE'),
            (1, 'LE', 'ILE', c_set),
            (1, 'LES', 'ILES', c_set),
            (1, 'E', ''),
            (1, 'ES', 'S'),
            (1, 'SS', 'AS', v_set),
            (1, 'MB', 'M', v_set),
            (3, 'MPTS', 'MPS'),
            (3, 'MPS', 'MS'),
            (3, 'MPT', 'MT'),
        )

    @staticmethod
    def _start_repl(word, src, tar, post=None):
        """Replace src with tar at the start of word.

        Args:
            word (str): The word to modify
            src (str): Substring to match
            tar (str): Substring to substitute
            post (set): Following characters; when given, src must be
                immediately followed by one of them

        Returns:
            str: Modified string

        """
        if post:
            matched = word.startswith(tuple(src + char for char in post))
        else:
            matched = word.startswith(src)
        if matched:
            return tar + word[len(src) :]
        return word

    @staticmethod
    def _end_repl(word, src, tar, pre=None):
        """Replace src with tar at the end of word.

        Args:
            word (str): The word to modify
            src (str): Substring to match
            tar (str): Substring to substitute
            pre (set): Preceding characters; when given, src must be
                immediately preceded by one of them

        Returns:
            str: Modified string

        """
        if pre:
            matched = word.endswith(tuple(char + src for char in pre))
        else:
            matched = word.endswith(src)
        if matched:
            return word[: -len(src)] + tar
        return word

    @staticmethod
    def _all_repl(word, src, tar, pre=None, post=None):
        """Replace src with tar anywhere in word.

        Args:
            word (str): The word to modify
            src (str): Substring to match
            tar (str): Substring to substitute
            pre (set): Preceding characters
            post (set): Following characters

        Returns:
            str: Modified string

        """
        if not (pre or post):
            return word.replace(src, tar)

        # A missing context is modelled as the empty string, i.e. "no
        # constraint on that side".
        before = pre if pre else ('',)
        after = post if post else ('',)
        for lead in before:
            for trail in after:
                word = word.replace(lead + src + trail, lead + tar + trail)
        return word

    @staticmethod
    def _mid_repl(word, src, tar, pre=None, post=None):
        """Replace src with tar in the middle of word.

        The first and/or last character is shielded from the replacement
        unless a context set already forces a neighbouring character on
        that side.

        Args:
            word (str): The word to modify
            src (str): Substring to match
            tar (str): Substring to substitute
            pre (set): Preceding characters
            post (set): Following characters

        Returns:
            str: Modified string

        """
        # A word shorter than two letters has no middle. Without this guard
        # the context-free case below would return word[0] + word[-1] and
        # thereby double a one-letter word (and fail on an empty one).
        if len(word) < 2:
            return word
        if pre and post:
            return Phonix._all_repl(word, src, tar, pre, post)
        if post:
            return word[0] + Phonix._all_repl(word[1:], src, tar, pre, post)
        if pre:
            return (
                Phonix._all_repl(word[:-1], src, tar, pre, post) + word[-1]
            )
        return word[0] + Phonix._all_repl(word[1:-1], src, tar) + word[-1]

    def encode(self, word, max_length=4, zero_pad=True):
        """Return the Phonix code for a word.

        Args:
            word (str): The word to transform
            max_length (int): The length of the code returned (defaults to
                4); values are clamped to the range 4..64 and -1 selects 64
            zero_pad (bool): pad the end of the return value with 0s to
                achieve a max_length string

        Returns:
            str: The Phonix value

        Examples:
            >>> pe = Phonix()
            >>> pe.encode('Christopher')
            'K683'
            >>> pe.encode('Niall')
            'N400'
            >>> pe.encode('Smith')
            'S530'
            >>> pe.encode('Schmidt')
            'S530'
            >>> pe.encode('B')
            'B000'

        """
        # Indexed by the position flag stored first in each rule.
        repl_at = (
            self._start_repl,
            self._end_repl,
            self._mid_repl,
            self._all_repl,
        )

        sdx = ''

        # Upper-case, decompose accented letters, and keep only A-Z.
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)
        if word:
            for rule in self._substitutions:
                replaced = repl_at[rule[0]](word, *rule[1:])
                # Skip a rule that would erase the whole word (only the
                # final-E rule can, on the one-letter word 'E') so the
                # first-letter lookup below always has a letter to read.
                if replaced:
                    word = replaced
            # An initial vowel or Y is represented by the marker 'v'.
            if word[0] in self._uc_vy_set:
                sdx = 'v' + word[1:].translate(self._trans)
            else:
                sdx = word[0] + word[1:].translate(self._trans)
            sdx = self._delete_consecutive_repeats(sdx)
            sdx = sdx.replace('0', '')

        # Clamp max_length to [4, 64]
        if max_length != -1:
            max_length = min(max(4, max_length), 64)
        else:
            max_length = 64

        if zero_pad:
            sdx += '0' * max_length
        if not sdx:
            sdx = '0'
        return sdx[:max_length]
331
332
333 1
def phonix(word, max_length=4, zero_pad=True):
    """Return the Phonix code for a word.

    Convenience function that creates a :py:class:`Phonix` instance and
    delegates to :py:meth:`Phonix.encode`.

    Args:
        word (str): The word to transform
        max_length (int): The length of the code returned (defaults to 4)
        zero_pad (bool): pad the end of the return value with 0s to achieve
            a max_length string

    Returns:
        str: The Phonix value

    Examples:
        >>> phonix('Christopher')
        'K683'
        >>> phonix('Niall')
        'N400'
        >>> phonix('Smith')
        'S530'
        >>> phonix('Schmidt')
        'S530'

    """
    encoder = Phonix()
    return encoder.encode(word, max_length=max_length, zero_pad=zero_pad)
359
360
361
if __name__ == '__main__':
    # Run as a script: check the docstring examples in this module.
    from doctest import testmod

    testmod()
365