Completed
Pull Request — master (#138)
by Chris
14:20
created

abydos.phonetic._sv.norphone()   A

Complexity

Conditions 1

Size

Total Lines 21
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 21
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 1
crap 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._sv.
20
21
The phonetic._sv module implements phonetic algorithms for Scandinavian names
22
& languages (currently Swedish & Norwegian), including:
23
24
    - SfinxBis
25
    - Norphone
26
"""
27
28 1
from __future__ import unicode_literals
29
30 1
from unicodedata import normalize as unicode_normalize
31
32 1
from six import text_type
33
34 1
from ._phonetic import Phonetic
35
36 1
__all__ = ['Norphone', 'SfinxBis', 'norphone', 'sfinxbis']
37
38
39 1
class SfinxBis(Phonetic):
    """SfinxBis code.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.
    """

    # Space-delimited name particles ("adelstitler") that are stripped from
    # the input in step 2 of encode(). Multi-word particles come first so that
    # they are removed before their single-word components.
    _adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # "Hard" and "soft" vowels; several first-sound rules depend on which
    # kind of vowel follows the initial consonant.
    _harde_vokaler = {'A', 'O', 'U', 'Å'}
    _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
    # Uppercase consonants.
    _uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
    # Every character that survives step 5 (A-Z plus Ä, Å, Ö).
    _uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZÄÅÖ')

    # Letter -> digit table for the tail of each token (step 9). All vowels
    # map to '9', which is removed again in step 11.
    _trans = dict(
        zip(
            (ord(tecken) for tecken in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
            '123729224551268378999999999',
        )
    )

    # Single-character substitutions applied during step 4 (mostly folding
    # accented letters onto the letters kept by step 5).
    _substitutions = dict(
        zip(
            (ord(tecken) for tecken in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
            'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
        )
    )

    def encode(self, word, max_length=-1):
        """Return the SfinxBis code for a word.

        The input is uppercased, stripped of name particles and split on
        whitespace (hyphens count as whitespace); each remaining token is
        encoded separately, so the result has one code per token.

        :param str word: the word to transform
        :param int max_length: the length of the code returned (defaults to
            unlimited); values <= 0 disable truncation
        :returns: the SfinxBis value, one code per token; ``('',)`` if no
            token remains
        :rtype: tuple

        >>> pe = SfinxBis()
        >>> pe.encode('Christopher')
        ('K68376',)
        >>> pe.encode('Niall')
        ('N4',)
        >>> pe.encode('Smith')
        ('S53',)
        >>> pe.encode('Schmidt')
        ('S53',)

        >>> pe.encode('Johansson')
        ('J585',)
        >>> pe.encode('Sjöberg')
        ('#162',)
        """
        # Built once per call instead of once per token/branch.
        vokaler = self._mjuka_vokaler | self._harde_vokaler

        # The rewrites in _foersvensker can overlap (e.g. 'EIYJ'), so their
        # result depends on the order in which the vowels are visited. Sets of
        # strings have no stable iteration order between interpreter runs
        # (string hashing is randomized), so walk them in a fixed order: hard
        # vowels first, then soft vowels, each sorted by code point.
        vokaler_ordnade = sorted(self._harde_vokaler) + sorted(
            self._mjuka_vokaler
        )
        konsonanter_ordnade = sorted(self._uc_c_set)

        def _foersvensker(lokal_ordet):
            """Return the Swedish-ized form of the word."""
            for gammal, ny in (
                ('STIERN', 'STJÄRN'),
                ('HIE', 'HJ'),
                ('SIÖ', 'SJÖ'),
                ('SCH', 'SH'),
                ('QU', 'KV'),
                ('IO', 'JO'),
                ('PH', 'F'),
            ):
                lokal_ordet = lokal_ordet.replace(gammal, ny)

            # Ü, Y or I directly after a vowel becomes J.
            for vokal in vokaler_ordnade:
                for efter in ('Ü', 'Y', 'I'):
                    lokal_ordet = lokal_ordet.replace(
                        vokal + efter, vokal + 'J'
                    )

            # Drop an H that is immediately followed by a consonant.
            if 'H' in lokal_ordet:
                for konsonant in konsonanter_ordnade:
                    lokal_ordet = lokal_ordet.replace(
                        'H' + konsonant, konsonant
                    )

            lokal_ordet = lokal_ordet.translate(self._substitutions)

            lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
            lokal_ordet = lokal_ordet.replace('Þ', 'TH')
            lokal_ordet = lokal_ordet.replace('ß', 'SS')

            return lokal_ordet

        def _koda_foersta_ljudet(lokal_ordet):
            """Return the word with the first sound coded.

            The first matching rule wins, so the order of the tests below
            matters. Slices are used instead of indexing so that empty and
            very short tokens simply fall through.
            """
            first = lokal_ordet[0:1]
            second = lokal_ordet[1:2]
            third = lokal_ordet[2:3]
            head2 = lokal_ordet[0:2]

            if first in vokaler:
                # Any initial vowel is coded as '$'.
                return '$' + lokal_ordet[1:]
            if head2 in ('DJ', 'GJ', 'HJ', 'LJ'):
                return 'J' + lokal_ordet[2:]
            if first == 'G' and second in self._mjuka_vokaler:
                return 'J' + lokal_ordet[1:]
            if first == 'Q':
                return 'K' + lokal_ordet[1:]
            if head2 == 'CH' and third in vokaler:
                return '#' + lokal_ordet[2:]
            if first == 'C' and (
                second in self._harde_vokaler or second in self._uc_c_set
            ):
                return 'K' + lokal_ordet[1:]
            if first == 'X':
                return 'S' + lokal_ordet[1:]
            if first == 'C' and second in self._mjuka_vokaler:
                return 'S' + lokal_ordet[1:]
            if lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
                return '#' + lokal_ordet[3:]
            if head2 in ('SH', 'KJ', 'TJ', 'SJ'):
                return '#' + lokal_ordet[2:]
            if head2 == 'SK' and third in self._mjuka_vokaler:
                return '#' + lokal_ordet[2:]
            if first == 'K' and second in self._mjuka_vokaler:
                return '#' + lokal_ordet[1:]
            return lokal_ordet

        # Step 1: uppercase
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = word.replace('-', ' ')

        # Step 2: remove name particles
        for adelstitel in self._adelstitler:
            # A single replace() pass can leave a new occurrence behind when
            # two particles share a delimiting space, hence the loop.
            while adelstitel in word:
                word = word.replace(adelstitel, ' ')
            # The same particle at the very start has no leading space.
            if word.startswith(adelstitel[1:]):
                word = word[len(adelstitel) - 1 :]

        # Split word into tokens
        ordlista = word.split()
        if not ordlista:
            # noinspection PyRedundantParentheses
            return ('',)

        koder = []
        for ordet in ordlista:
            # Step 3: remove doubled letters (helper inherited from Phonetic;
            # presumably collapses runs of a repeated character)
            ordet = self._delete_consecutive_repeats(ordet)

            # Step 4: Swedish-ize the spelling
            ordet = _foersvensker(ordet)

            # Step 5: remove every character that is not A-Ö
            ordet = ''.join(c for c in ordet if c in self._uc_set)

            # Step 6: code the first sound
            ordet = _koda_foersta_ljudet(ordet)

            # Step 7: split the name into first character and remainder
            rest = ordet[1:]

            # Step 8: phonetic transformation of the remainder
            rest = rest.replace('DT', 'T')
            rest = rest.replace('X', 'KS')

            # Step 9: code the remainder as digits (C before a soft vowel is
            # coded '8'; any other C falls through to the table)
            for vokal in self._mjuka_vokaler:
                rest = rest.replace('C' + vokal, '8' + vokal)
            rest = rest.translate(self._trans)

            # Step 10: remove adjacent duplicates
            rest = self._delete_consecutive_repeats(rest)

            # Step 11: remove every '9'
            rest = rest.replace('9', '')

            # Step 12: put the two parts back together
            kod = ordet[0:1] + rest

            # truncate, if max_length is set
            if max_length > 0:
                kod = kod[:max_length]

            koder.append(kod)

        return tuple(koder)
324
325
326 1
def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    This is a wrapper for :py:meth:`SfinxBis.encode`: it creates a fresh
    :py:class:`SfinxBis` instance and delegates to it.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    encoder = SfinxBis()
    return encoder.encode(word, max_length)
352
353
354 1
class Norphone(Phonetic):
    """Norphone.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.
    """

    # Uppercase vowels, Norwegian and Swedish.
    _uc_v_set = set('AEIOUYÅÆØÄÖ')

    # Substring -> code tables, keyed by substring length. encode() tries the
    # longest length first at every position.
    _replacements = {
        4: {'SKEI': 'X'},
        3: {'SKJ': 'X', 'KEI': 'X'},
        2: {
            'CH': 'K',
            'CK': 'K',
            'GJ': 'J',
            'GH': 'K',
            'HG': 'K',
            'HJ': 'J',
            'HL': 'L',
            'HR': 'R',
            'KJ': 'X',
            'KI': 'X',
            'LD': 'L',
            'ND': 'N',
            'PH': 'F',
            'TH': 'T',
            'SJ': 'X',
        },
        1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'},
    }

    def encode(self, word):
        """Return the Norphone code.

        :param str word: the word to transform
        :returns: the Norphone code
        :rtype: str

        >>> pe = Norphone()
        >>> pe.encode('Hansen')
        'HNSN'
        >>> pe.encode('Larsen')
        'LRSN'
        >>> pe.encode('Aagaard')
        'ÅKRT'
        >>> pe.encode('Braaten')
        'BRTN'
        >>> pe.encode('Sandvik')
        'SNVK'
        """
        word = word.upper()

        # Word-initial special cases: the first matching prefix supplies the
        # start of the code and is skipped by the main scan below.
        code = ''
        pos = 0
        for prefix, initial in (
            ('AA', 'Å'),
            ('GI', 'J'),
            ('SKY', 'X'),
            ('EI', 'Æ'),
            ('KY', 'X'),
            ('C', 'K'),
            ('Ä', 'Æ'),
            ('Ö', 'Ø'),
        ):
            if word[: len(prefix)] == prefix:
                code = initial
                pos = len(prefix)
                break

        # Word-final adjustments, made after the prefix check so the skip
        # count above still refers to the original prefix.
        if word[-2:] == 'DT':
            word = word[:-2] + 'T'
        # Though the rules indicate this rule applies in all positions, the
        # reference implementation indicates it applies only in final position.
        elif word[-1:] == 'D' and word[-2:-1] in self._uc_v_set:
            word = word[:-2]

        lengths = sorted(self._replacements, reverse=True)
        end = len(word)
        while pos < end:
            for length in lengths:
                chunk = word[pos : pos + length]
                table = self._replacements[length]
                if chunk in table:
                    code += table[chunk]
                    # Consume the whole matched substring.
                    pos += length
                    break
            else:
                # No table entry: keep the character unless it is a vowel
                # after the first position.
                char = word[pos]
                if not pos or char not in self._uc_v_set:
                    code += char
                pos += 1

        return self._delete_consecutive_repeats(code)
464
465
466 1
def norphone(word):
    """Return the Norphone code.

    This is a wrapper for :py:meth:`Norphone.encode`: it creates a fresh
    :py:class:`Norphone` instance and delegates to it.

    :param str word: the word to transform
    :returns: the Norphone code
    :rtype: str

    >>> norphone('Hansen')
    'HNSN'
    >>> norphone('Larsen')
    'LRSN'
    >>> norphone('Aagaard')
    'ÅKRT'
    >>> norphone('Braaten')
    'BRTN'
    >>> norphone('Sandvik')
    'SNVK'
    """
    encoder = Norphone()
    return encoder.encode(word)
487
488
489
if __name__ == '__main__':
    # When run as a script, execute the doctests embedded in the docstrings
    # of this module.
    import doctest

    doctest.testmod()
493