abydos.phonetic._sv.sfinxbis() - Code Metrics - Inspection of "78a222a9f7d8976f6744d263e3d6d01a2a991c27" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (78a222)

by Chris

created 2018-10-26 11:30 UTC

abydos.phonetic._sv.sfinxbis() F

↳ Parent: abydos.phonetic._sv

Complexity

Conditions

Size

Total Lines	259
Code Lines	177

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	87
CRAP Score	32

Importance

Changes

Metric	Value
eloc	177
dl	0
loc	259
ccs	87
cts	87
cp	1
rs	0
c	0
b	0
f	0
cc	32
nop	2
crap	32

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._sv.

The phonetic._sv module implements phonetic algorithms for Scandinavian names
& languages (currently Swedish & Norwegian), including:

    - SfinxBis
    - Norphone
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._util import _delete_consecutive_repeats

__all__ = ['norphone', 'sfinxbis']


def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    The input is upper-cased, hyphens are treated as spaces, noble-title
    prefixes are stripped, and each remaining whitespace-separated token is
    encoded separately. The vowel/consonant replacement loops run in a fixed
    order, so the result does not depend on set iteration order (and thus
    not on string-hash randomisation).

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); values of 0 or less leave the codes untruncated
    :returns: the SfinxBis value, one code per token; ``('',)`` if no token
        remains after prefix removal
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Noble-title prefixes; multi-word titles come first so that they are
    # removed before their single-word components.
    adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # Ordered sequences drive the chained-replacement loops below: 'I' and
    # 'Y' are both soft vowels *and* replacement targets, so the outcome of
    # those loops depends on iteration order. The frozensets are used for
    # membership tests only.
    _harde_vokaler_ordnade = ('A', 'O', 'U', 'Å')
    _mjuka_vokaler_ordnade = ('E', 'I', 'Y', 'Ä', 'Ö')
    _konsonanter_ordnade = 'BCDFGHJKLMNPQRSTVWXZ'

    _harde_vokaler = frozenset(_harde_vokaler_ordnade)
    _mjuka_vokaler = frozenset(_mjuka_vokaler_ordnade)
    _vokaler = _harde_vokaler | _mjuka_vokaler
    _konsonanter = frozenset(_konsonanter_ordnade)
    _alfabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZÄÅÖ')

    # Letter -> digit table applied to the tail of each token (step 9).
    _sfinxbis_translation = dict(
        zip(
            (ord(tecken) for tecken in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
            '123729224551268378999999999',
        )
    )

    # Single-character substitutions applied during Swedish-ization (step 4).
    _sfinxbis_substitutions = dict(
        zip(
            (ord(tecken) for tecken in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
            'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
        )
    )

    def _foersvensker(lokal_ordet):
        """Return the Swedish-ized form of the word."""
        for gammal, ny in (
            ('STIERN', 'STJÄRN'),
            ('HIE', 'HJ'),
            ('SIÖ', 'SJÖ'),
            ('SCH', 'SH'),
            ('QU', 'KV'),
            ('IO', 'JO'),
            ('PH', 'F'),
        ):
            lokal_ordet = lokal_ordet.replace(gammal, ny)

        # Ü, Y or I directly after a vowel becomes J; hard vowels are
        # handled before soft vowels, each group in a fixed order.
        for vokal in _harde_vokaler_ordnade + _mjuka_vokaler_ordnade:
            for efterled in ('Ü', 'Y', 'I'):
                lokal_ordet = lokal_ordet.replace(
                    vokal + efterled, vokal + 'J'
                )

        # Drop an H that directly precedes a consonant.
        if 'H' in lokal_ordet:
            for konsonant in _konsonanter_ordnade:
                lokal_ordet = lokal_ordet.replace('H' + konsonant, konsonant)

        lokal_ordet = lokal_ordet.translate(_sfinxbis_substitutions)

        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(lokal_ordet):
        """Return the word with the first sound coded."""
        # Slices (rather than indexing) keep this safe for short/empty words.
        ett = lokal_ordet[0:1]
        tva = lokal_ordet[0:2]
        tre = lokal_ordet[0:3]
        andra = lokal_ordet[1:2]
        tredje = lokal_ordet[2:3]

        # The order of these tests is significant: earlier rules win.
        if ett in _vokaler:
            return '$' + lokal_ordet[1:]
        if tva in ('DJ', 'GJ', 'HJ', 'LJ'):
            return 'J' + lokal_ordet[2:]
        if ett == 'G' and andra in _mjuka_vokaler:
            return 'J' + lokal_ordet[1:]
        if ett == 'Q':
            return 'K' + lokal_ordet[1:]
        if tva == 'CH' and tredje in _vokaler:
            return '#' + lokal_ordet[2:]
        if ett == 'C' and (andra in _harde_vokaler or andra in _konsonanter):
            return 'K' + lokal_ordet[1:]
        if ett == 'X':
            return 'S' + lokal_ordet[1:]
        if ett == 'C' and andra in _mjuka_vokaler:
            return 'S' + lokal_ordet[1:]
        if tre in ('SKJ', 'STJ', 'SCH'):
            return '#' + lokal_ordet[3:]
        if tva in ('SH', 'KJ', 'TJ', 'SJ'):
            return '#' + lokal_ordet[2:]
        if tva == 'SK' and tredje in _mjuka_vokaler:
            return '#' + lokal_ordet[2:]
        if ett == 'K' and andra in _mjuka_vokaler:
            return '#' + lokal_ordet[1:]
        return lokal_ordet

    # Step 1: uppercase (and treat hyphens as token separators)
    word = unicode_normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove noble-title prefixes, both between tokens and at the
    # very start of the string
    for adelstitel in adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel) - 1 :]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: collapse consecutive repeated characters in each token
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        # noinspection PyRedundantParentheses
        return ('',)

    # Step 4: Swedish-ization
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove every character that is not A-Ö (65-90,196,197,214)
    ordlista = [
        ''.join(c for c in ordet if c in _alfabet) for ordet in ordlista
    ]

    # Step 6: code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into its first character and the rest
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: phonetic transformation of the rest
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the rest as digits (C before a soft vowel codes as 8)
    for vokal in _mjuka_vokaler_ordnade:
        rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every "9"
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: join the first character and the coded rest again
    ordlista = [ordet[0:1] + svans for ordet, svans in zip(ordlista, rest)]

    # truncate, if max_length is set
    if max_length > 0:
        ordlista = [ordet[:max_length] for ordet in ordlista]

    return tuple(ordlista)


def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    The word is upper-cased, a word-initial rule (if any) supplies the first
    code symbol, and the remainder is scanned left to right preferring the
    longest matching replacement. Vowels are kept only at position 0, and
    consecutive repeats are collapsed in the final code.

    :param str word: the word to transform
    :returns: the Norphone code
    :rtype: str

    >>> norphone('Hansen')
    'HNSN'
    >>> norphone('Larsen')
    'LRSN'
    >>> norphone('Aagaard')
    'ÅKRT'
    >>> norphone('Braaten')
    'BRTN'
    >>> norphone('Sandvik')
    'SNVK'
    """
    vowel_set = frozenset(
        ('A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö')
    )

    # Word-initial rules: (prefix, code symbol); the first match wins and
    # the whole prefix is consumed.
    initial_rules = (
        ('AA', 'Å'),
        ('GI', 'J'),
        ('SKY', 'X'),
        ('EI', 'Æ'),
        ('KY', 'X'),
        ('C', 'K'),
        ('Ä', 'Æ'),
        ('Ö', 'Ø'),
    )

    # Replacement tables, longest patterns first.
    rules_by_length = (
        (4, {'SKEI': 'X'}),
        (3, {'SKJ': 'X', 'KEI': 'X'}),
        (
            2,
            {
                'CH': 'K',
                'CK': 'K',
                'GJ': 'J',
                'GH': 'K',
                'HG': 'K',
                'HJ': 'J',
                'HL': 'L',
                'HR': 'R',
                'KJ': 'X',
                'KI': 'X',
                'LD': 'L',
                'ND': 'N',
                'PH': 'F',
                'TH': 'T',
                'SJ': 'X',
            },
        ),
        (1, {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}),
    )

    word = word.upper()

    code = ''
    cursor = 0
    for prefix, symbol in initial_rules:
        if word.startswith(prefix):
            code = symbol
            cursor = len(prefix)
            break

    if word.endswith('DT'):
        word = word[:-2] + 'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-1:] == 'D' and word[-2:-1] in vowel_set:
        word = word[:-2]

    end = len(word)
    while cursor < end:
        for length, table in rules_by_length:
            chunk = word[cursor : cursor + length]
            if chunk in table:
                code += table[chunk]
                cursor += length
                break
        else:
            # No rule matched: keep consonants everywhere, vowels only when
            # they are the very first character of the word.
            char = word[cursor]
            if cursor == 0 or char not in vowel_set:
                code += char
            cursor += 1

    return _delete_consecutive_repeats(code)


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._sv.
20
21		The phonetic._sv module implements phonetic algorithms for Scandinavian names
22		& languages (currently Swedish & Norwegian), including:
23
24		- SfinxBis
25		- Norphone
26		"""
27
28	1	from __future__ import unicode_literals
29
30	1	from unicodedata import normalize as unicode_normalize
31
32	1	from six import text_type
33
34	1	from ._util import _delete_consecutive_repeats
35
36	1	__all__ = ['norphone', 'sfinxbis']
37
38
39	1	def sfinxbis(word, max_length=-1):
40		"""Return the SfinxBis code for a word.
41
42		SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.
43
44		This implementation follows the reference implementation:
45		:cite:`Sjoo:2009`.
46
47		SfinxBis is intended chiefly for Swedish names.
48
49		:param str word: the word to transform
50		:param int max_length: the length of the code returned (defaults to
51		unlimited)
52		:returns: the SfinxBis value
53		:rtype: tuple
54
55		>>> sfinxbis('Christopher')
56		('K68376',)
57		>>> sfinxbis('Niall')
58		('N4',)
59		>>> sfinxbis('Smith')
60		('S53',)
61		>>> sfinxbis('Schmidt')
62		('S53',)
63
64		>>> sfinxbis('Johansson')
65		('J585',)
66		>>> sfinxbis('Sjöberg')
67		('#162',)
68		"""
69	1	adelstitler = (
70		' DE LA ',
71		' DE LAS ',
72		' DE LOS ',
73		' VAN DE ',
74		' VAN DEN ',
75		' VAN DER ',
76		' VON DEM ',
77		' VON DER ',
78		' AF ',
79		' AV ',
80		' DA ',
81		' DE ',
82		' DEL ',
83		' DEN ',
84		' DES ',
85		' DI ',
86		' DO ',
87		' DON ',
88		' DOS ',
89		' DU ',
90		' E ',
91		' IN ',
92		' LA ',
93		' LE ',
94		' MAC ',
95		' MC ',
96		' VAN ',
97		' VON ',
98		' Y ',
99		' S:T ',
100		)
101
102	1	_harde_vokaler = {'A', 'O', 'U', 'Å'}
103	1	_mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
104	1	_konsonanter = {
105		'B',
106		'C',
107		'D',
108		'F',
109		'G',
110		'H',
111		'J',
112		'K',
113		'L',
114		'M',
115		'N',
116		'P',
117		'Q',
118		'R',
119		'S',
120		'T',
121		'V',
122		'W',
123		'X',
124		'Z',
125		}
126	1	_alfabet = {
127		'A',
128		'B',
129		'C',
130		'D',
131		'E',
132		'F',
133		'G',
134		'H',
135		'I',
136		'J',
137		'K',
138		'L',
139		'M',
140		'N',
141		'O',
142		'P',
143		'Q',
144		'R',
145		'S',
146		'T',
147		'U',
148		'V',
149		'W',
150		'X',
151		'Y',
152		'Z',
153		'Ä',
154		'Å',
155		'Ö',
156		}
157
158	1	_sfinxbis_translation = dict(
159		zip(
160		(ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
161		'123729224551268378999999999',
162		)
163		)
164
165	1	_sfinxbis_substitutions = dict(
166		zip(
167		(ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
168		'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
169		)
170		)
171
172	1	def _foersvensker(lokal_ordet):
173		"""Return the Swedish-ized form of the word."""
174	1	lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
175	1	lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
176	1	lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
177	1	lokal_ordet = lokal_ordet.replace('SCH', 'SH')
178	1	lokal_ordet = lokal_ordet.replace('QU', 'KV')
179	1	lokal_ordet = lokal_ordet.replace('IO', 'JO')
180	1	lokal_ordet = lokal_ordet.replace('PH', 'F')
181
182	1	for i in _harde_vokaler:
183	1	lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
184	1	lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
185	1	lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
186	1	for i in _mjuka_vokaler:
187	1	lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
188	1	lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
189	1	lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
190
191	1	if 'H' in lokal_ordet:
192	1	for i in _konsonanter:
193	1	lokal_ordet = lokal_ordet.replace('H' + i, i)
194
195	1	lokal_ordet = lokal_ordet.translate(_sfinxbis_substitutions)
196
197	1	lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
198	1	lokal_ordet = lokal_ordet.replace('Þ', 'TH')
199	1	lokal_ordet = lokal_ordet.replace('ß', 'SS')
200
201	1	return lokal_ordet
202
203	1	def _koda_foersta_ljudet(lokal_ordet):
204		"""Return the word with the first sound coded."""
205	1	if (
206		lokal_ordet[0:1] in _mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
207		or lokal_ordet[0:1] in _harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
208		):
209	1	lokal_ordet = '$' + lokal_ordet[1:]
210	1	elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
211	1	lokal_ordet = 'J' + lokal_ordet[2:]
212	1	elif lokal_ordet[0:1] == 'G' and lokal_ordet[1:2] in _mjuka_vokaler:
213	1	lokal_ordet = 'J' + lokal_ordet[1:]
214	1	elif lokal_ordet[0:1] == 'Q':
215	1	lokal_ordet = 'K' + lokal_ordet[1:]
216	1	elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in frozenset(
217		_mjuka_vokaler | _harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
218		):
219	1	lokal_ordet = '#' + lokal_ordet[2:]
220	1	elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _harde_vokaler:
221	1	lokal_ordet = 'K' + lokal_ordet[1:]
222	1	elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _konsonanter:
223	1	lokal_ordet = 'K' + lokal_ordet[1:]
224	1	elif lokal_ordet[0:1] == 'X':
225	1	lokal_ordet = 'S' + lokal_ordet[1:]
226	1	elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in _mjuka_vokaler:
227	1	lokal_ordet = 'S' + lokal_ordet[1:]
228	1	elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
229	1	lokal_ordet = '#' + lokal_ordet[3:]
230	1	elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
231	1	lokal_ordet = '#' + lokal_ordet[2:]
232	1	elif lokal_ordet[0:2] == 'SK' and lokal_ordet[2:3] in _mjuka_vokaler:
233	1	lokal_ordet = '#' + lokal_ordet[2:]
234	1	elif lokal_ordet[0:1] == 'K' and lokal_ordet[1:2] in _mjuka_vokaler:
235	1	lokal_ordet = '#' + lokal_ordet[1:]
236	1	return lokal_ordet
237
238		# Steg 1, Versaler
239	1	word = unicode_normalize('NFC', text_type(word.upper()))
240	1	word = word.replace('ß', 'SS')
241	1	word = word.replace('-', ' ')
242
243		# Steg 2, Ta bort adelsprefix
244	1	for adelstitel in adelstitler:
245	1	while adelstitel in word:
246	1	word = word.replace(adelstitel, ' ')
247	1	if word.startswith(adelstitel[1:]):
248	1	word = word[len(adelstitel) - 1 :]
249
250		# Split word into tokens
251	1	ordlista = word.split()
252
253		# Steg 3, Ta bort dubbelteckning i början på namnet
254	1	ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
255	1	if not ordlista:
256		# noinspection PyRedundantParentheses
257	1	return ('',)
258
259		# Steg 4, Försvenskning
260	1	ordlista = [_foersvensker(ordet) for ordet in ordlista]
261
262		# Steg 5, Ta bort alla tecken som inte är A-Ö (65-90,196,197,214)
263	1	ordlista = [
264		''.join(c for c in ordet if c in _alfabet) for ordet in ordlista
265		]
266
267		# Steg 6, Koda första ljudet
268	1	ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]
269
270		# Steg 7, Dela upp namnet i två delar
271	1	rest = [ordet[1:] for ordet in ordlista]
272
273		# Steg 8, Utför fonetisk transformation i resten
274	1	rest = [ordet.replace('DT', 'T') for ordet in rest]
275	1	rest = [ordet.replace('X', 'KS') for ordet in rest]
276
277		# Steg 9, Koda resten till en sifferkod
278	1	for vokal in _mjuka_vokaler:
279	1	rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
280	1	rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]
281
282		# Steg 10, Ta bort intilliggande dubbletter
283	1	rest = [_delete_consecutive_repeats(ordet) for ordet in rest]
284
285		# Steg 11, Ta bort alla "9"
286	1	rest = [ordet.replace('9', '') for ordet in rest]
287
288		# Steg 12, Sätt ihop delarna igen
289	1	ordlista = [
290		''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest)
291		]
292
293		# truncate, if max_length is set
294	1	if max_length > 0:
295	1	ordlista = [ordet[:max_length] for ordet in ordlista]
296
297	1	return tuple(ordlista)
298
299
300	1	def norphone(word):
301		"""Return the Norphone code.
302
303		The reference implementation by Lars Marius Garshol is available in
304		:cite:`Garshol:2015`.
305
306		Norphone was designed for Norwegian, but this implementation has been
307		extended to support Swedish vowels as well. This function incorporates
308		the "not implemented" rules from the above file's rule set.
309
310		:param str word: the word to transform
311		:returns: the Norphone code
312		:rtype: str
313
314		>>> norphone('Hansen')
315		'HNSN'
316		>>> norphone('Larsen')
317		'LRSN'
318		>>> norphone('Aagaard')
319		'ÅKRT'
320		>>> norphone('Braaten')
321		'BRTN'
322		>>> norphone('Sandvik')
323		'SNVK'
324		"""
325	1	_vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}
326
327	1	replacements = {
328		4: {'SKEI': 'X'},
329		3: {'SKJ': 'X', 'KEI': 'X'},
330		2: {
331		'CH': 'K',
332		'CK': 'K',
333		'GJ': 'J',
334		'GH': 'K',
335		'HG': 'K',
336		'HJ': 'J',
337		'HL': 'L',
338		'HR': 'R',
339		'KJ': 'X',
340		'KI': 'X',
341		'LD': 'L',
342		'ND': 'N',
343		'PH': 'F',
344		'TH': 'T',
345		'SJ': 'X',
346		},
347		1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'},
348		}
349
350	1	word = word.upper()
351
352	1	code = ''
353	1	skip = 0
354
355	1	if word[0:2] == 'AA':
356	1	code = 'Å'
357	1	skip = 2
358	1	elif word[0:2] == 'GI':
359	1	code = 'J'
360	1	skip = 2
361	1	elif word[0:3] == 'SKY':
362	1	code = 'X'
363	1	skip = 3
364	1	elif word[0:2] == 'EI':
365	1	code = 'Æ'
366	1	skip = 2
367	1	elif word[0:2] == 'KY':
368	1	code = 'X'
369	1	skip = 2
370	1	elif word[:1] == 'C':
371	1	code = 'K'
372	1	skip = 1
373	1	elif word[:1] == 'Ä':
374	1	code = 'Æ'
375	1	skip = 1
376	1	elif word[:1] == 'Ö':
377	1	code = 'Ø'
378	1	skip = 1
379
380	1	if word[-2:] == 'DT':
381	1	word = word[:-2] + 'T'
382		# Though the rules indicate this rule applies in all positions, the
383		# reference implementation indicates it applies only in final position.
384	1	elif word[-2:-1] in _vowels and word[-1:] == 'D':
385	1	word = word[:-2]
386
387	1	for pos, char in enumerate(word):
388	1	if skip:
389	1	skip -= 1
390		else:
391	1	for length in sorted(replacements, reverse=True):
392	1	if word[pos : pos + length] in replacements[length]:
393	1	code += replacements[length][word[pos : pos + length]]
394	1	skip = length - 1
395	1	break
396		else:
397	1	if not pos or char not in _vowels:
398	1	code += char
399
400	1	code = _delete_consecutive_repeats(code)
401
402	1	return code
403
404
405		if __name__ == '__main__':
406		import doctest
407
408		doctest.testmod()
409

chrislit / abydos

Branch — master (78a222)

abydos.phonetic._sv.sfinxbis() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like