abydos.phonetic._sfinx_bis - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.phonetic._sfinx_bis A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	401
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
eloc	206
dl	0
loc	401
ccs	96
cts	96
cp	1
rs	9.76
c	0
b	0
f	0
wmc	33

1 Function

Rating	Name	Duplication	Size	Complexity
A	sfinxbis()	0	35	1

1 Method

Rating	Name	Duplication	Size	Complexity
F	SfinxBis.encode()	0	204	32

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._sfinx_bis.

SfinxBis
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import _Phonetic

# Names exported by ``from abydos.phonetic._sfinx_bis import *``.
__all__ = ['SfinxBis', 'sfinxbis']


class SfinxBis(_Phonetic):
    """SfinxBis code.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.
    """

    # Name particles removed in step 2 of encode(). Every entry is
    # space-delimited so that only whole tokens match; multi-word particles
    # are listed before the single-word ones.
    _adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # Hard and soft vowels, kept as sets for membership tests.
    _harde_vokaler = {'A', 'O', 'U', 'Å'}
    _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
    # All vowels in a FIXED order (hard first, then soft). The replacements
    # in _foersvensker() are order-sensitive (e.g. 'EYI' becomes 'EJI' or
    # 'EJJ' depending on whether 'E' or 'Y' is handled first), so they must
    # not iterate over a set: string hashing is randomised per process on
    # Python 3, which would make the resulting code vary between runs.
    _vokaler = ('A', 'O', 'U', 'Å', 'E', 'I', 'Y', 'Ä', 'Ö')
    # Upper-case consonants.
    _uc_c_set = set('BCDFGHJKLMNPQRSTVWXZ')
    # Every character that survives step 5 (A-Z plus Ä, Å and Ö).
    _uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZÄÅÖ')

    # Letter -> digit table used in step 9. All vowels map to '9', which is
    # stripped again in step 11.
    _trans = {
        ord(bokstav): siffra
        for bokstav, siffra in zip(
            'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ', '123729224551268378999999999'
        )
    }

    # Single-character substitutions applied during Swedish-ization: W -> V,
    # Z -> S, and accented letters to the letter used by the later steps.
    _substitutions = {
        ord(bokstav): ersaettning
        for bokstav, ersaettning in zip(
            'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ', 'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'
        )
    }

    def _foersvensker(self, lokal_ordet):
        """Return the Swedish-ized form of the word.

        Parameters
        ----------
        lokal_ordet : str
            Word to transform (upper case)

        Returns
        -------
        str
            Transformed word

        """
        lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
        lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
        lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
        lokal_ordet = lokal_ordet.replace('SCH', 'SH')
        lokal_ordet = lokal_ordet.replace('QU', 'KV')
        lokal_ordet = lokal_ordet.replace('IO', 'JO')
        lokal_ordet = lokal_ordet.replace('PH', 'F')

        # A vowel followed by Ü, Y or I: the second letter becomes J.
        # Iterating the ordered tuple keeps the outcome deterministic.
        for vokal in self._vokaler:
            for halvvokal in ('Ü', 'Y', 'I'):
                lokal_ordet = lokal_ordet.replace(
                    vokal + halvvokal, vokal + 'J'
                )

        # Drop every H that directly precedes a consonant.
        if 'H' in lokal_ordet:
            for konsonant in self._uc_c_set:
                lokal_ordet = lokal_ordet.replace('H' + konsonant, konsonant)

        lokal_ordet = lokal_ordet.translate(self._substitutions)

        lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
        lokal_ordet = lokal_ordet.replace('Þ', 'TH')
        lokal_ordet = lokal_ordet.replace('ß', 'SS')

        return lokal_ordet

    def _koda_foersta_ljudet(self, lokal_ordet):
        """Return the word with the first sound coded.

        The first matching rule wins, so the order of the tests below is
        significant. Slices (rather than indexing) are used throughout so
        that empty and very short words fall through unchanged.

        Parameters
        ----------
        lokal_ordet : str
            Word to transform (upper case)

        Returns
        -------
        str
            Transformed word

        """
        ett = lokal_ordet[0:1]
        andra = lokal_ordet[1:2]
        tva = lokal_ordet[0:2]

        if ett in self._vokaler:
            # Any initial vowel is coded as '$'.
            return '$' + lokal_ordet[1:]
        if tva in ('DJ', 'GJ', 'HJ', 'LJ'):
            return 'J' + lokal_ordet[2:]
        if ett == 'G' and andra in self._mjuka_vokaler:
            return 'J' + lokal_ordet[1:]
        if ett == 'Q':
            return 'K' + lokal_ordet[1:]
        if tva == 'CH' and lokal_ordet[2:3] in self._vokaler:
            return '#' + lokal_ordet[2:]
        if ett == 'C' and (
            andra in self._harde_vokaler or andra in self._uc_c_set
        ):
            return 'K' + lokal_ordet[1:]
        if ett == 'X':
            return 'S' + lokal_ordet[1:]
        if ett == 'C' and andra in self._mjuka_vokaler:
            return 'S' + lokal_ordet[1:]
        if lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            return '#' + lokal_ordet[3:]
        if tva in ('SH', 'KJ', 'TJ', 'SJ'):
            return '#' + lokal_ordet[2:]
        if tva == 'SK' and lokal_ordet[2:3] in self._mjuka_vokaler:
            return '#' + lokal_ordet[2:]
        if ett == 'K' and andra in self._mjuka_vokaler:
            return '#' + lokal_ordet[1:]
        return lokal_ordet

    def encode(self, word, max_length=-1):
        """Return the SfinxBis code for a word.

        The input is split on whitespace and hyphens (after name particles
        such as "von" or "de la" have been removed) and every remaining
        token is encoded separately.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to unlimited); values
            of 0 or less disable truncation

        Returns
        -------
        tuple
            The SfinxBis value: one code per token, or ``('',)`` if the
            input contains no tokens

        Examples
        --------
        >>> pe = SfinxBis()
        >>> pe.encode('Christopher')
        ('K68376',)
        >>> pe.encode('Niall')
        ('N4',)
        >>> pe.encode('Smith')
        ('S53',)
        >>> pe.encode('Schmidt')
        ('S53',)

        >>> pe.encode('Johansson')
        ('J585',)
        >>> pe.encode('Sjöberg')
        ('#162',)

        """
        # Step 1: upper-case and normalise to composed (NFC) form
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = word.replace('-', ' ')

        # Step 2: remove name particles, both between tokens and (without
        # the leading space) at the very start of the name
        for adelstitel in self._adelstitler:
            while adelstitel in word:
                word = word.replace(adelstitel, ' ')
            if word.startswith(adelstitel[1:]):
                word = word[len(adelstitel) - 1 :]

        # Split word into tokens
        ordlista = word.split()

        # Step 3: remove doubled characters
        # NOTE(review): _delete_consecutive_repeats is inherited from
        # _Phonetic; presumably it collapses every run of repeated
        # characters, not only a leading one -- confirm against the base.
        ordlista = [
            self._delete_consecutive_repeats(ordet) for ordet in ordlista
        ]
        if not ordlista:
            # noinspection PyRedundantParentheses
            return ('',)

        # Step 4: Swedish-ization
        ordlista = [self._foersvensker(ordet) for ordet in ordlista]

        # Step 5: remove all characters that are not A-Ö (65-90,196,197,214)
        ordlista = [
            ''.join(c for c in ordet if c in self._uc_set)
            for ordet in ordlista
        ]

        # Step 6: code the first sound
        ordlista = [self._koda_foersta_ljudet(ordet) for ordet in ordlista]

        # Step 7: split each name into its first character and the rest
        rest = [ordet[1:] for ordet in ordlista]

        # Step 8: phonetic transformation of the rest
        rest = [ordet.replace('DT', 'T') for ordet in rest]
        rest = [ordet.replace('X', 'KS') for ordet in rest]

        # Step 9: code the rest as digits; C before a soft vowel is coded
        # '8' up front because the table would otherwise give it '2'
        for vokal in self._mjuka_vokaler:
            rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
        rest = [ordet.translate(self._trans) for ordet in rest]

        # Step 10: remove adjacent duplicates
        rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]

        # Step 11: remove every "9" (the vowel placeholder)
        rest = [ordet.replace('9', '') for ordet in rest]

        # Step 12: put the two parts back together
        ordlista = [ordet[0:1] + kod for ordet, kod in zip(ordlista, rest)]

        # truncate, if max_length is set
        if max_length > 0:
            ordlista = [ordet[:max_length] for ordet in ordlista]

        return tuple(ordlista)


def sfinxbis(word, max_length=-1):
    """Encode a word with SfinxBis.

    Convenience function that builds a fresh :py:class:`SfinxBis` instance
    and delegates to :py:meth:`SfinxBis.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to unlimited)

    Returns
    -------
    tuple
        The SfinxBis value

    Examples
    --------
    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)

    """
    encoder = SfinxBis()
    return encoder.encode(word, max_length)


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in the docstrings.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._sfinx_bis.
20
21		SfinxBis
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34
35	1	from ._phonetic import _Phonetic
36
37	1	__all__ = ['SfinxBis', 'sfinxbis']
38
39
40	1	class SfinxBis(_Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""SfinxBis code.
42
43		SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.
44
45		This implementation follows the reference implementation:
46		:cite:`Sjoo:2009`.
47
48		SfinxBis is intended chiefly for Swedish names.
49		"""
50
51	1	_adelstitler = (
52		' DE LA ',
53		' DE LAS ',
54		' DE LOS ',
55		' VAN DE ',
56		' VAN DEN ',
57		' VAN DER ',
58		' VON DEM ',
59		' VON DER ',
60		' AF ',
61		' AV ',
62		' DA ',
63		' DE ',
64		' DEL ',
65		' DEN ',
66		' DES ',
67		' DI ',
68		' DO ',
69		' DON ',
70		' DOS ',
71		' DU ',
72		' E ',
73		' IN ',
74		' LA ',
75		' LE ',
76		' MAC ',
77		' MC ',
78		' VAN ',
79		' VON ',
80		' Y ',
81		' S:T ',
82		)
83
84	1	_harde_vokaler = {'A', 'O', 'U', 'Å'}
85	1	_mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
86	1	_uc_c_set = {
87		'B',
88		'C',
89		'D',
90		'F',
91		'G',
92		'H',
93		'J',
94		'K',
95		'L',
96		'M',
97		'N',
98		'P',
99		'Q',
100		'R',
101		'S',
102		'T',
103		'V',
104		'W',
105		'X',
106		'Z',
107		}
108	1	_uc_set = {
109		'A',
110		'B',
111		'C',
112		'D',
113		'E',
114		'F',
115		'G',
116		'H',
117		'I',
118		'J',
119		'K',
120		'L',
121		'M',
122		'N',
123		'O',
124		'P',
125		'Q',
126		'R',
127		'S',
128		'T',
129		'U',
130		'V',
131		'W',
132		'X',
133		'Y',
134		'Z',
135		'Ä',
136		'Å',
137		'Ö',
138		}
139
140	1	_trans = dict(
141		zip(
142		(ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
143		'123729224551268378999999999',
144		)
145		)
146
147	1	_substitutions = dict(
148		zip(
149		(ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
150		'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
151		)
152		)
153
154	1	def encode(self, word, max_length=-1):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
155		"""Return the SfinxBis code for a word.
156
157		Parameters
158		----------
159		word : str
160		The word to transform
161		max_length : int
162		The length of the code returned (defaults to unlimited)
163
164		Returns
165		-------
166		tuple
167		The SfinxBis value
168
169		Examples
170		--------
171		>>> pe = SfinxBis()
172		>>> pe.encode('Christopher')
173		('K68376',)
174		>>> pe.encode('Niall')
175		('N4',)
176		>>> pe.encode('Smith')
177		('S53',)
178		>>> pe.encode('Schmidt')
179		('S53',)
180
181		>>> pe.encode('Johansson')
182		('J585',)
183		>>> pe.encode('Sjöberg')
184		('#162',)
185
186		"""
187
188	1	def _foersvensker(lokal_ordet):
189		"""Return the Swedish-ized form of the word.
190
191		Parameters
192		----------
193		lokal_ordet : str
194		Word to transform
195
196		Returns
197		-------
198		str
199		Transformed word
200
201		"""
202	1	lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
203	1	lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
204	1	lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
205	1	lokal_ordet = lokal_ordet.replace('SCH', 'SH')
206	1	lokal_ordet = lokal_ordet.replace('QU', 'KV')
207	1	lokal_ordet = lokal_ordet.replace('IO', 'JO')
208	1	lokal_ordet = lokal_ordet.replace('PH', 'F')
209
210	1	for i in self._harde_vokaler:
211	1	lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
212	1	lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
213	1	lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
214	1	for i in self._mjuka_vokaler:
215	1	lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
216	1	lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
217	1	lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
218
219	1	if 'H' in lokal_ordet:
220	1	for i in self._uc_c_set:
221	1	lokal_ordet = lokal_ordet.replace('H' + i, i)
222
223	1	lokal_ordet = lokal_ordet.translate(self._substitutions)
224
225	1	lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
226	1	lokal_ordet = lokal_ordet.replace('Þ', 'TH')
227	1	lokal_ordet = lokal_ordet.replace('ß', 'SS')
228
229	1	return lokal_ordet
230
231	1	def _koda_foersta_ljudet(lokal_ordet):
232		"""Return the word with the first sound coded.
233
234		Parameters
235		----------
236		lokal_ordet : str
237		Word to transform
238
239		Returns
240		-------
241		str
242		Transformed word
243
244		"""
245	1	if (
246		lokal_ordet[0:1] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
247		or lokal_ordet[0:1] in self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
248		):
249	1	lokal_ordet = '$' + lokal_ordet[1:]
250	1	elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
251	1	lokal_ordet = 'J' + lokal_ordet[2:]
252	1	elif (
253		lokal_ordet[0:1] == 'G'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
254		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
255		):
256	1	lokal_ordet = 'J' + lokal_ordet[1:]
257	1	elif lokal_ordet[0:1] == 'Q':
258	1	lokal_ordet = 'K' + lokal_ordet[1:]
259	1	elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in frozenset(
260		self._mjuka_vokaler | self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
261		):
262	1	lokal_ordet = '#' + lokal_ordet[2:]
263	1	elif (
264		lokal_ordet[0:1] == 'C'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
265		and lokal_ordet[1:2] in self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
266		):
267	1	lokal_ordet = 'K' + lokal_ordet[1:]
268	1	elif (
269		lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in self._uc_c_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
270		):
271	1	lokal_ordet = 'K' + lokal_ordet[1:]
272	1	elif lokal_ordet[0:1] == 'X':
273	1	lokal_ordet = 'S' + lokal_ordet[1:]
274	1	elif (
275		lokal_ordet[0:1] == 'C'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
276		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
277		):
278	1	lokal_ordet = 'S' + lokal_ordet[1:]
279	1	elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
280	1	lokal_ordet = '#' + lokal_ordet[3:]
281	1	elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
282	1	lokal_ordet = '#' + lokal_ordet[2:]
283	1	elif (
284		lokal_ordet[0:2] == 'SK'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
285		and lokal_ordet[2:3] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
286		):
287	1	lokal_ordet = '#' + lokal_ordet[2:]
288	1	elif (
289		lokal_ordet[0:1] == 'K'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
290		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
291		):
292	1	lokal_ordet = '#' + lokal_ordet[1:]
293	1	return lokal_ordet
294
295		# Steg 1, Versaler
296	1	word = unicode_normalize('NFC', text_type(word.upper()))
297	1	word = word.replace('ß', 'SS')
298	1	word = word.replace('-', ' ')
299
300		# Steg 2, Ta bort adelsprefix
301	1	for adelstitel in self._adelstitler:
302	1	while adelstitel in word:
303	1	word = word.replace(adelstitel, ' ')
304	1	if word.startswith(adelstitel[1:]):
305	1	word = word[len(adelstitel) - 1 :]
306
307		# Split word into tokens
308	1	ordlista = word.split()
309
310		# Steg 3, Ta bort dubbelteckning i början på namnet
311	1	ordlista = [
312		self._delete_consecutive_repeats(ordet) for ordet in ordlista
313		]
314	1	if not ordlista:
315		# noinspection PyRedundantParentheses
316	1	return ('',)
317
318		# Steg 4, Försvenskning
319	1	ordlista = [_foersvensker(ordet) for ordet in ordlista]
320
321		# Steg 5, Ta bort alla tecken som inte är A-Ö (65-90,196,197,214)
322	1	ordlista = [
323		''.join(c for c in ordet if c in self._uc_set)
324		for ordet in ordlista
325		]
326
327		# Steg 6, Koda första ljudet
328	1	ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]
329
330		# Steg 7, Dela upp namnet i två delar
331	1	rest = [ordet[1:] for ordet in ordlista]
332
333		# Steg 8, Utför fonetisk transformation i resten
334	1	rest = [ordet.replace('DT', 'T') for ordet in rest]
335	1	rest = [ordet.replace('X', 'KS') for ordet in rest]
336
337		# Steg 9, Koda resten till en sifferkod
338	1	for vokal in self._mjuka_vokaler:
339	1	rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
340	1	rest = [ordet.translate(self._trans) for ordet in rest]
341
342		# Steg 10, Ta bort intilliggande dubbletter
343	1	rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]
344
345		# Steg 11, Ta bort alla "9"
346	1	rest = [ordet.replace('9', '') for ordet in rest]
347
348		# Steg 12, Sätt ihop delarna igen
349	1	ordlista = [
350		''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest)
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
351		]
352
353		# truncate, if max_length is set
354	1	if max_length > 0:
355	1	ordlista = [ordet[:max_length] for ordet in ordlista]
356
357	1	return tuple(ordlista)
358
359
360	1	def sfinxbis(word, max_length=-1):
361		"""Return the SfinxBis code for a word.
362
363		This is a wrapper for :py:meth:`SfinxBis.encode`.
364
365		Parameters
366		----------
367		word : str
368		The word to transform
369		max_length : int
370		The length of the code returned (defaults to unlimited)
371
372		Returns
373		-------
374		tuple
375		The SfinxBis value
376
377		Examples
378		--------
379		>>> sfinxbis('Christopher')
380		('K68376',)
381		>>> sfinxbis('Niall')
382		('N4',)
383		>>> sfinxbis('Smith')
384		('S53',)
385		>>> sfinxbis('Schmidt')
386		('S53',)
387
388		>>> sfinxbis('Johansson')
389		('J585',)
390		>>> sfinxbis('Sjöberg')
391		('#162',)
392
393		"""
394	1	return SfinxBis().encode(word, max_length)
395
396
397		if __name__ == '__main__':
398		import doctest
399
400		doctest.testmod()
401

chrislit / abydos

Push — master ( f43547...71985b )

abydos.phonetic._sfinx_bis A

Complexity

Size/Duplication

Test Coverage

Importance

1 Function

1 Method

Duplication Side-by-Side

Filter issues like