abydos.phonetic._SfinxBis.sfinxbis() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-10 01:31 UTC

abydos.phonetic._SfinxBis.sfinxbis() A

↳ Parent: abydos.phonetic._SfinxBis

Complexity

Conditions

Size

Total Lines	30
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	2
dl	0
loc	30
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-


# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._SfinxBis.

SfinxBis
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._Phonetic import Phonetic

__all__ = ['SfinxBis', 'sfinxbis']


class SfinxBis(Phonetic):
    """SfinxBis code.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.
    """

    # Name particles ("adelstitler") that are stripped before encoding. Each
    # entry is padded with spaces so that only whole tokens match.
    _adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # Hard ("harde") and soft ("mjuka") vowels.
    _harde_vokaler = {'A', 'O', 'U', 'Å'}
    _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
    # Every vowel, built once instead of on each call to encode().
    _vokaler = frozenset(_harde_vokaler | _mjuka_vokaler)
    # Vowels in a fixed order (hard ones first). The vowel rewrites in
    # encode() overlap ('Y' and 'I' are both vowels and rewrite targets), so
    # looping over the bare sets would make the result depend on the set
    # iteration order, which is not stable between interpreter runs.
    _vokal_ordning = tuple(sorted(_harde_vokaler)) + tuple(
        sorted(_mjuka_vokaler)
    )
    # Upper-case consonants.
    _uc_c_set = {
        'B',
        'C',
        'D',
        'F',
        'G',
        'H',
        'J',
        'K',
        'L',
        'M',
        'N',
        'P',
        'Q',
        'R',
        'S',
        'T',
        'V',
        'W',
        'X',
        'Z',
    }
    # Every character that survives the filtering step of encode().
    _uc_set = {
        'A',
        'B',
        'C',
        'D',
        'E',
        'F',
        'G',
        'H',
        'I',
        'J',
        'K',
        'L',
        'M',
        'N',
        'O',
        'P',
        'Q',
        'R',
        'S',
        'T',
        'U',
        'V',
        'W',
        'X',
        'Y',
        'Z',
        'Ä',
        'Å',
        'Ö',
    }

    # Letter -> digit table for everything after the first sound. All vowels
    # map to '9', which encode() strips at the end.
    _trans = dict(
        zip(
            (ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
            '123729224551268378999999999',
        )
    )

    # Folds W, Z and accented capitals onto the letters used by the rules.
    _substitutions = dict(
        zip(
            (ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
            'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
        )
    )

    def encode(self, word, max_length=-1):
        """Return the SfinxBis code for a word.

        The input is upper-cased, stripped of name particles and split on
        whitespace; each remaining token is encoded separately, so the
        result holds one code per token.

        Args:
            word (str): The word to transform
            max_length (int): The length of the code returned (defaults to
                unlimited)

        Returns:
            tuple: The SfinxBis value

        Examples:
            >>> pe = SfinxBis()
            >>> pe.encode('Christopher')
            ('K68376',)
            >>> pe.encode('Niall')
            ('N4',)
            >>> pe.encode('Smith')
            ('S53',)
            >>> pe.encode('Schmidt')
            ('S53',)

            >>> pe.encode('Johansson')
            ('J585',)
            >>> pe.encode('Sjöberg')
            ('#162',)

        """

        def _foersvensker(lokal_ordet):
            """Return the Swedish-ized form of the word.

            Args:
                lokal_ordet (str): Word to transform

            Returns:
                str: Transformed word

            """
            lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
            lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
            lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
            lokal_ordet = lokal_ordet.replace('SCH', 'SH')
            lokal_ordet = lokal_ordet.replace('QU', 'KV')
            lokal_ordet = lokal_ordet.replace('IO', 'JO')
            lokal_ordet = lokal_ordet.replace('PH', 'F')

            # Ü, Y or I directly after a vowel becomes J. The vowels are
            # visited in a fixed order because these rewrites can feed into
            # each other (e.g. 'EYI'), so the order affects the outcome.
            for vokal in self._vokal_ordning:
                for efterljud in ('Ü', 'Y', 'I'):
                    lokal_ordet = lokal_ordet.replace(
                        vokal + efterljud, vokal + 'J'
                    )

            # Drop an H that stands directly before a consonant. Sorted so
            # the order of the replacements is the same on every run.
            if 'H' in lokal_ordet:
                for konsonant in sorted(self._uc_c_set):
                    lokal_ordet = lokal_ordet.replace(
                        'H' + konsonant, konsonant
                    )

            lokal_ordet = lokal_ordet.translate(self._substitutions)

            lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
            lokal_ordet = lokal_ordet.replace('Þ', 'TH')
            lokal_ordet = lokal_ordet.replace('ß', 'SS')

            return lokal_ordet

        def _koda_foersta_ljudet(lokal_ordet):
            """Return the word with the first sound coded.

            The leading letter group is replaced by a single marker: '$'
            for a vowel, '#' for the groups listed in the '#' branches
            below, or a plain consonant letter. Branch order matters: the
            first matching rule wins.

            Args:
                lokal_ordet (str): Word to transform

            Returns:
                str: Transformed word

            """
            # Slices (not indexing) are used throughout so that empty or
            # very short words simply fail to match instead of raising.
            if lokal_ordet[0:1] in self._vokaler:
                lokal_ordet = '$' + lokal_ordet[1:]
            elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
                lokal_ordet = 'J' + lokal_ordet[2:]
            elif (
                lokal_ordet[0:1] == 'G'
                and lokal_ordet[1:2] in self._mjuka_vokaler
            ):
                lokal_ordet = 'J' + lokal_ordet[1:]
            elif lokal_ordet[0:1] == 'Q':
                lokal_ordet = 'K' + lokal_ordet[1:]
            elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in self._vokaler:
                lokal_ordet = '#' + lokal_ordet[2:]
            elif (
                lokal_ordet[0:1] == 'C'
                and lokal_ordet[1:2] in self._harde_vokaler
            ):
                lokal_ordet = 'K' + lokal_ordet[1:]
            elif lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in self._uc_c_set:
                lokal_ordet = 'K' + lokal_ordet[1:]
            elif lokal_ordet[0:1] == 'X':
                lokal_ordet = 'S' + lokal_ordet[1:]
            elif (
                lokal_ordet[0:1] == 'C'
                and lokal_ordet[1:2] in self._mjuka_vokaler
            ):
                lokal_ordet = 'S' + lokal_ordet[1:]
            elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
                lokal_ordet = '#' + lokal_ordet[3:]
            elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
                lokal_ordet = '#' + lokal_ordet[2:]
            elif (
                lokal_ordet[0:2] == 'SK'
                and lokal_ordet[2:3] in self._mjuka_vokaler
            ):
                lokal_ordet = '#' + lokal_ordet[2:]
            elif (
                lokal_ordet[0:1] == 'K'
                and lokal_ordet[1:2] in self._mjuka_vokaler
            ):
                lokal_ordet = '#' + lokal_ordet[1:]
            return lokal_ordet

        # Step 1: upper-case and normalize; hyphens separate tokens
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = word.replace('-', ' ')

        # Step 2: remove name particles, both between tokens and at the
        # very start of the string (where there is no leading space)
        for adelstitel in self._adelstitler:
            while adelstitel in word:
                word = word.replace(adelstitel, ' ')
            if word.startswith(adelstitel[1:]):
                word = word[len(adelstitel) - 1 :]

        # Split word into tokens
        ordlista = word.split()

        # Step 3: remove doubled characters
        # NOTE(review): _delete_consecutive_repeats is not defined in this
        # class; presumably inherited from Phonetic -- confirm.
        ordlista = [
            self._delete_consecutive_repeats(ordet) for ordet in ordlista
        ]
        if not ordlista:
            # noinspection PyRedundantParentheses
            return ('',)

        # Step 4: Swedish-ize the spelling
        ordlista = [_foersvensker(ordet) for ordet in ordlista]

        # Step 5: remove every character that is not A-Ö
        # (code points 65-90, 196, 197, 214)
        ordlista = [
            ''.join(c for c in ordet if c in self._uc_set)
            for ordet in ordlista
        ]

        # Step 6: code the first sound
        ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

        # Step 7: split each name into first sound and remainder
        rest = [ordet[1:] for ordet in ordlista]

        # Step 8: phonetic transformation of the remainder
        rest = [ordet.replace('DT', 'T') for ordet in rest]
        rest = [ordet.replace('X', 'KS') for ordet in rest]

        # Step 9: code the remainder as digits; C before a soft vowel is
        # coded '8' first, every other C is handled by the table
        for vokal in self._mjuka_vokaler:
            rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
        rest = [ordet.translate(self._trans) for ordet in rest]

        # Step 10: remove adjacent duplicates
        rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]

        # Step 11: remove every "9" (the vowel placeholder)
        rest = [ordet.replace('9', '') for ordet in rest]

        # Step 12: put the two parts back together
        ordlista = [
            ''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest)
        ]

        # truncate, if max_length is set
        if max_length > 0:
            ordlista = [ordet[:max_length] for ordet in ordlista]

        return tuple(ordlista)


def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    Convenience function: builds a fresh :py:class:`SfinxBis` encoder and
    delegates to :py:meth:`SfinxBis.encode`.

    Args:
        word (str): The word to transform
        max_length (int): The length of the code returned (defaults to
            unlimited)

    Returns:
        tuple: The SfinxBis value

    Examples:
        >>> sfinxbis('Christopher')
        ('K68376',)
        >>> sfinxbis('Niall')
        ('N4',)
        >>> sfinxbis('Smith')
        ('S53',)
        >>> sfinxbis('Schmidt')
        ('S53',)

        >>> sfinxbis('Johansson')
        ('J585',)
        >>> sfinxbis('Sjöberg')
        ('#162',)

    """
    encoder = SfinxBis()
    codes = encoder.encode(word, max_length)
    return codes


if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()


1		# -*- coding: utf-8 -*-
		0 ignored issues – show Coding Style Naming introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The name `_SfinxBis` does not conform to the module naming conventions (`(([a-z_][a-z0-9_]*)\|([A-Z][a-zA-Z0-9]+))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._SfinxBis.
20
21		SfinxBis
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34
35	1	from ._Phonetic import Phonetic
36
37	1	__all__ = ['SfinxBis', 'sfinxbis']
38
39
40	1	class SfinxBis(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""SfinxBis code.
42
43		SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.
44
45		This implementation follows the reference implementation:
46		:cite:`Sjoo:2009`.
47
48		SfinxBis is intended chiefly for Swedish names.
49		"""
50
51	1	_adelstitler = (
52		' DE LA ',
53		' DE LAS ',
54		' DE LOS ',
55		' VAN DE ',
56		' VAN DEN ',
57		' VAN DER ',
58		' VON DEM ',
59		' VON DER ',
60		' AF ',
61		' AV ',
62		' DA ',
63		' DE ',
64		' DEL ',
65		' DEN ',
66		' DES ',
67		' DI ',
68		' DO ',
69		' DON ',
70		' DOS ',
71		' DU ',
72		' E ',
73		' IN ',
74		' LA ',
75		' LE ',
76		' MAC ',
77		' MC ',
78		' VAN ',
79		' VON ',
80		' Y ',
81		' S:T ',
82		)
83
84	1	_harde_vokaler = {'A', 'O', 'U', 'Å'}
85	1	_mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
86	1	_uc_c_set = {
87		'B',
88		'C',
89		'D',
90		'F',
91		'G',
92		'H',
93		'J',
94		'K',
95		'L',
96		'M',
97		'N',
98		'P',
99		'Q',
100		'R',
101		'S',
102		'T',
103		'V',
104		'W',
105		'X',
106		'Z',
107		}
108	1	_uc_set = {
109		'A',
110		'B',
111		'C',
112		'D',
113		'E',
114		'F',
115		'G',
116		'H',
117		'I',
118		'J',
119		'K',
120		'L',
121		'M',
122		'N',
123		'O',
124		'P',
125		'Q',
126		'R',
127		'S',
128		'T',
129		'U',
130		'V',
131		'W',
132		'X',
133		'Y',
134		'Z',
135		'Ä',
136		'Å',
137		'Ö',
138		}
139
140	1	_trans = dict(
141		zip(
142		(ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
143		'123729224551268378999999999',
144		)
145		)
146
147	1	_substitutions = dict(
148		zip(
149		(ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
150		'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
151		)
152		)
153
154	1	def encode(self, word, max_length=-1):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
155		"""Return the SfinxBis code for a word.
156
157		Args:
158		word (str): The word to transform
159		max_length (int): The length of the code returned (defaults to
160		unlimited)
161
162		Returns:
163		tuple: The SfinxBis value
164
165		Examples:
166		>>> pe = SfinxBis()
167		>>> pe.encode('Christopher')
168		('K68376',)
169		>>> pe.encode('Niall')
170		('N4',)
171		>>> pe.encode('Smith')
172		('S53',)
173		>>> pe.encode('Schmidt')
174		('S53',)
175
176		>>> pe.encode('Johansson')
177		('J585',)
178		>>> pe.encode('Sjöberg')
179		('#162',)
180
181		"""
182
183	1	def _foersvensker(lokal_ordet):
184		"""Return the Swedish-ized form of the word.
185
186		Args:
187		lokal_ordet (str): Word to transform
188
189		Returns:
190		str: Transformed word
191
192		"""
193	1	lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
194	1	lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
195	1	lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
196	1	lokal_ordet = lokal_ordet.replace('SCH', 'SH')
197	1	lokal_ordet = lokal_ordet.replace('QU', 'KV')
198	1	lokal_ordet = lokal_ordet.replace('IO', 'JO')
199	1	lokal_ordet = lokal_ordet.replace('PH', 'F')
200
201	1	for i in self._harde_vokaler:
202	1	lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
203	1	lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
204	1	lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
205	1	for i in self._mjuka_vokaler:
206	1	lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
207	1	lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
208	1	lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
209
210	1	if 'H' in lokal_ordet:
211	1	for i in self._uc_c_set:
212	1	lokal_ordet = lokal_ordet.replace('H' + i, i)
213
214	1	lokal_ordet = lokal_ordet.translate(self._substitutions)
215
216	1	lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
217	1	lokal_ordet = lokal_ordet.replace('Þ', 'TH')
218	1	lokal_ordet = lokal_ordet.replace('ß', 'SS')
219
220	1	return lokal_ordet
221
222	1	def _koda_foersta_ljudet(lokal_ordet):
223		"""Return the word with the first sound coded.
224
225		Args:
226		lokal_ordet (str): Word to transform
227
228		Returns:
229		str: Transformed word
230
231		"""
232	1	if (
233		lokal_ordet[0:1] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
234		or lokal_ordet[0:1] in self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
235		):
236	1	lokal_ordet = '$' + lokal_ordet[1:]
237	1	elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
238	1	lokal_ordet = 'J' + lokal_ordet[2:]
239	1	elif (
240		lokal_ordet[0:1] == 'G'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
241		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
242		):
243	1	lokal_ordet = 'J' + lokal_ordet[1:]
244	1	elif lokal_ordet[0:1] == 'Q':
245	1	lokal_ordet = 'K' + lokal_ordet[1:]
246	1	elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in frozenset(
247		self._mjuka_vokaler | self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
248		):
249	1	lokal_ordet = '#' + lokal_ordet[2:]
250	1	elif (
251		lokal_ordet[0:1] == 'C'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
252		and lokal_ordet[1:2] in self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
253		):
254	1	lokal_ordet = 'K' + lokal_ordet[1:]
255	1	elif (
256		lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in self._uc_c_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
257		):
258	1	lokal_ordet = 'K' + lokal_ordet[1:]
259	1	elif lokal_ordet[0:1] == 'X':
260	1	lokal_ordet = 'S' + lokal_ordet[1:]
261	1	elif (
262		lokal_ordet[0:1] == 'C'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
263		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
264		):
265	1	lokal_ordet = 'S' + lokal_ordet[1:]
266	1	elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
267	1	lokal_ordet = '#' + lokal_ordet[3:]
268	1	elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
269	1	lokal_ordet = '#' + lokal_ordet[2:]
270	1	elif (
271		lokal_ordet[0:2] == 'SK'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
272		and lokal_ordet[2:3] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
273		):
274	1	lokal_ordet = '#' + lokal_ordet[2:]
275	1	elif (
276		lokal_ordet[0:1] == 'K'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
277		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
278		):
279	1	lokal_ordet = '#' + lokal_ordet[1:]
280	1	return lokal_ordet
281
282		# Steg 1, Versaler
283	1	word = unicode_normalize('NFC', text_type(word.upper()))
284	1	word = word.replace('ß', 'SS')
285	1	word = word.replace('-', ' ')
286
287		# Steg 2, Ta bort adelsprefix
288	1	for adelstitel in self._adelstitler:
289	1	while adelstitel in word:
290	1	word = word.replace(adelstitel, ' ')
291	1	if word.startswith(adelstitel[1:]):
292	1	word = word[len(adelstitel) - 1 :]
293
294		# Split word into tokens
295	1	ordlista = word.split()
296
297		# Steg 3, Ta bort dubbelteckning i början på namnet
298	1	ordlista = [
299		self._delete_consecutive_repeats(ordet) for ordet in ordlista
300		]
301	1	if not ordlista:
302		# noinspection PyRedundantParentheses
303	1	return ('',)
304
305		# Steg 4, Försvenskning
306	1	ordlista = [_foersvensker(ordet) for ordet in ordlista]
307
308		# Steg 5, Ta bort alla tecken som inte är A-Ö (65-90,196,197,214)
309	1	ordlista = [
310		''.join(c for c in ordet if c in self._uc_set)
311		for ordet in ordlista
312		]
313
314		# Steg 6, Koda första ljudet
315	1	ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]
316
317		# Steg 7, Dela upp namnet i två delar
318	1	rest = [ordet[1:] for ordet in ordlista]
319
320		# Steg 8, Utför fonetisk transformation i resten
321	1	rest = [ordet.replace('DT', 'T') for ordet in rest]
322	1	rest = [ordet.replace('X', 'KS') for ordet in rest]
323
324		# Steg 9, Koda resten till en sifferkod
325	1	for vokal in self._mjuka_vokaler:
326	1	rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
327	1	rest = [ordet.translate(self._trans) for ordet in rest]
328
329		# Steg 10, Ta bort intilliggande dubbletter
330	1	rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]
331
332		# Steg 11, Ta bort alla "9"
333	1	rest = [ordet.replace('9', '') for ordet in rest]
334
335		# Steg 12, Sätt ihop delarna igen
336	1	ordlista = [
337		''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest)
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
338		]
339
340		# truncate, if max_length is set
341	1	if max_length > 0:
342	1	ordlista = [ordet[:max_length] for ordet in ordlista]
343
344	1	return tuple(ordlista)
345
346
347	1	def sfinxbis(word, max_length=-1):
348		"""Return the SfinxBis code for a word.
349
350		This is a wrapper for :py:meth:`SfinxBis.encode`.
351
352		Args:
353		word (str): The word to transform
354		max_length (int): The length of the code returned (defaults to
355		unlimited)
356
357		Returns:
358		tuple: The SfinxBis value
359
360		Examples:
361		>>> sfinxbis('Christopher')
362		('K68376',)
363		>>> sfinxbis('Niall')
364		('N4',)
365		>>> sfinxbis('Smith')
366		('S53',)
367		>>> sfinxbis('Schmidt')
368		('S53',)
369
370		>>> sfinxbis('Johansson')
371		('J585',)
372		>>> sfinxbis('Sjöberg')
373		('#162',)
374
375		"""
376	1	return SfinxBis().encode(word, max_length)
377
378
379		if __name__ == '__main__':
380		import doctest
381
382		doctest.testmod()
383

chrislit / abydos

Pull Request — master (#141)

abydos.phonetic._SfinxBis.sfinxbis() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like