abydos.phonetic._sv.norphone() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#138)

by Chris

created 2018-11-05 04:07 UTC

abydos.phonetic._sv.norphone() A

↳ Parent: abydos.phonetic._sv

Complexity

Conditions

Size

Total Lines	21
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
eloc	2
dl	0
loc	21
ccs	2
cts	2
cp	1
rs	10
c	0
b	0
f	0
cc	1
nop	1
crap	1

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._sv.

The phonetic._sv module implements phonetic algorithms for Scandinavian names
& languages (currently Swedish & Norwegian), including:

    - SfinxBis
    - Norphone
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import Phonetic

__all__ = ['Norphone', 'SfinxBis', 'norphone', 'sfinxbis']


class SfinxBis(Phonetic):
    """SfinxBis code.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.
    """

    # Nobiliary particles and similar name prefixes ("adelstitler") that are
    # removed before encoding. Every entry is padded with spaces so that only
    # whole tokens match; the multi-word particles are listed (and therefore
    # removed) before the single-word ones.
    _adelstitler = (
        ' DE LA ',
        ' DE LAS ',
        ' DE LOS ',
        ' VAN DE ',
        ' VAN DEN ',
        ' VAN DER ',
        ' VON DEM ',
        ' VON DER ',
        ' AF ',
        ' AV ',
        ' DA ',
        ' DE ',
        ' DEL ',
        ' DEN ',
        ' DES ',
        ' DI ',
        ' DO ',
        ' DON ',
        ' DOS ',
        ' DU ',
        ' E ',
        ' IN ',
        ' LA ',
        ' LE ',
        ' MAC ',
        ' MC ',
        ' VAN ',
        ' VON ',
        ' Y ',
        ' S:T ',
    )

    # "Hard" and "soft" vowels; used for membership tests only.
    _harde_vokaler = {'A', 'O', 'U', 'Å'}
    _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
    # The same vowels (hard first, then soft) in a fixed order. The vowel
    # rewriting in encode() is order-sensitive, so it must not iterate over
    # the sets above: set iteration order for strings can differ from one
    # interpreter run to the next.
    _vowel_order = 'AOUÅEIYÄÖ'
    # Upper-case consonants.
    _uc_c_set = {
        'B',
        'C',
        'D',
        'F',
        'G',
        'H',
        'J',
        'K',
        'L',
        'M',
        'N',
        'P',
        'Q',
        'R',
        'S',
        'T',
        'V',
        'W',
        'X',
        'Z',
    }
    # Every character that survives step 5 of encode() (A-Z plus Ä, Å, Ö).
    _uc_set = {
        'A',
        'B',
        'C',
        'D',
        'E',
        'F',
        'G',
        'H',
        'I',
        'J',
        'K',
        'L',
        'M',
        'N',
        'O',
        'P',
        'Q',
        'R',
        'S',
        'T',
        'U',
        'V',
        'W',
        'X',
        'Y',
        'Z',
        'Ä',
        'Å',
        'Ö',
    }

    # Code point -> digit table for the tail of each name. All vowels map to
    # '9', which is stripped again in step 11 of encode().
    _trans = dict(
        zip(
            (ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
            '123729224551268378999999999',
        )
    )

    # Code point -> letter table folding W, Z and accented letters onto the
    # letters they are encoded as.
    _substitutions = dict(
        zip(
            (ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
            'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
        )
    )

    def encode(self, word, max_length=-1):
        """Return the SfinxBis code for a word.

        One code is produced for every whitespace- or hyphen-separated part
        of the name that remains after the nobiliary particles have been
        removed.

        :param str word: the word to transform
        :param int max_length: the length of the code returned (defaults to
            unlimited); only values greater than 0 cause truncation
        :returns: the SfinxBis value; ``('',)`` if no name part remains
        :rtype: tuple

        >>> pe = SfinxBis()
        >>> pe.encode('Christopher')
        ('K68376',)
        >>> pe.encode('Niall')
        ('N4',)
        >>> pe.encode('Smith')
        ('S53',)
        >>> pe.encode('Schmidt')
        ('S53',)

        >>> pe.encode('Johansson')
        ('J585',)
        >>> pe.encode('Sjöberg')
        ('#162',)
        """

        def _foersvensker(lokal_ordet):
            """Return the Swedish-ized form of the word."""
            lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
            lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
            lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
            lokal_ordet = lokal_ordet.replace('SCH', 'SH')
            lokal_ordet = lokal_ordet.replace('QU', 'KV')
            lokal_ordet = lokal_ordet.replace('IO', 'JO')
            lokal_ordet = lokal_ordet.replace('PH', 'F')

            # A Ü, Y or I following a vowel becomes J. Rewriting e.g. 'YI'
            # changes what a later 'EY' rule sees, so the vowels are visited
            # in a fixed order (hard vowels first, then soft vowels) to keep
            # the result reproducible between interpreter runs.
            for vokal in self._vowel_order:
                for efterled in ('Ü', 'Y', 'I'):
                    lokal_ordet = lokal_ordet.replace(
                        vokal + efterled, vokal + 'J'
                    )

            # An H directly before a consonant is dropped.
            if 'H' in lokal_ordet:
                for i in self._uc_c_set:
                    lokal_ordet = lokal_ordet.replace('H' + i, i)

            lokal_ordet = lokal_ordet.translate(self._substitutions)

            lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
            lokal_ordet = lokal_ordet.replace('Þ', 'TH')
            lokal_ordet = lokal_ordet.replace('ß', 'SS')

            return lokal_ordet

        def _koda_foersta_ljudet(lokal_ordet):
            """Return the word with the first sound coded.

            The branches are tried in order and only the first match is
            applied, so their sequence is significant.
            """
            if (
                lokal_ordet[0:1] in self._mjuka_vokaler
                or lokal_ordet[0:1] in self._harde_vokaler
            ):
                # Any initial vowel is coded as '$'.
                lokal_ordet = '$' + lokal_ordet[1:]
            elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
                lokal_ordet = 'J' + lokal_ordet[2:]
            elif (
                lokal_ordet[0:1] == 'G'
                and lokal_ordet[1:2] in self._mjuka_vokaler
            ):
                lokal_ordet = 'J' + lokal_ordet[1:]
            elif lokal_ordet[0:1] == 'Q':
                lokal_ordet = 'K' + lokal_ordet[1:]
            elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in (
                self._mjuka_vokaler | self._harde_vokaler
            ):
                lokal_ordet = '#' + lokal_ordet[2:]
            elif (
                lokal_ordet[0:1] == 'C'
                and lokal_ordet[1:2] in self._harde_vokaler
            ):
                lokal_ordet = 'K' + lokal_ordet[1:]
            elif (
                lokal_ordet[0:1] == 'C'
                and lokal_ordet[1:2] in self._uc_c_set
            ):
                lokal_ordet = 'K' + lokal_ordet[1:]
            elif lokal_ordet[0:1] == 'X':
                lokal_ordet = 'S' + lokal_ordet[1:]
            elif (
                lokal_ordet[0:1] == 'C'
                and lokal_ordet[1:2] in self._mjuka_vokaler
            ):
                lokal_ordet = 'S' + lokal_ordet[1:]
            elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
                lokal_ordet = '#' + lokal_ordet[3:]
            elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
                lokal_ordet = '#' + lokal_ordet[2:]
            elif (
                lokal_ordet[0:2] == 'SK'
                and lokal_ordet[2:3] in self._mjuka_vokaler
            ):
                lokal_ordet = '#' + lokal_ordet[2:]
            elif (
                lokal_ordet[0:1] == 'K'
                and lokal_ordet[1:2] in self._mjuka_vokaler
            ):
                lokal_ordet = '#' + lokal_ordet[1:]
            return lokal_ordet

        # Step 1: upper-case (and NFC-normalize) the input
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = word.replace('-', ' ')

        # Step 2: remove nobiliary prefixes, both between name parts and at
        # the very start of the name (where there is no leading space)
        for adelstitel in self._adelstitler:
            while adelstitel in word:
                word = word.replace(adelstitel, ' ')
            if word.startswith(adelstitel[1:]):
                word = word[len(adelstitel) - 1 :]

        # Split word into tokens
        ordlista = word.split()

        # Step 3: remove doubled letters, using the
        # _delete_consecutive_repeats helper (not defined in this class;
        # presumably inherited from Phonetic)
        ordlista = [
            self._delete_consecutive_repeats(ordet) for ordet in ordlista
        ]
        if not ordlista:
            # noinspection PyRedundantParentheses
            return ('',)

        # Step 4: Swedish-ization
        ordlista = [_foersvensker(ordet) for ordet in ordlista]

        # Step 5: remove all characters that are not A-Ö (65-90,196,197,214)
        ordlista = [
            ''.join(c for c in ordet if c in self._uc_set)
            for ordet in ordlista
        ]

        # Step 6: code the first sound
        ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

        # Step 7: split the name into two parts (first character and rest)
        rest = [ordet[1:] for ordet in ordlista]

        # Step 8: perform the phonetic transformation on the rest
        rest = [ordet.replace('DT', 'T') for ordet in rest]
        rest = [ordet.replace('X', 'KS') for ordet in rest]

        # Step 9: code the rest as digits; a C before a soft vowel is coded
        # '8' instead of the '2' that _trans assigns to C
        for vokal in self._mjuka_vokaler:
            rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
        rest = [ordet.translate(self._trans) for ordet in rest]

        # Step 10: remove adjacent duplicates
        rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]

        # Step 11: remove every "9" (i.e. the vowels)
        rest = [ordet.replace('9', '') for ordet in rest]

        # Step 12: put the parts back together
        ordlista = [
            ''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest)
        ]

        # truncate, if max_length is set
        if max_length > 0:
            ordlista = [ordet[:max_length] for ordet in ordlista]

        return tuple(ordlista)


def sfinxbis(word, max_length=-1):
    """Return the SfinxBis code for a word.

    This is a wrapper for :py:meth:`SfinxBis.encode`: it creates a fresh
    :py:class:`SfinxBis` instance and delegates to it.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    encoder = SfinxBis()
    return encoder.encode(word, max_length)


class Norphone(Phonetic):
    """Norphone.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.
    """

    # Upper-case vowels (Norwegian and Swedish).
    _uc_v_set = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Substring rewrites, grouped by the length of the substring matched.
    # encode() tries the longest substrings first.
    _replacements = {
        4: {'SKEI': 'X'},
        3: {'SKJ': 'X', 'KEI': 'X'},
        2: {
            'CH': 'K',
            'CK': 'K',
            'GJ': 'J',
            'GH': 'K',
            'HG': 'K',
            'HJ': 'J',
            'HL': 'L',
            'HR': 'R',
            'KJ': 'X',
            'KI': 'X',
            'LD': 'L',
            'ND': 'N',
            'PH': 'F',
            'TH': 'T',
            'SJ': 'X',
        },
        1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'},
    }

    def encode(self, word):
        """Return the Norphone code.

        The word is upper-cased, a handful of word-initial and word-final
        special cases are applied, and the remainder is scanned left to
        right: substrings listed in ``_replacements`` are rewritten, vowels
        are dropped everywhere but in first position, and all other
        characters are copied unchanged.

        :param str word: the word to transform
        :returns: the Norphone code
        :rtype: str

        >>> pe = Norphone()
        >>> pe.encode('Hansen')
        'HNSN'
        >>> pe.encode('Larsen')
        'LRSN'
        >>> pe.encode('Aagaard')
        'ÅKRT'
        >>> pe.encode('Braaten')
        'BRTN'
        >>> pe.encode('Sandvik')
        'SNVK'
        """
        word = word.upper()

        # Word-initial special cases: the first matching prefix supplies the
        # start of the code, and its characters are skipped by the scan.
        code, skip = '', 0
        initial_rules = (
            ('AA', 'Å'),
            ('GI', 'J'),
            ('SKY', 'X'),
            ('EI', 'Æ'),
            ('KY', 'X'),
            ('C', 'K'),
            ('Ä', 'Æ'),
            ('Ö', 'Ø'),
        )
        for prefix, initial in initial_rules:
            if word.startswith(prefix):
                code, skip = initial, len(prefix)
                break

        # Word-final special cases.
        if word[-2:] == 'DT':
            word = word[:-2] + 'T'
        # Though the rules indicate this rule applies in all positions, the
        # reference implementation indicates it applies only in final position.
        elif word[-1:] == 'D' and word[-2:-1] in self._uc_v_set:
            word = word[:-2]

        # Longest substrings are tried first at every position.
        lengths = sorted(self._replacements, reverse=True)

        for pos, char in enumerate(word):
            if skip:
                # Still inside a prefix or substring that was already coded.
                skip -= 1
                continue

            for length in lengths:
                chunk = word[pos : pos + length]
                table = self._replacements[length]
                if chunk in table:
                    code += table[chunk]
                    skip = length - 1
                    break
            else:
                # No rewrite applies: keep the character unless it is a
                # vowel outside the first position.
                if pos == 0 or char not in self._uc_v_set:
                    code += char

        # Collapse runs of identical code letters using the
        # _delete_consecutive_repeats helper (not defined in this class;
        # presumably inherited from Phonetic).
        return self._delete_consecutive_repeats(code)


def norphone(word):
    """Return the Norphone code.

    This is a wrapper for :py:meth:`Norphone.encode`: it creates a fresh
    :py:class:`Norphone` instance and delegates to it.

    :param str word: the word to transform
    :returns: the Norphone code
    :rtype: str

    >>> norphone('Hansen')
    'HNSN'
    >>> norphone('Larsen')
    'LRSN'
    >>> norphone('Aagaard')
    'ÅKRT'
    >>> norphone('Braaten')
    'BRTN'
    >>> norphone('Sandvik')
    'SNVK'
    """
    encoder = Norphone()
    return encoder.encode(word)


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._sv.
20
21		The phonetic._sv module implements phonetic algorithms for Scandinavian names
22		& languages (currently Swedish & Norwegian), including:
23
24		- SfinxBis
25		- Norphone
26		"""
27
28	1	from __future__ import unicode_literals
29
30	1	from unicodedata import normalize as unicode_normalize
31
32	1	from six import text_type
33
34	1	from ._phonetic import Phonetic
35
36	1	__all__ = ['Norphone', 'SfinxBis', 'norphone', 'sfinxbis']
37
38
39	1	class SfinxBis(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
40		"""SfinxBis code.
41
42		SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.
43
44		This implementation follows the reference implementation:
45		:cite:`Sjoo:2009`.
46
47		SfinxBis is intended chiefly for Swedish names.
48		"""
49
50	1	_adelstitler = (
51		' DE LA ',
52		' DE LAS ',
53		' DE LOS ',
54		' VAN DE ',
55		' VAN DEN ',
56		' VAN DER ',
57		' VON DEM ',
58		' VON DER ',
59		' AF ',
60		' AV ',
61		' DA ',
62		' DE ',
63		' DEL ',
64		' DEN ',
65		' DES ',
66		' DI ',
67		' DO ',
68		' DON ',
69		' DOS ',
70		' DU ',
71		' E ',
72		' IN ',
73		' LA ',
74		' LE ',
75		' MAC ',
76		' MC ',
77		' VAN ',
78		' VON ',
79		' Y ',
80		' S:T ',
81		)
82
83	1	_harde_vokaler = {'A', 'O', 'U', 'Å'}
84	1	_mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'}
85	1	_uc_c_set = {
86		'B',
87		'C',
88		'D',
89		'F',
90		'G',
91		'H',
92		'J',
93		'K',
94		'L',
95		'M',
96		'N',
97		'P',
98		'Q',
99		'R',
100		'S',
101		'T',
102		'V',
103		'W',
104		'X',
105		'Z',
106		}
107	1	_uc_set = {
108		'A',
109		'B',
110		'C',
111		'D',
112		'E',
113		'F',
114		'G',
115		'H',
116		'I',
117		'J',
118		'K',
119		'L',
120		'M',
121		'N',
122		'O',
123		'P',
124		'Q',
125		'R',
126		'S',
127		'T',
128		'U',
129		'V',
130		'W',
131		'X',
132		'Y',
133		'Z',
134		'Ä',
135		'Å',
136		'Ö',
137		}
138
139	1	_trans = dict(
140		zip(
141		(ord(_) for _ in 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
142		'123729224551268378999999999',
143		)
144		)
145
146	1	_substitutions = dict(
147		zip(
148		(ord(_) for _ in 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
149		'VSAAAAÄCEEEEIIIINOOOOÖUUUYY',
150		)
151		)
152
153	1	def encode(self, word, max_length=-1):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
154		"""Return the SfinxBis code for a word.
155
156		:param str word: the word to transform
157		:param int max_length: the length of the code returned (defaults to
158		unlimited)
159		:returns: the SfinxBis value
160		:rtype: tuple
161
162		>>> pe = SfinxBis()
163		>>> pe.encode('Christopher')
164		('K68376',)
165		>>> pe.encode('Niall')
166		('N4',)
167		>>> pe.encode('Smith')
168		('S53',)
169		>>> pe.encode('Schmidt')
170		('S53',)
171
172		>>> pe.encode('Johansson')
173		('J585',)
174		>>> pe.encode('Sjöberg')
175		('#162',)
176		"""
177
178	1	def _foersvensker(lokal_ordet):
179		"""Return the Swedish-ized form of the word."""
180	1	lokal_ordet = lokal_ordet.replace('STIERN', 'STJÄRN')
181	1	lokal_ordet = lokal_ordet.replace('HIE', 'HJ')
182	1	lokal_ordet = lokal_ordet.replace('SIÖ', 'SJÖ')
183	1	lokal_ordet = lokal_ordet.replace('SCH', 'SH')
184	1	lokal_ordet = lokal_ordet.replace('QU', 'KV')
185	1	lokal_ordet = lokal_ordet.replace('IO', 'JO')
186	1	lokal_ordet = lokal_ordet.replace('PH', 'F')
187
188	1	for i in self._harde_vokaler:
189	1	lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
190	1	lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
191	1	lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
192	1	for i in self._mjuka_vokaler:
193	1	lokal_ordet = lokal_ordet.replace(i + 'Ü', i + 'J')
194	1	lokal_ordet = lokal_ordet.replace(i + 'Y', i + 'J')
195	1	lokal_ordet = lokal_ordet.replace(i + 'I', i + 'J')
196
197	1	if 'H' in lokal_ordet:
198	1	for i in self._uc_c_set:
199	1	lokal_ordet = lokal_ordet.replace('H' + i, i)
200
201	1	lokal_ordet = lokal_ordet.translate(self._substitutions)
202
203	1	lokal_ordet = lokal_ordet.replace('Ð', 'ETH')
204	1	lokal_ordet = lokal_ordet.replace('Þ', 'TH')
205	1	lokal_ordet = lokal_ordet.replace('ß', 'SS')
206
207	1	return lokal_ordet
208
209	1	def _koda_foersta_ljudet(lokal_ordet):
210		"""Return the word with the first sound coded."""
211	1	if (
212		lokal_ordet[0:1] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
213		or lokal_ordet[0:1] in self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
214		):
215	1	lokal_ordet = '$' + lokal_ordet[1:]
216	1	elif lokal_ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
217	1	lokal_ordet = 'J' + lokal_ordet[2:]
218	1	elif (
219		lokal_ordet[0:1] == 'G'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
220		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
221		):
222	1	lokal_ordet = 'J' + lokal_ordet[1:]
223	1	elif lokal_ordet[0:1] == 'Q':
224	1	lokal_ordet = 'K' + lokal_ordet[1:]
225	1	elif lokal_ordet[0:2] == 'CH' and lokal_ordet[2:3] in frozenset(
226		self._mjuka_vokaler | self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
227		):
228	1	lokal_ordet = '#' + lokal_ordet[2:]
229	1	elif (
230		lokal_ordet[0:1] == 'C'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
231		and lokal_ordet[1:2] in self._harde_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
232		):
233	1	lokal_ordet = 'K' + lokal_ordet[1:]
234	1	elif (
235		lokal_ordet[0:1] == 'C' and lokal_ordet[1:2] in self._uc_c_set
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
236		):
237	1	lokal_ordet = 'K' + lokal_ordet[1:]
238	1	elif lokal_ordet[0:1] == 'X':
239	1	lokal_ordet = 'S' + lokal_ordet[1:]
240	1	elif (
241		lokal_ordet[0:1] == 'C'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
242		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
243		):
244	1	lokal_ordet = 'S' + lokal_ordet[1:]
245	1	elif lokal_ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
246	1	lokal_ordet = '#' + lokal_ordet[3:]
247	1	elif lokal_ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
248	1	lokal_ordet = '#' + lokal_ordet[2:]
249	1	elif (
250		lokal_ordet[0:2] == 'SK'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
251		and lokal_ordet[2:3] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
252		):
253	1	lokal_ordet = '#' + lokal_ordet[2:]
254	1	elif (
255		lokal_ordet[0:1] == 'K'
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
256		and lokal_ordet[1:2] in self._mjuka_vokaler
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
257		):
258	1	lokal_ordet = '#' + lokal_ordet[1:]
259	1	return lokal_ordet
260
261		# Steg 1, Versaler
262	1	word = unicode_normalize('NFC', text_type(word.upper()))
263	1	word = word.replace('ß', 'SS')
264	1	word = word.replace('-', ' ')
265
266		# Steg 2, Ta bort adelsprefix
267	1	for adelstitel in self._adelstitler:
268	1	while adelstitel in word:
269	1	word = word.replace(adelstitel, ' ')
270	1	if word.startswith(adelstitel[1:]):
271	1	word = word[len(adelstitel) - 1 :]
272
273		# Split word into tokens
274	1	ordlista = word.split()
275
276		# Steg 3, Ta bort dubbelteckning i början på namnet
277	1	ordlista = [
278		self._delete_consecutive_repeats(ordet) for ordet in ordlista
279		]
280	1	if not ordlista:
281		# noinspection PyRedundantParentheses
282	1	return ('',)
283
284		# Steg 4, Försvenskning
285	1	ordlista = [_foersvensker(ordet) for ordet in ordlista]
286
287		# Steg 5, Ta bort alla tecken som inte är A-Ö (65-90,196,197,214)
288	1	ordlista = [
289		''.join(c for c in ordet if c in self._uc_set)
290		for ordet in ordlista
291		]
292
293		# Steg 6, Koda första ljudet
294	1	ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]
295
296		# Steg 7, Dela upp namnet i två delar
297	1	rest = [ordet[1:] for ordet in ordlista]
298
299		# Steg 8, Utför fonetisk transformation i resten
300	1	rest = [ordet.replace('DT', 'T') for ordet in rest]
301	1	rest = [ordet.replace('X', 'KS') for ordet in rest]
302
303		# Steg 9, Koda resten till en sifferkod
304	1	for vokal in self._mjuka_vokaler:
305	1	rest = [ordet.replace('C' + vokal, '8' + vokal) for ordet in rest]
306	1	rest = [ordet.translate(self._trans) for ordet in rest]
307
308		# Steg 10, Ta bort intilliggande dubbletter
309	1	rest = [self._delete_consecutive_repeats(ordet) for ordet in rest]
310
311		# Steg 11, Ta bort alla "9"
312	1	rest = [ordet.replace('9', '') for ordet in rest]
313
314		# Steg 12, Sätt ihop delarna igen
315	1	ordlista = [
316		''.join(ordet) for ordet in zip((_[0:1] for _ in ordlista), rest)
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
317		]
318
319		# truncate, if max_length is set
320	1	if max_length > 0:
321	1	ordlista = [ordet[:max_length] for ordet in ordlista]
322
323	1	return tuple(ordlista)
324
325
326	1	def sfinxbis(word, max_length=-1):
327		"""Return the SfinxBis code for a word.
328
329		This is a wrapper for :py:meth:`SfinxBis.encode`.
330
331		:param str word: the word to transform
332		:param int max_length: the length of the code returned (defaults to
333		unlimited)
334		:returns: the SfinxBis value
335		:rtype: tuple
336
337		>>> sfinxbis('Christopher')
338		('K68376',)
339		>>> sfinxbis('Niall')
340		('N4',)
341		>>> sfinxbis('Smith')
342		('S53',)
343		>>> sfinxbis('Schmidt')
344		('S53',)
345
346		>>> sfinxbis('Johansson')
347		('J585',)
348		>>> sfinxbis('Sjöberg')
349		('#162',)
350		"""
351	1	return SfinxBis().encode(word, max_length)
352
353
354	1	class Norphone(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
355		"""Norphone.
356
357		The reference implementation by Lars Marius Garshol is available in
358		:cite:`Garshol:2015`.
359
360		Norphone was designed for Norwegian, but this implementation has been
361		extended to support Swedish vowels as well. This function incorporates
362		the "not implemented" rules from the above file's rule set.
363		"""
364
365	1	_uc_v_set = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}
366
367	1	_replacements = {
368		4: {'SKEI': 'X'},
369		3: {'SKJ': 'X', 'KEI': 'X'},
370		2: {
371		'CH': 'K',
372		'CK': 'K',
373		'GJ': 'J',
374		'GH': 'K',
375		'HG': 'K',
376		'HJ': 'J',
377		'HL': 'L',
378		'HR': 'R',
379		'KJ': 'X',
380		'KI': 'X',
381		'LD': 'L',
382		'ND': 'N',
383		'PH': 'F',
384		'TH': 'T',
385		'SJ': 'X',
386		},
387		1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'},
388		}
389
390	1	def encode(self, word):
391		"""Return the Norphone code.
392
393		:param str word: the word to transform
394		:returns: the Norphone code
395		:rtype: str
396
397		>>> pe = Norphone()
398		>>> pe.encode('Hansen')
399		'HNSN'
400		>>> pe.encode('Larsen')
401		'LRSN'
402		>>> pe.encode('Aagaard')
403		'ÅKRT'
404		>>> pe.encode('Braaten')
405		'BRTN'
406		>>> pe.encode('Sandvik')
407		'SNVK'
408		"""
409	1	word = word.upper()
410
411	1	code = ''
412	1	skip = 0
413
414	1	if word[0:2] == 'AA':
415	1	code = 'Å'
416	1	skip = 2
417	1	elif word[0:2] == 'GI':
418	1	code = 'J'
419	1	skip = 2
420	1	elif word[0:3] == 'SKY':
421	1	code = 'X'
422	1	skip = 3
423	1	elif word[0:2] == 'EI':
424	1	code = 'Æ'
425	1	skip = 2
426	1	elif word[0:2] == 'KY':
427	1	code = 'X'
428	1	skip = 2
429	1	elif word[:1] == 'C':
430	1	code = 'K'
431	1	skip = 1
432	1	elif word[:1] == 'Ä':
433	1	code = 'Æ'
434	1	skip = 1
435	1	elif word[:1] == 'Ö':
436	1	code = 'Ø'
437	1	skip = 1
438
439	1	if word[-2:] == 'DT':
440	1	word = word[:-2] + 'T'
441		# Though the rules indicate this rule applies in all positions, the
442		# reference implementation indicates it applies only in final position.
443	1	elif word[-2:-1] in self._uc_v_set and word[-1:] == 'D':
444	1	word = word[:-2]
445
446	1	for pos, char in enumerate(word):
447	1	if skip:
448	1	skip -= 1
449		else:
450	1	for length in sorted(self._replacements, reverse=True):
451	1	if word[pos : pos + length] in self._replacements[length]:
452	1	code += self._replacements[length][
453		word[pos : pos + length]
454		]
455	1	skip = length - 1
456	1	break
457		else:
458	1	if not pos or char not in self._uc_v_set:
459	1	code += char
460
461	1	code = self._delete_consecutive_repeats(code)
462
463	1	return code
464
465
466	1	def norphone(word):
467		"""Return the Norphone code.
468
469		This is a wrapper for :py:meth:`Norphone.encode`.
470
471		:param str word: the word to transform
472		:returns: the Norphone code
473		:rtype: str
474
475		>>> norphone('Hansen')
476		'HNSN'
477		>>> norphone('Larsen')
478		'LRSN'
479		>>> norphone('Aagaard')
480		'ÅKRT'
481		>>> norphone('Braaten')
482		'BRTN'
483		>>> norphone('Sandvik')
484		'SNVK'
485		"""
486	1	return Norphone().encode(word)
487
488
489		if __name__ == '__main__':
490		import doctest
491
492		doctest.testmod()
493

chrislit / abydos

Pull Request — master (#138)

abydos.phonetic._sv.norphone() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like