abydos.phonetic._haase.haase_phonetik() - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.phonetic._haase.haase_phonetik() A

↳ Parent: abydos.phonetic._haase

Complexity

Conditions

Size

Total Lines	32
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	2
dl	0
loc	32
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._haase.

Haase Phonetik
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from itertools import product
from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._phonetic import _Phonetic

__all__ = ['Haase', 'haase_phonetik']


class Haase(_Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.
    """

    # Letters that are all coded as '9'; note that J and Y are included.
    _uc_v_set = set('AEIJOUY')

    # Three-letter sequences and the alternative spelling each one spawns
    # when variant codes are requested.
    _len_3_vars = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }

    # Codes of the letters whose value never depends on a neighbour.
    _context_free_codes = {
        'B': '1',
        'F': '3',
        'V': '3',
        'W': '3',
        'G': '4',
        'K': '4',
        'Q': '4',
        'L': '5',
        'M': '6',
        'N': '6',
        'R': '7',
        'S': '8',
        'Z': '8',
    }

    # Neighbour sets used by the context-sensitive rules for D/T, C and X.
    _sibilant_followers = frozenset('CSZ')
    _c_after_sibilant = frozenset('SZ')
    _c_hard_initial = frozenset('AHKLOQRUX')
    _c_hard_medial = frozenset('AHKOQUX')
    _x_after_velar = frozenset('CKQ')

    def _variant_segments(self, word):
        """Split a normalized word into tuples of alternative spellings.

        Parameters
        ----------
        word : str
            The upper-cased, filtered word

        Returns
        -------
        list
            One tuple per consumed segment; a tuple holds the original
            spelling first, followed by its alternative (if there is one)

        """
        segments = []
        pos = 0
        # A leading CH may alternatively be read as SCH.
        if word[:2] == 'CH':
            segments.append(('CH', 'SCH'))
            pos = 2

        while pos < len(word):
            three = word[pos : pos + 3]
            tail = word[pos:]
            if word[pos : pos + 4] == 'ILLE':
                segments.append(('ILLE', 'I'))
                pos += 4
            elif three in self._len_3_vars:
                segments.append((three, self._len_3_vars[three]))
                pos += 3
            elif word[pos : pos + 2] == 'RB':
                segments.append(('RB', 'RW'))
                pos += 2
            elif tail == 'EAU':
                # Only a word-final EAU has an alternative.
                segments.append(('EAU', 'O'))
                pos += 3
            elif tail == 'O':
                # Word-final O
                segments.append(('O', 'OW'))
                pos += 1
            elif tail == 'A':
                # Word-final A
                segments.append(('A', 'AR'))
                pos += 1
            else:
                segments.append((word[pos],))
                pos += 1

        return segments

    def _haase_code(self, word):
        """Return the numeric code of a single spelling variant.

        Parameters
        ----------
        word : str
            One spelling variant (upper-case letters)

        Returns
        -------
        str
            The digits for the variant, after the result has been passed
            through ``_delete_consecutive_repeats``

        """
        digits = []
        for i, char in enumerate(word):
            # Slices (rather than indices) yield '' at either end of the
            # word, so no explicit bounds checks are needed below.
            prev_char = word[i - 1 : i] if i else ''
            next_char = word[i + 1 : i + 2]

            if char in self._uc_v_set:
                digits.append('9')
            elif char in self._context_free_codes:
                digits.append(self._context_free_codes[char])
            elif char == 'P':
                # PH is coded like F
                digits.append('3' if next_char == 'H' else '1')
            elif char in {'D', 'T'}:
                if next_char in self._sibilant_followers:
                    digits.append('8')
                else:
                    digits.append('2')
            elif char == 'C':
                if prev_char in self._c_after_sibilant:
                    digits.append('8')
                elif i == 0:
                    # Word-initial C additionally counts L and R as
                    # followers that give it the code '4'.
                    if next_char in self._c_hard_initial:
                        digits.append('4')
                    else:
                        digits.append('8')
                elif next_char in self._c_hard_medial:
                    digits.append('4')
                else:
                    digits.append('8')
            elif char == 'X':
                if prev_char in self._x_after_velar:
                    digits.append('8')
                else:
                    digits.append('48')
            # Any other character (e.g. H) contributes nothing.

        return self._delete_consecutive_repeats(''.join(digits))

    def encode(self, word, primary_only=False):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output codes are numeric, they are nevertheless strs.

        Parameters
        ----------
        word : str
            The word to transform
        primary_only : bool
            If True, only the primary code is returned

        Returns
        -------
        tuple
            The Haase Phonetik value(s) as numeric strings: the code of
            each spelling variant, in order, with duplicate codes omitted

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode('Joachim')
        ('9496',)
        >>> pe.encode('Christoph')
        ('4798293', '8798293')
        >>> pe.encode('Jörg')
        ('974',)
        >>> pe.encode('Smith')
        ('8692',)
        >>> pe.encode('Schmidt')
        ('8692', '4692')

        """
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        # NOTE(review): NFKD has already split precomposed Ä/Ö/Ü into a base
        # letter plus a combining diaeresis at this point, so (assuming the
        # literals below are precomposed) these replacements look like they
        # can never match. Kept as-is to preserve behavior -- confirm intent.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        word = ''.join(c for c in word if c in self._uc_set)

        if primary_only:
            variants = [word]
        else:
            # Every combination of per-segment alternatives is a variant.
            variants = [
                ''.join(letters)
                for letters in product(*self._variant_segments(word))
            ]

        # Encode each variant, keeping only the first occurrence of a code.
        seen = set()
        codes = []
        for variant in variants:
            code = self._haase_code(variant)
            if code not in seen:
                seen.add(code)
                codes.append(code)

        return tuple(codes)


def haase_phonetik(word, primary_only=False):
    """Encode a word with Haase Phonetik and return its numeric code(s).

    Convenience wrapper: builds a :py:class:`Haase` instance and delegates
    to :py:meth:`Haase.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    primary_only : bool
        If True, only the primary code is returned

    Returns
    -------
    tuple
        The Haase Phonetik code(s) of the word, each one a numeric string

    Examples
    --------
    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')

    """
    encoder = Haase()
    return encoder.encode(word, primary_only)


if __name__ == '__main__':
    # Executing the module directly runs the doctests embedded above.
    from doctest import testmod

    testmod()


1			# -*- coding: utf-8 -*-
2
3			# Copyright 2014-2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1		"""abydos.phonetic._haase.
20
21			Haase Phonetik
22			"""
23
24	1		from __future__ import (
25			absolute_import,
26			division,
27			print_function,
28			unicode_literals,
29			)
30
31	1		from itertools import product
32	1		from unicodedata import normalize as unicode_normalize
33
34	1		from six import text_type
35	1		from six.moves import range
36
37	1		from ._phonetic import _Phonetic
38
39	1		__all__ = ['Haase', 'haase_phonetik']
40
41
42	1		class Haase(_Phonetic):
			0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
43			"""Haase Phonetik.
44
45			Based on the algorithm described at :cite:`Prante:2015`.
46
47			Based on the original :cite:`Haase:2000`.
48			"""
49
50	1		_uc_v_set = set('AEIJOUY')
51
52	1		def encode(self, word, primary_only=False):
			0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
53			"""Return the Haase Phonetik (numeric output) code for a word.
54
55			While the output code is numeric, it is nevertheless a str.
56
57			Parameters
58			----------
59			word : str
60			The word to transform
61			primary_only : bool
62			If True, only the primary code is returned
63
64			Returns
65			-------
66			tuple
67			The Haase Phonetik value as a numeric string
68
69			Examples
70			--------
71			>>> pe = Haase()
72			>>> pe.encode('Joachim')
73			('9496',)
74			>>> pe.encode('Christoph')
75			('4798293', '8798293')
76			>>> pe.encode('Jörg')
77			('974',)
78			>>> pe.encode('Smith')
79			('8692',)
80			>>> pe.encode('Schmidt')
81			('8692', '4692')
82
83			"""
84
85	1		def _after(word, pos, letters):
86			"""Return True if word[pos] follows one of the supplied letters.
87
88			Parameters
89			----------
90			word : str
91			Word to modify
92			pos : int
93			Position to examine
94			letters : set
95			Letters to check for
96
97			Returns
98			-------
99			bool
100			True if word[pos] follows one of letters
101
102			"""
103	1		if pos > 0 and word[pos - 1] in letters:
104	1		return True
105	1		return False
106
107	1		def _before(word, pos, letters):
108			"""Return True if word[pos] precedes one of the supplied letters.
109
110			Parameters
111			----------
112			word : str
113			Word to modify
114			pos : int
115			Position to examine
116			letters : set
117			Letters to check for
118
119			Returns
120			-------
121			bool
122			True if word[pos] precedes one of letters
123
124			"""
125	1		if pos + 1 < len(word) and word[pos + 1] in letters:
126	1		return True
127	1		return False
128
129	1		word = unicode_normalize('NFKD', text_type(word.upper()))
130	1		word = word.replace('ß', 'SS')
131
132	1		word = word.replace('Ä', 'AE')
133	1		word = word.replace('Ö', 'OE')
134	1		word = word.replace('Ü', 'UE')
135	1		word = ''.join(c for c in word if c in self._uc_set)
136
137	1		variants = []
138	1		if primary_only:
139	1		variants = [word]
140			else:
141	1		pos = 0
142	1		if word[:2] == 'CH':
143	1		variants.append(('CH', 'SCH'))
144	1		pos += 2
145	1		len_3_vars = {
146			'OWN': 'AUN',
147			'WSK': 'RSK',
148			'SCH': 'CH',
149			'GLI': 'LI',
150			'AUX': 'O',
151			'EUX': 'O',
152			}
153	1		while pos < len(word):
154	1		if word[pos : pos + 4] == 'ILLE':
155	1		variants.append(('ILLE', 'I'))
156	1		pos += 4
157	1		elif word[pos : pos + 3] in len_3_vars:
158	1		variants.append(
159			(word[pos : pos + 3], len_3_vars[word[pos : pos + 3]])
160			)
161	1		pos += 3
162	1		elif word[pos : pos + 2] == 'RB':
163	1		variants.append(('RB', 'RW'))
164	1		pos += 2
165	1		elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
166	1		variants.append(('EAU', 'O'))
167	1		pos += 3
168	1		elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
169	1		if word[pos:] == 'O':
170	1		variants.append(('O', 'OW'))
171			else:
172	1		variants.append(('A', 'AR'))
173	1		pos += 1
174			else:
175	1		variants.append((word[pos],))
176	1		pos += 1
177
178	1		variants = [''.join(letters) for letters in product(*variants)]
179
180	1		def _haase_code(word):
181	1		sdx = ''
182	1		for i in range(len(word)):
			0 ignored issues – show unused-code introduced 2018-10-20 00:45 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
183	1	View Code Duplication	if word[i] in self._uc_v_set:
			0 ignored issues – show Duplication introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
184	1		sdx += '9'
185	1		elif word[i] == 'B':
186	1		sdx += '1'
187	1		elif word[i] == 'P':
188	1		if _before(word, i, {'H'}):
189	1		sdx += '3'
190			else:
191	1		sdx += '1'
192	1		elif word[i] in {'D', 'T'}:
193	1		if _before(word, i, {'C', 'S', 'Z'}):
194	1		sdx += '8'
195			else:
196	1		sdx += '2'
197	1		elif word[i] in {'F', 'V', 'W'}:
198	1		sdx += '3'
199	1		elif word[i] in {'G', 'K', 'Q'}:
200	1		sdx += '4'
201	1		elif word[i] == 'C':
202	1		if _after(word, i, {'S', 'Z'}):
203	1		sdx += '8'
204	1		elif i == 0:
205	1		if _before(
206			word,
			0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
207			i,
			0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
208			{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'},
			0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
209			):
210	1		sdx += '4'
211			else:
212	1		sdx += '8'
213	1		elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
214	1		sdx += '4'
215			else:
216	1		sdx += '8'
217	1		elif word[i] == 'X':
218	1		if _after(word, i, {'C', 'K', 'Q'}):
219	1		sdx += '8'
220			else:
221	1		sdx += '48'
222	1		elif word[i] == 'L':
223	1		sdx += '5'
224	1		elif word[i] in {'M', 'N'}:
225	1		sdx += '6'
226	1		elif word[i] == 'R':
227	1		sdx += '7'
228	1		elif word[i] in {'S', 'Z'}:
229	1		sdx += '8'
230
231	1		sdx = self._delete_consecutive_repeats(sdx)
232
233	1		return sdx
234
235	1		encoded = tuple(_haase_code(word) for word in variants)
236	1		if len(encoded) > 1:
237	1		encoded_set = set()
238	1		encoded_single = []
239	1		for code in encoded:
240	1		if code not in encoded_set:
241	1		encoded_set.add(code)
242	1		encoded_single.append(code)
243	1		return tuple(encoded_single)
244
245	1		return encoded
246
247
248	1		def haase_phonetik(word, primary_only=False):
249			"""Return the Haase Phonetik (numeric output) code for a word.
250
251			This is a wrapper for :py:meth:`Haase.encode`.
252
253			Parameters
254			----------
255			word : str
256			The word to transform
257			primary_only : bool
258			If True, only the primary code is returned
259
260			Returns
261			-------
262			tuple
263			The Haase Phonetik value as a numeric string
264
265			Examples
266			--------
267			>>> haase_phonetik('Joachim')
268			('9496',)
269			>>> haase_phonetik('Christoph')
270			('4798293', '8798293')
271			>>> haase_phonetik('Jörg')
272			('974',)
273			>>> haase_phonetik('Smith')
274			('8692',)
275			>>> haase_phonetik('Schmidt')
276			('8692', '4692')
277
278			"""
279	1		return Haase().encode(word, primary_only)
280
281
282			if __name__ == '__main__':
283			import doctest
284
285			doctest.testmod()
286

chrislit / abydos

Push — master ( f43547...71985b )

abydos.phonetic._haase.haase_phonetik() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like