abydos.phonetic._es.SpanishMetaphone.encode() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-08 03:44 UTC

abydos.phonetic._es.SpanishMetaphone.encode() F

↳ Parent: abydos.phonetic._es

Complexity

Conditions

Size

Total Lines	172
Code Lines	101

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	87
CRAP Score	29

Importance

Changes

Metric	Value
eloc	101
dl	0
loc	172
ccs	87
cts	87
cp	1
rs	0
c	0
b	0
f	0
cc	29
nop	4
crap	29

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._es.

The phonetic._es module implements phonetic algorithms intended for Spanish,
including:

    - Phonetic Spanish
    - Spanish Metaphone
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import Phonetic

# Names exported by ``from ... import *``: the two encoder classes defined in
# this module and their function-style wrappers.
__all__ = [
    'PhoneticSpanish',
    'SpanishMetaphone',
    'phonetic_spanish',
    'spanish_metaphone',
]


class PhoneticSpanish(Phonetic):
    """Soundex-style phonetic encoder for Spanish.

    The coding follows :cite:`Amon:2012` and :cite:`delPilarAngeles:2015`.
    """

    # Code point of each retained consonant mapped to its one-digit code.
    _trans = {
        ord(letter): digit
        for letter, digit in zip(
            'BCDFGHJKLMNPQRSTVXYZ', '14328287566079431454'
        )
    }

    # Letters kept by encode(): A-Z without the vowels and without W.
    _uc_set = set('BCDFGHJKLMNPQRSTVXYZ')

    def encode(self, word, max_length=-1):
        """Return the PhoneticSpanish coding of word.

        Args:
            word (str): The word to transform
            max_length (int): If positive, the code is right-padded with
                zeros or truncated to exactly this length; any other value
                returns the code at its natural length (the default)

        Returns:
            str: The PhoneticSpanish code

        Examples:
            >>> pe = PhoneticSpanish()
            >>> pe.encode('Perez')
            '094'
            >>> pe.encode('Martinez')
            '69364'
            >>> pe.encode('Gutierrez')
            '83994'
            >>> pe.encode('Santiago')
            '4638'
            >>> pe.encode('Nicolás')
            '6454'

        """
        # Uppercase and decompose so accented letters split into a base
        # letter plus combining marks; the filter then drops the marks along
        # with every letter that has no code.
        decomposed = unicode_normalize('NFKD', text_type(word.upper()))
        consonants = ''.join(
            letter for letter in decomposed if letter in self._uc_set
        )

        # Collapse every 'LL' pair to a single 'L'.
        consonants = consonants.replace('LL', 'L')
        # NOTE(review): replacing 'R' with 'R' changes nothing, so doubled Rs
        # are not merged (the 'Gutierrez' example above keeps both 9s).
        # 'RR' may have been intended -- confirm against the cited papers
        # before changing, since it would alter existing codes.
        consonants = consonants.replace('R', 'R')

        # Map each remaining consonant to its digit.
        code = consonants.translate(self._trans)

        if max_length > 0:
            return code.ljust(max_length, '0')[:max_length]
        return code


def phonetic_spanish(word, max_length=-1):
    """Return the PhoneticSpanish coding of word.

    Convenience wrapper that builds a :py:class:`PhoneticSpanish` instance
    and delegates to :py:meth:`PhoneticSpanish.encode`.

    Args:
        word (str): The word to transform
        max_length (int): The length of the code returned (defaults to
            unlimited)

    Returns:
        str: The PhoneticSpanish code

    Examples:
        >>> phonetic_spanish('Perez')
        '094'
        >>> phonetic_spanish('Martinez')
        '69364'
        >>> phonetic_spanish('Gutierrez')
        '83994'
        >>> phonetic_spanish('Santiago')
        '4638'
        >>> phonetic_spanish('Nicolás')
        '6454'

    """
    encoder = PhoneticSpanish()
    return encoder.encode(word, max_length)


class SpanishMetaphone(Phonetic):
    """Spanish Metaphone.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    Modified version based on :cite:`delPilarAngeles:2016`.
    """

    # Substitutions applied, in this order, only for the modified algorithm.
    _modified_subs = (('MB', 'NB'), ('MP', 'NP'), ('BS', 'S'))

    # Substitutions applied, in this order, to every word. Order matters:
    # 'CH' is rewritten before 'Ç' becomes 'S', and 'GÜ' before the bare 'Ü'.
    _simple_subs = (
        ('Á', 'A'),
        ('CH', 'X'),
        ('Ç', 'S'),
        ('É', 'E'),
        ('Í', 'I'),
        ('Ó', 'O'),
        ('Ú', 'U'),
        ('Ñ', 'NY'),
        ('GÜ', 'W'),
        ('Ü', 'U'),
        ('B', 'V'),
        ('LL', 'Y'),
    )

    _vowel_set = frozenset('AEIOU')
    # Vowels that change the coding of a preceding C or G.
    _front_vowels = frozenset('EI')
    # Consonants copied to the key unchanged (a doubled one is coded once).
    _plain_consonants = frozenset('DFJKMNPTVLY')
    # Letters with a single context-free code.
    _fixed_codes = {'W': 'U', 'R': 'R', 'Z': 'Z'}

    def _prepare(self, word, modified):
        """Return word uppercased, NFC-normalized and pre-substituted.

        Args:
            word (str): The raw input word
            modified (bool): Whether to apply the modified-version rewrites

        Returns:
            str: The word ready for character-by-character encoding

        """
        # NFC composes accented letters into single code points so the
        # literal replacements below can match them.
        word = unicode_normalize('NFC', text_type(word.upper()))

        if modified:
            for old, new in self._modified_subs:
                word = word.replace(old, new)
            # A word-initial 'PS' loses its P.
            if word[:2] == 'PS':
                word = word[1:]

        for old, new in self._simple_subs:
            word = word.replace(old, new)
        return word

    def _encode_at(self, word, pos):
        """Return the code for word[pos] and how many characters it consumes.

        Args:
            word (str): The prepared word
            pos (int): Index of the character to encode (must be in range)

        Returns:
            tuple: ``(code, step)`` where code is the (possibly empty) string
            to append to the key and step is 1 or 2

        """
        char = word[pos]
        # One-character lookahead; the empty string at the end of the word.
        following = word[pos + 1 : pos + 2]
        before_vowel = following in self._vowel_set

        # A vowel is only kept when it starts the word.
        if pos == 0 and char in self._vowel_set:
            return char, 1

        if char in self._plain_consonants:
            return char, (2 if following == char else 1)

        if char == 'C':
            # 'acción', 'reacción', etc.
            if following == 'C':
                return 'X', 2
            # 'cesar', 'cien', 'cid', 'conciencia'
            if following in self._front_vowels:
                return 'Z', 2
            return 'K', 1

        if char == 'G':
            # 'gente', 'ecologia', etc.
            if following in self._front_vowels:
                return 'J', 2
            return 'G', 1

        if char == 'H':
            # H is silent in Spanish, so a following vowel stands in for it.
            if before_vowel:
                return following, 2
            return 'H', 1

        if char == 'Q':
            # 'QU' is coded as a single K.
            return 'K', (2 if following == 'U' else 1)

        if char in self._fixed_codes:
            return self._fixed_codes[char], 1

        if char == 'S':
            # Word-initial S not followed by a vowel gets a leading E.
            if pos == 0 and not before_vowel:
                return 'ES', 1
            return 'S', 1

        if char == 'X':
            # Same for word-initial X, but only in words longer than one
            # character.
            if len(word) > 1 and pos == 0 and not before_vowel:
                return 'EX', 1
            return 'X', 1

        # Anything else (non-initial vowels, digits, punctuation) is skipped.
        return '', 1

    def encode(self, word, max_length=6, modified=False):
        """Return the Spanish Metaphone of a word.

        Args:
            word (str): The word to transform
            max_length (int): The maximum length of the code returned
                (defaults to 6). Encoding stops once the key reaches this
                length; because a word-initial 'ES'/'EX' adds two characters
                at once, a value of 1 can yield a two-character code.
            modified (bool): Set to True to use del Pilar Angeles &
                Bailón-Miguel's modified version of the algorithm

        Returns:
            str: The Spanish Metaphone code

        Examples:
            >>> pe = SpanishMetaphone()
            >>> pe.encode('Perez')
            'PRZ'
            >>> pe.encode('Martinez')
            'MRTNZ'
            >>> pe.encode('Gutierrez')
            'GTRRZ'
            >>> pe.encode('Santiago')
            'SNTG'
            >>> pe.encode('Nicolás')
            'NKLS'

        """
        word = self._prepare(word, modified)

        meta_key = ''
        pos = 0
        while len(meta_key) < max_length and pos < len(word):
            code, step = self._encode_at(word, pos)
            meta_key += code
            pos += step

        # Final change from S to Z in modified version
        if modified:
            meta_key = meta_key.replace('S', 'Z')

        return meta_key


def spanish_metaphone(word, max_length=6, modified=False):
    """Return the Spanish Metaphone of a word.

    Convenience wrapper that builds a :py:class:`SpanishMetaphone` instance
    and delegates to :py:meth:`SpanishMetaphone.encode`.

    Args:
        word (str): The word to transform
        max_length (int): The length of the code returned (defaults to 6)
        modified (bool): Set to True to use del Pilar Angeles &
            Bailón-Miguel's modified version of the algorithm

    Returns:
        str: The Spanish Metaphone code

    Examples:
        >>> spanish_metaphone('Perez')
        'PRZ'
        >>> spanish_metaphone('Martinez')
        'MRTNZ'
        >>> spanish_metaphone('Gutierrez')
        'GTRRZ'
        >>> spanish_metaphone('Santiago')
        'SNTG'
        >>> spanish_metaphone('Nicolás')
        'NKLS'

    """
    encoder = SpanishMetaphone()
    return encoder.encode(word, max_length, modified)


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._es.
20
21		The phonetic._es module implements phonetic algorithms intended for Spanish,
22		including:
23
24		- Phonetic Spanish
25		- Spanish Metaphone
26		"""
27
28	1	from __future__ import unicode_literals
29
30	1	from unicodedata import normalize as unicode_normalize
31
32	1	from six import text_type
33
34	1	from ._phonetic import Phonetic
35
36	1	__all__ = [
37		'PhoneticSpanish',
38		'SpanishMetaphone',
39		'phonetic_spanish',
40		'spanish_metaphone',
41		]
42
43
44	1	class PhoneticSpanish(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
45		"""PhoneticSpanish.
46
47		This follows the coding described in :cite:`Amon:2012` and
48		:cite:`delPilarAngeles:2015`.
49		"""
50
51	1	_trans = dict(
52		zip((ord(_) for _ in 'BCDFGHJKLMNPQRSTVXYZ'), '14328287566079431454')
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
53		)
54
55	1	_uc_set = set('BCDFGHJKLMNPQRSTVXYZ')
56
57	1	def encode(self, word, max_length=-1):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
58		"""Return the PhoneticSpanish coding of word.
59
60		Args:
61		word (str): The word to transform
62		max_length (int): The length of the code returned (defaults to
63		unlimited)
64
65		Returns:
66		str: The PhoneticSpanish code
67
68		Examples:
69		>>> pe = PhoneticSpanish()
70		>>> pe.encode('Perez')
71		'094'
72		>>> pe.encode('Martinez')
73		'69364'
74		>>> pe.encode('Gutierrez')
75		'83994'
76		>>> pe.encode('Santiago')
77		'4638'
78		>>> pe.encode('Nicolás')
79		'6454'
80
81		"""
82		# uppercase, normalize, and decompose, filter to A-Z minus vowels & W
83	1	word = unicode_normalize('NFKD', text_type(word.upper()))
84	1	word = ''.join(c for c in word if c in self._uc_set)
85
86		# merge repeated Ls & Rs
87	1	word = word.replace('LL', 'L')
88	1	word = word.replace('R', 'R')
89
90		# apply the Soundex algorithm
91	1	sdx = word.translate(self._trans)
92
93	1	if max_length > 0:
94	1	sdx = (sdx + ('0' * max_length))[:max_length]
95
96	1	return sdx
97
98
99	1	def phonetic_spanish(word, max_length=-1):
100		"""Return the PhoneticSpanish coding of word.
101
102		This is a wrapper for :py:meth:`PhoneticSpanish.encode`.
103
104		Args:
105		word (str): The word to transform
106		max_length (int): The length of the code returned (defaults to
107		unlimited)
108
109		Returns:
110		str: The PhoneticSpanish code
111
112		Examples:
113		>>> phonetic_spanish('Perez')
114		'094'
115		>>> phonetic_spanish('Martinez')
116		'69364'
117		>>> phonetic_spanish('Gutierrez')
118		'83994'
119		>>> phonetic_spanish('Santiago')
120		'4638'
121		>>> phonetic_spanish('Nicolás')
122		'6454'
123
124		"""
125	1	return PhoneticSpanish().encode(word, max_length)
126
127
128	1	class SpanishMetaphone(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
129		"""Spanish Metaphone.
130
131		This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
132		https://github.com/amsqr/Spanish-Metaphone and discussed in
133		:cite:`Mosquera:2012`.
134
135		Modified version based on :cite:`delPilarAngeles:2016`.
136		"""
137
138	1	def encode(self, word, max_length=6, modified=False):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
139		"""Return the Spanish Metaphone of a word.
140
141		Args:
142		word (str): The word to transform
143		max_length (int): The length of the code returned (defaults to 6)
144		modified (bool): Set to True to use del Pilar Angeles &
145		Bailón-Miguel's modified version of the algorithm
146
147		Returns:
148		str: The Spanish Metaphone code
149
150		Examples:
151		>>> pe = SpanishMetaphone()
152		>>> pe.encode('Perez')
153		'PRZ'
154		>>> pe.encode('Martinez')
155		'MRTNZ'
156		>>> pe.encode('Gutierrez')
157		'GTRRZ'
158		>>> pe.encode('Santiago')
159		'SNTG'
160		>>> pe.encode('Nicolás')
161		'NKLS'
162
163		"""
164
165	1	def _is_vowel(pos):
166		"""Return True if the character at word[pos] is a vowel.
167
168		Args:
169		pos (int): Position to check for a vowel
170
171		Returns:
172		bool: True if word[pos] is a vowel
173
174		"""
175	1	return pos < len(word) and word[pos] in {'A', 'E', 'I', 'O', 'U'}
176
177	1	word = unicode_normalize('NFC', text_type(word.upper()))
178
179	1	meta_key = ''
180	1	pos = 0
181
182		# do some replacements for the modified version
183	1	if modified:
184	1	word = word.replace('MB', 'NB')
185	1	word = word.replace('MP', 'NP')
186	1	word = word.replace('BS', 'S')
187	1	if word[:2] == 'PS':
188	1	word = word[1:]
189
190		# simple replacements
191	1	word = word.replace('Á', 'A')
192	1	word = word.replace('CH', 'X')
193	1	word = word.replace('Ç', 'S')
194	1	word = word.replace('É', 'E')
195	1	word = word.replace('Í', 'I')
196	1	word = word.replace('Ó', 'O')
197	1	word = word.replace('Ú', 'U')
198	1	word = word.replace('Ñ', 'NY')
199	1	word = word.replace('GÜ', 'W')
200	1	word = word.replace('Ü', 'U')
201	1	word = word.replace('B', 'V')
202	1	word = word.replace('LL', 'Y')
203
204	1	while len(meta_key) < max_length:
205	1	if pos >= len(word):
206	1	break
207
208		# get the next character
209	1	current_char = word[pos]
210
211		# if a vowel in pos 0, add to key
212	1	if _is_vowel(pos) and pos == 0:
213	1	meta_key += current_char
214	1	pos += 1
215		# otherwise, do consonant rules
216		else:
217		# simple consonants (unmutated)
218	1	if current_char in {
219		'D',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
220		'F',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
221		'J',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
222		'K',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
223		'M',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
224		'N',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
225		'P',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
226		'T',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
227		'V',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
228		'L',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
229		'Y',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
230		}:
231	1	meta_key += current_char
232		# skip doubled consonants
233	1	if word[pos + 1 : pos + 2] == current_char:
234	1	pos += 2
235		else:
236	1	pos += 1
237		else:
238	1	if current_char == 'C':
239		# special case 'acción', 'reacción',etc.
240	1	if word[pos + 1 : pos + 2] == 'C':
241	1	meta_key += 'X'
242	1	pos += 2
243		# special case 'cesar', 'cien', 'cid', 'conciencia'
244	1	elif word[pos + 1 : pos + 2] in {'E', 'I'}:
245	1	meta_key += 'Z'
246	1	pos += 2
247		# base case
248		else:
249	1	meta_key += 'K'
250	1	pos += 1
251	1	elif current_char == 'G':
252		# special case 'gente', 'ecologia',etc
253	1	if word[pos + 1 : pos + 2] in {'E', 'I'}:
254	1	meta_key += 'J'
255	1	pos += 2
256		# base case
257		else:
258	1	meta_key += 'G'
259	1	pos += 1
260	1	elif current_char == 'H':
261		# since the letter 'H' is silent in Spanish,
262		# set the meta key to the vowel after the letter 'H'
263	1	if _is_vowel(pos + 1):
264	1	meta_key += word[pos + 1]
265	1	pos += 2
266		else:
267	1	meta_key += 'H'
268	1	pos += 1
269	1	elif current_char == 'Q':
270	1	if word[pos + 1 : pos + 2] == 'U':
271	1	pos += 2
272		else:
273	1	pos += 1
274	1	meta_key += 'K'
275	1	elif current_char == 'W':
276	1	meta_key += 'U'
277	1	pos += 1
278	1	elif current_char == 'R':
279	1	meta_key += 'R'
280	1	pos += 1
281	1	elif current_char == 'S':
282	1	if not _is_vowel(pos + 1) and pos == 0:
283	1	meta_key += 'ES'
284	1	pos += 1
285		else:
286	1	meta_key += 'S'
287	1	pos += 1
288	1	elif current_char == 'Z':
289	1	meta_key += 'Z'
290	1	pos += 1
291	1	elif current_char == 'X':
292	1	if (
293		len(word) > 1
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
294		and pos == 0
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
295		and not _is_vowel(pos + 1)
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
296		):
297	1	meta_key += 'EX'
298	1	pos += 1
299		else:
300	1	meta_key += 'X'
301	1	pos += 1
302		else:
303	1	pos += 1
304
305		# Final change from S to Z in modified version
306	1	if modified:
307	1	meta_key = meta_key.replace('S', 'Z')
308
309	1	return meta_key
310
311
312	1	def spanish_metaphone(word, max_length=6, modified=False):
313		"""Return the Spanish Metaphone of a word.
314
315		This is a wrapper for :py:meth:`SpanishMetaphone.encode`.
316
317		Args:
318		word (str): The word to transform
319		max_length (int): The length of the code returned (defaults to 6)
320		modified (bool): Set to True to use del Pilar Angeles &
321		Bailón-Miguel's modified version of the algorithm
322
323		Returns:
324		str: The Spanish Metaphone code
325
326		Examples:
327		>>> spanish_metaphone('Perez')
328		'PRZ'
329		>>> spanish_metaphone('Martinez')
330		'MRTNZ'
331		>>> spanish_metaphone('Gutierrez')
332		'GTRRZ'
333		>>> spanish_metaphone('Santiago')
334		'SNTG'
335		>>> spanish_metaphone('Nicolás')
336		'NKLS'
337
338		"""
339	1	return SpanishMetaphone().encode(word, max_length, modified)
340
341
342		if __name__ == '__main__':
343		import doctest
344
345		doctest.testmod()
346

chrislit / abydos

Pull Request — master (#141)

abydos.phonetic._es.SpanishMetaphone.encode() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like