abydos.phonetic._es.PhoneticSpanish.encode() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#135)

by Chris

created 2018-11-04 07:51 UTC

abydos.phonetic._es.PhoneticSpanish.encode() A

↳ Parent: abydos.phonetic._es

Complexity

Conditions

Size

Total Lines	36
Code Lines	9

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	9
CRAP Score	2

Importance

Changes

Metric	Value
eloc	9
dl	0
loc	36
ccs	9
cts	9
cp	1
rs	9.95
c	0
b	0
f	0
cc	2
nop	3
crap	2

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._es.

The phonetic._es module implements phonetic algorithms intended for Spanish,
including:

    - Phonetic Spanish
    - Spanish Metaphone
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import Phonetic

# Public API of this module: the two encoder classes and their
# function-style wrappers.
__all__ = [
    'PhoneticSpanish',
    'SpanishMetaphone',
    'phonetic_spanish',
    'spanish_metaphone',
]


class PhoneticSpanish(Phonetic):
    """PhoneticSpanish.

    This follows the coding described in :cite:`Amon:2012` and
    :cite:`delPilarAngeles:2015`.
    """

    # Maps the code point of each retained consonant to its digit, in the
    # shape expected by ``str.translate``.
    _trans = dict(
        zip((ord(_) for _ in 'BCDFGHJKLMNPQRSTVXYZ'), '14328287566079431454')
    )

    # Letters that survive filtering in encode(); every other character
    # (vowels, W, combining marks, punctuation, ...) is dropped.
    _uc_set = set('BCDFGHJKLMNPQRSTVXYZ')

    def encode(self, word, max_length=-1):
        """Return the PhoneticSpanish coding of word.

        The word is uppercased and NFKD-decomposed, reduced to the letters
        in ``_uc_set``, has each ``LL`` and ``RR`` pair collapsed to a single
        letter, and is then mapped letter-by-letter to digits via ``_trans``.

        :param str word: the word to transform
        :param int max_length: the length of the code returned (defaults to
            unlimited); when positive, the code is right-padded with ``0``
            or truncated to exactly this many characters
        :returns: the PhoneticSpanish code
        :rtype: str

        >>> pe = PhoneticSpanish()
        >>> pe.encode('Perez')
        '094'
        >>> pe.encode('Martinez')
        '69364'
        >>> pe.encode('Gutierrez')
        '8394'
        >>> pe.encode('Santiago')
        '4638'
        >>> pe.encode('Nicolás')
        '6454'
        >>> pe.encode('Perez', max_length=6)
        '094000'
        """
        # uppercase, normalize, and decompose, filter to the retained letters
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = ''.join(c for c in word if c in self._uc_set)

        # merge repeated Ls & Rs; this runs after filtering, so letters that
        # were separated only by dropped characters are merged as well
        word = word.replace('LL', 'L')
        # was replace('R', 'R'), a no-op that left doubled Rs in the code
        word = word.replace('RR', 'R')

        # map each remaining letter to its digit
        sdx = word.translate(self._trans)

        # a non-positive max_length means "no length limit"
        if max_length > 0:
            sdx = (sdx + ('0' * max_length))[:max_length]

        return sdx


def phonetic_spanish(word, max_length=-1):
    """Return the PhoneticSpanish coding of word.

    This is a wrapper for :py:meth:`PhoneticSpanish.encode`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited)
    :returns: the PhoneticSpanish code
    :rtype: str

    >>> phonetic_spanish('Perez')
    '094'
    >>> phonetic_spanish('Martinez')
    '69364'
    >>> phonetic_spanish('Gutierrez')
    '8394'
    >>> phonetic_spanish('Santiago')
    '4638'
    >>> phonetic_spanish('Nicolás')
    '6454'
    """
    return PhoneticSpanish().encode(word, max_length)


class SpanishMetaphone(Phonetic):
    """Spanish Metaphone.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    Modified version based on :cite:`delPilarAngeles:2016`.
    """

    def encode(self, word, max_length=6, modified=False):
        """Return the Spanish Metaphone of a word.

        The word is uppercased and NFC-normalized, a fixed list of substring
        replacements is applied, and the result is scanned left to right,
        emitting key characters until the key reaches ``max_length`` or the
        word is exhausted.

        :param str word: the word to transform
        :param int max_length: the length of the code returned (defaults to 6)
        :param bool modified: Set to True to use del Pilar Angeles &
            Bailón-Miguel's modified version of the algorithm
        :returns: the Spanish Metaphone code
        :rtype: str

        >>> pe = SpanishMetaphone()
        >>> pe.encode('Perez')
        'PRZ'
        >>> pe.encode('Martinez')
        'MRTNZ'
        >>> pe.encode('Gutierrez')
        'GTRRZ'
        >>> pe.encode('Santiago')
        'SNTG'
        >>> pe.encode('Nicolás')
        'NKLS'
        """
        vowels = frozenset('AEIOU')
        # consonants copied to the key unchanged; a doubled one counts once
        plain = frozenset('DFJKMNPTVLY')
        # vowels that soften a preceding C or G
        soft = frozenset('EI')
        # letters with a fixed one-to-one output and no look-ahead
        fixed = {'W': 'U', 'R': 'R', 'Z': 'Z'}

        text = unicode_normalize('NFC', text_type(word.upper()))

        # extra replacements used only by the modified version
        if modified:
            for old, new in (('MB', 'NB'), ('MP', 'NP'), ('BS', 'S')):
                text = text.replace(old, new)
            if text.startswith('PS'):
                text = text[1:]

        # simple replacements, applied in this exact order
        for old, new in (
            ('Á', 'A'),
            ('CH', 'X'),
            ('Ç', 'S'),
            ('É', 'E'),
            ('Í', 'I'),
            ('Ó', 'O'),
            ('Ú', 'U'),
            ('Ñ', 'NY'),
            ('GÜ', 'W'),
            ('Ü', 'U'),
            ('B', 'V'),
            ('LL', 'Y'),
        ):
            text = text.replace(old, new)

        size = len(text)
        code = ''
        idx = 0

        while len(code) < max_length and idx < size:
            char = text[idx]
            # the next character, or '' when char is the last one
            following = text[idx + 1 : idx + 2]
            # how many input characters this iteration consumes
            step = 1

            if idx == 0 and char in vowels:
                # a vowel is only kept when it starts the word
                code += char
            elif char in plain:
                code += char
                if following == char:
                    step = 2
            elif char == 'C':
                if following == 'C':
                    # 'acción', 'reacción', etc.
                    code += 'X'
                    step = 2
                elif following in soft:
                    # 'cesar', 'cien', 'cid', 'conciencia'
                    code += 'Z'
                    step = 2
                else:
                    code += 'K'
            elif char == 'G':
                if following in soft:
                    # 'gente', 'ecologia', etc.
                    code += 'J'
                    step = 2
                else:
                    code += 'G'
            elif char == 'H':
                # 'H' is silent in Spanish: emit the vowel after it instead
                if following in vowels:
                    code += following
                    step = 2
                else:
                    code += 'H'
            elif char == 'Q':
                code += 'K'
                if following == 'U':
                    step = 2
            elif char in fixed:
                code += fixed[char]
            elif char == 'S':
                if idx == 0 and following not in vowels:
                    code += 'ES'
                else:
                    code += 'S'
            elif char == 'X':
                if size > 1 and idx == 0 and following not in vowels:
                    code += 'EX'
                else:
                    code += 'X'
            # anything else (e.g. a non-initial vowel) emits nothing

            idx += step

        # Final change from S to Z in modified version
        if modified:
            code = code.replace('S', 'Z')

        return code


def spanish_metaphone(word, max_length=6, modified=False):
    """Return the Spanish Metaphone of a word.

    Convenience function that builds a :py:class:`SpanishMetaphone` and
    delegates to :py:meth:`SpanishMetaphone.encode`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 6)
    :param bool modified: Set to True to use del Pilar Angeles &
        Bailón-Miguel's modified version of the algorithm
    :returns: the Spanish Metaphone code
    :rtype: str

    >>> spanish_metaphone('Perez')
    'PRZ'
    >>> spanish_metaphone('Martinez')
    'MRTNZ'
    >>> spanish_metaphone('Gutierrez')
    'GTRRZ'
    >>> spanish_metaphone('Santiago')
    'SNTG'
    >>> spanish_metaphone('Nicolás')
    'NKLS'
    """
    encoder = SpanishMetaphone()
    return encoder.encode(word, max_length, modified)


if __name__ == '__main__':
    # Run this module's doctests when it is executed as a script.
    from doctest import testmod

    testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._es.
20
21		The phonetic._es module implements phonetic algorithms intended for Spanish,
22		including:
23
24		- Phonetic Spanish
25		- Spanish Metaphone
26		"""
27
28	1	from __future__ import unicode_literals
29
30	1	from unicodedata import normalize as unicode_normalize
31
32	1	from six import text_type
33
34	1	from ._phonetic import Phonetic
35
36	1	__all__ = [
37		'PhoneticSpanish',
38		'SpanishMetaphone',
39		'phonetic_spanish',
40		'spanish_metaphone',
41		]
42
43
44	1	class PhoneticSpanish(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
45		"""PhoneticSpanish.
46
47		This follows the coding described in :cite:`Amon:2012` and
48		:cite:`delPilarAngeles:2015`.
49		"""
50
51	1	_trans = dict(
52		zip((ord(_) for _ in 'BCDFGHJKLMNPQRSTVXYZ'), '14328287566079431454')
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
53		)
54
55	1	_uc_set = set('BCDFGHJKLMNPQRSTVXYZ')
56
57	1	def encode(self, word, max_length=-1):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
58		"""Return the PhoneticSpanish coding of word.
59
60		:param str word: the word to transform
61		:param int max_length: the length of the code returned (defaults to
62		unlimited)
63		:returns: the PhoneticSpanish code
64		:rtype: str
65
66		>>> pe = PhoneticSpanish()
67		>>> pe.encode('Perez')
68		'094'
69		>>> pe.encode('Martinez')
70		'69364'
71		>>> pe.encode('Gutierrez')
72		'83994'
73		>>> pe.encode('Santiago')
74		'4638'
75		>>> pe.encode('Nicolás')
76		'6454'
77		"""
78		# uppercase, normalize, and decompose, filter to A-Z minus vowels & W
79	1	word = unicode_normalize('NFKD', text_type(word.upper()))
80	1	word = ''.join(c for c in word if c in self._uc_set)
81
82		# merge repeated Ls & Rs
83	1	word = word.replace('LL', 'L')
84	1	word = word.replace('R', 'R')
85
86		# apply the Soundex algorithm
87	1	sdx = word.translate(self._trans)
88
89	1	if max_length > 0:
90	1	sdx = (sdx + ('0' * max_length))[:max_length]
91
92	1	return sdx
93
94
95	1	def phonetic_spanish(word, max_length=-1):
96		"""Return the PhoneticSpanish coding of word.
97
98		This is a wrapper for :py:meth:`PhoneticSpanish.encode`.
99
100		:param str word: the word to transform
101		:param int max_length: the length of the code returned (defaults to
102		unlimited)
103		:returns: the PhoneticSpanish code
104		:rtype: str
105
106		>>> phonetic_spanish('Perez')
107		'094'
108		>>> phonetic_spanish('Martinez')
109		'69364'
110		>>> phonetic_spanish('Gutierrez')
111		'83994'
112		>>> phonetic_spanish('Santiago')
113		'4638'
114		>>> phonetic_spanish('Nicolás')
115		'6454'
116		"""
117	1	return PhoneticSpanish().encode(word, max_length)
118
119
120	1	class SpanishMetaphone(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
121		"""Spanish Metaphone.
122
123		This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
124		https://github.com/amsqr/Spanish-Metaphone and discussed in
125		:cite:`Mosquera:2012`.
126
127		Modified version based on :cite:`delPilarAngeles:2016`.
128		"""
129
130	1	def encode(self, word, max_length=6, modified=False):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
131		"""Return the Spanish Metaphone of a word.
132
133		:param str word: the word to transform
134		:param int max_length: the length of the code returned (defaults to 6)
135		:param bool modified: Set to True to use del Pilar Angeles &
136		Bailón-Miguel's modified version of the algorithm
137		:returns: the Spanish Metaphone code
138		:rtype: str
139
140		>>> pe = SpanishMetaphone()
141		>>> pe.encode('Perez')
142		'PRZ'
143		>>> pe.encode('Martinez')
144		'MRTNZ'
145		>>> pe.encode('Gutierrez')
146		'GTRRZ'
147		>>> pe.encode('Santiago')
148		'SNTG'
149		>>> pe.encode('Nicolás')
150		'NKLS'
151		"""
152
153	1	def _is_vowel(pos):
154		"""Return True if the character at word[pos] is a vowel."""
155	1	return pos < len(word) and word[pos] in {'A', 'E', 'I', 'O', 'U'}
156
157	1	word = unicode_normalize('NFC', text_type(word.upper()))
158
159	1	meta_key = ''
160	1	pos = 0
161
162		# do some replacements for the modified version
163	1	if modified:
164	1	word = word.replace('MB', 'NB')
165	1	word = word.replace('MP', 'NP')
166	1	word = word.replace('BS', 'S')
167	1	if word[:2] == 'PS':
168	1	word = word[1:]
169
170		# simple replacements
171	1	word = word.replace('Á', 'A')
172	1	word = word.replace('CH', 'X')
173	1	word = word.replace('Ç', 'S')
174	1	word = word.replace('É', 'E')
175	1	word = word.replace('Í', 'I')
176	1	word = word.replace('Ó', 'O')
177	1	word = word.replace('Ú', 'U')
178	1	word = word.replace('Ñ', 'NY')
179	1	word = word.replace('GÜ', 'W')
180	1	word = word.replace('Ü', 'U')
181	1	word = word.replace('B', 'V')
182	1	word = word.replace('LL', 'Y')
183
184	1	while len(meta_key) < max_length:
185	1	if pos >= len(word):
186	1	break
187
188		# get the next character
189	1	current_char = word[pos]
190
191		# if a vowel in pos 0, add to key
192	1	if _is_vowel(pos) and pos == 0:
193	1	meta_key += current_char
194	1	pos += 1
195		# otherwise, do consonant rules
196		else:
197		# simple consonants (unmutated)
198	1	if current_char in {
199		'D',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
200		'F',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
201		'J',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
202		'K',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
203		'M',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
204		'N',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
205		'P',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
206		'T',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
207		'V',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
208		'L',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
209		'Y',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
210		}:
211	1	meta_key += current_char
212		# skip doubled consonants
213	1	if word[pos + 1 : pos + 2] == current_char:
214	1	pos += 2
215		else:
216	1	pos += 1
217		else:
218	1	if current_char == 'C':
219		# special case 'acción', 'reacción',etc.
220	1	if word[pos + 1 : pos + 2] == 'C':
221	1	meta_key += 'X'
222	1	pos += 2
223		# special case 'cesar', 'cien', 'cid', 'conciencia'
224	1	elif word[pos + 1 : pos + 2] in {'E', 'I'}:
225	1	meta_key += 'Z'
226	1	pos += 2
227		# base case
228		else:
229	1	meta_key += 'K'
230	1	pos += 1
231	1	elif current_char == 'G':
232		# special case 'gente', 'ecologia',etc
233	1	if word[pos + 1 : pos + 2] in {'E', 'I'}:
234	1	meta_key += 'J'
235	1	pos += 2
236		# base case
237		else:
238	1	meta_key += 'G'
239	1	pos += 1
240	1	elif current_char == 'H':
241		# since the letter 'H' is silent in Spanish,
242		# set the meta key to the vowel after the letter 'H'
243	1	if _is_vowel(pos + 1):
244	1	meta_key += word[pos + 1]
245	1	pos += 2
246		else:
247	1	meta_key += 'H'
248	1	pos += 1
249	1	elif current_char == 'Q':
250	1	if word[pos + 1 : pos + 2] == 'U':
251	1	pos += 2
252		else:
253	1	pos += 1
254	1	meta_key += 'K'
255	1	elif current_char == 'W':
256	1	meta_key += 'U'
257	1	pos += 1
258	1	elif current_char == 'R':
259	1	meta_key += 'R'
260	1	pos += 1
261	1	elif current_char == 'S':
262	1	if not _is_vowel(pos + 1) and pos == 0:
263	1	meta_key += 'ES'
264	1	pos += 1
265		else:
266	1	meta_key += 'S'
267	1	pos += 1
268	1	elif current_char == 'Z':
269	1	meta_key += 'Z'
270	1	pos += 1
271	1	elif current_char == 'X':
272	1	if (
273		len(word) > 1
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
274		and pos == 0
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
275		and not _is_vowel(pos + 1)
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
276		):
277	1	meta_key += 'EX'
278	1	pos += 1
279		else:
280	1	meta_key += 'X'
281	1	pos += 1
282		else:
283	1	pos += 1
284
285		# Final change from S to Z in modified version
286	1	if modified:
287	1	meta_key = meta_key.replace('S', 'Z')
288
289	1	return meta_key
290
291
292	1	def spanish_metaphone(word, max_length=6, modified=False):
293		"""Return the Spanish Metaphone of a word.
294
295		This is a wrapper for :py:meth:`SpanishMetaphone.encode`.
296
297		:param str word: the word to transform
298		:param int max_length: the length of the code returned (defaults to 6)
299		:param bool modified: Set to True to use del Pilar Angeles &
300		Bailón-Miguel's modified version of the algorithm
301		:returns: the Spanish Metaphone code
302		:rtype: str
303
304		>>> spanish_metaphone('Perez')
305		'PRZ'
306		>>> spanish_metaphone('Martinez')
307		'MRTNZ'
308		>>> spanish_metaphone('Gutierrez')
309		'GTRRZ'
310		>>> spanish_metaphone('Santiago')
311		'SNTG'
312		>>> spanish_metaphone('Nicolás')
313		'NKLS'
314		"""
315	1	return SpanishMetaphone().encode(word, max_length, modified)
316
317
318		if __name__ == '__main__':
319		import doctest
320
321		doctest.testmod()
322

chrislit / abydos

Pull Request — master (#135)

abydos.phonetic._es.PhoneticSpanish.encode() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like