Completed
Pull Request — master (#120)
by Chris
12:34
created

abydos.phonetic.es   A

Complexity

Total Complexity 31

Size/Duplication

Total Lines 238
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 111
dl 0
loc 238
rs 9.92
c 0
b 0
f 0
wmc 31

2 Functions

Rating   Name   Duplication   Size   Complexity  
F spanish_metaphone() 0 150 29
A phonetic_spanish() 0 44 2
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.es.
20
21
The phonetic.es module implements phonetic algorithms intended for Spanish,
22
including:
23
24
    - Phonetic Spanish
25
    - Spanish Metaphone
26
"""
27
28
from __future__ import unicode_literals
29
30
from unicodedata import normalize as unicode_normalize
31
32
from six import text_type
33
34
__all__ = ['phonetic_spanish', 'spanish_metaphone']
35
36
37
def phonetic_spanish(word, max_length=-1):
    """Return the PhoneticSpanish coding of word.

    This follows the coding described in :cite:`Amon:2012` and
    :cite:`delPilarAngeles:2015`.

    The word is uppercased and NFKD-decomposed, every character that is not
    one of the coded consonants is dropped, doubled L and doubled R are each
    collapsed to a single letter, and the remaining consonants are mapped to
    digits.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); when positive, the code is right-padded with '0' or
        truncated to exactly this many characters
    :returns: the PhoneticSpanish code
    :rtype: str

    >>> phonetic_spanish('Perez')
    '094'
    >>> phonetic_spanish('Martinez')
    '69364'
    >>> phonetic_spanish('Gutierrez')
    '8394'
    >>> phonetic_spanish('Santiago')
    '4638'
    >>> phonetic_spanish('Nicolás')
    '6454'
    """
    consonants = 'BCDFGHJKLMNPQRSTVXYZ'
    digits = '14328287566079431454'
    # code point -> digit table in the form expected by str.translate
    translation = {ord(letter): digit
                   for letter, digit in zip(consonants, digits)}
    kept = set(consonants)

    # uppercase, normalize, and decompose; keep only the coded consonants
    # (this drops vowels, W, and the combining marks produced by NFKD)
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join(char for char in word if char in kept)

    # merge repeated Ls & Rs
    word = word.replace('LL', 'L')
    word = word.replace('RR', 'R')

    # apply the Soundex-style digit substitution
    sdx = word.translate(translation)

    # pad with zeros / truncate only when a positive length was requested
    if max_length > 0:
        sdx = (sdx + '0' * max_length)[:max_length]

    return sdx
81
82
83
def spanish_metaphone(word, max_length=6, modified=False):
    """Return the Spanish Metaphone of a word.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    Modified version based on :cite:`delPilarAngeles:2016`.

    The word is uppercased and NFC-normalized, a fixed list of substring
    replacements is applied, and the result is scanned left to right, each
    step appending to the key and advancing by one or two characters.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 6);
        the returned key is never longer than this
    :param bool modified: Set to True to use del Pilar Angeles &
        Bailón-Miguel's modified version of the algorithm
    :returns: the Spanish Metaphone code
    :rtype: str

    >>> spanish_metaphone('Perez')
    'PRZ'
    >>> spanish_metaphone('Martinez')
    'MRTNZ'
    >>> spanish_metaphone('Gutierrez')
    'GTRRZ'
    >>> spanish_metaphone('Santiago')
    'SNTG'
    >>> spanish_metaphone('Nicolás')
    'NKLS'
    """
    vowels = {'A', 'E', 'I', 'O', 'U'}
    soft_vowels = {'E', 'I'}
    # consonants copied to the key unchanged
    plain = {'D', 'F', 'J', 'K', 'M', 'N', 'P', 'T', 'V', 'L', 'Y'}

    text = unicode_normalize('NFC', text_type(word.upper()))

    # do some replacements for the modified version
    if modified:
        for old, new in (('MB', 'NB'), ('MP', 'NP'), ('BS', 'S')):
            text = text.replace(old, new)
        if text.startswith('PS'):
            text = text[1:]

    # simple replacements; they are applied sequentially, so order matters
    for old, new in (('Á', 'A'), ('CH', 'X'), ('Ç', 'S'), ('É', 'E'),
                     ('Í', 'I'), ('Ó', 'O'), ('Ú', 'U'), ('Ñ', 'NY'),
                     ('GÜ', 'W'), ('Ü', 'U'), ('B', 'V'), ('LL', 'Y')):
        text = text.replace(old, new)

    code = ''
    idx = 0
    size = len(text)

    while idx < size and len(code) < max_length:
        char = text[idx]
        # the following character, or '' at the end of the word
        following = text[idx + 1:idx + 2]
        emit = ''
        step = 1

        if idx == 0 and char in vowels:
            # a vowel is only coded when it starts the word
            emit = char
        elif char in plain:
            emit = char
            # skip doubled consonants
            if following == char:
                step = 2
        elif char == 'C':
            if following == 'C':
                # special case 'acción', 'reacción', etc.
                emit, step = 'X', 2
            elif following in soft_vowels:
                # special case 'cesar', 'cien', 'cid', 'conciencia'
                emit, step = 'Z', 2
            else:
                emit = 'K'
        elif char == 'G':
            if following in soft_vowels:
                # special case 'gente', 'ecologia', etc.
                emit, step = 'J', 2
            else:
                emit = 'G'
        elif char == 'H':
            # since the letter 'H' is silent in Spanish, code the vowel
            # that follows it instead (and consume both characters)
            if following in vowels:
                emit, step = following, 2
            else:
                emit = 'H'
        elif char == 'Q':
            emit = 'K'
            # 'QU' is consumed as a unit
            if following == 'U':
                step = 2
        elif char == 'W':
            emit = 'U'
        elif char in {'R', 'Z'}:
            emit = char
        elif char == 'S':
            # word-initial S not followed by a vowel is coded as 'ES'
            if idx == 0 and following not in vowels:
                emit = 'ES'
            else:
                emit = 'S'
        elif char == 'X':
            # word-initial X followed by a non-vowel is coded as 'EX'
            if size > 1 and idx == 0 and following not in vowels:
                emit = 'EX'
            else:
                emit = 'X'
        # any other character (e.g. a non-initial vowel) emits nothing

        code += emit
        idx += step

    # Final change from S to Z in modified version
    if modified:
        code = code.replace('S', 'Z')

    # the two-character 'ES'/'EX' prefixes can overshoot a very small
    # max_length; trim so the documented limit always holds
    if len(code) > max_length:
        code = code[:max_length]

    return code
233
234
235
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings when the file
    # is executed directly.
    from doctest import testmod
    testmod()
238