Completed
Pull Request — master (#149)
by Chris
11:34
created

abydos.phonetic._fonem.FONEM.encode()   A

Complexity

Conditions 3

Size

Total Lines 41
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 3

Importance

Changes 0
Metric Value
cc 3
eloc 10
nop 2
dl 0
loc 41
ccs 10
cts 10
cp 1
crap 3
rs 9.9
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._fonem.
20
21
FONEM
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from re import compile as re_compile
32 1
from unicodedata import normalize as unicode_normalize
33
34 1
from six import text_type
35
36 1
from ._phonetic import _Phonetic
37
38 1
# Names exported by a star-import of this module: the encoder class and its
# function-style wrapper.
__all__ = ['FONEM', 'fonem']
39
40
41 1
class FONEM(_Phonetic):
    """FONEM.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.
    """

    # I don't see a sane way of doing this without regexps :(
    #
    # Maps a rule identifier to a ``(pattern, replacement)`` pair:
    #   * ``pattern`` is either a compiled regular expression or a plain
    #     string; :py:meth:`encode` applies the former with ``sub`` and the
    #     latter with ``str.replace``.
    #   * ``replacement`` is a string, or (rule C-29 only) a callable that
    #     receives the match object.
    # The table only defines the rules; the sequence in which they are applied
    # is given separately by ``_rule_order``.
    _rule_table = {
        # Vowels & groups of vowels
        'V-1': (re_compile('E?AU'), 'O'),
        'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
        'V-6': (re_compile('E?AUL?D$'), 'O'),
        'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re_compile('EUX$'), 'EU'),
        'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        # Drop the first of two identical adjacent vowels
        'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
        'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (
            re_compile(
                '(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                + 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
            ),
            'IN',
        ),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re_compile('^C(?=[EIY])'), 'S'),
        'C-5': (re_compile('^C(?=[OUA])'), 'K'),
        'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re_compile('CC(?=[AOU])'), 'K'),
        'C-9': (re_compile('CC(?=[EIY])'), 'X'),
        'C-10': (re_compile('G(?=[EIY])'), 'J'),
        # '#' is a temporary placeholder (undone by C-34); it cannot occur in
        # the input because encode() keeps only the characters in _uc_set.
        'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re_compile('GE(O|AU)'), 'JO'),
        'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re_compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        # 'MA#' is a temporary placeholder as well (undone by C-35)
        'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re_compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re_compile('TIA$'), 'SSIA'),
        'C-25': (re_compile('(?<=[AIOUY])W'), ''),
        'C-26': (re_compile('X[CSZ]'), 'X'),
        'C-27': (
            re_compile(
                '(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                + 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
            ),
            'S',
        ),
        # Collapse doubled consonants (C, S and L are handled by 28a-28d)
        'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re_compile('ILE$'), 'ILLE'),
        # Word-final consonant pairs: the endings listed in group 1 are kept
        # as they are; any other final pair is reduced to its first consonant
        # (group 2). Only one of the two groups participates in a given match
        # and the other is None, hence the callable with ``or ''`` fallbacks.
        'C-29': (
            re_compile(
                '(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
                + 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
            ),
            lambda m: (m.group(1) or '') + (m.group(2) or ''),
        ),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC'),
    }

    # Sequence in which the rules of ``_rule_table`` are applied by encode().
    # The vowel/consonant de-duplication rules (V-14, C-28 ... C-28d) are
    # listed twice: once at the start and once more after all other rewrites.
    # C-34 and C-35 come last so that the '#' placeholders are restored only
    # after every other rule has run.
    _rule_order = (
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-12',
        'C-8',
        'C-9',
        'C-10',
        'C-16',
        'C-17',
        'C-2',
        'C-3',
        'C-7',
        'V-2,5',
        'V-3,4',
        'V-6',
        'V-1',
        'C-14',
        'C-31,33',
        'C-30,32',
        'C-11',
        'V-15',
        'V-17',
        'V-18',
        'V-7',
        'V-8',
        'V-9',
        'V-10',
        'V-11',
        'V-12',
        'V-13',
        'V-16',
        'V-19',
        'V-20',
        'C-1',
        'C-4',
        'C-5',
        'C-6',
        'C-13',
        'C-15',
        'C-18',
        'C-19',
        'C-20',
        'C-21',
        'C-22',
        'C-23',
        'C-24',
        'C-25',
        'C-26',
        'C-27',
        'C-29',
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-34',
        'C-35',
    )

    # Characters that survive input filtering: A-Z plus the hyphen.
    _uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')

    def encode(self, word):
        """Return the FONEM code of a word.

        The word is upper-cased and NFKD-normalized, the ligatures Æ and Œ
        are expanded to ``AE`` and ``OE``, every character outside A-Z and
        ``-`` is dropped, and the rewrite rules are then applied one after
        another in the sequence given by ``_rule_order``.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The FONEM code

        Examples
        --------
        >>> pe = FONEM()
        >>> pe.encode('Marchand')
        'MARCHEN'
        >>> pe.encode('Beaulieu')
        'BOLIEU'
        >>> pe.encode('Beaumont')
        'BOMON'
        >>> pe.encode('Legrand')
        'LEGREN'
        >>> pe.encode('Pelletier')
        'PELETIER'

        """
        # normalize, upper-case, and filter non-French letters
        word = unicode_normalize('NFKD', text_type(word.upper()))
        # 198 and 338 are the code points of Æ and Œ
        word = word.translate({198: 'AE', 338: 'OE'})
        # This filter also discards the combining marks left behind by NFKD
        word = ''.join(c for c in word if c in self._uc_set)

        for rule in self._rule_order:
            regex, repl = self._rule_table[rule]
            if isinstance(regex, text_type):
                # Plain-string rule: literal substring replacement
                word = word.replace(regex, repl)
            else:
                # Compiled-regex rule (repl may be a string or a callable)
                word = regex.sub(repl, word)

        return word
242
243
244 1
def fonem(word):
    """Return the FONEM code of a word.

    This is a wrapper for :py:meth:`FONEM.encode`; it creates a new
    :py:class:`FONEM` instance on each call.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The FONEM code

    Examples
    --------
    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'

    """
    return FONEM().encode(word)
274
275
276
if __name__ == '__main__':
    # When the file is executed directly, run the doctest examples embedded
    # in this module's docstrings.
    import doctest

    doctest.testmod()
280