abydos.phonetic._fonem.FONEM.encode() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#149)

by Chris

created 2018-11-17 08:37 UTC

abydos.phonetic._fonem.FONEM.encode() A

↳ Parent: abydos.phonetic._fonem

Complexity

Conditions

Size

Total Lines	41
Code Lines	10

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	10
CRAP Score	3

Importance

Changes

Metric	Value
cc	3
eloc	10
nop	2
dl	0
loc	41
ccs	10
cts	10
cp	1
crap	3
rs	9.9
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._fonem.

FONEM
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from re import compile as re_compile
from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import _Phonetic

# Public names exported by this module: the encoder class and its
# function-style wrapper.
__all__ = ['FONEM', 'fonem']


class FONEM(_Phonetic):
    """FONEM phonetic encoder.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The encoder is table-driven: ``_rule_table`` maps a rule label to a
    ``(pattern, replacement)`` pair, and ``_rule_order`` lists the labels in
    the sequence in which :py:meth:`encode` applies them.
    """

    # Rule label -> (pattern, replacement).
    #
    # A pattern is either a plain string (applied with ``str.replace``) or a
    # compiled regular expression (applied with ``sub``); the replacement is
    # a string or, for C-29, a callable taking the match object.
    # No sane regexp-free formulation of these rules seems to exist.
    _rule_table = {
        # Vowels & groups of vowels
        'V-1': (re_compile('E?AU'), 'O'),
        'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
        'V-6': (re_compile('E?AUL?D$'), 'O'),
        'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re_compile('EUX$'), 'EU'),
        'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        # Drop a vowel that is immediately followed by the same vowel
        'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
        'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (
            re_compile(
                '(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
            ),
            'IN',
        ),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re_compile('^C(?=[EIY])'), 'S'),
        'C-5': (re_compile('^C(?=[OUA])'), 'K'),
        'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re_compile('CC(?=[AOU])'), 'K'),
        'C-9': (re_compile('CC(?=[EIY])'), 'X'),
        'C-10': (re_compile('G(?=[EIY])'), 'J'),
        # The '#' emitted by C-11, C-16 and C-17 is a placeholder that is
        # turned back into letters by C-34 and C-35 at the very end.
        'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re_compile('GE(O|AU)'), 'JO'),
        'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re_compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re_compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re_compile('TIA$'), 'SSIA'),
        'C-25': (re_compile('(?<=[AIOUY])W'), ''),
        'C-26': (re_compile('X[CSZ]'), 'X'),
        'C-27': (
            re_compile(
                '(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
            ),
            'S',
        ),
        # Doubled consonants collapse to a single one
        'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re_compile('ILE$'), 'ILLE'),
        # Word-final clusters: keep whichever capture group matched, i.e.
        # either the whole listed cluster (group 1) or only the first
        # consonant of a final consonant pair (group 2).
        'C-29': (
            re_compile(
                '(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
                'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
            ),
            lambda match: (match.group(1) or '') + (match.group(2) or ''),
        ),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC'),
    }

    # Labels of the rules above, in the order in which encode() applies them.
    _rule_order = (
        # First pass of the doubled-letter rules
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        'C-12',
        'C-8',
        'C-9',
        'C-10',
        'C-16',
        'C-17',
        'C-2',
        'C-3',
        'C-7',
        'V-2,5',
        'V-3,4',
        'V-6',
        'V-1',
        'C-14',
        'C-31,33',
        'C-30,32',
        'C-11',
        'V-15',
        'V-17',
        'V-18',
        'V-7',
        'V-8',
        'V-9',
        'V-10',
        'V-11',
        'V-12',
        'V-13',
        'V-16',
        'V-19',
        'V-20',
        'C-1',
        'C-4',
        'C-5',
        'C-6',
        'C-13',
        'C-15',
        'C-18',
        'C-19',
        'C-20',
        'C-21',
        'C-22',
        'C-23',
        'C-24',
        'C-25',
        'C-26',
        'C-27',
        'C-29',
        # Second pass of the doubled-letter rules
        'V-14',
        'C-28',
        'C-28a',
        'C-28b',
        'C-28bb',
        'C-28c',
        'C-28d',
        # Finally, replace the '#' placeholders
        'C-34',
        'C-35',
    )

    # Characters that survive input filtering: A-Z and the hyphen.
    _uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')

    def encode(self, word):
        """Compute the FONEM code of a word.

        The word is upper-cased, NFKD-normalized, stripped of every character
        outside ``_uc_set``, and then rewritten by each rule named in
        ``_rule_order``, in sequence.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The FONEM code

        Examples
        --------
        >>> pe = FONEM()
        >>> pe.encode('Marchand')
        'MARCHEN'
        >>> pe.encode('Beaulieu')
        'BOLIEU'
        >>> pe.encode('Beaumont')
        'BOMON'
        >>> pe.encode('Legrand')
        'LEGREN'
        >>> pe.encode('Pelletier')
        'PELETIER'

        """
        # Upper-case, then decompose so that the letter filter below can
        # discard whatever is not a plain A-Z letter or hyphen.
        decomposed = unicode_normalize('NFKD', text_type(word.upper()))
        # Spell out the AE and OE ligatures (code points 198 and 338).
        expanded = decomposed.translate({198: 'AE', 338: 'OE'})
        allowed = self._uc_set
        code = ''.join(char for char in expanded if char in allowed)

        table = self._rule_table
        for label in self._rule_order:
            pattern, replacement = table[label]
            if isinstance(pattern, text_type):
                # Literal rule: plain substring replacement.
                code = code.replace(pattern, replacement)
                continue
            # Compiled regular-expression rule.
            code = pattern.sub(replacement, code)

        return code


def fonem(word):
    """Compute the FONEM code of a word.

    Convenience function that builds a :py:class:`FONEM` instance and
    delegates to :py:meth:`FONEM.encode`.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The FONEM code

    Examples
    --------
    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'

    """
    encoder = FONEM()
    return encoder.encode(word)


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._fonem.
20
21		FONEM
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from re import compile as re_compile
32	1	from unicodedata import normalize as unicode_normalize
33
34	1	from six import text_type
35
36	1	from ._phonetic import _Phonetic
37
38	1	__all__ = ['FONEM', 'fonem']
39
40
41	1	class FONEM(_Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
42		"""FONEM.
43
44		FONEM is a phonetic algorithm designed for French (particularly surnames in
45		Saguenay, Canada), defined in :cite:`Bouchard:1981`.
46
47		Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
48		https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
49		was also consulted for this implementation.
50		"""
51
52		# I don't see a sane way of doing this without regexps :(
53	1	_rule_table = {
54		# Vowels & groups of vowels
55		'V-1': (re_compile('E?AU'), 'O'),
56		'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
57		'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
58		'V-6': (re_compile('E?AUL?D$'), 'O'),
59		'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
60		'V-8': (re_compile('EUX$'), 'EU'),
61		'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
62		'V-10': ('Y', 'I'),
63		'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
64		'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
65		'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
66		'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
67		# Nasal vowels
68		'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
69		'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
70		'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
71		'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'IN'),
72		'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
73		'V-20': (
74		re_compile(
75		'(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
76		+ 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'
77		),
78		'IN',
79		),
80		# Consonants and groups of consonants
81		'C-1': ('BV', 'V'),
82		'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
83		'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
84		'C-4': (re_compile('^C(?=[EIY])'), 'S'),
85		'C-5': (re_compile('^C(?=[OUA])'), 'K'),
86		'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
87		'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
88		'C-8': (re_compile('CC(?=[AOU])'), 'K'),
89		'C-9': (re_compile('CC(?=[EIY])'), 'X'),
90		'C-10': (re_compile('G(?=[EIY])'), 'J'),
91		'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
92		'C-12': (re_compile('GE(O|AU)'), 'JO'),
93		'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
94		'C-14': (re_compile('(?<![PCS])H'), ''),
95		'C-15': ('JEA', 'JA'),
96		'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
97		'C-17': (re_compile('^MC'), 'MA#'),
98		'C-18': ('PH', 'F'),
99		'C-19': ('QU', 'K'),
100		'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
101		'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
102		'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
103		'C-23': ('SH', 'CH'),
104		'C-24': (re_compile('TIA$'), 'SSIA'),
105		'C-25': (re_compile('(?<=[AIOUY])W'), ''),
106		'C-26': (re_compile('X[CSZ]'), 'X'),
107		'C-27': (
108		re_compile(
109		'(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
110		+ 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'
111		),
112		'S',
113		),
114		'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
115		'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
116		'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
117		'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
118		'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
119		'C-28d': (re_compile('ILE$'), 'ILLE'),
120		'C-29': (
121		re_compile(
122		'(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
123		+ 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'
124		),
125		lambda m: (m.group(1) or '') + (m.group(2) or ''),
126		),
127		'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
128		'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
129		# Rules to undo rule bleeding prevention in C-11, C-16, C-17
130		'C-34': ('G#', 'GA'),
131		'C-35': ('MA#', 'MAC'),
132		}
133	1	_rule_order = (
134		'V-14',
135		'C-28',
136		'C-28a',
137		'C-28b',
138		'C-28bb',
139		'C-28c',
140		'C-28d',
141		'C-12',
142		'C-8',
143		'C-9',
144		'C-10',
145		'C-16',
146		'C-17',
147		'C-2',
148		'C-3',
149		'C-7',
150		'V-2,5',
151		'V-3,4',
152		'V-6',
153		'V-1',
154		'C-14',
155		'C-31,33',
156		'C-30,32',
157		'C-11',
158		'V-15',
159		'V-17',
160		'V-18',
161		'V-7',
162		'V-8',
163		'V-9',
164		'V-10',
165		'V-11',
166		'V-12',
167		'V-13',
168		'V-16',
169		'V-19',
170		'V-20',
171		'C-1',
172		'C-4',
173		'C-5',
174		'C-6',
175		'C-13',
176		'C-15',
177		'C-18',
178		'C-19',
179		'C-20',
180		'C-21',
181		'C-22',
182		'C-23',
183		'C-24',
184		'C-25',
185		'C-26',
186		'C-27',
187		'C-29',
188		'V-14',
189		'C-28',
190		'C-28a',
191		'C-28b',
192		'C-28bb',
193		'C-28c',
194		'C-28d',
195		'C-34',
196		'C-35',
197		)
198
199	1	_uc_set = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')
200
201	1	def encode(self, word):
202		"""Return the FONEM code of a word.
203
204		Parameters
205		----------
206		word : str
207		The word to transform
208
209		Returns
210		-------
211		str
212		The FONEM code
213
214		Examples
215		--------
216		>>> pe = FONEM()
217		>>> pe.encode('Marchand')
218		'MARCHEN'
219		>>> pe.encode('Beaulieu')
220		'BOLIEU'
221		>>> pe.encode('Beaumont')
222		'BOMON'
223		>>> pe.encode('Legrand')
224		'LEGREN'
225		>>> pe.encode('Pelletier')
226		'PELETIER'
227
228		"""
229		# normalize, upper-case, and filter non-French letters
230	1	word = unicode_normalize('NFKD', text_type(word.upper()))
231	1	word = word.translate({198: 'AE', 338: 'OE'})
232	1	word = ''.join(c for c in word if c in self._uc_set)
233
234	1	for rule in self._rule_order:
235	1	regex, repl = self._rule_table[rule]
236	1	if isinstance(regex, text_type):
237	1	word = word.replace(regex, repl)
238		else:
239	1	word = regex.sub(repl, word)
240
241	1	return word
242
243
244	1	def fonem(word):
245		"""Return the FONEM code of a word.
246
247		This is a wrapper for :py:meth:`FONEM.encode`.
248
249		Parameters
250		----------
251		word : str
252		The word to transform
253
254		Returns
255		-------
256		str
257		The FONEM code
258
259		Examples
260		--------
261		>>> fonem('Marchand')
262		'MARCHEN'
263		>>> fonem('Beaulieu')
264		'BOLIEU'
265		>>> fonem('Beaumont')
266		'BOMON'
267		>>> fonem('Legrand')
268		'LEGREN'
269		>>> fonem('Pelletier')
270		'PELETIER'
271
272		"""
273	1	return FONEM().encode(word)
274
275
276		if __name__ == '__main__':
277		import doctest
278
279		doctest.testmod()
280

chrislit / abydos

Pull Request — master (#149)

abydos.phonetic._fonem.FONEM.encode() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like