Conditions | 4 |
Total Lines | 128 |
Code Lines | 92 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include Extract Method; if many parameters/temporary variables are present, Replace Method with Method Object is usually the better fit.
1 | # -*- coding: utf-8 -*- |
||
# Characters that survive normalization: upper-case ASCII letters and hyphen.
_FONEM_ALLOWED_CHARS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')

# FONEM rewrite rules, keyed by rule label.  Each value is a
# (pattern, replacement) pair; the pattern is either a plain string (applied
# with str.replace) or a compiled regex (applied with .sub).  Built once at
# import time so the patterns are not re-created on every call to fonem().
_FONEM_RULE_TABLE = {
    # Vowels & groups of vowels
    'V-1': (re_compile('E?AU'), 'O'),
    'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
    'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
    'V-6': (re_compile('E?AUL?D$'), 'O'),
    'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
    'V-8': (re_compile('EUX$'), 'EU'),
    'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
    'V-10': ('Y', 'I'),
    'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
    'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
    'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
    'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
    # Nasal vowels
    'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
    'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
    'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
    'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
             'IN'),
    'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
    'V-20': (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                        'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
    # Consonants and groups of consonants
    'C-1': ('BV', 'V'),
    'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
    'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
    'C-4': (re_compile('^C(?=[EIY])'), 'S'),
    'C-5': (re_compile('^C(?=[OUA])'), 'K'),
    'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
    'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
    'C-8': (re_compile('CC(?=[AOU])'), 'K'),
    'C-9': (re_compile('CC(?=[EIY])'), 'X'),
    'C-10': (re_compile('G(?=[EIY])'), 'J'),
    # 'G#' / 'MA#' are temporary markers that keep later rules from
    # rewriting these groups; C-34 and C-35 turn them back into letters.
    'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
    'C-12': (re_compile('GE(O|AU)'), 'JO'),
    'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
    'C-14': (re_compile('(?<![PCS])H'), ''),
    'C-15': ('JEA', 'JA'),
    'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
    'C-17': (re_compile('^MC'), 'MA#'),
    'C-18': ('PH', 'F'),
    'C-19': ('QU', 'K'),
    'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
    'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
    'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
    'C-23': ('SH', 'CH'),
    'C-24': (re_compile('TIA$'), 'SSIA'),
    'C-25': (re_compile('(?<=[AIOUY])W'), ''),
    'C-26': (re_compile('X[CSZ]'), 'X'),
    'C-27': (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                        'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
    'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
    'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
    'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
    'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
    'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
    'C-28d': (re_compile('ILE$'), 'ILLE'),
    # Only one of the two groups participates in a match, so the
    # replacement keeps whichever one was captured.
    'C-29': (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' +
                        'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
             lambda m: (m.group(1) or '') + (m.group(2) or '')),
    'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
    'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
    # Rules to undo rule bleeding prevention in C-11, C-16, C-17
    'C-34': ('G#', 'GA'),
    'C-35': ('MA#', 'MAC'),
}

# Order in which the rules are applied.  The doubled-letter rules (V-14 and
# the C-28 family) appear both first and last, so they run twice.
_FONEM_RULE_ORDER = (
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-12',
    'C-8', 'C-9', 'C-10',
    'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
    'V-2,5', 'V-3,4', 'V-6',
    'V-1', 'C-14',
    'C-31,33', 'C-30,32',
    'C-11', 'V-15', 'V-17', 'V-18',
    'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
    'V-19', 'V-20',
    'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
    'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
    'C-25', 'C-26', 'C-27',
    'C-29',
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-34', 'C-35',
)


def _fonem_normalize(word):
    """Upper-case *word*, decompose it, and keep only A-Z and hyphens."""
    word = unicode_normalize('NFKD', text_type(word.upper()))
    # Expand the ligatures U+00C6 and U+0152 into two-letter sequences.
    word = word.translate({198: 'AE', 338: 'OE'})
    return ''.join(c for c in word if c in _FONEM_ALLOWED_CHARS)


def _fonem_apply_rules(word):
    """Apply every FONEM rule to *word* in the order of _FONEM_RULE_ORDER."""
    for rule in _FONEM_RULE_ORDER:
        pattern, repl = _FONEM_RULE_TABLE[rule]
        if isinstance(pattern, text_type):
            # Plain-string rule: literal substring replacement.
            word = word.replace(pattern, repl)
        else:
            # Compiled-regex rule.
            word = pattern.sub(repl, word)
    return word


def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames
    in Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased, NFKD-normalized and stripped of every character
    other than A-Z and the hyphen; the rewrite rules are then applied in a
    fixed order.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    return _fonem_apply_rules(_fonem_normalize(word))
||
166 | |||
340 |