| Conditions | 4 |
| Total Lines | 128 |
| Code Lines | 92 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments inside a method's body, that is usually a good sign that the commented part should be extracted into a new method; the comment is then a useful starting point for choosing the new method's name.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | # -*- coding: utf-8 -*- |
||
# FONEM rewrite rules, built once at import time instead of on every call.
# Each key is a rule label; each value is a (pattern, replacement) pair.  The
# pattern is either a plain string (applied with str.replace) or a compiled
# regular expression (applied with .sub); the replacement is a string or, for
# C-29, a callable taking the match object.
# I don't see a sane way of doing this without regexps :(
_FONEM_RULE_TABLE = {
    # Vowels & groups of vowels
    'V-1': (re_compile('E?AU'), 'O'),
    'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
    'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
    'V-6': (re_compile('E?AUL?D$'), 'O'),
    'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
    'V-8': (re_compile('EUX$'), 'EU'),
    'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
    'V-10': ('Y', 'I'),
    'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
    'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
    'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
    'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
    # Nasal vowels
    'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
    'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
    'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
    'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
             'IN'),
    'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
    'V-20': (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                        'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
    # Consonants and groups of consonants
    'C-1': ('BV', 'V'),
    'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
    'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
    'C-4': (re_compile('^C(?=[EIY])'), 'S'),
    'C-5': (re_compile('^C(?=[OUA])'), 'K'),
    'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
    'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
    'C-8': (re_compile('CC(?=[AOU])'), 'K'),
    'C-9': (re_compile('CC(?=[EIY])'), 'X'),
    'C-10': (re_compile('G(?=[EIY])'), 'J'),
    # 'G#' is a temporary placeholder, restored to 'GA' by C-34
    'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
    'C-12': (re_compile('GE(O|AU)'), 'JO'),
    'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
    'C-14': (re_compile('(?<![PCS])H'), ''),
    'C-15': ('JEA', 'JA'),
    # 'MA#' is a temporary placeholder, restored to 'MAC' by C-35
    'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
    'C-17': (re_compile('^MC'), 'MA#'),
    'C-18': ('PH', 'F'),
    'C-19': ('QU', 'K'),
    'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
    'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
    'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
    'C-23': ('SH', 'CH'),
    'C-24': (re_compile('TIA$'), 'SSIA'),
    'C-25': (re_compile('(?<=[AIOUY])W'), ''),
    'C-26': (re_compile('X[CSZ]'), 'X'),
    'C-27': (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                        'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
    'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
    'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
    'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
    'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
    'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
    'C-28d': (re_compile('ILE$'), 'ILLE'),
    # Word-final clusters: the first alternative is kept whole (group 1),
    # the second keeps only the first of two final consonants (group 2).
    'C-29': (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' +
                        'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
             lambda m: (m.group(1) or '') + (m.group(2) or '')),
    'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
    'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
    # Rules to undo rule bleeding prevention in C-11, C-16, C-17
    'C-34': ('G#', 'GA'),
    'C-35': ('MA#', 'MAC'),
}

# Order in which the rules are applied.  The doubled-letter rules (V-14 and
# the C-28 family) run both first and again just before the placeholder
# clean-up (C-34, C-35).
_FONEM_RULE_ORDER = (
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-12',
    'C-8', 'C-9', 'C-10',
    'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
    'V-2,5', 'V-3,4', 'V-6',
    'V-1', 'C-14',
    'C-31,33', 'C-30,32',
    'C-11', 'V-15', 'V-17', 'V-18',
    'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
    'V-19', 'V-20',
    'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
    'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
    'C-25', 'C-26', 'C-27',
    'C-29',
    'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
    'C-34', 'C-35',
)

# The (pattern, replacement) pairs in application order, resolved once so
# that each call only has to walk a tuple.
_FONEM_RULES = tuple(_FONEM_RULE_TABLE[_name] for _name in _FONEM_RULE_ORDER)

# Code points 198 (AE ligature) and 338 (OE ligature) are spelled out as
# two letters; these ligatures survive NFKD normalization unchanged.
_FONEM_LIGATURES = {198: 'AE', 338: 'OE'}

# Only these characters survive input filtering.
_FONEM_ALPHABET = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')


def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The input is upper-cased and NFKD-normalized, the AE and OE ligatures are
    expanded to two letters, and every character other than A-Z and the
    hyphen is dropped (which also removes the combining marks produced by the
    normalization).  The rewrite rules are then applied one after another in
    a fixed order.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str

    >>> fonem('Marchand')
    'MARCHEN'
    >>> fonem('Beaulieu')
    'BOLIEU'
    >>> fonem('Beaumont')
    'BOMON'
    >>> fonem('Legrand')
    'LEGREN'
    >>> fonem('Pelletier')
    'PELETIER'
    """
    # normalize, upper-case, and filter non-French letters
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.translate(_FONEM_LIGATURES)
    word = ''.join(c for c in word if c in _FONEM_ALPHABET)

    for pattern, repl in _FONEM_RULES:
        if isinstance(pattern, text_type):
            # literal rule: plain substring replacement
            word = word.replace(pattern, repl)
        else:
            # compiled regular expression rule
            word = pattern.sub(repl, word)

    return word
||
| 166 | |||
| 340 |