Conditions | 39 |
Total Lines | 148 |
Code Lines | 103 |
Lines | 44 |
Ratio | 29.73 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
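Commonly applied refactorings include Extract Method.
If many parameters or temporary variables are present, consider Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.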
Complex classes like abydos.phonetic.de.haase_phonetik() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code(s) for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z.
    Unless ``primary_only`` is set, alternative spellings are generated by
    substituting certain letter groups (e.g. ``SCH``/``CH``, ``RB``/``RW``),
    and every spelling is encoded. Duplicate codes are dropped, keeping the
    first occurrence.

    While each output code is numeric, it is nevertheless a str.

    :param str word: the word to transform
    :param bool primary_only: if True, only the word as written is encoded
        (no alternative spellings are generated)
    :returns: the Haase Phonetik value(s) as numeric strings
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    vowels = frozenset('AEIJOUY')
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Three-letter groups and the alternative spelling each one may take.
    trigram_alternates = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH',
                          'GLI': 'LI', 'AUX': 'O', 'EUX': 'O'}

    # Letters whose digit does not depend on their neighbours.
    # 'H' is deliberately absent: it contributes no digit of its own.
    context_free_codes = {'B': '1',
                          'F': '3', 'V': '3', 'W': '3',
                          'G': '4', 'K': '4', 'Q': '4',
                          'L': '5',
                          'M': '6', 'N': '6',
                          'R': '7',
                          'S': '8', 'Z': '8'}

    # Followers that give 'C' the code '4'; the word-initial set also
    # includes 'L' and 'R'.
    hard_c_initial = frozenset('AHKLOQRUX')
    hard_c_medial = frozenset('AHKOQUX')

    def _preceded_by(text, idx, letters):
        """Return True if text[idx] follows one of the supplied letters."""
        return idx > 0 and text[idx - 1] in letters

    def _followed_by(text, idx, letters):
        """Return True if text[idx] precedes one of the supplied letters."""
        return idx + 1 < len(text) and text[idx + 1] in letters

    def _encode(spelling):
        """Return the digit string for a single spelling of the word."""
        digits = []
        for idx, char in enumerate(spelling):
            if char in vowels:
                digits.append('9')
            elif char in context_free_codes:
                digits.append(context_free_codes[char])
            elif char == 'P':
                # 'PH' is coded like 'F'
                digits.append('3' if _followed_by(spelling, idx, {'H'})
                              else '1')
            elif char in {'D', 'T'}:
                digits.append('8' if _followed_by(spelling, idx,
                                                  {'C', 'S', 'Z'})
                              else '2')
            elif char == 'C':
                if _preceded_by(spelling, idx, {'S', 'Z'}):
                    digits.append('8')
                else:
                    hard_followers = (hard_c_initial if idx == 0
                                      else hard_c_medial)
                    digits.append('4' if _followed_by(spelling, idx,
                                                      hard_followers)
                                  else '8')
            elif char == 'X':
                digits.append('8' if _preceded_by(spelling, idx,
                                                  {'C', 'K', 'Q'})
                              else '48')
        return _delete_consecutive_repeats(''.join(digits))

    word = unicode_normalize('NFKD', text_type(word.upper()))
    # NOTE(review): NFKD has already split precomposed umlauts into a base
    # letter plus a combining mark, so the three umlaut replacements below
    # presumably never match (assuming these literals are stored
    # precomposed); the combining marks are then dropped by the A-Z filter.
    # Confirm whether the AE/OE/UE expansion is actually intended.
    for special, expansion in (('ß', 'SS'),
                               ('Ä', 'AE'),
                               ('Ö', 'OE'),
                               ('Ü', 'UE')):
        word = word.replace(special, expansion)
    word = ''.join(char for char in word if char in alphabet)

    if primary_only:
        spellings = [word]
    else:
        # Split the word into segments; each segment is a tuple holding the
        # original letters and, where applicable, an alternative spelling.
        segments = []
        pos = 0
        if word.startswith('CH'):
            segments.append(('CH', 'SCH'))
            pos = 2
        while pos < len(word):
            rest = word[pos:]
            trigram = rest[:3]
            if rest[:4] == 'ILLE':
                segments.append(('ILLE', 'I'))
                pos += 4
            elif trigram in trigram_alternates:
                segments.append((trigram, trigram_alternates[trigram]))
                pos += 3
            elif rest[:2] == 'RB':
                segments.append(('RB', 'RW'))
                pos += 2
            elif rest == 'EAU':
                # only a word-final 'EAU' gets an alternative
                segments.append(('EAU', 'O'))
                pos += 3
            elif rest == 'O':
                # word-final 'O'
                segments.append(('O', 'OW'))
                pos += 1
            elif rest == 'A':
                # word-final 'A'
                segments.append(('A', 'AR'))
                pos += 1
            else:
                segments.append((rest[0],))
                pos += 1

        # Every combination of per-segment choices is one spelling.
        spellings = [''.join(parts) for parts in product(*segments)]

    # Encode each spelling, keeping only the first occurrence of each code.
    seen = set()
    unique_codes = []
    for spelling in spellings:
        code = _encode(spelling)
        if code not in seen:
            seen.add(code)
            unique_codes.append(code)
    return tuple(unique_codes)
||
385 | |||
476 |