| Conditions | 219 |
| Total Lines | 892 |
| Code Lines | 549 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 407 |
| CRAP Score | 219 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex methods like abydos.phonetic._DoubleMetaphone.DoubleMetaphone.encode() often do a lot of different things. To break such a method (and the class that owns it) down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def encode(self, word, max_length=-1):
    """Return the Double Metaphone code for a word.

    The word is upper-cased (and any 'ß' expanded to 'SS') *before* its
    length is measured, so that words whose upper-cased form is longer than
    the input (e.g. 'Großkopf' -> 'GROSSKOPF') are encoded to the end.

    Args:
        word (str): The word to transform
        max_length (int): The maximum length of the returned Double
            Metaphone codes (defaults to 64, but in Philips' original
            implementation this was 4). Values below 4 are raised to 4.

    Returns:
        tuple: The Double Metaphone value(s) as ``(primary, secondary)``;
        ``secondary`` is ``''`` when it would equal ``primary``.

    Examples:
        >>> pe = DoubleMetaphone()
        >>> pe.encode('Christopher')
        ('KRSTFR', '')
        >>> pe.encode('Niall')
        ('NL', '')
        >>> pe.encode('Smith')
        ('SM0', 'XMT')
        >>> pe.encode('Schmidt')
        ('XMT', 'SMT')

    """
    # Require a max_length of at least 4
    if max_length != -1:
        max_length = max(4, max_length)
    else:
        max_length = 64

    primary = ''
    secondary = ''

    def _slavo_germanic():
        """Return True if the word appears to be Slavic or Germanic.

        Returns:
            bool: True if the word appears to be Slavic or Germanic

        """
        return 'W' in word or 'K' in word or 'CZ' in word

    def _metaph_add(pri, sec=''):
        """Return a new metaphone tuple with the supplied elements.

        Args:
            pri (str): The primary element
            sec (str): The secondary element; '' means "same as pri" and
                ' ' means "append nothing to the secondary code"

        Returns:
            tuple: A new metaphone tuple with the supplied elements

        """
        newpri = primary
        newsec = secondary
        if pri:
            newpri += pri
        if sec:
            if sec != ' ':
                newsec += sec
        else:
            newsec += pri
        return newpri, newsec

    def _is_vowel(pos):
        """Return True if the character at word[pos] is a vowel.

        Args:
            pos (int): Position in the word

        Returns:
            bool: True if the character is a vowel

        """
        return pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}

    def _get_at(pos):
        """Return the character at word[pos].

        Args:
            pos (int): Position in the word

        Returns:
            str: Character at word[pos]

        """
        return word[pos]

    def _string_at(pos, slen, substrings):
        """Return True if word[pos:pos+slen] is in substrings.

        Args:
            pos (int): Position in the word
            slen (int): Substring length
            substrings (set): Substrings to search

        Returns:
            bool: True if word[pos:pos+slen] is in substrings

        """
        if pos < 0:
            return False
        return word[pos : pos + slen] in substrings

    # Normalise first: upper-casing can lengthen the word ('ß' -> 'SS'), so
    # the length must be taken from the normalised form.
    word = word.upper()
    word = word.replace('ß', 'SS')

    current = 0
    length = len(word)
    if length < 1:
        return '', ''
    last = length - 1

    # Pad the original string so that we can index beyond the edge of the
    # world
    word += '     '

    # Skip these when at start of word
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
        current += 1

    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
    if _get_at(0) == 'X':
        primary, secondary = _metaph_add('S')  # 'Z' maps to 'S'
        current += 1

    # Main loop
    while current < length:
        char = _get_at(current)

        if char in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            if current == 0:
                # All init vowels now map to 'A'
                primary, secondary = _metaph_add('A')
            current += 1
            continue

        elif char == 'B':
            # "-mb", e.g", "dumb", already skipped over...
            primary, secondary = _metaph_add('P')
            if _get_at(current + 1) == 'B':
                current += 2
            else:
                current += 1
            continue

        elif char == 'Ç':
            primary, secondary = _metaph_add('S')
            current += 1
            continue

        elif char == 'C':
            # Various Germanic
            if (
                current > 1
                and not _is_vowel(current - 2)
                and _string_at((current - 1), 3, {'ACH'})
                and (
                    (_get_at(current + 2) != 'I')
                    and (
                        (_get_at(current + 2) != 'E')
                        or _string_at(
                            (current - 2), 6, {'BACHER', 'MACHER'}
                        )
                    )
                )
            ):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            # Special case 'caesar'
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
                primary, secondary = _metaph_add('S')
                current += 2
                continue

            # Italian 'chianti'
            elif _string_at(current, 4, {'CHIA'}):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CH'}):
                # Find 'Michael'
                if current > 0 and _string_at(current, 4, {'CHAE'}):
                    primary, secondary = _metaph_add('K', 'X')
                    current += 2
                    continue

                # Greek roots e.g. 'chemistry', 'chorus'
                elif (
                    current == 0
                    and (
                        _string_at((current + 1), 5, {'HARAC', 'HARIS'})
                        or _string_at(
                            (current + 1), 3, {'HOR', 'HYM', 'HIA', 'HEM'}
                        )
                    )
                    and not _string_at(0, 5, {'CHORE'})
                ):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
                elif (
                    (
                        _string_at(0, 4, {'VAN ', 'VON '})
                        or _string_at(0, 3, {'SCH'})
                    )
                    or
                    # 'architect but not 'arch', 'orchestra', 'orchid'
                    _string_at(
                        (current - 2), 6, {'ORCHES', 'ARCHIT', 'ORCHID'}
                    )
                    or _string_at((current + 2), 1, {'T', 'S'})
                    or (
                        (
                            _string_at(
                                (current - 1), 1, {'A', 'O', 'U', 'E'}
                            )
                            or (current == 0)
                        )
                        and
                        # e.g., 'wachtler', 'wechsler', but not 'tichner'
                        _string_at(
                            (current + 2),
                            1,
                            {
                                'L',
                                'R',
                                'N',
                                'M',
                                'B',
                                'H',
                                'F',
                                'V',
                                'W',
                                ' ',
                            },
                        )
                    )
                ):
                    primary, secondary = _metaph_add('K')

                else:
                    if current > 0:
                        if _string_at(0, 2, {'MC'}):
                            # e.g., "McHugh"
                            primary, secondary = _metaph_add('K')
                        else:
                            primary, secondary = _metaph_add('X', 'K')
                    else:
                        primary, secondary = _metaph_add('X')

                current += 2
                continue

            # e.g, 'czerny'
            elif _string_at(current, 2, {'CZ'}) and not _string_at(
                (current - 2), 4, {'WICZ'}
            ):
                primary, secondary = _metaph_add('S', 'X')
                current += 2
                continue

            # e.g., 'focaccia'
            elif _string_at((current + 1), 3, {'CIA'}):
                primary, secondary = _metaph_add('X')
                current += 3

            # double 'C', but not if e.g. 'McClellan'
            elif _string_at(current, 2, {'CC'}) and not (
                (current == 1) and (_get_at(0) == 'M')
            ):
                # 'bellocchio' but not 'bacchus'
                if _string_at(
                    (current + 2), 1, {'I', 'E', 'H'}
                ) and not _string_at((current + 2), 2, {'HU'}):
                    # 'accident', 'accede' 'succeed'
                    if (
                        (current == 1) and _get_at(current - 1) == 'A'
                    ) or _string_at((current - 1), 5, {'UCCEE', 'UCCES'}):
                        primary, secondary = _metaph_add('KS')
                    # 'bacci', 'bertucci', other italian
                    else:
                        primary, secondary = _metaph_add('X')
                    current += 3
                    continue
                else:  # Pierce's rule
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
                # Italian vs. English
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
                    primary, secondary = _metaph_add('S', 'X')
                else:
                    primary, secondary = _metaph_add('S')
                current += 2
                continue

            # else
            else:
                primary, secondary = _metaph_add('K')

                # name sent in 'mac caffrey', 'mac gregor
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
                    current += 3
                elif _string_at(
                    (current + 1), 1, {'C', 'K', 'Q'}
                ) and not _string_at((current + 1), 2, {'CE', 'CI'}):
                    current += 2
                else:
                    current += 1
                continue

        elif char == 'D':
            if _string_at(current, 2, {'DG'}):
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    # e.g. 'edge'
                    primary, secondary = _metaph_add('J')
                    current += 3
                    continue
                else:
                    # e.g. 'edgar'
                    primary, secondary = _metaph_add('TK')
                    current += 2
                    continue

            elif _string_at(current, 2, {'DT', 'DD'}):
                primary, secondary = _metaph_add('T')
                current += 2
                continue

            # else
            else:
                primary, secondary = _metaph_add('T')
                current += 1
                continue

        elif char == 'F':
            if _get_at(current + 1) == 'F':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('F')
            continue

        elif char == 'G':
            if _get_at(current + 1) == 'H':
                if (current > 0) and not _is_vowel(current - 1):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # 'ghislane', ghiradelli
                elif current == 0:
                    if _get_at(current + 2) == 'I':
                        primary, secondary = _metaph_add('J')
                    else:
                        primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Parker's rule (with some further refinements) -
                # e.g., 'hugh'
                elif (
                    (
                        (current > 1)
                        and _string_at((current - 2), 1, {'B', 'H', 'D'})
                    )
                    or
                    # e.g., 'bough'
                    (
                        (current > 2)
                        and _string_at((current - 3), 1, {'B', 'H', 'D'})
                    )
                    or
                    # e.g., 'broughton'
                    (
                        (current > 3)
                        and _string_at((current - 4), 1, {'B', 'H'})
                    )
                ):
                    current += 2
                    continue
                else:
                    # e.g. 'laugh', 'McLaughlin', 'cough',
                    # 'gough', 'rough', 'tough'
                    if (
                        (current > 2)
                        and (_get_at(current - 1) == 'U')
                        and (
                            _string_at(
                                (current - 3), 1, {'C', 'G', 'L', 'R', 'T'}
                            )
                        )
                    ):
                        primary, secondary = _metaph_add('F')
                    elif (current > 0) and _get_at(current - 1) != 'I':
                        primary, secondary = _metaph_add('K')
                    current += 2
                    continue

            elif _get_at(current + 1) == 'N':
                if (
                    (current == 1)
                    and _is_vowel(0)
                    and not _slavo_germanic()
                ):
                    primary, secondary = _metaph_add('KN', 'N')
                # not e.g. 'cagney'
                elif (
                    not _string_at((current + 2), 2, {'EY'})
                    and (_get_at(current + 1) != 'Y')
                    and not _slavo_germanic()
                ):
                    primary, secondary = _metaph_add('N', 'KN')
                else:
                    primary, secondary = _metaph_add('KN')
                current += 2
                continue

            # 'tagliaro'
            elif (
                _string_at((current + 1), 2, {'LI'})
                and not _slavo_germanic()
            ):
                primary, secondary = _metaph_add('KL', 'L')
                current += 2
                continue

            # -ges-, -gep-, -gel-, -gie- at beginning
            elif (current == 0) and (
                (_get_at(current + 1) == 'Y')
                or _string_at(
                    (current + 1),
                    2,
                    {
                        'ES',
                        'EP',
                        'EB',
                        'EL',
                        'EY',
                        'IB',
                        'IL',
                        'IN',
                        'IE',
                        'EI',
                        'ER',
                    },
                )
            ):
                primary, secondary = _metaph_add('K', 'J')
                current += 2
                continue

            # -ger-, -gy-
            elif (
                (
                    _string_at((current + 1), 2, {'ER'})
                    or (_get_at(current + 1) == 'Y')
                )
                and not _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'})
                and not _string_at((current - 1), 1, {'E', 'I'})
                and not _string_at((current - 1), 3, {'RGY', 'OGY'})
            ):
                primary, secondary = _metaph_add('K', 'J')
                current += 2
                continue

            # italian e.g, 'biaggi'
            elif _string_at(
                (current + 1), 1, {'E', 'I', 'Y'}
            ) or _string_at((current - 1), 4, {'AGGI', 'OGGI'}):
                # obvious germanic
                if (
                    _string_at(0, 4, {'VAN ', 'VON '})
                    or _string_at(0, 3, {'SCH'})
                ) or _string_at((current + 1), 2, {'ET'}):
                    primary, secondary = _metaph_add('K')
                elif _string_at((current + 1), 4, {'IER '}):
                    primary, secondary = _metaph_add('J')
                else:
                    primary, secondary = _metaph_add('J', 'K')
                current += 2
                continue

            else:
                if _get_at(current + 1) == 'G':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('K')
                continue

        elif char == 'H':
            # only keep if first & before vowel or btw. 2 vowels
            if ((current == 0) or _is_vowel(current - 1)) and _is_vowel(
                current + 1
            ):
                primary, secondary = _metaph_add('H')
                current += 2
            else:  # also takes care of 'HH'
                current += 1
            continue

        elif char == 'J':
            # obvious spanish, 'jose', 'san jacinto'
            if _string_at(current, 4, {'JOSE'}) or _string_at(
                0, 4, {'SAN '}
            ):
                if (
                    (current == 0) and (_get_at(current + 4) == ' ')
                ) or _string_at(0, 4, {'SAN '}):
                    primary, secondary = _metaph_add('H')
                else:
                    primary, secondary = _metaph_add('J', 'H')
                current += 1
                continue

            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
                # Yankelovich/Jankelowicz
                primary, secondary = _metaph_add('J', 'A')
            # Spanish pron. of e.g. 'bajador'
            elif (
                _is_vowel(current - 1)
                and not _slavo_germanic()
                and (
                    (_get_at(current + 1) == 'A')
                    or (_get_at(current + 1) == 'O')
                )
            ):
                primary, secondary = _metaph_add('J', 'H')
            elif current == last:
                primary, secondary = _metaph_add('J', ' ')
            elif not _string_at(
                (current + 1), 1, {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}
            ) and not _string_at((current - 1), 1, {'S', 'K', 'L'}):
                primary, secondary = _metaph_add('J')

            if _get_at(current + 1) == 'J':  # it could happen!
                current += 2
            else:
                current += 1
            continue

        elif char == 'K':
            if _get_at(current + 1) == 'K':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('K')
            continue

        elif char == 'L':
            if _get_at(current + 1) == 'L':
                # Spanish e.g. 'cabrillo', 'gallegos'
                if (
                    (current == (length - 3))
                    and _string_at(
                        (current - 1), 4, {'ILLO', 'ILLA', 'ALLE'}
                    )
                ) or (
                    (
                        _string_at((last - 1), 2, {'AS', 'OS'})
                        or _string_at(last, 1, {'A', 'O'})
                    )
                    and _string_at((current - 1), 4, {'ALLE'})
                ):
                    primary, secondary = _metaph_add('L', ' ')
                    current += 2
                    continue
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('L')
            continue

        elif char == 'M':
            if (
                (
                    _string_at((current - 1), 3, {'UMB'})
                    and (
                        ((current + 1) == last)
                        or _string_at((current + 2), 2, {'ER'})
                    )
                )
                or
                # 'dumb', 'thumb'
                (_get_at(current + 1) == 'M')
            ):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('M')
            continue

        elif char == 'N':
            if _get_at(current + 1) == 'N':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('N')
            continue

        elif char == 'Ñ':
            current += 1
            primary, secondary = _metaph_add('N')
            continue

        elif char == 'P':
            if _get_at(current + 1) == 'H':
                primary, secondary = _metaph_add('F')
                current += 2
                continue

            # also account for "campbell", "raspberry"
            elif _string_at((current + 1), 1, {'P', 'B'}):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('P')
            continue

        elif char == 'Q':
            if _get_at(current + 1) == 'Q':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('K')
            continue

        elif char == 'R':
            # french e.g. 'rogier', but exclude 'hochmeier'
            if (
                (current == last)
                and not _slavo_germanic()
                and _string_at((current - 2), 2, {'IE'})
                and not _string_at((current - 4), 2, {'ME', 'MA'})
            ):
                primary, secondary = _metaph_add('', 'R')
            else:
                primary, secondary = _metaph_add('R')

            if _get_at(current + 1) == 'R':
                current += 2
            else:
                current += 1
            continue

        elif char == 'S':
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
                current += 1
                continue

            # special case 'sugar-'
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
                primary, secondary = _metaph_add('X', 'S')
                current += 1
                continue

            elif _string_at(current, 2, {'SH'}):
                # Germanic
                if _string_at(
                    (current + 1), 4, {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}
                ):
                    primary, secondary = _metaph_add('S')
                else:
                    primary, secondary = _metaph_add('X')
                current += 2
                continue

            # Italian & Armenian
            elif _string_at(current, 3, {'SIO', 'SIA'}) or _string_at(
                current, 4, {'SIAN'}
            ):
                if not _slavo_germanic():
                    primary, secondary = _metaph_add('S', 'X')
                else:
                    primary, secondary = _metaph_add('S')
                current += 3
                continue

            # German & anglicisations, e.g. 'smith' match 'schmidt',
            # 'snider' match 'schneider'
            # also, -sz- in Slavic language although in Hungarian it is
            # pronounced 's'
            elif (
                (current == 0)
                and _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})
            ) or _string_at((current + 1), 1, {'Z'}):
                primary, secondary = _metaph_add('S', 'X')
                if _string_at((current + 1), 1, {'Z'}):
                    current += 2
                else:
                    current += 1
                continue

            elif _string_at(current, 2, {'SC'}):
                # Schlesinger's rule
                if _get_at(current + 2) == 'H':
                    # dutch origin, e.g. 'school', 'schooner'
                    if _string_at(
                        (current + 3),
                        2,
                        {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'},
                    ):
                        # 'schermerhorn', 'schenker'
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
                            primary, secondary = _metaph_add('X', 'SK')
                        else:
                            primary, secondary = _metaph_add('SK')
                        current += 3
                        continue
                    else:
                        if (
                            (current == 0)
                            and not _is_vowel(3)
                            and (_get_at(3) != 'W')
                        ):
                            primary, secondary = _metaph_add('X', 'S')
                        else:
                            primary, secondary = _metaph_add('X')
                        current += 3
                        continue

                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    primary, secondary = _metaph_add('S')
                    current += 3
                    continue

                # else
                else:
                    primary, secondary = _metaph_add('SK')
                    current += 3
                    continue

            else:
                # french e.g. 'resnais', 'artois'
                if (current == last) and _string_at(
                    (current - 2), 2, {'AI', 'OI'}
                ):
                    primary, secondary = _metaph_add('', 'S')
                else:
                    primary, secondary = _metaph_add('S')

                if _string_at((current + 1), 1, {'S', 'Z'}):
                    current += 2
                else:
                    current += 1
                continue

        elif char == 'T':
            if _string_at(current, 4, {'TION'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            elif _string_at(current, 3, {'TIA', 'TCH'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            elif _string_at(current, 2, {'TH'}) or _string_at(
                current, 3, {'TTH'}
            ):
                # special case 'thomas', 'thames' or germanic
                if (
                    _string_at((current + 2), 2, {'OM', 'AM'})
                    or _string_at(0, 4, {'VAN ', 'VON '})
                    or _string_at(0, 3, {'SCH'})
                ):
                    primary, secondary = _metaph_add('T')
                else:
                    primary, secondary = _metaph_add('0', 'T')
                current += 2
                continue

            elif _string_at((current + 1), 1, {'T', 'D'}):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('T')
            continue

        elif char == 'V':
            if _get_at(current + 1) == 'V':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('F')
            continue

        elif char == 'W':
            # can also be in middle of word
            if _string_at(current, 2, {'WR'}):
                primary, secondary = _metaph_add('R')
                current += 2
                continue
            elif (current == 0) and (
                _is_vowel(current + 1) or _string_at(current, 2, {'WH'})
            ):
                # Wasserman should match Vasserman
                if _is_vowel(current + 1):
                    primary, secondary = _metaph_add('A', 'F')
                else:
                    # need Uomo to match Womo
                    primary, secondary = _metaph_add('A')

            # Arnow should match Arnoff
            if (
                ((current == last) and _is_vowel(current - 1))
                or _string_at(
                    (current - 1), 5, {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}
                )
                or _string_at(0, 3, {'SCH'})
            ):
                primary, secondary = _metaph_add('', 'F')
                current += 1
                continue
            # Polish e.g. 'filipowicz'
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
                primary, secondary = _metaph_add('TS', 'FX')
                current += 4
                continue
            # else skip it
            else:
                current += 1
                continue

        elif char == 'X':
            # French e.g. breaux
            if not (
                (current == last)
                and (
                    _string_at((current - 3), 3, {'IAU', 'EAU'})
                    or _string_at((current - 2), 2, {'AU', 'OU'})
                )
            ):
                primary, secondary = _metaph_add('KS')

            if _string_at((current + 1), 1, {'C', 'X'}):
                current += 2
            else:
                current += 1
            continue

        elif char == 'Z':
            # Chinese Pinyin e.g. 'zhao'
            if _get_at(current + 1) == 'H':
                primary, secondary = _metaph_add('J')
                current += 2
                continue
            elif _string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or (
                _slavo_germanic()
                and ((current > 0) and _get_at(current - 1) != 'T')
            ):
                primary, secondary = _metaph_add('S', 'TS')
            else:
                primary, secondary = _metaph_add('S')

            if _get_at(current + 1) == 'Z':
                current += 2
            else:
                current += 1
            continue

        else:
            # Any other character contributes nothing to the code
            current += 1

    if max_length > 0:
        primary = primary[:max_length]
        secondary = secondary[:max_length]
    if primary == secondary:
        secondary = ''

    return primary, secondary
| 935 | |||
| 969 |
This check looks for invalid names for a range of different identifiers.
You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.
If your project includes a Pylint configuration file, the settings contained in that file take precedence.
To find out more about Pylint, please refer to their site.