| Conditions | 219 |
| Total Lines | 822 |
| Code Lines | 528 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 407 |
| CRAP Score | 219 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
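Commonly applied refactorings include Extract Method.
If many parameters/temporary variables are present, consider Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.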
Complex code units like abydos.phonetic._metaphone.double_metaphone() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def double_metaphone(word, max_length=-1):
    """Return the Double Metaphone code for a word.

    Based on Lawrence Philips' (Visual) C++ code from 1999
    :cite:`Philips:2000`.

    The word is upper-cased before encoding. Upper-casing may change the
    number of characters (e.g. 'ß' becomes 'SS'), so every position used by
    the rules below refers to the normalized word, not to the raw input.

    :param word: the word to transform
    :param max_length: the maximum length of the returned Double Metaphone
        codes (defaults to 64, but in Philips' original implementation this
        was 4); explicit values below 4 are raised to 4
    :returns: the Double Metaphone value(s) as ``(primary, secondary)``; the
        secondary code is '' when it is identical to the primary code
    :rtype: tuple

    >>> double_metaphone('Christopher')
    ('KRSTFR', '')
    >>> double_metaphone('Niall')
    ('NL', '')
    >>> double_metaphone('Smith')
    ('SM0', 'XMT')
    >>> double_metaphone('Schmidt')
    ('XMT', 'SMT')
    """
    # Require a max_length of at least 4
    if max_length != -1:
        max_length = max(4, max_length)
    else:
        max_length = 64

    primary = ''
    secondary = ''

    vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}

    def _slavo_germanic():
        """Return True if the word appears to be Slavic or Germanic."""
        return 'W' in word or 'K' in word or 'CZ' in word

    def _metaph_add(pri, sec=''):
        """Return a new metaphone tuple with the supplied elements.

        ``sec == ''`` means "same as primary"; ``sec == ' '`` means "add
        nothing to the secondary code".
        """
        newpri = primary
        newsec = secondary
        if pri:
            newpri += pri
        if sec:
            if sec != ' ':
                newsec += sec
        else:
            newsec += pri
        return newpri, newsec

    def _is_vowel(pos):
        """Return True if the character at word[pos] is a vowel."""
        return pos >= 0 and word[pos] in vowels

    def _get_at(pos):
        """Return the character at word[pos]."""
        return word[pos]

    def _string_at(pos, slen, substrings):
        """Return True if word[pos:pos+slen] is in substrings."""
        if pos < 0:
            return False
        return word[pos : pos + slen] in substrings

    if not word:
        return '', ''

    word = word.upper()
    word = word.replace('ß', 'SS')

    # Measure the word only AFTER normalization: upper-casing can lengthen it
    # ('ß' -> 'SS'), and the loop bound and the end-of-word rules must refer
    # to the string that is actually being scanned.
    current = 0
    length = len(word)
    last = length - 1

    # Pad the original string so that we can index beyond the edge of the world
    word += ' '

    # Skip these when at start of word
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
        current += 1

    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
    if _get_at(0) == 'X':
        primary, secondary = _metaph_add('S')  # 'Z' maps to 'S'
        current += 1

    # Main loop
    while current < length:
        if _get_at(current) in vowels:
            if current == 0:
                # All init vowels now map to 'A'
                primary, secondary = _metaph_add('A')
            current += 1
            continue

        elif _get_at(current) == 'B':
            # "-mb", e.g", "dumb", already skipped over...
            primary, secondary = _metaph_add('P')
            if _get_at(current + 1) == 'B':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Ç':
            primary, secondary = _metaph_add('S')
            current += 1
            continue

        elif _get_at(current) == 'C':
            # Various Germanic
            if (
                current > 1
                and not _is_vowel(current - 2)
                and _string_at((current - 1), 3, {'ACH'})
                and (
                    (_get_at(current + 2) != 'I')
                    and (
                        (_get_at(current + 2) != 'E')
                        or _string_at((current - 2), 6, {'BACHER', 'MACHER'})
                    )
                )
            ):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            # Special case 'caesar'
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
                primary, secondary = _metaph_add('S')
                current += 2
                continue

            # Italian 'chianti'
            elif _string_at(current, 4, {'CHIA'}):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CH'}):
                # Find 'Michael'
                if current > 0 and _string_at(current, 4, {'CHAE'}):
                    primary, secondary = _metaph_add('K', 'X')
                    current += 2
                    continue

                # Greek roots e.g. 'chemistry', 'chorus'
                elif (
                    current == 0
                    and (
                        _string_at((current + 1), 5, {'HARAC', 'HARIS'})
                        or _string_at(
                            (current + 1), 3, {'HOR', 'HYM', 'HIA', 'HEM'}
                        )
                    )
                    and not _string_at(0, 5, {'CHORE'})
                ):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
                elif (
                    (
                        _string_at(0, 4, {'VAN ', 'VON '})
                        or _string_at(0, 3, {'SCH'})
                    )
                    or
                    # 'architect but not 'arch', 'orchestra', 'orchid'
                    _string_at(
                        (current - 2), 6, {'ORCHES', 'ARCHIT', 'ORCHID'}
                    )
                    or _string_at((current + 2), 1, {'T', 'S'})
                    or (
                        (
                            _string_at((current - 1), 1, {'A', 'O', 'U', 'E'})
                            or (current == 0)
                        )
                        and
                        # e.g., 'wachtler', 'wechsler', but not 'tichner'
                        _string_at(
                            (current + 2),
                            1,
                            {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W', ' '},
                        )
                    )
                ):
                    primary, secondary = _metaph_add('K')

                else:
                    if current > 0:
                        if _string_at(0, 2, {'MC'}):
                            # e.g., "McHugh"
                            primary, secondary = _metaph_add('K')
                        else:
                            primary, secondary = _metaph_add('X', 'K')
                    else:
                        primary, secondary = _metaph_add('X')

                # Shared tail for the two non-returning 'CH' cases above
                current += 2
                continue

            # e.g, 'czerny'
            elif _string_at(current, 2, {'CZ'}) and not _string_at(
                (current - 2), 4, {'WICZ'}
            ):
                primary, secondary = _metaph_add('S', 'X')
                current += 2
                continue

            # e.g., 'focaccia'
            elif _string_at((current + 1), 3, {'CIA'}):
                primary, secondary = _metaph_add('X')
                current += 3
                # No explicit continue: nothing follows this chain in the
                # loop body, so control returns to the loop test.

            # double 'C', but not if e.g. 'McClellan'
            elif _string_at(current, 2, {'CC'}) and not (
                (current == 1) and (_get_at(0) == 'M')
            ):
                # 'bellocchio' but not 'bacchus'
                if _string_at(
                    (current + 2), 1, {'I', 'E', 'H'}
                ) and not _string_at((current + 2), 2, {'HU'}):
                    # 'accident', 'accede' 'succeed'
                    if (
                        (current == 1) and _get_at(current - 1) == 'A'
                    ) or _string_at((current - 1), 5, {'UCCEE', 'UCCES'}):
                        primary, secondary = _metaph_add('KS')
                    # 'bacci', 'bertucci', other italian
                    else:
                        primary, secondary = _metaph_add('X')
                    current += 3
                    continue
                else:  # Pierce's rule
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
                # Italian vs. English
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
                    primary, secondary = _metaph_add('S', 'X')
                else:
                    primary, secondary = _metaph_add('S')
                current += 2
                continue

            # else
            else:
                primary, secondary = _metaph_add('K')

                # name sent in 'mac caffrey', 'mac gregor
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
                    current += 3
                elif _string_at(
                    (current + 1), 1, {'C', 'K', 'Q'}
                ) and not _string_at((current + 1), 2, {'CE', 'CI'}):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'D':
            if _string_at(current, 2, {'DG'}):
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    # e.g. 'edge'
                    primary, secondary = _metaph_add('J')
                    current += 3
                    continue
                else:
                    # e.g. 'edgar'
                    primary, secondary = _metaph_add('TK')
                    current += 2
                    continue

            elif _string_at(current, 2, {'DT', 'DD'}):
                primary, secondary = _metaph_add('T')
                current += 2
                continue

            # else
            else:
                primary, secondary = _metaph_add('T')
                current += 1
                continue

        elif _get_at(current) == 'F':
            if _get_at(current + 1) == 'F':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('F')
            continue

        elif _get_at(current) == 'G':
            if _get_at(current + 1) == 'H':
                if (current > 0) and not _is_vowel(current - 1):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # 'ghislane', ghiradelli
                elif current == 0:
                    if _get_at(current + 2) == 'I':
                        primary, secondary = _metaph_add('J')
                    else:
                        primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Parker's rule (with some further refinements) - e.g., 'hugh'
                elif (
                    (
                        (current > 1)
                        and _string_at((current - 2), 1, {'B', 'H', 'D'})
                    )
                    or
                    # e.g., 'bough'
                    (
                        (current > 2)
                        and _string_at((current - 3), 1, {'B', 'H', 'D'})
                    )
                    or
                    # e.g., 'broughton'
                    (
                        (current > 3)
                        and _string_at((current - 4), 1, {'B', 'H'})
                    )
                ):
                    current += 2
                    continue
                else:
                    # e.g. 'laugh', 'McLaughlin', 'cough',
                    # 'gough', 'rough', 'tough'
                    if (
                        (current > 2)
                        and (_get_at(current - 1) == 'U')
                        and (
                            _string_at(
                                (current - 3), 1, {'C', 'G', 'L', 'R', 'T'}
                            )
                        )
                    ):
                        primary, secondary = _metaph_add('F')
                    elif (current > 0) and _get_at(current - 1) != 'I':
                        primary, secondary = _metaph_add('K')
                    current += 2
                    continue

            elif _get_at(current + 1) == 'N':
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
                    primary, secondary = _metaph_add('KN', 'N')
                # not e.g. 'cagney'
                elif (
                    not _string_at((current + 2), 2, {'EY'})
                    and (_get_at(current + 1) != 'Y')
                    and not _slavo_germanic()
                ):
                    primary, secondary = _metaph_add('N', 'KN')
                else:
                    primary, secondary = _metaph_add('KN')
                current += 2
                continue

            # 'tagliaro'
            elif (
                _string_at((current + 1), 2, {'LI'}) and not _slavo_germanic()
            ):
                primary, secondary = _metaph_add('KL', 'L')
                current += 2
                continue

            # -ges-, -gep-, -gel-, -gie- at beginning
            elif (current == 0) and (
                (_get_at(current + 1) == 'Y')
                or _string_at(
                    (current + 1),
                    2,
                    {
                        'ES',
                        'EP',
                        'EB',
                        'EL',
                        'EY',
                        'IB',
                        'IL',
                        'IN',
                        'IE',
                        'EI',
                        'ER',
                    },
                )
            ):
                primary, secondary = _metaph_add('K', 'J')
                current += 2
                continue

            # -ger-, -gy-
            elif (
                (
                    _string_at((current + 1), 2, {'ER'})
                    or (_get_at(current + 1) == 'Y')
                )
                and not _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'})
                and not _string_at((current - 1), 1, {'E', 'I'})
                and not _string_at((current - 1), 3, {'RGY', 'OGY'})
            ):
                primary, secondary = _metaph_add('K', 'J')
                current += 2
                continue

            # italian e.g, 'biaggi'
            elif _string_at((current + 1), 1, {'E', 'I', 'Y'}) or _string_at(
                (current - 1), 4, {'AGGI', 'OGGI'}
            ):
                # obvious germanic
                if (
                    _string_at(0, 4, {'VAN ', 'VON '})
                    or _string_at(0, 3, {'SCH'})
                ) or _string_at((current + 1), 2, {'ET'}):
                    primary, secondary = _metaph_add('K')
                elif _string_at((current + 1), 4, {'IER '}):
                    primary, secondary = _metaph_add('J')
                else:
                    primary, secondary = _metaph_add('J', 'K')
                current += 2
                continue

            else:
                if _get_at(current + 1) == 'G':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('K')
                continue

        elif _get_at(current) == 'H':
            # only keep if first & before vowel or btw. 2 vowels
            if ((current == 0) or _is_vowel(current - 1)) and _is_vowel(
                current + 1
            ):
                primary, secondary = _metaph_add('H')
                current += 2
            else:  # also takes care of 'HH'
                current += 1
            continue

        elif _get_at(current) == 'J':
            # obvious spanish, 'jose', 'san jacinto'
            if _string_at(current, 4, {'JOSE'}) or _string_at(0, 4, {'SAN '}):
                if (
                    (current == 0) and (_get_at(current + 4) == ' ')
                ) or _string_at(0, 4, {'SAN '}):
                    primary, secondary = _metaph_add('H')
                else:
                    primary, secondary = _metaph_add('J', 'H')
                current += 1
                continue

            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
                # Yankelovich/Jankelowicz
                primary, secondary = _metaph_add('J', 'A')
            # Spanish pron. of e.g. 'bajador'
            elif (
                _is_vowel(current - 1)
                and not _slavo_germanic()
                and (
                    (_get_at(current + 1) == 'A')
                    or (_get_at(current + 1) == 'O')
                )
            ):
                primary, secondary = _metaph_add('J', 'H')
            elif current == last:
                primary, secondary = _metaph_add('J', ' ')
            elif not _string_at(
                (current + 1), 1, {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}
            ) and not _string_at((current - 1), 1, {'S', 'K', 'L'}):
                primary, secondary = _metaph_add('J')

            if _get_at(current + 1) == 'J':  # it could happen!
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'K':
            if _get_at(current + 1) == 'K':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('K')
            continue

        elif _get_at(current) == 'L':
            if _get_at(current + 1) == 'L':
                # Spanish e.g. 'cabrillo', 'gallegos'
                if (
                    (current == (length - 3))
                    and _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})
                ) or (
                    (
                        _string_at((last - 1), 2, {'AS', 'OS'})
                        or _string_at(last, 1, {'A', 'O'})
                    )
                    and _string_at((current - 1), 4, {'ALLE'})
                ):
                    primary, secondary = _metaph_add('L', ' ')
                    current += 2
                    continue
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('L')
            continue

        elif _get_at(current) == 'M':
            if (
                (
                    _string_at((current - 1), 3, {'UMB'})
                    and (
                        ((current + 1) == last)
                        or _string_at((current + 2), 2, {'ER'})
                    )
                )
                or
                # 'dumb', 'thumb'
                (_get_at(current + 1) == 'M')
            ):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('M')
            continue

        elif _get_at(current) == 'N':
            if _get_at(current + 1) == 'N':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('N')
            continue

        elif _get_at(current) == 'Ñ':
            current += 1
            primary, secondary = _metaph_add('N')
            continue

        elif _get_at(current) == 'P':
            if _get_at(current + 1) == 'H':
                primary, secondary = _metaph_add('F')
                current += 2
                continue

            # also account for "campbell", "raspberry"
            elif _string_at((current + 1), 1, {'P', 'B'}):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('P')
            continue

        elif _get_at(current) == 'Q':
            if _get_at(current + 1) == 'Q':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('K')
            continue

        elif _get_at(current) == 'R':
            # french e.g. 'rogier', but exclude 'hochmeier'
            if (
                (current == last)
                and not _slavo_germanic()
                and _string_at((current - 2), 2, {'IE'})
                and not _string_at((current - 4), 2, {'ME', 'MA'})
            ):
                primary, secondary = _metaph_add('', 'R')
            else:
                primary, secondary = _metaph_add('R')

            if _get_at(current + 1) == 'R':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'S':
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
                current += 1
                continue

            # special case 'sugar-'
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
                primary, secondary = _metaph_add('X', 'S')
                current += 1
                continue

            elif _string_at(current, 2, {'SH'}):
                # Germanic
                if _string_at(
                    (current + 1), 4, {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}
                ):
                    primary, secondary = _metaph_add('S')
                else:
                    primary, secondary = _metaph_add('X')
                current += 2
                continue

            # Italian & Armenian
            elif _string_at(current, 3, {'SIO', 'SIA'}) or _string_at(
                current, 4, {'SIAN'}
            ):
                if not _slavo_germanic():
                    primary, secondary = _metaph_add('S', 'X')
                else:
                    primary, secondary = _metaph_add('S')
                current += 3
                continue

            # German & anglicisations, e.g. 'smith' match 'schmidt',
            # 'snider' match 'schneider'
            # also, -sz- in Slavic language although in Hungarian it is
            # pronounced 's'
            elif (
                (current == 0)
                and _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})
            ) or _string_at((current + 1), 1, {'Z'}):
                primary, secondary = _metaph_add('S', 'X')
                if _string_at((current + 1), 1, {'Z'}):
                    current += 2
                else:
                    current += 1
                continue

            elif _string_at(current, 2, {'SC'}):
                # Schlesinger's rule
                if _get_at(current + 2) == 'H':
                    # dutch origin, e.g. 'school', 'schooner'
                    if _string_at(
                        (current + 3), 2, {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}
                    ):
                        # 'schermerhorn', 'schenker'
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
                            primary, secondary = _metaph_add('X', 'SK')
                        else:
                            primary, secondary = _metaph_add('SK')
                        current += 3
                        continue
                    else:
                        if (
                            (current == 0)
                            and not _is_vowel(3)
                            and (_get_at(3) != 'W')
                        ):
                            primary, secondary = _metaph_add('X', 'S')
                        else:
                            primary, secondary = _metaph_add('X')
                        current += 3
                        continue

                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    primary, secondary = _metaph_add('S')
                    current += 3
                    continue

                # else
                else:
                    primary, secondary = _metaph_add('SK')
                    current += 3
                    continue

            else:
                # french e.g. 'resnais', 'artois'
                if (current == last) and _string_at(
                    (current - 2), 2, {'AI', 'OI'}
                ):
                    primary, secondary = _metaph_add('', 'S')
                else:
                    primary, secondary = _metaph_add('S')

                if _string_at((current + 1), 1, {'S', 'Z'}):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'T':
            if _string_at(current, 4, {'TION'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            elif _string_at(current, 3, {'TIA', 'TCH'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            elif _string_at(current, 2, {'TH'}) or _string_at(
                current, 3, {'TTH'}
            ):
                # special case 'thomas', 'thames' or germanic
                if (
                    _string_at((current + 2), 2, {'OM', 'AM'})
                    or _string_at(0, 4, {'VAN ', 'VON '})
                    or _string_at(0, 3, {'SCH'})
                ):
                    primary, secondary = _metaph_add('T')
                else:
                    primary, secondary = _metaph_add('0', 'T')
                current += 2
                continue

            elif _string_at((current + 1), 1, {'T', 'D'}):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('T')
            continue

        elif _get_at(current) == 'V':
            if _get_at(current + 1) == 'V':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('F')
            continue

        elif _get_at(current) == 'W':
            # can also be in middle of word
            if _string_at(current, 2, {'WR'}):
                primary, secondary = _metaph_add('R')
                current += 2
                continue
            elif (current == 0) and (
                _is_vowel(current + 1) or _string_at(current, 2, {'WH'})
            ):
                # Wasserman should match Vasserman
                if _is_vowel(current + 1):
                    primary, secondary = _metaph_add('A', 'F')
                else:
                    # need Uomo to match Womo
                    primary, secondary = _metaph_add('A')

            # Arnow should match Arnoff
            if (
                ((current == last) and _is_vowel(current - 1))
                or _string_at(
                    (current - 1), 5, {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}
                )
                or _string_at(0, 3, {'SCH'})
            ):
                primary, secondary = _metaph_add('', 'F')
                current += 1
                continue
            # Polish e.g. 'filipowicz'
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
                primary, secondary = _metaph_add('TS', 'FX')
                current += 4
                continue
            # else skip it
            else:
                current += 1
                continue

        elif _get_at(current) == 'X':
            # French e.g. breaux
            if not (
                (current == last)
                and (
                    _string_at((current - 3), 3, {'IAU', 'EAU'})
                    or _string_at((current - 2), 2, {'AU', 'OU'})
                )
            ):
                primary, secondary = _metaph_add('KS')

            if _string_at((current + 1), 1, {'C', 'X'}):
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Z':
            # Chinese Pinyin e.g. 'zhao'
            if _get_at(current + 1) == 'H':
                primary, secondary = _metaph_add('J')
                current += 2
                continue
            elif _string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or (
                _slavo_germanic()
                and ((current > 0) and _get_at(current - 1) != 'T')
            ):
                primary, secondary = _metaph_add('S', 'TS')
            else:
                primary, secondary = _metaph_add('S')

            if _get_at(current + 1) == 'Z':
                current += 2
            else:
                current += 1
            continue

        else:
            # Any other character (digits, punctuation, spaces, ...) is
            # silent.
            current += 1

    # max_length is always >= 4 here (see the normalization at the top), so
    # truncation can be applied unconditionally.
    primary = primary[:max_length]
    secondary = secondary[:max_length]
    if primary == secondary:
        secondary = ''

    return primary, secondary
|
| 1047 | |||
| 1053 |