| Conditions | 219 |
| Total Lines | 719 |
| Code Lines | 492 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
Complex units of code like abydos.phonetic.metaphone.double_metaphone() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def double_metaphone(word, max_length=-1):
    """Return the Double Metaphone code for a word.

    Based on Lawrence Philips' (Visual) C++ code from 1999
    :cite:`Philips:2000`.

    The word is upper-cased and 'ß' is expanded to 'SS' *before* its length
    is measured, so that the main loop covers every character of the
    normalized word (upper-casing can lengthen the string).

    :param word: the word to transform
    :param max_length: the maximum length of the returned Double Metaphone
        codes (defaults to 64, but in Philips' original implementation this
        was 4); explicit values below 4 are raised to 4
    :returns: the Double Metaphone value(s) as ``(primary, secondary)``;
        the secondary code is ``''`` when it equals the primary code
    :rtype: tuple

    >>> double_metaphone('Christopher')
    ('KRSTFR', '')
    >>> double_metaphone('Niall')
    ('NL', '')
    >>> double_metaphone('Smith')
    ('SM0', 'XMT')
    >>> double_metaphone('Schmidt')
    ('XMT', 'SMT')
    """
    # Require a max_length of at least 4
    if max_length != -1:
        max_length = max(4, max_length)
    else:
        max_length = 64

    primary = ''
    secondary = ''

    def _slavo_germanic():
        """Return True if the word appears to be Slavic or Germanic."""
        return 'W' in word or 'K' in word or 'CZ' in word

    def _metaph_add(pri, sec=''):
        """Return a new metaphone tuple with the supplied elements.

        An empty ``sec`` means "same as ``pri``"; a single space means
        "add nothing to the secondary code".
        """
        newpri = primary
        newsec = secondary
        if pri:
            newpri += pri
        if sec:
            if sec != ' ':
                newsec += sec
        else:
            newsec += pri
        return newpri, newsec

    def _is_vowel(pos):
        """Return True if the character at word[pos] is a vowel."""
        return pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}

    def _get_at(pos):
        """Return the character at word[pos]."""
        return word[pos]

    def _string_at(pos, slen, substrings):
        """Return True if word[pos:pos+slen] is in substrings."""
        if pos < 0:
            return False
        return word[pos:pos+slen] in substrings

    # Normalize first: upper-casing may change the length of the word
    # (e.g. 'ß' becomes 'SS'), so the length must be taken afterwards.
    word = word.upper()
    word = word.replace('ß', 'SS')

    current = 0
    length = len(word)
    if length < 1:
        return '', ''
    last = length - 1

    # Pad the original string so that we can index beyond the edge of the world
    word += ' '

    # Skip these when at start of word
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
        current += 1

    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
    if _get_at(0) == 'X':
        primary, secondary = _metaph_add('S')  # 'Z' maps to 'S'
        current += 1

    # Main loop
    while current < length:
        if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            if current == 0:
                # All init vowels now map to 'A'
                primary, secondary = _metaph_add('A')
            current += 1
            continue

        elif _get_at(current) == 'B':
            # "-mb", e.g", "dumb", already skipped over...
            primary, secondary = _metaph_add('P')
            if _get_at(current + 1) == 'B':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Ç':
            primary, secondary = _metaph_add('S')
            current += 1
            continue

        elif _get_at(current) == 'C':
            # Various Germanic
            if (current > 1 and not _is_vowel(current - 2) and
                    _string_at((current - 1), 3, {'ACH'}) and
                    ((_get_at(current + 2) != 'I') and
                     ((_get_at(current + 2) != 'E') or
                      _string_at((current - 2), 6,
                                 {'BACHER', 'MACHER'})))):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            # Special case 'caesar'
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
                primary, secondary = _metaph_add('S')
                current += 2
                continue

            # Italian 'chianti'
            elif _string_at(current, 4, {'CHIA'}):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CH'}):
                # Find 'Michael'
                if current > 0 and _string_at(current, 4, {'CHAE'}):
                    primary, secondary = _metaph_add('K', 'X')
                    current += 2
                    continue

                # Greek roots e.g. 'chemistry', 'chorus'
                elif (current == 0 and
                      (_string_at((current + 1), 5,
                                  {'HARAC', 'HARIS'}) or
                       _string_at((current + 1), 3,
                                  {'HOR', 'HYM', 'HIA', 'HEM'})) and
                      not _string_at(0, 5, {'CHORE'})):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
                elif ((_string_at(0, 4, {'VAN ', 'VON '}) or
                       _string_at(0, 3, {'SCH'})) or
                      # 'architect but not 'arch', 'orchestra', 'orchid'
                      _string_at((current - 2), 6,
                                 {'ORCHES', 'ARCHIT', 'ORCHID'}) or
                      _string_at((current + 2), 1, {'T', 'S'}) or
                      ((_string_at((current - 1), 1,
                                   {'A', 'O', 'U', 'E'}) or
                        (current == 0)) and
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
                       _string_at((current + 2), 1,
                                  {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V',
                                   'W', ' '}))):
                    primary, secondary = _metaph_add('K')

                else:
                    if current > 0:
                        if _string_at(0, 2, {'MC'}):
                            # e.g., "McHugh"
                            primary, secondary = _metaph_add('K')
                        else:
                            primary, secondary = _metaph_add('X', 'K')
                    else:
                        primary, secondary = _metaph_add('X')

                current += 2
                continue

            # e.g, 'czerny'
            elif (_string_at(current, 2, {'CZ'}) and
                  not _string_at((current - 2), 4, {'WICZ'})):
                primary, secondary = _metaph_add('S', 'X')
                current += 2
                continue

            # e.g., 'focaccia'
            elif _string_at((current + 1), 3, {'CIA'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            # double 'C', but not if e.g. 'McClellan'
            elif (_string_at(current, 2, {'CC'}) and
                  not ((current == 1) and (_get_at(0) == 'M'))):
                # 'bellocchio' but not 'bacchus'
                if (_string_at((current + 2), 1, {'I', 'E', 'H'}) and
                        not _string_at((current + 2), 2, {'HU'})):
                    # 'accident', 'accede' 'succeed'
                    if (((current == 1) and _get_at(current - 1) == 'A') or
                            _string_at((current - 1), 5,
                                       {'UCCEE', 'UCCES'})):
                        primary, secondary = _metaph_add('KS')
                    # 'bacci', 'bertucci', other italian
                    else:
                        primary, secondary = _metaph_add('X')
                    current += 3
                    continue
                else:  # Pierce's rule
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
                primary, secondary = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
                # Italian vs. English
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
                    primary, secondary = _metaph_add('S', 'X')
                else:
                    primary, secondary = _metaph_add('S')
                current += 2
                continue

            # else
            else:
                primary, secondary = _metaph_add('K')

                # name sent in 'mac caffrey', 'mac gregor
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
                    current += 3
                elif (_string_at((current + 1), 1, {'C', 'K', 'Q'}) and
                      not _string_at((current + 1), 2, {'CE', 'CI'})):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'D':
            if _string_at(current, 2, {'DG'}):
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    # e.g. 'edge'
                    primary, secondary = _metaph_add('J')
                    current += 3
                    continue
                else:
                    # e.g. 'edgar'
                    primary, secondary = _metaph_add('TK')
                    current += 2
                    continue

            elif _string_at(current, 2, {'DT', 'DD'}):
                primary, secondary = _metaph_add('T')
                current += 2
                continue

            # else
            else:
                primary, secondary = _metaph_add('T')
                current += 1
                continue

        elif _get_at(current) == 'F':
            if _get_at(current + 1) == 'F':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('F')
            continue

        elif _get_at(current) == 'G':
            if _get_at(current + 1) == 'H':
                if (current > 0) and not _is_vowel(current - 1):
                    primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # 'ghislane', ghiradelli
                elif current == 0:
                    if _get_at(current + 2) == 'I':
                        primary, secondary = _metaph_add('J')
                    else:
                        primary, secondary = _metaph_add('K')
                    current += 2
                    continue

                # Parker's rule (with some further refinements) - e.g., 'hugh'
                elif (((current > 1) and
                       _string_at((current - 2), 1, {'B', 'H', 'D'})) or
                      # e.g., 'bough'
                      ((current > 2) and
                       _string_at((current - 3), 1, {'B', 'H', 'D'})) or
                      # e.g., 'broughton'
                      ((current > 3) and
                       _string_at((current - 4), 1, {'B', 'H'}))):
                    current += 2
                    continue
                else:
                    # e.g. 'laugh', 'McLaughlin', 'cough',
                    # 'gough', 'rough', 'tough'
                    if ((current > 2) and
                            (_get_at(current - 1) == 'U') and
                            (_string_at((current - 3), 1,
                                        {'C', 'G', 'L', 'R', 'T'}))):
                        primary, secondary = _metaph_add('F')
                    elif (current > 0) and _get_at(current - 1) != 'I':
                        primary, secondary = _metaph_add('K')
                    current += 2
                    continue

            elif _get_at(current + 1) == 'N':
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
                    primary, secondary = _metaph_add('KN', 'N')
                # not e.g. 'cagney'
                elif (not _string_at((current + 2), 2, {'EY'}) and
                      (_get_at(current + 1) != 'Y') and
                      not _slavo_germanic()):
                    primary, secondary = _metaph_add('N', 'KN')
                else:
                    primary, secondary = _metaph_add('KN')
                current += 2
                continue

            # 'tagliaro'
            elif (_string_at((current + 1), 2, {'LI'}) and
                  not _slavo_germanic()):
                primary, secondary = _metaph_add('KL', 'L')
                current += 2
                continue

            # -ges-, -gep-, -gel-, -gie- at beginning
            elif ((current == 0) and
                  ((_get_at(current + 1) == 'Y') or
                   _string_at((current + 1), 2,
                              {'ES', 'EP', 'EB', 'EL', 'EY', 'IB', 'IL',
                               'IN', 'IE', 'EI', 'ER'}))):
                primary, secondary = _metaph_add('K', 'J')
                current += 2
                continue

            # -ger-, -gy-
            elif ((_string_at((current + 1), 2, {'ER'}) or
                   (_get_at(current + 1) == 'Y')) and
                  not _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and
                  not _string_at((current - 1), 1, {'E', 'I'}) and
                  not _string_at((current - 1), 3, {'RGY', 'OGY'})):
                primary, secondary = _metaph_add('K', 'J')
                current += 2
                continue

            # italian e.g, 'biaggi'
            elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or
                  _string_at((current - 1), 4, {'AGGI', 'OGGI'})):
                # obvious germanic
                if ((_string_at(0, 4, {'VAN ', 'VON '}) or
                     _string_at(0, 3, {'SCH'})) or
                        _string_at((current + 1), 2, {'ET'})):
                    primary, secondary = _metaph_add('K')
                elif _string_at((current + 1), 4, {'IER '}):
                    primary, secondary = _metaph_add('J')
                else:
                    primary, secondary = _metaph_add('J', 'K')
                current += 2
                continue

            else:
                if _get_at(current + 1) == 'G':
                    current += 2
                else:
                    current += 1
                primary, secondary = _metaph_add('K')
                continue

        elif _get_at(current) == 'H':
            # only keep if first & before vowel or btw. 2 vowels
            if (((current == 0) or _is_vowel(current - 1)) and
                    _is_vowel(current + 1)):
                primary, secondary = _metaph_add('H')
                current += 2
            else:  # also takes care of 'HH'
                current += 1
            continue

        elif _get_at(current) == 'J':
            # obvious spanish, 'jose', 'san jacinto'
            if (_string_at(current, 4, {'JOSE'}) or
                    _string_at(0, 4, {'SAN '})):
                if (((current == 0) and (_get_at(current + 4) == ' ')) or
                        _string_at(0, 4, {'SAN '})):
                    primary, secondary = _metaph_add('H')
                else:
                    primary, secondary = _metaph_add('J', 'H')
                current += 1
                continue

            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
                # Yankelovich/Jankelowicz
                primary, secondary = _metaph_add('J', 'A')
            # Spanish pron. of e.g. 'bajador'
            elif (_is_vowel(current - 1) and
                  not _slavo_germanic() and
                  ((_get_at(current + 1) == 'A') or
                   (_get_at(current + 1) == 'O'))):
                primary, secondary = _metaph_add('J', 'H')
            elif current == last:
                # ' ' as secondary: add nothing to the secondary code
                primary, secondary = _metaph_add('J', ' ')
            elif (not _string_at((current + 1), 1,
                                 {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and
                  not _string_at((current - 1), 1, {'S', 'K', 'L'})):
                primary, secondary = _metaph_add('J')

            if _get_at(current + 1) == 'J':  # it could happen!
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'K':
            if _get_at(current + 1) == 'K':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('K')
            continue

        elif _get_at(current) == 'L':
            if _get_at(current + 1) == 'L':
                # Spanish e.g. 'cabrillo', 'gallegos'
                if (((current == (length - 3)) and
                     _string_at((current - 1), 4,
                                {'ILLO', 'ILLA', 'ALLE'})) or
                        ((_string_at((last - 1), 2, {'AS', 'OS'}) or
                          _string_at(last, 1, {'A', 'O'})) and
                         _string_at((current - 1), 4, {'ALLE'}))):
                    primary, secondary = _metaph_add('L', ' ')
                    current += 2
                    continue
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('L')
            continue

        elif _get_at(current) == 'M':
            if ((_string_at((current - 1), 3, {'UMB'}) and
                 (((current + 1) == last) or
                  _string_at((current + 2), 2, {'ER'}))) or
                    # 'dumb', 'thumb'
                    (_get_at(current + 1) == 'M')):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('M')
            continue

        elif _get_at(current) == 'N':
            if _get_at(current + 1) == 'N':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('N')
            continue

        elif _get_at(current) == 'Ñ':
            current += 1
            primary, secondary = _metaph_add('N')
            continue

        elif _get_at(current) == 'P':
            if _get_at(current + 1) == 'H':
                primary, secondary = _metaph_add('F')
                current += 2
                continue

            # also account for "campbell", "raspberry"
            elif _string_at((current + 1), 1, {'P', 'B'}):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('P')
            continue

        elif _get_at(current) == 'Q':
            if _get_at(current + 1) == 'Q':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('K')
            continue

        elif _get_at(current) == 'R':
            # french e.g. 'rogier', but exclude 'hochmeier'
            if ((current == last) and
                    not _slavo_germanic() and
                    _string_at((current - 2), 2, {'IE'}) and
                    not _string_at((current - 4), 2, {'ME', 'MA'})):
                primary, secondary = _metaph_add('', 'R')
            else:
                primary, secondary = _metaph_add('R')

            if _get_at(current + 1) == 'R':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'S':
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
                current += 1
                continue

            # special case 'sugar-'
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
                primary, secondary = _metaph_add('X', 'S')
                current += 1
                continue

            elif _string_at(current, 2, {'SH'}):
                # Germanic
                if _string_at((current + 1), 4,
                              {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}):
                    primary, secondary = _metaph_add('S')
                else:
                    primary, secondary = _metaph_add('X')
                current += 2
                continue

            # Italian & Armenian
            elif (_string_at(current, 3, {'SIO', 'SIA'}) or
                  _string_at(current, 4, {'SIAN'})):
                if not _slavo_germanic():
                    primary, secondary = _metaph_add('S', 'X')
                else:
                    primary, secondary = _metaph_add('S')
                current += 3
                continue

            # German & anglicisations, e.g. 'smith' match 'schmidt',
            # 'snider' match 'schneider'
            # also, -sz- in Slavic language although in Hungarian it is
            # pronounced 's'
            elif (((current == 0) and
                   _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or
                  _string_at((current + 1), 1, {'Z'})):
                primary, secondary = _metaph_add('S', 'X')
                if _string_at((current + 1), 1, {'Z'}):
                    current += 2
                else:
                    current += 1
                continue

            elif _string_at(current, 2, {'SC'}):
                # Schlesinger's rule
                if _get_at(current + 2) == 'H':
                    # dutch origin, e.g. 'school', 'schooner'
                    if _string_at((current + 3), 2,
                                  {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}):
                        # 'schermerhorn', 'schenker'
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
                            primary, secondary = _metaph_add('X', 'SK')
                        else:
                            primary, secondary = _metaph_add('SK')
                        current += 3
                        continue
                    else:
                        if ((current == 0) and not _is_vowel(3) and
                                (_get_at(3) != 'W')):
                            primary, secondary = _metaph_add('X', 'S')
                        else:
                            primary, secondary = _metaph_add('X')
                        current += 3
                        continue

                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    primary, secondary = _metaph_add('S')
                    current += 3
                    continue

                # else
                else:
                    primary, secondary = _metaph_add('SK')
                    current += 3
                    continue

            else:
                # french e.g. 'resnais', 'artois'
                if (current == last) and _string_at((current - 2), 2,
                                                    {'AI', 'OI'}):
                    primary, secondary = _metaph_add('', 'S')
                else:
                    primary, secondary = _metaph_add('S')

                if _string_at((current + 1), 1, {'S', 'Z'}):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'T':
            if _string_at(current, 4, {'TION'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            elif _string_at(current, 3, {'TIA', 'TCH'}):
                primary, secondary = _metaph_add('X')
                current += 3
                continue

            elif (_string_at(current, 2, {'TH'}) or
                  _string_at(current, 3, {'TTH'})):
                # special case 'thomas', 'thames' or germanic
                if (_string_at((current + 2), 2, {'OM', 'AM'}) or
                        _string_at(0, 4, {'VAN ', 'VON '}) or
                        _string_at(0, 3, {'SCH'})):
                    primary, secondary = _metaph_add('T')
                else:
                    primary, secondary = _metaph_add('0', 'T')
                current += 2
                continue

            elif _string_at((current + 1), 1, {'T', 'D'}):
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('T')
            continue

        elif _get_at(current) == 'V':
            if _get_at(current + 1) == 'V':
                current += 2
            else:
                current += 1
            primary, secondary = _metaph_add('F')
            continue

        elif _get_at(current) == 'W':
            # can also be in middle of word
            if _string_at(current, 2, {'WR'}):
                primary, secondary = _metaph_add('R')
                current += 2
                continue
            elif ((current == 0) and
                  (_is_vowel(current + 1) or
                   _string_at(current, 2, {'WH'}))):
                # Wasserman should match Vasserman
                if _is_vowel(current + 1):
                    primary, secondary = _metaph_add('A', 'F')
                else:
                    # need Uomo to match Womo
                    primary, secondary = _metaph_add('A')

            # Arnow should match Arnoff
            if (((current == last) and _is_vowel(current - 1)) or
                    _string_at((current - 1), 5,
                               {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or
                    _string_at(0, 3, {'SCH'})):
                primary, secondary = _metaph_add('', 'F')
                current += 1
                continue
            # Polish e.g. 'filipowicz'
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
                primary, secondary = _metaph_add('TS', 'FX')
                current += 4
                continue
            # else skip it
            else:
                current += 1
                continue

        elif _get_at(current) == 'X':
            # French e.g. breaux
            if not ((current == last) and
                    (_string_at((current - 3), 3, {'IAU', 'EAU'}) or
                     _string_at((current - 2), 2, {'AU', 'OU'}))):
                primary, secondary = _metaph_add('KS')

            if _string_at((current + 1), 1, {'C', 'X'}):
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Z':
            # Chinese Pinyin e.g. 'zhao'
            if _get_at(current + 1) == 'H':
                primary, secondary = _metaph_add('J')
                current += 2
                continue
            elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or
                  (_slavo_germanic() and ((current > 0) and
                                          _get_at(current - 1) != 'T'))):
                primary, secondary = _metaph_add('S', 'TS')
            else:
                primary, secondary = _metaph_add('S')

            if _get_at(current + 1) == 'Z':
                current += 2
            else:
                current += 1
            continue

        else:
            # Any other character contributes nothing to the code
            current += 1

    # max_length is always positive here (at least 4, or the default 64)
    primary = primary[:max_length]
    secondary = secondary[:max_length]
    if primary == secondary:
        secondary = ''

    return primary, secondary
||
| 919 | |||
| 924 |