DoubleMetaphone.__init__()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 18
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 18
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 2
nop 2
crap 2
1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.phonetic._double_metaphone.
18
19 1
Double Metaphone
20
"""
21
22
from typing import Set, Tuple
23
24 1
from ._phonetic import _Phonetic
25
26
__all__ = ['DoubleMetaphone']
27
28
29
class DoubleMetaphone(_Phonetic):
30
    """Double Metaphone.
31 1
32
    Based on Lawrence Philips' (Visual) C++ code from 1999
33 1
    :cite:`Philips:2000`.
34 1
35
    .. versionadded:: 0.3.6
36 1
    """
37
38
    def __init__(self, max_length: int = -1) -> None:
39 1
        """Initialize DoubleMetaphone instance.
40
41
        Parameters
42
        ----------
43
        max_length : int
44
            Maximum length of the returned Dolby code -- this also activates
45
            the fixed-length code mode if it is greater than 0
46
47
48 1
        .. versionadded:: 0.4.0
49
50
        """
51
        self._max_length = max_length
52
53
        # Require a max_length of at least 4
54
        if self._max_length != -1:
55
            self._max_length = max(4, max_length)
56
57
    def encode_alpha(self, word: str) -> str:
58
        """Return the alphabetic Double Metaphone code for a word.
59
60
        Parameters
61 1
        ----------
62
        word : str
63
            The word to transform
64 1
65 1
        Returns
66
        -------
67 1
        str
68
            The alphabetic Double Metaphone value(s)
69
70
        Examples
71
        --------
72
        >>> pe = DoubleMetaphone()
73
        >>> pe.encode_alpha('Christopher')
74
        'KRSTFR,'
75
        >>> pe.encode_alpha('Niall')
76
        'NL,'
77
        >>> pe.encode_alpha('Smith')
78
        'SMÞ,XMT'
79
        >>> pe.encode_alpha('Schmidt')
80
        'XMT,SMT'
81
82
83
        .. versionadded:: 0.4.0
84
        .. versionchanged:: 0.6.0
85
            Made return a str only (comma-separated)
86
87
        """
88
        return self.encode(word).replace('0', 'Þ')
89
90
    def encode(self, word: str) -> str:
91
        """Return the Double Metaphone code for a word.
92
93
        Parameters
94
        ----------
95
        word : str
96 1
            The word to transform
97
98 1
        Returns
99
        -------
100
        str
101
            The Double Metaphone value(s)
102
103
        Examples
104
        --------
105
        >>> pe = DoubleMetaphone()
106
        >>> pe.encode('Christopher')
107
        'KRSTFR,'
108
        >>> pe.encode('Niall')
109
        'NL,'
110
        >>> pe.encode('Smith')
111
        'SM0,XMT'
112
        >>> pe.encode('Schmidt')
113
        'XMT,SMT'
114
115
116
        .. versionadded:: 0.1.0
117
        .. versionchanged:: 0.3.6
118
            Encapsulated in class
119
        .. versionchanged:: 0.6.0
120
            Made return a str only (comma-separated)
121
122
        """
123
        primary = ''
124
        secondary = ''
125
126
        def _slavo_germanic() -> bool:
            """Return True if the word appears to be Slavic or Germanic.

            The word is considered Slavo-Germanic when it contains 'W', 'K',
            or 'CZ' anywhere.

            Returns
            -------
            bool
                True if the word appears to be Slavic or Germanic


            .. versionadded:: 0.1.0

            """
            return 'W' in word or 'K' in word or 'CZ' in word
140
141
        def _metaph_add(pri: str, sec: str = '') -> Tuple[str, str]:
            """Return a new metaphone tuple with the supplied elements.

            The current ``primary`` and ``secondary`` codes of the enclosing
            scope are read but not modified; the caller is expected to
            rebind them from the returned tuple.

            Parameters
            ----------
            pri : str
                The primary element
            sec : str
                The secondary element. If empty, ``pri`` is appended to the
                secondary code as well; a single space means that nothing is
                appended to the secondary code.

            Returns
            -------
            tuple
                A new metaphone tuple with the supplied elements


            .. versionadded:: 0.1.0

            """
            if sec == ' ':
                # A lone space is a placeholder: leave the secondary as is
                sec_addition = ''
            else:
                # No explicit secondary element: mirror the primary one
                sec_addition = sec or pri
            return primary + pri, secondary + sec_addition
169 1
170 1
        def _is_vowel(pos: int) -> bool:
            """Return True if the character at word[pos] is a vowel.

            'Y' is counted as a vowel. A negative position is never a vowel
            (it is not interpreted as an index from the end of the word).

            Parameters
            ----------
            pos : int
                Position in the word

            Returns
            -------
            bool
                True if the character is a vowel


            .. versionadded:: 0.1.0

            """
            return pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}
189
190
        def _get_at(pos: int) -> str:
            """Return the character at word[pos].

            No bounds checking is done here; look-ahead past the end of the
            original word relies on the padding appended to ``word`` by the
            enclosing function.

            Parameters
            ----------
            pos : int
                Position in the word

            Returns
            -------
            str
                Character at word[pos]


            .. versionadded:: 0.1.0

            """
            return word[pos]
207
208
        def _string_at(pos: int, slen: int, substrings: Set[str]) -> bool:
            """Return True if word[pos:pos+slen] is in substrings.

            A negative ``pos`` always yields False, so look-behind before the
            start of the word never matches (and never wraps around to the
            end of the word via negative slicing).

            Parameters
            ----------
            pos : int
                Position in the word
            slen : int
                Substring length
            substrings : set
                Substrings to search

            Returns
            -------
            bool
                True if word[pos:pos+slen] is in substrings


            .. versionadded:: 0.1.0

            """
            return pos >= 0 and word[pos : pos + slen] in substrings
231
232
        current = 0
233
        length = len(word)
234 1
        if length < 1:
235 1
            return ','
236 1
        last = length - 1
237
238 1
        word = word.upper()
239 1
240 1
        # Pad the original string so that we can index beyond the edge of the
241 1
        # world
242 1
        word += '     '
243
244 1
        # Skip these when at start of word
245 1
        if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
246
            current += 1
247
248
        # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
249 1
        if _get_at(0) == 'X':
250
            primary, secondary = _metaph_add('S')  # 'Z' maps to 'S'
251
            current += 1
252 1
253 1
        # Main loop
254
        while True:
255
            if current >= length:
256 1
                break
257 1
258 1
            if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
259
                if current == 0:
260
                    # All init vowels now map to 'A'
261 1
                    primary, secondary = _metaph_add('A')
262 1
                current += 1
263 1
                continue
264
265 1
            elif _get_at(current) == 'B':
266 1
                # "-mb", e.g", "dumb", already skipped over...
267
                primary, secondary = _metaph_add('P')
268 1
                if _get_at(current + 1) == 'B':
269 1
                    current += 2
270 1
                else:
271
                    current += 1
272 1
                continue
273
274 1
            elif _get_at(current) == 'Ç':
275 1
                primary, secondary = _metaph_add('S')
276 1
                current += 1
277
                continue
278 1
279 1
            elif _get_at(current) == 'C':
280
                # Various Germanic
281 1
                if (
282 1
                    current > 1
283 1
                    and not _is_vowel(current - 2)
284 1
                    and _string_at((current - 1), 3, {'ACH'})
285
                    and (
286 1
                        (_get_at(current + 2) != 'I')
287
                        and (
288 1
                            (_get_at(current + 2) != 'E')
289
                            or _string_at(
290
                                (current - 2), 6, {'BACHER', 'MACHER'}
291
                            )
292
                        )
293
                    )
294
                ):
295
                    primary, secondary = _metaph_add('K')
296
                    current += 2
297
                    continue
298
299
                # Special case 'caesar'
300
                elif current == 0 and _string_at(current, 6, {'CAESAR'}):
301
                    primary, secondary = _metaph_add('S')
302 1
                    current += 2
303 1
                    continue
304 1
305
                # Italian 'chianti'
306
                elif _string_at(current, 4, {'CHIA'}):
307 1
                    primary, secondary = _metaph_add('K')
308 1
                    current += 2
309 1
                    continue
310 1
311
                elif _string_at(current, 2, {'CH'}):
312
                    # Find 'Michael'
313 1
                    if current > 0 and _string_at(current, 4, {'CHAE'}):
314 1
                        primary, secondary = _metaph_add('K', 'X')
315 1
                        current += 2
316 1
                        continue
317
318 1
                    # Greek roots e.g. 'chemistry', 'chorus'
319
                    elif (
320 1
                        current == 0
321 1
                        and (
322 1
                            _string_at((current + 1), 5, {'HARAC', 'HARIS'})
323 1
                            or _string_at(
324
                                (current + 1), 3, {'HOR', 'HYM', 'HIA', 'HEM'}
325
                            )
326 1
                        )
327
                        and not _string_at(0, 5, {'CHORE'})
328
                    ):
329
                        primary, secondary = _metaph_add('K')
330
                        current += 2
331
                        continue
332
333
                    # Germanic, Greek, or otherwise 'ch' for 'kh' sound
334
                    elif (
335
                        (
336 1
                            _string_at(0, 4, {'VAN ', 'VON '})
337 1
                            or _string_at(0, 3, {'SCH'})
338 1
                        )
339
                        # 'architect but not 'arch', 'orchestra', 'orchid'
340
                        or _string_at(
341 1
                            (current - 2), 6, {'ORCHES', 'ARCHIT', 'ORCHID'}
342
                        )
343
                        or _string_at((current + 2), 1, {'T', 'S'})
344
                        or (
345
                            (
346
                                _string_at(
347
                                    (current - 1), 1, {'A', 'O', 'U', 'E'}
348
                                )
349
                                or (current == 0)
350
                            )
351
                            # e.g., 'wachtler', 'wechsler', but not 'tichner'
352
                            and _string_at(
353
                                (current + 2),
354
                                1,
355
                                {
356
                                    'L',
357
                                    'R',
358
                                    'N',
359
                                    'M',
360
                                    'B',
361
                                    'H',
362
                                    'F',
363
                                    'V',
364
                                    'W',
365
                                    ' ',
366
                                },
367
                            )
368
                        )
369
                    ):
370
                        primary, secondary = _metaph_add('K')
371
372
                    else:
373
                        if current > 0:
374
                            if _string_at(0, 2, {'MC'}):
375
                                # e.g., "McHugh"
376
                                primary, secondary = _metaph_add('K')
377
                            else:
378
                                primary, secondary = _metaph_add('X', 'K')
379 1
                        else:
380
                            primary, secondary = _metaph_add('X')
381
382 1
                    current += 2
383 1
                    continue
384
385 1
                # e.g, 'czerny'
386
                elif _string_at(current, 2, {'CZ'}) and not _string_at(
387 1
                    (current - 2), 4, {'WICZ'}
388
                ):
389 1
                    primary, secondary = _metaph_add('S', 'X')
390
                    current += 2
391 1
                    continue
392 1
393
                # e.g., 'focaccia'
394
                elif _string_at((current + 1), 3, {'CIA'}):
395 1
                    primary, secondary = _metaph_add('X')
396
                    current += 3
397
398 1
                # double 'C', but not if e.g. 'McClellan'
399 1
                elif _string_at(current, 2, {'CC'}) and not (
400 1
                    (current == 1) and (_get_at(0) == 'M')
401
                ):
402
                    # 'bellocchio' but not 'bacchus'
403 1
                    if _string_at(
404 1
                        (current + 2), 1, {'I', 'E', 'H'}
405 1
                    ) and not _string_at((current + 2), 2, {'HU'}):
406
                        # 'accident', 'accede' 'succeed'
407
                        if (
408 1
                            (current == 1) and _get_at(current - 1) == 'A'
409
                        ) or _string_at((current - 1), 5, {'UCCEE', 'UCCES'}):
410
                            primary, secondary = _metaph_add('KS')
411
                        # 'bacci', 'bertucci', other italian
412 1
                        else:
413
                            primary, secondary = _metaph_add('X')
414
                        current += 3
415
                        continue
416 1
                    else:  # Pierce's rule
417
                        primary, secondary = _metaph_add('K')
418
                        current += 2
419 1
                        continue
420
421
                elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
422 1
                    primary, secondary = _metaph_add('K')
423 1
                    current += 2
424 1
                    continue
425
426 1
                elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
427 1
                    # Italian vs. English
428 1
                    if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
429
                        primary, secondary = _metaph_add('S', 'X')
430 1
                    else:
431 1
                        primary, secondary = _metaph_add('S')
432 1
                    current += 2
433 1
                    continue
434
435 1
                # else
436
                else:
437 1
                    primary, secondary = _metaph_add('K')
438 1
439
                    # name sent in 'mac caffrey', 'mac gregor
440 1
                    if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
441 1
                        current += 3
442 1
                    elif _string_at(
443
                        (current + 1), 1, {'C', 'K', 'Q'}
444
                    ) and not _string_at((current + 1), 2, {'CE', 'CI'}):
445
                        current += 2
446 1
                    else:
447
                        current += 1
448
                    continue
449 1
450 1
            elif _get_at(current) == 'D':
451 1
                if _string_at(current, 2, {'DG'}):
452
                    if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
453
                        # e.g. 'edge'
454 1
                        primary, secondary = _metaph_add('J')
455
                        current += 3
456 1
                        continue
457 1
                    else:
458
                        # e.g. 'edgar'
459 1
                        primary, secondary = _metaph_add('TK')
460 1
                        current += 2
461 1
                        continue
462
463 1
                elif _string_at(current, 2, {'DT', 'DD'}):
464 1
                    primary, secondary = _metaph_add('T')
465 1
                    current += 2
466
                    continue
467
468 1
                # else
469 1
                else:
470 1
                    primary, secondary = _metaph_add('T')
471
                    current += 1
472 1
                    continue
473 1
474 1
            elif _get_at(current) == 'F':
475 1
                if _get_at(current + 1) == 'F':
476
                    current += 2
477
                else:
478
                    current += 1
479 1
                primary, secondary = _metaph_add('F')
480 1
                continue
481 1
482
            elif _get_at(current) == 'G':
483 1
                if _get_at(current + 1) == 'H':
484 1
                    if (current > 0) and not _is_vowel(current - 1):
485 1
                        primary, secondary = _metaph_add('K')
486
                        current += 2
487 1
                        continue
488 1
489 1
                    # 'ghislane', ghiradelli
490
                    elif current == 0:
491 1
                        if _get_at(current + 2) == 'I':
492 1
                            primary, secondary = _metaph_add('J')
493 1
                        else:
494 1
                            primary, secondary = _metaph_add('K')
495 1
                        current += 2
496 1
                        continue
497
498
                    # Parker's rule (with some further refinements) -
499 1
                    # e.g., 'hugh'
500 1
                    elif (
501 1
                        (
502
                            (current > 1)
503 1
                            and _string_at((current - 2), 1, {'B', 'H', 'D'})
504 1
                        )
505 1
                        # e.g., 'bough'
506
                        or (
507
                            (current > 2)
508
                            and _string_at((current - 3), 1, {'B', 'H', 'D'})
509 1
                        )
510
                        # e.g., 'broughton'
511
                        or (
512
                            (current > 3)
513
                            and _string_at((current - 4), 1, {'B', 'H'})
514
                        )
515
                    ):
516
                        current += 2
517
                        continue
518
                    else:
519
                        # e.g. 'laugh', 'McLaughlin', 'cough',
520
                        #      'gough', 'rough', 'tough'
521
                        if (
522
                            (current > 2)
523
                            and (_get_at(current - 1) == 'U')
524
                            and (
525
                                _string_at(
526
                                    (current - 3), 1, {'C', 'G', 'L', 'R', 'T'}
527 1
                                )
528 1
                            )
529
                        ):
530
                            primary, secondary = _metaph_add('F')
531
                        elif (current > 0) and _get_at(current - 1) != 'I':
532 1
                            primary, secondary = _metaph_add('K')
533
                        current += 2
534
                        continue
535
536
                elif _get_at(current + 1) == 'N':
537
                    if (
538
                        (current == 1)
539
                        and _is_vowel(0)
540
                        and not _slavo_germanic()
541 1
                    ):
542 1
                        primary, secondary = _metaph_add('KN', 'N')
543 1
                    # not e.g. 'cagney'
544 1
                    elif (
545 1
                        not _string_at((current + 2), 2, {'EY'})
546
                        and (_get_at(current + 1) != 'Y')
547 1
                        and not _slavo_germanic()
548 1
                    ):
549
                        primary, secondary = _metaph_add('N', 'KN')
550
                    else:
551
                        primary, secondary = _metaph_add('KN')
552
                    current += 2
553 1
                    continue
554
555 1
                # 'tagliaro'
556
                elif (
557
                    _string_at((current + 1), 2, {'LI'})
558
                    and not _slavo_germanic()
559
                ):
560 1
                    primary, secondary = _metaph_add('KL', 'L')
561
                    current += 2
562 1
                    continue
563 1
564 1
                # -ges-, -gep-, -gel-, -gie- at beginning
565
                elif (current == 0) and (
566
                    (_get_at(current + 1) == 'Y')
567 1
                    or _string_at(
568
                        (current + 1),
569
                        2,
570
                        {
571 1
                            'ES',
572 1
                            'EP',
573 1
                            'EB',
574
                            'EL',
575
                            'EY',
576 1
                            'IB',
577
                            'IL',
578
                            'IN',
579
                            'IE',
580
                            'EI',
581
                            'ER',
582
                        },
583
                    )
584
                ):
585
                    primary, secondary = _metaph_add('K', 'J')
586
                    current += 2
587
                    continue
588
589
                #  -ger-,  -gy-
590
                elif (
591
                    (
592
                        _string_at((current + 1), 2, {'ER'})
593
                        or (_get_at(current + 1) == 'Y')
594
                    )
595
                    and not _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'})
596 1
                    and not _string_at((current - 1), 1, {'E', 'I'})
597 1
                    and not _string_at((current - 1), 3, {'RGY', 'OGY'})
598 1
                ):
599
                    primary, secondary = _metaph_add('K', 'J')
600
                    current += 2
601 1
                    continue
602
603
                #  italian e.g, 'biaggi'
604
                elif _string_at(
605
                    (current + 1), 1, {'E', 'I', 'Y'}
606
                ) or _string_at((current - 1), 4, {'AGGI', 'OGGI'}):
607
                    # obvious germanic
608
                    if (
609
                        _string_at(0, 4, {'VAN ', 'VON '})
610 1
                        or _string_at(0, 3, {'SCH'})
611 1
                    ) or _string_at((current + 1), 2, {'ET'}):
612 1
                        primary, secondary = _metaph_add('K')
613
                    elif _string_at((current + 1), 4, {'IER '}):
614
                        primary, secondary = _metaph_add('J')
615 1
                    else:
616
                        primary, secondary = _metaph_add('J', 'K')
617
                    current += 2
618
                    continue
619 1
620
                else:
621
                    if _get_at(current + 1) == 'G':
622
                        current += 2
623 1
                    else:
624 1
                        current += 1
625 1
                    primary, secondary = _metaph_add('K')
626
                    continue
627 1
628 1
            elif _get_at(current) == 'H':
629 1
                # only keep if first & before vowel or btw. 2 vowels
630
                if ((current == 0) or _is_vowel(current - 1)) and _is_vowel(
631
                    current + 1
632 1
                ):
633 1
                    primary, secondary = _metaph_add('H')
634
                    current += 2
635 1
                else:  # also takes care of 'HH'
636 1
                    current += 1
637 1
                continue
638
639 1
            elif _get_at(current) == 'J':
640
                # obvious spanish, 'jose', 'san jacinto'
641 1
                if _string_at(current, 4, {'JOSE'}) or _string_at(
642
                    0, 4, {'SAN '}
643
                ):
644 1
                    if (
645 1
                        (current == 0) and (_get_at(current + 4) == ' ')
646
                    ) or _string_at(0, 4, {'SAN '}):
647 1
                        primary, secondary = _metaph_add('H')
648 1
                    else:
649
                        primary, secondary = _metaph_add('J', 'H')
650 1
                    current += 1
651
                    continue
652 1
653
                elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
654
                    # Yankelovich/Jankelowicz
655 1
                    primary, secondary = _metaph_add('J', 'A')
656
                # Spanish pron. of e.g. 'bajador'
657
                elif (
658 1
                    _is_vowel(current - 1)
659
                    and not _slavo_germanic()
660 1
                    and (
661 1
                        (_get_at(current + 1) == 'A')
662 1
                        or (_get_at(current + 1) == 'O')
663
                    )
664 1
                ):
665
                    primary, secondary = _metaph_add('J', 'H')
666 1
                elif current == last:
667
                    primary, secondary = _metaph_add('J', ' ')
668 1
                elif not _string_at(
669
                    (current + 1), 1, {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}
670
                ) and not _string_at((current - 1), 1, {'S', 'K', 'L'}):
671
                    primary, secondary = _metaph_add('J')
672
673
                if _get_at(current + 1) == 'J':  # it could happen!
674
                    current += 2
675
                else:
676 1
                    current += 1
677 1
                continue
678 1
679 1
            elif _get_at(current) == 'K':
680
                if _get_at(current + 1) == 'K':
681
                    current += 2
682 1
                else:
683
                    current += 1
684 1
                primary, secondary = _metaph_add('K')
685 1
                continue
686
687 1
            elif _get_at(current) == 'L':
688 1
                if _get_at(current + 1) == 'L':
689
                    # Spanish e.g. 'cabrillo', 'gallegos'
690 1
                    if (
691 1
                        (current == (length - 3))
692 1
                        and _string_at(
693
                            (current - 1), 4, {'ILLO', 'ILLA', 'ALLE'}
694 1
                        )
695 1
                    ) or (
696 1
                        (
697
                            _string_at((last - 1), 2, {'AS', 'OS'})
698 1
                            or _string_at(last, 1, {'A', 'O'})
699 1
                        )
700
                        and _string_at((current - 1), 4, {'ALLE'})
701 1
                    ):
702
                        primary, secondary = _metaph_add('L', ' ')
703
                        current += 2
704
                        continue
705
                    current += 2
706
                else:
707
                    current += 1
708
                primary, secondary = _metaph_add('L')
709
                continue
710
711
            elif _get_at(current) == 'M':
712
                if (
713 1
                    (
714 1
                        _string_at((current - 1), 3, {'UMB'})
715 1
                        and (
716 1
                            ((current + 1) == last)
717
                            or _string_at((current + 2), 2, {'ER'})
718 1
                        )
719 1
                    )
720 1
                    # 'dumb', 'thumb'
721
                    or (_get_at(current + 1) == 'M')
722 1
                ):
723 1
                    current += 2
724
                else:
725
                    current += 1
726
                primary, secondary = _metaph_add('M')
727
                continue
728
729
            elif _get_at(current) == 'N':
730
                if _get_at(current + 1) == 'N':
731
                    current += 2
732
                else:
733
                    current += 1
734
                primary, secondary = _metaph_add('N')
735 1
                continue
736
737 1
            elif _get_at(current) == 'Ñ':
738 1
                current += 1
739 1
                primary, secondary = _metaph_add('N')
740
                continue
741 1
742 1
            elif _get_at(current) == 'P':
743 1
                if _get_at(current + 1) == 'H':
744
                    primary, secondary = _metaph_add('F')
745 1
                    current += 2
746 1
                    continue
747 1
748
                # also account for "campbell", "raspberry"
749 1
                elif _string_at((current + 1), 1, {'P', 'B'}):
750 1
                    current += 2
751 1
                else:
752 1
                    current += 1
753
                primary, secondary = _metaph_add('P')
754 1
                continue
755 1
756 1
            elif _get_at(current) == 'Q':
757 1
                if _get_at(current + 1) == 'Q':
758 1
                    current += 2
759
                else:
760
                    current += 1
761 1
                primary, secondary = _metaph_add('K')
762 1
                continue
763
764 1
            elif _get_at(current) == 'R':
765 1
                # french e.g. 'rogier', but exclude 'hochmeier'
766 1
                if (
767
                    (current == last)
768 1
                    and not _slavo_germanic()
769 1
                    and _string_at((current - 2), 2, {'IE'})
770 1
                    and not _string_at((current - 4), 2, {'ME', 'MA'})
771
                ):
772 1
                    primary, secondary = _metaph_add('', 'R')
773 1
                else:
774 1
                    primary, secondary = _metaph_add('R')
775
776 1
                if _get_at(current + 1) == 'R':
777
                    current += 2
778 1
                else:
779
                    current += 1
780
                continue
781
782
            elif _get_at(current) == 'S':
783
                # special cases 'island', 'isle', 'carlisle', 'carlysle'
784 1
                if _string_at((current - 1), 3, {'ISL', 'YSL'}):
785
                    current += 1
786 1
                    continue
787
788 1
                # special case 'sugar-'
789 1
                elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
790
                    primary, secondary = _metaph_add('X', 'S')
791 1
                    current += 1
792 1
                    continue
793
794 1
                elif _string_at(current, 2, {'SH'}):
795
                    # Germanic
796 1
                    if _string_at(
797 1
                        (current + 1), 4, {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}
798 1
                    ):
799
                        primary, secondary = _metaph_add('S')
800
                    else:
801 1
                        primary, secondary = _metaph_add('X')
802 1
                    current += 2
803 1
                    continue
804 1
805
                # Italian & Armenian
806 1
                elif _string_at(current, 3, {'SIO', 'SIA'}) or _string_at(
807
                    current, 4, {'SIAN'}
808 1
                ):
809
                    if not _slavo_germanic():
810
                        primary, secondary = _metaph_add('S', 'X')
811 1
                    else:
812
                        primary, secondary = _metaph_add('S')
813 1
                    current += 3
814 1
                    continue
815 1
816
                # German & anglicisations, e.g. 'smith' match 'schmidt',
817
                #                               'snider' match 'schneider'
818 1
                # also, -sz- in Slavic language although in Hungarian it is
819
                #       pronounced 's'
820
                elif (
821 1
                    (current == 0)
822 1
                    and _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})
823
                ) or _string_at((current + 1), 1, {'Z'}):
824 1
                    primary, secondary = _metaph_add('S', 'X')
825 1
                    if _string_at((current + 1), 1, {'Z'}):
826 1
                        current += 2
827
                    else:
828
                        current += 1
829
                    continue
830
831
                elif _string_at(current, 2, {'SC'}):
832 1
                    # Schlesinger's rule
833
                    if _get_at(current + 2) == 'H':
834
                        # dutch origin, e.g. 'school', 'schooner'
835
                        if _string_at(
836 1
                            (current + 3),
837 1
                            2,
838 1
                            {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'},
839
                        ):
840 1
                            # 'schermerhorn', 'schenker'
841 1
                            if _string_at((current + 3), 2, {'ER', 'EN'}):
842
                                primary, secondary = _metaph_add('X', 'SK')
843 1
                            else:
844
                                primary, secondary = _metaph_add('SK')
845 1
                            current += 3
846
                            continue
847 1
                        else:
848
                            if (
849
                                (current == 0)
850
                                and not _is_vowel(3)
851
                                and (_get_at(3) != 'W')
852
                            ):
853 1
                                primary, secondary = _metaph_add('X', 'S')
854 1
                            else:
855
                                primary, secondary = _metaph_add('X')
856 1
                            current += 3
857 1
                            continue
858 1
859
                    elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
860 1
                        primary, secondary = _metaph_add('S')
861
                        current += 3
862
                        continue
863
864
                    # else
865 1
                    else:
866
                        primary, secondary = _metaph_add('SK')
867 1
                        current += 3
868 1
                        continue
869 1
870
                else:
871 1
                    # french e.g. 'resnais', 'artois'
872 1
                    if (current == last) and _string_at(
873 1
                        (current - 2), 2, {'AI', 'OI'}
874 1
                    ):
875
                        primary, secondary = _metaph_add('', 'S')
876
                    else:
877
                        primary, secondary = _metaph_add('S')
878 1
879 1
                    if _string_at((current + 1), 1, {'S', 'Z'}):
880 1
                        current += 2
881
                    else:
882
                        current += 1
883
                    continue
884 1
885
            elif _get_at(current) == 'T':
886
                if _string_at(current, 4, {'TION'}):
887 1
                    primary, secondary = _metaph_add('X')
888
                    current += 3
889 1
                    continue
890
891 1
                elif _string_at(current, 3, {'TIA', 'TCH'}):
892 1
                    primary, secondary = _metaph_add('X')
893
                    current += 3
894 1
                    continue
895 1
896
                elif _string_at(current, 2, {'TH'}) or _string_at(
897 1
                    current, 3, {'TTH'}
898 1
                ):
899 1
                    # special case 'thomas', 'thames' or germanic
900 1
                    if (
901 1
                        _string_at((current + 2), 2, {'OM', 'AM'})
902
                        or _string_at(0, 4, {'VAN ', 'VON '})
903 1
                        or _string_at(0, 3, {'SCH'})
904 1
                    ):
905 1
                        primary, secondary = _metaph_add('T')
906 1
                    else:
907
                        primary, secondary = _metaph_add('0', 'T')
908 1
                    current += 2
909
                    continue
910
911
                elif _string_at((current + 1), 1, {'T', 'D'}):
912 1
                    current += 2
913
                else:
914
                    current += 1
915
                primary, secondary = _metaph_add('T')
916
                continue
917 1
918
            elif _get_at(current) == 'V':
919 1
                if _get_at(current + 1) == 'V':
920 1
                    current += 2
921 1
                else:
922
                    current += 1
923 1
                primary, secondary = _metaph_add('F')
924 1
                continue
925
926 1
            elif _get_at(current) == 'W':
927 1
                # can also be in middle of word
928 1
                if _string_at(current, 2, {'WR'}):
929
                    primary, secondary = _metaph_add('R')
930 1
                    current += 2
931 1
                    continue
932 1
                elif (current == 0) and (
933
                    _is_vowel(current + 1) or _string_at(current, 2, {'WH'})
934 1
                ):
935 1
                    # Wasserman should match Vasserman
936 1
                    if _is_vowel(current + 1):
937
                        primary, secondary = _metaph_add('A', 'F')
938 1
                    else:
939
                        # need Uomo to match Womo
940 1
                        primary, secondary = _metaph_add('A')
941 1
942 1
                # Arnow should match Arnoff
943 1
                if (
944 1
                    ((current == last) and _is_vowel(current - 1))
945
                    or _string_at(
946
                        (current - 1), 5, {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}
947
                    )
948 1
                    or _string_at(0, 3, {'SCH'})
949 1
                ):
950
                    primary, secondary = _metaph_add('', 'F')
951
                    current += 1
952 1
                    continue
953
                # Polish e.g. 'filipowicz'
954
                elif _string_at(current, 4, {'WICZ', 'WITZ'}):
955 1
                    primary, secondary = _metaph_add('TS', 'FX')
956
                    current += 4
957
                    continue
958
                # else skip it
959
                else:
960
                    current += 1
961
                    continue
962 1
963 1
            elif _get_at(current) == 'X':
964 1
                # French e.g. breaux
965
                if not (
966 1
                    (current == last)
967 1
                    and (
968 1
                        _string_at((current - 3), 3, {'IAU', 'EAU'})
969 1
                        or _string_at((current - 2), 2, {'AU', 'OU'})
970
                    )
971
                ):
972 1
                    primary, secondary = _metaph_add('KS')
973 1
974
                if _string_at((current + 1), 1, {'C', 'X'}):
975 1
                    current += 2
976
                else:
977 1
                    current += 1
978
                continue
979
980
            elif _get_at(current) == 'Z':
981
                # Chinese Pinyin e.g. 'zhao'
982
                if _get_at(current + 1) == 'H':
983
                    primary, secondary = _metaph_add('J')
984 1
                    current += 2
985
                    continue
986 1
                elif _string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or (
987 1
                    _slavo_germanic()
988
                    and ((current > 0) and _get_at(current - 1) != 'T')
989 1
                ):
990 1
                    primary, secondary = _metaph_add('S', 'TS')
991
                else:
992 1
                    primary, secondary = _metaph_add('S')
993
994 1
                if _get_at(current + 1) == 'Z':
995 1
                    current += 2
996 1
                else:
997 1
                    current += 1
998 1
                continue
999
1000
            else:
1001
                current += 1
1002 1
1003
        if self._max_length > 0:
1004 1
            primary = primary[: self._max_length]
1005
            secondary = secondary[: self._max_length]
1006 1
        if primary == secondary:
1007 1
            secondary = ''
1008
1009 1
        return ','.join((primary, secondary))
1010 1
1011
1012
if __name__ == '__main__':
    # Imported inside the guard so that doctest is only loaded when this
    # file is executed directly as a script, never on a normal import.
    import doctest

    # Run the doctest examples embedded in this module's docstrings.
    doctest.testmod()
1016