Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._double_metaphone   F

Complexity

Total Complexity 220

Size/Duplication

Total Lines 998
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 563
dl 0
loc 998
ccs 413
cts 413
cp 1
rs 2
c 0
b 0
f 0
wmc 220

1 Function

Rating   Name   Duplication   Size   Complexity  
A double_metaphone() 0 31 1

1 Method

Rating   Name   Duplication   Size   Complexity  
F DoubleMetaphone.encode() 0 916 219

How to fix   Complexity   

Complexity

Complex classes like abydos.phonetic._double_metaphone often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._double_metaphone.
20
21
Double Metaphone
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._phonetic import _Phonetic
32
33 1
__all__ = ['DoubleMetaphone', 'double_metaphone']
34
35
36 1
class DoubleMetaphone(_Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
37
    """Double Metaphone.
38
39
    Based on Lawrence Philips' (Visual) C++ code from 1999
40
    :cite:`Philips:2000`.
41
    """
42
43 1
    def encode(self, word, max_length=-1):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'encode' method
Loading history...
44
        """Return the Double Metaphone code for a word.
45
46
        Parameters
47
        ----------
48
        word : str
49
            The word to transform
50
        max_length : int
51
            The maximum length of the returned Double Metaphone codes (defaults
52
            to unlimited, but in Philips' original implementation this was 4)
53
54
        Returns
55
        -------
56
        tuple
57
            The Double Metaphone value(s)
58
59
        Examples
60
        --------
61
        >>> pe = DoubleMetaphone()
62
        >>> pe.encode('Christopher')
63
        ('KRSTFR', '')
64
        >>> pe.encode('Niall')
65
        ('NL', '')
66
        >>> pe.encode('Smith')
67
        ('SM0', 'XMT')
68
        >>> pe.encode('Schmidt')
69
        ('XMT', 'SMT')
70
71
        """
72
        # Require a max_length of at least 4
73 1
        if max_length != -1:
74 1
            max_length = max(4, max_length)
75
76 1
        primary = ''
77 1
        secondary = ''
78
79 1
        def _slavo_germanic():
80
            """Return True if the word appears to be Slavic or Germanic.
81
82
            Returns
83
            -------
84
            bool
85
                True if the word appears to be Slavic or Germanic
86
87
            """
88 1
            if 'W' in word or 'K' in word or 'CZ' in word:
89 1
                return True
90 1
            return False
91
92 1
        def _metaph_add(pri, sec=''):
93
            """Return a new metaphone tuple with the supplied elements.
94
95
            Parameters
96
            ----------
97
            pri : str
98
                The primary element
99
            sec : str
100
                The secondary element
101
102
            Returns
103
            -------
104
            tuple
105
                A new metaphone tuple with the supplied elements
106
107
            """
108 1
            newpri = primary
109 1
            newsec = secondary
110 1
            if pri:
111 1
                newpri += pri
112 1
            if sec:
113 1
                if sec != ' ':
114 1
                    newsec += sec
115
            else:
116 1
                newsec += pri
117 1
            return newpri, newsec
118
119 1
        def _is_vowel(pos):
120
            """Return True if the character at word[pos] is a vowel.
121
122
            Parameters
123
            ----------
124
            pos : int
125
                Position in the word
126
127
            Returns
128
            -------
129
            bool
130
                True if the character is a vowel
131
132
            """
133 1
            if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
134 1
                return True
135 1
            return False
136
137 1
        def _get_at(pos):
138
            """Return the character at word[pos].
139
140
            Parameters
141
            ----------
142
            pos : int
143
                Position in the word
144
145
            Returns
146
            -------
147
            str
148
                Character at word[pos]
149
150
            """
151 1
            return word[pos]
152
153 1
        def _string_at(pos, slen, substrings):
154
            """Return True if word[pos:pos+slen] is in substrings.
155
156
            Parameters
157
            ----------
158
            pos : int
159
                Position in the word
160
            slen : int
161
                Substring length
162
            substrings : set
163
                Substrings to search
164
165
            Returns
166
            -------
167
            bool
168
                True if word[pos:pos+slen] is in substrings
169
170
            """
171 1
            if pos < 0:
172 1
                return False
173 1
            return word[pos : pos + slen] in substrings
174
175 1
        current = 0
176 1
        length = len(word)
177 1
        if length < 1:
178 1
            return '', ''
179 1
        last = length - 1
180
181 1
        word = word.upper()
182 1
        word = word.replace('ß', 'SS')
183
184
        # Pad the original string so that we can index beyond the edge of the
185
        # world
186 1
        word += '     '
187
188
        # Skip these when at start of word
189 1
        if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
190 1
            current += 1
191
192
        # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
193 1
        if _get_at(0) == 'X':
194 1
            primary, secondary = _metaph_add('S')  # 'Z' maps to 'S'
195 1
            current += 1
196
197
        # Main loop
198 1
        while True:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
199 1
            if current >= length:
200 1
                break
201
202 1
            if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
203 1
                if current == 0:
204
                    # All init vowels now map to 'A'
205 1
                    primary, secondary = _metaph_add('A')
206 1
                current += 1
207 1
                continue
208
209 1
            elif _get_at(current) == 'B':
210
                # "-mb", e.g., "dumb", already skipped over...
211 1
                primary, secondary = _metaph_add('P')
212 1
                if _get_at(current + 1) == 'B':
213 1
                    current += 2
214
                else:
215 1
                    current += 1
216 1
                continue
217
218 1
            elif _get_at(current) == 'Ç':
219 1
                primary, secondary = _metaph_add('S')
220 1
                current += 1
221 1
                continue
222
223 1
            elif _get_at(current) == 'C':
224
                # Various Germanic
225 1
                if (
226
                    current > 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
227
                    and not _is_vowel(current - 2)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
228
                    and _string_at((current - 1), 3, {'ACH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
229
                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
230
                        (_get_at(current + 2) != 'I')
231
                        and (
232
                            (_get_at(current + 2) != 'E')
233
                            or _string_at(
234
                                (current - 2), 6, {'BACHER', 'MACHER'}
235
                            )
236
                        )
237
                    )
238
                ):
239 1
                    primary, secondary = _metaph_add('K')
240 1
                    current += 2
241 1
                    continue
242
243
                # Special case 'caesar'
244 1
                elif current == 0 and _string_at(current, 6, {'CAESAR'}):
245 1
                    primary, secondary = _metaph_add('S')
246 1
                    current += 2
247 1
                    continue
248
249
                # Italian 'chianti'
250 1
                elif _string_at(current, 4, {'CHIA'}):
251 1
                    primary, secondary = _metaph_add('K')
252 1
                    current += 2
253 1
                    continue
254
255 1
                elif _string_at(current, 2, {'CH'}):
256
                    # Find 'Michael'
257 1
                    if current > 0 and _string_at(current, 4, {'CHAE'}):
258 1
                        primary, secondary = _metaph_add('K', 'X')
259 1
                        current += 2
260 1
                        continue
261
262
                    # Greek roots e.g. 'chemistry', 'chorus'
263 1
                    elif (
264
                        current == 0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
265
                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
266
                            _string_at((current + 1), 5, {'HARAC', 'HARIS'})
267
                            or _string_at(
268
                                (current + 1), 3, {'HOR', 'HYM', 'HIA', 'HEM'}
269
                            )
270
                        )
271
                        and not _string_at(0, 5, {'CHORE'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
272
                    ):
273 1
                        primary, secondary = _metaph_add('K')
274 1
                        current += 2
275 1
                        continue
276
277
                    # Germanic, Greek, or otherwise 'ch' for 'kh' sound
278 1
                    elif (
279
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
280
                            _string_at(0, 4, {'VAN ', 'VON '})
281
                            or _string_at(0, 3, {'SCH'})
282
                        )
283
                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
284
                        # 'architect' (but not 'arch'), 'orchestra', 'orchid'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
285
                        _string_at(
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
286
                            (current - 2), 6, {'ORCHES', 'ARCHIT', 'ORCHID'}
287
                        )
288
                        or _string_at((current + 2), 1, {'T', 'S'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
289
                        or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
290
                            (
291
                                _string_at(
292
                                    (current - 1), 1, {'A', 'O', 'U', 'E'}
293
                                )
294
                                or (current == 0)
295
                            )
296
                            and
297
                            # e.g., 'wachtler', 'wechsler', but not 'tichner'
298
                            _string_at(
299
                                (current + 2),
300
                                1,
301
                                {
302
                                    'L',
303
                                    'R',
304
                                    'N',
305
                                    'M',
306
                                    'B',
307
                                    'H',
308
                                    'F',
309
                                    'V',
310
                                    'W',
311
                                    ' ',
312
                                },
313
                            )
314
                        )
315
                    ):
316 1
                        primary, secondary = _metaph_add('K')
317
318
                    else:
319 1
                        if current > 0:
320 1
                            if _string_at(0, 2, {'MC'}):
321
                                # e.g., "McHugh"
322 1
                                primary, secondary = _metaph_add('K')
323
                            else:
324 1
                                primary, secondary = _metaph_add('X', 'K')
325
                        else:
326 1
                            primary, secondary = _metaph_add('X')
327
328 1
                    current += 2
329 1
                    continue
330
331
                # e.g., 'czerny'
332 1
                elif _string_at(current, 2, {'CZ'}) and not _string_at(
333
                    (current - 2), 4, {'WICZ'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
334
                ):
335 1
                    primary, secondary = _metaph_add('S', 'X')
336 1
                    current += 2
337 1
                    continue
338
339
                # e.g., 'focaccia'
340 1
                elif _string_at((current + 1), 3, {'CIA'}):
341 1
                    primary, secondary = _metaph_add('X')
342 1
                    current += 3
343
344
                # double 'C', but not if e.g. 'McClellan'
345 1
                elif _string_at(current, 2, {'CC'}) and not (
346
                    (current == 1) and (_get_at(0) == 'M')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
347
                ):
348
                    # 'bellocchio' but not 'bacchus'
349 1
                    if _string_at(
350
                        (current + 2), 1, {'I', 'E', 'H'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
351
                    ) and not _string_at((current + 2), 2, {'HU'}):
352
                        # 'accident', 'accede', 'succeed'
353 1
                        if (
354
                            (current == 1) and _get_at(current - 1) == 'A'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
355
                        ) or _string_at((current - 1), 5, {'UCCEE', 'UCCES'}):
356 1
                            primary, secondary = _metaph_add('KS')
357
                        # 'bacci', 'bertucci', other italian
358
                        else:
359 1
                            primary, secondary = _metaph_add('X')
360 1
                        current += 3
361 1
                        continue
362
                    else:  # Pierce's rule
363 1
                        primary, secondary = _metaph_add('K')
364 1
                        current += 2
365 1
                        continue
366
367 1
                elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
368 1
                    primary, secondary = _metaph_add('K')
369 1
                    current += 2
370 1
                    continue
371
372 1
                elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
373
                    # Italian vs. English
374 1
                    if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
375 1
                        primary, secondary = _metaph_add('S', 'X')
376
                    else:
377 1
                        primary, secondary = _metaph_add('S')
378 1
                    current += 2
379 1
                    continue
380
381
                # else
382
                else:
383 1
                    primary, secondary = _metaph_add('K')
384
385
                    # name sent in 'mac caffrey', 'mac gregor'
386 1
                    if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
387 1
                        current += 3
388 1
                    elif _string_at(
389
                        (current + 1), 1, {'C', 'K', 'Q'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
390
                    ) and not _string_at((current + 1), 2, {'CE', 'CI'}):
391 1
                        current += 2
392
                    else:
393 1
                        current += 1
394 1
                    continue
395
396 1
            elif _get_at(current) == 'D':
397 1
                if _string_at(current, 2, {'DG'}):
398 1
                    if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
399
                        # e.g. 'edge'
400 1
                        primary, secondary = _metaph_add('J')
401 1
                        current += 3
402 1
                        continue
403
                    else:
404
                        # e.g. 'edgar'
405 1
                        primary, secondary = _metaph_add('TK')
406 1
                        current += 2
407 1
                        continue
408
409 1
                elif _string_at(current, 2, {'DT', 'DD'}):
410 1
                    primary, secondary = _metaph_add('T')
411 1
                    current += 2
412 1
                    continue
413
414
                # else
415
                else:
416 1
                    primary, secondary = _metaph_add('T')
417 1
                    current += 1
418 1
                    continue
419
420 1
            elif _get_at(current) == 'F':
421 1
                if _get_at(current + 1) == 'F':
422 1
                    current += 2
423
                else:
424 1
                    current += 1
425 1
                primary, secondary = _metaph_add('F')
426 1
                continue
427
428 1
            elif _get_at(current) == 'G':
429 1
                if _get_at(current + 1) == 'H':
430 1
                    if (current > 0) and not _is_vowel(current - 1):
431 1
                        primary, secondary = _metaph_add('K')
432 1
                        current += 2
433 1
                        continue
434
435
                    # 'ghislane', 'ghiradelli'
436 1
                    elif current == 0:
437 1
                        if _get_at(current + 2) == 'I':
438 1
                            primary, secondary = _metaph_add('J')
439
                        else:
440 1
                            primary, secondary = _metaph_add('K')
441 1
                        current += 2
442 1
                        continue
443
444
                    # Parker's rule (with some further refinements) -
445
                    # e.g., 'hugh'
446 1
                    elif (
447
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
448
                            (current > 1)
449
                            and _string_at((current - 2), 1, {'B', 'H', 'D'})
450
                        )
451
                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
452
                        # e.g., 'bough'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
453
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
454
                            (current > 2)
455
                            and _string_at((current - 3), 1, {'B', 'H', 'D'})
456
                        )
457
                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
458
                        # e.g., 'broughton'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
459
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
460
                            (current > 3)
461
                            and _string_at((current - 4), 1, {'B', 'H'})
462
                        )
463
                    ):
464 1
                        current += 2
465 1
                        continue
466
                    else:
467
                        # e.g. 'laugh', 'McLaughlin', 'cough',
468
                        #      'gough', 'rough', 'tough'
469 1
                        if (
470
                            (current > 2)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
471
                            and (_get_at(current - 1) == 'U')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
472
                            and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
473
                                _string_at(
474
                                    (current - 3), 1, {'C', 'G', 'L', 'R', 'T'}
475
                                )
476
                            )
477
                        ):
478 1
                            primary, secondary = _metaph_add('F')
479 1
                        elif (current > 0) and _get_at(current - 1) != 'I':
480 1
                            primary, secondary = _metaph_add('K')
481 1
                        current += 2
482 1
                        continue
483
484 1
                elif _get_at(current + 1) == 'N':
485 1
                    if (
486
                        (current == 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
487
                        and _is_vowel(0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
488
                        and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
489
                    ):
490 1
                        primary, secondary = _metaph_add('KN', 'N')
491
                    # not e.g. 'cagney'
492 1
                    elif (
493
                        not _string_at((current + 2), 2, {'EY'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
494
                        and (_get_at(current + 1) != 'Y')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
495
                        and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
496
                    ):
497 1
                        primary, secondary = _metaph_add('N', 'KN')
498
                    else:
499 1
                        primary, secondary = _metaph_add('KN')
500 1
                    current += 2
501 1
                    continue
502
503
                # 'tagliaro'
504 1
                elif (
505
                    _string_at((current + 1), 2, {'LI'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
506
                    and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
507
                ):
508 1
                    primary, secondary = _metaph_add('KL', 'L')
509 1
                    current += 2
510 1
                    continue
511
512
                # -ges-, -gep-, -gel-, -gie- at beginning
513 1
                elif (current == 0) and (
514
                    (_get_at(current + 1) == 'Y')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
515
                    or _string_at(
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
516
                        (current + 1),
517
                        2,
518
                        {
519
                            'ES',
520
                            'EP',
521
                            'EB',
522
                            'EL',
523
                            'EY',
524
                            'IB',
525
                            'IL',
526
                            'IN',
527
                            'IE',
528
                            'EI',
529
                            'ER',
530
                        },
531
                    )
532
                ):
533 1
                    primary, secondary = _metaph_add('K', 'J')
534 1
                    current += 2
535 1
                    continue
536
537
                #  -ger-,  -gy-
538 1
                elif (
539
                    (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
540
                        _string_at((current + 1), 2, {'ER'})
541
                        or (_get_at(current + 1) == 'Y')
542
                    )
543
                    and not _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
544
                    and not _string_at((current - 1), 1, {'E', 'I'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
545
                    and not _string_at((current - 1), 3, {'RGY', 'OGY'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
546
                ):
547 1
                    primary, secondary = _metaph_add('K', 'J')
548 1
                    current += 2
549 1
                    continue
550
551
                #  italian e.g, 'biaggi'
552 1
                elif _string_at(
553
                    (current + 1), 1, {'E', 'I', 'Y'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
554
                ) or _string_at((current - 1), 4, {'AGGI', 'OGGI'}):
555
                    # obvious germanic
556 1
                    if (
557
                        _string_at(0, 4, {'VAN ', 'VON '})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
558
                        or _string_at(0, 3, {'SCH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
559
                    ) or _string_at((current + 1), 2, {'ET'}):
560 1
                        primary, secondary = _metaph_add('K')
561 1
                    elif _string_at((current + 1), 4, {'IER '}):
562 1
                        primary, secondary = _metaph_add('J')
563
                    else:
564 1
                        primary, secondary = _metaph_add('J', 'K')
565 1
                    current += 2
566 1
                    continue
567
568
                else:
569 1
                    if _get_at(current + 1) == 'G':
570 1
                        current += 2
571
                    else:
572 1
                        current += 1
573 1
                    primary, secondary = _metaph_add('K')
574 1
                    continue
575
576 1
            elif _get_at(current) == 'H':
577
                # only keep if first & before vowel or btw. 2 vowels
578 1
                if ((current == 0) or _is_vowel(current - 1)) and _is_vowel(
579
                    current + 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
580
                ):
581 1
                    primary, secondary = _metaph_add('H')
582 1
                    current += 2
583
                else:  # also takes care of 'HH'
584 1
                    current += 1
585 1
                continue
586
587 1
            elif _get_at(current) == 'J':
588
                # obvious spanish, 'jose', 'san jacinto'
589 1
                if _string_at(current, 4, {'JOSE'}) or _string_at(
590
                    0, 4, {'SAN '}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
591
                ):
592 1
                    if (
593
                        (current == 0) and (_get_at(current + 4) == ' ')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
594
                    ) or _string_at(0, 4, {'SAN '}):
595 1
                        primary, secondary = _metaph_add('H')
596
                    else:
597 1
                        primary, secondary = _metaph_add('J', 'H')
598 1
                    current += 1
599 1
                    continue
600
601 1
                elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
602
                    # Yankelovich/Jankelowicz
603 1
                    primary, secondary = _metaph_add('J', 'A')
604
                # Spanish pron. of e.g. 'bajador'
605 1
                elif (
606
                    _is_vowel(current - 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
607
                    and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
608
                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
609
                        (_get_at(current + 1) == 'A')
610
                        or (_get_at(current + 1) == 'O')
611
                    )
612
                ):
613 1
                    primary, secondary = _metaph_add('J', 'H')
614 1
                elif current == last:
615 1
                    primary, secondary = _metaph_add('J', ' ')
616 1
                elif not _string_at(
617
                    (current + 1), 1, {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
618
                ) and not _string_at((current - 1), 1, {'S', 'K', 'L'}):
619 1
                    primary, secondary = _metaph_add('J')
620
621 1
                if _get_at(current + 1) == 'J':  # it could happen!
622 1
                    current += 2
623
                else:
624 1
                    current += 1
625 1
                continue
626
627 1
            elif _get_at(current) == 'K':
628 1
                if _get_at(current + 1) == 'K':
629 1
                    current += 2
630
                else:
631 1
                    current += 1
632 1
                primary, secondary = _metaph_add('K')
633 1
                continue
634
635 1
            elif _get_at(current) == 'L':
636 1
                if _get_at(current + 1) == 'L':
637
                    # Spanish e.g. 'cabrillo', 'gallegos'
638 1
                    if (
639
                        (current == (length - 3))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
640
                        and _string_at(
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
641
                            (current - 1), 4, {'ILLO', 'ILLA', 'ALLE'}
642
                        )
643
                    ) or (
644
                        (
645
                            _string_at((last - 1), 2, {'AS', 'OS'})
646
                            or _string_at(last, 1, {'A', 'O'})
647
                        )
648
                        and _string_at((current - 1), 4, {'ALLE'})
649
                    ):
650 1
                        primary, secondary = _metaph_add('L', ' ')
651 1
                        current += 2
652 1
                        continue
653 1
                    current += 2
654
                else:
655 1
                    current += 1
656 1
                primary, secondary = _metaph_add('L')
657 1
                continue
658
659 1
            elif _get_at(current) == 'M':
660 1
                if (
661
                    (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
662
                        _string_at((current - 1), 3, {'UMB'})
663
                        and (
664
                            ((current + 1) == last)
665
                            or _string_at((current + 2), 2, {'ER'})
666
                        )
667
                    )
668
                    or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
669
                    # 'dumb', 'thumb'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
670
                    (_get_at(current + 1) == 'M')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
671
                ):
672 1
                    current += 2
673
                else:
674 1
                    current += 1
675 1
                primary, secondary = _metaph_add('M')
676 1
                continue
677
678 1
            elif _get_at(current) == 'N':
679 1
                if _get_at(current + 1) == 'N':
680 1
                    current += 2
681
                else:
682 1
                    current += 1
683 1
                primary, secondary = _metaph_add('N')
684 1
                continue
685
686 1
            elif _get_at(current) == 'Ñ':
687 1
                current += 1
688 1
                primary, secondary = _metaph_add('N')
689 1
                continue
690
691 1
            elif _get_at(current) == 'P':
692 1
                if _get_at(current + 1) == 'H':
693 1
                    primary, secondary = _metaph_add('F')
694 1
                    current += 2
695 1
                    continue
696
697
                # also account for "campbell", "raspberry"
698 1
                elif _string_at((current + 1), 1, {'P', 'B'}):
699 1
                    current += 2
700
                else:
701 1
                    current += 1
702 1
                primary, secondary = _metaph_add('P')
703 1
                continue
704
705 1
            elif _get_at(current) == 'Q':
706 1
                if _get_at(current + 1) == 'Q':
707 1
                    current += 2
708
                else:
709 1
                    current += 1
710 1
                primary, secondary = _metaph_add('K')
711 1
                continue
712
713 1
            elif _get_at(current) == 'R':
714
                # french e.g. 'rogier', but exclude 'hochmeier'
715 1
                if (
716
                    (current == last)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
717
                    and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
718
                    and _string_at((current - 2), 2, {'IE'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
719
                    and not _string_at((current - 4), 2, {'ME', 'MA'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
720
                ):
721 1
                    primary, secondary = _metaph_add('', 'R')
722
                else:
723 1
                    primary, secondary = _metaph_add('R')
724
725 1
                if _get_at(current + 1) == 'R':
726 1
                    current += 2
727
                else:
728 1
                    current += 1
729 1
                continue
730
731 1
            elif _get_at(current) == 'S':
732
                # special cases 'island', 'isle', 'carlisle', 'carlysle'
733 1
                if _string_at((current - 1), 3, {'ISL', 'YSL'}):
734 1
                    current += 1
735 1
                    continue
736
737
                # special case 'sugar-'
738 1
                elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
739 1
                    primary, secondary = _metaph_add('X', 'S')
740 1
                    current += 1
741 1
                    continue
742
743 1
                elif _string_at(current, 2, {'SH'}):
744
                    # Germanic
745 1
                    if _string_at(
746
                        (current + 1), 4, {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
747
                    ):
748 1
                        primary, secondary = _metaph_add('S')
749
                    else:
750 1
                        primary, secondary = _metaph_add('X')
751 1
                    current += 2
752 1
                    continue
753
754
                # Italian & Armenian
755 1
                elif _string_at(current, 3, {'SIO', 'SIA'}) or _string_at(
756
                    current, 4, {'SIAN'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
757
                ):
758 1
                    if not _slavo_germanic():
759 1
                        primary, secondary = _metaph_add('S', 'X')
760
                    else:
761 1
                        primary, secondary = _metaph_add('S')
762 1
                    current += 3
763 1
                    continue
764
765
                # German & anglicisations, e.g. 'smith' match 'schmidt',
766
                #                               'snider' match 'schneider'
767
                # also, -sz- in Slavic language although in Hungarian it is
768
                #       pronounced 's'
769 1
                elif (
770
                    (current == 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
771
                    and _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
772
                ) or _string_at((current + 1), 1, {'Z'}):
773 1
                    primary, secondary = _metaph_add('S', 'X')
774 1
                    if _string_at((current + 1), 1, {'Z'}):
775 1
                        current += 2
776
                    else:
777 1
                        current += 1
778 1
                    continue
779
780 1
                elif _string_at(current, 2, {'SC'}):
781
                    # Schlesinger's rule
782 1
                    if _get_at(current + 2) == 'H':
783
                        # dutch origin, e.g. 'school', 'schooner'
784 1
                        if _string_at(
785
                            (current + 3),
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
786
                            2,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
787
                            {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'},
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
788
                        ):
789
                            # 'schermerhorn', 'schenker'
790 1
                            if _string_at((current + 3), 2, {'ER', 'EN'}):
791 1
                                primary, secondary = _metaph_add('X', 'SK')
792
                            else:
793 1
                                primary, secondary = _metaph_add('SK')
794 1
                            current += 3
795 1
                            continue
796
                        else:
797 1
                            if (
798
                                (current == 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
799
                                and not _is_vowel(3)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
800
                                and (_get_at(3) != 'W')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
801
                            ):
802 1
                                primary, secondary = _metaph_add('X', 'S')
803
                            else:
804 1
                                primary, secondary = _metaph_add('X')
805 1
                            current += 3
806 1
                            continue
807
808 1
                    elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
809 1
                        primary, secondary = _metaph_add('S')
810 1
                        current += 3
811 1
                        continue
812
813
                    # else
814
                    else:
815 1
                        primary, secondary = _metaph_add('SK')
816 1
                        current += 3
817 1
                        continue
818
819
                else:
820
                    # french e.g. 'resnais', 'artois'
821 1
                    if (current == last) and _string_at(
822
                        (current - 2), 2, {'AI', 'OI'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
823
                    ):
824 1
                        primary, secondary = _metaph_add('', 'S')
825
                    else:
826 1
                        primary, secondary = _metaph_add('S')
827
828 1
                    if _string_at((current + 1), 1, {'S', 'Z'}):
829 1
                        current += 2
830
                    else:
831 1
                        current += 1
832 1
                    continue
833
834 1
            elif _get_at(current) == 'T':
835 1
                if _string_at(current, 4, {'TION'}):
836 1
                    primary, secondary = _metaph_add('X')
837 1
                    current += 3
838 1
                    continue
839
840 1
                elif _string_at(current, 3, {'TIA', 'TCH'}):
841 1
                    primary, secondary = _metaph_add('X')
842 1
                    current += 3
843 1
                    continue
844
845 1
                elif _string_at(current, 2, {'TH'}) or _string_at(
846
                    current, 3, {'TTH'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
847
                ):
848
                    # special case 'thomas', 'thames' or germanic
849 1
                    if (
850
                        _string_at((current + 2), 2, {'OM', 'AM'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
851
                        or _string_at(0, 4, {'VAN ', 'VON '})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
852
                        or _string_at(0, 3, {'SCH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
853
                    ):
854 1
                        primary, secondary = _metaph_add('T')
855
                    else:
856 1
                        primary, secondary = _metaph_add('0', 'T')
857 1
                    current += 2
858 1
                    continue
859
860 1
                elif _string_at((current + 1), 1, {'T', 'D'}):
861 1
                    current += 2
862
                else:
863 1
                    current += 1
864 1
                primary, secondary = _metaph_add('T')
865 1
                continue
866
867 1
            elif _get_at(current) == 'V':
868 1
                if _get_at(current + 1) == 'V':
869 1
                    current += 2
870
                else:
871 1
                    current += 1
872 1
                primary, secondary = _metaph_add('F')
873 1
                continue
874
875 1
            elif _get_at(current) == 'W':
876
                # can also be in middle of word
877 1
                if _string_at(current, 2, {'WR'}):
878 1
                    primary, secondary = _metaph_add('R')
879 1
                    current += 2
880 1
                    continue
881 1
                elif (current == 0) and (
882
                    _is_vowel(current + 1) or _string_at(current, 2, {'WH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
883
                ):
884
                    # Wasserman should match Vasserman
885 1
                    if _is_vowel(current + 1):
886 1
                        primary, secondary = _metaph_add('A', 'F')
887
                    else:
888
                        # need Uomo to match Womo
889 1
                        primary, secondary = _metaph_add('A')
890
891
                # Arnow should match Arnoff
892 1
                if (
893
                    ((current == last) and _is_vowel(current - 1))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
894
                    or _string_at(
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
895
                        (current - 1), 5, {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}
896
                    )
897
                    or _string_at(0, 3, {'SCH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
898
                ):
899 1
                    primary, secondary = _metaph_add('', 'F')
900 1
                    current += 1
901 1
                    continue
902
                # Polish e.g. 'filipowicz'
903 1
                elif _string_at(current, 4, {'WICZ', 'WITZ'}):
904 1
                    primary, secondary = _metaph_add('TS', 'FX')
905 1
                    current += 4
906 1
                    continue
907
                # else skip it
908
                else:
909 1
                    current += 1
910 1
                    continue
911
912 1
            elif _get_at(current) == 'X':
913
                # French e.g. breaux
914 1
                if not (
915
                    (current == last)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
916
                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
917
                        _string_at((current - 3), 3, {'IAU', 'EAU'})
918
                        or _string_at((current - 2), 2, {'AU', 'OU'})
919
                    )
920
                ):
921 1
                    primary, secondary = _metaph_add('KS')
922
923 1
                if _string_at((current + 1), 1, {'C', 'X'}):
924 1
                    current += 2
925
                else:
926 1
                    current += 1
927 1
                continue
928
929 1
            elif _get_at(current) == 'Z':
930
                # Chinese Pinyin e.g. 'zhao'
931 1
                if _get_at(current + 1) == 'H':
932 1
                    primary, secondary = _metaph_add('J')
933 1
                    current += 2
934 1
                    continue
935 1
                elif _string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or (
936
                    _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
937
                    and ((current > 0) and _get_at(current - 1) != 'T')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
938
                ):
939 1
                    primary, secondary = _metaph_add('S', 'TS')
940
                else:
941 1
                    primary, secondary = _metaph_add('S')
942
943 1
                if _get_at(current + 1) == 'Z':
944 1
                    current += 2
945
                else:
946 1
                    current += 1
947 1
                continue
948
949
            else:
950 1
                current += 1
951
952 1
        if max_length > 0:
953 1
            primary = primary[:max_length]
954 1
            secondary = secondary[:max_length]
955 1
        if primary == secondary:
956 1
            secondary = ''
957
958 1
        return primary, secondary
959
960
961 1
def double_metaphone(word, max_length=-1):
    """Encode a word with the Double Metaphone algorithm.

    Convenience wrapper: builds a :py:class:`DoubleMetaphone` instance and
    delegates to :py:meth:`DoubleMetaphone.encode`.

    Parameters
    ----------
    word : str
        The word to encode
    max_length : int
        Upper bound on the length of each returned code. Values that are not
        positive leave the codes untruncated (the default is unlimited; in
        Philips' original implementation this was 4)

    Returns
    -------
    tuple
        The primary and secondary Double Metaphone codes; the secondary code
        is the empty string when it is identical to the primary one

    Examples
    --------
    >>> double_metaphone('Christopher')
    ('KRSTFR', '')
    >>> double_metaphone('Niall')
    ('NL', '')
    >>> double_metaphone('Smith')
    ('SM0', 'XMT')
    >>> double_metaphone('Schmidt')
    ('XMT', 'SMT')

    """
    encoder = DoubleMetaphone()
    return encoder.encode(word, max_length)
992
993
994
if __name__ == '__main__':
    # Script entry point: run the doctest examples embedded in this module's
    # docstrings. The import stays local so that merely importing the module
    # never pulls in doctest.
    import doctest

    doctest.testmod()
998