Completed
Pull Request — master (#141)
by Chris
11:42
created

double_metaphone()   A

Complexity

Conditions 1

Size

Total Lines 26
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 2
dl 0
loc 26
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
0 ignored issues
show
Coding Style Naming introduced by
The name _DoubleMetaphone does not conform to the module naming conventions ((([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._DoubleMetaphone.
20
21
Double Metaphone.
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from ._Phonetic import Phonetic
32
33 1
__all__ = ['DoubleMetaphone', 'double_metaphone']
34
35
36 1
class DoubleMetaphone(Phonetic):
0 ignored issues
show
Unused Code introduced by
The variable __class__ seems to be unused.
Loading history...
37
    """Double Metaphone.
38
39
    Based on Lawrence Philips' (Visual) C++ code from 1999
40
    :cite:`Philips:2000`.
41
    """
42
43 1
    def encode(self, word, max_length=-1):
0 ignored issues
show
Bug introduced by
Parameters differ from overridden 'encode' method
Loading history...
44
        """Return the Double Metaphone code for a word.
45
46
        Args:
47
            word (str): The word to transform
48
            max_length (int): The maximum length of the returned Double
49
                Metaphone codes (defaults to 64, but in Philips' original
50
                implementation this was 4)
51
52
        Returns:
53
            tuple: The Double Metaphone value(s)
54
55
        Examples:
56
            >>> pe = DoubleMetaphone()
57
            >>> pe.encode('Christopher')
58
            ('KRSTFR', '')
59
            >>> pe.encode('Niall')
60
            ('NL', '')
61
            >>> pe.encode('Smith')
62
            ('SM0', 'XMT')
63
            >>> pe.encode('Schmidt')
64
            ('XMT', 'SMT')
65
66
        """
67
        # Require a max_length of at least 4
68 1
        if max_length != -1:
69 1
            max_length = max(4, max_length)
70
        else:
71 1
            max_length = 64
72
73 1
        primary = ''
74 1
        secondary = ''
75
76 1
        def _slavo_germanic():
            """Report whether the word looks Slavic or Germanic.

            This is a plain substring test on the enclosing ``word``: it
            succeeds as soon as one of the markers 'W', 'K', or 'CZ' is found
            anywhere in the word.

            Returns:
                bool: True if 'W', 'K', or 'CZ' occurs in the word

            """
            return any(marker in word for marker in ('W', 'K', 'CZ'))
86
87 1
        def _metaph_add(pri, sec=''):
            """Build the (primary, secondary) pair extended by new elements.

            The enclosing ``primary`` and ``secondary`` strings are only read
            here; the caller is expected to assign the returned pair.

            Args:
                pri (str): Text appended to the primary code
                sec (str): Text appended to the secondary code; when empty,
                    ``pri`` is appended instead, and a single space means
                    that nothing is appended to the secondary code

            Returns:
                tuple: The extended (primary, secondary) codes

            """
            pri_out = primary + pri if pri else primary
            if not sec:
                # No distinct secondary element: mirror the primary one
                sec_out = secondary + pri
            elif sec == ' ':
                # A lone space suppresses any addition to the secondary code
                sec_out = secondary
            else:
                sec_out = secondary + sec
            return pri_out, sec_out
108
109 1
        def _is_vowel(pos):
            """Tell whether word[pos] is one of A, E, I, O, U, or Y.

            Args:
                pos (int): Position in the word; negative positions are
                    never treated as vowels

            Returns:
                bool: True if pos is non-negative and word[pos] is a vowel

            """
            return pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}
122
123 1
        def _get_at(pos):
            """Fetch the single character stored at word[pos].

            No bounds checking is done here: the position is used to index
            the enclosing ``word`` directly, so ordinary Python indexing
            rules (including negative indices) apply.

            Args:
                pos (int): Position in the word

            Returns:
                str: The character found at word[pos]

            """
            return word[pos]
134
135 1
        def _string_at(pos, slen, substrings):
            """Check whether the slice word[pos:pos+slen] is a listed string.

            Args:
                pos (int): Start position in the word; a negative position
                    never matches
                slen (int): Length of the slice to compare
                substrings (set): Candidate strings to look for

            Returns:
                bool: True if pos is non-negative and word[pos:pos+slen] is
                    in substrings

            """
            return pos >= 0 and word[pos : pos + slen] in substrings
150
151 1
        current = 0
152 1
        length = len(word)
153 1
        if length < 1:
154 1
            return '', ''
155 1
        last = length - 1
156
157 1
        word = word.upper()
158 1
        word = word.replace('ß', 'SS')
159
160
        # Pad the original string so that we can index beyond the edge of the
161
        # world
162 1
        word += '     '
163
164
        # Skip these when at start of word
165 1
        if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
166 1
            current += 1
167
168
        # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
169 1
        if _get_at(0) == 'X':
170 1
            primary, secondary = _metaph_add('S')  # 'Z' maps to 'S'
171 1
            current += 1
172
173
        # Main loop
174 1
        while True:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
175 1
            if current >= length:
176 1
                break
177
178 1
            if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
179 1
                if current == 0:
180
                    # All init vowels now map to 'A'
181 1
                    primary, secondary = _metaph_add('A')
182 1
                current += 1
183 1
                continue
184
185 1
            elif _get_at(current) == 'B':
186
                # "-mb", e.g", "dumb", already skipped over...
187 1
                primary, secondary = _metaph_add('P')
188 1
                if _get_at(current + 1) == 'B':
189 1
                    current += 2
190
                else:
191 1
                    current += 1
192 1
                continue
193
194 1
            elif _get_at(current) == 'Ç':
195 1
                primary, secondary = _metaph_add('S')
196 1
                current += 1
197 1
                continue
198
199 1
            elif _get_at(current) == 'C':
200
                # Various Germanic
201 1
                if (
202
                    current > 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
203
                    and not _is_vowel(current - 2)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
204
                    and _string_at((current - 1), 3, {'ACH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
205
                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
206
                        (_get_at(current + 2) != 'I')
207
                        and (
208
                            (_get_at(current + 2) != 'E')
209
                            or _string_at(
210
                                (current - 2), 6, {'BACHER', 'MACHER'}
211
                            )
212
                        )
213
                    )
214
                ):
215 1
                    primary, secondary = _metaph_add('K')
216 1
                    current += 2
217 1
                    continue
218
219
                # Special case 'caesar'
220 1
                elif current == 0 and _string_at(current, 6, {'CAESAR'}):
221 1
                    primary, secondary = _metaph_add('S')
222 1
                    current += 2
223 1
                    continue
224
225
                # Italian 'chianti'
226 1
                elif _string_at(current, 4, {'CHIA'}):
227 1
                    primary, secondary = _metaph_add('K')
228 1
                    current += 2
229 1
                    continue
230
231 1
                elif _string_at(current, 2, {'CH'}):
232
                    # Find 'Michael'
233 1
                    if current > 0 and _string_at(current, 4, {'CHAE'}):
234 1
                        primary, secondary = _metaph_add('K', 'X')
235 1
                        current += 2
236 1
                        continue
237
238
                    # Greek roots e.g. 'chemistry', 'chorus'
239 1
                    elif (
240
                        current == 0
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
241
                        and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
242
                            _string_at((current + 1), 5, {'HARAC', 'HARIS'})
243
                            or _string_at(
244
                                (current + 1), 3, {'HOR', 'HYM', 'HIA', 'HEM'}
245
                            )
246
                        )
247
                        and not _string_at(0, 5, {'CHORE'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
248
                    ):
249 1
                        primary, secondary = _metaph_add('K')
250 1
                        current += 2
251 1
                        continue
252
253
                    # Germanic, Greek, or otherwise 'ch' for 'kh' sound
254 1
                    elif (
255
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
256
                            _string_at(0, 4, {'VAN ', 'VON '})
257
                            or _string_at(0, 3, {'SCH'})
258
                        )
259
                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
260
                        # 'architect' but not 'arch', 'orchestra', 'orchid'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
261
                        _string_at(
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
262
                            (current - 2), 6, {'ORCHES', 'ARCHIT', 'ORCHID'}
263
                        )
264
                        or _string_at((current + 2), 1, {'T', 'S'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
265
                        or (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
266
                            (
267
                                _string_at(
268
                                    (current - 1), 1, {'A', 'O', 'U', 'E'}
269
                                )
270
                                or (current == 0)
271
                            )
272
                            and
273
                            # e.g., 'wachtler', 'wechsler', but not 'tichner'
274
                            _string_at(
275
                                (current + 2),
276
                                1,
277
                                {
278
                                    'L',
279
                                    'R',
280
                                    'N',
281
                                    'M',
282
                                    'B',
283
                                    'H',
284
                                    'F',
285
                                    'V',
286
                                    'W',
287
                                    ' ',
288
                                },
289
                            )
290
                        )
291
                    ):
292 1
                        primary, secondary = _metaph_add('K')
293
294
                    else:
295 1
                        if current > 0:
296 1
                            if _string_at(0, 2, {'MC'}):
297
                                # e.g., "McHugh"
298 1
                                primary, secondary = _metaph_add('K')
299
                            else:
300 1
                                primary, secondary = _metaph_add('X', 'K')
301
                        else:
302 1
                            primary, secondary = _metaph_add('X')
303
304 1
                    current += 2
305 1
                    continue
306
307
                # e.g, 'czerny'
308 1
                elif _string_at(current, 2, {'CZ'}) and not _string_at(
309
                    (current - 2), 4, {'WICZ'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
310
                ):
311 1
                    primary, secondary = _metaph_add('S', 'X')
312 1
                    current += 2
313 1
                    continue
314
315
                # e.g., 'focaccia'
316 1
                elif _string_at((current + 1), 3, {'CIA'}):
317 1
                    primary, secondary = _metaph_add('X')
318 1
                    current += 3
319
320
                # double 'C', but not if e.g. 'McClellan'
321 1
                elif _string_at(current, 2, {'CC'}) and not (
322
                    (current == 1) and (_get_at(0) == 'M')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
323
                ):
324
                    # 'bellocchio' but not 'bacchus'
325 1
                    if _string_at(
326
                        (current + 2), 1, {'I', 'E', 'H'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
327
                    ) and not _string_at((current + 2), 2, ['HU']):
328
                        # 'accident', 'accede' 'succeed'
329 1
                        if (
330
                            (current == 1) and _get_at(current - 1) == 'A'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
331
                        ) or _string_at((current - 1), 5, {'UCCEE', 'UCCES'}):
332 1
                            primary, secondary = _metaph_add('KS')
333
                        # 'bacci', 'bertucci', other italian
334
                        else:
335 1
                            primary, secondary = _metaph_add('X')
336 1
                        current += 3
337 1
                        continue
338
                    else:  # Pierce's rule
339 1
                        primary, secondary = _metaph_add('K')
340 1
                        current += 2
341 1
                        continue
342
343 1
                elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
344 1
                    primary, secondary = _metaph_add('K')
345 1
                    current += 2
346 1
                    continue
347
348 1
                elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
349
                    # Italian vs. English
350 1
                    if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
351 1
                        primary, secondary = _metaph_add('S', 'X')
352
                    else:
353 1
                        primary, secondary = _metaph_add('S')
354 1
                    current += 2
355 1
                    continue
356
357
                # else
358
                else:
359 1
                    primary, secondary = _metaph_add('K')
360
361
                    # name sent in 'mac caffrey', 'mac gregor'
362 1
                    if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
363 1
                        current += 3
364 1
                    elif _string_at(
365
                        (current + 1), 1, {'C', 'K', 'Q'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
366
                    ) and not _string_at((current + 1), 2, {'CE', 'CI'}):
367 1
                        current += 2
368
                    else:
369 1
                        current += 1
370 1
                    continue
371
372 1
            elif _get_at(current) == 'D':
373 1
                if _string_at(current, 2, {'DG'}):
374 1
                    if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
375
                        # e.g. 'edge'
376 1
                        primary, secondary = _metaph_add('J')
377 1
                        current += 3
378 1
                        continue
379
                    else:
380
                        # e.g. 'edgar'
381 1
                        primary, secondary = _metaph_add('TK')
382 1
                        current += 2
383 1
                        continue
384
385 1
                elif _string_at(current, 2, {'DT', 'DD'}):
386 1
                    primary, secondary = _metaph_add('T')
387 1
                    current += 2
388 1
                    continue
389
390
                # else
391
                else:
392 1
                    primary, secondary = _metaph_add('T')
393 1
                    current += 1
394 1
                    continue
395
396 1
            elif _get_at(current) == 'F':
397 1
                if _get_at(current + 1) == 'F':
398 1
                    current += 2
399
                else:
400 1
                    current += 1
401 1
                primary, secondary = _metaph_add('F')
402 1
                continue
403
404 1
            elif _get_at(current) == 'G':
405 1
                if _get_at(current + 1) == 'H':
406 1
                    if (current > 0) and not _is_vowel(current - 1):
407 1
                        primary, secondary = _metaph_add('K')
408 1
                        current += 2
409 1
                        continue
410
411
                    # 'ghislane', 'ghiradelli'
412 1
                    elif current == 0:
413 1
                        if _get_at(current + 2) == 'I':
414 1
                            primary, secondary = _metaph_add('J')
415
                        else:
416 1
                            primary, secondary = _metaph_add('K')
417 1
                        current += 2
418 1
                        continue
419
420
                    # Parker's rule (with some further refinements) -
421
                    # e.g., 'hugh'
422 1
                    elif (
423
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
424
                            (current > 1)
425
                            and _string_at((current - 2), 1, {'B', 'H', 'D'})
426
                        )
427
                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
428
                        # e.g., 'bough'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
429
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
430
                            (current > 2)
431
                            and _string_at((current - 3), 1, {'B', 'H', 'D'})
432
                        )
433
                        or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
434
                        # e.g., 'broughton'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
435
                        (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
436
                            (current > 3)
437
                            and _string_at((current - 4), 1, {'B', 'H'})
438
                        )
439
                    ):
440 1
                        current += 2
441 1
                        continue
442
                    else:
443
                        # e.g. 'laugh', 'McLaughlin', 'cough',
444
                        #      'gough', 'rough', 'tough'
445 1
                        if (
446
                            (current > 2)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
447
                            and (_get_at(current - 1) == 'U')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
448
                            and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
449
                                _string_at(
450
                                    (current - 3), 1, {'C', 'G', 'L', 'R', 'T'}
451
                                )
452
                            )
453
                        ):
454 1
                            primary, secondary = _metaph_add('F')
455 1
                        elif (current > 0) and _get_at(current - 1) != 'I':
456 1
                            primary, secondary = _metaph_add('K')
457 1
                        current += 2
458 1
                        continue
459
460 1
                elif _get_at(current + 1) == 'N':
461 1
                    if (
462
                        (current == 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
463
                        and _is_vowel(0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
464
                        and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
465
                    ):
466 1
                        primary, secondary = _metaph_add('KN', 'N')
467
                    # not e.g. 'cagney'
468 1
                    elif (
469
                        not _string_at((current + 2), 2, {'EY'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
470
                        and (_get_at(current + 1) != 'Y')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
471
                        and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
472
                    ):
473 1
                        primary, secondary = _metaph_add('N', 'KN')
474
                    else:
475 1
                        primary, secondary = _metaph_add('KN')
476 1
                    current += 2
477 1
                    continue
478
479
                # 'tagliaro'
480 1
                elif (
481
                    _string_at((current + 1), 2, {'LI'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
482
                    and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
483
                ):
484 1
                    primary, secondary = _metaph_add('KL', 'L')
485 1
                    current += 2
486 1
                    continue
487
488
                # -ges-, -gep-, -gel-, -gie- at beginning
489 1
                elif (current == 0) and (
490
                    (_get_at(current + 1) == 'Y')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
491
                    or _string_at(
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
492
                        (current + 1),
493
                        2,
494
                        {
495
                            'ES',
496
                            'EP',
497
                            'EB',
498
                            'EL',
499
                            'EY',
500
                            'IB',
501
                            'IL',
502
                            'IN',
503
                            'IE',
504
                            'EI',
505
                            'ER',
506
                        },
507
                    )
508
                ):
509 1
                    primary, secondary = _metaph_add('K', 'J')
510 1
                    current += 2
511 1
                    continue
512
513
                #  -ger-,  -gy-
514 1
                elif (
515
                    (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
516
                        _string_at((current + 1), 2, {'ER'})
517
                        or (_get_at(current + 1) == 'Y')
518
                    )
519
                    and not _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
520
                    and not _string_at((current - 1), 1, {'E', 'I'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
521
                    and not _string_at((current - 1), 3, {'RGY', 'OGY'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
522
                ):
523 1
                    primary, secondary = _metaph_add('K', 'J')
524 1
                    current += 2
525 1
                    continue
526
527
                #  italian e.g, 'biaggi'
528 1
                elif _string_at(
529
                    (current + 1), 1, {'E', 'I', 'Y'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
530
                ) or _string_at((current - 1), 4, {'AGGI', 'OGGI'}):
531
                    # obvious germanic
532 1
                    if (
533
                        _string_at(0, 4, {'VAN ', 'VON '})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
534
                        or _string_at(0, 3, {'SCH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
535
                    ) or _string_at((current + 1), 2, {'ET'}):
536 1
                        primary, secondary = _metaph_add('K')
537 1
                    elif _string_at((current + 1), 4, {'IER '}):
538 1
                        primary, secondary = _metaph_add('J')
539
                    else:
540 1
                        primary, secondary = _metaph_add('J', 'K')
541 1
                    current += 2
542 1
                    continue
543
544
                else:
545 1
                    if _get_at(current + 1) == 'G':
546 1
                        current += 2
547
                    else:
548 1
                        current += 1
549 1
                    primary, secondary = _metaph_add('K')
550 1
                    continue
551
552 1
            elif _get_at(current) == 'H':
553
                # only keep if first & before vowel or btw. 2 vowels
554 1
                if ((current == 0) or _is_vowel(current - 1)) and _is_vowel(
555
                    current + 1
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
556
                ):
557 1
                    primary, secondary = _metaph_add('H')
558 1
                    current += 2
559
                else:  # also takes care of 'HH'
560 1
                    current += 1
561 1
                continue
562
563 1
            elif _get_at(current) == 'J':
564
                # obvious spanish, 'jose', 'san jacinto'
565 1
                if _string_at(current, 4, ['JOSE']) or _string_at(
566
                    0, 4, {'SAN '}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
567
                ):
568 1
                    if (
569
                        (current == 0) and (_get_at(current + 4) == ' ')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
570
                    ) or _string_at(0, 4, ['SAN ']):
571 1
                        primary, secondary = _metaph_add('H')
572
                    else:
573 1
                        primary, secondary = _metaph_add('J', 'H')
574 1
                    current += 1
575 1
                    continue
576
577 1
                elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
578
                    # Yankelovich/Jankelowicz
579 1
                    primary, secondary = _metaph_add('J', 'A')
580
                # Spanish pron. of e.g. 'bajador'
581 1
                elif (
582
                    _is_vowel(current - 1)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
583
                    and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
584
                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
585
                        (_get_at(current + 1) == 'A')
586
                        or (_get_at(current + 1) == 'O')
587
                    )
588
                ):
589 1
                    primary, secondary = _metaph_add('J', 'H')
590 1
                elif current == last:
591 1
                    primary, secondary = _metaph_add('J', ' ')
592 1
                elif not _string_at(
593
                    (current + 1), 1, {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
594
                ) and not _string_at((current - 1), 1, {'S', 'K', 'L'}):
595 1
                    primary, secondary = _metaph_add('J')
596
597 1
                if _get_at(current + 1) == 'J':  # it could happen!
598 1
                    current += 2
599
                else:
600 1
                    current += 1
601 1
                continue
602
603 1
            elif _get_at(current) == 'K':
604 1
                if _get_at(current + 1) == 'K':
605 1
                    current += 2
606
                else:
607 1
                    current += 1
608 1
                primary, secondary = _metaph_add('K')
609 1
                continue
610
611 1
            elif _get_at(current) == 'L':
612 1
                if _get_at(current + 1) == 'L':
613
                    # Spanish e.g. 'cabrillo', 'gallegos'
614 1
                    if (
615
                        (current == (length - 3))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
616
                        and _string_at(
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
617
                            (current - 1), 4, {'ILLO', 'ILLA', 'ALLE'}
618
                        )
619
                    ) or (
620
                        (
621
                            _string_at((last - 1), 2, {'AS', 'OS'})
622
                            or _string_at(last, 1, {'A', 'O'})
623
                        )
624
                        and _string_at((current - 1), 4, {'ALLE'})
625
                    ):
626 1
                        primary, secondary = _metaph_add('L', ' ')
627 1
                        current += 2
628 1
                        continue
629 1
                    current += 2
630
                else:
631 1
                    current += 1
632 1
                primary, secondary = _metaph_add('L')
633 1
                continue
634
635 1
            elif _get_at(current) == 'M':
636 1
                if (
637
                    (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
638
                        _string_at((current - 1), 3, {'UMB'})
639
                        and (
640
                            ((current + 1) == last)
641
                            or _string_at((current + 2), 2, {'ER'})
642
                        )
643
                    )
644
                    or
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
645
                    # 'dumb', 'thumb'
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
646
                    (_get_at(current + 1) == 'M')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
647
                ):
648 1
                    current += 2
649
                else:
650 1
                    current += 1
651 1
                primary, secondary = _metaph_add('M')
652 1
                continue
653
654 1
            elif _get_at(current) == 'N':
655 1
                if _get_at(current + 1) == 'N':
656 1
                    current += 2
657
                else:
658 1
                    current += 1
659 1
                primary, secondary = _metaph_add('N')
660 1
                continue
661
662 1
            elif _get_at(current) == 'Ñ':
663 1
                current += 1
664 1
                primary, secondary = _metaph_add('N')
665 1
                continue
666
667 1
            elif _get_at(current) == 'P':
668 1
                if _get_at(current + 1) == 'H':
669 1
                    primary, secondary = _metaph_add('F')
670 1
                    current += 2
671 1
                    continue
672
673
                # also account for "campbell", "raspberry"
674 1
                elif _string_at((current + 1), 1, {'P', 'B'}):
675 1
                    current += 2
676
                else:
677 1
                    current += 1
678 1
                primary, secondary = _metaph_add('P')
679 1
                continue
680
681 1
            elif _get_at(current) == 'Q':
682 1
                if _get_at(current + 1) == 'Q':
683 1
                    current += 2
684
                else:
685 1
                    current += 1
686 1
                primary, secondary = _metaph_add('K')
687 1
                continue
688
689 1
            elif _get_at(current) == 'R':
690
                # french e.g. 'rogier', but exclude 'hochmeier'
691 1
                if (
692
                    (current == last)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
693
                    and not _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
694
                    and _string_at((current - 2), 2, {'IE'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
695
                    and not _string_at((current - 4), 2, {'ME', 'MA'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
696
                ):
697 1
                    primary, secondary = _metaph_add('', 'R')
698
                else:
699 1
                    primary, secondary = _metaph_add('R')
700
701 1
                if _get_at(current + 1) == 'R':
702 1
                    current += 2
703
                else:
704 1
                    current += 1
705 1
                continue
706
707 1
            elif _get_at(current) == 'S':
708
                # special cases 'island', 'isle', 'carlisle', 'carlysle'
709 1
                if _string_at((current - 1), 3, {'ISL', 'YSL'}):
710 1
                    current += 1
711 1
                    continue
712
713
                # special case 'sugar-'
714 1
                elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
715 1
                    primary, secondary = _metaph_add('X', 'S')
716 1
                    current += 1
717 1
                    continue
718
719 1
                elif _string_at(current, 2, {'SH'}):
720
                    # Germanic
721 1
                    if _string_at(
722
                        (current + 1), 4, {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
723
                    ):
724 1
                        primary, secondary = _metaph_add('S')
725
                    else:
726 1
                        primary, secondary = _metaph_add('X')
727 1
                    current += 2
728 1
                    continue
729
730
                # Italian & Armenian
731 1
                elif _string_at(current, 3, {'SIO', 'SIA'}) or _string_at(
732
                    current, 4, {'SIAN'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
733
                ):
734 1
                    if not _slavo_germanic():
735 1
                        primary, secondary = _metaph_add('S', 'X')
736
                    else:
737 1
                        primary, secondary = _metaph_add('S')
738 1
                    current += 3
739 1
                    continue
740
741
                # German & anglicisations, e.g. 'smith' match 'schmidt',
742
                #                               'snider' match 'schneider'
743
                # also, -sz- in Slavic language although in Hungarian it is
744
                #       pronounced 's'
745 1
                elif (
746
                    (current == 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
747
                    and _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
748
                ) or _string_at((current + 1), 1, {'Z'}):
749 1
                    primary, secondary = _metaph_add('S', 'X')
750 1
                    if _string_at((current + 1), 1, {'Z'}):
751 1
                        current += 2
752
                    else:
753 1
                        current += 1
754 1
                    continue
755
756 1
                elif _string_at(current, 2, {'SC'}):
757
                    # Schlesinger's rule
758 1
                    if _get_at(current + 2) == 'H':
759
                        # dutch origin, e.g. 'school', 'schooner'
760 1
                        if _string_at(
761
                            (current + 3),
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
762
                            2,
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
763
                            {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'},
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
764
                        ):
765
                            # 'schermerhorn', 'schenker'
766 1
                            if _string_at((current + 3), 2, {'ER', 'EN'}):
767 1
                                primary, secondary = _metaph_add('X', 'SK')
768
                            else:
769 1
                                primary, secondary = _metaph_add('SK')
770 1
                            current += 3
771 1
                            continue
772
                        else:
773 1
                            if (
774
                                (current == 0)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
775
                                and not _is_vowel(3)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
776
                                and (_get_at(3) != 'W')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
777
                            ):
778 1
                                primary, secondary = _metaph_add('X', 'S')
779
                            else:
780 1
                                primary, secondary = _metaph_add('X')
781 1
                            current += 3
782 1
                            continue
783
784 1
                    elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
785 1
                        primary, secondary = _metaph_add('S')
786 1
                        current += 3
787 1
                        continue
788
789
                    # else
790
                    else:
791 1
                        primary, secondary = _metaph_add('SK')
792 1
                        current += 3
793 1
                        continue
794
795
                else:
796
                    # french e.g. 'resnais', 'artois'
797 1
                    if (current == last) and _string_at(
798
                        (current - 2), 2, {'AI', 'OI'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
799
                    ):
800 1
                        primary, secondary = _metaph_add('', 'S')
801
                    else:
802 1
                        primary, secondary = _metaph_add('S')
803
804 1
                    if _string_at((current + 1), 1, {'S', 'Z'}):
805 1
                        current += 2
806
                    else:
807 1
                        current += 1
808 1
                    continue
809
810 1
            elif _get_at(current) == 'T':
811 1
                if _string_at(current, 4, {'TION'}):
812 1
                    primary, secondary = _metaph_add('X')
813 1
                    current += 3
814 1
                    continue
815
816 1
                elif _string_at(current, 3, {'TIA', 'TCH'}):
817 1
                    primary, secondary = _metaph_add('X')
818 1
                    current += 3
819 1
                    continue
820
821 1
                elif _string_at(current, 2, {'TH'}) or _string_at(
822
                    current, 3, {'TTH'}
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
823
                ):
824
                    # special case 'thomas', 'thames' or germanic
825 1
                    if (
826
                        _string_at((current + 2), 2, {'OM', 'AM'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
827
                        or _string_at(0, 4, {'VAN ', 'VON '})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
828
                        or _string_at(0, 3, {'SCH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
829
                    ):
830 1
                        primary, secondary = _metaph_add('T')
831
                    else:
832 1
                        primary, secondary = _metaph_add('0', 'T')
833 1
                    current += 2
834 1
                    continue
835
836 1
                elif _string_at((current + 1), 1, {'T', 'D'}):
837 1
                    current += 2
838
                else:
839 1
                    current += 1
840 1
                primary, secondary = _metaph_add('T')
841 1
                continue
842
843 1
            elif _get_at(current) == 'V':
844 1
                if _get_at(current + 1) == 'V':
845 1
                    current += 2
846
                else:
847 1
                    current += 1
848 1
                primary, secondary = _metaph_add('F')
849 1
                continue
850
851 1
            elif _get_at(current) == 'W':
852
                # can also be in middle of word
853 1
                if _string_at(current, 2, {'WR'}):
854 1
                    primary, secondary = _metaph_add('R')
855 1
                    current += 2
856 1
                    continue
857 1
                elif (current == 0) and (
858
                    _is_vowel(current + 1) or _string_at(current, 2, {'WH'})
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
859
                ):
860
                    # Wasserman should match Vasserman
861 1
                    if _is_vowel(current + 1):
862 1
                        primary, secondary = _metaph_add('A', 'F')
863
                    else:
864
                        # need Uomo to match Womo
865 1
                        primary, secondary = _metaph_add('A')
866
867
                # Arnow should match Arnoff
868 1
                if (
869
                    ((current == last) and _is_vowel(current - 1))
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
870
                    or _string_at(
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
871
                        (current - 1), 5, {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}
872
                    )
873
                    or _string_at(0, 3, ['SCH'])
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
874
                ):
875 1
                    primary, secondary = _metaph_add('', 'F')
876 1
                    current += 1
877 1
                    continue
878
                # Polish e.g. 'filipowicz'
879 1
                elif _string_at(current, 4, {'WICZ', 'WITZ'}):
880 1
                    primary, secondary = _metaph_add('TS', 'FX')
881 1
                    current += 4
882 1
                    continue
883
                # else skip it
884
                else:
885 1
                    current += 1
886 1
                    continue
887
888 1
            elif _get_at(current) == 'X':
889
                # French e.g. breaux
890 1
                if not (
891
                    (current == last)
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
892
                    and (
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
893
                        _string_at((current - 3), 3, {'IAU', 'EAU'})
894
                        or _string_at((current - 2), 2, {'AU', 'OU'})
895
                    )
896
                ):
897 1
                    primary, secondary = _metaph_add('KS')
898
899 1
                if _string_at((current + 1), 1, {'C', 'X'}):
900 1
                    current += 2
901
                else:
902 1
                    current += 1
903 1
                continue
904
905 1
            elif _get_at(current) == 'Z':
906
                # Chinese Pinyin e.g. 'zhao'
907 1
                if _get_at(current + 1) == 'H':
908 1
                    primary, secondary = _metaph_add('J')
909 1
                    current += 2
910 1
                    continue
911 1
                elif _string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or (
912
                    _slavo_germanic()
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
913
                    and ((current > 0) and _get_at(current - 1) != 'T')
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
914
                ):
915 1
                    primary, secondary = _metaph_add('S', 'TS')
916
                else:
917 1
                    primary, secondary = _metaph_add('S')
918
919 1
                if _get_at(current + 1) == 'Z':
920 1
                    current += 2
921
                else:
922 1
                    current += 1
923 1
                continue
924
925
            else:
926 1
                current += 1
927
928 1
        if max_length > 0:
929 1
            primary = primary[:max_length]
930 1
            secondary = secondary[:max_length]
931 1
        if primary == secondary:
932 1
            secondary = ''
933
934 1
        return primary, secondary
935
936
937 1
def double_metaphone(word, max_length=-1):
    """Return the Double Metaphone code for a word.

    This is a wrapper for :py:meth:`DoubleMetaphone.encode`.

    Args:
        word (str): The word to transform
        max_length (int): The maximum length of the returned Double
            Metaphone codes (defaults to -1; a value that is not positive
            leaves the codes untruncated, whereas in Philips' original
            implementation this was 4)

    Returns:
        tuple: The Double Metaphone value(s) as ``(primary, secondary)``;
        the secondary code is the empty string when it would be identical
        to the primary code

    Examples:
        >>> double_metaphone('Christopher')
        ('KRSTFR', '')
        >>> double_metaphone('Niall')
        ('NL', '')
        >>> double_metaphone('Smith')
        ('SM0', 'XMT')
        >>> double_metaphone('Schmidt')
        ('XMT', 'SMT')

    """
    # A fresh encoder per call; max_length is forwarded unchanged.
    return DoubleMetaphone().encode(word, max_length)
963
964
965
# When this module is executed directly as a script, run the examples
# embedded in its docstrings as doctests.
if __name__ == '__main__':
    from doctest import testmod

    testmod()
969