abydos.phonetic._daitch_mokotoff.DaitchMokotoff.encode() - Code Metrics - Inspection of "started new entry in HISTORY for 0.4.0" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — master ( 23810f...afe14d )

by Chris

created 2019-06-01 00:50 UTC

DaitchMokotoff.encode() D

↳ Parent: abydos.phonetic._daitch_mokotoff

Complexity

Conditions

Size

Total Lines	99
Code Lines	37

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	32
CRAP Score	12

Importance

Changes

Metric	Value
cc	12
eloc	37
nop	4
dl	0
loc	99
ccs	32
cts	32
cp	1
crap	12
rs	4.8
c	0
b	0
f	0

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._daitch_mokotoff.

Daitch-Mokotoff Soundex
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import _Phonetic

__all__ = ['DaitchMokotoff', 'dm_soundex']


class DaitchMokotoff(_Phonetic):
    """Daitch-Mokotoff Soundex.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.
    """

    # Maps each codable letter sequence to a triple of codes. encode() picks
    # one element of the triple by the position of the sequence in the word:
    #   [0] the sequence starts the word,
    #   [1] the sequence is followed by a letter in _uc_v_set,
    #   [2] anywhere else.
    # '_' is a placeholder that encode() strips from the final code (so a
    # lone '_' means "not coded here"). A positional value that is itself a
    # tuple holds two alternative codings; encode() branches on it and
    # returns a code for each alternative.
    _dms_table = {
        'STCH': (2, 4, 4),
        'DRZ': (4, 4, 4),
        'ZH': (4, 4, 4),
        'ZHDZH': (2, 4, 4),
        'DZH': (4, 4, 4),
        'DRS': (4, 4, 4),
        'DZS': (4, 4, 4),
        'SCHTCH': (2, 4, 4),
        'SHTSH': (2, 4, 4),
        'SZCZ': (2, 4, 4),
        'TZS': (4, 4, 4),
        'SZCS': (2, 4, 4),
        'STSH': (2, 4, 4),
        'SHCH': (2, 4, 4),
        'D': (3, 3, 3),
        'H': (5, 5, '_'),
        'TTSCH': (4, 4, 4),
        'THS': (4, 4, 4),
        'L': (8, 8, 8),
        'P': (7, 7, 7),
        'CHS': (5, 54, 54),
        'T': (3, 3, 3),
        'X': (5, 54, 54),
        'OJ': (0, 1, '_'),
        'OI': (0, 1, '_'),
        'SCHTSH': (2, 4, 4),
        'OY': (0, 1, '_'),
        'Y': (1, '_', '_'),
        'TSH': (4, 4, 4),
        'ZDZ': (2, 4, 4),
        'TSZ': (4, 4, 4),
        'SHT': (2, 43, 43),
        'SCHTSCH': (2, 4, 4),
        'TTSZ': (4, 4, 4),
        'TTZ': (4, 4, 4),
        'SCH': (4, 4, 4),
        'TTS': (4, 4, 4),
        'SZD': (2, 43, 43),
        'AI': (0, 1, '_'),
        'PF': (7, 7, 7),
        'TCH': (4, 4, 4),
        'PH': (7, 7, 7),
        'TTCH': (4, 4, 4),
        'SZT': (2, 43, 43),
        'ZDZH': (2, 4, 4),
        'EI': (0, 1, '_'),
        'G': (5, 5, 5),
        'EJ': (0, 1, '_'),
        'ZD': (2, 43, 43),
        'IU': (1, '_', '_'),
        'K': (5, 5, 5),
        'O': (0, '_', '_'),
        'SHTCH': (2, 4, 4),
        'S': (4, 4, 4),
        'TRZ': (4, 4, 4),
        'SHD': (2, 43, 43),
        'DSH': (4, 4, 4),
        'CSZ': (4, 4, 4),
        'EU': (1, 1, '_'),
        'TRS': (4, 4, 4),
        'ZS': (4, 4, 4),
        'STRZ': (2, 4, 4),
        'UY': (0, 1, '_'),
        'STRS': (2, 4, 4),
        'CZS': (4, 4, 4),
        'MN': ('6_6', '6_6', '6_6'),
        'UI': (0, 1, '_'),
        'UJ': (0, 1, '_'),
        'UE': (0, '_', '_'),
        'EY': (0, 1, '_'),
        'W': (7, 7, 7),
        'IA': (1, '_', '_'),
        'FB': (7, 7, 7),
        'STSCH': (2, 4, 4),
        'SCHT': (2, 43, 43),
        'NM': ('6_6', '6_6', '6_6'),
        'SCHD': (2, 43, 43),
        'B': (7, 7, 7),
        'DSZ': (4, 4, 4),
        'F': (7, 7, 7),
        'N': (6, 6, 6),
        'CZ': (4, 4, 4),
        'R': (9, 9, 9),
        'U': (0, '_', '_'),
        'V': (7, 7, 7),
        'CS': (4, 4, 4),
        'Z': (4, 4, 4),
        'SZ': (4, 4, 4),
        'TSCH': (4, 4, 4),
        'KH': (5, 5, 5),
        'ST': (2, 43, 43),
        'KS': (5, 54, 54),
        'SH': (4, 4, 4),
        'SC': (2, 4, 4),
        'SD': (2, 43, 43),
        'DZ': (4, 4, 4),
        'ZHD': (2, 43, 43),
        'DT': (3, 3, 3),
        'ZSH': (4, 4, 4),
        'DS': (4, 4, 4),
        'TZ': (4, 4, 4),
        'TS': (4, 4, 4),
        'TH': (3, 3, 3),
        'TC': (4, 4, 4),
        'A': (0, '_', '_'),
        'E': (0, '_', '_'),
        'I': (0, '_', '_'),
        'AJ': (0, 1, '_'),
        'M': (6, 6, 6),
        'Q': (5, 5, 5),
        'AU': (0, 7, '_'),
        'IO': (1, '_', '_'),
        'AY': (0, 1, '_'),
        'IE': (1, '_', '_'),
        'ZSCH': (4, 4, 4),
        'CH': ((5, 4), (5, 4), (5, 4)),
        'CK': ((5, 45), (5, 45), (5, 45)),
        'C': ((5, 4), (5, 4), (5, 4)),
        'J': ((1, 4), ('_', 4), ('_', 4)),
        'RZ': ((94, 4), (94, 4), (94, 4)),
        'RS': ((94, 4), (94, 4), (94, 4)),
    }

    # For each initial letter, the keys of _dms_table that begin with it, in
    # the order encode() tries them. Longer sequences are listed before the
    # shorter ones, and the bare letter is always last, so the first match is
    # the longest one and some entry always matches.
    _dms_order = {
        'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
        'B': ('B',),
        'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
        'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT', 'DZ', 'D'),
        'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
        'F': ('FB', 'F'),
        'G': ('G',),
        'H': ('H',),
        'I': ('IA', 'IE', 'IO', 'IU', 'I'),
        'J': ('J',),
        'K': ('KH', 'KS', 'K'),
        'L': ('L',),
        'M': ('MN', 'M'),
        'N': ('NM', 'N'),
        'O': ('OI', 'OJ', 'OY', 'O'),
        'P': ('PF', 'PH', 'P'),
        'Q': ('Q',),
        'R': ('RS', 'RZ', 'R'),
        'S': (
            'SCHTSCH',
            'SCHTCH',
            'SCHTSH',
            'SHTCH',
            'SHTSH',
            'STSCH',
            'SCHD',
            'SCHT',
            'SHCH',
            'STCH',
            'STRS',
            'STRZ',
            'STSH',
            'SZCS',
            'SZCZ',
            'SCH',
            'SHD',
            'SHT',
            'SZD',
            'SZT',
            'SC',
            'SD',
            'SH',
            'ST',
            'SZ',
            'S',
        ),
        'T': (
            'TTSCH',
            'TSCH',
            'TTCH',
            'TTSZ',
            'TCH',
            'THS',
            'TRS',
            'TRZ',
            'TSH',
            'TSZ',
            'TTS',
            'TTZ',
            'TZS',
            'TC',
            'TH',
            'TS',
            'TZ',
            'T',
        ),
        'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
        'V': ('V',),
        'W': ('W',),
        'X': ('X',),
        'Y': ('Y',),
        'Z': (
            'ZHDZH',
            'ZDZH',
            'ZSCH',
            'ZDZ',
            'ZHD',
            'ZSH',
            'ZD',
            'ZH',
            'ZS',
            'Z',
        ),
    }

    # Letters that select the "before a vowel" variant when they follow a
    # coded sequence
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, max_length=6, zero_pad=True):
        """Return the Daitch-Mokotoff Soundex codes for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the codes returned (defaults to 6). Values are
            clamped to the range 6 to 64; -1 selects the maximum of 64.
        zero_pad : bool
            Pad the end of each returned code with 0s to achieve a
            max_length string

        Returns
        -------
        set
            The Daitch-Mokotoff Soundex values of the word. More than one
            value is returned when a letter sequence in the word has
            alternative codings.

        Examples
        --------
        >>> pe = DaitchMokotoff()
        >>> sorted(pe.encode('Christopher'))
        ['494379', '594379']
        >>> pe.encode('Niall')
        {'680000'}
        >>> pe.encode('Smith')
        {'463000'}
        >>> pe.encode('Schmidt')
        {'463000'}

        >>> sorted(pe.encode('The quick brown fox', max_length=20,
        ... zero_pad=False))
        ['35457976754', '3557976754']

        """
        # Require a max_length of at least 6 and not more than 64; -1 is a
        # sentinel for "as long as allowed"
        if max_length != -1:
            max_length = min(max(6, max_length), 64)
        else:
            max_length = 64

        # Uppercase, normalize, decompose, and keep only the characters in
        # self._uc_set (inherited; presumably A-Z -- every kept character
        # must be a key of _dms_order)
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if zero_pad:
                return {'0' * max_length}
            return {'0'}

        word_len = len(word)
        codes = ['']  # partial codes; doubles on every alternative coding
        pos = 0
        while pos < word_len:
            # Iterate through _dms_order, which specifies the possible
            # substrings for which codes exist in the Daitch-Mokotoff coding
            for sstr in self._dms_order[word[pos]]:  # pragma: no branch
                # Test in place at offset pos rather than slicing word[pos:]
                # for every candidate
                if word.startswith(sstr, pos):
                    end = pos + len(sstr)

                    # Having retrieved the code (triple), determine the
                    # correct positional variant (first, pre-vocalic,
                    # elsewhere)
                    variants = self._dms_table[sstr]
                    if pos == 0:
                        dm_val = variants[0]
                    elif end < word_len and word[end] in self._uc_v_set:
                        dm_val = variants[1]
                    else:
                        dm_val = variants[2]

                    # Build the code strings; a tuple value means two
                    # alternatives, so every partial code is extended both
                    # ways (first alternatives, then second alternatives)
                    if isinstance(dm_val, tuple):
                        first = text_type(dm_val[0])
                        second = text_type(dm_val[1])
                        codes = [code + first for code in codes] + [
                            code + second for code in codes
                        ]
                    else:
                        suffix = text_type(dm_val)
                        codes = [code + suffix for code in codes]
                    pos = end
                    break

        # Filter out double letters and _ placeholders. The placeholders are
        # removed only after the repeat filter has run.
        codes = (
            ''.join(
                c for c in self._delete_consecutive_repeats(code) if c != '_'
            )
            for code in codes
        )

        # Trim (and optionally pad) codes and return them as a set
        if zero_pad:
            padding = '0' * max_length
            return {(code + padding)[:max_length] for code in codes}
        return {code[:max_length] for code in codes}


def dm_soundex(word, max_length=6, zero_pad=True):
    """Compute the Daitch-Mokotoff Soundex codes of a word.

    Convenience function: builds a :py:class:`DaitchMokotoff` instance and
    delegates to :py:meth:`DaitchMokotoff.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the codes returned (defaults to 6; must be between 6
        and 64)
    zero_pad : bool
        Pad the end of each returned code with 0s to achieve a max_length
        string

    Returns
    -------
    set
        The Daitch-Mokotoff Soundex values, exactly as returned by
        :py:meth:`DaitchMokotoff.encode`

    Examples
    --------
    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', max_length=20,
    ... zero_pad=False))
    ['35457976754', '3557976754']

    """
    encoder = DaitchMokotoff()
    return encoder.encode(word, max_length, zero_pad)


if __name__ == '__main__':
    # When executed as a script, run the docstring examples of this module
    # as a self-test.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._daitch_mokotoff.
20
21		Daitch-Mokotoff Soundex
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34
35	1	from ._phonetic import _Phonetic
36
37	1	__all__ = ['DaitchMokotoff', 'dm_soundex']
38
39
40	1	class DaitchMokotoff(_Phonetic):
41		"""Daitch-Mokotoff Soundex.
42
43		Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
44		of a word as a set. A collection is necessary since there can be multiple
45		values for a single word.
46		"""
47
48	1	_dms_table = {
49		'STCH': (2, 4, 4),
50		'DRZ': (4, 4, 4),
51		'ZH': (4, 4, 4),
52		'ZHDZH': (2, 4, 4),
53		'DZH': (4, 4, 4),
54		'DRS': (4, 4, 4),
55		'DZS': (4, 4, 4),
56		'SCHTCH': (2, 4, 4),
57		'SHTSH': (2, 4, 4),
58		'SZCZ': (2, 4, 4),
59		'TZS': (4, 4, 4),
60		'SZCS': (2, 4, 4),
61		'STSH': (2, 4, 4),
62		'SHCH': (2, 4, 4),
63		'D': (3, 3, 3),
64		'H': (5, 5, '_'),
65		'TTSCH': (4, 4, 4),
66		'THS': (4, 4, 4),
67		'L': (8, 8, 8),
68		'P': (7, 7, 7),
69		'CHS': (5, 54, 54),
70		'T': (3, 3, 3),
71		'X': (5, 54, 54),
72		'OJ': (0, 1, '_'),
73		'OI': (0, 1, '_'),
74		'SCHTSH': (2, 4, 4),
75		'OY': (0, 1, '_'),
76		'Y': (1, '_', '_'),
77		'TSH': (4, 4, 4),
78		'ZDZ': (2, 4, 4),
79		'TSZ': (4, 4, 4),
80		'SHT': (2, 43, 43),
81		'SCHTSCH': (2, 4, 4),
82		'TTSZ': (4, 4, 4),
83		'TTZ': (4, 4, 4),
84		'SCH': (4, 4, 4),
85		'TTS': (4, 4, 4),
86		'SZD': (2, 43, 43),
87		'AI': (0, 1, '_'),
88		'PF': (7, 7, 7),
89		'TCH': (4, 4, 4),
90		'PH': (7, 7, 7),
91		'TTCH': (4, 4, 4),
92		'SZT': (2, 43, 43),
93		'ZDZH': (2, 4, 4),
94		'EI': (0, 1, '_'),
95		'G': (5, 5, 5),
96		'EJ': (0, 1, '_'),
97		'ZD': (2, 43, 43),
98		'IU': (1, '_', '_'),
99		'K': (5, 5, 5),
100		'O': (0, '_', '_'),
101		'SHTCH': (2, 4, 4),
102		'S': (4, 4, 4),
103		'TRZ': (4, 4, 4),
104		'SHD': (2, 43, 43),
105		'DSH': (4, 4, 4),
106		'CSZ': (4, 4, 4),
107		'EU': (1, 1, '_'),
108		'TRS': (4, 4, 4),
109		'ZS': (4, 4, 4),
110		'STRZ': (2, 4, 4),
111		'UY': (0, 1, '_'),
112		'STRS': (2, 4, 4),
113		'CZS': (4, 4, 4),
114		'MN': ('6_6', '6_6', '6_6'),
115		'UI': (0, 1, '_'),
116		'UJ': (0, 1, '_'),
117		'UE': (0, '_', '_'),
118		'EY': (0, 1, '_'),
119		'W': (7, 7, 7),
120		'IA': (1, '_', '_'),
121		'FB': (7, 7, 7),
122		'STSCH': (2, 4, 4),
123		'SCHT': (2, 43, 43),
124		'NM': ('6_6', '6_6', '6_6'),
125		'SCHD': (2, 43, 43),
126		'B': (7, 7, 7),
127		'DSZ': (4, 4, 4),
128		'F': (7, 7, 7),
129		'N': (6, 6, 6),
130		'CZ': (4, 4, 4),
131		'R': (9, 9, 9),
132		'U': (0, '_', '_'),
133		'V': (7, 7, 7),
134		'CS': (4, 4, 4),
135		'Z': (4, 4, 4),
136		'SZ': (4, 4, 4),
137		'TSCH': (4, 4, 4),
138		'KH': (5, 5, 5),
139		'ST': (2, 43, 43),
140		'KS': (5, 54, 54),
141		'SH': (4, 4, 4),
142		'SC': (2, 4, 4),
143		'SD': (2, 43, 43),
144		'DZ': (4, 4, 4),
145		'ZHD': (2, 43, 43),
146		'DT': (3, 3, 3),
147		'ZSH': (4, 4, 4),
148		'DS': (4, 4, 4),
149		'TZ': (4, 4, 4),
150		'TS': (4, 4, 4),
151		'TH': (3, 3, 3),
152		'TC': (4, 4, 4),
153		'A': (0, '_', '_'),
154		'E': (0, '_', '_'),
155		'I': (0, '_', '_'),
156		'AJ': (0, 1, '_'),
157		'M': (6, 6, 6),
158		'Q': (5, 5, 5),
159		'AU': (0, 7, '_'),
160		'IO': (1, '_', '_'),
161		'AY': (0, 1, '_'),
162		'IE': (1, '_', '_'),
163		'ZSCH': (4, 4, 4),
164		'CH': ((5, 4), (5, 4), (5, 4)),
165		'CK': ((5, 45), (5, 45), (5, 45)),
166		'C': ((5, 4), (5, 4), (5, 4)),
167		'J': ((1, 4), ('_', 4), ('_', 4)),
168		'RZ': ((94, 4), (94, 4), (94, 4)),
169		'RS': ((94, 4), (94, 4), (94, 4)),
170		}
171
172	1	_dms_order = {
173		'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
174		'B': ('B',),
175		'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
176		'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT', 'DZ', 'D'),
177		'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
178		'F': ('FB', 'F'),
179		'G': ('G',),
180		'H': ('H',),
181		'I': ('IA', 'IE', 'IO', 'IU', 'I'),
182		'J': ('J',),
183		'K': ('KH', 'KS', 'K'),
184		'L': ('L',),
185		'M': ('MN', 'M'),
186		'N': ('NM', 'N'),
187		'O': ('OI', 'OJ', 'OY', 'O'),
188		'P': ('PF', 'PH', 'P'),
189		'Q': ('Q',),
190		'R': ('RS', 'RZ', 'R'),
191		'S': (
192		'SCHTSCH',
193		'SCHTCH',
194		'SCHTSH',
195		'SHTCH',
196		'SHTSH',
197		'STSCH',
198		'SCHD',
199		'SCHT',
200		'SHCH',
201		'STCH',
202		'STRS',
203		'STRZ',
204		'STSH',
205		'SZCS',
206		'SZCZ',
207		'SCH',
208		'SHD',
209		'SHT',
210		'SZD',
211		'SZT',
212		'SC',
213		'SD',
214		'SH',
215		'ST',
216		'SZ',
217		'S',
218		),
219		'T': (
220		'TTSCH',
221		'TSCH',
222		'TTCH',
223		'TTSZ',
224		'TCH',
225		'THS',
226		'TRS',
227		'TRZ',
228		'TSH',
229		'TSZ',
230		'TTS',
231		'TTZ',
232		'TZS',
233		'TC',
234		'TH',
235		'TS',
236		'TZ',
237		'T',
238		),
239		'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
240		'V': ('V',),
241		'W': ('W',),
242		'X': ('X',),
243		'Y': ('Y',),
244		'Z': (
245		'ZHDZH',
246		'ZDZH',
247		'ZSCH',
248		'ZDZ',
249		'ZHD',
250		'ZSH',
251		'ZD',
252		'ZH',
253		'ZS',
254		'Z',
255		),
256		}
257
258	1	_uc_v_set = set('AEIJOUY')
259
260	1	def encode(self, word, max_length=6, zero_pad=True):
261		"""Return the Daitch-Mokotoff Soundex code for a word.
262
263		Parameters
264		----------
265		word : str
266		The word to transform
267		max_length : int
268		The length of the code returned (defaults to 6; must be between 6
269		and 64)
270		zero_pad : bool
271		Pad the end of the return value with 0s to achieve a max_length
272		string
273
274		Returns
275		-------
276		str
277		The Daitch-Mokotoff Soundex value
278
279		Examples
280		--------
281		>>> pe = DaitchMokotoff()
282		>>> sorted(pe.encode('Christopher'))
283		['494379', '594379']
284		>>> pe.encode('Niall')
285		{'680000'}
286		>>> pe.encode('Smith')
287		{'463000'}
288		>>> pe.encode('Schmidt')
289		{'463000'}
290
291		>>> sorted(pe.encode('The quick brown fox', max_length=20,
292		... zero_pad=False))
293		['35457976754', '3557976754']
294
295		"""
296	1	dms = [''] # initialize empty code list
297
298		# Require a max_length of at least 6 and not more than 64
299	1	if max_length != -1:
300	1	max_length = min(max(6, max_length), 64)
301		else:
302	1	max_length = 64
303
304		# uppercase, normalize, decompose, and filter non-A-Z
305	1	word = unicode_normalize('NFKD', text_type(word.upper()))
306	1	word = word.replace('ß', 'SS')
307	1	word = ''.join(c for c in word if c in self._uc_set)
308
309		# Nothing to convert, return base case
310	1	if not word:
311	1	if zero_pad:
312	1	return {'0' * max_length}
313	1	return {'0'}
314
315	1	pos = 0
316	1	while pos < len(word):
317		# Iterate through _dms_order, which specifies the possible
318		# substrings for which codes exist in the Daitch-Mokotoff coding
319	1	for sstr in self._dms_order[word[pos]]: # pragma: no branch
320	1	if word[pos:].startswith(sstr):
321		# Having determined a valid substring start, retrieve the
322		# code
323	1	dm_val = self._dms_table[sstr]
324
325		# Having retrieved the code (triple), determine the correct
326		# positional variant (first, pre-vocalic, elsewhere)
327	1	if pos == 0:
328	1	dm_val = dm_val[0]
329	1	elif (
330		pos + len(sstr) < len(word)
331		and word[pos + len(sstr)] in self._uc_v_set
332		):
333	1	dm_val = dm_val[1]
334		else:
335	1	dm_val = dm_val[2]
336
337		# Build the code strings
338	1	if isinstance(dm_val, tuple):
339	1	dms = [_ + text_type(dm_val[0]) for _ in dms] + [
340		_ + text_type(dm_val[1]) for _ in dms
341		]
342		else:
343	1	dms = [_ + text_type(dm_val) for _ in dms]
344	1	pos += len(sstr)
345	1	break
346
347		# Filter out double letters and _ placeholders
348	1	dms = (
349		''.join(c for c in self._delete_consecutive_repeats(_) if c != '_')
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
350		for _ in dms
351		)
352
353		# Trim codes and return set
354	1	if zero_pad:
355	1	dms = ((_ + ('0' * max_length))[:max_length] for _ in dms)
356		else:
357	1	dms = (_[:max_length] for _ in dms)
358	1	return set(dms)
359
360
361	1	def dm_soundex(word, max_length=6, zero_pad=True):
362		"""Return the Daitch-Mokotoff Soundex code for a word.
363
364		This is a wrapper for :py:meth:`DaitchMokotoff.encode`.
365
366		Parameters
367		----------
368		word : str
369		The word to transform
370		max_length : int
371		The length of the code returned (defaults to 6; must be between 6 and
372		64)
373		zero_pad : bool
374		Pad the end of the return value with 0s to achieve a max_length string
375
376		Returns
377		-------
378		str
379		The Daitch-Mokotoff Soundex value
380
381		Examples
382		--------
383		>>> sorted(dm_soundex('Christopher'))
384		['494379', '594379']
385		>>> dm_soundex('Niall')
386		{'680000'}
387		>>> dm_soundex('Smith')
388		{'463000'}
389		>>> dm_soundex('Schmidt')
390		{'463000'}
391
392		>>> sorted(dm_soundex('The quick brown fox', max_length=20,
393		... zero_pad=False))
394		['35457976754', '3557976754']
395
396		"""
397	1	return DaitchMokotoff().encode(word, max_length, zero_pad)
398
399
400		if __name__ == '__main__':
401		import doctest
402
403		doctest.testmod()
404

chrislit / abydos

Push — master ( 23810f...afe14d )

DaitchMokotoff.encode() D

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like