abydos.phonetic._soundex.phonex() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-10 03:25 UTC

abydos.phonetic._soundex.phonex() F

↳ Parent: Project

Complexity

Conditions

Size

Total Lines	117
Code Lines	73

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	59
CRAP Score	33

Importance

Changes

Metric	Value
eloc	73
dl	0
loc	117
ccs	59
cts	59
cp	1
rs	0
c	0
b	0
f	0
cc	33
nop	3
crap	33

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._soundex.

American Soundex
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import Phonetic

__all__ = ['Soundex', 'soundex']


class Soundex(Phonetic):
    """Soundex.

    Three variants of Soundex are implemented:

    - 'American' follows the American Soundex algorithm, as described at
      :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
      Miracode
    - 'special' follows the rules from the 1880-1910 US Census
      retrospective re-analysis, in which h & w are not treated as blocking
      consonants but as vowels. Cf. :cite:`Repici:2013`.
    - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
      US Census, including coding prefixed and unprefixed versions of some
      names
    """

    # Maps the code point of each letter A-Z to its Soundex digit. The vowels
    # and Y map to the placeholder '0' and H/W map to the placeholder '9';
    # both placeholders are removed again inside encode().
    _trans = dict(
        zip(
            (ord(letter) for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230129022455012623019202',
        )
    )

    # Name prefixes recognised by the 'Census' variant, as
    # (prefix length, set of prefixes) pairs, checked in this order.
    _census_prefixes = (
        (3, frozenset(('VAN', 'CON'))),
        (2, frozenset(('DE', 'DI', 'LA', 'LE'))),
    )

    def encode(
        self, word, max_length=4, var='American', reverse=False, zero_pad=True
    ):
        """Return the Soundex code for a word.

        Args:
            word (str): The word to transform
            max_length (int): The length of the code returned (defaults to 4);
                values are clamped to the range 4..64, and -1 selects the
                maximum of 64
            var (str): The variant of the algorithm to employ (defaults to
                ``American``):
                    - ``American`` follows the American Soundex algorithm, as
                      described at :cite:`US:2007` and in :cite:`Knuth:1998`;
                      this is also called Miracode
                    - ``special`` follows the rules from the 1880-1910 US
                      Census retrospective re-analysis, in which h & w are not
                      treated as blocking consonants but as vowels. Cf.
                      :cite:`Repici:2013`.
                    - ``Census`` follows the rules laid out in GIL 55
                      :cite:`US:1997` by the US Census, including coding
                      prefixed and unprefixed versions of some names
            reverse (bool): Reverse the word before computing the selected
                Soundex (defaults to False); This results in "Reverse Soundex",
                which is useful for blocking in cases where the initial
                elements may be in error.
            zero_pad (bool): Pad the end of the return value with 0s to achieve
                a max_length string

        Returns:
            str: The Soundex value. When ``var`` is ``Census`` and the word
            starts with one of the prefixes VAN, CON, DE, DI, LA or LE (and is
            at least two characters longer than that prefix), a 2-tuple is
            returned instead: the American Soundex of the whole word followed
            by the American Soundex of the word with the prefix removed.

        Examples:
            >>> pe = Soundex()
            >>> pe.encode("Christopher")
            'C623'
            >>> pe.encode("Niall")
            'N400'
            >>> pe.encode('Smith')
            'S530'
            >>> pe.encode('Schmidt')
            'S530'

            >>> pe.encode('Christopher', max_length=-1)
            'C623160000000000000000000000000000000000000000000000000000000000'
            >>> pe.encode('Christopher', max_length=-1, zero_pad=False)
            'C62316'

            >>> pe.encode('Christopher', reverse=True)
            'R132'

            >>> pe.encode('Ashcroft')
            'A261'
            >>> pe.encode('Asicroft')
            'A226'
            >>> pe.encode('Ashcroft', var='special')
            'A226'
            >>> pe.encode('Asicroft', var='special')
            'A226'

        """
        # Require a max_length of at least 4 and not more than 64
        if max_length != -1:
            max_length = min(max(4, max_length), 64)
        else:
            max_length = 64

        # uppercase, normalize and decompose (filtering happens further down)
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        if var == 'Census':
            # TODO: Should these prefixes be supplemented? (VANDE, DELA, VON)
            for prefix_len, prefixes in self._census_prefixes:
                # The word must extend at least two characters past the prefix
                if word[:prefix_len] in prefixes and len(word) > prefix_len + 1:
                    # Code the name both with and without its prefix. Calling
                    # self.encode keeps the class self-contained instead of
                    # instantiating a second encoder via the module wrapper.
                    return (
                        self.encode(
                            word, max_length, 'American', reverse, zero_pad
                        ),
                        self.encode(
                            word[prefix_len:],
                            max_length,
                            'American',
                            reverse,
                            zero_pad,
                        ),
                    )
            # Otherwise, proceed as usual (var='American' mode, ostensibly)

        # Keep only the characters listed in self._uc_set (not defined in this
        # class; presumably the A-Z set provided by the Phonetic base class)
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if zero_pad:
                return '0' * max_length
            return '0'

        # Reverse word if computing Reverse Soundex
        if reverse:
            word = word[::-1]

        # apply the Soundex algorithm
        sdx = word.translate(self._trans)

        if var == 'special':
            sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
        else:
            sdx = sdx.replace('9', '')  # rule 1
        sdx = self._delete_consecutive_repeats(sdx)  # rule 3

        if word[0] in 'HW':
            # An initial H/W contributed a placeholder only (already removed,
            # or a '0' that is removed below), so nothing is cut from sdx
            sdx = word[0] + sdx
        else:
            # Replace the first digit with the literal first letter
            sdx = word[0] + sdx[1:]
        sdx = sdx.replace('0', '')  # rule 1

        if zero_pad:
            sdx += '0' * max_length  # rule 4

        return sdx[:max_length]


def soundex(word, max_length=4, var='American', reverse=False, zero_pad=True):
    """Compute the Soundex code of a word.

    Convenience function: it creates a :py:class:`Soundex` instance and
    forwards every argument unchanged to :py:meth:`Soundex.encode`.

    Args:
        word (str): The word to encode
        max_length (int): The requested length of the code (defaults to 4)
        var (str): Which variant of the algorithm to apply (defaults to
            ``American``):
                - ``American``: the American Soundex algorithm, as described
                  at :cite:`US:2007` and in :cite:`Knuth:1998`; also known as
                  Miracode
                - ``special``: the rules from the 1880-1910 US Census
                  retrospective re-analysis, in which h & w are treated as
                  vowels rather than as blocking consonants. Cf.
                  :cite:`Repici:2013`.
                - ``Census``: the rules laid out in GIL 55 :cite:`US:1997` by
                  the US Census, which include coding prefixed and unprefixed
                  versions of some names
        reverse (bool): If True, the word is reversed before encoding
            (defaults to False), giving "Reverse Soundex", which is useful
            for blocking when the start of the word may be in error.
        zero_pad (bool): If True, the result is padded at the end with 0s up
            to max_length

    Returns:
        str: The Soundex value, exactly as returned by
        :py:meth:`Soundex.encode`

    Examples:
        >>> soundex("Christopher")
        'C623'
        >>> soundex("Niall")
        'N400'
        >>> soundex('Smith')
        'S530'
        >>> soundex('Schmidt')
        'S530'

        >>> soundex('Christopher', max_length=-1)
        'C623160000000000000000000000000000000000000000000000000000000000'
        >>> soundex('Christopher', max_length=-1, zero_pad=False)
        'C62316'

        >>> soundex('Christopher', reverse=True)
        'R132'

        >>> soundex('Ashcroft')
        'A261'
        >>> soundex('Asicroft')
        'A226'
        >>> soundex('Ashcroft', var='special')
        'A226'
        >>> soundex('Asicroft', var='special')
        'A226'

    """
    encoder = Soundex()
    return encoder.encode(
        word,
        max_length=max_length,
        var=var,
        reverse=reverse,
        zero_pad=zero_pad,
    )


# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._soundex.
20
21		American Soundex
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34
35	1	from ._phonetic import Phonetic
36
37	1	__all__ = ['Soundex', 'soundex']
38
39
40	1	class Soundex(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""Soundex.
42
43		Three variants of Soundex are implemented:
44
45		- 'American' follows the American Soundex algorithm, as described at
46		:cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
47		Miracode
48		- 'special' follows the rules from the 1880-1910 US Census
49		retrospective re-analysis, in which h & w are not treated as blocking
50		consonants but as vowels. Cf. :cite:`Repici:2013`.
51		- 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
52		US Census, including coding prefixed and unprefixed versions of some
53		names
54		"""
55
56	1	_trans = dict(
57		zip(
58		(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
59		'01230129022455012623019202',
60		)
61		)
62
63	1	def encode(
		0 ignored issues – show best-practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Too many arguments (6/5) Loading history... Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
64		self, word, max_length=4, var='American', reverse=False, zero_pad=True
		0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
65		):
66		"""Return the Soundex code for a word.
67
68		Args:
69		word (str): The word to transform
70		max_length (int): The length of the code returned (defaults to 4)
71		var (str): The variant of the algorithm to employ (defaults to
72		``American``):
73		- ``American`` follows the American Soundex algorithm, as
74		described at :cite:`US:2007` and in :cite:`Knuth:1998`;
75		this is also called Miracode
76		- ``special`` follows the rules from the 1880-1910 US
77		Census retrospective re-analysis, in which h & w are not
78		treated as blocking consonants but as vowels. Cf.
79		:cite:`Repici:2013`.
80		- ``Census`` follows the rules laid out in GIL 55
81		:cite:`US:1997` by the US Census, including coding
82		prefixed and unprefixed versions of some names
83		reverse (bool): Reverse the word before computing the selected
84		Soundex (defaults to False); This results in "Reverse Soundex",
85		which is useful for blocking in cases where the initial
86		elements may be in error.
87		zero_pad (bool): Pad the end of the return value with 0s to achieve
88		a max_length string
89
90		Returns:
91		str: The Soundex value
92
93		Examples:
94		>>> pe = Soundex()
95		>>> pe.encode("Christopher")
96		'C623'
97		>>> pe.encode("Niall")
98		'N400'
99		>>> pe.encode('Smith')
100		'S530'
101		>>> pe.encode('Schmidt')
102		'S530'
103
104		>>> pe.encode('Christopher', max_length=-1)
105		'C623160000000000000000000000000000000000000000000000000000000000'
106		>>> pe.encode('Christopher', max_length=-1, zero_pad=False)
107		'C62316'
108
109		>>> pe.encode('Christopher', reverse=True)
110		'R132'
111
112		>>> pe.encode('Ashcroft')
113		'A261'
114		>>> pe.encode('Asicroft')
115		'A226'
116		>>> pe.encode('Ashcroft', var='special')
117		'A226'
118		>>> pe.encode('Asicroft', var='special')
119		'A226'
120
121		"""
122		# Require a max_length of at least 4 and not more than 64
123	1	if max_length != -1:
124	1	max_length = min(max(4, max_length), 64)
125		else:
126	1	max_length = 64
127
128		# uppercase, normalize, decompose, and filter non-A-Z out
129	1	word = unicode_normalize('NFKD', text_type(word.upper()))
130	1	word = word.replace('ß', 'SS')
131
132	1	if var == 'Census':
133		# TODO: Should these prefixes be supplemented? (VANDE, DELA, VON)
		0 ignored issues – show Coding Style introduced 2018-10-05 09:55 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
134	1	if word[:3] in {'VAN', 'CON'} and len(word) > 4:
135	1	return (
136		soundex(word, max_length, 'American', reverse, zero_pad),
137		soundex(
138		word[3:], max_length, 'American', reverse, zero_pad
139		),
140		)
141	1	if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
142	1	return (
143		soundex(word, max_length, 'American', reverse, zero_pad),
144		soundex(
145		word[2:], max_length, 'American', reverse, zero_pad
146		),
147		)
148		# Otherwise, proceed as usual (var='American' mode, ostensibly)
149
150	1	word = ''.join(c for c in word if c in self._uc_set)
151
152		# Nothing to convert, return base case
153	1	if not word:
154	1	if zero_pad:
155	1	return '0' * max_length
156	1	return '0'
157
158		# Reverse word if computing Reverse Soundex
159	1	if reverse:
160	1	word = word[::-1]
161
162		# apply the Soundex algorithm
163	1	sdx = word.translate(self._trans)
164
165	1	if var == 'special':
166	1	sdx = sdx.replace('9', '0') # special rule for 1880-1910 census
167		else:
168	1	sdx = sdx.replace('9', '') # rule 1
169	1	sdx = self._delete_consecutive_repeats(sdx) # rule 3
170
171	1	if word[0] in 'HW':
172	1	sdx = word[0] + sdx
173		else:
174	1	sdx = word[0] + sdx[1:]
175	1	sdx = sdx.replace('0', '') # rule 1
176
177	1	if zero_pad:
178	1	sdx += '0' * max_length # rule 4
179
180	1	return sdx[:max_length]
181
182
183	1	def soundex(word, max_length=4, var='American', reverse=False, zero_pad=True):
184		"""Return the Soundex code for a word.
185
186		This is a wrapper for :py:meth:`Soundex.encode`.
187
188		Args:
189		word (str): The word to transform
190		max_length (int): The length of the code returned (defaults to 4)
191		var (str): The variant of the algorithm to employ (defaults to
192		``American``):
193		- ``American`` follows the American Soundex algorithm, as
194		described at :cite:`US:2007` and in :cite:`Knuth:1998`; this
195		is also called Miracode
196		- ``special`` follows the rules from the 1880-1910 US Census
197		retrospective re-analysis, in which h & w are not treated as
198		blocking consonants but as vowels. Cf. :cite:`Repici:2013`.
199		- ``Census`` follows the rules laid out in GIL 55
200		:cite:`US:1997` by the US Census, including coding prefixed
201		and unprefixed versions of some names
202		reverse (bool): Reverse the word before computing the selected
203		Soundex (defaults to False); This results in "Reverse Soundex",
204		which is useful for blocking in cases where the initial elements
205		may be in error.
206		zero_pad (bool): Pad the end of the return value with 0s to achieve a
207		max_length string
208
209		Returns:
210		str: The Soundex value
211
212		Examples:
213		>>> soundex("Christopher")
214		'C623'
215		>>> soundex("Niall")
216		'N400'
217		>>> soundex('Smith')
218		'S530'
219		>>> soundex('Schmidt')
220		'S530'
221
222		>>> soundex('Christopher', max_length=-1)
223		'C623160000000000000000000000000000000000000000000000000000000000'
224		>>> soundex('Christopher', max_length=-1, zero_pad=False)
225		'C62316'
226
227		>>> soundex('Christopher', reverse=True)
228		'R132'
229
230		>>> soundex('Ashcroft')
231		'A261'
232		>>> soundex('Asicroft')
233		'A226'
234		>>> soundex('Ashcroft', var='special')
235		'A226'
236		>>> soundex('Asicroft', var='special')
237		'A226'
238
239		"""
240	1	return Soundex().encode(word, max_length, var, reverse, zero_pad)
241
242
243		if __name__ == '__main__':
244		import doctest
245
246		doctest.testmod()
247

chrislit / abydos

Pull Request — master (#141)

abydos.phonetic._soundex.phonex() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like