abydos.phonetic._soundex.Soundex.__init__() - Code Metrics - Inspection of "Merge pull request #248 from chrislit/0.6.0" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( c2a3b6...15a61d )

by Chris

created 2020-01-12 22:24 UTC

abydos.phonetic._soundex.Soundex.__init__() A

↳ Parent: abydos.phonetic._soundex

Complexity

Conditions

Size

Total Lines	43
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	3
CRAP Score	2

Importance

Changes

Metric	Value
eloc	8
dl	0
loc	43
ccs	3
cts	3
cp	1
rs	10
c	0
b	0
f	0
cc	2
nop	5
crap	2

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._soundex.

American Soundex
"""

from unicodedata import normalize as unicode_normalize

from ._phonetic import _Phonetic

__all__ = ['Soundex']


class Soundex(_Phonetic):
    """Soundex.

    Three variants of Soundex are implemented:

    - 'American' follows the American Soundex algorithm, as described at
      :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
      Miracode
    - 'special' follows the rules from the 1880-1910 US Census
      retrospective re-analysis, in which h & w are not treated as blocking
      consonants but as vowels. Cf. :cite:`Repici:2013`.
    - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
      US Census, including coding prefixed and unprefixed versions of some
      names

    .. versionadded:: 0.3.6
    """

    # Maps each uppercase letter (keyed by code point, as str.translate
    # requires) to its Soundex digit. '0' marks A, E, I, O, U & Y; '9' marks
    # H & W, whose treatment depends on the selected variant (see encode).
    _trans = dict(
        zip(
            (ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230129022455012623019202',
        )
    )

    # Maps each Soundex digit (keyed by code point) to the letter that
    # stands in for it in the alphabetic form produced by encode_alpha.
    _alphabetic = dict(zip((ord(_) for _ in '01234569'), 'APKTLNRH'))

    def __init__(
        self, max_length=4, var='American', reverse=False, zero_pad=True
    ):
        """Initialize Soundex instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 4). Values other
            than -1 are clamped to the range 4 to 64; -1 selects the maximum
            of 64.
        var : str
            The variant of the algorithm to employ (defaults to ``American``):

                - ``American`` follows the American Soundex algorithm, as
                  described at :cite:`US:2007` and in :cite:`Knuth:1998`; this
                  is also called Miracode
                - ``special`` follows the rules from the 1880-1910 US Census
                  retrospective re-analysis, in which h & w are not treated as
                  blocking consonants but as vowels. Cf. :cite:`Repici:2013`.
                - ``Census`` follows the rules laid out in GIL 55
                  :cite:`US:1997` by the US Census, including coding prefixed
                  and unprefixed versions of some names

        reverse : bool
            Reverse the word before computing the selected Soundex (defaults to
            False); This results in "Reverse Soundex", which is useful for
            blocking in cases where the initial elements may be in error.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string


        .. versionadded:: 0.4.0

        """
        # Require a max_length of at least 4 and not more than 64
        if max_length != -1:
            self._max_length = min(max(4, max_length), 64)
        else:
            self._max_length = 64

        self._var = var
        self._reverse = reverse
        self._zero_pad = zero_pad

    def _alphabetize(self, code):
        """Return the alphabetic form of a single Soundex code.

        Parameters
        ----------
        code : str
            A Soundex code, as returned by :py:meth:`encode`

        Returns
        -------
        str
            The code with trailing zero padding removed and every digit
            after the first character replaced by its letter equivalent

        """
        code = code.rstrip('0')
        # The first character is already a letter (or nothing, if the code
        # consisted solely of zeros); only the remainder is translated.
        return code[:1] + code[1:].translate(self._alphabetic)

    def encode_alpha(self, word):
        """Return the alphabetic Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str or tuple of str
            The alphabetic Soundex value. Under the ``Census`` variant, a
            name with a recognized prefix yields a pair of values, matching
            the pair of codes returned by :py:meth:`encode`.

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode_alpha("Christopher")
        'CRKT'
        >>> pe.encode_alpha("Niall")
        'NL'
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Schmidt')
        'SNT'

        >>> Soundex(var='Census').encode_alpha('Vandeusen')
        ('VNTK', 'DKN')


        .. versionadded:: 0.4.0

        """
        code = self.encode(word)
        # The Census variant may return a pair of codes rather than a single
        # string; convert each member instead of failing on the tuple.
        if isinstance(code, tuple):
            return tuple(self._alphabetize(part) for part in code)
        return self._alphabetize(code)

    def encode(self, word, **kwargs):
        """Return the Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        **kwargs
            ``recurse=False`` is passed internally by the ``Census`` variant
            to suppress prefix handling when coding each member of a pair;
            no other keyword is consulted.

        Returns
        -------
        str or tuple of str
            The Soundex value. Under the ``Census`` variant, a name beginning
            with VAN or CON (and longer than 4 characters) or with DE, DI,
            LA, or LE (and longer than 3 characters) yields a pair: the code
            of the full name and the code of the name without its prefix.

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode("Christopher")
        'C623'
        >>> pe.encode("Niall")
        'N400'
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Schmidt')
        'S530'

        >>> Soundex(max_length=-1).encode('Christopher')
        'C623160000000000000000000000000000000000000000000000000000000000'
        >>> Soundex(max_length=-1, zero_pad=False).encode('Christopher')
        'C62316'

        >>> Soundex(reverse=True).encode('Christopher')
        'R132'

        >>> pe.encode('Ashcroft')
        'A261'
        >>> pe.encode('Asicroft')
        'A226'

        >>> pe_special = Soundex(var='special')
        >>> pe_special.encode('Ashcroft')
        'A226'
        >>> pe_special.encode('Asicroft')
        'A226'

        >>> Soundex(var='Census').encode('Vandeusen')
        ('V532', 'D250')


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, normalize, decompose, and filter non-A-Z out
        word = unicode_normalize('NFKD', word.upper())

        # Prefix handling applies only on the outermost call; the two nested
        # calls below pass recurse=False so each member is coded plainly.
        if self._var == 'Census' and kwargs.get('recurse') is not False:
            if word[:3] in {'VAN', 'CON'} and len(word) > 4:
                return (
                    self.encode(word, recurse=False),
                    self.encode(word[3:], recurse=False),
                )
            if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
                return (
                    self.encode(word, recurse=False),
                    self.encode(word[2:], recurse=False),
                )
            # Otherwise, proceed as usual (var='American' mode, ostensibly)

        # _uc_set is not defined in this class; presumably inherited from
        # _Phonetic as the set of uppercase A-Z.
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if self._zero_pad:
                return '0' * self._max_length
            return '0'

        # Reverse word if computing Reverse Soundex
        if self._reverse:
            word = word[::-1]

        # apply the Soundex algorithm
        sdx = word.translate(self._trans)

        if self._var == 'special':
            # H & W become vowel placeholders, so they separate repeats
            sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
        else:
            # H & W vanish, so equal codes on either side collapse together
            sdx = sdx.replace('9', '')  # rule 1
        sdx = self._delete_consecutive_repeats(sdx)  # rule 3

        # The code keeps the first letter itself rather than its digit.
        # Outside the special variant, an initial H or W has already had its
        # '9' removed, so there is no leading digit to drop and the letter is
        # simply prepended; in the special variant its '0' placeholder is
        # stripped by the replace below.
        if word[0] in 'HW':
            sdx = word[0] + sdx
        else:
            sdx = word[0] + sdx[1:]
        sdx = sdx.replace('0', '')  # rule 1

        if self._zero_pad:
            sdx += '0' * self._max_length  # rule 4

        return sdx[: self._max_length]


if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.phonetic._soundex.
18
19	1	American Soundex
20		"""
21
22		from unicodedata import normalize as unicode_normalize
23
24	1	from ._phonetic import _Phonetic
25
26		__all__ = ['Soundex']
27
28
29		class Soundex(_Phonetic):
30		"""Soundex.
31	1
32		Three variants of Soundex are implemented:
33	1
34		- 'American' follows the American Soundex algorithm, as described at
35	1	:cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
36		Miracode
37	1	- 'special' follows the rules from the 1880-1910 US Census
38	1	retrospective re-analysis, in which h & w are not treated as blocking
39		consonants but as vowels. Cf. :cite:`Repici:2013`.
40	1	- 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
41		US Census, including coding prefixed and unprefixed versions of some
42		names
43	1
44		.. versionadded:: 0.3.6
45		"""
46
47		_trans = dict(
48		zip(
49		(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
		Issue — Comprehensibility / Best Practice (introduced 2018-10-24 06:00 UTC; 0 ignored issues): The variable `_` does not seem to be defined.
50		'01230129022455012623019202',
51		)
52		)
53
54		_alphabetic = dict(zip((ord(_) for _ in '01234569'), 'APKTLNRH'))
55
56		def __init__(
57		self, max_length=4, var='American', reverse=False, zero_pad=True
58		):
59		"""Initialize Soundex instance.
60
61	1	Parameters
62		----------
63		max_length : int
64		The length of the code returned (defaults to 4)
65		var : str
66		The variant of the algorithm to employ (defaults to ``American``):
67
68	1	- ``American`` follows the American Soundex algorithm, as
69		described at :cite:`US:2007` and in :cite:`Knuth:1998`; this
70	1	is also called Miracode
71		- ``special`` follows the rules from the 1880-1910 US Census
72		retrospective re-analysis, in which h & w are not treated as
73		blocking consonants but as vowels. Cf. :cite:`Repici:2013`.
74		- ``Census`` follows the rules laid out in GIL 55
75		:cite:`US:1997` by the US Census, including coding prefixed
76		and unprefixed versions of some names
77
78		reverse : bool
79		Reverse the word before computing the selected Soundex (defaults to
80		False); This results in "Reverse Soundex", which is useful for
81		blocking in cases where the initial elements may be in error.
82		zero_pad : bool
83		Pad the end of the return value with 0s to achieve a max_length
84		string
85
86
87		.. versionadded:: 0.4.0
88
89		"""
90		# Require a max_length of at least 4 and not more than 64
91		if max_length != -1:
92		self._max_length = min(max(4, max_length), 64)
93		else:
94		self._max_length = 64
95
96		self._var = var
97		self._reverse = reverse
98		self._zero_pad = zero_pad
99
100		def encode_alpha(self, word):
101		"""Return the alphabetic Soundex code for a word.
102
103		Parameters
104		----------
105	1	word : str
106	1	The word to transform
107
108	1	Returns
109		-------
110	1	str
111	1	The alphabetic Soundex value
112	1
113		Examples
114	1	--------
115		>>> pe = Soundex()
116		>>> pe.encode_alpha("Christopher")
117		'CRKT'
118		>>> pe.encode_alpha("Niall")
119		'NL'
120		>>> pe.encode_alpha('Smith')
121		'SNT'
122		>>> pe.encode_alpha('Schmidt')
123		'SNT'
124
125
126		.. versionadded:: 0.4.0
127
128		"""
129		code = self.encode(word).rstrip('0')
130		return code[:1] + code[1:].translate(self._alphabetic)
131
132		def encode(self, word, **kwargs):
133		"""Return the Soundex code for a word.
134
135		Parameters
136		----------
137		word : str
138		The word to transform
139
140		Returns
141		-------
142		str
143	1	The Soundex value
144	1
145		Examples
146	1	--------
147		>>> pe = Soundex()
148		>>> pe.encode("Christopher")
149		'C623'
150		>>> pe.encode("Niall")
151		'N400'
152		>>> pe.encode('Smith')
153		'S530'
154		>>> pe.encode('Schmidt')
155		'S530'
156
157		>>> Soundex(max_length=-1).encode('Christopher')
158		'C623160000000000000000000000000000000000000000000000000000000000'
159		>>> Soundex(max_length=-1, zero_pad=False).encode('Christopher')
160		'C62316'
161
162		>>> Soundex(reverse=True).encode('Christopher')
163		'R132'
164
165		>>> pe.encode('Ashcroft')
166		'A261'
167		>>> pe.encode('Asicroft')
168		'A226'
169
170		>>> pe_special = Soundex(var='special')
171		>>> pe_special.encode('Ashcroft')
172		'A226'
173		>>> pe_special.encode('Asicroft')
174		'A226'
175
176
177		.. versionadded:: 0.1.0
178		.. versionchanged:: 0.3.6
179		Encapsulated in class
180
181		"""
182		# uppercase, normalize, decompose, and filter non-A-Z out
183		word = unicode_normalize('NFKD', word.upper())
184
185		if self._var == 'Census' and (
186		'recurse' not in kwargs or kwargs['recurse'] is not False
187		):
188		if word[:3] in {'VAN', 'CON'} and len(word) > 4:
189		return (
190		self.encode(word, recurse=False),
191		self.encode(word[3:], recurse=False),
192		)
193		if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
194		return (
195		self.encode(word, recurse=False),
196		self.encode(word[2:], recurse=False),
197	1	)
198	1	# Otherwise, proceed as usual (var='American' mode, ostensibly)
199
200	1	word = ''.join(c for c in word if c in self._uc_set)
201	1
202	1	# Nothing to convert, return base case
203		if not word:
204		if self._zero_pad:
205		return '0' * self._max_length
206		return '0'
207
208		# Reverse word if computing Reverse Soundex
209		if self._reverse:
210		word = word[::-1]
211
212		# apply the Soundex algorithm
213		sdx = word.translate(self._trans)
214
215		if self._var == 'special':
216		sdx = sdx.replace('9', '0') # special rule for 1880-1910 census
217		else:
218	1	sdx = sdx.replace('9', '') # rule 1
219	1	sdx = self._delete_consecutive_repeats(sdx) # rule 3
220
221		if word[0] in 'HW':
222		sdx = word[0] + sdx
223		else:
224		sdx = word[0] + sdx[1:]
225		sdx = sdx.replace('0', '') # rule 1
226
227		if self._zero_pad:
228		sdx += '0' * self._max_length # rule 4
229
230		return sdx[: self._max_length]
231
232
233		if __name__ == '__main__':
234		import doctest
235
236		doctest.testmod()
237

chrislit / abydos

Push — master ( c2a3b6...15a61d )

abydos.phonetic._soundex.Soundex.__init__() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

abydos.phonetic._soundex.Soundex.__init__() A