abydos.phonetic._soundex - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.phonetic._soundex A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	244
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	17
eloc	60
dl	0
loc	244
ccs	34
cts	34
cp	1
rs	10
c	0
b	0
f	0

3 Methods

Rating	Name	Size	Complexity
A	Soundex.__init__()	47	2
A	Soundex.encode_alpha()	31	1
F	Soundex.encode()	101	14

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._soundex.

American Soundex
"""

from typing import Any
from unicodedata import normalize as unicode_normalize

from ._phonetic import _Phonetic

__all__ = ['Soundex']


class Soundex(_Phonetic):
    """Soundex.

    Three variants of Soundex are implemented:

    - 'American' follows the American Soundex algorithm, as described at
      :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
      Miracode
    - 'special' follows the rules from the 1880-1910 US Census
      retrospective re-analysis, in which h & w are not treated as blocking
      consonants but as vowels. Cf. :cite:`Repici:2013`.
    - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
      US Census, including coding prefixed and unprefixed versions of some
      names

    .. versionadded:: 0.3.6
    """

    # Maps each uppercase letter (by code point) to its Soundex digit:
    # 0 for the vowels and Y, 9 for H and W, 1-6 for the consonant classes.
    _trans = dict(
        zip(
            (ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230129022455012623019202',
        )
    )

    # Maps each Soundex digit (by code point) to a representative letter,
    # used by encode_alpha to render a purely alphabetic code.
    _alphabetic = dict(zip((ord(_) for _ in '01234569'), 'APKTLNRH'))

    def __init__(
        self,
        max_length: int = 4,
        var: str = 'American',
        reverse: bool = False,
        zero_pad: bool = True,
    ) -> None:
        """Initialize Soundex instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 4). Values are
            clamped to the range 4 to 64; the special value -1 selects the
            maximum length of 64.
        var : str
            The variant of the algorithm to employ (defaults to ``American``):

                - ``American`` follows the American Soundex algorithm, as
                  described at :cite:`US:2007` and in :cite:`Knuth:1998`; this
                  is also called Miracode
                - ``special`` follows the rules from the 1880-1910 US Census
                  retrospective re-analysis, in which h & w are not treated as
                  blocking consonants but as vowels. Cf. :cite:`Repici:2013`.
                - ``Census`` follows the rules laid out in GIL 55
                  :cite:`US:1997` by the US Census, including coding prefixed
                  and unprefixed versions of some names

        reverse : bool
            Reverse the word before computing the selected Soundex (defaults to
            False); This results in "Reverse Soundex", which is useful for
            blocking in cases where the initial elements may be in error.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string (defaults to True)


        .. versionadded:: 0.4.0

        """
        # Require a max_length of at least 4 and not more than 64;
        # -1 is shorthand for "as long as allowed"
        if max_length != -1:
            self._max_length = min(max(4, max_length), 64)
        else:
            self._max_length = 64

        self._var = var
        self._reverse = reverse
        self._zero_pad = zero_pad

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Soundex code for a word.

        Trailing zero padding is dropped and every digit after the leading
        character is replaced by a representative letter. When ``encode``
        returns two comma-separated codes (``Census`` variant), each code is
        converted separately and the results are joined by a comma again.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Soundex value

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode_alpha("Christopher")
        'CRKT'
        >>> pe.encode_alpha("Niall")
        'NL'
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Schmidt')
        'SNT'


        .. versionadded:: 0.4.0

        """
        # Handle every comma-separated code on its own: stripping and
        # translating the joined string would leave the zero padding of the
        # first code in place and render it as 'A' (e.g. 'D400,L000' would
        # become 'DLAA,L' instead of 'DL,L').
        parts = (code.rstrip('0') for code in self.encode(word).split(','))
        return ','.join(
            part[:1] + part[1:].translate(self._alphabetic) for part in parts
        )

    def encode(self, word: str, **kwargs: Any) -> str:
        """Return the Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        **kwargs : Any
            Only ``recurse`` is consulted: passing ``recurse=False`` disables
            the prefixed/unprefixed double coding of the ``Census`` variant.
            It is set internally when the two codes are computed.

        Returns
        -------
        str
            The Soundex value; for the ``Census`` variant and a word with a
            recognized prefix, two codes separated by a comma

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode("Christopher")
        'C623'
        >>> pe.encode("Niall")
        'N400'
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Schmidt')
        'S530'

        >>> Soundex(max_length=-1).encode('Christopher')
        'C623160000000000000000000000000000000000000000000000000000000000'
        >>> Soundex(max_length=-1, zero_pad=False).encode('Christopher')
        'C62316'

        >>> Soundex(reverse=True).encode('Christopher')
        'R132'

        >>> pe.encode('Ashcroft')
        'A261'
        >>> pe.encode('Asicroft')
        'A226'

        >>> pe_special = Soundex(var='special')
        >>> pe_special.encode('Ashcroft')
        'A226'
        >>> pe_special.encode('Asicroft')
        'A226'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made return a str only (comma-separated)

        """
        # uppercase and decompose (NFKD); characters outside self._uc_set
        # are filtered out further down, after the Census prefix check
        word = unicode_normalize('NFKD', word.upper())

        if self._var == 'Census' and kwargs.get('recurse', True) is not False:
            # Code the name both with and without its prefix
            if word[:3] in {'VAN', 'CON'} and len(word) > 4:
                return '{0},{1}'.format(
                    self.encode(word, recurse=False),
                    self.encode(word[3:], recurse=False),
                )
            if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
                return '{0},{1}'.format(
                    self.encode(word, recurse=False),
                    self.encode(word[2:], recurse=False),
                )
            # Otherwise, proceed as usual (var='American' mode, ostensibly)

        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if self._zero_pad:
                return '0' * self._max_length
            return '0'

        # Reverse word if computing Reverse Soundex
        if self._reverse:
            word = word[::-1]

        # apply the Soundex algorithm
        sdx = word.translate(self._trans)

        if self._var == 'special':
            # special rule for 1880-1910 census: H & W behave like vowels
            sdx = sdx.replace('9', '0')
        else:
            # rule 1: drop H & W so equal codes around them collapse below
            sdx = sdx.replace('9', '')
        sdx = self._delete_consecutive_repeats(sdx)  # rule 3

        # The code starts with the first letter itself. Unless that letter
        # is H or W, the first digit of sdx is the first letter's own code
        # and is replaced by the letter.
        if word[0] in 'HW':
            sdx = word[0] + sdx
        else:
            sdx = word[0] + sdx[1:]
        sdx = sdx.replace('0', '')  # rule 1

        if self._zero_pad:
            sdx += '0' * self._max_length  # rule 4

        return sdx[: self._max_length]


if __name__ == '__main__':
    # Run the doctest examples embedded in this module's docstrings.
    from doctest import testmod

    testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.phonetic._soundex.
18
19	1	American Soundex
20		"""
21
22		from typing import Any
23		from unicodedata import normalize as unicode_normalize
24	1
25		from ._phonetic import _Phonetic
26
27		__all__ = ['Soundex']
28
29
30		class Soundex(_Phonetic):
31	1	"""Soundex.
32
33	1	Three variants of Soundex are implemented:
34
35	1	- 'American' follows the American Soundex algorithm, as described at
36		:cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
37	1	Miracode
38	1	- 'special' follows the rules from the 1880-1910 US Census
39		retrospective re-analysis, in which h & w are not treated as blocking
40	1	consonants but as vowels. Cf. :cite:`Repici:2013`.
41		- 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
42		US Census, including coding prefixed and unprefixed versions of some
43	1	names
44
45		.. versionadded:: 0.3.6
46		"""
47
48		_trans = dict(
49		zip(
50		(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
51		'01230129022455012623019202',
52		)
53		)
54
55		_alphabetic = dict(zip((ord(_) for _ in '01234569'), 'APKTLNRH'))
56
57		def __init__(
58		self,
59		max_length: int = 4,
60		var: str = 'American',
61	1	reverse: bool = False,
62		zero_pad: bool = True,
63		) -> None:
64		"""Initialize Soundex instance.
65
66		Parameters
67		----------
68	1	max_length : int
69		The length of the code returned (defaults to 4)
70	1	var : str
71		The variant of the algorithm to employ (defaults to ``American``):
72
73		- ``American`` follows the American Soundex algorithm, as
74		described at :cite:`US:2007` and in :cite:`Knuth:1998`; this
75		is also called Miracode
76		- ``special`` follows the rules from the 1880-1910 US Census
77		retrospective re-analysis, in which h & w are not treated as
78		blocking consonants but as vowels. Cf. :cite:`Repici:2013`.
79		- ``Census`` follows the rules laid out in GIL 55
80		:cite:`US:1997` by the US Census, including coding prefixed
81		and unprefixed versions of some names
82
83		reverse : bool
84		Reverse the word before computing the selected Soundex (defaults to
85		False); This results in "Reverse Soundex", which is useful for
86		blocking in cases where the initial elements may be in error.
87		zero_pad : bool
88		Pad the end of the return value with 0s to achieve a max_length
89		string
90
91
92		.. versionadded:: 0.4.0
93
94		"""
95		# Require a max_length of at least 4 and not more than 64
96		if max_length != -1:
97		self._max_length = min(max(4, max_length), 64)
98		else:
99		self._max_length = 64
100
101		self._var = var
102		self._reverse = reverse
103		self._zero_pad = zero_pad
104
105	1	def encode_alpha(self, word: str) -> str:
106	1	"""Return the alphabetic Soundex code for a word.
107
108	1	Parameters
109		----------
110	1	word : str
111	1	The word to transform
112	1
113		Returns
114	1	-------
115		str
116		The alphabetic Soundex value
117
118		Examples
119		--------
120		>>> pe = Soundex()
121		>>> pe.encode_alpha("Christopher")
122		'CRKT'
123		>>> pe.encode_alpha("Niall")
124		'NL'
125		>>> pe.encode_alpha('Smith')
126		'SNT'
127		>>> pe.encode_alpha('Schmidt')
128		'SNT'
129
130
131		.. versionadded:: 0.4.0
132
133		"""
134		code = self.encode(word).rstrip('0')
135		return code[:1] + code[1:].translate(self._alphabetic)
136
137		def encode(self, word: str, **kwargs: Any) -> str:
138		"""Return the Soundex code for a word.
139
140		Parameters
141		----------
142		word : str
143	1	The word to transform
144	1
145		Returns
146	1	-------
147		str
148		The Soundex value
149
150		Examples
151		--------
152		>>> pe = Soundex()
153		>>> pe.encode("Christopher")
154		'C623'
155		>>> pe.encode("Niall")
156		'N400'
157		>>> pe.encode('Smith')
158		'S530'
159		>>> pe.encode('Schmidt')
160		'S530'
161
162		>>> Soundex(max_length=-1).encode('Christopher')
163		'C623160000000000000000000000000000000000000000000000000000000000'
164		>>> Soundex(max_length=-1, zero_pad=False).encode('Christopher')
165		'C62316'
166
167		>>> Soundex(reverse=True).encode('Christopher')
168		'R132'
169
170		>>> pe.encode('Ashcroft')
171		'A261'
172		>>> pe.encode('Asicroft')
173		'A226'
174
175		>>> pe_special = Soundex(var='special')
176		>>> pe_special.encode('Ashcroft')
177		'A226'
178		>>> pe_special.encode('Asicroft')
179		'A226'
180
181
182		.. versionadded:: 0.1.0
183		.. versionchanged:: 0.3.6
184		Encapsulated in class
185		.. versionchanged:: 0.6.0
186		Made return a str only (comma-separated)
187
188		"""
189		# uppercase, normalize, decompose, and filter non-A-Z out
190		word = unicode_normalize('NFKD', word.upper())
191
192		if self._var == 'Census' and (
193		'recurse' not in kwargs or kwargs['recurse'] is not False
194		):
195		if word[:3] in {'VAN', 'CON'} and len(word) > 4:
196		return '{0},{1}'.format(
197	1	self.encode(word, recurse=False),
198	1	self.encode(word[3:], recurse=False),
199		)
200	1	if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
201	1	return '{0},{1}'.format(
202	1	self.encode(word, recurse=False),
203		self.encode(word[2:], recurse=False),
204		)
205		# Otherwise, proceed as usual (var='American' mode, ostensibly)
206
207		word = ''.join(c for c in word if c in self._uc_set)
208
209		# Nothing to convert, return base case
210		if not word:
211		if self._zero_pad:
212		return '0' * self._max_length
213		return '0'
214
215		# Reverse word if computing Reverse Soundex
216		if self._reverse:
217		word = word[::-1]
218	1
219	1	# apply the Soundex algorithm
220		sdx = word.translate(self._trans)
221
222		if self._var == 'special':
223		sdx = sdx.replace('9', '0') # special rule for 1880-1910 census
224		else:
225		sdx = sdx.replace('9', '') # rule 1
226		sdx = self._delete_consecutive_repeats(sdx) # rule 3
227
228		if word[0] in 'HW':
229		sdx = word[0] + sdx
230		else:
231		sdx = word[0] + sdx[1:]
232		sdx = sdx.replace('0', '') # rule 1
233
234		if self._zero_pad:
235		sdx += '0' * self._max_length # rule 4
236
237	1	return sdx[: self._max_length]
238
239
240	1	if __name__ == '__main__':
241	1	import doctest
242	1
243		doctest.testmod()
244

chrislit / abydos

abydos.phonetic._soundex A last analyzed 2020-12-31 20:10 UTC

Complexity

Size/Duplication

Test Coverage

Importance

3 Methods

Duplication Side-by-Side

Filter issues like

abydos.phonetic._soundex A
last analyzed 2020-12-31 20:10 UTC