abydos.phonetic._fuzzy_soundex.fuzzy_soundex() - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.phonetic._fuzzy_soundex.fuzzy_soundex() A

↳ Parent: abydos.phonetic._fuzzy_soundex

Complexity

Conditions

Size

Total Lines	32
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	3
dl	0
loc	32
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._fuzzy_soundex.

Fuzzy Soundex
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import _Phonetic

__all__ = ['FuzzySoundex', 'fuzzy_soundex']


class FuzzySoundex(_Phonetic):
    """Fuzzy Soundex phonetic encoder.

    Implements Fuzzy Soundex, a Soundex-derived algorithm defined in
    :cite:`Holmes:2002`.
    """

    # Letter (by code point) -> code character. '-' marks the letters
    # H, W and Y, which are stripped from the translated string instead of
    # being encoded.
    _trans = {
        ord(letter): digit
        for letter, digit in zip(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '0193017-07745501769301-7-9'
        )
    }

    # Two-letter word beginnings and the pair that replaces them.
    _initial_rewrites = {
        'CS': 'SS',
        'CZ': 'SS',
        'TS': 'SS',
        'TZ': 'SS',
        'GN': 'NN',
        'HR': 'RR',
        'WR': 'RR',
        'HW': 'WW',
        'KN': 'NN',
        'NG': 'NN',
    }

    # Word endings, tested in this order; only the first match is applied.
    _final_rewrites = (
        ('CH', 'KK'),
        ('NT', 'TT'),
        ('RT', 'RR'),
        ('RDT', 'RR'),
    )

    # Substitutions applied throughout the word. The order is significant:
    # each replacement operates on the result of the previous one.
    _inner_rewrites = (
        ('CA', 'KA'),
        ('CC', 'KK'),
        ('CK', 'KK'),
        ('CE', 'SE'),
        ('CHL', 'KL'),
        ('CL', 'KL'),
        ('CHR', 'KR'),
        ('CR', 'KR'),
        ('CI', 'SI'),
        ('CO', 'KO'),
        ('CU', 'KU'),
        ('CY', 'SY'),
        ('DG', 'GG'),
        ('GH', 'HH'),
        ('MAC', 'MK'),
        ('MC', 'MK'),
        ('NST', 'NSS'),
        ('PF', 'FF'),
        ('PH', 'FF'),
        ('SCH', 'SSS'),
        ('TIO', 'SIO'),
        ('TIA', 'SIO'),
        ('TCH', 'CHH'),
    )

    def encode(self, word, max_length=5, zero_pad=True):
        """Return the Fuzzy Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The maximum length of the code returned (defaults to 5). A value
            of -1 selects 64; any other value is clamped to the range
            [4, 64].
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string

        Returns
        -------
        str
            The Fuzzy Soundex value. For an empty word this is a string of
            max_length 0s when zero_pad is set, and '0' otherwise.

        Examples
        --------
        >>> pe = FuzzySoundex()
        >>> pe.encode('Christopher')
        'K6931'
        >>> pe.encode('Niall')
        'N4000'
        >>> pe.encode('Smith')
        'S5300'

        """
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        # Resolve the effective code length: -1 means "longest allowed".
        if max_length == -1:
            max_length = 64
        else:
            max_length = min(max(4, max_length), 64)

        if not word:
            return '0' * max_length if zero_pad else '0'

        # Rewrite the leading digraph, if it is one of the known ones.
        lead = word[:2]
        if lead in self._initial_rewrites:
            word = self._initial_rewrites[lead] + word[2:]

        # Rewrite the first matching word ending.
        for ending, replacement in self._final_rewrites:
            if word.endswith(ending):
                word = word[: -len(ending)] + replacement
                break

        for cluster, replacement in self._inner_rewrites:
            word = word.replace(cluster, replacement)

        digits = word.translate(self._trans).replace('-', '')
        digits = self._delete_consecutive_repeats(digits)

        # The code starts with the (rewritten) first letter itself. H, W
        # and Y produced no digit above, so only for other initials must
        # the leading digit be dropped to make room for the letter.
        initial = word[0]
        if initial not in ('H', 'W', 'Y'):
            digits = digits[1:]
        code = (initial + digits).replace('0', '')

        if zero_pad:
            code += '0' * max_length

        return code[:max_length]


def fuzzy_soundex(word, max_length=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    This is a convenience wrapper that creates a :py:class:`FuzzySoundex`
    instance and calls :py:meth:`FuzzySoundex.encode` on it.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 5)
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length string

    Returns
    -------
    str
        The Fuzzy Soundex value

    Examples
    --------
    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'

    """
    encoder = FuzzySoundex()
    return encoder.encode(word, max_length=max_length, zero_pad=zero_pad)


if __name__ == '__main__':
    # When this module is executed directly as a script, run the doctest
    # examples embedded in its docstrings.
    import doctest

    doctest.testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._fuzzy_soundex.
20
21		Fuzzy Soundex
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34
35	1	from ._phonetic import _Phonetic
36
37	1	__all__ = ['FuzzySoundex', 'fuzzy_soundex']
38
39
40	1	class FuzzySoundex(_Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""Fuzzy Soundex.
42
43		Fuzzy Soundex is an algorithm derived from Soundex, defined in
44		:cite:`Holmes:2002`.
45		"""
46
47	1	_trans = dict(
48		zip(
49		(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
50		'0193017-07745501769301-7-9',
51		)
52		)
53
54	1	def encode(self, word, max_length=5, zero_pad=True):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
55		"""Return the Fuzzy Soundex code for a word.
56
57		Parameters
58		----------
59		word : str
60		The word to transform
61		max_length : int
62		The length of the code returned (defaults to 4)
63		zero_pad : bool
64		Pad the end of the return value with 0s to achieve a max_length
65		string
66
67		Returns
68		-------
69		str
70		The Fuzzy Soundex value
71
72		Examples
73		--------
74		>>> pe = FuzzySoundex()
75		>>> pe.encode('Christopher')
76		'K6931'
77		>>> pe.encode('Niall')
78		'N4000'
79		>>> pe.encode('Smith')
80		'S5300'
81		>>> pe.encode('Smith')
82		'S5300'
83
84		"""
85	1	word = unicode_normalize('NFKD', text_type(word.upper()))
86	1	word = word.replace('ß', 'SS')
87
88		# Clamp max_length to [4, 64]
89	1	if max_length != -1:
90	1	max_length = min(max(4, max_length), 64)
91		else:
92	1	max_length = 64
93
94	1	if not word:
95	1	if zero_pad:
96	1	return '0' * max_length
97	1	return '0'
98
99	1	if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}:
100	1	word = 'SS' + word[2:]
101	1	elif word[:2] == 'GN':
102	1	word = 'NN' + word[2:]
103	1	elif word[:2] in {'HR', 'WR'}:
104	1	word = 'RR' + word[2:]
105	1	elif word[:2] == 'HW':
106	1	word = 'WW' + word[2:]
107	1	elif word[:2] in {'KN', 'NG'}:
108	1	word = 'NN' + word[2:]
109
110	1	if word[-2:] == 'CH':
111	1	word = word[:-2] + 'KK'
112	1	elif word[-2:] == 'NT':
113	1	word = word[:-2] + 'TT'
114	1	elif word[-2:] == 'RT':
115	1	word = word[:-2] + 'RR'
116	1	elif word[-3:] == 'RDT':
117	1	word = word[:-3] + 'RR'
118
119	1	word = word.replace('CA', 'KA')
120	1	word = word.replace('CC', 'KK')
121	1	word = word.replace('CK', 'KK')
122	1	word = word.replace('CE', 'SE')
123	1	word = word.replace('CHL', 'KL')
124	1	word = word.replace('CL', 'KL')
125	1	word = word.replace('CHR', 'KR')
126	1	word = word.replace('CR', 'KR')
127	1	word = word.replace('CI', 'SI')
128	1	word = word.replace('CO', 'KO')
129	1	word = word.replace('CU', 'KU')
130	1	word = word.replace('CY', 'SY')
131	1	word = word.replace('DG', 'GG')
132	1	word = word.replace('GH', 'HH')
133	1	word = word.replace('MAC', 'MK')
134	1	word = word.replace('MC', 'MK')
135	1	word = word.replace('NST', 'NSS')
136	1	word = word.replace('PF', 'FF')
137	1	word = word.replace('PH', 'FF')
138	1	word = word.replace('SCH', 'SSS')
139	1	word = word.replace('TIO', 'SIO')
140	1	word = word.replace('TIA', 'SIO')
141	1	word = word.replace('TCH', 'CHH')
142
143	1	sdx = word.translate(self._trans)
144	1	sdx = sdx.replace('-', '')
145
146		# remove repeating characters
147	1	sdx = self._delete_consecutive_repeats(sdx)
148
149	1	if word[0] in {'H', 'W', 'Y'}:
150	1	sdx = word[0] + sdx
151		else:
152	1	sdx = word[0] + sdx[1:]
153
154	1	sdx = sdx.replace('0', '')
155
156	1	if zero_pad:
157	1	sdx += '0' * max_length
158
159	1	return sdx[:max_length]
160
161
162	1	def fuzzy_soundex(word, max_length=5, zero_pad=True):
163		"""Return the Fuzzy Soundex code for a word.
164
165		This is a wrapper for :py:meth:`FuzzySoundex.encode`.
166
167		Parameters
168		----------
169		word : str
170		The word to transform
171		max_length : int
172		The length of the code returned (defaults to 4)
173		zero_pad : bool
174		Pad the end of the return value with 0s to achieve a max_length string
175
176		Returns
177		-------
178		str
179		The Fuzzy Soundex value
180
181		Examples
182		--------
183		>>> fuzzy_soundex('Christopher')
184		'K6931'
185		>>> fuzzy_soundex('Niall')
186		'N4000'
187		>>> fuzzy_soundex('Smith')
188		'S5300'
189		>>> fuzzy_soundex('Smith')
190		'S5300'
191
192		"""
193	1	return FuzzySoundex().encode(word, max_length, zero_pad)
194
195
196		if __name__ == '__main__':
197		import doctest
198
199		doctest.testmod()
200

chrislit / abydos

Push — master ( f43547...71985b )

abydos.phonetic._fuzzy_soundex.fuzzy_soundex() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like