Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._eudex   A

Complexity

Total Complexity 6

Size/Duplication

Total Lines 273
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 155
dl 0
loc 273
ccs 26
cts 26
cp 1
rs 10
c 0
b 0
f 0
wmc 6

1 Function

Rating   Name   Duplication   Size   Complexity  
A eudex() 0 32 1

1 Method

Rating   Name   Duplication   Size   Complexity  
B Eudex.encode() 0 62 5
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._eudex.
20
21
Eudex phonetic hash
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from six.moves import range
32
33 1
from ._phonetic import _Phonetic
34
35 1
__all__ = ['Eudex', 'eudex']
36
37
38 1
class Eudex(_Phonetic):
    """Eudex hash.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at :cite:`Ticki:2016`.

    Further details can be found at :cite:`Ticki:2016b`.
    """

    # 8-bit values used for every character after the first one of a word.
    # Shares its key set with ``_initial_phones``.
    _trailing_phones = {
        'a': 0,  # a
        'b': 0b01001000,  # b
        'c': 0b00001100,  # c
        'd': 0b00011000,  # d
        'e': 0,  # e
        'f': 0b01000100,  # f
        'g': 0b00001000,  # g
        'h': 0b00000100,  # h
        'i': 1,  # i
        'j': 0b00000101,  # j
        'k': 0b00001001,  # k
        'l': 0b10100000,  # l
        'm': 0b00000010,  # m
        'n': 0b00010010,  # n
        'o': 0,  # o
        'p': 0b01001001,  # p
        'q': 0b10101000,  # q
        'r': 0b10100001,  # r
        's': 0b00010100,  # s
        't': 0b00011101,  # t
        'u': 1,  # u
        'v': 0b01000101,  # v
        'w': 0b00000000,  # w
        'x': 0b10000100,  # x
        'y': 1,  # y
        'z': 0b10010100,  # z
        'ß': 0b00010101,  # ß
        'à': 0,  # à
        'á': 0,  # á
        'â': 0,  # â
        'ã': 0,  # ã
        'ä': 0,  # ä[æ]
        'å': 1,  # å[oː]
        'æ': 0,  # æ[æ]
        'ç': 0b10010101,  # ç[t͡ʃ]
        'è': 1,  # è
        'é': 1,  # é
        'ê': 1,  # ê
        'ë': 1,  # ë
        'ì': 1,  # ì
        'í': 1,  # í
        'î': 1,  # î
        'ï': 1,  # ï
        'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
        'ò': 0,  # ò
        'ó': 0,  # ó
        'ô': 0,  # ô
        'õ': 0,  # õ
        'ö': 1,  # ö[ø]
        '÷': 0b11111111,  # ÷
        'ø': 1,  # ø[ø]
        'ù': 1,  # ù
        'ú': 1,  # ú
        'û': 1,  # û
        'ü': 1,  # ü
        'ý': 1,  # ý
        'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        'ÿ': 1,  # ÿ
    }

    # 8-bit values used for the first character of a word. Its keys also
    # define which characters survive input filtering in ``encode``.
    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,  # b
        'c': 0b00000110,  # c
        'd': 0b00001100,  # d
        'e': 0b11011000,  # e*
        'f': 0b00100010,  # f
        'g': 0b00000100,  # g
        'h': 0b00000010,  # h
        'i': 0b11111000,  # i*
        'j': 0b00000011,  # j
        'k': 0b00000101,  # k
        'l': 0b01010000,  # l
        'm': 0b00000001,  # m
        'n': 0b00001001,  # n
        'o': 0b10010100,  # o*
        'p': 0b00100101,  # p
        'q': 0b01010100,  # q
        'r': 0b01010001,  # r
        's': 0b00001010,  # s
        't': 0b00001110,  # t
        'u': 0b11100000,  # u*
        'v': 0b00100011,  # v
        'w': 0b00000000,  # w
        'x': 0b01000010,  # x
        'y': 0b11100100,  # y*
        'z': 0b01001010,  # z
        'ß': 0b00001011,  # ß
        'à': 0b10000101,  # à
        'á': 0b10000101,  # á
        'â': 0b10000000,  # â
        'ã': 0b10000110,  # ã
        'ä': 0b10100110,  # ä [æ]
        'å': 0b11000010,  # å [oː]
        'æ': 0b10100111,  # æ [æ]
        'ç': 0b01010100,  # ç [t͡ʃ]
        'è': 0b11011001,  # è
        'é': 0b11011001,  # é
        'ê': 0b11011001,  # ê
        'ë': 0b11000110,  # ë [ə] or [œ]
        'ì': 0b11111001,  # ì
        'í': 0b11111001,  # í
        'î': 0b11111001,  # î
        'ï': 0b11111001,  # ï
        'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        'ò': 0b10010101,  # ò
        'ó': 0b10010101,  # ó
        'ô': 0b10010101,  # ô
        'õ': 0b10010101,  # õ
        'ö': 0b11011100,  # ö [œ] or [ø]
        '÷': 0b11111111,  # ÷
        'ø': 0b11011101,  # ø [œ] or [ø]
        'ù': 0b11100001,  # ù
        'ú': 0b11100001,  # ú
        'û': 0b11100001,  # û
        'ü': 0b11100101,  # ü
        'ý': 0b11100101,  # ý
        'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,  # ÿ
    }

    def encode(self, word, max_length=8):
        """Return the eudex phonetic hash of a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The number of 8-bit character values packed into the hash, so
            the result spans ``8 * max_length`` bits (default 8, i.e. a
            64-bit hash)

        Returns
        -------
        int
            The eudex hash

        Examples
        --------
        >>> pe = Eudex()
        >>> pe.encode('Colin')
        432345564238053650
        >>> pe.encode('Christopher')
        433648490138894409
        >>> pe.encode('Niall')
        648518346341351840
        >>> pe.encode('Smith')
        720575940412906756
        >>> pe.encode('Schmidt')
        720589151732307997

        """
        # Lowercase input & filter unknown characters
        word = ''.join(
            char for char in word.lower() if char in self._initial_phones
        )

        # Nothing usable left: fall back to the all-ones placeholder symbol
        if not word:
            word = '÷'

        # Perform initial eudex coding of each character: the first
        # character uses the initial table, all others the trailing table
        values = [self._initial_phones[word[0]]]
        values += [self._trailing_phones[char] for char in word[1:]]

        # Drop a value when it equals its immediate predecessor once the
        # lowest bit is ignored (compared after a right-shift by one); the
        # first value is always kept
        condensed_values = [values[0]] + [
            current
            for previous, current in zip(values, values[1:])
            if current >> 1 != previous >> 1
        ]

        # Add padding after first character & trim beyond max_length
        values = (
            [condensed_values[0]]
            + [0] * max(0, max_length - len(condensed_values))
            + condensed_values[1:max_length]
        )

        # Combine individual character values into eudex hash, eight bits
        # per value with the first character in the most significant byte
        hash_value = 0
        for val in values:
            hash_value = (hash_value << 8) | val

        return hash_value
233
234
235 1
def eudex(word, max_length=8):
    """Return the eudex phonetic hash of a word.

    This is a wrapper for :py:meth:`Eudex.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The number of 8-bit character values packed into the hash, so the
        result spans ``8 * max_length`` bits (default 8, i.e. a 64-bit hash)

    Returns
    -------
    int
        The eudex hash

    Examples
    --------
    >>> eudex('Colin')
    432345564238053650
    >>> eudex('Christopher')
    433648490138894409
    >>> eudex('Niall')
    648518346341351840
    >>> eudex('Smith')
    720575940412906756
    >>> eudex('Schmidt')
    720589151732307997

    """
    return Eudex().encode(word, max_length)
267
268
269
# Run the docstring examples as doctests when executed as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
273