Completed
Branch master (78a222)
by Chris
14:36
created

abydos.fingerprint._lightweight   A

Complexity

Total Complexity 25

Size/Duplication

Total Lines 280
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 25
eloc 109
dl 0
loc 280
ccs 75
cts 75
cp 1
rs 10
c 0
b 0
f 0

4 Functions

Rating   Name   Duplication   Size   Complexity  
A count_fingerprint() 0 41 5
B position_fingerprint() 0 49 8
A occurrence_fingerprint() 0 42 5
B occurrence_halved_fingerprint() 0 49 7
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.fingerprint._lightweight.
20
21
The fingerprint.lightweight module implements string fingerprints developed
22
by Cisłak & Grabowski in :cite:`Cislak:2017`:
23
24
    - occurrence fingerprint
25
    - occurrence halved fingerprint
26
    - count fingerprint
27
    - position fingerprint
28
"""
29
30 1
from __future__ import unicode_literals
31
32 1
from collections import Counter
33
34 1
__all__ = [
35
    'MOST_COMMON_LETTERS',
36
    'MOST_COMMON_LETTERS_CG',
37
    'MOST_COMMON_LETTERS_DE',
38
    'MOST_COMMON_LETTERS_DE_LC',
39
    'MOST_COMMON_LETTERS_EN_LC',
40
    'count_fingerprint',
41
    'occurrence_fingerprint',
42
    'occurrence_halved_fingerprint',
43
    'position_fingerprint',
44
]
45
46
# fmt: off
47
# most common letters, as defined in Cisłak & Grabowski
48 1
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
49
                          'l', 'c', 'u', 'm', 'w', 'f')
50
51
# most common letters (case-folded to lowercase), as shown in Google Books
52
# English n-grams, among letters a-z & digits 0-9
53 1
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l',
54
                             'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b',
55
                             'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9',
56
                             '3', '4', '8', '5', '6', '7')
57
58
# most common letters, as shown in Google Books English n-grams, among letters
59
# A-Z, a-z & digits 0-9
60 1
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd',
61
                       'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k',
62
                       'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H',
63
                       'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j',
64
                       'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q',
65
                       '3', 'Z', '4', '8', '5', '6', '7',)
66
67
# most common letters (case-folded to lowercase), as shown in Google Books
68
# German n-grams, among letters (a-z and umlauted vowels & eszett) & digits 0-9
69 1
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
70
                          'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
71
                          'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q',
72
                          '1', '2', '3', '4', '0', '5', '6', '9', '8', '7')
73
74
# most common letters (case-folded to lowercase), as shown in Google Books
75
# German n-grams, among letters (A-Z, a-z, umlauted vowels & eszett) & digits
76
# 0-9
77 1
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
78
                             'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
79
                             'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G',
80
                             'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I',
81
                             'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O',
82
                             'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2',
83
                             'Y', '3', '4', '0', '5', '6', '9', '8', '7')
84
# fmt: on
85
86
87 1
def occurrence_fingerprint(
88
    word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
89
):
90
    """Return the occurrence fingerprint.
91
92
    Based on the occurrence fingerprint from :cite:`Cislak:2017`.
93
94
    :param str word: the word to fingerprint
95
    :param int n_bits: number of bits in the fingerprint returned
96
    :param list most_common: the most common tokens in the target language,
97
        ordered by frequency
98
    :returns: the occurrence fingerprint
99
    :rtype: int
100
101
    >>> bin(occurrence_fingerprint('hat'))
102
    '0b110000100000000'
103
    >>> bin(occurrence_fingerprint('niall'))
104
    '0b10110000100000'
105
    >>> bin(occurrence_fingerprint('colin'))
106
    '0b1110000110000'
107
    >>> bin(occurrence_fingerprint('atcg'))
108
    '0b110000000010000'
109
    >>> bin(occurrence_fingerprint('entreatment'))
110
    '0b1110010010000100'
111
    """
112 1
    word = set(word)
113 1
    fingerprint = 0
114
115 1
    for letter in most_common:
116 1
        if letter in word:
117 1
            fingerprint += 1
118 1
        n_bits -= 1
119 1
        if n_bits:
120 1
            fingerprint <<= 1
121
        else:
122 1
            break
123
124 1
    n_bits -= 1
125 1
    if n_bits > 0:
126 1
        fingerprint <<= n_bits
127
128 1
    return fingerprint
129
130
131 1
def occurrence_halved_fingerprint(
132
    word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
133
):
134
    """Return the occurrence halved fingerprint.
135
136
    Based on the occurrence halved fingerprint from :cite:`Cislak:2017`.
137
138
    :param str word: the word to fingerprint
139
    :param int n_bits: number of bits in the fingerprint returned
140
    :param list most_common: the most common tokens in the target language,
141
        ordered by frequency
142
    :returns: the occurrence halved fingerprint
143
    :rtype: int
144
145
    >>> bin(occurrence_halved_fingerprint('hat'))
146
    '0b1010000000010'
147
    >>> bin(occurrence_halved_fingerprint('niall'))
148
    '0b10010100000'
149
    >>> bin(occurrence_halved_fingerprint('colin'))
150
    '0b1001010000'
151
    >>> bin(occurrence_halved_fingerprint('atcg'))
152
    '0b10100000000000'
153
    >>> bin(occurrence_halved_fingerprint('entreatment'))
154
    '0b1111010000110000'
155
    """
156 1
    if n_bits % 2:
157 1
        n_bits += 1
158
159 1
    w_len = len(word) // 2
160 1
    w_1 = set(word[:w_len])
161 1
    w_2 = set(word[w_len:])
162 1
    fingerprint = 0
163
164 1
    for letter in most_common:
165 1
        if n_bits:
166 1
            fingerprint <<= 1
167 1
            if letter in w_1:
168 1
                fingerprint += 1
169 1
            fingerprint <<= 1
170 1
            if letter in w_2:
171 1
                fingerprint += 1
172 1
            n_bits -= 2
173
        else:
174 1
            break
175
176 1
    if n_bits > 0:
177 1
        fingerprint <<= n_bits
178
179 1
    return fingerprint
180
181
182 1
def count_fingerprint(word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG):
183
    """Return the count fingerprint.
184
185
    Based on the count fingerprint from :cite:`Cislak:2017`.
186
187
    :param str word: the word to fingerprint
188
    :param int n_bits: number of bits in the fingerprint returned
189
    :param list most_common: the most common tokens in the target language,
190
        ordered by frequency
191
    :returns: the count fingerprint
192
    :rtype: int
193
194
    >>> bin(count_fingerprint('hat'))
195
    '0b1010000000001'
196
    >>> bin(count_fingerprint('niall'))
197
    '0b10001010000'
198
    >>> bin(count_fingerprint('colin'))
199
    '0b101010000'
200
    >>> bin(count_fingerprint('atcg'))
201
    '0b1010000000000'
202
    >>> bin(count_fingerprint('entreatment'))
203
    '0b1111010000100000'
204
    """
205 1
    if n_bits % 2:
206 1
        n_bits += 1
207
208 1
    word = Counter(word)
209 1
    fingerprint = 0
210
211 1
    for letter in most_common:
212 1
        if n_bits:
213 1
            fingerprint <<= 2
214 1
            fingerprint += word[letter] & 3
215 1
            n_bits -= 2
216
        else:
217 1
            break
218
219 1
    if n_bits:
220 1
        fingerprint <<= n_bits
221
222 1
    return fingerprint
223
224
225 1
def position_fingerprint(
226
    word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG, bits_per_letter=3
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
227
):
228
    """Return the position fingerprint.
229
230
    Based on the position fingerprint from :cite:`Cislak:2017`.
231
232
    :param str word: the word to fingerprint
233
    :param int n_bits: number of bits in the fingerprint returned
234
    :param list most_common: the most common tokens in the target language,
235
        ordered by frequency
236
    :param int bits_per_letter: the bits to assign for letter position
237
    :returns: the position fingerprint
238
    :rtype: int
239
240
    >>> bin(position_fingerprint('hat'))
241
    '0b1110100011111111'
242
    >>> bin(position_fingerprint('niall'))
243
    '0b1111110101110010'
244
    >>> bin(position_fingerprint('colin'))
245
    '0b1111111110010111'
246
    >>> bin(position_fingerprint('atcg'))
247
    '0b1110010001111111'
248
    >>> bin(position_fingerprint('entreatment'))
249
    '0b101011111111'
250
    """
251 1
    position = {}
252 1
    for pos, letter in enumerate(word):
253 1
        if letter not in position and letter in most_common:
254 1
            position[letter] = min(pos, 2 ** bits_per_letter - 1)
255
256 1
    fingerprint = 0
257
258 1
    for letter in most_common:
259 1
        if n_bits:
260 1
            fingerprint <<= min(bits_per_letter, n_bits)
261 1
            if letter in position:
262 1
                fingerprint += min(position[letter], 2 ** n_bits - 1)
263
            else:
264 1
                fingerprint += min(2 ** bits_per_letter - 1, 2 ** n_bits - 1)
265 1
            n_bits -= min(bits_per_letter, n_bits)
266
        else:
267 1
            break
268
269 1
    for _ in range(n_bits):
270 1
        fingerprint <<= 1
271 1
        fingerprint += 1
272
273 1
    return fingerprint
274
275
276
if __name__ == '__main__':
277
    import doctest
278
279
    doctest.testmod()
280