1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
1 |
|
"""abydos.fingerprint._lightweight. |
20
|
|
|
|
21
|
|
|
The fingerprint.lightweight module implements string fingerprints developed |
22
|
|
|
by Cisłak & Grabowski in :cite:`Cislak:2017`: |
23
|
|
|
|
24
|
|
|
- occurrence fingerprint |
25
|
|
|
- occurrence halved fingerprint |
26
|
|
|
- count fingerprint |
27
|
|
|
- position fingerprint |
28
|
|
|
""" |
29
|
|
|
|
30
|
1 |
|
from __future__ import unicode_literals |
31
|
|
|
|
32
|
1 |
|
from collections import Counter |
33
|
|
|
|
34
|
1 |
|
__all__ = [ |
35
|
|
|
'MOST_COMMON_LETTERS', |
36
|
|
|
'MOST_COMMON_LETTERS_CG', |
37
|
|
|
'MOST_COMMON_LETTERS_DE', |
38
|
|
|
'MOST_COMMON_LETTERS_DE_LC', |
39
|
|
|
'MOST_COMMON_LETTERS_EN_LC', |
40
|
|
|
'count_fingerprint', |
41
|
|
|
'occurrence_fingerprint', |
42
|
|
|
'occurrence_halved_fingerprint', |
43
|
|
|
'position_fingerprint', |
44
|
|
|
] |
45
|
|
|
|
46
|
|
|
# fmt: off |
47
|
|
|
# most common letters, as defined in Cisłak & Grabowski |
48
|
1 |
|
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd', |
49
|
|
|
'l', 'c', 'u', 'm', 'w', 'f') |
50
|
|
|
|
51
|
|
|
# most common letters (case-folded to lowercase), as shown in Google Books |
52
|
|
|
# English n-grams, among letters a-z & digits 0-9 |
53
|
1 |
|
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l', |
54
|
|
|
'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', |
55
|
|
|
'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9', |
56
|
|
|
'3', '4', '8', '5', '6', '7') |
57
|
|
|
|
58
|
|
|
# most common letters, as shown in Google Books English n-grams, among letters |
59
|
|
|
# A-Z, a-z & digits 0-9 |
60
|
1 |
|
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd', |
61
|
|
|
'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k', |
62
|
|
|
'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H', |
63
|
|
|
'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j', |
64
|
|
|
'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q', |
65
|
|
|
'3', 'Z', '4', '8', '5', '6', '7',) |
66
|
|
|
|
67
|
|
|
# most common letters (case-folded to lowercase), as shown in Google Books |
68
|
|
|
# German n-grams, among letters (a-z and umlauted vowels & eszett) & digits 0-9 |
69
|
1 |
|
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u', |
70
|
|
|
'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z', |
71
|
|
|
'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q', |
72
|
|
|
'1', '2', '3', '4', '0', '5', '6', '9', '8', '7') |
73
|
|
|
|
74
|
|
|
# most common letters (case-folded to lowercase), as shown in Google Books |
75
|
|
|
# German n-grams, among letters (A-Z, a-z, umlauted vowels & eszett) & digits |
76
|
|
|
# 0-9 |
77
|
1 |
|
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u', |
78
|
|
|
'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z', |
79
|
|
|
'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G', |
80
|
|
|
'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I', |
81
|
|
|
'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O', |
82
|
|
|
'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2', |
83
|
|
|
'Y', '3', '4', '0', '5', '6', '9', '8', '7') |
84
|
|
|
# fmt: on |
85
|
|
|
|
86
|
|
|
|
87
|
1 |
|
def occurrence_fingerprint( |
88
|
|
|
word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG |
|
|
|
|
89
|
|
|
): |
90
|
|
|
"""Return the occurrence fingerprint. |
91
|
|
|
|
92
|
|
|
Based on the occurrence fingerprint from :cite:`Cislak:2017`. |
93
|
|
|
|
94
|
|
|
:param str word: the word to fingerprint |
95
|
|
|
:param int n_bits: number of bits in the fingerprint returned |
96
|
|
|
:param list most_common: the most common tokens in the target language, |
97
|
|
|
ordered by frequency |
98
|
|
|
:returns: the occurrence fingerprint |
99
|
|
|
:rtype: int |
100
|
|
|
|
101
|
|
|
>>> bin(occurrence_fingerprint('hat')) |
102
|
|
|
'0b110000100000000' |
103
|
|
|
>>> bin(occurrence_fingerprint('niall')) |
104
|
|
|
'0b10110000100000' |
105
|
|
|
>>> bin(occurrence_fingerprint('colin')) |
106
|
|
|
'0b1110000110000' |
107
|
|
|
>>> bin(occurrence_fingerprint('atcg')) |
108
|
|
|
'0b110000000010000' |
109
|
|
|
>>> bin(occurrence_fingerprint('entreatment')) |
110
|
|
|
'0b1110010010000100' |
111
|
|
|
""" |
112
|
1 |
|
word = set(word) |
113
|
1 |
|
fingerprint = 0 |
114
|
|
|
|
115
|
1 |
|
for letter in most_common: |
116
|
1 |
|
if letter in word: |
117
|
1 |
|
fingerprint += 1 |
118
|
1 |
|
n_bits -= 1 |
119
|
1 |
|
if n_bits: |
120
|
1 |
|
fingerprint <<= 1 |
121
|
|
|
else: |
122
|
1 |
|
break |
123
|
|
|
|
124
|
1 |
|
n_bits -= 1 |
125
|
1 |
|
if n_bits > 0: |
126
|
1 |
|
fingerprint <<= n_bits |
127
|
|
|
|
128
|
1 |
|
return fingerprint |
129
|
|
|
|
130
|
|
|
|
131
|
1 |
|
def occurrence_halved_fingerprint( |
132
|
|
|
word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG |
|
|
|
|
133
|
|
|
): |
134
|
|
|
"""Return the occurrence halved fingerprint. |
135
|
|
|
|
136
|
|
|
Based on the occurrence halved fingerprint from :cite:`Cislak:2017`. |
137
|
|
|
|
138
|
|
|
:param str word: the word to fingerprint |
139
|
|
|
:param int n_bits: number of bits in the fingerprint returned |
140
|
|
|
:param list most_common: the most common tokens in the target language, |
141
|
|
|
ordered by frequency |
142
|
|
|
:returns: the occurrence halved fingerprint |
143
|
|
|
:rtype: int |
144
|
|
|
|
145
|
|
|
>>> bin(occurrence_halved_fingerprint('hat')) |
146
|
|
|
'0b1010000000010' |
147
|
|
|
>>> bin(occurrence_halved_fingerprint('niall')) |
148
|
|
|
'0b10010100000' |
149
|
|
|
>>> bin(occurrence_halved_fingerprint('colin')) |
150
|
|
|
'0b1001010000' |
151
|
|
|
>>> bin(occurrence_halved_fingerprint('atcg')) |
152
|
|
|
'0b10100000000000' |
153
|
|
|
>>> bin(occurrence_halved_fingerprint('entreatment')) |
154
|
|
|
'0b1111010000110000' |
155
|
|
|
""" |
156
|
1 |
|
if n_bits % 2: |
157
|
1 |
|
n_bits += 1 |
158
|
|
|
|
159
|
1 |
|
w_len = len(word) // 2 |
160
|
1 |
|
w_1 = set(word[:w_len]) |
161
|
1 |
|
w_2 = set(word[w_len:]) |
162
|
1 |
|
fingerprint = 0 |
163
|
|
|
|
164
|
1 |
|
for letter in most_common: |
165
|
1 |
|
if n_bits: |
166
|
1 |
|
fingerprint <<= 1 |
167
|
1 |
|
if letter in w_1: |
168
|
1 |
|
fingerprint += 1 |
169
|
1 |
|
fingerprint <<= 1 |
170
|
1 |
|
if letter in w_2: |
171
|
1 |
|
fingerprint += 1 |
172
|
1 |
|
n_bits -= 2 |
173
|
|
|
else: |
174
|
1 |
|
break |
175
|
|
|
|
176
|
1 |
|
if n_bits > 0: |
177
|
1 |
|
fingerprint <<= n_bits |
178
|
|
|
|
179
|
1 |
|
return fingerprint |
180
|
|
|
|
181
|
|
|
|
182
|
1 |
|
def count_fingerprint(word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG): |
183
|
|
|
"""Return the count fingerprint. |
184
|
|
|
|
185
|
|
|
Based on the count fingerprint from :cite:`Cislak:2017`. |
186
|
|
|
|
187
|
|
|
:param str word: the word to fingerprint |
188
|
|
|
:param int n_bits: number of bits in the fingerprint returned |
189
|
|
|
:param list most_common: the most common tokens in the target language, |
190
|
|
|
ordered by frequency |
191
|
|
|
:returns: the count fingerprint |
192
|
|
|
:rtype: int |
193
|
|
|
|
194
|
|
|
>>> bin(count_fingerprint('hat')) |
195
|
|
|
'0b1010000000001' |
196
|
|
|
>>> bin(count_fingerprint('niall')) |
197
|
|
|
'0b10001010000' |
198
|
|
|
>>> bin(count_fingerprint('colin')) |
199
|
|
|
'0b101010000' |
200
|
|
|
>>> bin(count_fingerprint('atcg')) |
201
|
|
|
'0b1010000000000' |
202
|
|
|
>>> bin(count_fingerprint('entreatment')) |
203
|
|
|
'0b1111010000100000' |
204
|
|
|
""" |
205
|
1 |
|
if n_bits % 2: |
206
|
1 |
|
n_bits += 1 |
207
|
|
|
|
208
|
1 |
|
word = Counter(word) |
209
|
1 |
|
fingerprint = 0 |
210
|
|
|
|
211
|
1 |
|
for letter in most_common: |
212
|
1 |
|
if n_bits: |
213
|
1 |
|
fingerprint <<= 2 |
214
|
1 |
|
fingerprint += word[letter] & 3 |
215
|
1 |
|
n_bits -= 2 |
216
|
|
|
else: |
217
|
1 |
|
break |
218
|
|
|
|
219
|
1 |
|
if n_bits: |
220
|
1 |
|
fingerprint <<= n_bits |
221
|
|
|
|
222
|
1 |
|
return fingerprint |
223
|
|
|
|
224
|
|
|
|
225
|
1 |
|
def position_fingerprint( |
226
|
|
|
word, n_bits=16, most_common=MOST_COMMON_LETTERS_CG, bits_per_letter=3 |
|
|
|
|
227
|
|
|
): |
228
|
|
|
"""Return the position fingerprint. |
229
|
|
|
|
230
|
|
|
Based on the position fingerprint from :cite:`Cislak:2017`. |
231
|
|
|
|
232
|
|
|
:param str word: the word to fingerprint |
233
|
|
|
:param int n_bits: number of bits in the fingerprint returned |
234
|
|
|
:param list most_common: the most common tokens in the target language, |
235
|
|
|
ordered by frequency |
236
|
|
|
:param int bits_per_letter: the bits to assign for letter position |
237
|
|
|
:returns: the position fingerprint |
238
|
|
|
:rtype: int |
239
|
|
|
|
240
|
|
|
>>> bin(position_fingerprint('hat')) |
241
|
|
|
'0b1110100011111111' |
242
|
|
|
>>> bin(position_fingerprint('niall')) |
243
|
|
|
'0b1111110101110010' |
244
|
|
|
>>> bin(position_fingerprint('colin')) |
245
|
|
|
'0b1111111110010111' |
246
|
|
|
>>> bin(position_fingerprint('atcg')) |
247
|
|
|
'0b1110010001111111' |
248
|
|
|
>>> bin(position_fingerprint('entreatment')) |
249
|
|
|
'0b101011111111' |
250
|
|
|
""" |
251
|
1 |
|
position = {} |
252
|
1 |
|
for pos, letter in enumerate(word): |
253
|
1 |
|
if letter not in position and letter in most_common: |
254
|
1 |
|
position[letter] = min(pos, 2 ** bits_per_letter - 1) |
255
|
|
|
|
256
|
1 |
|
fingerprint = 0 |
257
|
|
|
|
258
|
1 |
|
for letter in most_common: |
259
|
1 |
|
if n_bits: |
260
|
1 |
|
fingerprint <<= min(bits_per_letter, n_bits) |
261
|
1 |
|
if letter in position: |
262
|
1 |
|
fingerprint += min(position[letter], 2 ** n_bits - 1) |
263
|
|
|
else: |
264
|
1 |
|
fingerprint += min(2 ** bits_per_letter - 1, 2 ** n_bits - 1) |
265
|
1 |
|
n_bits -= min(bits_per_letter, n_bits) |
266
|
|
|
else: |
267
|
1 |
|
break |
268
|
|
|
|
269
|
1 |
|
for _ in range(n_bits): |
270
|
1 |
|
fingerprint <<= 1 |
271
|
1 |
|
fingerprint += 1 |
272
|
|
|
|
273
|
1 |
|
return fingerprint |
274
|
|
|
|
275
|
|
|
|
276
|
|
|
if __name__ == '__main__': |
277
|
|
|
import doctest |
278
|
|
|
|
279
|
|
|
doctest.testmod() |
280
|
|
|
|