1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
|
|
"""abydos.fingerprint. |
20
|
|
|
|
21
|
|
|
The fingerprint module implements string fingerprint algorithms such as: |
22
|
|
|
- string fingerprint |
23
|
|
|
- q-gram fingerprint |
24
|
|
|
- phonetic fingerprint |
25
|
|
|
- Pollock & Zamora's skeleton key |
26
|
|
|
- Pollock & Zamora's omission key |
27
|
|
|
- Cisłak & Grabowski's occurrence fingerprint |
28
|
|
|
- Cisłak & Grabowski's occurrence halved fingerprint |
29
|
|
|
- Cisłak & Grabowski's count fingerprint |
30
|
|
|
- Cisłak & Grabowski's position fingerprint |
31
|
|
|
- Synoname Toolcode |
32
|
|
|
""" |
33
|
|
|
|
34
|
|
|
from __future__ import division, unicode_literals |
35
|
|
|
|
36
|
|
|
import unicodedata |
37
|
|
|
from collections import Counter |
38
|
|
|
|
39
|
|
|
from six import text_type |
40
|
|
|
|
41
|
|
|
from .phonetic import double_metaphone |
42
|
|
|
from .qgram import QGrams |
43
|
|
|
|
44
|
|
|
|
45
|
|
|
def str_fingerprint(phrase, joiner=' '):
    """Return string fingerprint.

    The fingerprint of a string is a string consisting of all of the unique
    words in a string, alphabetized & concatenated with intervening joiners

    :param str phrase: the string from which to calculate the fingerprint
    :param str joiner: the string that will be placed between each word
    :returns: the fingerprint of the phrase
    :rtype: str

    >>> str_fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
    """
    # lowercase, strip, and decompose to NFKD before filtering
    normalized = unicodedata.normalize('NFKD',
                                       text_type(phrase.strip().lower()))
    # keep only alphanumerics and whitespace
    cleaned = ''.join(c for c in normalized if c.isalnum() or c.isspace())
    # unique words, alphabetized, joined
    return joiner.join(sorted(set(cleaned.split())))
63
|
|
|
|
64
|
|
|
|
65
|
|
|
def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''):
    """Return Q-Gram fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique q-grams
    in a string, alphabetized & concatenated.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s) to concatenate on either
        end of the phrase, as defined in abydos.util.qgram()
    :param str joiner: the string that will be placed between each word
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    # normalize case and decompose, then drop everything non-alphanumeric
    normalized = unicodedata.normalize('NFKD',
                                       text_type(phrase.strip().lower()))
    alnum_only = ''.join([c for c in normalized if c.isalnum()])
    # tokenize into q-grams and return them sorted & concatenated
    grams = QGrams(alnum_only, qval, start_stop)
    return joiner.join(sorted(grams))
92
|
|
|
|
93
|
|
|
|
94
|
|
|
def phonetic_fingerprint(phrase, phonetic_algorithm=double_metaphone,
                         joiner=' ', *args):
    """Return the phonetic fingerprint of a phrase.

    A phonetic fingerprint is identical to a standard string fingerprint, as
    implemented in str_fingerprint(), but performs the fingerprinting
    function after converting the string to its phonetic form, as determined
    by some phonetic algorithm.

    :param str phrase: the string from which to calculate the phonetic
        fingerprint
    :param function phonetic_algorithm: a phonetic algorithm that takes a
        string and returns a string (presumably a phonetic representation of
        the original string) By default, this function uses
        abydos.phonetic.double_metaphone()
    :param str joiner: the string that will be placed between each word
    :param args: additional arguments to pass to the phonetic algorithm,
        along with the phrase itself
    :returns: the phonetic fingerprint of the phrase
    :rtype: str

    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.')
    '0 afr fks jmpt kk ls prn tk'
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... phonetic_algorithm=soundex)
    'b650 d200 f200 j513 l200 o160 q200 t000'
    """
    phonetic_words = []
    for word in phrase.split():
        word = phonetic_algorithm(word, *args)
        # Some algorithms (e.g. double_metaphone) return an iterable of
        # candidate encodings; keep only the primary one.
        if not isinstance(word, text_type) and hasattr(word, '__iter__'):
            word = word[0]
        phonetic_words.append(word)
    # Join the collected words instead of appending `word + joiner` and
    # trimming with phonetic[:-len(joiner)]: when joiner was '', the old
    # slice evaluated to phonetic[:0] and discarded the entire result.
    return str_fingerprint(joiner.join(phonetic_words))
129
|
|
|
|
130
|
|
|
|
131
|
|
|
def skeleton_key(word):
    """Return the skeleton key.

    The skeleton key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
    in Scientific and Scholarly Text." Communications of the ACM, 27(4).
    358--368. <http://dl.acm.org/citation.cfm?id=358048>

    The key is the word's first letter, followed by its remaining unique
    consonants in order of appearance, followed by its unique vowels in
    order of appearance.

    :param str word: the word to transform into its skeleton key
    :returns: the skeleton key
    :rtype: str

    >>> skeleton_key('The quick brown fox jumped over the lazy dog.')
    'THQCKBRWNFXJMPDVLZYGEUIOA'
    >>> skeleton_key('Christopher')
    'CHRSTPIOE'
    >>> skeleton_key('Niall')
    'NLIA'
    """
    _vowels = set('AEIOU')
    _alphabet = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, decompose to NFKD, and drop anything outside A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in _alphabet)

    first = word[0:1]
    consonant_run = ''
    vowel_run = ''

    # collect unique consonants & vowels from the tail of the word,
    # skipping any repeat of the first character
    for ch in word[1:]:
        if ch == first:
            continue
        if ch in _vowels:
            if ch not in vowel_run:
                vowel_run += ch
        elif ch not in consonant_run:
            consonant_run += ch

    # first char, then consonants, then vowels
    return first + consonant_run + vowel_run
172
|
|
|
|
173
|
|
|
|
174
|
|
|
def omission_key(word):
    """Return the omission key.

    The omission key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
    in Scientific and Scholarly Text." Communications of the ACM, 27(4).
    358--368. <http://dl.acm.org/citation.cfm?id=358048>

    The key lists the word's consonants in a fixed order, then its remaining
    unique letters (the vowels) in order of first appearance.

    :param str word: the word to transform into its omission key
    :returns: the omission key
    :rtype: str

    >>> omission_key('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> omission_key('Christopher')
    'PHCTSRIOE'
    >>> omission_key('Niall')
    'LNIA'
    """
    # the fixed consonant ordering used by the omission key
    _consonants = 'JKQXZVWYBFMGPDHCLNTSR'

    # uppercase, decompose to NFKD, and drop anything outside A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word
                   if c in set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

    # consonants present in the word, in _consonants order (no duplicates)
    key = ''.join(cons for cons in _consonants if cons in word)

    # then vowels in the order they appeared in the word (no duplicates)
    for ch in word:
        if ch not in _consonants and ch not in key:
            key += ch

    return key
215
|
|
|
|
216
|
|
|
|
217
|
|
|
# TODO: Dump all these to a data file.
# most common letters, as defined in Cisłak & Grabowski
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
                          'l', 'c', 'u', 'm', 'w', 'f')

# most common letters (case-folded to lowercase), as shown in Google Books
# English n-grams, among letters a-z & digits 0-9
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l',
                             'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b',
                             'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9',
                             '3', '4', '8', '5', '6', '7')

# most common letters, as shown in Google Books English n-grams, among letters
# A-Z, a-z & digits 0-9
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd',
                       'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k',
                       'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H',
                       'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j',
                       'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q',
                       '3', 'Z', '4', '8', '5', '6', '7',)

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (a-z and umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                          'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                          'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q',
                          '1', '2', '3', '4', '0', '5', '6', '9', '8', '7')

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (A-Z, a-z, umlauted vowels & eszett) & digits
# 0-9
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                             'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                             'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G',
                             'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I',
                             'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O',
                             'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2',
                             'Y', '3', '4', '0', '5', '6', '9', '8', '7')


def occurrence_fingerprint(word, n_bits=16,
                           most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence fingerprint.

    Based on the occurrence fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    One bit per entry of most_common, most-significant first: the bit is set
    iff that letter occurs anywhere in the word.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence fingerprint
    :rtype: int
    """
    letters = set(word)
    fingerprint = 0

    for token in most_common:
        if token in letters:
            fingerprint |= 1
        n_bits -= 1
        if not n_bits:
            break
        fingerprint <<= 1

    # pad with zero bits if most_common ran out before n_bits did
    if n_bits:
        fingerprint <<= n_bits

    return fingerprint


def occurrence_halved_fingerprint(word, n_bits=16,
                                  most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence halved fingerprint.

    Based on the occurrence halved fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Two bits per entry of most_common: the first marks occurrence in the
    first half of the word, the second in the second half.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence halved fingerprint
    :rtype: int
    """
    # bits are consumed two at a time, so round n_bits up to even
    if n_bits % 2:
        n_bits += 1

    midpoint = len(word)//2
    front = set(word[:midpoint])
    back = set(word[midpoint:])
    fingerprint = 0

    for token in most_common:
        if token in front:
            fingerprint |= 1
        fingerprint <<= 1
        if token in back:
            fingerprint |= 1
        n_bits -= 2
        if not n_bits:
            break
        fingerprint <<= 1

    # pad with zero bits if most_common ran out before n_bits did
    if n_bits:
        fingerprint <<= n_bits

    return fingerprint


def count_fingerprint(word, n_bits=16,
                      most_common=MOST_COMMON_LETTERS_CG):
    """Return the count fingerprint.

    Based on the count fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Two bits per entry of most_common holding that letter's count in the
    word, truncated to the low two bits (i.e. count mod 4).

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the count fingerprint
    :rtype: int
    """
    # bits are consumed two at a time, so round n_bits up to even
    if n_bits % 2:
        n_bits += 1

    counts = Counter(word)
    fingerprint = 0

    for token in most_common:
        # keep only the low two bits of the count
        fingerprint += counts[token] & 3
        n_bits -= 2
        if not n_bits:
            break
        fingerprint <<= 2

    # pad with zero bits if most_common ran out before n_bits did
    if n_bits:
        fingerprint <<= n_bits

    return fingerprint


def position_fingerprint(word, n_bits=16,
                         most_common=MOST_COMMON_LETTERS_CG,
                         bits_per_letter=3):
    """Return the position fingerprint.

    Based on the position fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :param bits_per_letter: the bits to assign for letter position
    :return: the position fingerprint
    :rtype: int
    """
    # first-occurrence index of each common letter, saturated to the
    # largest value representable in bits_per_letter bits
    cap = 2**bits_per_letter - 1
    first_pos = {}
    for idx, token in enumerate(word):
        if token in most_common and token not in first_pos:
            first_pos[token] = min(idx, cap)

    fingerprint = 0
    for token in most_common:
        if token in first_pos:
            fingerprint += min(first_pos[token], 2**n_bits - 1)
        n_bits -= bits_per_letter
        if n_bits <= 0:
            break
        # the final field may get fewer than bits_per_letter bits
        fingerprint <<= min(bits_per_letter, n_bits)

    # pad with zero bits if most_common ran out before n_bits did
    if n_bits > 0:
        fingerprint <<= n_bits

    return fingerprint
402
|
|
|
|
403
|
|
|
|
404
|
|
|
def synoname_toolcode(lname, fname='', qual='', normalize=0):
    """Build the Synoname toolcode.

    The toolcode is a summary string used to pre-screen name pairs for the
    Synoname name-matching procedure.

    :param str lname: last name
    :param str fname: first name (can be blank)
    :param str qual: qualifier
    :param int normalize: normalization level: 0 performs no normalization;
        1 performs the comma flip (moving everything after a comma in lname
        onto the front of fname); 2 additionally moves generation terms and
        Roman numerals from fname to lname
    :returns: the (possibly normalized) lname, fname, and the toolcode
    :rtype: tuple
    """
    # bit flags indicating where in the full name a special term may match
    method_dict = {'end': 1, 'middle': 2, 'beginning': 4,
                   'beginning_no_space': 8}
    # table of special terms: (is Roman numeral, term, extra context, method)
    special_table = (
        # Roman, string, extra, method
        (False, 'NONE', '', 0),
        (False, 'aine', '', 3),
        (False, 'also erroneously', '', 4),
        (False, 'also identified with the', '', 2),
        (False, 'also identified with', '', 2),
        (False, 'archbishop', '', 7),
        (False, 'atelier', '', 7),
        (False, 'baron', '', 7),
        (False, 'cadet', '', 3),
        (False, 'cardinal', '', 7),
        (False, 'circle of', '', 5),
        (False, 'circle', '', 5),
        (False, 'class of', '', 5),
        (False, 'conde de', '', 7),
        (False, 'countess', '', 7),
        (False, 'count', '', 7),
        (False, "d'", " d'", 15),
        (False, 'dai', '', 15),
        (False, "dall'", " dall'", 15),
        (False, 'dalla', '', 15),
        (False, 'dalle', '', 15),
        (False, 'dal', '', 15),
        (False, 'da', '', 15),
        (False, 'degli', '', 15),
        (False, 'della', '', 15),
        (False, 'del', '', 15),
        (False, 'den', '', 15),
        (False, 'der altere', '', 3),
        (False, 'der jungere', '', 3),
        (False, 'der', '', 15),
        (False, 'de la', '', 15),
        (False, 'des', '', 15),
        (False, "de'", " de'", 15),
        (False, 'de', '', 15),
        (False, 'di ser', '', 7),
        (False, 'di', '', 15),
        (False, 'dos', '', 15),
        (False, 'du', '', 15),
        (False, 'duke of', '', 7),
        (False, 'earl of', '', 7),
        (False, 'el', '', 15),
        (False, 'fils', '', 3),
        (False, 'florentine follower of', '', 5),
        (False, 'follower of', '', 5),
        (False, 'fra', '', 7),
        (False, 'freiherr von', '', 7),
        (False, 'giovane', '', 7),
        (False, 'group', '', 5),
        (True, 'iii', '', 3),
        (True, 'ii', '', 3),
        (False, 'il giovane', '', 7),
        (False, 'il vecchio', '', 7),
        (False, 'il', '', 15),
        (False, "in't", '', 7),
        (False, 'in het', '', 7),
        (True, 'iv', '', 3),
        (True, 'ix', '', 3),
        (True, 'i', '', 3),
        (False, 'jr.', '', 3),
        (False, 'jr', '', 3),
        (False, 'juniore', '', 3),
        (False, 'junior', '', 3),
        (False, 'king of', '', 7),
        (False, "l'", " l'", 15),
        (False, "l'aine", '', 3),
        (False, 'la', '', 15),
        (False, 'le jeune', '', 3),
        (False, 'le', '', 15),
        (False, 'lo', '', 15),
        (False, 'maestro', '', 7),
        (False, 'maitre', '', 7),
        (False, 'marchioness', '', 7),
        (False, 'markgrafin von', '', 7),
        (False, 'marquess', '', 7),
        (False, 'marquis', '', 7),
        (False, 'master of the', '', 7),
        (False, 'master of', '', 7),
        (False, 'master known as the', '', 7),
        (False, 'master with the', '', 7),
        (False, 'master with', '', 7),
        (False, 'masters', '', 7),
        (False, 'master', '', 7),
        (False, 'meister', '', 7),
        (False, 'met de', '', 7),
        (False, 'met', '', 7),
        (False, 'mlle.', '', 7),
        (False, 'mlle', '', 7),
        (False, 'monogrammist', '', 7),
        (False, 'monsu', '', 7),
        (False, 'nee', '', 2),
        (False, 'of', '', 3),
        (False, 'oncle', '', 3),
        (False, 'op den', '', 15),
        (False, 'op de', '', 15),
        (False, 'or', '', 2),
        (False, 'over den', '', 15),
        (False, 'over de', '', 15),
        (False, 'over', '', 7),
        (False, 'p.re', '', 7),
        (False, 'p.r.a.', '', 1),
        (False, 'padre', '', 7),
        (False, 'painter', '', 7),
        (False, 'pere', '', 3),
        (False, 'possibly identified with', '', 6),
        (False, 'possibly', '', 6),
        (False, 'pseudo', '', 15),
        (False, 'r.a.', '', 1),
        (False, 'reichsgraf von', '', 7),
        (False, 'ritter von', '', 7),
        (False, 'sainte-', ' sainte-', 8),
        (False, 'sainte', '', 7),
        (False, 'saint-', ' saint-', 8),
        (False, 'saint', '', 7),
        (False, 'santa', '', 15),
        (False, "sant'", " sant'", 15),
        (False, 'san', '', 15),
        (False, 'ser', '', 7),
        (False, 'seniore', '', 3),
        (False, 'senior', '', 3),
        (False, 'sir', '', 5),
        (False, 'sr.', '', 3),
        (False, 'sr', '', 3),
        (False, 'ss.', ' ss.', 14),
        (False, 'ss', '', 6),
        (False, 'st-', ' st-', 8),
        (False, 'st.', ' st.', 15),
        (False, 'ste-', ' ste-', 8),
        (False, 'ste.', ' ste.', 15),
        (False, 'studio', '', 7),
        (False, 'sub-group', '', 5),
        (False, 'sultan of', '', 7),
        (False, 'ten', '', 15),
        (False, 'ter', '', 15),
        (False, 'the elder', '', 3),
        (False, 'the younger', '', 3),
        (False, 'the', '', 7),
        (False, 'tot', '', 15),
        (False, 'unidentified', '', 1),
        (False, 'van den', '', 15),
        (False, 'van der', '', 15),
        (False, 'van de', '', 15),
        (False, 'vanden', '', 15),
        (False, 'vander', '', 15),
        (False, 'van', '', 15),
        (False, 'vecchia', '', 7),
        (False, 'vecchio', '', 7),
        (True, 'viii', '', 3),
        (True, 'vii', '', 3),
        (True, 'vi', '', 3),
        (True, 'v', '', 3),
        (False, 'vom', '', 7),
        (False, 'von', '', 15),
        (False, 'workshop', '', 7),
        (True, 'xiii', '', 3),
        (True, 'xii', '', 3),
        (True, 'xiv', '', 3),
        (True, 'xix', '', 3),
        (True, 'xi', '', 3),
        (True, 'xviii', '', 3),
        (True, 'xvii', '', 3),
        (True, 'xvi', '', 3),
        (True, 'xv', '', 3),
        (True, 'xx', '', 3),
        (True, 'x', '', 3),
        (False, 'y', '', 7)
        )

    # Start with the basic code
    # Fields: 0 qualifier, 1 punctuation, 2 generation, 3 first Roman code,
    # 4 len(fname), 5 len(lname), 6/8 '$' separators, 7 specials found,
    # 9 initial letters (search range)
    toolcode = ['0', '0', '0', '000', '00', '00', '$', '', '$', '']

    full_name = ' '.join((lname, fname))

    # Fill field 0 (qualifier)
    qual_3 = {'adaptation after', 'after', 'assistant of', 'assistants of',
              'circle of', 'follower of', 'imitator of', 'in the style of',
              'manner of', 'pupil of', 'school of', 'studio of',
              'style of', 'workshop of'}
    qual_2 = {'copy after', 'copy after?', 'copy of'}
    qual_1 = {'ascribed to', 'attributed to or copy after',
              'attributed to', 'possibly'}

    if qual in qual_3:
        toolcode[0] = '3'
    elif qual in qual_2:
        toolcode[0] = '2'
    elif qual in qual_1:
        toolcode[0] = '1'

    # Fill field 1 (punctuation): '2' if a period is present, else '1' for
    # any other punctuation mark
    if '.' in full_name:
        toolcode[1] = '2'
    else:
        for punct in ',-/:;"&\'()!{|}?$%*+<=>[\\]^_`~':
            if punct in full_name:
                toolcode[1] = '1'
                break

    # Fill field 2 (generation)
    gen_1 = ('the elder', ' sr.', ' sr', 'senior', 'der altere', 'il vecchio',
             "l'aine", 'p.re', 'padre', 'seniore', 'vecchia', 'vecchio')
    gen_2 = (' jr.', ' jr', 'der jungere', 'il giovane', 'giovane', 'juniore',
             'junior', 'le jeune', 'the younger')

    elderyounger = ''  # save elder/younger for possible movement later
    for gen in gen_1:
        if gen in full_name:
            toolcode[2] = '1'
            elderyounger = gen
            break
    else:
        for gen in gen_2:
            if gen in full_name:
                toolcode[2] = '2'
                elderyounger = gen
                break

    # do comma flip: move everything after the comma to the front of fname
    if normalize:
        comma = lname.find(',')
        if comma != -1:
            lname_end = lname[comma + 1:]
            # NOTE(review): if lname ends with the comma (e.g. 'Smith,'),
            # lname_end becomes '' and lname_end[0] raises IndexError —
            # confirm callers never pass a trailing comma.
            while lname_end[0] in {' ', ','}:
                lname_end = lname_end[1:]
            fname = lname_end + ' ' + fname
            lname = lname[:comma].strip()

    # do elder/younger move: shift the generation term from fname to lname
    if normalize == 2 and elderyounger:
        elderyounger_loc = fname.find(elderyounger)
        if elderyounger_loc != -1:
            lname = lname + ' ' + elderyounger.strip()
            fname = (fname[:elderyounger_loc].strip() + ' ' +
                     fname[elderyounger_loc + len(elderyounger):])

    # Fields 4 & 5: zero-padded lengths of the (normalized) names
    toolcode[4] = '{:02d}'.format(len(fname))
    toolcode[5] = '{:02d}'.format(len(lname))

    # strip punctuation (hyphens are replaced by spaces unless part of 'b-g')
    for char in ',/:;"&()!{|}?$%*+<=>[\\]^_`~':
        full_name = full_name.replace(char, '')
    for pos, char in enumerate(full_name):
        if char == '-' and full_name[pos - 1:pos + 2] != 'b-g':
            full_name = full_name[:pos] + ' ' + full_name[pos + 1:]

    # Fill field 9 (search range): unique initial letters of each word,
    # capped at 15 characters
    for letter in [_[0] for _ in full_name.split()]:
        if letter not in toolcode[9]:
            toolcode[9] += letter
        if len(toolcode[9]) == 15:
            break

    def roman_check(numeral, fname, lname):
        """Move Roman numerals from first name to last."""
        loc = fname.find(numeral)
        # NOTE(review): when the numeral sits at the very end of fname,
        # fname[loc + len(numeral)] is evaluated before the or's second
        # clause can short-circuit and raises IndexError — confirm intended.
        if (loc != -1 and
                (fname[loc + len(numeral)] in {' ', ','} or
                 len(fname[loc:]) == len(numeral))):
            lname += ' ' + numeral
            fname = fname[:loc].strip()
            while fname[-1] in {' ', ','}:
                fname = fname[:-1]
        return fname, lname

    # Fill fields 7 (specials) and 3 (roman numerals)
    for num, special in enumerate(special_table):
        roman, string, extra, method = special
        # term at the end of the full name ('a' suffix)
        if method & method_dict['end']:
            string_context = ' ' + string
            loc = full_name.find(string_context)
            if ((len(full_name) > len(string_context)) and
                    (loc == len(full_name) - len(string_context))):
                if roman:
                    # skip Roman handling when fname contains initials that
                    # could be mistaken for numerals
                    if not any(abbr in fname for abbr in ('i.', 'v.', 'x.')):
                        full_name = full_name[:loc]
                        toolcode[7] += '{:03d}'.format(num) + 'a'
                        if not toolcode[3]:
                            toolcode[3] = '{:03d}'.format(num)
                        if normalize == 2:
                            fname, lname = roman_check(string, fname, lname)
                else:
                    full_name = full_name[:loc]
                    toolcode[7] += '{:03d}'.format(num) + 'a'
        # term in the middle of the full name ('b' suffix)
        if method & method_dict['middle']:
            string_context = ' ' + string + ' '
            loc = full_name.find(string_context)
            if loc > 0:
                if roman:
                    if not any(abbr in fname for abbr in ('i.', 'v.', 'x.')):
                        full_name = (full_name[:loc] +
                                     full_name[loc + len(string) + 1:])
                        toolcode[7] += '{:03d}'.format(num) + 'b'
                        if not toolcode[3]:
                            toolcode[3] = '{:03d}'.format(num)
                        if normalize == 2:
                            fname, lname = roman_check(string, fname, lname)
                else:
                    full_name = (full_name[:loc] +
                                 full_name[loc + len(string) + 1:])
                    toolcode[7] += '{:03d}'.format(num) + 'b'
        # term at the beginning, followed by a space ('c' suffix)
        if method & method_dict['beginning']:
            string_context = string + ' '
            loc = full_name.find(string_context)
            if loc == 0:
                full_name = full_name[len(string) + 1:]
                toolcode[7] += '{:03d}'.format(num) + 'c'
        # term at the beginning with no following space ('d' suffix)
        if method & method_dict['beginning_no_space']:
            loc = full_name.find(string)
            if loc == 0:
                toolcode[7] += '{:03d}'.format(num) + 'd'
                if full_name[len(string)] not in toolcode[9]:
                    toolcode[9] += full_name[len(string)]

        # 'extra' context match ('X' suffix)
        if extra:
            loc = full_name.find(extra)
            if loc != -1:
                toolcode[7] += '{:03d}'.format(num) + 'X'
                # NOTE(review): the membership test offsets by len(extra)
                # but the appended character is offset by len(string) —
                # confirm this asymmetry is intended (for entries where
                # extra == ' ' + string the two offsets differ by one).
                if full_name[loc + len(extra)] not in toolcode[9]:
                    toolcode[9] += full_name[loc + len(string)]

    return lname, fname, ''.join(toolcode)
736
|
|
|
|
737
|
|
|
|
738
|
|
|
if __name__ == '__main__':
    # When executed as a script, run this module's embedded doctests.
    import doctest
    doctest.testmod()
741
|
|
|
|