Completed
Push — master ( 3b0a79...5eb1fa )
by Chris
13:35
created

abydos.fingerprint   F

Complexity

Total Complexity 94

Size/Duplication

Total Lines 741
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 94
eloc 450
dl 0
loc 741
rs 2
c 0
b 0
f 0

10 Functions

Rating   Name   Duplication   Size   Complexity  
B occurrence_halved_fingerprint() 0 39 7
B omission_key() 0 41 6
B position_fingerprint() 0 36 8
A occurrence_fingerprint() 0 31 5
B skeleton_key() 0 41 6
A count_fingerprint() 0 33 5
A phonetic_fingerprint() 0 35 4
A qgram_fingerprint() 0 27 1
F synoname_toolcode() 0 332 51
A str_fingerprint() 0 18 1

How to fix   Complexity   

Complexity

Complex classes like abydos.fingerprint often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.fingerprint.
20
21
The fingerprint module implements string fingerprint algorithms such as:
22
    - string fingerprint
23
    - q-gram fingerprint
24
    - phonetic fingerprint
25
    - Pollock & Zamora's skeleton key
26
    - Pollock & Zamora's omission key
27
    - Cisłak & Grabowski's occurrence fingerprint
28
    - Cisłak & Grabowski's occurrence halved fingerprint
29
    - Cisłak & Grabowski's count fingerprint
30
    - Cisłak & Grabowski's position fingerprint
31
    - Synoname Toolcode
32
"""
33
34
from __future__ import division, unicode_literals
35
36
import unicodedata
37
from collections import Counter
38
39
from six import text_type
40
41
from .phonetic import double_metaphone
42
from .qgram import QGrams
43
44
45
def str_fingerprint(phrase, joiner=' '):
    """Return the string fingerprint of a phrase.

    The phrase is stripped, lowercased and NFKD-normalized; every character
    that is neither alphanumeric nor whitespace is discarded. The remaining
    text is split into words, and the distinct words are sorted and glued
    together with the joiner.

    :param str phrase: the string from which to calculate the fingerprint
    :param str joiner: the string that will be placed between each word
    :returns: the fingerprint of the phrase
    :rtype: str

    >>> str_fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
    """
    folded = text_type(phrase.strip().lower())
    decomposed = unicodedata.normalize('NFKD', folded)
    # NFKD splits accented letters into a base letter plus combining marks;
    # the marks are not alphanumeric, so they are dropped by this filter.
    kept = [char for char in decomposed
            if char.isalnum() or char.isspace()]
    unique_words = set(''.join(kept).split())
    return joiner.join(sorted(unique_words))
63
64
65
def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''):
    """Return the q-gram fingerprint of a phrase.

    The phrase is stripped, lowercased, NFKD-normalized and reduced to its
    alphanumeric characters. Its q-grams are then collected with QGrams, and
    the distinct q-grams are sorted and concatenated with the joiner.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s), passed through to
        QGrams unchanged
    :param str joiner: the string that will be placed between each q-gram
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    folded = text_type(phrase.strip().lower())
    decomposed = unicodedata.normalize('NFKD', folded)
    alnum_only = ''.join([char for char in decomposed if char.isalnum()])
    qgrams = QGrams(alnum_only, qval, start_stop)
    return joiner.join(sorted(qgrams))
92
93
94
def phonetic_fingerprint(phrase, phonetic_algorithm=double_metaphone,
                         joiner=' ', *args):
    """Return the phonetic fingerprint of a phrase.

    A phonetic fingerprint is identical to a standard string fingerprint, as
    implemented in str_fingerprint(), but performs the fingerprinting
    function after converting each word to its phonetic form, as determined
    by some phonetic algorithm.

    :param str phrase: the string from which to calculate the phonetic
        fingerprint
    :param function phonetic_algorithm: a phonetic algorithm that takes a
        string and returns a string (presumably a phonetic representation of
        the original string). If it returns a non-string iterable instead,
        only its first element is used. By default, this function uses
        abydos.phonetic.double_metaphone()
    :param str joiner: the string that will be placed between each word of
        the resulting fingerprint
    :param args: additional arguments to pass to the phonetic algorithm,
        along with the phrase itself
    :returns: the phonetic fingerprint of the phrase
    :rtype: str

    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.')
    '0 afr fks jmpt kk ls prn tk'
    >>> from abydos.phonetic import soundex
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... phonetic_algorithm=soundex)
    'b650 d200 f200 j513 l200 o160 q200 t000'
    """
    encoded_words = []
    for word in phrase.split():
        encoded = phonetic_algorithm(word, *args)
        # Some algorithms return several encodings (e.g. a tuple); keep only
        # the first one.
        if (not isinstance(encoded, text_type) and
                hasattr(encoded, '__iter__')):
            encoded = encoded[0]
        encoded_words.append(encoded)

    # The encoded words are always separated by a single space here, because
    # str_fingerprint() splits on whitespace; the caller's joiner is applied
    # by str_fingerprint() when it assembles the final fingerprint. Joining
    # the intermediate string with the joiner (as was done previously) made
    # any non-space joiner collapse all words into one token, and an empty
    # joiner always produced an empty fingerprint.
    return str_fingerprint(' '.join(encoded_words), joiner)
129
130
131
def skeleton_key(word):
    """Return the skeleton key of a word.

    The skeleton key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
    in Scientific and Scholarly Text." Communications of the ACM, 27(4).
    358--368. <http://dl.acm.org/citation.cfm?id=358048>

    The word is uppercased, NFKD-normalized and reduced to the letters A-Z.
    The key is the first letter, followed by the remaining distinct
    consonants in order of first appearance, followed by the remaining
    distinct vowels in order of first appearance.

    :param str word: the word to transform into its skeleton key
    :returns: the skeleton key
    :rtype: str

    >>> skeleton_key('The quick brown fox jumped over the lazy dog.')
    'THQCKBRWNFXJMPDVLZYGEUIOA'
    >>> skeleton_key('Christopher')
    'CHRSTPIOE'
    >>> skeleton_key('Niall')
    'NLIA'
    """
    vowel_set = frozenset('AEIOU')
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    decomposed = unicodedata.normalize('NFKD', text_type(word.upper()))
    letters = [char for char in decomposed if char in alphabet]
    if not letters:
        return ''

    first = letters[0]
    consonants = []
    vowels = []

    # sort the remaining letters into consonants & vowels, skipping repeats
    # of the first letter and anything already collected
    for char in letters[1:]:
        if char == first:
            continue
        bucket = vowels if char in vowel_set else consonants
        if char not in bucket:
            bucket.append(char)

    # first letter, then consonants, then vowels
    return first + ''.join(consonants) + ''.join(vowels)
172
173
174
def omission_key(word):
    """Return the omission key of a word.

    The omission key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
    in Scientific and Scholarly Text." Communications of the ACM, 27(4).
    358--368. <http://dl.acm.org/citation.cfm?id=358048>

    The word is uppercased, NFKD-normalized and reduced to the letters A-Z.
    The key lists the consonants present in the word, in a fixed order,
    followed by the word's distinct vowels in order of first appearance.

    :param str word: the word to transform into its omission key
    :returns: the omission key
    :rtype: str

    >>> omission_key('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> omission_key('Christopher')
    'PHCTSRIOE'
    >>> omission_key('Niall')
    'LNIA'
    """
    # the fixed order in which consonants are written to the key
    consonant_order = ('J', 'K', 'Q', 'X', 'Z', 'V', 'W', 'Y', 'B', 'F', 'M',
                       'G', 'P', 'D', 'H', 'C', 'L', 'N', 'T', 'S', 'R')
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    decomposed = unicodedata.normalize('NFKD', text_type(word.upper()))
    letters = [char for char in decomposed if char in alphabet]
    present = frozenset(letters)

    # consonants first, in the fixed order (each at most once)
    key = [char for char in consonant_order if char in present]

    # then vowels, in the order they appear in the word (each at most once)
    for char in letters:
        if char not in consonant_order and char not in key:
            key.append(char)

    return ''.join(key)
215
216
217
# TODO: Dump all these to a data file.
# Each tuple below lists characters in the order given by its source; the
# fingerprint functions in this module walk these tuples from the first
# element onwards.

# most common letters, as defined in Cisłak & Grabowski
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
                          'l', 'c', 'u', 'm', 'w', 'f')

# most common letters (case-folded to lowercase), as shown in Google Books
# English n-grams, among letters a-z & digits 0-9
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l',
                             'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b',
                             'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9',
                             '3', '4', '8', '5', '6', '7')

# most common letters, as shown in Google Books English n-grams, among letters
# A-Z, a-z & digits 0-9
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd',
                       'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k',
                       'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H',
                       'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j',
                       'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q',
                       '3', 'Z', '4', '8', '5', '6', '7',)

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (a-z and umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                          'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                          'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q',
                          '1', '2', '3', '4', '0', '5', '6', '9', '8', '7')

# most common letters, as shown in Google Books German n-grams, among letters
# (A-Z, a-z, umlauted vowels & eszett) & digits 0-9
# NOTE(review): despite the _LC suffix this tuple is mixed-case, whereas
# MOST_COMMON_LETTERS_DE above is lowercase-only. The two German names look
# swapped relative to the English pair (MOST_COMMON_LETTERS_EN_LC is the
# lowercase one) -- confirm before relying on the suffix.
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                             'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                             'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G',
                             'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I',
                             'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O',
                             'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2',
                             'Y', '3', '4', '0', '5', '6', '9', '8', '7')
255
256
257
def occurrence_fingerprint(word, n_bits=16,
                           most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence fingerprint.

    Based on the occurrence fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    One bit is written per token of most_common, most significant bit first:
    1 if the token occurs in the word, 0 otherwise. If most_common has fewer
    than n_bits tokens, the fingerprint is padded with zero bits on the right.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence fingerprint
    :rtype: int
    """
    present = set(word)
    bits_left = n_bits
    result = 0

    for token in most_common:
        result += int(token in present)
        bits_left -= 1
        if not bits_left:
            break
        # make room for the next token's bit
        result <<= 1

    # pad out any bits that most_common was too short to fill
    if bits_left:
        result <<= bits_left

    return result
288
289
290
def occurrence_halved_fingerprint(word, n_bits=16,
                                  most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence halved fingerprint.

    Based on the occurrence halved fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    The word is cut into two halves (the second half gets the extra item when
    the length is odd). Two bits are written per token of most_common: the
    first says whether the token occurs in the first half, the second whether
    it occurs in the second half. An odd n_bits is rounded up to the next
    even number.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence halved fingerprint
    :rtype: int
    """
    bits_left = n_bits
    if bits_left % 2:
        bits_left += 1

    middle = len(word) // 2
    first_half = set(word[:middle])
    second_half = set(word[middle:])
    result = 0

    for token in most_common:
        in_first = int(token in first_half)
        in_second = int(token in second_half)
        # write the first-half bit, shift, then write the second-half bit
        result = ((result + in_first) << 1) + in_second
        bits_left -= 2
        if not bits_left:
            break
        # make room for the next token's pair of bits
        result <<= 1

    # pad out any bits that most_common was too short to fill
    if bits_left:
        result <<= bits_left

    return result
329
330
331
def count_fingerprint(word, n_bits=16,
                      most_common=MOST_COMMON_LETTERS_CG):
    """Return the count fingerprint.

    Based on the count fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Two bits are written per token of most_common, holding the two low bits
    of the number of times the token occurs in the word (so a count of 4 is
    stored as 0). An odd n_bits is rounded up to the next even number.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the count fingerprint
    :rtype: int
    """
    bits_left = n_bits
    if bits_left % 2:
        bits_left += 1

    counts = Counter(word)
    result = 0

    for token in most_common:
        result += counts[token] & 3
        bits_left -= 2
        if not bits_left:
            break
        # make room for the next token's two bits
        result <<= 2

    # pad out any bits that most_common was too short to fill
    if bits_left:
        result <<= bits_left

    return result
364
365
366
def position_fingerprint(word, n_bits=16,
                         most_common=MOST_COMMON_LETTERS_CG,
                         bits_per_letter=3):
    """Return the position fingerprint.

    Based on the position fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    For each token of most_common, the index of its first occurrence in the
    word is written to the fingerprint, capped at the largest value that fits
    in bits_per_letter bits. Tokens that do not occur contribute 0.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :param bits_per_letter: the bits to assign for letter position
    :return: the position fingerprint
    :rtype: int
    """
    # first index of each relevant token, capped to fit in bits_per_letter
    first_seen = {}
    for index, token in enumerate(word):
        if token not in first_seen and token in most_common:
            first_seen[token] = min(index, 2 ** bits_per_letter - 1)

    bits_left = n_bits
    result = 0
    for token in most_common:
        if token in first_seen:
            # never write more than the bits that are still available
            result += min(first_seen[token], 2 ** bits_left - 1)
        bits_left -= bits_per_letter
        if not bits_left > 0:
            break
        # the last slot may be narrower than bits_per_letter
        result <<= min(bits_per_letter, bits_left)

    # pad out any bits that most_common was too short to fill
    if bits_left > 0:
        result <<= bits_left

    return result
402
403
404
# Bit flags used in the ``method`` column of _SYNONAME_SPECIALS. Each flag
# names a position in the full name at which the special string is looked for;
# a row's method is the bitwise OR of the positions that apply to it.
_SYNONAME_METHODS = {'end': 1, 'middle': 2, 'beginning': 4,
                     'beginning_no_space': 8}

# Special strings recognised by synoname_toolcode(). A row's index in this
# table is the three-digit code written into the toolcode, so rows must not
# be reordered. Longer strings precede the shorter strings they contain
# (e.g. 'iii' before 'ii' before 'i') so that they are matched first.
_SYNONAME_SPECIALS = (
    # Roman, string, extra, method
    (False, 'NONE', '', 0),
    (False, 'aine', '', 3),
    (False, 'also erroneously', '', 4),
    (False, 'also identified with the', '', 2),
    (False, 'also identified with', '', 2),
    (False, 'archbishop', '', 7),
    (False, 'atelier', '', 7),
    (False, 'baron', '', 7),
    (False, 'cadet', '', 3),
    (False, 'cardinal', '', 7),
    (False, 'circle of', '', 5),
    (False, 'circle', '', 5),
    (False, 'class of', '', 5),
    (False, 'conde de', '', 7),
    (False, 'countess', '', 7),
    (False, 'count', '', 7),
    (False, "d'", " d'", 15),
    (False, 'dai', '', 15),
    (False, "dall'", " dall'", 15),
    (False, 'dalla', '', 15),
    (False, 'dalle', '', 15),
    (False, 'dal', '', 15),
    (False, 'da', '', 15),
    (False, 'degli', '', 15),
    (False, 'della', '', 15),
    (False, 'del', '', 15),
    (False, 'den', '', 15),
    (False, 'der altere', '', 3),
    (False, 'der jungere', '', 3),
    (False, 'der', '', 15),
    (False, 'de la', '', 15),
    (False, 'des', '', 15),
    (False, "de'", " de'", 15),
    (False, 'de', '', 15),
    (False, 'di ser', '', 7),
    (False, 'di', '', 15),
    (False, 'dos', '', 15),
    (False, 'du', '', 15),
    (False, 'duke of', '', 7),
    (False, 'earl of', '', 7),
    (False, 'el', '', 15),
    (False, 'fils', '', 3),
    (False, 'florentine follower of', '', 5),
    (False, 'follower of', '', 5),
    (False, 'fra', '', 7),
    (False, 'freiherr von', '', 7),
    (False, 'giovane', '', 7),
    (False, 'group', '', 5),
    (True, 'iii', '', 3),
    (True, 'ii', '', 3),
    (False, 'il giovane', '', 7),
    (False, 'il vecchio', '', 7),
    (False, 'il', '', 15),
    (False, "in't", '', 7),
    (False, 'in het', '', 7),
    (True, 'iv', '', 3),
    (True, 'ix', '', 3),
    (True, 'i', '', 3),
    (False, 'jr.', '', 3),
    (False, 'jr', '', 3),
    (False, 'juniore', '', 3),
    (False, 'junior', '', 3),
    (False, 'king of', '', 7),
    (False, "l'", " l'", 15),
    (False, "l'aine", '', 3),
    (False, 'la', '', 15),
    (False, 'le jeune', '', 3),
    (False, 'le', '', 15),
    (False, 'lo', '', 15),
    (False, 'maestro', '', 7),
    (False, 'maitre', '', 7),
    (False, 'marchioness', '', 7),
    (False, 'markgrafin von', '', 7),
    (False, 'marquess', '', 7),
    (False, 'marquis', '', 7),
    (False, 'master of the', '', 7),
    (False, 'master of', '', 7),
    (False, 'master known as the', '', 7),
    (False, 'master with the', '', 7),
    (False, 'master with', '', 7),
    (False, 'masters', '', 7),
    (False, 'master', '', 7),
    (False, 'meister', '', 7),
    (False, 'met de', '', 7),
    (False, 'met', '', 7),
    (False, 'mlle.', '', 7),
    (False, 'mlle', '', 7),
    (False, 'monogrammist', '', 7),
    (False, 'monsu', '', 7),
    (False, 'nee', '', 2),
    (False, 'of', '', 3),
    (False, 'oncle', '', 3),
    (False, 'op den', '', 15),
    (False, 'op de', '', 15),
    (False, 'or', '', 2),
    (False, 'over den', '', 15),
    (False, 'over de', '', 15),
    (False, 'over', '', 7),
    (False, 'p.re', '', 7),
    (False, 'p.r.a.', '', 1),
    (False, 'padre', '', 7),
    (False, 'painter', '', 7),
    (False, 'pere', '', 3),
    (False, 'possibly identified with', '', 6),
    (False, 'possibly', '', 6),
    (False, 'pseudo', '', 15),
    (False, 'r.a.', '', 1),
    (False, 'reichsgraf von', '', 7),
    (False, 'ritter von', '', 7),
    (False, 'sainte-', ' sainte-', 8),
    (False, 'sainte', '', 7),
    (False, 'saint-', ' saint-', 8),
    (False, 'saint', '', 7),
    (False, 'santa', '', 15),
    (False, "sant'", " sant'", 15),
    (False, 'san', '', 15),
    (False, 'ser', '', 7),
    (False, 'seniore', '', 3),
    (False, 'senior', '', 3),
    (False, 'sir', '', 5),
    (False, 'sr.', '', 3),
    (False, 'sr', '', 3),
    (False, 'ss.', ' ss.', 14),
    (False, 'ss', '', 6),
    (False, 'st-', ' st-', 8),
    (False, 'st.', ' st.', 15),
    (False, 'ste-', ' ste-', 8),
    (False, 'ste.', ' ste.', 15),
    (False, 'studio', '', 7),
    (False, 'sub-group', '', 5),
    (False, 'sultan of', '', 7),
    (False, 'ten', '', 15),
    (False, 'ter', '', 15),
    (False, 'the elder', '', 3),
    (False, 'the younger', '', 3),
    (False, 'the', '', 7),
    (False, 'tot', '', 15),
    (False, 'unidentified', '', 1),
    (False, 'van den', '', 15),
    (False, 'van der', '', 15),
    (False, 'van de', '', 15),
    (False, 'vanden', '', 15),
    (False, 'vander', '', 15),
    (False, 'van', '', 15),
    (False, 'vecchia', '', 7),
    (False, 'vecchio', '', 7),
    (True, 'viii', '', 3),
    (True, 'vii', '', 3),
    (True, 'vi', '', 3),
    (True, 'v', '', 3),
    (False, 'vom', '', 7),
    (False, 'von', '', 15),
    (False, 'workshop', '', 7),
    (True, 'xiii', '', 3),
    (True, 'xii', '', 3),
    (True, 'xiv', '', 3),
    (True, 'xix', '', 3),
    (True, 'xi', '', 3),
    (True, 'xviii', '', 3),
    (True, 'xvii', '', 3),
    (True, 'xvi', '', 3),
    (True, 'xv', '', 3),
    (True, 'xx', '', 3),
    (True, 'x', '', 3),
    (False, 'y', '', 7)
)


def _synoname_roman_check(numeral, fname, lname):
    """Move a Roman numeral from the first name to the last name.

    The move only happens when the numeral is found in fname and is either
    the very end of fname or is followed by a space or comma. Everything in
    fname from the numeral onwards is dropped, along with any spaces and
    commas left dangling in front of it.

    :param str numeral: the Roman numeral to look for
    :param str fname: first name
    :param str lname: last name
    :returns: the (possibly updated) first name and last name
    :rtype: tuple
    """
    loc = fname.find(numeral)
    if loc == -1:
        return fname, lname

    # Look at the remainder instead of indexing past the numeral, so that a
    # numeral at the very end of fname cannot raise IndexError.
    remainder = fname[loc + len(numeral):]
    if not remainder or remainder[0] in {' ', ','}:
        lname += ' ' + numeral
        fname = fname[:loc].strip().rstrip(' ,')
    return fname, lname


def synoname_toolcode(lname, fname='', qual='', normalize=0):
    """Build the Synoname toolcode.

    The toolcode is the concatenation of ten fields:

        0. qualifier code ('0'-'3')
        1. punctuation code ('2' if the name contains a period, '1' if it
           contains other punctuation, else '0')
        2. generation code ('1' for elder/senior terms, '2' for
           younger/junior terms, else '0')
        3. code of the first Roman numeral found (three digits, '000' if
           none)
        4. length of the first name (two digits)
        5. length of the last name (two digits)
        6. the separator '$'
        7. codes of the special strings found, each followed by a letter for
           the position at which it matched ('a' end, 'b' middle,
           'c' beginning, 'd' beginning without a following space,
           'X' extra form)
        8. the separator '$'
        9. search range: the distinct initial characters of the name parts

    Matching is case-sensitive and all the tables are lowercase, so callers
    are expected to pass lowercase text.

    :param str lname: last name
    :param str fname: first name (can be blank)
    :param str qual: qualifier
    :param int normalize: 0 leaves lname and fname as given; any non-zero
        value moves whatever follows the first comma in lname to the front of
        fname; 2 additionally moves generation terms and Roman numerals from
        fname to the end of lname
    :return: the (possibly normalized) last name, the (possibly normalized)
        first name, and the toolcode
    :rtype: tuple

    >>> synoname_toolcode('hat')
    ('hat', '', '0000000003$$h')
    """
    # Start with the basic code
    toolcode = ['0', '0', '0', '000', '00', '00', '$', '', '$', '']

    full_name = ' '.join((lname, fname))

    # Fill field 0 (qualifier)
    qual_3 = {'adaptation after', 'after', 'assistant of', 'assistants of',
              'circle of', 'follower of', 'imitator of', 'in the style of',
              'manner of', 'pupil of', 'school of', 'studio of',
              'style of', 'workshop of'}
    qual_2 = {'copy after', 'copy after?', 'copy of'}
    qual_1 = {'ascribed to', 'attributed to or copy after',
              'attributed to', 'possibly'}

    if qual in qual_3:
        toolcode[0] = '3'
    elif qual in qual_2:
        toolcode[0] = '2'
    elif qual in qual_1:
        toolcode[0] = '1'

    # Fill field 1 (punctuation)
    if '.' in full_name:
        toolcode[1] = '2'
    elif any(punct in full_name
             for punct in ',-/:;"&\'()!{|}?$%*+<=>[\\]^_`~'):
        toolcode[1] = '1'

    # Fill field 2 (generation)
    gen_1 = ('the elder', ' sr.', ' sr', 'senior', 'der altere', 'il vecchio',
             "l'aine", 'p.re', 'padre', 'seniore', 'vecchia', 'vecchio')
    gen_2 = (' jr.', ' jr', 'der jungere', 'il giovane', 'giovane', 'juniore',
             'junior', 'le jeune', 'the younger')

    elderyounger = ''  # save elder/younger for possible movement later
    for gen_code, generations in (('1', gen_1), ('2', gen_2)):
        elderyounger = next((gen for gen in generations
                             if gen in full_name), '')
        if elderyounger:
            toolcode[2] = gen_code
            break

    # do comma flip
    if normalize:
        comma = lname.find(',')
        if comma != -1:
            # lstrip() copes with nothing following the comma, where
            # indexing the first character used to raise IndexError
            lname_end = lname[comma + 1:].lstrip(' ,')
            fname = lname_end + ' ' + fname
            lname = lname[:comma].strip()

    # do elder/younger move
    if normalize == 2 and elderyounger:
        elderyounger_loc = fname.find(elderyounger)
        if elderyounger_loc != -1:
            lname = lname + ' ' + elderyounger.strip()
            fname = (fname[:elderyounger_loc].strip() + ' ' +
                     fname[elderyounger_loc + len(elderyounger):])

    # Fill fields 4 & 5 (name lengths, measured after normalization)
    toolcode[4] = '{:02d}'.format(len(fname))
    toolcode[5] = '{:02d}'.format(len(lname))

    # strip punctuation
    for char in ',/:;"&()!{|}?$%*+<=>[\\]^_`~':
        full_name = full_name.replace(char, '')
    # Replace hyphens with spaces, except within 'b-g'. The loop walks the
    # string as it was before the loop; every replacement keeps the length
    # unchanged, so the positions stay valid for the rebuilt string.
    for pos, char in enumerate(full_name):
        if char == '-' and full_name[pos - 1:pos + 2] != 'b-g':
            full_name = full_name[:pos] + ' ' + full_name[pos + 1:]

    # Fill field 9 (search range)
    for name_part in full_name.split():
        if name_part[0] not in toolcode[9]:
            toolcode[9] += name_part[0]
        if len(toolcode[9]) == 15:
            break

    # Fill fields 7 (specials) and 3 (roman numerals)
    abbreviations = ('i.', 'v.', 'x.')
    for num, (roman, string, extra, method) in enumerate(_SYNONAME_SPECIALS):
        code = '{:03d}'.format(num)

        if method & _SYNONAME_METHODS['end']:
            string_context = ' ' + string
            loc = full_name.find(string_context)
            if ((len(full_name) > len(string_context)) and
                    (loc == len(full_name) - len(string_context))):
                # a Roman numeral is ignored when fname contains 'i.', 'v.'
                # or 'x.'
                if not (roman and any(abbr in fname
                                      for abbr in abbreviations)):
                    full_name = full_name[:loc]
                    toolcode[7] += code + 'a'
                    if roman:
                        # field 3 starts out as '000'; only the first
                        # numeral found is recorded
                        if toolcode[3] == '000':
                            toolcode[3] = code
                        if normalize == 2:
                            fname, lname = _synoname_roman_check(
                                string, fname, lname)

        if method & _SYNONAME_METHODS['middle']:
            string_context = ' ' + string + ' '
            loc = full_name.find(string_context)
            if loc > 0:
                if not (roman and any(abbr in fname
                                      for abbr in abbreviations)):
                    full_name = (full_name[:loc] +
                                 full_name[loc + len(string) + 1:])
                    toolcode[7] += code + 'b'
                    if roman:
                        if toolcode[3] == '000':
                            toolcode[3] = code
                        if normalize == 2:
                            fname, lname = _synoname_roman_check(
                                string, fname, lname)

        if method & _SYNONAME_METHODS['beginning']:
            string_context = string + ' '
            loc = full_name.find(string_context)
            if loc == 0:
                full_name = full_name[len(string) + 1:]
                toolcode[7] += code + 'c'

        if method & _SYNONAME_METHODS['beginning_no_space']:
            loc = full_name.find(string)
            if loc == 0:
                toolcode[7] += code + 'd'
                # add the character after the prefix to the search range,
                # if there is one (the name may consist of the prefix alone)
                following = full_name[len(string):len(string) + 1]
                if following and following not in toolcode[9]:
                    toolcode[9] += following

        if extra:
            loc = full_name.find(extra)
            if loc != -1:
                toolcode[7] += code + 'X'
                # add the character after the extra form to the search
                # range: the same character is both tested and appended,
                # and nothing is added when the extra form ends the name
                end = loc + len(extra)
                following = full_name[end:end + 1]
                if following and following not in toolcode[9]:
                    toolcode[9] += following

    return lname, fname, ''.join(toolcode)
736
737
738
# When executed as a script, run the doctests embedded in the docstrings.
if __name__ == '__main__':
    from doctest import testmod
    testmod()
741