Completed
Push — master ( f7faf9...ef8477 )
by Chris
12:37
created

abydos.fingerprint   F

Complexity

Total Complexity 94

Size/Duplication

Total Lines 736
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 449
dl 0
loc 736
rs 2
c 0
b 0
f 0
wmc 94

10 Functions

Rating   Name   Duplication   Size   Complexity  
B occurrence_halved_fingerprint() 0 39 7
B omission_key() 0 41 6
B position_fingerprint() 0 36 8
A occurrence_fingerprint() 0 31 5
B skeleton_key() 0 41 6
A count_fingerprint() 0 33 5
A phonetic_fingerprint() 0 33 4
A qgram_fingerprint() 0 26 1
F synoname_toolcode() 0 332 51
A fingerprint() 0 17 1

How to fix   Complexity   

Complexity

Complex classes like abydos.fingerprint often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.fingerptint.
20
21
The fingerprint module implements string fingerprint algorithms such as:
22
    - string fingerprint
23
    - q-gram fingerprint
24
    - phonetic fingerprint
25
    - Pollock & Zamora's skeleton key
26
    - Pollock & Zamora's omission key
27
    - Cisłak & Grabowski's occurrence fingerprint
28
    - Cisłak & Grabowski's occurrence halved fingerprint
29
    - Cisłak & Grabowski's count fingerprint
30
    - Cisłak & Grabowski's position fingerprint
31
"""
32
33
from __future__ import division, unicode_literals
34
35
import unicodedata
36
from collections import Counter
37
38
from six import text_type
39
40
from .phonetic import double_metaphone
41
from .qgram import QGrams
42
43
44
def fingerprint(phrase):
    """Return string fingerprint.

    The fingerprint of a phrase is formed by lowercasing it, applying NFKD
    normalization, stripping all non-alphanumeric, non-space characters, and
    joining its unique words in alphabetical order with single spaces.

    :param str phrase: the string from which to calculate the fingerprint
    :returns: the fingerprint of the phrase
    :rtype: str

    >>> fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
    """
    normalized = unicodedata.normalize('NFKD',
                                       text_type(phrase.strip().lower()))
    cleaned = ''.join(c for c in normalized if c.isalnum() or c.isspace())
    return ' '.join(sorted(set(cleaned.split())))
61
62
63
def qgram_fingerprint(phrase, qval=2, start_stop=''):
    """Return Q-Gram fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique q-grams
    in a string, alphabetized & concatenated.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s) to concatenate on either
        end of the phrase, as defined in abydos.util.qgram()
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    normalized = unicodedata.normalize('NFKD',
                                       text_type(phrase.strip().lower()))
    alnum_only = ''.join(c for c in normalized if c.isalnum())
    grams = QGrams(alnum_only, qval, start_stop)
    return ''.join(sorted(grams))
89
90
91
def phonetic_fingerprint(phrase, phonetic_algorithm=double_metaphone, *args):
    """Return the phonetic fingerprint of a phrase.

    A phonetic fingerprint is identical to a standard string fingerprint, as
    implemented in abydos.clustering.fingerprint(), but performs the
    fingerprinting function after converting the string to its phonetic form,
    as determined by some phonetic algorithm.

    :param str phrase: the string from which to calculate the phonetic
        fingerprint
    :param function phonetic_algorithm: a phonetic algorithm that takes a
        string and returns a string (presumably a phonetic representation of
        the original string) By default, this function uses
        abydos.phonetic.double_metaphone()
    :param args: additional arguments to pass to the phonetic algorithm,
        along with the phrase itself
    :returns: the phonetic fingerprint of the phrase
    :rtype: str

    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.')
    '0 afr fks jmpt kk ls prn tk'
    >>> from abydos.phonetic import soundex
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... phonetic_algorithm=soundex)
    'b650 d200 f200 j513 l200 o160 q200 t000'
    """
    phonetic_words = []
    for word in phrase.split():
        word = phonetic_algorithm(word, *args)
        # Some phonetic algorithms (e.g. double_metaphone) return a tuple of
        # alternate encodings; keep only the primary (first) encoding.
        if not isinstance(word, text_type) and hasattr(word, '__iter__'):
            word = word[0]
        phonetic_words.append(word)
    # Join once instead of building the string with quadratic += in the loop.
    return fingerprint(' '.join(phonetic_words))
124
125
126
def skeleton_key(word):
    """Return the skeleton key.

    The skeleton key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
    in Scientific and Scholarly Text." Communications of the ACM, 27(4).
    358--368. <http://dl.acm.org/citation.cfm?id=358048>

    :param str word: the word to transform into its skeleton key
    :returns: the skeleton key
    :rtype: str

    >>> skeleton_key('The quick brown fox jumped over the lazy dog.')
    'THQCKBRWNFXJMPDVLZYGEUIOA'
    >>> skeleton_key('Christopher')
    'CHRSTPIOE'
    >>> skeleton_key('Niall')
    'NLIA'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in _letters)

    start = word[0:1]
    consonants = []
    vowels = []

    # Collect consonants & vowels separately, skipping the first character
    # and any duplicates already collected.
    for char in word[1:]:
        if char == start:
            continue
        if char in _vowels:
            if char not in vowels:
                vowels.append(char)
        elif char not in consonants:
            consonants.append(char)

    # First character, then unique consonants, then unique vowels.
    return start + ''.join(consonants) + ''.join(vowels)
167
168
169
def omission_key(word):
    """Return the omission key.

    The omission key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
    in Scientific and Scholarly Text." Communications of the ACM, 27(4).
    358--368. <http://dl.acm.org/citation.cfm?id=358048>

    :param str word: the word to transform into its omission key
    :returns: the omission key
    :rtype: str

    >>> omission_key('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> omission_key('Christopher')
    'PHCTSRIOE'
    >>> omission_key('Niall')
    'LNIA'
    """
    # Consonants in Pollock & Zamora's fixed key order.
    _consonants = ('J', 'K', 'Q', 'X', 'Z', 'V', 'W', 'Y', 'B', 'F', 'M', 'G',
                   'P', 'D', 'H', 'C', 'L', 'N', 'T', 'S', 'R')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word
                   if c in set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

    # Consonants first, in key order, each at most once.
    key_chars = [c for c in _consonants if c in word]

    # Then vowels in the order they appear in the word, each at most once.
    for char in word:
        if char not in _consonants and char not in key_chars:
            key_chars.append(char)

    return ''.join(key_chars)
210
211
212
# TODO: Dump all these to a data file.
# most common letters, as defined in Cisłak & Grabowski
# (the 16 letters used by default in the fingerprint functions below)
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
                          'l', 'c', 'u', 'm', 'w', 'f')

# most common letters (case-folded to lowercase), as shown in Google Books
# English n-grams, among letters a-z & digits 0-9
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l',
                             'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b',
                             'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9',
                             '3', '4', '8', '5', '6', '7')

# most common letters, as shown in Google Books English n-grams, among letters
# A-Z, a-z & digits 0-9
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd',
                       'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k',
                       'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H',
                       'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j',
                       'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q',
                       '3', 'Z', '4', '8', '5', '6', '7',)

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (a-z and umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                          'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                          'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q',
                          '1', '2', '3', '4', '0', '5', '6', '9', '8', '7')

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (A-Z, a-z, umlauted vowels & eszett) & digits
# 0-9
# NOTE(review): despite the _LC suffix and the "case-folded" wording above,
# this table mixes upper- and lowercase entries while MOST_COMMON_LETTERS_DE
# is all-lowercase -- the names/comments of the two DE tables look swapped;
# confirm against the Google Books n-gram source data.
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                             'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                             'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G',
                             'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I',
                             'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O',
                             'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2',
                             'Y', '3', '4', '0', '5', '6', '9', '8', '7')
250
251
252
def occurrence_fingerprint(word, n_bits=16,
                           most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence fingerprint.

    Based on the occurrence fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Each entry of most_common contributes one bit (most significant first),
    set when that letter occurs anywhere in the word.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence fingerprint
    :rtype: int
    """
    letters = set(word)
    fp = 0

    for letter in most_common:
        if letter in letters:
            fp += 1
        n_bits -= 1
        if not n_bits:
            break
        fp <<= 1

    # Pad with zero bits if most_common ran out before n_bits did.
    if n_bits:
        fp <<= n_bits

    return fp
283
284
285
def occurrence_halved_fingerprint(word, n_bits=16,
                                  most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence halved fingerprint.

    Based on the occurrence halved fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Each entry of most_common contributes a pair of bits: one for its
    presence in the first half of the word, one for the second half.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence halved fingerprint
    :rtype: int
    """
    # Round an odd bit count up so bits come in pairs.
    if n_bits % 2:
        n_bits += 1

    half = len(word) // 2
    first_half = set(word[:half])
    second_half = set(word[half:])
    fp = 0

    for letter in most_common:
        if letter in first_half:
            fp += 1
        fp <<= 1
        if letter in second_half:
            fp += 1
        n_bits -= 2
        if not n_bits:
            break
        fp <<= 1

    # Pad with zero bits if most_common ran out before n_bits did.
    if n_bits:
        fp <<= n_bits

    return fp
324
325
326
def count_fingerprint(word, n_bits=16,
                      most_common=MOST_COMMON_LETTERS_CG):
    """Return the count fingerprint.

    Based on the count fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Each entry of most_common contributes two bits holding that letter's
    occurrence count in the word, capped at 3.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the count fingerprint
    :rtype: int
    """
    # Round an odd bit count up so bits come in pairs.
    if n_bits % 2:
        n_bits += 1

    counts = Counter(word)
    fp = 0

    for letter in most_common:
        # Keep only the low two bits of the count (i.e. cap at 3).
        fp += counts[letter] & 3
        n_bits -= 2
        if not n_bits:
            break
        fp <<= 2

    # Pad with zero bits if most_common ran out before n_bits did.
    if n_bits:
        fp <<= n_bits

    return fp
359
360
361
def position_fingerprint(word, n_bits=16,
                         most_common=MOST_COMMON_LETTERS_CG,
                         bits_per_letter=3):
    """Return the position fingerprint.

    Based on the position fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Each entry of most_common contributes bits_per_letter bits holding the
    position of that letter's first occurrence in the word.

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :param bits_per_letter: the bits to assign for letter position
    :return: the position fingerprint
    :rtype: int
    """
    # First-occurrence position of each tracked letter, capped at the
    # largest value representable in bits_per_letter bits.
    first_pos = {}
    for pos, letter in enumerate(word):
        if letter in most_common:
            first_pos.setdefault(letter, min(pos, 2**bits_per_letter - 1))

    fp = 0
    for letter in most_common:
        if letter in first_pos:
            fp += min(first_pos[letter], 2**n_bits - 1)
        n_bits -= bits_per_letter
        if n_bits <= 0:
            break
        fp <<= min(bits_per_letter, n_bits)

    # Pad with zero bits if most_common ran out before n_bits did.
    if n_bits > 0:
        fp <<= n_bits

    return fp
397
398
399
def synoname_toolcode(lname, fname='', qual='', normalize=0):
    """Build the Synoname toolcode.

    Assembles the Synoname toolcode string for a personal name, encoding the
    qualifier, punctuation, generation terms, Roman numerals, name lengths,
    recognized "special" words, and a search-range of word-initial letters.

    :param str lname: last name
    :param str fname: first name (can be blank)
    :param str qual: qualifier
    :param int normalize: normalization mode: 0 leaves the names untouched;
        1 performs the comma flip (text after a comma in lname is moved to
        the front of fname); 2 additionally moves generation terms and Roman
        numerals from fname to lname
    :returns: the (possibly normalized) lname & fname, plus the assembled
        toolcode string
    :rtype: tuple
    """
    # Bit flags describing where in the full name a special term may match.
    method_dict = {'end': 1, 'middle': 2, 'beginning': 4,
                   'beginning_no_space': 8}
    special_table = (
        # Roman, string, extra, method
        (False, 'NONE', '', 0),
        (False, 'aine', '', 3),
        (False, 'also erroneously', '', 4),
        (False, 'also identified with the', '', 2),
        (False, 'also identified with', '', 2),
        (False, 'archbishop', '', 7),
        (False, 'atelier', '', 7),
        (False, 'baron', '', 7),
        (False, 'cadet', '', 3),
        (False, 'cardinal', '', 7),
        (False, 'circle of', '', 5),
        (False, 'circle', '', 5),
        (False, 'class of', '', 5),
        (False, 'conde de', '', 7),
        (False, 'countess', '', 7),
        (False, 'count', '', 7),
        (False, "d'", " d'", 15),
        (False, 'dai', '', 15),
        (False, "dall'", " dall'", 15),
        (False, 'dalla', '', 15),
        (False, 'dalle', '', 15),
        (False, 'dal', '', 15),
        (False, 'da', '', 15),
        (False, 'degli', '', 15),
        (False, 'della', '', 15),
        (False, 'del', '', 15),
        (False, 'den', '', 15),
        (False, 'der altere', '', 3),
        (False, 'der jungere', '', 3),
        (False, 'der', '', 15),
        (False, 'de la', '', 15),
        (False, 'des', '', 15),
        (False, "de'", " de'", 15),
        (False, 'de', '', 15),
        (False, 'di ser', '', 7),
        (False, 'di', '', 15),
        (False, 'dos', '', 15),
        (False, 'du', '', 15),
        (False, 'duke of', '', 7),
        (False, 'earl of', '', 7),
        (False, 'el', '', 15),
        (False, 'fils', '', 3),
        (False, 'florentine follower of', '', 5),
        (False, 'follower of', '', 5),
        (False, 'fra', '', 7),
        (False, 'freiherr von', '', 7),
        (False, 'giovane', '', 7),
        (False, 'group', '', 5),
        (True, 'iii', '', 3),
        (True, 'ii', '', 3),
        (False, 'il giovane', '', 7),
        (False, 'il vecchio', '', 7),
        (False, 'il', '', 15),
        (False, "in't", '', 7),
        (False, 'in het', '', 7),
        (True, 'iv', '', 3),
        (True, 'ix', '', 3),
        (True, 'i', '', 3),
        (False, 'jr.', '', 3),
        (False, 'jr', '', 3),
        (False, 'juniore', '', 3),
        (False, 'junior', '', 3),
        (False, 'king of', '', 7),
        (False, "l'", " l'", 15),
        (False, "l'aine", '', 3),
        (False, 'la', '', 15),
        (False, 'le jeune', '', 3),
        (False, 'le', '', 15),
        (False, 'lo', '', 15),
        (False, 'maestro', '', 7),
        (False, 'maitre', '', 7),
        (False, 'marchioness', '', 7),
        (False, 'markgrafin von', '', 7),
        (False, 'marquess', '', 7),
        (False, 'marquis', '', 7),
        (False, 'master of the', '', 7),
        (False, 'master of', '', 7),
        (False, 'master known as the', '', 7),
        (False, 'master with the', '', 7),
        (False, 'master with', '', 7),
        (False, 'masters', '', 7),
        (False, 'master', '', 7),
        (False, 'meister', '', 7),
        (False, 'met de', '', 7),
        (False, 'met', '', 7),
        (False, 'mlle.', '', 7),
        (False, 'mlle', '', 7),
        (False, 'monogrammist', '', 7),
        (False, 'monsu', '', 7),
        (False, 'nee', '', 2),
        (False, 'of', '', 3),
        (False, 'oncle', '', 3),
        (False, 'op den', '', 15),
        (False, 'op de', '', 15),
        (False, 'or', '', 2),
        (False, 'over den', '', 15),
        (False, 'over de', '', 15),
        (False, 'over', '', 7),
        (False, 'p.re', '', 7),
        (False, 'p.r.a.', '', 1),
        (False, 'padre', '', 7),
        (False, 'painter', '', 7),
        (False, 'pere', '', 3),
        (False, 'possibly identified with', '', 6),
        (False, 'possibly', '', 6),
        (False, 'pseudo', '', 15),
        (False, 'r.a.', '', 1),
        (False, 'reichsgraf von', '', 7),
        (False, 'ritter von', '', 7),
        (False, 'sainte-', ' sainte-', 8),
        (False, 'sainte', '', 7),
        (False, 'saint-', ' saint-', 8),
        (False, 'saint', '', 7),
        (False, 'santa', '', 15),
        (False, "sant'", " sant'", 15),
        (False, 'san', '', 15),
        (False, 'ser', '', 7),
        (False, 'seniore', '', 3),
        (False, 'senior', '', 3),
        (False, 'sir', '', 5),
        (False, 'sr.', '', 3),
        (False, 'sr', '', 3),
        (False, 'ss.', ' ss.', 14),
        (False, 'ss', '', 6),
        (False, 'st-', ' st-', 8),
        (False, 'st.', ' st.', 15),
        (False, 'ste-', ' ste-', 8),
        (False, 'ste.', ' ste.', 15),
        (False, 'studio', '', 7),
        (False, 'sub-group', '', 5),
        (False, 'sultan of', '', 7),
        (False, 'ten', '', 15),
        (False, 'ter', '', 15),
        (False, 'the elder', '', 3),
        (False, 'the younger', '', 3),
        (False, 'the', '', 7),
        (False, 'tot', '', 15),
        (False, 'unidentified', '', 1),
        (False, 'van den', '', 15),
        (False, 'van der', '', 15),
        (False, 'van de', '', 15),
        (False, 'vanden', '', 15),
        (False, 'vander', '', 15),
        (False, 'van', '', 15),
        (False, 'vecchia', '', 7),
        (False, 'vecchio', '', 7),
        (True, 'viii', '', 3),
        (True, 'vii', '', 3),
        (True, 'vi', '', 3),
        (True, 'v', '', 3),
        (False, 'vom', '', 7),
        (False, 'von', '', 15),
        (False, 'workshop', '', 7),
        (True, 'xiii', '', 3),
        (True, 'xii', '', 3),
        (True, 'xiv', '', 3),
        (True, 'xix', '', 3),
        (True, 'xi', '', 3),
        (True, 'xviii', '', 3),
        (True, 'xvii', '', 3),
        (True, 'xvi', '', 3),
        (True, 'xv', '', 3),
        (True, 'xx', '', 3),
        (True, 'x', '', 3),
        (False, 'y', '', 7)
    )

    # Start with the basic code.
    # Fields: 0 qualifier, 1 punctuation, 2 generation, 3 first Roman-numeral
    # code, 4 fname length, 5 lname length, 6 & 8 hold '$' (apparently field
    # separators), 7 special-term codes, 9 search-range initials.
    toolcode = ['0', '0', '0', '000', '00', '00', '$', '', '$', '']

    full_name = ' '.join((lname, fname))

    # Fill field 0 (qualifier)
    qual_3 = {'adaptation after', 'after', 'assistant of', 'assistants of',
              'circle of', 'follower of', 'imitator of', 'in the style of',
              'manner of', 'pupil of', 'school of', 'studio of',
              'style of', 'workshop of'}
    qual_2 = {'copy after', 'copy after?', 'copy of'}
    qual_1 = {'ascribed to', 'attributed to or copy after',
              'attributed to', 'possibly'}

    if qual in qual_3:
        toolcode[0] = '3'
    elif qual in qual_2:
        toolcode[0] = '2'
    elif qual in qual_1:
        toolcode[0] = '1'

    # Fill field 1 (punctuation): '2' if a period appears anywhere,
    # otherwise '1' for any other punctuation mark.
    if '.' in full_name:
        toolcode[1] = '2'
    else:
        for punct in ',-/:;"&\'()!{|}?$%*+<=>[\\]^_`~':
            if punct in full_name:
                toolcode[1] = '1'
                break

    # Fill field 2 (generation): '1' for elder-generation terms,
    # '2' for younger-generation terms.
    gen_1 = ('the elder', ' sr.', ' sr', 'senior', 'der altere', 'il vecchio',
             "l'aine", 'p.re', 'padre', 'seniore', 'vecchia', 'vecchio')
    gen_2 = (' jr.', ' jr', 'der jungere', 'il giovane', 'giovane', 'juniore',
             'junior', 'le jeune', 'the younger')

    elderyounger = ''  # save elder/younger for possible movement later
    for gen in gen_1:
        if gen in full_name:
            toolcode[2] = '1'
            elderyounger = gen
            break
    else:
        for gen in gen_2:
            if gen in full_name:
                toolcode[2] = '2'
                elderyounger = gen
                break

    # do comma flip: 'Last, First' -> lname 'Last', fname 'First ...'
    if normalize:
        comma = lname.find(',')
        if comma != -1:
            lname_end = lname[comma + 1:]
            # NOTE(review): raises IndexError if nothing but spaces/commas
            # follows the comma (lname_end becomes empty) -- confirm inputs.
            while lname_end[0] in {' ', ','}:
                lname_end = lname_end[1:]
            fname = lname_end + ' ' + fname
            lname = lname[:comma].strip()

    # do elder/younger move from fname to lname
    if normalize == 2 and elderyounger:
        elderyounger_loc = fname.find(elderyounger)
        if elderyounger_loc != -1:
            lname = lname + ' ' + elderyounger.strip()
            fname = (fname[:elderyounger_loc].strip() + ' ' +
                     fname[elderyounger_loc + len(elderyounger):])

    # Fields 4 & 5: zero-padded lengths of the (possibly normalized) names.
    toolcode[4] = '{:02d}'.format(len(fname))
    toolcode[5] = '{:02d}'.format(len(lname))

    # strip punctuation (hyphens become spaces except within 'b-g')
    for char in ',/:;"&()!{|}?$%*+<=>[\\]^_`~':
        full_name = full_name.replace(char, '')
    for pos, char in enumerate(full_name):
        if char == '-' and full_name[pos - 1:pos + 2] != 'b-g':
            full_name = full_name[:pos] + ' ' + full_name[pos + 1:]

    # Fill field 9 (search range): unique initial letters of each word,
    # capped at 15 characters.
    for letter in [_[0] for _ in full_name.split()]:
        if letter not in toolcode[9]:
            toolcode[9] += letter
        if len(toolcode[9]) == 15:
            break

    def roman_check(numeral, fname, lname):
        """Move a Roman numeral from the first name to the last name.

        Only moves the numeral when it is followed by a space/comma or
        terminates fname; returns the (possibly updated) fname & lname.
        """
        loc = fname.find(numeral)
        if (loc != -1 and
                (fname[loc + len(numeral)] in {' ', ','} or
                 len(fname[loc:]) == len(numeral))):
            lname += ' ' + numeral
            fname = fname[:loc].strip()
            while fname[-1] in {' ', ','}:
                fname = fname[:-1]
        return fname, lname

    # Fill fields 7 (specials) and 3 (roman numerals)
    for num, special in enumerate(special_table):
        roman, string, extra, method = special
        if method & method_dict['end']:
            string_context = ' ' + string
            loc = full_name.find(string_context)
            if ((len(full_name) > len(string_context)) and
                    (loc == len(full_name) - len(string_context))):
                if roman:
                    # Skip Roman handling when initials like 'i.' suggest the
                    # match is actually an abbreviated name.
                    if not any(abbr in fname for abbr in ('i.', 'v.', 'x.')):
                        full_name = full_name[:loc]
                        toolcode[7] += '{:03d}'.format(num) + 'a'
                        # NOTE(review): toolcode[3] is initialized to '000',
                        # which is truthy, so this branch can never execute
                        # and field 3 is never filled; should this test
                        # toolcode[3] == '000'? confirm against spec.
                        if not toolcode[3]:
                            toolcode[3] = '{:03d}'.format(num)
                        if normalize == 2:
                            fname, lname = roman_check(string, fname, lname)
                else:
                    full_name = full_name[:loc]
                    toolcode[7] += '{:03d}'.format(num) + 'a'
        if method & method_dict['middle']:
            string_context = ' ' + string + ' '
            loc = full_name.find(string_context)
            if loc > 0:
                if roman:
                    if not any(abbr in fname for abbr in ('i.', 'v.', 'x.')):
                        full_name = (full_name[:loc] +
                                     full_name[loc + len(string) + 1:])
                        toolcode[7] += '{:03d}'.format(num) + 'b'
                        # NOTE(review): same never-true toolcode[3] check as
                        # in the 'end' branch above.
                        if not toolcode[3]:
                            toolcode[3] = '{:03d}'.format(num)
                        if normalize == 2:
                            fname, lname = roman_check(string, fname, lname)
                else:
                    full_name = (full_name[:loc] +
                                 full_name[loc + len(string) + 1:])
                    toolcode[7] += '{:03d}'.format(num) + 'b'
        if method & method_dict['beginning']:
            string_context = string + ' '
            loc = full_name.find(string_context)
            if loc == 0:
                full_name = full_name[len(string) + 1:]
                toolcode[7] += '{:03d}'.format(num) + 'c'
        if method & method_dict['beginning_no_space']:
            loc = full_name.find(string)
            if loc == 0:
                toolcode[7] += '{:03d}'.format(num) + 'd'
                # NOTE(review): assumes a character follows the matched
                # prefix; raises IndexError if full_name == string.
                if full_name[len(string)] not in toolcode[9]:
                    toolcode[9] += full_name[len(string)]

        if extra:
            loc = full_name.find(extra)
            if loc != -1:
                toolcode[7] += '{:03d}'.format(num) + 'X'
                # NOTE(review): the guard inspects the char at
                # loc + len(extra) but appends the char at loc + len(string);
                # these differ whenever extra != string, which looks like a
                # copy/paste slip -- confirm against the Synoname spec.
                if full_name[loc + len(extra)] not in toolcode[9]:
                    toolcode[9] += full_name[loc + len(string)]

    return lname, fname, ''.join(toolcode)
731
732
733
if __name__ == '__main__':
    # Run all docstring examples in this module as tests.
    import doctest
    doctest.testmod()
736