Completed
Push — master (76e221...14a933), created by Chris at 08:59

abydos.clustering.occurrence_fingerprint()   Grade: A

Complexity
    Conditions: 5

Size
    Total lines: 31
    Code lines: 14

Duplication
    Lines: 0
    Ratio: 0%

Importance
    Changes: 0

Metric  Value
cc      5
eloc    14
nop     3
dl      0
loc     31
rs      9.2333
c       0
b       0
f       0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.clustering.

The clustering module implements clustering algorithms such as:
    - string fingerprint
    - q-gram fingerprint
    - phonetic fingerprint
    - skeleton key
    - omission key
    - occurrence & occurrence halved fingerprints
    - count & position fingerprints
    - mean pairwise similarity
"""

from __future__ import division, unicode_literals

import unicodedata
from collections import Counter

from six import text_type
from six.moves import range

from .distance import sim
from .phonetic import double_metaphone
from .qgram import QGrams
from .stats import hmean


def fingerprint(phrase):
    """Return string fingerprint.

    The fingerprint of a string is a string consisting of all of the unique
    words in the string, alphabetized & concatenated with intervening spaces.

    :param str phrase: the string from which to calculate the fingerprint
    :returns: the fingerprint of the phrase
    :rtype: str

    >>> fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
    """
    phrase = unicodedata.normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join(c for c in phrase if c.isalnum() or c.isspace())
    phrase = ' '.join(sorted(set(phrase.split())))
    return phrase
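
# Usage sketch (illustrative; not part of the original module). fingerprint()
# case-folds, strips punctuation, deduplicates, & alphabetizes, so the
# expected output below follows directly from the steps above:
#
#     >>> fingerprint('To be, or not to be: that is the question.')
#     'be is not or question that the to'
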
def qgram_fingerprint(phrase, qval=2, start_stop=''):
    """Return q-gram fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique q-grams
    in a string, alphabetized & concatenated.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s) to concatenate on either
        end of the phrase, as defined in abydos.qgram.QGrams
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    phrase = unicodedata.normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join(c for c in phrase if c.isalnum())
    phrase = QGrams(phrase, qval, start_stop)
    phrase = ''.join(sorted(phrase))
    return phrase
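
# Usage sketch (illustrative; not part of the original module, and assuming
# QGrams yields single letters for qval=1 with no start/stop padding). The
# q-grams of 'niall' then reduce to its unique letters in alphabetical order:
#
#     >>> qgram_fingerprint('Niall', qval=1)
#     'ailn'
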
def phonetic_fingerprint(phrase, phonetic_algorithm=double_metaphone, *args):
    """Return the phonetic fingerprint of a phrase.

    A phonetic fingerprint is identical to a standard string fingerprint, as
    implemented in abydos.clustering.fingerprint(), but performs the
    fingerprinting function after converting the string to its phonetic form,
    as determined by some phonetic algorithm.

    :param str phrase: the string from which to calculate the phonetic
        fingerprint
    :param function phonetic_algorithm: a phonetic algorithm that takes a
        string and returns a string (presumably a phonetic representation of
        the original string). By default, this function uses
        abydos.phonetic.double_metaphone()
    :param args: additional arguments to pass to the phonetic algorithm,
        along with each word of the phrase
    :returns: the phonetic fingerprint of the phrase
    :rtype: str

    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.')
    '0 afr fks jmpt kk ls prn tk'
    >>> from abydos.phonetic import soundex
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... phonetic_algorithm=soundex)
    'b650 d200 f200 j513 l200 o160 q200 t000'
    """
    phonetic = ''
    for word in phrase.split():
        word = phonetic_algorithm(word, *args)
        if not isinstance(word, text_type) and hasattr(word, '__iter__'):
            # some algorithms (e.g. double_metaphone) return a tuple of
            # codes; keep only the primary code
            word = word[0]
        phonetic += word + ' '
    phonetic = phonetic[:-1]
    return fingerprint(phonetic)
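
# Usage sketch (illustrative; not part of the original module, and assuming
# double_metaphone('Niall') returns ('NL', ...) so that the primary code
# 'NL' is kept and then case-folded by fingerprint()):
#
#     >>> phonetic_fingerprint('Niall')
#     'nl'
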
def skeleton_key(word):
    """Return the skeleton key.

    The skeleton key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling
    Correction in Scientific and Scholarly Text." Communications of the ACM,
    27(4). 358--368. <http://dl.acm.org/citation.cfm?id=358048>

    :param str word: the word to transform into its skeleton key
    :returns: the skeleton key
    :rtype: str

    >>> skeleton_key('The quick brown fox jumped over the lazy dog.')
    'THQCKBRWNFXJMPDVLZYGEUIOA'
    >>> skeleton_key('Christopher')
    'CHRSTPIOE'
    >>> skeleton_key('Niall')
    'NLIA'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    start = word[0:1]
    consonant_part = ''
    vowel_part = ''

    # add consonants & vowels to separate strings
    # (omitting the first char & duplicates)
    for char in word[1:]:
        if char != start:
            if char in _vowels:
                if char not in vowel_part:
                    vowel_part += char
            elif char not in consonant_part:
                consonant_part += char
    # return the first char followed by consonants followed by vowels
    return start + consonant_part + vowel_part
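
# Worked example (illustrative; not part of the original module, traced from
# the logic above). For 'CHEMOGENIC' the first letter 'C' is kept, the
# remaining unique consonants are H, M, G, N, and the unique vowels are
# E, O, I, so the key is 'C' + 'HMGN' + 'EOI':
#
#     >>> skeleton_key('CHEMOGENIC')
#     'CHMGNEOI'
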
def omission_key(word):
    """Return the omission key.

    The omission key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling
    Correction in Scientific and Scholarly Text." Communications of the ACM,
    27(4). 358--368. <http://dl.acm.org/citation.cfm?id=358048>

    :param str word: the word to transform into its omission key
    :returns: the omission key
    :rtype: str

    >>> omission_key('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> omission_key('Christopher')
    'PHCTSRIOE'
    >>> omission_key('Niall')
    'LNIA'
    """
    _consonants = ('J', 'K', 'Q', 'X', 'Z', 'V', 'W', 'Y', 'B', 'F', 'M', 'G',
                   'P', 'D', 'H', 'C', 'L', 'N', 'T', 'S', 'R')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    key = ''

    # add consonants in order supplied by _consonants (no duplicates)
    for char in _consonants:
        if char in word:
            key += char

    # add vowels in order they appeared in the word (no duplicates)
    for char in word:
        if char not in _consonants and char not in key:
            key += char

    return key
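
# Worked example (illustrative; not part of the original module, traced from
# the logic above). For 'MICROELECTRONICS' the consonants present, taken in
# _consonants order, are M, C, L, N, T, S, R, and the vowels in order of
# first appearance are I, O, E:
#
#     >>> omission_key('MICROELECTRONICS')
#     'MCLNTSRIOE'
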
# TODO: Dump all these to a data file.

# most common letters, as defined in Cisłak & Grabowski
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
                          'l', 'c', 'u', 'm', 'w', 'f')

# most common letters (case-folded to lowercase), as shown in Google Books
# English n-grams, among letters a-z & digits 0-9
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l',
                             'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b',
                             'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9',
                             '3', '4', '8', '5', '6', '7')

# most common letters, as shown in Google Books English n-grams, among letters
# A-Z, a-z & digits 0-9
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd',
                       'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k',
                       'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H',
                       'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j',
                       'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q',
                       '3', 'Z', '4', '8', '5', '6', '7')

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (a-z, umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                          'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                          'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q',
                          '1', '2', '3', '4', '0', '5', '6', '9', '8', '7')

# most common letters, as shown in Google Books German n-grams, among letters
# (A-Z, a-z, umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                             'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                             'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G',
                             'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I',
                             'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O',
                             'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2',
                             'Y', '3', '4', '0', '5', '6', '9', '8', '7')


def occurrence_fingerprint(word, n_bits=16,
                           most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence fingerprint.

    Based on the occurrence fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence fingerprint
    :rtype: int
    """
    word = set(word)
    fingerprint = 0

    # set one bit per common letter: 1 if the letter occurs in the word
    for letter in most_common:
        if letter in word:
            fingerprint += 1
        n_bits -= 1
        if n_bits:
            fingerprint <<= 1
        else:
            break

    # pad the low-order bits if most_common was shorter than n_bits
    if n_bits:
        fingerprint <<= n_bits

    return fingerprint
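
# Worked example (illustrative; not part of the original module, traced from
# the logic above). For 'hat' with the default 16-bit CG alphabet
# (e t a o i n s h r d l c u m w f), only the t, a, & h bits are set,
# giving 0b0110000100000000:
#
#     >>> occurrence_fingerprint('hat')
#     24832
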
def occurrence_halved_fingerprint(word, n_bits=16,
                                  most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence halved fingerprint.

    Based on the occurrence halved fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the occurrence halved fingerprint
    :rtype: int
    """
    if n_bits % 2:
        n_bits += 1

    # split the word into two halves and check occurrence in each separately
    w_len = len(word)//2
    w_1 = set(word[:w_len])
    w_2 = set(word[w_len:])
    fingerprint = 0

    # set two bits per common letter: one for each half of the word
    for letter in most_common:
        if letter in w_1:
            fingerprint += 1
        fingerprint <<= 1
        if letter in w_2:
            fingerprint += 1
        n_bits -= 2
        if n_bits:
            fingerprint <<= 1
        else:
            break

    if n_bits:
        fingerprint <<= n_bits

    return fingerprint
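
# Worked example (illustrative; not part of the original module, traced from
# the logic above). 'hat' splits into 'h' & 'at', so the two-bit pairs for
# (e t a o i n s h) are 00 01 01 00 00 00 00 10, i.e. 0b0001010000000010:
#
#     >>> occurrence_halved_fingerprint('hat')
#     5122
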
def count_fingerprint(word, n_bits=16,
                      most_common=MOST_COMMON_LETTERS_CG):
    """Return the count fingerprint.

    Based on the count fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :return: the count fingerprint
    :rtype: int
    """
    if n_bits % 2:
        n_bits += 1

    word = Counter(word)
    fingerprint = 0

    # store two bits per common letter: the low two bits of its count
    for letter in most_common:
        fingerprint += (word[letter] & 3)
        n_bits -= 2
        if n_bits:
            fingerprint <<= 2
        else:
            break

    if n_bits:
        fingerprint <<= n_bits

    return fingerprint
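
# Worked example (illustrative; not part of the original module, traced from
# the logic above). Each of t, a, & h occurs once in 'hat', so the two-bit
# fields for (e t a o i n s h) are 00 01 01 00 00 00 00 01, i.e.
# 0b0001010000000001:
#
#     >>> count_fingerprint('hat')
#     5121
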
def position_fingerprint(word, n_bits=16,
                         most_common=MOST_COMMON_LETTERS_CG,
                         bits_per_letter=3):
    """Return the position fingerprint.

    Based on the position fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    :param word: the word to fingerprint
    :param n_bits: number of bits in the fingerprint returned
    :param most_common: the most common tokens in the target language
    :param bits_per_letter: the number of bits to assign per letter position
    :return: the position fingerprint
    :rtype: int
    """
    # record each common letter's first position in the word, saturated at
    # the largest value that bits_per_letter can hold
    position = {}
    for pos, letter in enumerate(word):
        if letter not in position and letter in most_common:
            position[letter] = min(pos, 2**bits_per_letter-1)

    fingerprint = 0
    for letter in most_common:
        if letter in position:
            fingerprint += min(position[letter], 2**n_bits-1)
        n_bits -= bits_per_letter
        if n_bits > 0:
            fingerprint <<= min(bits_per_letter, n_bits)
        else:
            break

    if n_bits > 0:
        fingerprint <<= n_bits

    return fingerprint
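
# Worked example (illustrative; not part of the original module, traced from
# the logic above). In 'hat' the first positions are h=0, a=1, t=2, so the
# 16-bit result packs three-bit fields for e, t, a, o, i (the last field is
# truncated to fit), giving 0b0000100010000000:
#
#     >>> position_fingerprint('hat')
#     2176
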
def mean_pairwise_similarity(collection, metric=sim,
                             meanfunc=hmean, symmetric=False):
    """Calculate the mean pairwise similarity of a collection of strings.

    Takes the mean of the pairwise similarity between each member of a
    collection, optionally in both directions (for asymmetric similarity
    metrics).

    :param list collection: a collection of terms or a string that can be
        split
    :param function metric: a similarity metric function
    :param function meanfunc: a mean function that takes a list of values
        and returns a float
    :param bool symmetric: set to True if all pairwise similarities should be
        calculated in both directions
    :returns: the mean pairwise similarity of a collection of strings
    :rtype: float

    >>> mean_pairwise_similarity(['Christopher', 'Kristof', 'Christobal'])
    0.51980198019801982
    >>> mean_pairwise_similarity(['Niall', 'Neal', 'Neil'])
    0.54545454545454541
    """
    if hasattr(collection, 'split'):
        collection = collection.split()
    if not hasattr(collection, '__iter__'):
        raise ValueError('collection is neither a string nor iterable type')
    elif len(collection) < 2:
        raise ValueError('collection has fewer than two members')

    collection = list(collection)

    pairwise_values = []

    for i in range(len(collection)):
        for j in range(i+1, len(collection)):
            pairwise_values.append(metric(collection[i], collection[j]))
            if symmetric:
                pairwise_values.append(metric(collection[j], collection[i]))

    if not callable(meanfunc):
        raise ValueError('meanfunc must be a function')
    return meanfunc(pairwise_values)
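
# Usage sketch (illustrative; not part of the original module). A plain
# string is split on whitespace before the pairwise comparisons, so this
# call is equivalent to the ['Niall', 'Neal', 'Neil'] doctest above:
#
#     >>> mean_pairwise_similarity('Niall Neal Neil')
#     0.54545454545454541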