# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.clustering.

The clustering module implements clustering algorithms such as:
    - string fingerprint
    - q-gram fingerprint
    - phonetic fingerprint
    - skeleton key
    - omission key
"""

from __future__ import division, unicode_literals

import unicodedata
from collections import Counter

from six import text_type
from six.moves import range

from .distance import sim
from .phonetic import double_metaphone
from .qgram import QGrams
from .stats import hmean


def fingerprint(phrase):
    """Return string fingerprint.

    The fingerprint of a phrase is a string consisting of all of the unique
    words in the phrase, alphabetized & concatenated with intervening spaces.

    :param str phrase: the string from which to calculate the fingerprint
    :returns: the fingerprint of the phrase
    :rtype: str

    >>> fingerprint('The quick brown fox jumped over the lazy dog.')
    'brown dog fox jumped lazy over quick the'
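
    As an illustrative check (not one of the original doctests), a phrase
    differing only in case, punctuation, word order, & word duplication
    yields the same fingerprint:

    >>> fingerprint('Dog! The the quick brown fox jumped over the LAZY dog.')
    'brown dog fox jumped lazy over quick the'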
    """
    phrase = unicodedata.normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join([c for c in phrase if c.isalnum() or c.isspace()])
    phrase = ' '.join(sorted(list(set(phrase.split()))))
    return phrase


def qgram_fingerprint(phrase, qval=2, start_stop=''):
    """Return Q-Gram fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique q-grams
    in a string, alphabetized & concatenated.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s) to concatenate on either
        end of the phrase, as defined in abydos.qgram.QGrams()
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
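
    The start_stop symbol is illustrated by this sketch (not one of the
    original doctests; it assumes, per abydos.qgram.QGrams(), that the
    symbol pads both ends of the phrase):

    >>> qgram_fingerprint('Niall', qval=2, start_stop='$')
    '$nalial$llni'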
    """
    phrase = unicodedata.normalize('NFKD', text_type(phrase.strip().lower()))
    phrase = ''.join(c for c in phrase if c.isalnum())
    phrase = QGrams(phrase, qval, start_stop)
    phrase = ''.join(sorted(phrase))
    return phrase


def phonetic_fingerprint(phrase, phonetic_algorithm=double_metaphone, *args):
    """Return the phonetic fingerprint of a phrase.

    A phonetic fingerprint is identical to a standard string fingerprint, as
    implemented in abydos.clustering.fingerprint(), but performs the
    fingerprinting function after converting the string to its phonetic form,
    as determined by some phonetic algorithm.

    :param str phrase: the string from which to calculate the phonetic
        fingerprint
    :param function phonetic_algorithm: a phonetic algorithm that takes a
        string and returns a string (presumably a phonetic representation of
        the original string). By default, this function uses
        abydos.phonetic.double_metaphone()
    :param args: additional arguments to pass to the phonetic algorithm,
        along with the phrase itself
    :returns: the phonetic fingerprint of the phrase
    :rtype: str

    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.')
    '0 afr fks jmpt kk ls prn tk'
    >>> from abydos.phonetic import soundex
    >>> phonetic_fingerprint('The quick brown fox jumped over the lazy dog.',
    ... phonetic_algorithm=soundex)
    'b650 d200 f200 j513 l200 o160 q200 t000'
    """
    phonetic = ''
    for word in phrase.split():
        word = phonetic_algorithm(word, *args)
        if not isinstance(word, text_type) and hasattr(word, '__iter__'):
            word = word[0]
        phonetic += word + ' '
    phonetic = phonetic[:-1]
    return fingerprint(phonetic)


def skeleton_key(word):
    """Return the skeleton key.

    The skeleton key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
    in Scientific and Scholarly Text." Communications of the ACM, 27(4).
    358--368. <http://dl.acm.org/citation.cfm?id=358048>

    :param str word: the word to transform into its skeleton key
    :returns: the skeleton key
    :rtype: str

    >>> skeleton_key('The quick brown fox jumped over the lazy dog.')
    'THQCKBRWNFXJMPDVLZYGEUIOA'
    >>> skeleton_key('Christopher')
    'CHRSTPIOE'
    >>> skeleton_key('Niall')
    'NLIA'
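
    As an illustrative sketch (not one of the original doctests), a
    transposition yields the same key, the behavior that makes the skeleton
    key useful for spelling correction:

    >>> skeleton_key('Chrisotpher')
    'CHRSTPIOE'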
    """
    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    start = word[0:1]
    consonant_part = ''
    vowel_part = ''

    # add consonants & vowels to separate strings
    # (omitting the first char & duplicates)
    for char in word[1:]:
        if char != start:
            if char in _vowels:
                if char not in vowel_part:
                    vowel_part += char
            elif char not in consonant_part:
                consonant_part += char
    # return the first char followed by consonants followed by vowels
    return start + consonant_part + vowel_part


def omission_key(word):
    """Return the omission key.

    The omission key of a word is defined in:
    Pollock, Joseph J. and Antonio Zamora. 1984. "Automatic Spelling Correction
    in Scientific and Scholarly Text." Communications of the ACM, 27(4).
    358--368. <http://dl.acm.org/citation.cfm?id=358048>

    :param str word: the word to transform into its omission key
    :returns: the omission key
    :rtype: str

    >>> omission_key('The quick brown fox jumped over the lazy dog.')
    'JKQXZVWYBFMGPDHCLNTREUIOA'
    >>> omission_key('Christopher')
    'PHCTSRIOE'
    >>> omission_key('Niall')
    'LNIA'
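
    As an illustrative sketch (not one of the original doctests), omitting a
    letter (the 'h' here) yields the same key, the error class this key was
    designed to catch:

    >>> omission_key('Cristopher')
    'PHCTSRIOE'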
    """
    _consonants = ('J', 'K', 'Q', 'X', 'Z', 'V', 'W', 'Y', 'B', 'F', 'M', 'G',
                   'P', 'D', 'H', 'C', 'L', 'N', 'T', 'S', 'R')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    key = ''

    # add consonants in order supplied by _consonants (no duplicates)
    for char in _consonants:
        if char in word:
            key += char

    # add vowels in order they appeared in the word (no duplicates)
    for char in word:
        if char not in _consonants and char not in key:
            key += char

    return key


# TODO: Dump all these to a data file.
# most common letters, as defined in Cisłak & Grabowski
MOST_COMMON_LETTERS_CG = ('e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
                          'l', 'c', 'u', 'm', 'w', 'f')

# most common letters (case-folded to lowercase), as shown in Google Books
# English n-grams, among letters a-z & digits 0-9
MOST_COMMON_LETTERS_EN_LC = ('e', 't', 'a', 'i', 'o', 'n', 's', 'r', 'h', 'l',
                             'd', 'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b',
                             'v', 'k', 'x', 'j', 'q', 'z', '1', '2', '0', '9',
                             '3', '4', '8', '5', '6', '7')

# most common letters, as shown in Google Books English n-grams, among letters
# A-Z, a-z & digits 0-9
MOST_COMMON_LETTERS = ('e', 't', 'a', 'o', 'i', 'n', 's', 'r', 'h', 'l', 'd',
                       'c', 'u', 'm', 'f', 'p', 'g', 'y', 'w', 'b', 'v', 'k',
                       'T', 'I', 'A', 'S', 'C', 'x', 'M', 'P', 'E', 'B', 'H',
                       'R', 'N', 'D', 'L', 'F', 'W', 'O', 'q', 'G', 'z', 'j',
                       'J', 'U', 'V', 'K', 'Y', '1', '2', '0', 'X', '9', 'Q',
                       '3', 'Z', '4', '8', '5', '6', '7')

# most common letters (case-folded to lowercase), as shown in Google Books
# German n-grams, among letters (a-z and umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                          'l', 'g', 'c', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                          'v', 'p', 'ü', 'ä', 'ß', 'ö', 'j', 'y', 'x', 'q',
                          '1', '2', '3', '4', '0', '5', '6', '9', '8', '7')

# most common letters, as shown in Google Books German n-grams, among letters
# (A-Z, a-z, umlauted vowels & eszett) & digits 0-9
MOST_COMMON_LETTERS_DE_LC = ('e', 'n', 'i', 'r', 's', 't', 'a', 'd', 'h', 'u',
                             'l', 'c', 'g', 'o', 'm', 'b', 'f', 'w', 'k', 'z',
                             'v', 'p', 'ü', 'ä', 'S', 'A', 'D', 'B', 'E', 'G',
                             'M', 'ß', 'V', 'K', 'ö', 'W', 'F', 'P', 'R', 'I',
                             'H', 'L', 'T', 'N', 'Z', 'y', 'U', 'j', 'J', 'O',
                             'C', 'x', 'q', 'Ü', 'Q', 'X', 'Ä', 'Ö', '1', '2',
                             'Y', '3', '4', '0', '5', '6', '9', '8', '7')


def occurrence_fingerprint(word, n_bits=16,
                           most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence fingerprint.

    Based on the occurrence fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Each of the n_bits bits is set to 1 if the corresponding letter of
    most_common occurs anywhere in the word, and to 0 otherwise.

    :param str word: the word to fingerprint
    :param int n_bits: number of bits in the fingerprint returned
    :param tuple most_common: the most common tokens in the target language
    :return: the occurrence fingerprint
    :rtype: int
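
    The examples below are illustrative sketches (hand-computed from this
    implementation with the default table, not original doctests):

    >>> bin(occurrence_fingerprint('hat'))
    '0b110000100000000'
    >>> bin(occurrence_fingerprint('niall'))
    '0b10110000100000'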
    """
    word = set(word)
    fingerprint = 0

    for letter in most_common:
        if letter in word:
            fingerprint += 1
        n_bits -= 1
        if n_bits:
            fingerprint <<= 1
        else:
            break

    if n_bits:
        fingerprint <<= n_bits

    return fingerprint


def occurrence_halved_fingerprint(word, n_bits=16,
                                  most_common=MOST_COMMON_LETTERS_CG):
    """Return the occurrence halved fingerprint.

    Based on the occurrence halved fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    For each letter of most_common, two bits are stored: the first is set if
    the letter occurs in the first half of the word, the second if it occurs
    in the second half.

    :param str word: the word to fingerprint
    :param int n_bits: number of bits in the fingerprint returned
    :param tuple most_common: the most common tokens in the target language
    :return: the occurrence halved fingerprint
    :rtype: int
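
    Illustrative sketches (hand-computed from this implementation with the
    default parameters, not original doctests):

    >>> bin(occurrence_halved_fingerprint('hat'))
    '0b1010000000010'
    >>> bin(occurrence_halved_fingerprint('niall'))
    '0b10010100000'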
    """
    if n_bits % 2:
        n_bits += 1

    w_len = len(word)//2
    w_1 = set(word[:w_len])
    w_2 = set(word[w_len:])
    fingerprint = 0

    for letter in most_common:
        if letter in w_1:
            fingerprint += 1
        fingerprint <<= 1
        if letter in w_2:
            fingerprint += 1
        n_bits -= 2
        if n_bits:
            fingerprint <<= 1
        else:
            break

    if n_bits:
        fingerprint <<= n_bits

    return fingerprint


def count_fingerprint(word, n_bits=16,
                      most_common=MOST_COMMON_LETTERS_CG):
    """Return the count fingerprint.

    Based on the count fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Each letter of most_common is assigned two bits, which hold the two
    least significant bits of that letter's count in the word.

    :param str word: the word to fingerprint
    :param int n_bits: number of bits in the fingerprint returned
    :param tuple most_common: the most common tokens in the target language
    :return: the count fingerprint
    :rtype: int
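
    Illustrative sketches (hand-computed from this implementation with the
    default parameters, not original doctests):

    >>> bin(count_fingerprint('hat'))
    '0b1010000000001'
    >>> bin(count_fingerprint('niall'))
    '0b10001010000'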
    """
    if n_bits % 2:
        n_bits += 1

    word = Counter(word)
    fingerprint = 0

    for letter in most_common:
        fingerprint += (word[letter] & 3)
        n_bits -= 2
        if n_bits:
            fingerprint <<= 2
        else:
            break

    if n_bits:
        fingerprint <<= n_bits

    return fingerprint


def position_fingerprint(word, n_bits=16,
                         most_common=MOST_COMMON_LETTERS_CG,
                         bits_per_letter=3):
    """Return the position fingerprint.

    Based on the position fingerprint from:
    Cisłak, Aleksander and Szymon Grabowski. "Lightweight Fingerprints for
    Fast Approximate Keyword Matching Using Bitwise Operations."
    http://arxiv.org/abs/1711.08475

    Each letter of most_common is assigned a field of bits_per_letter bits;
    for a letter present in the word, the field holds the position of its
    first occurrence, capped at 2**bits_per_letter-1, while the field of an
    absent letter is left at 0.

    :param str word: the word to fingerprint
    :param int n_bits: number of bits in the fingerprint returned
    :param tuple most_common: the most common tokens in the target language
    :param int bits_per_letter: the bits to assign for letter position
    :return: the position fingerprint
    :rtype: int
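
    An illustrative sketch (hand-computed from this implementation with the
    default parameters, not an original doctest):

    >>> bin(position_fingerprint('hat'))
    '0b100010000000'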
    """
    position = {}
    for pos, letter in enumerate(word):
        if letter not in position and letter in most_common:
            position[letter] = min(pos, 2**bits_per_letter-1)

    fingerprint = 0
    for letter in most_common:
        if letter in position:
            fingerprint += min(position[letter], 2**n_bits-1)
        n_bits -= bits_per_letter
        if n_bits > 0:
            fingerprint <<= min(bits_per_letter, n_bits)
        else:
            break

    if n_bits > 0:
        fingerprint <<= n_bits

    return fingerprint


def mean_pairwise_similarity(collection, metric=sim,
                             meanfunc=hmean, symmetric=False):
    """Calculate the mean pairwise similarity of a collection of strings.

    Takes the mean of the pairwise similarity between each member of a
    collection, optionally in both directions (for asymmetric similarity
    metrics).

    :param list collection: a collection of terms or a string that can be
        split
    :param function metric: a similarity metric function
    :param function meanfunc: a mean function that takes a list of values
        and returns a float
    :param bool symmetric: set to True if all pairwise similarities should be
        calculated in both directions
    :returns: the mean pairwise similarity of a collection of strings
    :rtype: float

    >>> mean_pairwise_similarity(['Christopher', 'Kristof', 'Christobal'])
    0.51980198019801982
    >>> mean_pairwise_similarity(['Niall', 'Neal', 'Neil'])
    0.54545454545454541
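
    Because a string argument is split on whitespace, this illustrative
    sketch (not an original doctest) is equivalent to the list form above:

    >>> mean_pairwise_similarity('Niall Neal Neil')
    0.54545454545454541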
    """
    if hasattr(collection, 'split'):
        collection = collection.split()
    if not hasattr(collection, '__iter__'):
        raise ValueError('collection is neither a string nor iterable type')
    elif len(collection) < 2:
        raise ValueError('collection has fewer than two members')

    collection = list(collection)

    pairwise_values = []

    for i in range(len(collection)):
        for j in range(i+1, len(collection)):
            pairwise_values.append(metric(collection[i], collection[j]))
            if symmetric:
                pairwise_values.append(metric(collection[j], collection[i]))

    if not callable(meanfunc):
        raise ValueError('meanfunc must be a function')
    return meanfunc(pairwise_values)