# -*- coding: utf-8 -*-

# Copyright 2018-2019 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._token_distance.

The distance._token_distance._TokenDistance module implements abstract class
_TokenDistance.
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from collections import Counter, OrderedDict
from itertools import product
from math import exp, log1p

from numpy import copy as np_copy
from numpy import zeros as np_zeros

try:
    from scipy.optimize import linear_sum_assignment
except ImportError:  # pragma: no cover
    # If the system lacks the scipy library, we'll fall back to our
    # Python+Numpy implementation of the Hungarian algorithm
    linear_sum_assignment = None

from ._damerau_levenshtein import DamerauLevenshtein
from ._distance import _Distance
from ._lcprefix import LCPrefix
from ._levenshtein import Levenshtein
from ..stats import ConfusionTable
from ..tokenizer import QGrams, QSkipgrams, WhitespaceTokenizer

__all__ = ['_TokenDistance']


class _TokenDistance(_Distance):
    r"""Abstract Token Distance class.

    .. _confusion_table:

    +----------------+--------------+-----------------+-------+
    |                | |in| ``tar`` | |notin| ``tar`` |       |
    +----------------+--------------+-----------------+-------+
    | |in| ``src``   | |a|          | |b|             | |a+b| |
    +----------------+--------------+-----------------+-------+
    | |notin| ``src``| |c|          | |d|             | |c+d| |
    +----------------+--------------+-----------------+-------+
    |                | |a+c|        | |b+d|           | |n|   |
    +----------------+--------------+-----------------+-------+

    .. |in| replace:: :math:`x \in`
    .. |notin| replace:: :math:`x \notin`

    .. |a| replace:: :math:`a = |X \cap Y|`
    .. |b| replace:: :math:`b = |X \setminus Y|`
    .. |c| replace:: :math:`c = |Y \setminus X|`
    .. |d| replace:: :math:`d = |(N \setminus X) \setminus Y|`
    .. |n| replace:: :math:`n = |N|`
    .. |a+b| replace:: :math:`p_1 = a+b = |X|`
    .. |a+c| replace:: :math:`p_2 = a+c = |Y|`
    .. |c+d| replace:: :math:`q_1 = c+d = |N \setminus X|`
    .. |b+d| replace:: :math:`q_2 = b+d = |N \setminus Y|`

    .. versionadded:: 0.3.6
    """

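    # As a worked example of the table above: for src = 'AT' and tar = 'TT'
    # under the default bigram tokenizer, X = {'$A', 'AT', 'T#'} and
    # Y = {'$T', 'TT', 'T#'} over a population N of 28 ** 2 = 784 possible
    # bigrams, so a = 1 ('T#'), b = 2, c = 2, and d = 784 - 5 = 779.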
    def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs):
        r"""Initialize _TokenDistance instance.

        .. _intersection_type:

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and the set type as a result:

            - ``crisp``: Ordinary intersection, wherein items are entirely
              members or non-members of the intersection. (Default)
            - ``fuzzy``: Fuzzy intersection, defined by :cite:`Wang:2014`,
              wherein items can be partially members of the intersection
              if their similarity meets or exceeds a threshold value. This
              also takes `metric` (by default :class:`Levenshtein()`) and
              `threshold` (by default 0.8) parameters.
            - ``soft``: Soft intersection, defined by :cite:`Russ:2014`,
              wherein items can be partially members of the intersection
              depending on their similarity. This also takes a `metric`
              (by default :class:`DamerauLevenshtein()`) parameter.
            - ``linkage``: Group linkage, defined by :cite:`On:2007`. Like
              the soft intersection, items can be partially members of the
              intersection, but the method of pairing similar members is
              somewhat more complex. See the cited paper for details. This
              also takes `metric` (by default
              :class:`DamerauLevenshtein()`) and `threshold` (by default
              0.1) parameters.
        **kwargs
            Arbitrary keyword arguments


        .. _alphabet:

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.

            - If a Counter is supplied, it is used directly in computing
              the complement of the tokens in both sets.
            - If a collection is supplied, it is converted to a Counter
              and used directly. In the case of a single string being
              supplied and the QGram tokenizer being used, the full
              alphabet is inferred (i.e.
              :math:`len(set(alphabet+QGrams.start\_stop))^{QGrams.qval}`
              is used as the cardinality of the full alphabet).
            - If an int is supplied, it is used as the cardinality of the
              full alphabet.
            - If None is supplied, the cardinality of the full alphabet
              is inferred if QGrams or QSkipgrams tokenization is used
              (i.e. :math:`28^{QGrams.qval}` is used as the cardinality of
              the full alphabet, or :math:`26` if QGrams.qval is 1, which
              assumes the strings are English language strings and only
              contain letters of a single case). Otherwise, the
              cardinality of the complement of the total will be 0.
        normalizer : str
            This represents the normalization applied to the values in the
            2x2 contingency table prior to any of the cardinality (\*_card)
            methods returning a value. By default, no normalization is
            applied, but the following values are supported:

            - ``proportional`` : :math:`\frac{x}{n}`, where n is the total
              population
            - ``log`` : :math:`log(1+x)`
            - ``exp`` : :math:`e^x`
            - ``laplace`` : :math:`x+1`
            - ``inverse`` : :math:`\frac{1}{x}`
            - ``complement`` : :math:`n-x`, where n is the total population
        internal_assignment_problem : bool
            When using ``linkage`` as the intersection type (i.e. group
            linkage), this forces use of the internal implementation to
            solve the assignment problem, rather than scipy's
            linear_sum_assignment.

        .. versionadded:: 0.4.0

        """
        super(_TokenDistance, self).__init__(
            intersection_type=intersection_type, **kwargs
        )

        qval = 2 if 'qval' not in self.params else self.params['qval']
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else WhitespaceTokenizer()
            if qval == 0
            else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None)
        )

        if hasattr(self.params['tokenizer'], 'qval'):
            if isinstance(self.params['tokenizer'].qval, int):
                qvals = [self.params['tokenizer'].qval]
            else:
                qvals = list(self.params['tokenizer'].qval)
        else:
            qvals = []

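        # Infer the cardinality of the full alphabet of possible tokens,
        # following the rules documented for the alphabet parameter above.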
        if 'alphabet' in self.params:
            if isinstance(self.params['alphabet'], str):
                self.params['alphabet'] = set(self.params['alphabet'])
                if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)):
                    self.params['alphabet'] |= set(
                        self.params['tokenizer'].start_stop
                    )
                self.params['alphabet'] = sum(
                    len(self.params['alphabet']) ** qval for qval in qvals
                )
            if hasattr(self.params['alphabet'], '__len__') and not isinstance(
                self.params['alphabet'], Counter
            ):
                self.params['alphabet'] = len(self.params['alphabet'])
            elif self.params['alphabet'] is None and isinstance(
                self.params['tokenizer'], (QGrams, QSkipgrams)
            ):
                self.params['alphabet'] = sum(
                    28 ** qval if qval > 1 else 26 for qval in qvals
                )
        else:
            if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)):
                self.params['alphabet'] = sum(
                    28 ** qval if qval > 1 else 26 for qval in qvals
                )
            else:
                self.params['alphabet'] = None

        if intersection_type == 'soft':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = DamerauLevenshtein()
            self._lcprefix = LCPrefix()
            self._intersection = self._soft_intersection
        elif intersection_type == 'fuzzy':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = Levenshtein()
            if 'threshold' not in self.params:
                self.params['threshold'] = 0.8
            self._intersection = self._fuzzy_intersection
        elif intersection_type == 'linkage':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = DamerauLevenshtein()
            if 'threshold' not in self.params:
                self.params['threshold'] = 0.1
            self._intersection = self._group_linkage_intersection
        else:
            self._intersection = self._crisp_intersection

        self._src_tokens = Counter()
        self._tar_tokens = Counter()
        self._population_card_value = 0

        # initialize normalizer
        self.normalizer = self._norm_none

        self._norm_dict = {
            'proportional': self._norm_proportional,
            'log': self._norm_log,
            'exp': self._norm_exp,
            'laplace': self._norm_laplace,
            'inverse': self._norm_inverse,
            'complement': self._norm_complement,
        }

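    # A minimal usage sketch (a hypothetical subclass, not part of Abydos):
    # a Jaccard-style similarity built from this class's primitives.
    #
    #     class MyJaccard(_TokenDistance):
    #         def sim(self, src, tar):
    #             self._tokenize(src, tar)
    #             return self._intersection_card() / max(1, self._union_card())
    #
    # e.g. MyJaccard(qval=2).sim('AT', 'TT') returns 1/5 = 0.2 with the
    # default bigram tokens.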
    def _norm_none(self, x, _squares, _pop):
        return x

    def _norm_proportional(self, x, _squares, pop):
        return x / max(1, pop)

    def _norm_log(self, x, _squares, _pop):
        return log1p(x)

    def _norm_exp(self, x, _squares, _pop):
        return exp(x)

    def _norm_laplace(self, x, squares, _pop):
        return x + squares

    def _norm_inverse(self, x, _squares, pop):
        return 1 / x if x else pop

    def _norm_complement(self, x, _squares, pop):
        return pop - x

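    # For instance, with a population of 785 and a raw count of 1 spanning
    # one contingency table square, the normalizers above yield:
    # proportional 1/785, log log1p(1) ~= 0.693, exp e**1 ~= 2.718,
    # laplace 1 + 1 = 2, inverse 1/1 = 1, and complement 785 - 1 = 784.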
    def _tokenize(self, src, tar):
        """Return the Q-Grams in src & tar.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        tuple of Counters
            Q-Grams

        Examples
        --------
        >>> pe = _TokenDistance()
        >>> pe._tokenize('AT', 'TT')._get_tokens()
        (Counter({'$A': 1, 'AT': 1, 'T#': 1}), Counter({'$T': 1, 'TT': 1, 'T#': 1}))


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        self._src_orig = src
        self._tar_orig = tar

        if isinstance(src, Counter):
            self._src_tokens = src
        else:
            self._src_tokens = (
                self.params['tokenizer'].tokenize(src).get_counter()
            )
        if isinstance(tar, Counter):
            self._tar_tokens = tar
        else:
            self._tar_tokens = (
                self.params['tokenizer'].tokenize(tar).get_counter()
            )

        self._population_card_value = self._calc_population_card()

        # Set up the normalizer, a function of three variables:
        # x, the value in the contingency table square(s); the number of
        # squares that x represents; and the total population
        if (
            'normalizer' in self.params
            and self.params['normalizer'] in self._norm_dict
        ):
            self.normalizer = self._norm_dict[self.params['normalizer']]

        return self

    def _get_tokens(self):
        """Return the src and tar tokens as a tuple."""
        return self._src_tokens, self._tar_tokens

    def _src_card(self):
        r"""Return the cardinality of the tokens in the source set."""
        return self.normalizer(
            sum(abs(val) for val in self._src_tokens.values()),
            2,
            self._population_card_value,
        )

    def _src_only(self):
        r"""Return the src tokens minus the tar tokens.

        For (multi-)sets S and T, this is :math:`S \setminus T`.
        """
        src_only = self._src_tokens - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            src_only -= self._intersection() - self._crisp_intersection()
        return src_only

    def _src_only_card(self):
        """Return the cardinality of the tokens only in the source set."""
        return self.normalizer(
            sum(abs(val) for val in self._src_only().values()),
            1,
            self._population_card_value,
        )

    def _tar_card(self):
        r"""Return the cardinality of the tokens in the target set."""
        return self.normalizer(
            sum(abs(val) for val in self._tar_tokens.values()),
            2,
            self._population_card_value,
        )

    def _tar_only(self):
        r"""Return the tar tokens minus the src tokens.

        For (multi-)sets S and T, this is :math:`T \setminus S`.
        """
        tar_only = self._tar_tokens - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            tar_only -= self._intersection() - self._crisp_intersection()
        return tar_only

    def _tar_only_card(self):
        """Return the cardinality of the tokens only in the target set."""
        return self.normalizer(
            sum(abs(val) for val in self._tar_only().values()),
            1,
            self._population_card_value,
        )

    def _symmetric_difference(self):
        r"""Return the symmetric difference of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \triangle T`.
        """
        return self._src_only() + self._tar_only()

    def _symmetric_difference_card(self):
        """Return the cardinality of the symmetric difference."""
        return self.normalizer(
            sum(abs(val) for val in self._symmetric_difference().values()),
            2,
            self._population_card_value,
        )

    def _total(self):
        """Return the sum of the sets.

        For (multi-)sets S and T, this is :math:`S + T`.

        In the case of multisets, this counts values in the intersection
        twice. In the case of sets, this is identical to the union.
        """
        return self._src_tokens + self._tar_tokens

    def _total_card(self):
        """Return the cardinality of the total."""
        return self.normalizer(
            sum(abs(val) for val in self._total().values()),
            3,
            self._population_card_value,
        )

    def _total_complement_card(self):
        """Return the cardinality of the complement of the total."""
        if self.params['alphabet'] is None:
            return self.normalizer(0, 1, self._population_card_value)
        elif isinstance(self.params['alphabet'], Counter):
            return self.normalizer(
                max(
                    0,
                    sum(
                        abs(val)
                        for val in (
                            self.params['alphabet'] - self._total()
                        ).values()
                    ),
                ),
                1,
                self._population_card_value,
            )
        return self.normalizer(
            max(0, self.params['alphabet'] - len(self._total().values())),
            1,
            self._population_card_value,
        )

    def _calc_population_card(self):
        """Return the cardinality of the population."""
        save_normalizer = self.normalizer
        self.normalizer = self._norm_none
        pop = self._total_card() + self._total_complement_card()
        self.normalizer = save_normalizer
        return pop

    def _population_card(self):
        """Return the cardinality of the population."""
        return self.normalizer(
            self._population_card_value, 4, self._population_card_value
        )

    def _population_unique_card(self):
        """Return the cardinality of the population minus the intersection."""
        return self.normalizer(
            self._population_card_value - self._intersection_card(),
            4,
            self._population_card_value,
        )

    def _union(self):
        r"""Return the union of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \cup T`.
        """
        union = self._total() - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            union -= self._intersection() - self._crisp_intersection()
        return union

    def _union_card(self):
        """Return the cardinality of the union."""
        return self.normalizer(
            sum(abs(val) for val in self._union().values()),
            3,
            self._population_card_value,
        )

    def _difference(self):
        """Return the difference of the tokens, supporting negative values."""
        _src_copy = Counter(self._src_tokens)
        _src_copy.subtract(self._tar_tokens)
        return _src_copy

    def _crisp_intersection(self):
        r"""Return the intersection of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \cap T`.
        """
        return self._src_tokens & self._tar_tokens

    def _soft_intersection(self):
        """Return the soft intersection of the tokens in src and tar.

        This implements the soft intersection defined by :cite:`Russ:2014`.
        """
        intersection = self._crisp_intersection()
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

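        # A pair's membership score is the greater of (length minus edit
        # distance) and the longest common prefix length, scaled by the
        # length of the longer token, yielding a value in [0, 1].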
        def _membership(src, tar):
            greater_length = max(len(src), len(tar))
            return (
                max(
                    greater_length - self.params['metric'].dist_abs(src, tar),
                    self._lcprefix.dist_abs(src, tar),
                )
                / greater_length
            )

        # Dictionary ordering is important for reproducibility, so insertion
        # order needs to be controlled and retained.
        memberships = OrderedDict(
            ((src, tar), _membership(src, tar))
            for src, tar in sorted(product(src_only, tar_only))
        )

        while memberships:
            src_tok, tar_tok = max(memberships, key=memberships.get)
            if memberships[src_tok, tar_tok] > 0.0:
                pairings = min(src_only[src_tok], tar_only[tar_tok])
                if pairings:
                    intersection[src_tok] += (
                        memberships[src_tok, tar_tok] * pairings / 2
                    )
                    intersection[tar_tok] += (
                        memberships[src_tok, tar_tok] * pairings / 2
                    )
                    src_only[src_tok] -= pairings
                    tar_only[tar_tok] -= pairings
            del memberships[src_tok, tar_tok]

        return intersection

    def _fuzzy_intersection(self):
        r"""Return the fuzzy intersection of the tokens in src and tar.

        This implements the fuzzy intersection defined by :cite:`Wang:2014`.

        For two sets X and Y, the intersection :cite:`Wang:2014` is the sum
        of similarities of all tokens in the two sets that are greater than
        or equal to some threshold value (:math:`\delta`).

        The lower bound on this intersection, and its value when
        :math:`\delta = 1.0`, is the crisp intersection. Tokens shorter than
        :math:`\frac{\delta}{1-\delta}`, 4 in the case of the default
        threshold :math:`\delta = 0.8`, must match exactly to be included
        in the intersection.


        .. versionadded:: 0.4.0

        """
        intersection = self._crisp_intersection()
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        pair = {}
        for src_tok in sorted(src_only):
            for tar_tok in sorted(tar_only):
                sim = self.params['metric'].sim(src_tok, tar_tok)
                if sim >= self.params['threshold']:
                    pair[(src_tok, tar_tok)] = sim

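        # Greedily pair tokens in order of decreasing similarity, breaking
        # ties by src token and then tar token; the three stable sorts
        # below apply these keys in reverse order of priority.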
        ordered_keys = [(pair[_], _[0], _[1]) for _ in pair]
        ordered_keys.sort(key=lambda x: x[2])
        ordered_keys.sort(key=lambda x: x[1])
        ordered_keys.sort(key=lambda x: x[0], reverse=True)

        for _, src_tok, tar_tok in ordered_keys:
            pairings = min(src_only[src_tok], tar_only[tar_tok])
            if pairings:
                sim = pair[(src_tok, tar_tok)]

                intersection[src_tok] += sim / 2 * pairings
                intersection[tar_tok] += sim / 2 * pairings

                src_only[src_tok] -= pairings
                tar_only[tar_tok] -= pairings

        """
        # Here is a slightly different optimization method, which is even
        # greedier than the above.
        # ordered by sim*pairings rather than just sim

        pair = {}
        for src_tok in sorted(src_only):
            for tar_tok in sorted(tar_only):
                sim = self.params['metric'].sim(src_tok, tar_tok)
                if sim >= self.params['threshold']:
                    pairings = min(src_only[src_tok], tar_only[tar_tok])
                    pair[(src_tok, tar_tok)] = sim*pairings

        for src_tok, tar_tok in sorted(pair, key=pair.get, reverse=True):
            pairings = min(src_only[src_tok], tar_only[tar_tok])
            if pairings:
                sim = pair[(src_tok, tar_tok)]

                intersection[src_tok] += sim / 2
                intersection[tar_tok] += sim / 2

                src_only[src_tok] -= pairings
                tar_only[tar_tok] -= pairings
        """

        return intersection

    def _group_linkage_intersection(self):
        r"""Return the group linkage intersection of the tokens in src and tar.

        This is based on group linkage, as defined by :cite:`On:2007`.

        Most of this method is concerned with solving the assignment
        problem, in order to find the weight of the maximum weight
        bipartite matching. If the system has SciPy installed, we use its
        linear_sum_assignment function to get the assignments. Otherwise,
        we use the Hungarian algorithm of Munkres :cite:`Munkres:1957`,
        implemented in Python & Numpy.

        .. versionadded:: 0.4.0

        """
        intersection = self._crisp_intersection()
        src_only = sorted(self._src_tokens - self._tar_tokens)
        tar_only = sorted(self._tar_tokens - self._src_tokens)

        if linear_sum_assignment and not (
            'internal_assignment_problem' in self.params
            and self.params['internal_assignment_problem']
        ):
            arr = np_zeros((len(tar_only), len(src_only)))

            for col in range(len(src_only)):
                for row in range(len(tar_only)):
                    arr[row, col] = self.params['metric'].dist(
                        src_only[col], tar_only[row]
                    )

            for row, col in zip(*linear_sum_assignment(arr)):
                sim = 1.0 - arr[row, col]
                if sim >= self.params['threshold']:
                    intersection[src_only[col]] += (sim / 2) * (
                        self._src_tokens - self._tar_tokens
                    )[src_only[col]]
                    intersection[tar_only[row]] += (sim / 2) * (
                        self._tar_tokens - self._src_tokens
                    )[tar_only[row]]
        else:
            n = max(len(tar_only), len(src_only))
            arr = np_zeros((n, n), dtype=float)

            for col in range(len(src_only)):
                for row in range(len(tar_only)):
                    arr[row, col] = self.params['metric'].dist(
                        src_only[col], tar_only[row]
                    )

            src_only += [''] * (n - len(src_only))
            tar_only += [''] * (n - len(tar_only))

            orig_sim = 1 - np_copy(arr)

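            # Hungarian (Munkres) algorithm, in brief: reduce each row and
            # column so that it contains a zero (Steps 1-2), repeatedly
            # assign zeros that are unique in their row or column (Step 3),
            # and, when too few assignments exist, shift the smallest
            # uncovered value through the matrix (Step 4) until n
            # assignments (one per row and column) are found.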
            # Step 1
            for row in range(n):
                arr[row, :] -= arr[row, :].min()
            # Step 2
            for col in range(n):
                arr[:, col] -= arr[:, col].min()

            while True:
                # Step 3
                assignments = {}

                allocated_cols = set()
                allocated_rows = set()
                assigned_rows = set()
                assigned_cols = set()

                for row in range(n):
                    if (arr[row, :] == 0.0).sum() == 1:
                        col = arr[row, :].argmin()
                        if col not in allocated_cols:
                            assignments[row, col] = orig_sim[row, col]
                            allocated_cols.add(col)
                            assigned_rows.add(row)
                            assigned_cols.add(col)

                for col in range(n):
                    if (arr[:, col] == 0.0).sum() == 1:
                        row = arr[:, col].argmin()
                        if row not in allocated_rows:
                            assignments[row, col] = orig_sim[row, col]
                            allocated_rows.add(row)
                            assigned_rows.add(row)
                            assigned_cols.add(col)

                if len(assignments) == n:
                    break

                marked_rows = {_ for _ in range(n) if _ not in assigned_rows}
                marked_cols = set()
                for row in sorted(set(marked_rows)):
                    for col, mark in enumerate(arr[row, :] == 0.0):
                        if mark:
                            marked_cols.add(col)
                            for row2 in range(n):
                                if (row2, col) in assignments:
                                    marked_rows.add(row2)

                if n - len(marked_rows) + len(marked_cols) == n:
                    # We have sufficient lines
                    for col in range(n):
                        row = arr[:, col].argmin()
                        assignments[row, col] = orig_sim[row, col]
                    break

                # Step 4
                min_val = arr[tuple(marked_rows), :][
                    :, sorted(set(range(n)) - marked_cols)
                ].min()
                for row in range(n):
                    for col in range(n):
                        if row in marked_rows and col not in marked_cols:
                            arr[row, col] -= min_val
                        elif row not in marked_rows and col in marked_cols:
                            arr[row, col] += min_val

            for row, col in assignments.keys():
                sim = orig_sim[row, col]
                if sim >= self.params['threshold']:
                    intersection[src_only[col]] += (sim / 2) * (
                        self._src_tokens - self._tar_tokens
                    )[src_only[col]]
                    intersection[tar_only[row]] += (sim / 2) * (
                        self._tar_tokens - self._src_tokens
                    )[tar_only[row]]

        return intersection

    def _intersection_card(self):
        """Return the cardinality of the intersection."""
        return self.normalizer(
            sum(abs(val) for val in self._intersection().values()),
            1,
            self._population_card_value,
        )

    def _intersection(self):
        """Return the intersection.

        This function may be overridden by setting the intersection_type
        during initialization.
        """
        return self._crisp_intersection()  # pragma: no cover

    def _get_confusion_table(self):
        """Return the token counts as a ConfusionTable object."""
        return ConfusionTable(
            self._intersection_card(),
            self._total_complement_card(),
            self._src_only_card(),
            self._tar_only_card(),
        )


if __name__ == '__main__':
    import doctest

    doctest.testmod()