1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2018-2019 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
1 |
|
"""abydos.distance._token_distance. |
20
|
|
|
|
21
|
|
|
The distance._token_distance._TokenDistance module implements abstract class |
22
|
|
|
_TokenDistance. |
23
|
|
|
""" |
24
|
|
|
|
25
|
1 |
|
from __future__ import ( |
26
|
|
|
absolute_import, |
27
|
|
|
division, |
28
|
|
|
print_function, |
29
|
|
|
unicode_literals, |
30
|
|
|
) |
31
|
|
|
|
32
|
1 |
|
from collections import Counter, OrderedDict |
33
|
1 |
|
from itertools import product |
34
|
1 |
|
from math import exp, log1p |
35
|
|
|
|
36
|
1 |
|
import numpy as np |
37
|
1 |
|
from numpy import zeros as np_zeros |
38
|
|
|
|
39
|
1 |
|
from ._damerau_levenshtein import DamerauLevenshtein |
40
|
1 |
|
from ._distance import _Distance |
41
|
1 |
|
from ._lcprefix import LCPrefix |
42
|
1 |
|
from ._levenshtein import Levenshtein |
43
|
1 |
|
from ..stats import ConfusionTable |
44
|
1 |
|
from ..tokenizer import QGrams, QSkipgrams, WhitespaceTokenizer |
45
|
|
|
|
46
|
1 |
|
__all__ = ['_TokenDistance'] |
47
|
|
|
|
48
|
|
|
|
49
|
1 |
|
class _TokenDistance(_Distance): |
50
|
|
|
r"""Abstract Token Distance class. |
51
|
|
|
|
52
|
|
|
.. _confusion_table: |
53
|
|
|
|
54
|
|
|
+----------------+--------------+-----------------+-------+ |
55
|
|
|
| | |in| ``tar`` | |notin| ``tar`` | | |
56
|
|
|
+----------------+--------------+-----------------+-------+ |
57
|
|
|
| |in| ``src`` | |a| | |b| | |a+b| | |
58
|
|
|
+----------------+--------------+-----------------+-------+ |
59
|
|
|
| |notin| ``src``| |c| | |d| | |c+d| | |
60
|
|
|
+----------------+--------------+-----------------+-------+ |
61
|
|
|
| | |a+c| | |b+d| | |n| | |
62
|
|
|
+----------------+--------------+-----------------+-------+ |
63
|
|
|
|
64
|
|
|
.. |in| replace:: :math:`x \in` |
65
|
|
|
.. |notin| replace:: :math:`x \notin` |
66
|
|
|
|
67
|
|
|
.. |a| replace:: :math:`a = |X \cap Y|` |
68
|
|
|
.. |b| replace:: :math:`b = |X\setminus Y|` |
69
|
|
|
.. |c| replace:: :math:`c = |Y \setminus X|` |
70
|
|
|
.. |d| replace:: :math:`d = |(N\setminus X)\setminus Y|` |
71
|
|
|
.. |n| replace:: :math:`n = |N|` |
72
|
|
|
.. |a+b| replace:: :math:`p_1 = a+b = |X|` |
73
|
|
|
.. |a+c| replace:: :math:`p_2 = a+c = |Y|` |
74
|
|
|
.. |c+d| replace:: :math:`q_1 = c+d = |N\setminus X|` |
75
|
|
|
.. |b+d| replace:: :math:`q_2 = b+d = |N\setminus Y|` |
76
|
|
|
|
77
|
|
|
.. versionadded:: 0.3.6 |
78
|
|
|
""" |
79
|
|
|
|
80
|
1 |
|
def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs): |
81
|
|
|
r"""Initialize _TokenDistance instance. |
82
|
|
|
|
83
|
|
|
.. _intersection_type: |
84
|
|
|
|
85
|
|
|
Parameters |
86
|
|
|
---------- |
87
|
|
|
tokenizer : _Tokenizer |
88
|
|
|
A tokenizer instance from the :py:mod:`abydos.tokenizer` package |
89
|
|
|
intersection_type : str |
90
|
|
|
Specifies the intersection type, and set type as a result: |
91
|
|
|
|
92
|
|
|
- 'crisp': Ordinary intersection, wherein items are entirely |
93
|
|
|
members or non-members of the intersection. (Default) |
94
|
|
|
- ``fuzzy``: Fuzzy intersection, defined by :cite:`Wang:2014`, |
95
|
|
|
wherein items can be partially members of the intersection |
96
|
|
|
if their similarity meets or exceeds a threshold value. This |
97
|
|
|
also takes `metric` (by default :class:`Levenshtein()`) and |
98
|
|
|
`threshold` (by default 0.8) parameters. |
99
|
|
|
- ``soft``: Soft intersection, defined by :cite:`Russ:2014`, |
100
|
|
|
wherein items can be partially members of the intersection |
101
|
|
|
depending on their similarity. This also takes a `metric` |
102
|
|
|
(by default :class:`DamerauLevenshtein()`) parameter. |
103
|
|
|
- ``linkage``: Group linkage, defined by :cite:`On:2007`. Like |
104
|
|
|
the soft intersection, items can be partially members of the |
105
|
|
|
intersection, but the method of pairing similar members is |
106
|
|
|
somewhat more complex. See the cited paper for details. This |
107
|
|
|
also takes `metric` |
108
|
|
|
(by default :class:`DamerauLevenshtein()`) and `threshold` |
109
|
|
|
(by default 0.1) parameters. |
110
|
|
|
**kwargs |
111
|
|
|
Arbitrary keyword arguments |
112
|
|
|
|
113
|
|
|
|
114
|
|
|
.. _alphabet: |
115
|
|
|
|
116
|
|
|
Other Parameters |
117
|
|
|
---------------- |
118
|
|
|
qval : int |
119
|
|
|
The length of each q-gram. Using this parameter and tokenizer=None |
120
|
|
|
will cause the instance to use the QGram tokenizer with this |
121
|
|
|
q value. |
122
|
|
|
metric : _Distance |
123
|
|
|
A string distance measure class for use in the ``soft`` and |
124
|
|
|
``fuzzy`` variants. |
125
|
|
|
threshold : float |
126
|
|
|
A threshold value, similarities above which are counted as |
127
|
|
|
members of the intersection for the ``fuzzy`` variant. |
128
|
|
|
alphabet : Counter, collection, int, or None |
129
|
|
|
This represents the alphabet of possible tokens. |
130
|
|
|
|
131
|
|
|
- If a Counter is supplied, it is used directly in computing |
132
|
|
|
the complement of the tokens in both sets. |
133
|
|
|
- If a collection is supplied, it is converted to a Counter |
134
|
|
|
and used directly. In the case of a single string being |
135
|
|
|
supplied and the QGram tokenizer being used, the full |
136
|
|
|
alphabet is inferred (i.e. |
137
|
|
|
:math:`len(set(alphabet+QGrams.start\_stop))^{QGrams.qval}` |
138
|
|
|
is used as the cardinality of the full alphabet. |
139
|
|
|
- If an int is supplied, it is used as the cardinality of the |
140
|
|
|
full alphabet. |
141
|
|
|
- If None is supplied, the cardinality of the full alphabet |
142
|
|
|
is inferred if QGram of QSkipgrams tokenization is used (i.e. |
143
|
|
|
:math:`28^{QGrams.qval}` is used as the cardinality of the |
144
|
|
|
full alphabet or :math:`26` if QGrams.qval is 1, which |
145
|
|
|
assumes the strings are English language strings and only |
146
|
|
|
contain letters of a single case). Otherwise, the cardinality |
147
|
|
|
of the complement of the total will be 0. |
148
|
|
|
normalizer : str |
149
|
|
|
This represents the normalization applied to the values in the |
150
|
|
|
2x2 contingency table prior to any of the cardinality (\*_card) |
151
|
|
|
methods returning a value. By default, no normalization is applied, |
152
|
|
|
but the following values are supported: |
153
|
|
|
|
154
|
|
|
- ``proportional`` : :math:`\frac{x}{n}`, where n is the total |
155
|
|
|
population |
156
|
|
|
- ``log`` : :math:`log(1+x)` |
157
|
|
|
- ``exp`` : :math:`e^x` |
158
|
|
|
- ``laplace`` : :math:`x+1` |
159
|
|
|
- ``inverse`` : :math:`\frac{1}{x}` |
160
|
|
|
- ``complement`` : :math:`n-x`, where n is the total population |
161
|
|
|
|
162
|
|
|
.. versionadded:: 0.4.0 |
163
|
|
|
|
164
|
|
|
""" |
165
|
1 |
|
super(_TokenDistance, self).__init__( |
166
|
|
|
intersection_type=intersection_type, **kwargs |
167
|
|
|
) |
168
|
|
|
|
169
|
1 |
|
qval = 2 if 'qval' not in self.params else self.params['qval'] |
170
|
1 |
|
self.params['tokenizer'] = ( |
171
|
|
|
tokenizer |
172
|
|
|
if tokenizer is not None |
173
|
|
|
else WhitespaceTokenizer() |
174
|
|
|
if qval == 0 |
175
|
|
|
else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None) |
176
|
|
|
) |
177
|
|
|
|
178
|
1 |
|
if hasattr(self.params['tokenizer'], 'qval'): |
179
|
1 |
|
if isinstance(self.params['tokenizer'].qval, int): |
180
|
1 |
|
qvals = [self.params['tokenizer'].qval] |
181
|
|
|
else: |
182
|
1 |
|
qvals = list(self.params['tokenizer'].qval) |
183
|
|
|
else: |
184
|
1 |
|
qvals = [] |
185
|
|
|
|
186
|
1 |
|
if 'alphabet' in self.params: |
187
|
1 |
|
if isinstance(self.params['alphabet'], str): |
188
|
1 |
|
self.params['alphabet'] = set(self.params['alphabet']) |
189
|
1 |
|
if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)): |
190
|
1 |
|
self.params['alphabet'] |= set( |
191
|
|
|
self.params['tokenizer'].start_stop |
192
|
|
|
) |
193
|
1 |
|
self.params['alphabet'] = sum( |
194
|
|
|
len(self.params['alphabet']) ** qval for qval in qvals |
195
|
|
|
) |
196
|
1 |
|
if hasattr(self.params['alphabet'], '__len__') and not isinstance( |
197
|
|
|
self.params['alphabet'], Counter |
198
|
|
|
): |
199
|
1 |
|
self.params['alphabet'] = len(self.params['alphabet']) |
200
|
1 |
|
elif self.params['alphabet'] is None and isinstance( |
201
|
|
|
self.params['tokenizer'], (QGrams, QSkipgrams) |
202
|
|
|
): |
203
|
1 |
|
self.params['alphabet'] = sum( |
204
|
|
|
28 ** qval if qval > 1 else 26 for qval in qvals |
205
|
|
|
) |
206
|
|
|
else: |
207
|
1 |
|
if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)): |
208
|
1 |
|
self.params['alphabet'] = sum( |
209
|
|
|
28 ** qval if qval > 1 else 26 for qval in qvals |
210
|
|
|
) |
211
|
|
|
else: |
212
|
1 |
|
self.params['alphabet'] = None |
213
|
|
|
|
214
|
1 |
|
if intersection_type == 'soft': |
215
|
1 |
|
if 'metric' not in self.params or self.params['metric'] is None: |
216
|
1 |
|
self.params['metric'] = Levenshtein() |
217
|
1 |
|
self._lcprefix = LCPrefix() |
218
|
1 |
|
self._intersection = self._soft_intersection |
219
|
1 |
|
elif intersection_type == 'fuzzy': |
220
|
1 |
|
if 'metric' not in self.params or self.params['metric'] is None: |
221
|
1 |
|
self.params['metric'] = Levenshtein() |
222
|
1 |
|
if 'threshold' not in self.params: |
223
|
1 |
|
self.params['threshold'] = 0.8 |
224
|
1 |
|
self._intersection = self._fuzzy_intersection |
225
|
1 |
|
elif intersection_type == 'linkage': |
226
|
1 |
|
if 'metric' not in self.params or self.params['metric'] is None: |
227
|
1 |
|
self.params['metric'] = DamerauLevenshtein() |
228
|
1 |
|
if 'threshold' not in self.params: |
229
|
1 |
|
self.params['threshold'] = 0.1 |
230
|
1 |
|
self._intersection = self._group_linkage_intersection |
231
|
|
|
else: |
232
|
1 |
|
self._intersection = self._crisp_intersection |
233
|
|
|
|
234
|
1 |
|
self._src_tokens = Counter() |
235
|
1 |
|
self._tar_tokens = Counter() |
236
|
1 |
|
self._population_card_value = 0 |
237
|
|
|
|
238
|
|
|
# initialize normalizer |
239
|
1 |
|
self.normalizer = self._norm_none |
240
|
|
|
|
241
|
1 |
|
self._norm_dict = { |
242
|
|
|
'proportional': self._norm_proportional, |
243
|
|
|
'log': self._norm_log, |
244
|
|
|
'exp': self._norm_exp, |
245
|
|
|
'laplace': self._norm_laplace, |
246
|
|
|
'inverse': self._norm_inverse, |
247
|
|
|
'complement': self._norm_complement, |
248
|
|
|
} |
249
|
|
|
|
250
|
|
|
# initialize values for soft intersection |
251
|
1 |
|
self._soft_intersection_precalc = None |
252
|
1 |
|
self._soft_src_only = None |
253
|
1 |
|
self._soft_tar_only = None |
254
|
|
|
|
255
|
1 |
|
def _norm_none(self, x, _squares, _pop): |
256
|
1 |
|
return x |
257
|
|
|
|
258
|
1 |
|
def _norm_proportional(self, x, _squares, pop): |
259
|
1 |
|
return x / max(1, pop) |
260
|
|
|
|
261
|
1 |
|
def _norm_log(self, x, _squares, _pop): |
262
|
1 |
|
return log1p(x) |
263
|
|
|
|
264
|
1 |
|
def _norm_exp(self, x, _squares, _pop): |
265
|
1 |
|
return exp(x) |
266
|
|
|
|
267
|
1 |
|
def _norm_laplace(self, x, squares, _pop): |
268
|
1 |
|
return x + squares |
269
|
|
|
|
270
|
1 |
|
def _norm_inverse(self, x, _squares, pop): |
271
|
1 |
|
return 1 / x if x else pop |
272
|
|
|
|
273
|
1 |
|
def _norm_complement(self, x, _squares, pop): |
274
|
1 |
|
return pop - x |
275
|
|
|
|
276
|
1 |
|
def _tokenize(self, src, tar): |
277
|
|
|
"""Return the Q-Grams in src & tar. |
278
|
|
|
|
279
|
|
|
Parameters |
280
|
|
|
---------- |
281
|
|
|
src : str |
282
|
|
|
Source string (or QGrams/Counter objects) for comparison |
283
|
|
|
tar : str |
284
|
|
|
Target string (or QGrams/Counter objects) for comparison |
285
|
|
|
|
286
|
|
|
Returns |
287
|
|
|
------- |
288
|
|
|
tuple of Counters |
289
|
|
|
Q-Grams |
290
|
|
|
|
291
|
|
|
Examples |
292
|
|
|
-------- |
293
|
|
|
>>> pe = _TokenDistance() |
294
|
|
|
>>> pe._tokenize('AT', 'TT')._get_tokens() |
295
|
|
|
(Counter({'$A': 1, 'AT': 1, 'T#': 1}), |
296
|
|
|
Counter({'$T': 1, 'TT': 1, 'T#': 1})) |
297
|
|
|
|
298
|
|
|
|
299
|
|
|
.. versionadded:: 0.1.0 |
300
|
|
|
.. versionchanged:: 0.3.6 |
301
|
|
|
Encapsulated in class |
302
|
|
|
|
303
|
|
|
""" |
304
|
1 |
|
self._src_orig = src |
305
|
1 |
|
self._tar_orig = tar |
306
|
|
|
|
307
|
1 |
|
if isinstance(src, Counter): |
308
|
1 |
|
self._src_tokens = src |
309
|
|
|
else: |
310
|
1 |
|
self._src_tokens = ( |
311
|
|
|
self.params['tokenizer'].tokenize(src).get_counter() |
312
|
|
|
) |
313
|
1 |
|
if isinstance(tar, Counter): |
314
|
1 |
|
self._tar_tokens = tar |
315
|
|
|
else: |
316
|
1 |
|
self._tar_tokens = ( |
317
|
|
|
self.params['tokenizer'].tokenize(tar).get_counter() |
318
|
|
|
) |
319
|
|
|
|
320
|
1 |
|
self._population_card_value = self._calc_population_card() |
321
|
|
|
|
322
|
|
|
# Set up the normalizer, a function of two variables: |
323
|
|
|
# x is the value in the contingency table square(s) |
324
|
|
|
# n is the number of squares that x represents |
325
|
1 |
|
if ( |
326
|
|
|
'normalizer' in self.params |
327
|
|
|
and self.params['normalizer'] in self._norm_dict |
328
|
|
|
): |
329
|
1 |
|
self.normalizer = self._norm_dict[self.params['normalizer']] |
330
|
|
|
|
331
|
|
|
# clear values for soft intersection |
332
|
1 |
|
self._soft_intersection_precalc = None |
333
|
1 |
|
self._soft_src_only = None |
334
|
1 |
|
self._soft_tar_only = None |
335
|
|
|
|
336
|
1 |
|
return self |
337
|
|
|
|
338
|
1 |
|
def _get_tokens(self): |
339
|
|
|
"""Return the src and tar tokens as a tuple.""" |
340
|
1 |
|
return self._src_tokens, self._tar_tokens |
341
|
|
|
|
342
|
1 |
|
def _src_card(self): |
343
|
|
|
r"""Return the cardinality of the tokens in the source set.""" |
344
|
1 |
|
if self.params['intersection_type'] == 'soft': |
345
|
1 |
|
if not self._soft_intersection_precalc: |
346
|
1 |
|
self._intersection() |
347
|
1 |
|
return self.normalizer( |
348
|
|
|
sum( |
349
|
|
|
abs(val) |
350
|
|
|
for val in ( |
351
|
|
|
self._soft_intersection_precalc + self._soft_src_only |
352
|
|
|
).values() |
353
|
|
|
), |
354
|
|
|
2, |
355
|
|
|
self._population_card_value, |
356
|
|
|
) |
357
|
1 |
|
return self.normalizer( |
358
|
|
|
sum(abs(val) for val in self._src_tokens.values()), |
359
|
|
|
2, |
360
|
|
|
self._population_card_value, |
361
|
|
|
) |
362
|
|
|
|
363
|
1 |
|
def _src_only(self): |
364
|
|
|
r"""Return the src tokens minus the tar tokens. |
365
|
|
|
|
366
|
|
|
For (multi-)sets S and T, this is :math:`S \setminus T`. |
367
|
|
|
""" |
368
|
1 |
|
if self.params['intersection_type'] == 'soft': |
369
|
1 |
|
if not self._soft_intersection_precalc: |
370
|
1 |
|
self._intersection() |
371
|
1 |
|
return self._soft_src_only |
372
|
1 |
|
src_only = self._src_tokens - self._intersection() |
373
|
1 |
|
if self.params['intersection_type'] != 'crisp': |
374
|
1 |
|
src_only -= self._intersection() - self._crisp_intersection() |
375
|
1 |
|
return src_only |
376
|
|
|
|
377
|
1 |
|
def _src_only_card(self): |
378
|
|
|
"""Return the cardinality of the tokens only in the source set.""" |
379
|
1 |
|
return self.normalizer( |
380
|
|
|
sum(abs(val) for val in self._src_only().values()), |
381
|
|
|
1, |
382
|
|
|
self._population_card_value, |
383
|
|
|
) |
384
|
|
|
|
385
|
1 |
|
def _tar_card(self): |
386
|
|
|
r"""Return the cardinality of the tokens in the target set.""" |
387
|
1 |
|
if self.params['intersection_type'] == 'soft': |
388
|
1 |
|
if not self._soft_intersection_precalc: |
389
|
1 |
|
self._intersection() |
390
|
1 |
|
return self.normalizer( |
391
|
|
|
sum( |
392
|
|
|
abs(val) |
393
|
|
|
for val in ( |
394
|
|
|
self._soft_intersection_precalc + self._soft_tar_only |
395
|
|
|
).values() |
396
|
|
|
), |
397
|
|
|
2, |
398
|
|
|
self._population_card_value, |
399
|
|
|
) |
400
|
1 |
|
return self.normalizer( |
401
|
|
|
sum(abs(val) for val in self._tar_tokens.values()), |
402
|
|
|
2, |
403
|
|
|
self._population_card_value, |
404
|
|
|
) |
405
|
|
|
|
406
|
1 |
|
def _tar_only(self): |
407
|
|
|
r"""Return the tar tokens minus the src tokens. |
408
|
|
|
|
409
|
|
|
For (multi-)sets S and T, this is :math:`T \setminus S`. |
410
|
|
|
""" |
411
|
1 |
|
if self.params['intersection_type'] == 'soft': |
412
|
1 |
|
if not self._soft_intersection_precalc: |
413
|
1 |
|
self._intersection() |
414
|
1 |
|
return self._soft_tar_only |
415
|
1 |
|
tar_only = self._tar_tokens - self._intersection() |
416
|
1 |
|
if self.params['intersection_type'] != 'crisp': |
417
|
1 |
|
tar_only -= self._intersection() - self._crisp_intersection() |
418
|
1 |
|
return tar_only |
419
|
|
|
|
420
|
1 |
|
def _tar_only_card(self): |
421
|
|
|
"""Return the cardinality of the tokens only in the target set.""" |
422
|
1 |
|
return self.normalizer( |
423
|
|
|
sum(abs(val) for val in self._tar_only().values()), |
424
|
|
|
1, |
425
|
|
|
self._population_card_value, |
426
|
|
|
) |
427
|
|
|
|
428
|
1 |
|
def _symmetric_difference(self): |
429
|
|
|
r"""Return the symmetric difference of tokens from src and tar. |
430
|
|
|
|
431
|
|
|
For (multi-)sets S and T, this is :math:`S \triangle T`. |
432
|
|
|
""" |
433
|
1 |
|
return self._src_only() + self._tar_only() |
434
|
|
|
|
435
|
1 |
|
def _symmetric_difference_card(self): |
436
|
|
|
"""Return the cardinality of the symmetric difference.""" |
437
|
1 |
|
return self.normalizer( |
438
|
|
|
sum(abs(val) for val in self._symmetric_difference().values()), |
439
|
|
|
2, |
440
|
|
|
self._population_card_value, |
441
|
|
|
) |
442
|
|
|
|
443
|
1 |
|
def _total(self): |
444
|
|
|
"""Return the sum of the sets. |
445
|
|
|
|
446
|
|
|
For (multi-)sets S and T, this is :math:`S + T`. |
447
|
|
|
|
448
|
|
|
In the case of multisets, this counts values in the interesection |
449
|
|
|
twice. In the case of sets, this is identical to the union. |
450
|
|
|
""" |
451
|
1 |
|
if self.params['intersection_type'] == 'soft': |
452
|
1 |
|
if not self._soft_intersection_precalc: |
453
|
1 |
|
self._intersection() |
454
|
1 |
|
return ( |
455
|
|
|
self._soft_tar_only |
456
|
|
|
+ self._soft_src_only |
457
|
|
|
+ self._soft_intersection_precalc |
458
|
|
|
+ self._soft_intersection_precalc |
459
|
|
|
) |
460
|
1 |
|
return self._src_tokens + self._tar_tokens |
461
|
|
|
|
462
|
1 |
|
def _total_card(self): |
463
|
|
|
"""Return the cardinality of the complement of the total.""" |
464
|
1 |
|
return self.normalizer( |
465
|
|
|
sum(abs(val) for val in self._total().values()), |
466
|
|
|
3, |
467
|
|
|
self._population_card_value, |
468
|
|
|
) |
469
|
|
|
|
470
|
1 |
|
def _total_complement_card(self): |
471
|
|
|
"""Return the cardinality of the complement of the total.""" |
472
|
1 |
|
if self.params['alphabet'] is None: |
473
|
1 |
|
return self.normalizer(0, 1, self._population_card_value) |
474
|
1 |
|
elif isinstance(self.params['alphabet'], Counter): |
475
|
1 |
|
return self.normalizer( |
476
|
|
|
max( |
477
|
|
|
0, |
478
|
|
|
sum( |
479
|
|
|
abs(val) |
480
|
|
|
for val in ( |
481
|
|
|
self.params['alphabet'] - self._total() |
482
|
|
|
).values() |
483
|
|
|
), |
484
|
|
|
), |
485
|
|
|
1, |
486
|
|
|
self._population_card_value, |
487
|
|
|
) |
488
|
1 |
|
return self.normalizer( |
489
|
|
|
max(0, self.params['alphabet'] - len(self._total().values())), |
490
|
|
|
1, |
491
|
|
|
self._population_card_value, |
492
|
|
|
) |
493
|
|
|
|
494
|
1 |
|
def _calc_population_card(self): |
495
|
|
|
"""Return the cardinality of the population.""" |
496
|
1 |
|
save_normalizer = self.normalizer |
497
|
1 |
|
self.normalizer = self._norm_none |
498
|
1 |
|
save_intersection = self.params['intersection_type'] |
499
|
1 |
|
self.params['intersection_type'] = 'crisp' |
500
|
1 |
|
pop = self._total_card() + self._total_complement_card() |
501
|
1 |
|
self.normalizer = save_normalizer |
502
|
1 |
|
self.params['intersection_type'] = save_intersection |
503
|
1 |
|
return pop |
504
|
|
|
|
505
|
1 |
|
def _population_card(self): |
506
|
|
|
"""Return the cardinality of the population.""" |
507
|
1 |
|
return self.normalizer( |
508
|
|
|
self._population_card_value, 4, self._population_card_value |
509
|
|
|
) |
510
|
|
|
|
511
|
1 |
|
def _population_unique_card(self): |
512
|
|
|
"""Return the cardinality of the population minus the intersection.""" |
513
|
1 |
|
return self.normalizer( |
514
|
|
|
self._population_card_value - self._intersection_card(), |
515
|
|
|
4, |
516
|
|
|
self._population_card_value, |
517
|
|
|
) |
518
|
|
|
|
519
|
1 |
|
def _union(self): |
520
|
|
|
r"""Return the union of tokens from src and tar. |
521
|
|
|
|
522
|
|
|
For (multi-)sets S and T, this is :math:`S \cup T`. |
523
|
|
|
""" |
524
|
1 |
|
if self.params['intersection_type'] == 'soft': |
525
|
1 |
|
if not self._soft_intersection_precalc: |
526
|
1 |
|
self._intersection() |
527
|
1 |
|
return ( |
528
|
|
|
self._soft_tar_only |
529
|
|
|
+ self._soft_src_only |
530
|
|
|
+ self._soft_intersection_precalc |
531
|
|
|
) |
532
|
1 |
|
union = self._total() - self._intersection() |
533
|
1 |
|
if self.params['intersection_type'] != 'crisp': |
534
|
1 |
|
union -= self._intersection() - self._crisp_intersection() |
535
|
1 |
|
return union |
536
|
|
|
|
537
|
1 |
|
def _union_card(self): |
538
|
|
|
"""Return the cardinality of the union.""" |
539
|
1 |
|
return self.normalizer( |
540
|
|
|
sum(abs(val) for val in self._union().values()), |
541
|
|
|
3, |
542
|
|
|
self._population_card_value, |
543
|
|
|
) |
544
|
|
|
|
545
|
1 |
|
def _difference(self): |
546
|
|
|
"""Return the difference of the tokens, supporting negative values.""" |
547
|
1 |
|
if self.params['intersection_type'] == 'soft': |
548
|
1 |
|
if not self._soft_intersection_precalc: |
549
|
1 |
|
self._intersection() |
550
|
1 |
|
_src_copy = Counter(self._soft_src_only) |
551
|
1 |
|
_src_copy.subtract(self._soft_tar_only) |
552
|
1 |
|
return _src_copy |
553
|
1 |
|
_src_copy = Counter(self._src_tokens) |
554
|
1 |
|
_src_copy.subtract(self._tar_tokens) |
555
|
1 |
|
return _src_copy |
556
|
|
|
|
557
|
1 |
|
def _crisp_intersection(self): |
558
|
|
|
r"""Return the intersection of tokens from src and tar. |
559
|
|
|
|
560
|
|
|
For (multi-)sets S and T, this is :math:`S \cap T`. |
561
|
|
|
""" |
562
|
1 |
|
return self._src_tokens & self._tar_tokens |
563
|
|
|
|
564
|
1 |
|
def _soft_intersection(self): |
565
|
|
|
"""Return the soft source, target, & intersection tokens & weights. |
566
|
|
|
|
567
|
|
|
This implements the soft intersection defined by :cite:`Russ:2014` in |
568
|
|
|
a way that can reproduce the results in the paper. |
569
|
|
|
""" |
570
|
1 |
|
if not hasattr(self.params['metric'], 'alignment'): |
571
|
1 |
|
raise TypeError( |
572
|
|
|
"Soft similarity requires a 'metric' with an alignment \ |
573
|
|
|
member function, such as Levenshtein." |
574
|
|
|
) |
575
|
|
|
|
576
|
1 |
|
intersection = self._crisp_intersection() |
577
|
1 |
|
src_only = self._src_tokens - self._tar_tokens |
578
|
1 |
|
tar_only = self._tar_tokens - self._src_tokens |
579
|
|
|
|
580
|
1 |
|
src_new = Counter() |
581
|
1 |
|
tar_new = Counter() |
582
|
|
|
|
583
|
1 |
|
def _membership(src, tar): |
584
|
1 |
|
greater_length = max(len(src), len(tar)) |
585
|
1 |
|
return ( |
586
|
|
|
max( |
587
|
|
|
greater_length - self.params['metric'].dist_abs(src, tar), |
588
|
|
|
self._lcprefix.dist_abs(src, tar), |
589
|
|
|
) |
590
|
|
|
/ greater_length |
591
|
|
|
) |
592
|
|
|
|
593
|
1 |
|
def _token_src_tar_int(src, tar): |
594
|
1 |
|
src_tok = [] |
595
|
1 |
|
tar_tok = [] |
596
|
1 |
|
int_tok = [] |
597
|
|
|
|
598
|
1 |
|
src_val = 0 |
599
|
1 |
|
tar_val = 0 |
600
|
1 |
|
int_val = 0 |
601
|
|
|
|
602
|
1 |
|
_cost, _src, _tar = self.params['metric'].alignment(src, tar) |
603
|
|
|
|
604
|
1 |
|
for i in range(len(_src)): |
605
|
1 |
|
if _src[i] == _tar[i]: |
606
|
1 |
|
src_tok.append('-') |
607
|
1 |
|
tar_tok.append('-') |
608
|
1 |
|
int_tok.append(_src[i]) |
609
|
1 |
|
int_val += 1 |
610
|
|
|
else: |
611
|
1 |
|
src_tok.append(_src[i]) |
612
|
1 |
|
if _src[i] != '-': |
613
|
1 |
|
src_val += 1 |
614
|
1 |
|
tar_tok.append(_tar[i]) |
615
|
1 |
|
if _tar[i] != '-': |
616
|
1 |
|
tar_val += 1 |
617
|
1 |
|
int_tok.append('-') |
618
|
|
|
|
619
|
1 |
|
src_val /= len(_src) |
620
|
1 |
|
tar_val /= len(_src) |
621
|
1 |
|
int_val /= len(_src) |
622
|
|
|
|
623
|
1 |
|
src_tok = ''.join(src_tok).strip('-') |
624
|
1 |
|
tar_tok = ''.join(tar_tok).strip('-') |
625
|
1 |
|
int_tok = ''.join(int_tok).strip('-') |
626
|
|
|
|
627
|
1 |
|
return src_tok, src_val, tar_tok, tar_val, int_tok, int_val |
628
|
|
|
|
629
|
|
|
# Dictionary ordering is important for reproducibility, so insertion |
630
|
|
|
# order needs to be controlled and retained. |
631
|
1 |
|
memberships = OrderedDict( |
632
|
|
|
((src, tar), _membership(src, tar)) |
633
|
|
|
for src, tar in sorted(product(src_only, tar_only)) |
634
|
|
|
) |
635
|
|
|
|
636
|
1 |
|
while memberships: |
637
|
1 |
|
src_tok, tar_tok = max(memberships, key=memberships.get) |
638
|
1 |
|
if memberships[src_tok, tar_tok] > 0.0: |
639
|
1 |
|
pairings = min(src_only[src_tok], tar_only[tar_tok]) |
640
|
1 |
|
if pairings: |
641
|
1 |
|
( |
642
|
|
|
src_ntok, |
643
|
|
|
src_val, |
644
|
|
|
tar_ntok, |
645
|
|
|
tar_val, |
646
|
|
|
int_ntok, |
647
|
|
|
int_val, |
648
|
|
|
) = _token_src_tar_int(src_tok, tar_tok) |
649
|
|
|
|
650
|
1 |
|
src_new[src_ntok] += src_val * pairings |
651
|
1 |
|
tar_new[tar_ntok] += tar_val * pairings |
652
|
1 |
|
intersection[int_ntok] += int_val * pairings |
653
|
|
|
|
654
|
|
|
# Remove pairings from src_only/tar_only |
655
|
1 |
|
src_only[src_tok] -= pairings |
656
|
1 |
|
tar_only[tar_tok] -= pairings |
657
|
|
|
|
658
|
1 |
|
del memberships[src_tok, tar_tok] |
659
|
|
|
|
660
|
|
|
# Add src_new/tar_new back into src_only/tar_only |
661
|
1 |
|
src_only += src_new |
662
|
1 |
|
tar_only += tar_new |
663
|
|
|
|
664
|
|
|
# Save src_only/tar_only to the instance for retrieval later. |
665
|
1 |
|
self._soft_src_only = src_only |
666
|
1 |
|
self._soft_tar_only = tar_only |
667
|
1 |
|
self._soft_intersection_precalc = intersection |
668
|
|
|
|
669
|
1 |
|
return intersection |
670
|
|
|
|
671
|
1 |
|
def _fuzzy_intersection(self): |
672
|
|
|
r"""Return the fuzzy intersection of the tokens in src and tar. |
673
|
|
|
|
674
|
|
|
This implements the fuzzy intersection defined by :cite:`Wang:2014`. |
675
|
|
|
|
676
|
|
|
For two sets X and Y, the intersection :cite:`Wang:2014` is the sum of |
677
|
|
|
similarities of all tokens in the two sets that are greater than or |
678
|
|
|
equal to some threshold value (:math:`\delta`). |
679
|
|
|
|
680
|
|
|
The lower bound of on this intersection and the value when |
681
|
|
|
:math:`\delta = 1.0`, is the crisp intersection. Tokens shorter than |
682
|
|
|
:math:`\frac{\delta}{1-\delta}`, 4 in the case of the default threshold |
683
|
|
|
:math:`\delta = 0.8`, must match exactly to be included in the |
684
|
|
|
intersection. |
685
|
|
|
|
686
|
|
|
|
687
|
|
|
.. versionadded:: 0.4.0 |
688
|
|
|
|
689
|
|
|
""" |
690
|
1 |
|
intersection = self._crisp_intersection() |
691
|
1 |
|
src_only = self._src_tokens - self._tar_tokens |
692
|
1 |
|
tar_only = self._tar_tokens - self._src_tokens |
693
|
|
|
|
694
|
1 |
|
pair = {} |
695
|
1 |
|
for src_tok in sorted(src_only): |
696
|
1 |
|
for tar_tok in sorted(tar_only): |
697
|
1 |
|
sim = self.params['metric'].sim(src_tok, tar_tok) |
698
|
1 |
|
if sim >= self.params['threshold']: |
699
|
1 |
|
pair[(src_tok, tar_tok)] = sim |
700
|
|
|
|
701
|
1 |
|
ordered_keys = [(pair[_], _[0], _[1]) for _ in pair] |
702
|
1 |
|
ordered_keys.sort(key=lambda x: x[2]) |
703
|
1 |
|
ordered_keys.sort(key=lambda x: x[1]) |
704
|
1 |
|
ordered_keys.sort(key=lambda x: x[0], reverse=True) |
705
|
|
|
|
706
|
1 |
|
for _, src_tok, tar_tok in ordered_keys: |
707
|
1 |
|
pairings = min(src_only[src_tok], tar_only[tar_tok]) |
708
|
1 |
|
if pairings: |
709
|
1 |
|
sim = pair[(src_tok, tar_tok)] |
710
|
|
|
|
711
|
1 |
|
intersection[src_tok] += sim / 2 * pairings |
712
|
1 |
|
intersection[tar_tok] += sim / 2 * pairings |
713
|
|
|
|
714
|
1 |
|
src_only[src_tok] -= pairings |
715
|
1 |
|
tar_only[tar_tok] -= pairings |
716
|
|
|
|
717
|
|
|
""" |
718
|
|
|
# Here is a slightly different optimization method, which is even |
719
|
|
|
# greedier than the above. |
720
|
|
|
# ordered by sim*pairings rather than just sim |
721
|
|
|
|
722
|
|
|
pair = {} |
723
|
|
|
for src_tok in sorted(src_only): |
724
|
|
|
for tar_tok in sorted(tar_only): |
725
|
|
|
sim = self.params['metric'].sim(src_tok, tar_tok) |
726
|
|
|
if sim >= self.params['threshold']: |
727
|
|
|
pairings = min(src_only[src_tok], tar_only[tar_tok]) |
728
|
|
|
pair[(src_tok, tar_tok)] = sim*pairings |
729
|
|
|
|
730
|
|
|
for src_tok, tar_tok in sorted(pair, key=pair.get, reverse=True): |
731
|
|
|
pairings = min(src_only[src_tok], tar_only[tar_tok]) |
732
|
|
|
if pairings: |
733
|
|
|
sim = pair[(src_tok, tar_tok)] |
734
|
|
|
|
735
|
|
|
intersection[src_tok] += sim / 2 |
736
|
|
|
intersection[tar_tok] += sim / 2 |
737
|
|
|
|
738
|
|
|
src_only[src_tok] -= pairings |
739
|
|
|
tar_only[tar_tok] -= pairings |
740
|
|
|
""" |
741
|
|
|
|
742
|
1 |
|
return intersection |
743
|
|
|
|
744
|
1 |
|
def _group_linkage_intersection(self): |
745
|
|
|
r"""Return the group linkage intersection of the tokens in src and tar. |
746
|
|
|
|
747
|
|
|
This is based on group linkage, as defined by :cite:`On:2007`. |
748
|
|
|
|
749
|
|
|
Most of this method is concerned with solving the assignment problem, |
750
|
|
|
in order to find the weight of the maximum weight bipartite matching. |
751
|
|
|
The Hungarian algorithm of Munkres :cite:`Munkres:1957`, implemented |
752
|
|
|
below in Python & Numpy is used to solve the assignment problem since |
753
|
|
|
it is roughly twice as fast as SciPy's implementation. |
754
|
|
|
|
755
|
|
|
.. versionadded:: 0.4.0 |
756
|
|
|
.. versionchanged:: 0.4.1 |
757
|
|
|
Corrected the Hungarian algorithm & optimized it so that SciPy's |
758
|
|
|
version is no longer needed. |
759
|
|
|
|
760
|
|
|
""" |
761
|
1 |
|
intersection = self._crisp_intersection() |
762
|
1 |
|
src_only_tok = sorted(self._src_tokens - self._tar_tokens) |
763
|
1 |
|
tar_only_tok = sorted(self._tar_tokens - self._src_tokens) |
764
|
1 |
|
src_only = self._src_tokens - self._tar_tokens |
765
|
1 |
|
tar_only = self._tar_tokens - self._src_tokens |
766
|
|
|
|
767
|
|
|
# Quoted text below is from Munkres (1957), cited above. |
768
|
|
|
|
769
|
|
|
# Pre-preliminaries: create square the matrix of scores |
770
|
1 |
|
n = max(len(src_only_tok), len(tar_only_tok)) |
771
|
1 |
|
arr = np_zeros((n, n), dtype=float) |
772
|
|
|
|
773
|
1 |
|
for col in range(len(src_only_tok)): |
774
|
1 |
|
for row in range(len(tar_only_tok)): |
775
|
1 |
|
arr[row, col] = self.params['metric'].dist( |
776
|
|
|
src_only_tok[col], tar_only_tok[row] |
777
|
|
|
) |
778
|
|
|
|
779
|
1 |
|
src_only_tok += [''] * (n - len(src_only_tok)) |
780
|
1 |
|
tar_only_tok += [''] * (n - len(tar_only_tok)) |
781
|
|
|
|
782
|
1 |
|
starred = np.zeros((n, n), dtype=np.bool) |
783
|
1 |
|
primed = np.zeros((n, n), dtype=np.bool) |
784
|
1 |
|
row_covered = np.zeros(n, dtype=np.bool) |
785
|
1 |
|
col_covered = np.zeros(n, dtype=np.bool) |
786
|
|
|
|
787
|
1 |
|
orig_sim = 1 - np.copy(arr) |
788
|
|
|
# Preliminaries: |
789
|
|
|
# P: "No lines are covered; no zeros are starred or primed." |
790
|
|
|
# P: "Consider a row of matrix A; subtract from each element in |
791
|
|
|
# this row the smallest element of this row. Do the same for each |
792
|
|
|
# row of A." |
793
|
1 |
|
arr -= arr.min(axis=1, keepdims=True) |
794
|
|
|
# P: "Then consider each column of the resulting matrix and |
795
|
|
|
# subtract from each column its smallest entry." |
796
|
1 |
|
arr -= arr.min(axis=0, keepdims=True) |
797
|
|
|
|
798
|
|
|
# P: "Consider a zero Z of the matrix. If there is no starred zero |
799
|
|
|
# in its row and none in its column, star Z. Repeat, considering |
800
|
|
|
# each zero in the matrix in turn. Then cover every column |
801
|
|
|
# containing a starred zero. |
802
|
1 |
|
for col in range(n): |
803
|
1 |
|
for row in range(n): |
804
|
1 |
|
if arr[row, col] == 0: |
805
|
1 |
|
if ( |
806
|
|
|
np.count_nonzero(starred[row, :]) == 0 |
807
|
|
|
and np.count_nonzero(starred[:, col]) == 0 |
808
|
|
|
): |
809
|
1 |
|
starred[row, col] = True |
810
|
1 |
|
col_covered[col] = True |
811
|
|
|
|
812
|
1 |
|
step = 1 |
813
|
|
|
# This is the simple case where independent assignments are obvious |
814
|
|
|
# and found without the rest of the algorithm. |
815
|
1 |
|
if np.count_nonzero(col_covered) == n: |
816
|
1 |
|
step = 4 |
817
|
|
|
|
818
|
1 |
|
while step < 4: |
819
|
1 |
|
if step == 1: |
820
|
|
|
# Step 1: |
821
|
|
|
# 1: "Choose a non-covered zero and prime it. Consider the |
822
|
|
|
# row containing it. If there is no starred zero in this |
823
|
|
|
# row, go at once to Step 2. If there is a starred zero Z |
824
|
|
|
# in this row, cover this row and uncover the column of Z." |
825
|
|
|
# 1: Repeat until all zeros are covered. Go to Step 3." |
826
|
1 |
|
zeros = tuple(zip(*((arr == 0).nonzero()))) |
827
|
1 |
|
while step == 1: |
828
|
1 |
|
for row, col in zeros: |
829
|
1 |
|
if not (col_covered[col] | row_covered[row]): |
830
|
1 |
|
primed[row, col] = True |
831
|
1 |
|
z_cols = (starred[row, :]).nonzero()[0] |
832
|
1 |
|
if not z_cols.size: |
833
|
1 |
|
step = 2 |
834
|
1 |
|
break |
835
|
|
|
else: |
836
|
1 |
|
row_covered[row] = True |
837
|
1 |
|
col_covered[z_cols[0]] = False |
838
|
|
|
|
839
|
1 |
|
if step != 1: |
840
|
1 |
|
break |
841
|
|
|
|
842
|
1 |
|
for row, col in zeros: |
843
|
1 |
|
if not (col_covered[col] | row_covered[row]): |
844
|
1 |
|
break |
845
|
|
|
else: |
846
|
1 |
|
step = 3 |
847
|
|
|
|
848
|
1 |
|
if step == 2: |
849
|
|
|
# Step 2: |
850
|
|
|
# 2: "There is a sequence of alternating starred and primed |
851
|
|
|
# zeros, constructed as follows: Let Z_0 denote the |
852
|
|
|
# uncovered 0'. [There is only one.] Let Z_1 denote the 0* |
853
|
|
|
# in Z_0's column (if any). Let Z_2 denote the 0' in Z_1's |
854
|
|
|
# row (we must prove that it exists). Let Z_3 denote the 0* |
855
|
|
|
# in Z_2's column (if any). Similarly continue until the |
856
|
|
|
# sequence stops at a 0', Z_{2k}, which has no 0* in its |
857
|
|
|
# column." |
858
|
1 |
|
z_series = [] |
859
|
1 |
|
for row, col in zeros: # pragma: no branch |
|
|
|
|
860
|
1 |
|
if primed[row, col] and not ( |
861
|
|
|
row_covered[row] | col_covered[col] |
862
|
|
|
): |
863
|
1 |
|
z_series.append((row, col)) |
864
|
1 |
|
break |
865
|
1 |
|
col = z_series[-1][1] |
866
|
1 |
|
while True: |
867
|
1 |
|
row = tuple( |
868
|
|
|
set((arr[:, col] == 0).nonzero()[0]) |
869
|
|
|
& set((starred[:, col]).nonzero()[0]) |
870
|
|
|
) |
871
|
1 |
|
if row: |
872
|
1 |
|
row = row[0] |
873
|
1 |
|
z_series.append((row, col)) |
874
|
1 |
|
col = tuple( |
875
|
|
|
set((arr[row, :] == 0).nonzero()[0]) |
876
|
|
|
& set((primed[row, :]).nonzero()[0]) |
877
|
|
|
)[0] |
878
|
1 |
|
z_series.append((row, col)) |
879
|
|
|
else: |
880
|
1 |
|
break |
881
|
|
|
|
882
|
|
|
# 2: "Unstar each starred zero of the sequence and star |
883
|
|
|
# each primed zero of the sequence. Erase all primes, |
884
|
|
|
# uncover every row, and cover every column containing a |
885
|
|
|
# 0*." |
886
|
1 |
|
primed[:, :] = False |
887
|
1 |
|
row_covered[:] = False |
888
|
1 |
|
col_covered[:] = False |
889
|
1 |
|
for row, col in z_series: |
890
|
1 |
|
starred[row, col] = not starred[row, col] |
891
|
1 |
|
for col in range(n): |
892
|
1 |
|
if np.count_nonzero(starred[:, col]): |
893
|
1 |
|
col_covered[col] = True |
894
|
|
|
# 2: "If all columns are covered, the starred zeros form |
895
|
|
|
# the desired independent set. Otherwise, return to Step |
896
|
|
|
# 1." |
897
|
1 |
|
if np.count_nonzero(col_covered) == n: |
898
|
1 |
|
step = 4 |
899
|
|
|
else: |
900
|
1 |
|
step = 1 |
901
|
|
|
|
902
|
1 |
|
if step == 3: |
903
|
|
|
# Step 3: |
904
|
|
|
# 3: "Let h denote the smallest non-covered element of the |
905
|
|
|
# matrix; it will be positive. Add h to each covered row; |
906
|
|
|
# then subtract h from each uncovered column." |
907
|
1 |
|
h_val = float('inf') |
908
|
1 |
|
for col in range(n): |
909
|
1 |
|
if not (col_covered[col]): |
910
|
1 |
|
for row in range(n): |
911
|
1 |
|
if ( |
912
|
|
|
not (row_covered[row]) |
913
|
|
|
and arr[row, col] < h_val |
914
|
|
|
): |
915
|
1 |
|
h_val = arr[row, col] |
916
|
1 |
|
for row in range(n): |
917
|
1 |
|
if row_covered[row]: |
918
|
1 |
|
arr[row, :] += h_val |
919
|
1 |
|
for col in range(n): |
920
|
1 |
|
if not (col_covered[col]): |
921
|
1 |
|
arr[:, col] -= h_val |
922
|
|
|
|
923
|
|
|
# 3: "Return to Step 1, without altering any asterisks, |
924
|
|
|
# primes, or covered lines." |
925
|
1 |
|
step = 1 |
926
|
|
|
|
927
|
1 |
|
for row, col in tuple(zip(*(starred.nonzero()))): |
928
|
1 |
|
sim = orig_sim[row, col] |
929
|
1 |
|
if sim >= self.params['threshold']: |
930
|
1 |
|
score = float( |
931
|
|
|
(sim / 2) |
932
|
|
|
* min( |
933
|
|
|
src_only[src_only_tok[col]], |
934
|
|
|
tar_only[tar_only_tok[row]], |
935
|
|
|
) |
936
|
|
|
) |
937
|
1 |
|
intersection[src_only_tok[col]] += score |
938
|
1 |
|
intersection[tar_only_tok[row]] += score |
939
|
|
|
|
940
|
1 |
|
return intersection |
941
|
|
|
|
942
|
1 |
|
def _intersection_card(self): |
943
|
|
|
"""Return the cardinality of the intersection.""" |
944
|
1 |
|
return self.normalizer( |
945
|
|
|
sum(abs(val) for val in self._intersection().values()), |
946
|
|
|
1, |
947
|
|
|
self._population_card_value, |
948
|
|
|
) |
949
|
|
|
|
950
|
1 |
|
def _intersection(self): |
951
|
|
|
"""Return the intersection. |
952
|
|
|
|
953
|
|
|
This function may be overridden by setting the intersection_type during |
954
|
|
|
initialization. |
955
|
|
|
""" |
956
|
|
|
return self._crisp_intersection() # pragma: no cover |
957
|
|
|
|
958
|
1 |
|
def _get_confusion_table(self): |
959
|
|
|
"""Return the token counts as a ConfusionTable object.""" |
960
|
1 |
|
return ConfusionTable( |
961
|
|
|
self._intersection_card(), |
962
|
|
|
self._total_complement_card(), |
963
|
|
|
self._src_only_card(), |
964
|
|
|
self._tar_only_card(), |
965
|
|
|
) |
966
|
|
|
|
967
|
|
|
|
968
|
|
|
if __name__ == '__main__': |
969
|
|
|
import doctest |
970
|
|
|
|
971
|
|
|
doctest.testmod() |
972
|
|
|
|