# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._token_distance.

The distance._token_distance._TokenDistance module implements abstract class
_TokenDistance.
"""

from collections import Counter, OrderedDict
from itertools import product
from math import exp, log1p
from typing import (
    Any,
    Callable,
    Counter as TCounter,
    Optional,
    Tuple,
    Union,
    cast,
)

import numpy as np

from ._damerau_levenshtein import DamerauLevenshtein
from ._distance import _Distance
from ._lcprefix import LCPrefix
from ._levenshtein import Levenshtein
from ..stats import ConfusionTable
from ..tokenizer import QGrams, QSkipgrams, WhitespaceTokenizer, _Tokenizer

__all__ = ['_TokenDistance']


class _TokenDistance(_Distance):
    r"""Abstract Token Distance class.

    .. _confusion_table:

    +----------------+--------------+-----------------+-------+
    |                | |in| ``tar`` | |notin| ``tar`` |       |
    +----------------+--------------+-----------------+-------+
    | |in| ``src``   | |a|          | |b|             | |a+b| |
    +----------------+--------------+-----------------+-------+
    | |notin| ``src``| |c|          | |d|             | |c+d| |
    +----------------+--------------+-----------------+-------+
    |                | |a+c|        | |b+d|           | |n|   |
    +----------------+--------------+-----------------+-------+

    .. |in| replace:: :math:`x \in`
    .. |notin| replace:: :math:`x \notin`

    .. |a| replace:: :math:`a = |X \cap Y|`
    .. |b| replace:: :math:`b = |X\setminus Y|`
    .. |c| replace:: :math:`c = |Y \setminus X|`
    .. |d| replace:: :math:`d = |(N\setminus X)\setminus Y|`
    .. |n| replace:: :math:`n = |N|`
    .. |a+b| replace:: :math:`p_1 = a+b = |X|`
    .. |a+c| replace:: :math:`p_2 = a+c = |Y|`
    .. |c+d| replace:: :math:`q_1 = c+d = |N\setminus X|`
    .. |b+d| replace:: :math:`q_2 = b+d = |N\setminus Y|`

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        r"""Initialize _TokenDistance instance.

        .. _intersection_type:

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type and, as a result, the set type:

            - ``crisp``: Ordinary intersection, wherein items are entirely
              members or non-members of the intersection. (Default)
            - ``fuzzy``: Fuzzy intersection, defined by :cite:`Wang:2014`,
              wherein items can be partially members of the intersection
              if their similarity meets or exceeds a threshold value. This
              also takes `metric` (by default :class:`Levenshtein()`) and
              `threshold` (by default 0.8) parameters.
            - ``soft``: Soft intersection, defined by :cite:`Russ:2014`,
              wherein items can be partially members of the intersection
              depending on their similarity. This also takes a `metric`
              (by default :class:`DamerauLevenshtein()`) parameter.
            - ``linkage``: Group linkage, defined by :cite:`On:2007`. Like
              the soft intersection, items can be partially members of the
              intersection, but the method of pairing similar members is
              somewhat more complex. See the cited paper for details. This
              also takes `metric`
              (by default :class:`DamerauLevenshtein()`) and `threshold`
              (by default 0.1) parameters.
        **kwargs
            Arbitrary keyword arguments


        .. _alphabet:

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGrams
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities at or above which are counted
            as members of the intersection for the ``fuzzy`` variant.
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.

            - If a Counter is supplied, it is used directly in computing
              the complement of the tokens in both sets.
            - If a collection is supplied, it is converted to a Counter
              and used directly. In the case of a single string being
              supplied and the QGrams tokenizer being used, the full
              alphabet is inferred (i.e.
              :math:`len(set(alphabet+QGrams.start\_stop))^{QGrams.qval}`
              is used as the cardinality of the full alphabet).
            - If an int is supplied, it is used as the cardinality of the
              full alphabet.
            - If None is supplied, the cardinality of the full alphabet
              is inferred if QGrams or QSkipgrams tokenization is used
              (i.e. :math:`28^{QGrams.qval}` is used as the cardinality of
              the full alphabet, or :math:`26` if QGrams.qval is 1, which
              assumes the strings are English language strings and only
              contain letters of a single case). Otherwise, the cardinality
              of the complement of the total will be 0.
        normalizer : str
            This represents the normalization applied to the values in the
            2x2 contingency table prior to any of the cardinality (\*_card)
            methods returning a value. By default, no normalization is
            applied, but the following values are supported:

            - ``proportional`` : :math:`\frac{x}{n}`, where n is the total
              population
            - ``log`` : :math:`\log(1+x)`
            - ``exp`` : :math:`e^x`
            - ``laplace`` : :math:`x+1`
            - ``inverse`` : :math:`\frac{1}{x}`
            - ``complement`` : :math:`n-x`, where n is the total population

        .. versionadded:: 0.4.0

        """
        super(_TokenDistance, self).__init__(
            intersection_type=intersection_type, **kwargs
        )

        qval = 2 if 'qval' not in self.params else self.params['qval']
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else WhitespaceTokenizer()
            if qval == 0
            else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None)
        )

        if hasattr(self.params['tokenizer'], 'qval'):
            if isinstance(self.params['tokenizer'].qval, int):
                qvals = [self.params['tokenizer'].qval]
            else:
                qvals = list(self.params['tokenizer'].qval)
        else:
            qvals = []

        if 'alphabet' in self.params:
            if isinstance(self.params['alphabet'], str):
                alpha_set = set(self.params['alphabet'])
                if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)):
                    alpha_set |= set(self.params['tokenizer'].start_stop)
                self.params['alphabet'] = sum(
                    len(alpha_set) ** qval for qval in qvals
                )
            if hasattr(self.params['alphabet'], '__len__') and not isinstance(
                self.params['alphabet'], Counter
            ):
                self.params['alphabet'] = len(self.params['alphabet'])
            elif self.params['alphabet'] is None and isinstance(
                self.params['tokenizer'], (QGrams, QSkipgrams)
            ):
                self.params['alphabet'] = sum(
                    28 ** qval if qval > 1 else 26 for qval in qvals
                )
        else:
            if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)):
                self.params['alphabet'] = sum(
                    28 ** qval if qval > 1 else 26 for qval in qvals
                )
            else:
                self.params['alphabet'] = None

        if intersection_type == 'soft':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = Levenshtein()
            self._lcprefix = LCPrefix()
            self._intersection = self._soft_intersection  # type: ignore
        elif intersection_type == 'fuzzy':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = Levenshtein()
            if 'threshold' not in self.params:
                self.params['threshold'] = 0.8
            self._intersection = self._fuzzy_intersection  # type: ignore
        elif intersection_type == 'linkage':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = DamerauLevenshtein()
            if 'threshold' not in self.params:
                self.params['threshold'] = 0.1
            self._intersection = (  # type: ignore
                self._group_linkage_intersection
            )
        else:
            self._intersection = self._crisp_intersection  # type: ignore

        self._src_tokens = Counter()  # type: TCounter[str]
        self._tar_tokens = Counter()  # type: TCounter[str]
        self._population_card_value = 0  # type: float

        # initialize normalizer
        self.normalizer = (
            self._norm_none
        )  # type: Callable[[float, int, float], float]  # noqa: E501

        self._norm_dict = {
            'proportional': self._norm_proportional,
            'log': self._norm_log,
            'exp': self._norm_exp,
            'laplace': self._norm_laplace,
            'inverse': self._norm_inverse,
            'complement': self._norm_complement,
        }

        # initialize values for soft intersection
        self._soft_intersection_precalc = Counter()  # type: TCounter[str]
        self._soft_src_only = Counter()  # type: TCounter[str]
        self._soft_tar_only = Counter()  # type: TCounter[str]

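    # The _norm_* functions below each take the raw cell value x, the
    # number of contingency-table squares that x spans, and the total
    # population n. For example, with normalizer='proportional', a raw
    # cardinality of 10 in a population of 100 is reported as 10/100 = 0.1
    # (illustrative figures only).
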
    @staticmethod
    def _norm_none(x: float, _squares: int, _pop: float) -> float:
        return x

    @staticmethod
    def _norm_proportional(x: float, _squares: int, pop: float) -> float:
        return x / max(1, pop)

    @staticmethod
    def _norm_log(x: float, _squares: int, _pop: float) -> float:
        return log1p(x)

    @staticmethod
    def _norm_exp(x: float, _squares: int, _pop: float) -> float:
        return exp(x)

    @staticmethod
    def _norm_laplace(x: float, squares: int, _pop: float) -> float:
        return x + squares

    @staticmethod
    def _norm_inverse(x: float, _squares: int, pop: float) -> float:
        return 1 / x if x else pop

    @staticmethod
    def _norm_complement(x: float, _squares: int, pop: float) -> float:
        return pop - x

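    # A sketch of the typical call sequence: a concrete measure first calls
    # self._tokenize(src, tar) to populate the token Counters, then reads
    # the contingency table off the *_card() methods, e.g.
    # self._intersection_card() for a and self._src_only_card() for b.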
    def _tokenize(
        self, src: Union[str, TCounter[str]], tar: Union[str, TCounter[str]]
    ) -> '_TokenDistance':
        """Return the Q-Grams in src & tar.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        tuple of Counters
            Q-Grams

        Examples
        --------
        >>> pe = _TokenDistance()
        >>> pe._tokenize('AT', 'TT')._get_tokens()
        (Counter({'$A': 1, 'AT': 1, 'T#': 1}),
         Counter({'$T': 1, 'TT': 1, 'T#': 1}))


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        self._src_orig = src
        self._tar_orig = tar

        if isinstance(src, Counter):
            self._src_tokens = src
        else:
            self._src_tokens = (
                self.params['tokenizer'].tokenize(src).get_counter()
            )
        if isinstance(tar, Counter):
            self._tar_tokens = tar
        else:
            self._tar_tokens = (
                self.params['tokenizer'].tokenize(tar).get_counter()
            )

        self._population_card_value = self._calc_population_card()

        # Set up the normalizer, a function of the value x in the
        # contingency table square(s), the number of squares that x
        # represents, and the total population
        if (
            'normalizer' in self.params
            and self.params['normalizer'] in self._norm_dict
        ):
            self.normalizer = self._norm_dict[self.params['normalizer']]

        # clear values for soft intersection
        self._soft_intersection_precalc = Counter()
        self._soft_src_only = Counter()
        self._soft_tar_only = Counter()

        return self

    def _get_tokens(self) -> Tuple[TCounter[str], TCounter[str]]:
        """Return the src and tar tokens as a tuple."""
        return self._src_tokens, self._tar_tokens

    def _src_card(self) -> float:
        r"""Return the cardinality of the tokens in the source set."""
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            return self.normalizer(
                sum(
                    abs(val)
                    for val in (
                        self._soft_intersection_precalc + self._soft_src_only
                    ).values()
                ),
                2,
                self._population_card_value,
            )
        return self.normalizer(
            sum(abs(val) for val in self._src_tokens.values()),
            2,
            self._population_card_value,
        )

    def _src_only(self) -> TCounter[str]:
        r"""Return the src tokens minus the tar tokens.

        For (multi-)sets S and T, this is :math:`S \setminus T`.
        """
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            return self._soft_src_only
        src_only = self._src_tokens - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            src_only -= self._intersection() - self._crisp_intersection()
        return src_only

    def _src_only_card(self) -> float:
        """Return the cardinality of the tokens only in the source set."""
        return self.normalizer(
            sum(abs(val) for val in self._src_only().values()),
            1,
            self._population_card_value,
        )

    def _tar_card(self) -> float:
        r"""Return the cardinality of the tokens in the target set."""
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            return self.normalizer(
                sum(
                    abs(val)
                    for val in (
                        self._soft_intersection_precalc + self._soft_tar_only
                    ).values()
                ),
                2,
                self._population_card_value,
            )
        return self.normalizer(
            sum(abs(val) for val in self._tar_tokens.values()),
            2,
            self._population_card_value,
        )

    def _tar_only(self) -> TCounter[str]:
        r"""Return the tar tokens minus the src tokens.

        For (multi-)sets S and T, this is :math:`T \setminus S`.
        """
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            return self._soft_tar_only
        tar_only = self._tar_tokens - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            tar_only -= self._intersection() - self._crisp_intersection()
        return tar_only

    def _tar_only_card(self) -> float:
        """Return the cardinality of the tokens only in the target set."""
        return self.normalizer(
            sum(abs(val) for val in self._tar_only().values()),
            1,
            self._population_card_value,
        )

    def _symmetric_difference(self) -> TCounter[str]:
        r"""Return the symmetric difference of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \triangle T`.
        """
        return self._src_only() + self._tar_only()

    def _symmetric_difference_card(self) -> float:
        """Return the cardinality of the symmetric difference."""
        return self.normalizer(
            sum(abs(val) for val in self._symmetric_difference().values()),
            2,
            self._population_card_value,
        )

    def _total(self) -> TCounter[str]:
        """Return the sum of the sets.

        For (multi-)sets S and T, this is :math:`S + T`.

        In the case of multisets, this counts values in the intersection
        twice. In the case of sets, this is identical to the union.
        """
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            return (
                self._soft_tar_only
                + self._soft_src_only
                + self._soft_intersection_precalc
                + self._soft_intersection_precalc
            )
        return self._src_tokens + self._tar_tokens

    def _total_card(self) -> float:
        """Return the cardinality of the total."""
        return self.normalizer(
            sum(abs(val) for val in self._total().values()),
            3,
            self._population_card_value,
        )

    def _total_complement_card(self) -> float:
        """Return the cardinality of the complement of the total."""
        if self.params['alphabet'] is None:
            return self.normalizer(0, 1, self._population_card_value)
        elif isinstance(self.params['alphabet'], Counter):
            return self.normalizer(
                max(
                    0,
                    sum(
                        abs(val)
                        for val in (
                            self.params['alphabet'] - self._total()
                        ).values()
                    ),
                ),
                1,
                self._population_card_value,
            )
        return self.normalizer(
            max(0, self.params['alphabet'] - len(self._total().values())),
            1,
            self._population_card_value,
        )

    def _calc_population_card(self) -> float:
        """Return the cardinality of the population."""
        save_normalizer = self.normalizer
        self.normalizer = self._norm_none
        save_intersection = self.params['intersection_type']
        self.params['intersection_type'] = 'crisp'
        pop = self._total_card() + self._total_complement_card()
        self.normalizer = save_normalizer
        self.params['intersection_type'] = save_intersection
        return pop

    def _population_card(self) -> float:
        """Return the cardinality of the population."""
        return self.normalizer(
            self._population_card_value, 4, self._population_card_value
        )

    def _population_unique_card(self) -> float:
        """Return the cardinality of the population minus the intersection."""
        return self.normalizer(
            self._population_card_value - self._intersection_card(),
            4,
            self._population_card_value,
        )

    def _union(self) -> TCounter[str]:
        r"""Return the union of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \cup T`.
        """
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            return (
                self._soft_tar_only
                + self._soft_src_only
                + self._soft_intersection_precalc
            )
        union = self._total() - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            union -= self._intersection() - self._crisp_intersection()
        return union

    def _union_card(self) -> float:
        """Return the cardinality of the union."""
        return self.normalizer(
            sum(abs(val) for val in self._union().values()),
            3,
            self._population_card_value,
        )

    def _difference(self) -> TCounter[str]:
        """Return the difference of the tokens, supporting negative values."""
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            _src_copy = Counter(self._soft_src_only)
            _src_copy.subtract(self._soft_tar_only)
            return _src_copy
        _src_copy = Counter(self._src_tokens)
        _src_copy.subtract(self._tar_tokens)
        return _src_copy

    def _crisp_intersection(self) -> TCounter[str]:
        r"""Return the intersection of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \cap T`.
        """
        return self._src_tokens & self._tar_tokens

    def _soft_intersection(self) -> TCounter[str]:
        """Return the soft source, target, & intersection tokens & weights.

        This implements the soft intersection defined by :cite:`Russ:2014` in
        a way that can reproduce the results in the paper.
        """
        if not hasattr(self.params['metric'], 'alignment'):
            raise TypeError(
                "Soft similarity requires a 'metric' with an alignment "
                'member function, such as Levenshtein.'
            )

        intersection = self._crisp_intersection()
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        src_new = Counter()  # type: TCounter[str]
        tar_new = Counter()  # type: TCounter[str]

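        # The membership of a (src, tar) token pair is the larger of its
        # alignment-based similarity and its longest common prefix length,
        # each normalized by the longer token's length.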
        def _membership(src: str, tar: str) -> float:
            greater_length = max(len(src), len(tar))
            return cast(
                float,
                max(
                    greater_length - self.params['metric'].dist_abs(src, tar),
                    self._lcprefix.dist_abs(src, tar),
                )
                / greater_length,
            )

        def _token_src_tar_int(
            src: str, tar: str
        ) -> Tuple[str, float, str, float, str, float]:
            src_tok = []
            tar_tok = []
            int_tok = []

            src_val = 0.0
            tar_val = 0.0
            int_val = 0.0

            _cost, _src, _tar = self.params['metric'].alignment(src, tar)

            for i in range(len(_src)):
                if _src[i] == _tar[i]:
                    src_tok.append('-')
                    tar_tok.append('-')
                    int_tok.append(_src[i])
                    int_val += 1
                else:
                    src_tok.append(_src[i])
                    if _src[i] != '-':
                        src_val += 1
                    tar_tok.append(_tar[i])
                    if _tar[i] != '-':
                        tar_val += 1
                    int_tok.append('-')

            src_val /= len(_src)
            tar_val /= len(_src)
            int_val /= len(_src)

            src_tok_str = ''.join(src_tok).strip('-')
            tar_tok_str = ''.join(tar_tok).strip('-')
            int_tok_str = ''.join(int_tok).strip('-')

            return (
                src_tok_str,
                src_val,
                tar_tok_str,
                tar_val,
                int_tok_str,
                int_val,
            )

        # Dictionary ordering is important for reproducibility, so insertion
        # order needs to be controlled and retained.
        memberships = OrderedDict(
            ((src, tar), _membership(src, tar))
            for src, tar in sorted(product(src_only, tar_only))
        )

        while memberships:
            src_tok, tar_tok = max(memberships, key=memberships.get)
            if memberships[src_tok, tar_tok] > 0.0:
                pairings = min(src_only[src_tok], tar_only[tar_tok])
                if pairings:
                    (
                        src_ntok,
                        src_val,
                        tar_ntok,
                        tar_val,
                        int_ntok,
                        int_val,
                    ) = _token_src_tar_int(src_tok, tar_tok)

                    src_new[src_ntok] += src_val * pairings  # type: ignore
                    tar_new[tar_ntok] += tar_val * pairings  # type: ignore
                    intersection[int_ntok] += int_val * pairings  # type: ignore

                    # Remove pairings from src_only/tar_only
                    src_only[src_tok] -= pairings
                    tar_only[tar_tok] -= pairings

            del memberships[src_tok, tar_tok]

        # Add src_new/tar_new back into src_only/tar_only
        src_only += src_new
        tar_only += tar_new

        # Save src_only/tar_only to the instance for retrieval later.
        self._soft_src_only = src_only
        self._soft_tar_only = tar_only
        self._soft_intersection_precalc = intersection

        return intersection

    def _fuzzy_intersection(self) -> TCounter[str]:
        r"""Return the fuzzy intersection of the tokens in src and tar.

        This implements the fuzzy intersection defined by :cite:`Wang:2014`.

        For two sets X and Y, the intersection :cite:`Wang:2014` is the sum
        of similarities of all tokens in the two sets that are greater than
        or equal to some threshold value (:math:`\delta`).

        The lower bound of this intersection, and its value when
        :math:`\delta = 1.0`, is the crisp intersection. Tokens shorter than
        :math:`\frac{\delta}{1-\delta}`, 4 in the case of the default
        threshold :math:`\delta = 0.8`, must match exactly to be included in
        the intersection.


        .. versionadded:: 0.4.0

        """
        intersection = self._crisp_intersection()
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        pair = {}
        for src_tok in sorted(src_only):
            for tar_tok in sorted(tar_only):
                sim = self.params['metric'].sim(src_tok, tar_tok)
                if sim >= self.params['threshold']:
                    pair[(src_tok, tar_tok)] = sim

        ordered_keys = [(pair[_], _[0], _[1]) for _ in pair]
        ordered_keys.sort(key=lambda x: x[2])
        ordered_keys.sort(key=lambda x: x[1])
        ordered_keys.sort(key=lambda x: x[0], reverse=True)

        for _, src_tok, tar_tok in ordered_keys:
            pairings = min(src_only[src_tok], tar_only[tar_tok])
            if pairings:
                sim = pair[(src_tok, tar_tok)]

                intersection[src_tok] += sim / 2 * pairings
                intersection[tar_tok] += sim / 2 * pairings

                src_only[src_tok] -= pairings
                tar_only[tar_tok] -= pairings

        """
        # Here is a slightly different optimization method, which is even
        # greedier than the above.
        # ordered by sim*pairings rather than just sim

        pair = {}
        for src_tok in sorted(src_only):
            for tar_tok in sorted(tar_only):
                sim = self.params['metric'].sim(src_tok, tar_tok)
                if sim >= self.params['threshold']:
                    pairings = min(src_only[src_tok], tar_only[tar_tok])
                    pair[(src_tok, tar_tok)] = sim*pairings

        for src_tok, tar_tok in sorted(pair, key=pair.get, reverse=True):
            pairings = min(src_only[src_tok], tar_only[tar_tok])
            if pairings:
                sim = pair[(src_tok, tar_tok)]

                intersection[src_tok] += sim / 2
                intersection[tar_tok] += sim / 2

                src_only[src_tok] -= pairings
                tar_only[tar_tok] -= pairings
        """

        return intersection

    def _group_linkage_intersection(self) -> TCounter[str]:
        r"""Return the group linkage intersection of the tokens in src & tar.

        This is based on group linkage, as defined by :cite:`On:2007`.

        Most of this method is concerned with solving the assignment
        problem, in order to find the weight of the maximum weight bipartite
        matching. The Hungarian algorithm of Munkres :cite:`Munkres:1957`,
        implemented below in Python & NumPy, is used to solve the assignment
        problem, since it is roughly twice as fast as SciPy's implementation.

        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Corrected the Hungarian algorithm & optimized it so that SciPy's
            version is no longer needed.

        """
        intersection = self._crisp_intersection()
        src_only_tok = sorted(self._src_tokens - self._tar_tokens)
        tar_only_tok = sorted(self._tar_tokens - self._src_tokens)
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        # Quoted text below is from Munkres (1957), cited above.

        # Pre-preliminaries: create a square matrix of scores
        n = max(len(src_only_tok), len(tar_only_tok))
        arr = np.zeros((n, n), dtype=float)

        for col in range(len(src_only_tok)):
            for row in range(len(tar_only_tok)):
                arr[row, col] = self.params['metric'].dist(
                    src_only_tok[col], tar_only_tok[row]
                )

        src_only_tok += [''] * (n - len(src_only_tok))
        tar_only_tok += [''] * (n - len(tar_only_tok))

        starred = np.zeros((n, n), dtype=np.bool_)
        primed = np.zeros((n, n), dtype=np.bool_)
        row_covered = np.zeros(n, dtype=np.bool_)
        col_covered = np.zeros(n, dtype=np.bool_)

        orig_sim = 1 - np.copy(arr)
        # Preliminaries:
        # P: "No lines are covered; no zeros are starred or primed."
        # P: "Consider a row of matrix A; subtract from each element in
        # this row the smallest element of this row. Do the same for each
        # row of A."
        arr -= arr.min(axis=1, keepdims=True)
        # P: "Then consider each column of the resulting matrix and
        # subtract from each column its smallest entry."
        arr -= arr.min(axis=0, keepdims=True)

        # P: "Consider a zero Z of the matrix. If there is no starred zero
        # in its row and none in its column, star Z. Repeat, considering
        # each zero in the matrix in turn. Then cover every column
        # containing a starred zero."
        for col in range(n):
            for row in range(n):
                if arr[row, col] == 0:
                    if (
                        np.count_nonzero(starred[row, :]) == 0
                        and np.count_nonzero(starred[:, col]) == 0
                    ):
                        starred[row, col] = True
                        col_covered[col] = True

        step = 1
        # This is the simple case where independent assignments are obvious
        # and found without the rest of the algorithm.
        if np.count_nonzero(col_covered) == n:
            step = 4

        while step < 4:
            if step == 1:
                # Step 1:
                # 1: "Choose a non-covered zero and prime it. Consider the
                # row containing it. If there is no starred zero in this
                # row, go at once to Step 2. If there is a starred zero Z
                # in this row, cover this row and uncover the column of Z."
                # 1: "Repeat until all zeros are covered. Go to Step 3."
                zeros = tuple(zip(*((arr == 0).nonzero())))
                while step == 1:
                    for row, col in zeros:
                        if not (col_covered[col] | row_covered[row]):
                            primed[row, col] = True
                            z_cols = (starred[row, :]).nonzero()[0]
                            if not z_cols.size:
                                step = 2
                                break
                            else:
                                row_covered[row] = True
                                col_covered[z_cols[0]] = False

                    if step != 1:
                        break

                    for row, col in zeros:
                        if not (col_covered[col] | row_covered[row]):
                            break
                    else:
                        step = 3

            if step == 2:
                # Step 2:
                # 2: "There is a sequence of alternating starred and primed
                # zeros, constructed as follows: Let Z_0 denote the
                # uncovered 0'. [There is only one.] Let Z_1 denote the 0*
                # in Z_0's column (if any). Let Z_2 denote the 0' in Z_1's
                # row (we must prove that it exists). Let Z_3 denote the 0*
                # in Z_2's column (if any). Similarly continue until the
                # sequence stops at a 0', Z_{2k}, which has no 0* in its
                # column."
                z_series = []
                for row, col in zeros:  # pragma: no branch
                    if primed[row, col] and not (
                        row_covered[row] | col_covered[col]
                    ):
                        z_series.append((row, col))
                        break
                col = z_series[-1][1]
                while True:
                    row_content = tuple(
                        set((arr[:, col] == 0).nonzero()[0])
                        & set((starred[:, col]).nonzero()[0])
                    )
                    if row_content:
                        row = row_content[0]
                        z_series.append((row, col))
                        col = tuple(
                            set((arr[row, :] == 0).nonzero()[0])
                            & set((primed[row, :]).nonzero()[0])
                        )[0]
                        z_series.append((row, col))
                    else:
                        break

                # 2: "Unstar each starred zero of the sequence and star
                # each primed zero of the sequence. Erase all primes,
                # uncover every row, and cover every column containing a
                # 0*."
                primed[:, :] = False
                row_covered[:] = False
                col_covered[:] = False
                for row, col in z_series:
                    starred[row, col] = not starred[row, col]
                for col in range(n):
                    if np.count_nonzero(starred[:, col]):
                        col_covered[col] = True
                # 2: "If all columns are covered, the starred zeros form
                # the desired independent set. Otherwise, return to Step
                # 1."
                if np.count_nonzero(col_covered) == n:
                    step = 4
                else:
                    step = 1

            if step == 3:
                # Step 3:
                # 3: "Let h denote the smallest non-covered element of the
                # matrix; it will be positive. Add h to each covered row;
                # then subtract h from each uncovered column."
                h_val = float('inf')
                for col in range(n):
                    if not (col_covered[col]):
                        for row in range(n):
                            if (
                                not (row_covered[row])
                                and arr[row, col] < h_val
                            ):
                                h_val = arr[row, col]
                for row in range(n):
                    if row_covered[row]:
                        arr[row, :] += h_val
                for col in range(n):
                    if not (col_covered[col]):
                        arr[:, col] -= h_val

                # 3: "Return to Step 1, without altering any asterisks,
                # primes, or covered lines."
                step = 1

        for row, col in tuple(zip(*(starred.nonzero()))):
            sim = orig_sim[row, col]
            if sim >= self.params['threshold']:
                score = float(
                    (sim / 2)
                    * min(
                        src_only[src_only_tok[col]],
                        tar_only[tar_only_tok[row]],
                    )
                )
                intersection[src_only_tok[col]] += score  # type: ignore
                intersection[tar_only_tok[row]] += score  # type: ignore

        return intersection

    def _intersection_card(self) -> float:
        """Return the cardinality of the intersection."""
        return self.normalizer(
            sum(abs(val) for val in self._intersection().values()),
            1,
            self._population_card_value,
        )

    def _intersection(self) -> TCounter[str]:
        """Return the intersection.

        This function may be overridden by setting the intersection_type
        during initialization.
        """
        return self._crisp_intersection()  # pragma: no cover

    def _get_confusion_table(self) -> ConfusionTable:
        """Return the token counts as a ConfusionTable object."""
        return ConfusionTable(
            self._intersection_card(),
            self._total_complement_card(),
            self._src_only_card(),
            self._tar_only_card(),
        )


if __name__ == '__main__':
    import doctest

    doctest.testmod()