|
1
|
|
|
# Copyright 2018-2020 by Christopher C. Little. |
|
2
|
|
|
# This file is part of Abydos. |
|
3
|
|
|
# |
|
4
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
|
5
|
|
|
# it under the terms of the GNU General Public License as published by |
|
6
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
|
7
|
|
|
# (at your option) any later version. |
|
8
|
|
|
# |
|
9
|
|
|
# Abydos is distributed in the hope that it will be useful, |
|
10
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
12
|
|
|
# GNU General Public License for more details. |
|
13
|
|
|
# |
|
14
|
|
|
# You should have received a copy of the GNU General Public License |
|
15
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
|
16
|
|
|
|
|
17
|
|
|
"""abydos.distance._token_distance. |
|
18
|
|
|
|
|
19
|
1 |
|
The distance._token_distance._TokenDistance module implements abstract class |
|
20
|
|
|
_TokenDistance. |
|
21
|
|
|
""" |
|
22
|
|
|
|
|
23
|
|
|
from collections import Counter, OrderedDict |
|
24
|
|
|
from itertools import product |
|
25
|
1 |
|
from math import exp, log1p |
|
26
|
|
|
from typing import ( |
|
27
|
|
|
Any, |
|
28
|
|
|
Callable, |
|
29
|
|
|
Counter as TCounter, |
|
30
|
|
|
Optional, |
|
31
|
|
|
Tuple, |
|
32
|
1 |
|
Union, |
|
33
|
1 |
|
cast, |
|
34
|
1 |
|
) |
|
35
|
|
|
|
|
36
|
1 |
|
import numpy as np |
|
37
|
1 |
|
|
|
38
|
|
|
from ._damerau_levenshtein import DamerauLevenshtein |
|
39
|
1 |
|
from ._distance import _Distance |
|
40
|
1 |
|
from ._lcprefix import LCPrefix |
|
41
|
1 |
|
from ._levenshtein import Levenshtein |
|
42
|
1 |
|
from ..stats import ConfusionTable |
|
43
|
1 |
|
from ..tokenizer import QGrams, QSkipgrams, WhitespaceTokenizer, _Tokenizer |
|
44
|
1 |
|
|
|
45
|
|
|
__all__ = ['_TokenDistance'] |
|
46
|
1 |
|
|
|
47
|
|
|
|
|
48
|
|
|
class _TokenDistance(_Distance): |
|
49
|
1 |
|
r"""Abstract Token Distance class. |
|
50
|
|
|
|
|
51
|
|
|
.. _confusion_table: |
|
52
|
|
|
|
|
53
|
|
|
+----------------+--------------+-----------------+-------+ |
|
54
|
|
|
| | |in| ``tar`` | |notin| ``tar`` | | |
|
55
|
|
|
+----------------+--------------+-----------------+-------+ |
|
56
|
|
|
| |in| ``src`` | |a| | |b| | |a+b| | |
|
57
|
|
|
+----------------+--------------+-----------------+-------+ |
|
58
|
|
|
| |notin| ``src``| |c| | |d| | |c+d| | |
|
59
|
|
|
+----------------+--------------+-----------------+-------+ |
|
60
|
|
|
| | |a+c| | |b+d| | |n| | |
|
61
|
|
|
+----------------+--------------+-----------------+-------+ |
|
62
|
|
|
|
|
63
|
|
|
.. |in| replace:: :math:`x \in` |
|
64
|
|
|
.. |notin| replace:: :math:`x \notin` |
|
65
|
|
|
|
|
66
|
|
|
.. |a| replace:: :math:`a = |X \cap Y|` |
|
67
|
|
|
.. |b| replace:: :math:`b = |X\setminus Y|` |
|
68
|
|
|
.. |c| replace:: :math:`c = |Y \setminus X|` |
|
69
|
|
|
.. |d| replace:: :math:`d = |(N\setminus X)\setminus Y|` |
|
70
|
|
|
.. |n| replace:: :math:`n = |N|` |
|
71
|
|
|
.. |a+b| replace:: :math:`p_1 = a+b = |X|` |
|
72
|
|
|
.. |a+c| replace:: :math:`p_2 = a+c = |Y|` |
|
73
|
|
|
.. |c+d| replace:: :math:`q_1 = c+d = |N\setminus X|` |
|
74
|
|
|
.. |b+d| replace:: :math:`q_2 = b+d = |N\setminus Y|` |
|
75
|
|
|
|
|
76
|
|
|
.. versionadded:: 0.3.6 |
|
77
|
|
|
""" |
|
78
|
|
|
|
|
79
|
|
|
def __init__( |
|
80
|
1 |
|
self, |
|
81
|
|
|
tokenizer: Optional[_Tokenizer] = None, |
|
82
|
|
|
intersection_type: str = 'crisp', |
|
83
|
|
|
**kwargs: Any |
|
84
|
|
|
) -> None: |
|
85
|
|
|
r"""Initialize _TokenDistance instance. |
|
86
|
|
|
|
|
87
|
|
|
.. _intersection_type: |
|
88
|
|
|
|
|
89
|
|
|
Parameters |
|
90
|
|
|
---------- |
|
91
|
|
|
tokenizer : _Tokenizer |
|
92
|
|
|
A tokenizer instance from the :py:mod:`abydos.tokenizer` package |
|
93
|
|
|
intersection_type : str |
|
94
|
|
|
Specifies the intersection type, and set type as a result: |
|
95
|
|
|
|
|
96
|
|
|
- 'crisp': Ordinary intersection, wherein items are entirely |
|
97
|
|
|
members or non-members of the intersection. (Default) |
|
98
|
|
|
- ``fuzzy``: Fuzzy intersection, defined by :cite:`Wang:2014`, |
|
99
|
|
|
wherein items can be partially members of the intersection |
|
100
|
|
|
if their similarity meets or exceeds a threshold value. This |
|
101
|
|
|
also takes `metric` (by default :class:`Levenshtein()`) and |
|
102
|
|
|
`threshold` (by default 0.8) parameters. |
|
103
|
|
|
- ``soft``: Soft intersection, defined by :cite:`Russ:2014`, |
|
104
|
|
|
wherein items can be partially members of the intersection |
|
105
|
|
|
depending on their similarity. This also takes a `metric` |
|
106
|
|
|
(by default :class:`DamerauLevenshtein()`) parameter. |
|
107
|
|
|
- ``linkage``: Group linkage, defined by :cite:`On:2007`. Like |
|
108
|
|
|
the soft intersection, items can be partially members of the |
|
109
|
|
|
intersection, but the method of pairing similar members is |
|
110
|
|
|
somewhat more complex. See the cited paper for details. This |
|
111
|
|
|
also takes `metric` |
|
112
|
|
|
(by default :class:`DamerauLevenshtein()`) and `threshold` |
|
113
|
|
|
(by default 0.1) parameters. |
|
114
|
|
|
**kwargs |
|
115
|
|
|
Arbitrary keyword arguments |
|
116
|
|
|
|
|
117
|
|
|
|
|
118
|
|
|
.. _alphabet: |
|
119
|
|
|
|
|
120
|
|
|
Other Parameters |
|
121
|
|
|
---------------- |
|
122
|
|
|
qval : int |
|
123
|
|
|
The length of each q-gram. Using this parameter and tokenizer=None |
|
124
|
|
|
will cause the instance to use the QGram tokenizer with this |
|
125
|
|
|
q value. |
|
126
|
|
|
metric : _Distance |
|
127
|
|
|
A string distance measure class for use in the ``soft`` and |
|
128
|
|
|
``fuzzy`` variants. |
|
129
|
|
|
threshold : float |
|
130
|
|
|
A threshold value, similarities above which are counted as |
|
131
|
|
|
members of the intersection for the ``fuzzy`` variant. |
|
132
|
|
|
alphabet : Counter, collection, int, or None |
|
133
|
|
|
This represents the alphabet of possible tokens. |
|
134
|
|
|
|
|
135
|
|
|
- If a Counter is supplied, it is used directly in computing |
|
136
|
|
|
the complement of the tokens in both sets. |
|
137
|
|
|
- If a collection is supplied, it is converted to a Counter |
|
138
|
|
|
and used directly. In the case of a single string being |
|
139
|
|
|
supplied and the QGram tokenizer being used, the full |
|
140
|
|
|
alphabet is inferred (i.e. |
|
141
|
|
|
:math:`len(set(alphabet+QGrams.start\_stop))^{QGrams.qval}` |
|
142
|
|
|
is used as the cardinality of the full alphabet. |
|
143
|
|
|
- If an int is supplied, it is used as the cardinality of the |
|
144
|
|
|
full alphabet. |
|
145
|
|
|
- If None is supplied, the cardinality of the full alphabet |
|
146
|
|
|
is inferred if QGram of QSkipgrams tokenization is used (i.e. |
|
147
|
|
|
:math:`28^{QGrams.qval}` is used as the cardinality of the |
|
148
|
|
|
full alphabet or :math:`26` if QGrams.qval is 1, which |
|
149
|
|
|
assumes the strings are English language strings and only |
|
150
|
|
|
contain letters of a single case). Otherwise, the cardinality |
|
151
|
|
|
of the complement of the total will be 0. |
|
152
|
|
|
normalizer : str |
|
153
|
|
|
This represents the normalization applied to the values in the |
|
154
|
|
|
2x2 contingency table prior to any of the cardinality (\*_card) |
|
155
|
|
|
methods returning a value. By default, no normalization is applied, |
|
156
|
|
|
but the following values are supported: |
|
157
|
|
|
|
|
158
|
|
|
- ``proportional`` : :math:`\frac{x}{n}`, where n is the total |
|
159
|
|
|
population |
|
160
|
|
|
- ``log`` : :math:`log(1+x)` |
|
161
|
|
|
- ``exp`` : :math:`e^x` |
|
162
|
|
|
- ``laplace`` : :math:`x+1` |
|
163
|
|
|
- ``inverse`` : :math:`\frac{1}{x}` |
|
164
|
|
|
- ``complement`` : :math:`n-x`, where n is the total population |
|
165
|
1 |
|
|
|
166
|
|
|
.. versionadded:: 0.4.0 |
|
167
|
|
|
|
|
168
|
|
|
""" |
|
169
|
1 |
|
super(_TokenDistance, self).__init__( |
|
170
|
1 |
|
intersection_type=intersection_type, **kwargs |
|
171
|
|
|
) |
|
172
|
|
|
|
|
173
|
|
|
qval = 2 if 'qval' not in self.params else self.params['qval'] |
|
174
|
|
|
self.params['tokenizer'] = ( |
|
175
|
|
|
tokenizer |
|
176
|
|
|
if tokenizer is not None |
|
177
|
|
|
else WhitespaceTokenizer() |
|
178
|
1 |
|
if qval == 0 |
|
179
|
1 |
|
else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None) |
|
180
|
1 |
|
) |
|
181
|
|
|
|
|
182
|
1 |
|
if hasattr(self.params['tokenizer'], 'qval'): |
|
183
|
|
|
if isinstance(self.params['tokenizer'].qval, int): |
|
184
|
1 |
|
qvals = [self.params['tokenizer'].qval] |
|
185
|
|
|
else: |
|
186
|
1 |
|
qvals = list(self.params['tokenizer'].qval) |
|
187
|
1 |
|
else: |
|
188
|
1 |
|
qvals = [] |
|
189
|
1 |
|
|
|
190
|
1 |
|
if 'alphabet' in self.params: |
|
191
|
|
|
if isinstance(self.params['alphabet'], str): |
|
192
|
|
|
alpha_set = set(self.params['alphabet']) |
|
193
|
1 |
|
if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)): |
|
194
|
|
|
alpha_set |= set(self.params['tokenizer'].start_stop) |
|
195
|
|
|
self.params['alphabet'] = sum( |
|
196
|
1 |
|
len(alpha_set) ** qval for qval in qvals |
|
197
|
|
|
) |
|
198
|
|
|
if hasattr(self.params['alphabet'], '__len__') and not isinstance( |
|
199
|
1 |
|
self.params['alphabet'], Counter |
|
200
|
1 |
|
): |
|
201
|
|
|
self.params['alphabet'] = len(self.params['alphabet']) |
|
202
|
|
|
elif self.params['alphabet'] is None and isinstance( |
|
203
|
1 |
|
self.params['tokenizer'], (QGrams, QSkipgrams) |
|
204
|
|
|
): |
|
205
|
|
|
self.params['alphabet'] = sum( |
|
206
|
|
|
28 ** qval if qval > 1 else 26 for qval in qvals |
|
207
|
1 |
|
) |
|
208
|
1 |
|
else: |
|
209
|
|
|
if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)): |
|
210
|
|
|
self.params['alphabet'] = sum( |
|
211
|
|
|
28 ** qval if qval > 1 else 26 for qval in qvals |
|
212
|
1 |
|
) |
|
213
|
|
|
else: |
|
214
|
1 |
|
self.params['alphabet'] = None |
|
215
|
1 |
|
|
|
216
|
1 |
|
if intersection_type == 'soft': |
|
217
|
1 |
|
if 'metric' not in self.params or self.params['metric'] is None: |
|
218
|
1 |
|
self.params['metric'] = Levenshtein() |
|
219
|
1 |
|
self._lcprefix = LCPrefix() |
|
220
|
1 |
|
self._intersection = self._soft_intersection # type: ignore |
|
221
|
1 |
|
elif intersection_type == 'fuzzy': |
|
222
|
1 |
|
if 'metric' not in self.params or self.params['metric'] is None: |
|
223
|
1 |
|
self.params['metric'] = Levenshtein() |
|
224
|
1 |
|
if 'threshold' not in self.params: |
|
225
|
1 |
|
self.params['threshold'] = 0.8 |
|
226
|
1 |
|
self._intersection = self._fuzzy_intersection # type: ignore |
|
227
|
1 |
|
elif intersection_type == 'linkage': |
|
228
|
1 |
|
if 'metric' not in self.params or self.params['metric'] is None: |
|
229
|
1 |
|
self.params['metric'] = DamerauLevenshtein() |
|
230
|
1 |
|
if 'threshold' not in self.params: |
|
231
|
|
|
self.params['threshold'] = 0.1 |
|
232
|
1 |
|
self._intersection = ( # type: ignore |
|
233
|
|
|
self._group_linkage_intersection |
|
234
|
1 |
|
) |
|
235
|
1 |
|
else: |
|
236
|
1 |
|
self._intersection = self._crisp_intersection # type: ignore |
|
237
|
|
|
|
|
238
|
|
|
self._src_tokens = Counter() # type: TCounter[str] |
|
239
|
1 |
|
self._tar_tokens = Counter() # type: TCounter[str] |
|
240
|
|
|
self._population_card_value = 0 # type: float |
|
241
|
1 |
|
|
|
242
|
|
|
# initialize normalizer |
|
243
|
|
|
self.normalizer = ( |
|
244
|
|
|
self._norm_none |
|
245
|
|
|
) # type: Callable[[float, int, float], float] # noqa: E501 |
|
246
|
|
|
|
|
247
|
|
|
self._norm_dict = { |
|
248
|
|
|
'proportional': self._norm_proportional, |
|
249
|
|
|
'log': self._norm_log, |
|
250
|
|
|
'exp': self._norm_exp, |
|
251
|
1 |
|
'laplace': self._norm_laplace, |
|
252
|
1 |
|
'inverse': self._norm_inverse, |
|
253
|
1 |
|
'complement': self._norm_complement, |
|
254
|
|
|
} |
|
255
|
1 |
|
|
|
256
|
1 |
|
# initialize values for soft intersection |
|
257
|
|
|
self._soft_intersection_precalc = Counter() # type: TCounter[str] |
|
258
|
1 |
|
self._soft_src_only = Counter() # type: TCounter[str] |
|
259
|
1 |
|
self._soft_tar_only = Counter() # type: TCounter[str] |
|
260
|
|
|
|
|
261
|
1 |
|
@staticmethod |
|
262
|
1 |
|
def _norm_none(x: float, _squares: int, _pop: float) -> float: |
|
263
|
|
|
return x |
|
264
|
1 |
|
|
|
265
|
1 |
|
@staticmethod |
|
266
|
|
|
def _norm_proportional(x: float, _squares: int, pop: float) -> float: |
|
267
|
1 |
|
return x / max(1, pop) |
|
268
|
1 |
|
|
|
269
|
|
|
@staticmethod |
|
270
|
1 |
|
def _norm_log(x: float, _squares: int, _pop: float) -> float: |
|
271
|
1 |
|
return log1p(x) |
|
272
|
|
|
|
|
273
|
1 |
|
@staticmethod |
|
274
|
1 |
|
def _norm_exp(x: float, _squares: int, _pop: float) -> float: |
|
275
|
|
|
return exp(x) |
|
276
|
1 |
|
|
|
277
|
|
|
@staticmethod |
|
278
|
|
|
def _norm_laplace(x: float, squares: int, _pop: float) -> float: |
|
279
|
|
|
return x + squares |
|
280
|
|
|
|
|
281
|
|
|
@staticmethod |
|
282
|
|
|
def _norm_inverse(x: float, _squares: int, pop: float) -> float: |
|
283
|
|
|
return 1 / x if x else pop |
|
284
|
|
|
|
|
285
|
|
|
@staticmethod |
|
286
|
|
|
def _norm_complement(x: float, _squares: int, pop: float) -> float: |
|
287
|
|
|
return pop - x |
|
288
|
|
|
|
|
289
|
|
|
def _tokenize( |
|
290
|
|
|
self, src: Union[str, TCounter[str]], tar: Union[str, TCounter[str]] |
|
291
|
|
|
) -> '_TokenDistance': |
|
292
|
|
|
"""Return the Q-Grams in src & tar. |
|
293
|
|
|
|
|
294
|
|
|
Parameters |
|
295
|
|
|
---------- |
|
296
|
|
|
src : str |
|
297
|
|
|
Source string (or QGrams/Counter objects) for comparison |
|
298
|
|
|
tar : str |
|
299
|
|
|
Target string (or QGrams/Counter objects) for comparison |
|
300
|
|
|
|
|
301
|
|
|
Returns |
|
302
|
|
|
------- |
|
303
|
|
|
tuple of Counters |
|
304
|
1 |
|
Q-Grams |
|
305
|
1 |
|
|
|
306
|
|
|
Examples |
|
307
|
1 |
|
-------- |
|
308
|
1 |
|
>>> pe = _TokenDistance() |
|
309
|
|
|
>>> pe._tokenize('AT', 'TT')._get_tokens() |
|
310
|
1 |
|
(Counter({'$A': 1, 'AT': 1, 'T#': 1}), |
|
311
|
|
|
Counter({'$T': 1, 'TT': 1, 'T#': 1})) |
|
312
|
|
|
|
|
313
|
1 |
|
|
|
314
|
1 |
|
.. versionadded:: 0.1.0 |
|
315
|
|
|
.. versionchanged:: 0.3.6 |
|
316
|
1 |
|
Encapsulated in class |
|
317
|
|
|
|
|
318
|
|
|
""" |
|
319
|
|
|
self._src_orig = src |
|
320
|
1 |
|
self._tar_orig = tar |
|
321
|
|
|
|
|
322
|
|
|
if isinstance(src, Counter): |
|
323
|
|
|
self._src_tokens = src |
|
324
|
|
|
else: |
|
325
|
1 |
|
self._src_tokens = ( |
|
326
|
|
|
self.params['tokenizer'].tokenize(src).get_counter() |
|
327
|
|
|
) |
|
328
|
|
|
if isinstance(tar, Counter): |
|
329
|
1 |
|
self._tar_tokens = tar |
|
330
|
|
|
else: |
|
331
|
|
|
self._tar_tokens = ( |
|
332
|
1 |
|
self.params['tokenizer'].tokenize(tar).get_counter() |
|
333
|
1 |
|
) |
|
334
|
1 |
|
|
|
335
|
|
|
self._population_card_value = self._calc_population_card() |
|
336
|
1 |
|
|
|
337
|
|
|
# Set up the normalizer, a function of two variables: |
|
338
|
1 |
|
# x is the value in the contingency table square(s) |
|
339
|
|
|
# n is the number of squares that x represents |
|
340
|
1 |
|
if ( |
|
341
|
|
|
'normalizer' in self.params |
|
342
|
1 |
|
and self.params['normalizer'] in self._norm_dict |
|
343
|
|
|
): |
|
344
|
1 |
|
self.normalizer = self._norm_dict[self.params['normalizer']] |
|
345
|
1 |
|
|
|
346
|
1 |
|
# clear values for soft intersection |
|
347
|
1 |
|
self._soft_intersection_precalc = Counter() |
|
348
|
|
|
self._soft_src_only = Counter() |
|
349
|
|
|
self._soft_tar_only = Counter() |
|
350
|
|
|
|
|
351
|
|
|
return self |
|
352
|
|
|
|
|
353
|
|
|
def _get_tokens(self) -> Tuple[TCounter[str], TCounter[str]]: |
|
354
|
|
|
"""Return the src and tar tokens as a tuple.""" |
|
355
|
|
|
return self._src_tokens, self._tar_tokens |
|
356
|
|
|
|
|
357
|
1 |
|
def _src_card(self) -> float: |
|
358
|
|
|
r"""Return the cardinality of the tokens in the source set.""" |
|
359
|
|
|
if self.params['intersection_type'] == 'soft': |
|
360
|
|
|
if not len(self._soft_intersection_precalc): |
|
361
|
|
|
self._intersection() |
|
362
|
|
|
return self.normalizer( |
|
363
|
1 |
|
sum( |
|
364
|
|
|
abs(val) |
|
365
|
|
|
for val in ( |
|
366
|
|
|
self._soft_intersection_precalc + self._soft_src_only |
|
367
|
|
|
).values() |
|
368
|
1 |
|
), |
|
369
|
1 |
|
2, |
|
370
|
1 |
|
self._population_card_value, |
|
371
|
1 |
|
) |
|
372
|
1 |
|
return self.normalizer( |
|
373
|
1 |
|
sum(abs(val) for val in self._src_tokens.values()), |
|
374
|
1 |
|
2, |
|
375
|
1 |
|
self._population_card_value, |
|
376
|
|
|
) |
|
377
|
1 |
|
|
|
378
|
|
|
def _src_only(self) -> TCounter[str]: |
|
379
|
1 |
|
r"""Return the src tokens minus the tar tokens. |
|
380
|
|
|
|
|
381
|
|
|
For (multi-)sets S and T, this is :math:`S \setminus T`. |
|
382
|
|
|
""" |
|
383
|
|
|
if self.params['intersection_type'] == 'soft': |
|
384
|
|
|
if not len(self._soft_intersection_precalc): |
|
385
|
1 |
|
self._intersection() |
|
386
|
|
|
return self._soft_src_only |
|
387
|
1 |
|
src_only = self._src_tokens - self._intersection() |
|
388
|
1 |
|
if self.params['intersection_type'] != 'crisp': |
|
389
|
1 |
|
src_only -= self._intersection() - self._crisp_intersection() |
|
390
|
1 |
|
return src_only |
|
391
|
|
|
|
|
392
|
|
|
def _src_only_card(self) -> float: |
|
393
|
|
|
"""Return the cardinality of the tokens only in the source set.""" |
|
394
|
|
|
return self.normalizer( |
|
395
|
|
|
sum(abs(val) for val in self._src_only().values()), |
|
396
|
|
|
1, |
|
397
|
|
|
self._population_card_value, |
|
398
|
|
|
) |
|
399
|
|
|
|
|
400
|
1 |
|
def _tar_card(self) -> float: |
|
401
|
|
|
r"""Return the cardinality of the tokens in the target set.""" |
|
402
|
|
|
if self.params['intersection_type'] == 'soft': |
|
403
|
|
|
if not len(self._soft_intersection_precalc): |
|
404
|
|
|
self._intersection() |
|
405
|
|
|
return self.normalizer( |
|
406
|
1 |
|
sum( |
|
407
|
|
|
abs(val) |
|
408
|
|
|
for val in ( |
|
409
|
|
|
self._soft_intersection_precalc + self._soft_tar_only |
|
410
|
|
|
).values() |
|
411
|
1 |
|
), |
|
412
|
1 |
|
2, |
|
413
|
1 |
|
self._population_card_value, |
|
414
|
1 |
|
) |
|
415
|
1 |
|
return self.normalizer( |
|
416
|
1 |
|
sum(abs(val) for val in self._tar_tokens.values()), |
|
417
|
1 |
|
2, |
|
418
|
1 |
|
self._population_card_value, |
|
419
|
|
|
) |
|
420
|
1 |
|
|
|
421
|
|
|
def _tar_only(self) -> TCounter[str]: |
|
422
|
1 |
|
r"""Return the tar tokens minus the src tokens. |
|
423
|
|
|
|
|
424
|
|
|
For (multi-)sets S and T, this is :math:`T \setminus S`. |
|
425
|
|
|
""" |
|
426
|
|
|
if self.params['intersection_type'] == 'soft': |
|
427
|
|
|
if not len(self._soft_intersection_precalc): |
|
428
|
1 |
|
self._intersection() |
|
429
|
|
|
return self._soft_tar_only |
|
430
|
|
|
tar_only = self._tar_tokens - self._intersection() |
|
431
|
|
|
if self.params['intersection_type'] != 'crisp': |
|
432
|
|
|
tar_only -= self._intersection() - self._crisp_intersection() |
|
433
|
1 |
|
return tar_only |
|
434
|
|
|
|
|
435
|
1 |
|
def _tar_only_card(self) -> float: |
|
436
|
|
|
"""Return the cardinality of the tokens only in the target set.""" |
|
437
|
1 |
|
return self.normalizer( |
|
438
|
|
|
sum(abs(val) for val in self._tar_only().values()), |
|
439
|
|
|
1, |
|
440
|
|
|
self._population_card_value, |
|
441
|
|
|
) |
|
442
|
|
|
|
|
443
|
1 |
|
def _symmetric_difference(self) -> TCounter[str]: |
|
444
|
|
|
r"""Return the symmetric difference of tokens from src and tar. |
|
445
|
|
|
|
|
446
|
|
|
For (multi-)sets S and T, this is :math:`S \triangle T`. |
|
447
|
|
|
""" |
|
448
|
|
|
return self._src_only() + self._tar_only() |
|
449
|
|
|
|
|
450
|
|
|
def _symmetric_difference_card(self) -> float: |
|
451
|
1 |
|
"""Return the cardinality of the symmetric difference.""" |
|
452
|
1 |
|
return self.normalizer( |
|
453
|
1 |
|
sum(abs(val) for val in self._symmetric_difference().values()), |
|
454
|
1 |
|
2, |
|
455
|
|
|
self._population_card_value, |
|
456
|
|
|
) |
|
457
|
|
|
|
|
458
|
|
|
def _total(self) -> TCounter[str]: |
|
459
|
|
|
"""Return the sum of the sets. |
|
460
|
1 |
|
|
|
461
|
|
|
For (multi-)sets S and T, this is :math:`S + T`. |
|
462
|
1 |
|
|
|
463
|
|
|
In the case of multisets, this counts values in the interesection |
|
464
|
1 |
|
twice. In the case of sets, this is identical to the union. |
|
465
|
|
|
""" |
|
466
|
|
|
if self.params['intersection_type'] == 'soft': |
|
467
|
|
|
if not len(self._soft_intersection_precalc): |
|
468
|
|
|
self._intersection() |
|
469
|
|
|
return ( |
|
470
|
1 |
|
self._soft_tar_only |
|
471
|
|
|
+ self._soft_src_only |
|
472
|
1 |
|
+ self._soft_intersection_precalc |
|
473
|
1 |
|
+ self._soft_intersection_precalc |
|
474
|
1 |
|
) |
|
475
|
1 |
|
return self._src_tokens + self._tar_tokens |
|
476
|
|
|
|
|
477
|
|
|
def _total_card(self) -> float: |
|
478
|
|
|
"""Return the cardinality of the complement of the total.""" |
|
479
|
|
|
return self.normalizer( |
|
480
|
|
|
sum(abs(val) for val in self._total().values()), |
|
481
|
|
|
3, |
|
482
|
|
|
self._population_card_value, |
|
483
|
|
|
) |
|
484
|
|
|
|
|
485
|
|
|
def _total_complement_card(self) -> float: |
|
486
|
|
|
"""Return the cardinality of the complement of the total.""" |
|
487
|
|
|
if self.params['alphabet'] is None: |
|
488
|
1 |
|
return self.normalizer(0, 1, self._population_card_value) |
|
489
|
|
|
elif isinstance(self.params['alphabet'], Counter): |
|
490
|
|
|
return self.normalizer( |
|
491
|
|
|
max( |
|
492
|
|
|
0, |
|
493
|
|
|
sum( |
|
494
|
1 |
|
abs(val) |
|
495
|
|
|
for val in ( |
|
496
|
1 |
|
self.params['alphabet'] - self._total() |
|
497
|
1 |
|
).values() |
|
498
|
1 |
|
), |
|
499
|
1 |
|
), |
|
500
|
1 |
|
1, |
|
501
|
1 |
|
self._population_card_value, |
|
502
|
1 |
|
) |
|
503
|
1 |
|
return self.normalizer( |
|
504
|
|
|
max(0, self.params['alphabet'] - len(self._total().values())), |
|
505
|
1 |
|
1, |
|
506
|
|
|
self._population_card_value, |
|
507
|
1 |
|
) |
|
508
|
|
|
|
|
509
|
|
|
def _calc_population_card(self) -> float: |
|
510
|
|
|
"""Return the cardinality of the population.""" |
|
511
|
1 |
|
save_normalizer = self.normalizer |
|
512
|
|
|
self.normalizer = self._norm_none |
|
513
|
1 |
|
save_intersection = self.params['intersection_type'] |
|
514
|
|
|
self.params['intersection_type'] = 'crisp' |
|
515
|
|
|
pop = self._total_card() + self._total_complement_card() |
|
516
|
|
|
self.normalizer = save_normalizer |
|
517
|
|
|
self.params['intersection_type'] = save_intersection |
|
518
|
|
|
return pop |
|
519
|
1 |
|
|
|
520
|
|
|
def _population_card(self) -> float: |
|
521
|
|
|
"""Return the cardinality of the population.""" |
|
522
|
|
|
return self.normalizer( |
|
523
|
|
|
self._population_card_value, 4, self._population_card_value |
|
524
|
1 |
|
) |
|
525
|
1 |
|
|
|
526
|
1 |
|
def _population_unique_card(self) -> float: |
|
527
|
1 |
|
"""Return the cardinality of the population minus the intersection.""" |
|
528
|
|
|
return self.normalizer( |
|
529
|
|
|
self._population_card_value - self._intersection_card(), |
|
530
|
|
|
4, |
|
531
|
|
|
self._population_card_value, |
|
532
|
1 |
|
) |
|
533
|
1 |
|
|
|
534
|
1 |
|
def _union(self) -> TCounter[str]: |
|
535
|
1 |
|
r"""Return the union of tokens from src and tar. |
|
536
|
|
|
|
|
537
|
1 |
|
For (multi-)sets S and T, this is :math:`S \cup T`. |
|
538
|
|
|
""" |
|
539
|
1 |
|
if self.params['intersection_type'] == 'soft': |
|
540
|
|
|
if not len(self._soft_intersection_precalc): |
|
541
|
|
|
self._intersection() |
|
542
|
|
|
return ( |
|
543
|
|
|
self._soft_tar_only |
|
544
|
|
|
+ self._soft_src_only |
|
545
|
1 |
|
+ self._soft_intersection_precalc |
|
546
|
|
|
) |
|
547
|
1 |
|
union = self._total() - self._intersection() |
|
548
|
1 |
|
if self.params['intersection_type'] != 'crisp': |
|
549
|
1 |
|
union -= self._intersection() - self._crisp_intersection() |
|
550
|
1 |
|
return union |
|
551
|
1 |
|
|
|
552
|
1 |
|
def _union_card(self) -> float: |
|
553
|
1 |
|
"""Return the cardinality of the union.""" |
|
554
|
1 |
|
return self.normalizer( |
|
555
|
1 |
|
sum(abs(val) for val in self._union().values()), |
|
556
|
|
|
3, |
|
557
|
1 |
|
self._population_card_value, |
|
558
|
|
|
) |
|
559
|
|
|
|
|
560
|
|
|
def _difference(self) -> TCounter[str]: |
|
561
|
|
|
"""Return the difference of the tokens, supporting negative values.""" |
|
562
|
1 |
|
if self.params['intersection_type'] == 'soft': |
|
563
|
|
|
if not len(self._soft_intersection_precalc): |
|
564
|
1 |
|
self._intersection() |
|
565
|
|
|
_src_copy = Counter(self._soft_src_only) |
|
566
|
|
|
_src_copy.subtract(self._soft_tar_only) |
|
567
|
|
|
return _src_copy |
|
568
|
|
|
_src_copy = Counter(self._src_tokens) |
|
569
|
|
|
_src_copy.subtract(self._tar_tokens) |
|
570
|
1 |
|
return _src_copy |
|
571
|
1 |
|
|
|
572
|
|
|
def _crisp_intersection(self) -> TCounter[str]: |
|
573
|
|
|
r"""Return the intersection of tokens from src and tar. |
|
574
|
|
|
|
|
575
|
|
|
For (multi-)sets S and T, this is :math:`S \cap T`. |
|
576
|
1 |
|
""" |
|
577
|
1 |
|
return self._src_tokens & self._tar_tokens |
|
578
|
1 |
|
|
|
579
|
|
|
def _soft_intersection(self) -> TCounter[str]: |
|
580
|
1 |
|
"""Return the soft source, target, & intersection tokens & weights. |
|
581
|
1 |
|
|
|
582
|
|
|
This implements the soft intersection defined by :cite:`Russ:2014` in |
|
583
|
1 |
|
a way that can reproduce the results in the paper. |
|
584
|
1 |
|
""" |
|
585
|
1 |
|
if not hasattr(self.params['metric'], 'alignment'): |
|
586
|
|
|
raise TypeError( |
|
587
|
|
|
"Soft similarity requires a 'metric' with an alignment \ |
|
588
|
|
|
member function, such as Levenshtein." |
|
589
|
|
|
) |
|
590
|
|
|
|
|
591
|
|
|
intersection = self._crisp_intersection() |
|
592
|
|
|
src_only = self._src_tokens - self._tar_tokens |
|
593
|
1 |
|
tar_only = self._tar_tokens - self._src_tokens |
|
594
|
1 |
|
|
|
595
|
1 |
|
src_new = Counter() # type: TCounter[str] |
|
596
|
1 |
|
tar_new = Counter() # type: TCounter[str] |
|
597
|
|
|
|
|
598
|
1 |
|
def _membership(src: str, tar: str) -> float: |
|
599
|
1 |
|
greater_length = max(len(src), len(tar)) |
|
600
|
1 |
|
return cast( |
|
601
|
|
|
float, |
|
602
|
1 |
|
max( |
|
603
|
|
|
greater_length - self.params['metric'].dist_abs(src, tar), |
|
604
|
1 |
|
self._lcprefix.dist_abs(src, tar), |
|
605
|
1 |
|
) |
|
606
|
1 |
|
/ greater_length, |
|
607
|
1 |
|
) |
|
608
|
1 |
|
|
|
609
|
1 |
|
def _token_src_tar_int( |
|
610
|
|
|
src: str, tar: str |
|
611
|
1 |
|
) -> Tuple[str, float, str, float, str, float]: |
|
612
|
1 |
|
src_tok = [] |
|
613
|
1 |
|
tar_tok = [] |
|
614
|
1 |
|
int_tok = [] |
|
615
|
1 |
|
|
|
616
|
1 |
|
src_val = 0.0 |
|
617
|
1 |
|
tar_val = 0.0 |
|
618
|
|
|
int_val = 0.0 |
|
619
|
1 |
|
|
|
620
|
1 |
|
_cost, _src, _tar = self.params['metric'].alignment(src, tar) |
|
621
|
1 |
|
|
|
622
|
|
|
for i in range(len(_src)): |
|
623
|
1 |
|
if _src[i] == _tar[i]: |
|
624
|
1 |
|
src_tok.append('-') |
|
625
|
1 |
|
tar_tok.append('-') |
|
626
|
|
|
int_tok.append(_src[i]) |
|
627
|
1 |
|
int_val += 1 |
|
628
|
|
|
else: |
|
629
|
|
|
src_tok.append(_src[i]) |
|
630
|
|
|
if _src[i] != '-': |
|
631
|
1 |
|
src_val += 1 |
|
632
|
|
|
tar_tok.append(_tar[i]) |
|
633
|
|
|
if _tar[i] != '-': |
|
634
|
|
|
tar_val += 1 |
|
635
|
|
|
int_tok.append('-') |
|
636
|
1 |
|
|
|
637
|
1 |
|
src_val /= len(_src) |
|
638
|
1 |
|
tar_val /= len(_src) |
|
639
|
1 |
|
int_val /= len(_src) |
|
640
|
1 |
|
|
|
641
|
1 |
|
src_tok_str = ''.join(src_tok).strip('-') |
|
642
|
|
|
tar_tok_str = ''.join(tar_tok).strip('-') |
|
643
|
|
|
int_tok_str = ''.join(int_tok).strip('-') |
|
644
|
|
|
|
|
645
|
|
|
return ( |
|
646
|
|
|
src_tok_str, |
|
647
|
|
|
src_val, |
|
648
|
|
|
tar_tok_str, |
|
649
|
|
|
tar_val, |
|
650
|
1 |
|
int_tok_str, |
|
651
|
1 |
|
int_val, |
|
652
|
1 |
|
) |
|
653
|
|
|
|
|
654
|
|
|
# Dictionary ordering is important for reproducibility, so insertion |
|
655
|
1 |
|
# order needs to be controlled and retained. |
|
656
|
1 |
|
memberships = OrderedDict( |
|
657
|
|
|
((src, tar), _membership(src, tar)) |
|
658
|
1 |
|
for src, tar in sorted(product(src_only, tar_only)) |
|
659
|
|
|
) |
|
660
|
|
|
|
|
661
|
1 |
|
while memberships: |
|
662
|
1 |
|
src_tok, tar_tok = max(memberships, key=memberships.get) |
|
663
|
|
|
if memberships[src_tok, tar_tok] > 0.0: |
|
664
|
|
|
pairings = min(src_only[src_tok], tar_only[tar_tok]) |
|
665
|
1 |
|
if pairings: |
|
666
|
1 |
|
( |
|
667
|
1 |
|
src_ntok, |
|
668
|
|
|
src_val, |
|
669
|
1 |
|
tar_ntok, |
|
670
|
|
|
tar_val, |
|
671
|
1 |
|
int_ntok, |
|
672
|
|
|
int_val, |
|
673
|
|
|
) = _token_src_tar_int(src_tok, tar_tok) |
|
674
|
|
|
|
|
675
|
|
|
src_new[src_ntok] += src_val * pairings # type: ignore |
|
676
|
|
|
tar_new[tar_ntok] += tar_val * pairings # type: ignore |
|
677
|
|
|
intersection[int_ntok] += int_val * pairings # type: ignore |
|
678
|
|
|
|
|
679
|
|
|
# Remove pairings from src_only/tar_only |
|
680
|
|
|
src_only[src_tok] -= pairings |
|
681
|
|
|
tar_only[tar_tok] -= pairings |
|
682
|
|
|
|
|
683
|
|
|
del memberships[src_tok, tar_tok] |
|
684
|
|
|
|
|
685
|
|
|
# Add src_new/tar_new back into src_only/tar_only |
|
686
|
|
|
src_only += src_new |
|
687
|
|
|
tar_only += tar_new |
|
688
|
|
|
|
|
689
|
|
|
# Save src_only/tar_only to the instance for retrieval later. |
|
690
|
1 |
|
self._soft_src_only = src_only |
|
691
|
1 |
|
self._soft_tar_only = tar_only |
|
692
|
1 |
|
self._soft_intersection_precalc = intersection |
|
693
|
|
|
|
|
694
|
1 |
|
return intersection |
|
695
|
1 |
|
|
|
696
|
1 |
|
def _fuzzy_intersection(self) -> TCounter[str]: |
|
697
|
1 |
|
r"""Return the fuzzy intersection of the tokens in src and tar. |
|
698
|
1 |
|
|
|
699
|
1 |
|
This implements the fuzzy intersection defined by :cite:`Wang:2014`. |
|
700
|
|
|
|
|
701
|
1 |
|
For two sets X and Y, the intersection :cite:`Wang:2014` is the sum of |
|
702
|
1 |
|
similarities of all tokens in the two sets that are greater than or |
|
703
|
1 |
|
equal to some threshold value (:math:`\delta`). |
|
704
|
1 |
|
|
|
705
|
|
|
The lower bound of on this intersection and the value when |
|
706
|
1 |
|
:math:`\delta = 1.0`, is the crisp intersection. Tokens shorter than |
|
707
|
1 |
|
:math:`\frac{\delta}{1-\delta}`, 4 in the case of the default threshold |
|
708
|
1 |
|
:math:`\delta = 0.8`, must match exactly to be included in the |
|
709
|
1 |
|
intersection. |
|
710
|
|
|
|
|
711
|
1 |
|
|
|
712
|
1 |
|
.. versionadded:: 0.4.0 |
|
713
|
|
|
|
|
714
|
1 |
|
""" |
|
715
|
1 |
|
intersection = self._crisp_intersection() |
|
716
|
|
|
src_only = self._src_tokens - self._tar_tokens |
|
717
|
|
|
tar_only = self._tar_tokens - self._src_tokens |
|
718
|
|
|
|
|
719
|
|
|
pair = {} |
|
720
|
|
|
for src_tok in sorted(src_only): |
|
721
|
|
|
for tar_tok in sorted(tar_only): |
|
722
|
|
|
sim = self.params['metric'].sim(src_tok, tar_tok) |
|
723
|
|
|
if sim >= self.params['threshold']: |
|
724
|
|
|
pair[(src_tok, tar_tok)] = sim |
|
725
|
|
|
|
|
726
|
|
|
ordered_keys = [(pair[_], _[0], _[1]) for _ in pair] |
|
727
|
|
|
ordered_keys.sort(key=lambda x: x[2]) |
|
728
|
|
|
ordered_keys.sort(key=lambda x: x[1]) |
|
729
|
|
|
ordered_keys.sort(key=lambda x: x[0], reverse=True) |
|
730
|
|
|
|
|
731
|
|
|
for _, src_tok, tar_tok in ordered_keys: |
|
732
|
|
|
pairings = min(src_only[src_tok], tar_only[tar_tok]) |
|
733
|
|
|
if pairings: |
|
734
|
|
|
sim = pair[(src_tok, tar_tok)] |
|
735
|
|
|
|
|
736
|
|
|
intersection[src_tok] += sim / 2 * pairings |
|
737
|
|
|
intersection[tar_tok] += sim / 2 * pairings |
|
738
|
|
|
|
|
739
|
|
|
src_only[src_tok] -= pairings |
|
740
|
|
|
tar_only[tar_tok] -= pairings |
|
741
|
|
|
|
|
742
|
1 |
|
""" |
|
743
|
|
|
# Here is a slightly different optimization method, which is even |
|
744
|
1 |
|
# greedier than the above. |
|
745
|
|
|
# ordered by sim*pairings rather than just sim |
|
746
|
|
|
|
|
747
|
|
|
pair = {} |
|
748
|
|
|
for src_tok in sorted(src_only): |
|
749
|
|
|
for tar_tok in sorted(tar_only): |
|
750
|
|
|
sim = self.params['metric'].sim(src_tok, tar_tok) |
|
751
|
|
|
if sim >= self.params['threshold']: |
|
752
|
|
|
pairings = min(src_only[src_tok], tar_only[tar_tok]) |
|
753
|
|
|
pair[(src_tok, tar_tok)] = sim*pairings |
|
754
|
|
|
|
|
755
|
|
|
for src_tok, tar_tok in sorted(pair, key=pair.get, reverse=True): |
|
756
|
|
|
pairings = min(src_only[src_tok], tar_only[tar_tok]) |
|
757
|
|
|
if pairings: |
|
758
|
|
|
sim = pair[(src_tok, tar_tok)] |
|
759
|
|
|
|
|
760
|
|
|
intersection[src_tok] += sim / 2 |
|
761
|
1 |
|
intersection[tar_tok] += sim / 2 |
|
762
|
1 |
|
|
|
763
|
1 |
|
src_only[src_tok] -= pairings |
|
764
|
1 |
|
tar_only[tar_tok] -= pairings |
|
765
|
1 |
|
""" |
|
766
|
|
|
|
|
767
|
|
|
return intersection |
|
768
|
|
|
|
|
769
|
|
|
def _group_linkage_intersection(self) -> TCounter[str]: |
|
770
|
1 |
|
r"""Return the group linkage intersection of the tokens in src and tar. |
|
771
|
1 |
|
|
|
772
|
|
|
This is based on group linkage, as defined by :cite:`On:2007`. |
|
773
|
1 |
|
|
|
774
|
1 |
|
Most of this method is concerned with solving the assignment problem, |
|
775
|
1 |
|
in order to find the weight of the maximum weight bipartite matching. |
|
776
|
|
|
The Hungarian algorithm of Munkres :cite:`Munkres:1957`, implemented |
|
777
|
|
|
below in Python & Numpy is used to solve the assignment problem since |
|
778
|
|
|
it is roughly twice as fast as SciPy's implementation. |
|
779
|
1 |
|
|
|
780
|
1 |
|
.. versionadded:: 0.4.0 |
|
781
|
|
|
.. versionchanged:: 0.4.1 |
|
782
|
1 |
|
Corrected the Hungarian algorithm & optimized it so that SciPy's |
|
783
|
1 |
|
version is no longer needed. |
|
784
|
1 |
|
|
|
785
|
1 |
|
""" |
|
786
|
|
|
intersection = self._crisp_intersection() |
|
787
|
1 |
|
src_only_tok = sorted(self._src_tokens - self._tar_tokens) |
|
788
|
|
|
tar_only_tok = sorted(self._tar_tokens - self._src_tokens) |
|
789
|
|
|
src_only = self._src_tokens - self._tar_tokens |
|
790
|
|
|
tar_only = self._tar_tokens - self._src_tokens |
|
791
|
|
|
|
|
792
|
|
|
# Quoted text below is from Munkres (1957), cited above. |
|
793
|
1 |
|
|
|
794
|
|
|
# Pre-preliminaries: create square the matrix of scores |
|
795
|
|
|
n = max(len(src_only_tok), len(tar_only_tok)) |
|
796
|
1 |
|
arr = np.zeros((n, n), dtype=float) |
|
797
|
|
|
|
|
798
|
|
|
for col in range(len(src_only_tok)): |
|
799
|
|
|
for row in range(len(tar_only_tok)): |
|
800
|
|
|
arr[row, col] = self.params['metric'].dist( |
|
801
|
|
|
src_only_tok[col], tar_only_tok[row] |
|
802
|
1 |
|
) |
|
803
|
1 |
|
|
|
804
|
1 |
|
src_only_tok += [''] * (n - len(src_only_tok)) |
|
805
|
1 |
|
tar_only_tok += [''] * (n - len(tar_only_tok)) |
|
806
|
|
|
|
|
807
|
|
|
starred = np.zeros((n, n), dtype=np.bool_) |
|
808
|
|
|
primed = np.zeros((n, n), dtype=np.bool_) |
|
809
|
1 |
|
row_covered = np.zeros(n, dtype=np.bool_) |
|
810
|
1 |
|
col_covered = np.zeros(n, dtype=np.bool_) |
|
811
|
|
|
|
|
812
|
1 |
|
orig_sim = 1 - np.copy(arr) |
|
813
|
|
|
# Preliminaries: |
|
814
|
|
|
# P: "No lines are covered; no zeros are starred or primed." |
|
815
|
1 |
|
# P: "Consider a row of matrix A; subtract from each element in |
|
816
|
1 |
|
# this row the smallest element of this row. Do the same for each |
|
817
|
|
|
# row of A." |
|
818
|
1 |
|
arr -= arr.min(axis=1, keepdims=True) |
|
819
|
1 |
|
# P: "Then consider each column of the resulting matrix and |
|
820
|
|
|
# subtract from each column its smallest entry." |
|
821
|
|
|
arr -= arr.min(axis=0, keepdims=True) |
|
822
|
|
|
|
|
823
|
|
|
# P: "Consider a zero Z of the matrix. If there is no starred zero |
|
824
|
|
|
# in its row and none in its column, star Z. Repeat, considering |
|
825
|
|
|
# each zero in the matrix in turn. Then cover every column |
|
826
|
1 |
|
# containing a starred zero. |
|
827
|
1 |
|
for col in range(n): |
|
828
|
1 |
|
for row in range(n): |
|
829
|
1 |
|
if arr[row, col] == 0: |
|
830
|
1 |
|
if ( |
|
831
|
1 |
|
np.count_nonzero(starred[row, :]) == 0 |
|
832
|
1 |
|
and np.count_nonzero(starred[:, col]) == 0 |
|
833
|
1 |
|
): |
|
834
|
1 |
|
starred[row, col] = True |
|
835
|
|
|
col_covered[col] = True |
|
836
|
1 |
|
|
|
837
|
1 |
|
step = 1 |
|
838
|
|
|
# This is the simple case where independent assignments are obvious |
|
839
|
1 |
|
# and found without the rest of the algorithm. |
|
840
|
1 |
|
if np.count_nonzero(col_covered) == n: |
|
841
|
|
|
step = 4 |
|
842
|
1 |
|
|
|
843
|
1 |
|
while step < 4: |
|
844
|
1 |
|
if step == 1: |
|
845
|
|
|
# Step 1: |
|
846
|
1 |
|
# 1: "Choose a non-covered zero and prime it. Consider the |
|
847
|
|
|
# row containing it. If there is no starred zero in this |
|
848
|
1 |
|
# row, go at once to Step 2. If there is a starred zero Z |
|
849
|
|
|
# in this row, cover this row and uncover the column of Z." |
|
850
|
|
|
# 1: Repeat until all zeros are covered. Go to Step 3." |
|
851
|
|
|
zeros = tuple(zip(*((arr == 0).nonzero()))) |
|
852
|
|
|
while step == 1: |
|
853
|
|
|
for row, col in zeros: |
|
854
|
|
|
if not (col_covered[col] | row_covered[row]): |
|
855
|
|
|
primed[row, col] = True |
|
856
|
|
|
z_cols = (starred[row, :]).nonzero()[0] |
|
857
|
|
|
if not z_cols.size: |
|
858
|
1 |
|
step = 2 |
|
859
|
1 |
|
break |
|
860
|
1 |
|
else: |
|
861
|
|
|
row_covered[row] = True |
|
862
|
|
|
col_covered[z_cols[0]] = False |
|
863
|
1 |
|
|
|
864
|
1 |
|
if step != 1: |
|
865
|
1 |
|
break |
|
866
|
1 |
|
|
|
867
|
1 |
|
for row, col in zeros: |
|
868
|
|
|
if not (col_covered[col] | row_covered[row]): |
|
869
|
|
|
break |
|
870
|
|
|
else: |
|
871
|
1 |
|
step = 3 |
|
872
|
1 |
|
|
|
873
|
1 |
|
if step == 2: |
|
874
|
1 |
|
# Step 2: |
|
875
|
|
|
# 2: "There is a sequence of alternating starred and primed |
|
876
|
|
|
# zeros, constructed as follows: Let Z_0 denote the |
|
877
|
|
|
# uncovered 0'. [There is only one.] Let Z_1 denote the 0* |
|
878
|
1 |
|
# in Z_0's column (if any). Let Z_2 denote the 0' in Z_1's |
|
879
|
|
|
# row (we must prove that it exists). Let Z_3 denote the 0* |
|
880
|
1 |
|
# in Z_2's column (if any). Similarly continue until the |
|
881
|
|
|
# sequence stops at a 0', Z_{2k}, which has no 0* in its |
|
882
|
|
|
# column." |
|
883
|
|
|
z_series = [] |
|
884
|
|
|
for row, col in zeros: # pragma: no branch |
|
|
|
|
|
|
885
|
|
|
if primed[row, col] and not ( |
|
886
|
1 |
|
row_covered[row] | col_covered[col] |
|
887
|
1 |
|
): |
|
888
|
1 |
|
z_series.append((row, col)) |
|
889
|
1 |
|
break |
|
890
|
1 |
|
col = z_series[-1][1] |
|
891
|
1 |
|
while True: |
|
892
|
1 |
|
row_content = tuple( |
|
893
|
1 |
|
set((arr[:, col] == 0).nonzero()[0]) |
|
894
|
|
|
& set((starred[:, col]).nonzero()[0]) |
|
895
|
|
|
) |
|
896
|
|
|
if row_content: |
|
897
|
1 |
|
row = row_content[0] |
|
898
|
1 |
|
z_series.append((row, col)) |
|
899
|
|
|
col = tuple( |
|
900
|
1 |
|
set((arr[row, :] == 0).nonzero()[0]) |
|
901
|
|
|
& set((primed[row, :]).nonzero()[0]) |
|
902
|
1 |
|
)[0] |
|
903
|
|
|
z_series.append((row, col)) |
|
904
|
|
|
else: |
|
905
|
|
|
break |
|
906
|
|
|
|
|
907
|
1 |
|
# 2: "Unstar each starred zero of the sequence and star |
|
908
|
1 |
|
# each primed zero of the sequence. Erase all primes, |
|
909
|
1 |
|
# uncover every row, and cover every column containing a |
|
910
|
1 |
|
# 0*." |
|
911
|
1 |
|
primed[:, :] = False |
|
912
|
|
|
row_covered[:] = False |
|
913
|
|
|
col_covered[:] = False |
|
914
|
|
|
for row, col in z_series: |
|
915
|
1 |
|
starred[row, col] = not starred[row, col] |
|
916
|
1 |
|
for col in range(n): |
|
917
|
1 |
|
if np.count_nonzero(starred[:, col]): |
|
918
|
1 |
|
col_covered[col] = True |
|
919
|
1 |
|
# 2: "If all columns are covered, the starred zeros form |
|
920
|
1 |
|
# the desired independent set. Otherwise, return to Step |
|
921
|
1 |
|
# 1." |
|
922
|
|
|
if np.count_nonzero(col_covered) == n: |
|
923
|
|
|
step = 4 |
|
924
|
|
|
else: |
|
925
|
1 |
|
step = 1 |
|
926
|
|
|
|
|
927
|
1 |
|
if step == 3: |
|
928
|
1 |
|
# Step 3: |
|
929
|
1 |
|
# 3: "Let h denote the smallest non-covered element of the |
|
930
|
1 |
|
# matrix; it will be positive. Add h to each covered row; |
|
931
|
|
|
# then subtract h from each uncovered column." |
|
932
|
|
|
h_val = float('inf') |
|
933
|
|
|
for col in range(n): |
|
934
|
|
|
if not (col_covered[col]): |
|
935
|
|
|
for row in range(n): |
|
936
|
|
|
if ( |
|
937
|
1 |
|
not (row_covered[row]) |
|
938
|
1 |
|
and arr[row, col] < h_val |
|
939
|
|
|
): |
|
940
|
1 |
|
h_val = arr[row, col] |
|
941
|
|
|
for row in range(n): |
|
942
|
1 |
|
if row_covered[row]: |
|
943
|
|
|
arr[row, :] += h_val |
|
944
|
1 |
|
for col in range(n): |
|
945
|
|
|
if not (col_covered[col]): |
|
946
|
|
|
arr[:, col] -= h_val |
|
947
|
|
|
|
|
948
|
|
|
# 3: "Return to Step 1, without altering any asterisks, |
|
949
|
|
|
# primes, or covered lines." |
|
950
|
1 |
|
step = 1 |
|
951
|
|
|
|
|
952
|
|
|
for row, col in tuple(zip(*(starred.nonzero()))): |
|
953
|
|
|
sim = orig_sim[row, col] |
|
954
|
|
|
if sim >= self.params['threshold']: |
|
955
|
|
|
score = float( |
|
956
|
|
|
(sim / 2) |
|
957
|
|
|
* min( |
|
958
|
1 |
|
src_only[src_only_tok[col]], |
|
959
|
|
|
tar_only[tar_only_tok[row]], |
|
960
|
1 |
|
) |
|
961
|
|
|
) |
|
962
|
|
|
intersection[src_only_tok[col]] += score # type: ignore |
|
963
|
|
|
intersection[tar_only_tok[row]] += score # type: ignore |
|
964
|
|
|
|
|
965
|
|
|
return intersection |
|
966
|
|
|
|
|
967
|
|
|
def _intersection_card(self) -> float: |
|
968
|
|
|
"""Return the cardinality of the intersection.""" |
|
969
|
|
|
return self.normalizer( |
|
970
|
|
|
sum(abs(val) for val in self._intersection().values()), |
|
971
|
|
|
1, |
|
972
|
|
|
self._population_card_value, |
|
973
|
|
|
) |
|
974
|
|
|
|
|
975
|
|
|
def _intersection(self) -> TCounter[str]: |
|
976
|
|
|
"""Return the intersection. |
|
977
|
|
|
|
|
978
|
|
|
This function may be overridden by setting the intersection_type during |
|
979
|
|
|
initialization. |
|
980
|
|
|
""" |
|
981
|
|
|
return self._crisp_intersection() # pragma: no cover |
|
982
|
|
|
|
|
983
|
|
|
def _get_confusion_table(self) -> ConfusionTable: |
|
984
|
|
|
"""Return the token counts as a ConfusionTable object.""" |
|
985
|
|
|
return ConfusionTable( |
|
986
|
|
|
self._intersection_card(), |
|
987
|
|
|
self._total_complement_card(), |
|
988
|
|
|
self._src_only_card(), |
|
989
|
|
|
self._tar_only_card(), |
|
990
|
|
|
) |
|
991
|
|
|
|
|
992
|
|
|
|
|
993
|
|
|
if __name__ == '__main__': |
|
994
|
|
|
import doctest |
|
995
|
|
|
|
|
996
|
|
|
doctest.testmod() |
|
997
|
|
|
|