_TokenDistance._intersection()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 7
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 7
ccs 0
cts 0
cp 0
rs 10
c 0
b 0
f 0
cc 1
nop 1
crap 2
1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.distance._token_distance.
18
19 1
The distance._token_distance._TokenDistance module implements abstract class
20
_TokenDistance.
21
"""
22
23
from collections import Counter, OrderedDict
24
from itertools import product
25 1
from math import exp, log1p
26
from typing import (
27
    Any,
28
    Callable,
29
    Counter as TCounter,
30
    Optional,
31
    Tuple,
32 1
    Union,
33 1
    cast,
34 1
)
35
36 1
import numpy as np
37 1
38
from ._damerau_levenshtein import DamerauLevenshtein
39 1
from ._distance import _Distance
40 1
from ._lcprefix import LCPrefix
41 1
from ._levenshtein import Levenshtein
42 1
from ..stats import ConfusionTable
43 1
from ..tokenizer import QGrams, QSkipgrams, WhitespaceTokenizer, _Tokenizer
44 1
45
__all__ = ['_TokenDistance']
46 1
47
48
class _TokenDistance(_Distance):
49 1
    r"""Abstract Token Distance class.
50
51
    .. _confusion_table:
52
53
    +----------------+--------------+-----------------+-------+
54
    |                | |in| ``tar`` | |notin| ``tar`` |       |
55
    +----------------+--------------+-----------------+-------+
56
    | |in| ``src``   | |a|          | |b|             | |a+b| |
57
    +----------------+--------------+-----------------+-------+
58
    | |notin| ``src``| |c|          | |d|             | |c+d| |
59
    +----------------+--------------+-----------------+-------+
60
    |                | |a+c|        | |b+d|           | |n|   |
61
    +----------------+--------------+-----------------+-------+
62
63
    .. |in| replace:: :math:`x \in`
64
    .. |notin| replace:: :math:`x \notin`
65
66
    .. |a| replace:: :math:`a = |X \cap Y|`
67
    .. |b| replace:: :math:`b = |X\setminus Y|`
68
    .. |c| replace:: :math:`c = |Y \setminus X|`
69
    .. |d| replace:: :math:`d = |(N\setminus X)\setminus Y|`
70
    .. |n| replace:: :math:`n = |N|`
71
    .. |a+b| replace:: :math:`p_1 = a+b = |X|`
72
    .. |a+c| replace:: :math:`p_2 = a+c = |Y|`
73
    .. |c+d| replace:: :math:`q_1 = c+d = |N\setminus X|`
74
    .. |b+d| replace:: :math:`q_2 = b+d = |N\setminus Y|`
75
76
    .. versionadded:: 0.3.6
77
    """
78
79
    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        r"""Initialize _TokenDistance instance.

        .. _intersection_type:

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:

                - 'crisp': Ordinary intersection, wherein items are entirely
                  members or non-members of the intersection. (Default)
                - ``fuzzy``: Fuzzy intersection, defined by :cite:`Wang:2014`,
                  wherein items can be partially members of the intersection
                  if their similarity meets or exceeds a threshold value. This
                  also takes `metric` (by default :class:`Levenshtein()`) and
                  `threshold` (by default 0.8) parameters.
                - ``soft``: Soft intersection, defined by :cite:`Russ:2014`,
                  wherein items can be partially members of the intersection
                  depending on their similarity. This also takes a `metric`
                  (by default :class:`DamerauLevenshtein()`) parameter.
                - ``linkage``: Group linkage, defined by :cite:`On:2007`. Like
                  the soft intersection, items can be partially members of the
                  intersection, but the method of pairing similar members is
                  somewhat more complex. See the cited paper for details. This
                  also takes `metric`
                  (by default :class:`DamerauLevenshtein()`) and `threshold`
                  (by default 0.1) parameters.
        **kwargs
            Arbitrary keyword arguments


        .. _alphabet:

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.

                - If a Counter is supplied, it is used directly in computing
                  the complement of the tokens in both sets.
                - If a collection is supplied, it is converted to a Counter
                  and used directly. In the case of a single string being
                  supplied and the QGram tokenizer being used, the full
                  alphabet is inferred (i.e.
                  :math:`len(set(alphabet+QGrams.start\_stop))^{QGrams.qval}`
                  is used as the cardinality of the full alphabet.
                - If an int is supplied, it is used as the cardinality of the
                  full alphabet.
                - If None is supplied, the cardinality of the full alphabet
                  is inferred if QGram or QSkipgrams tokenization is used (i.e.
                  :math:`28^{QGrams.qval}` is used as the cardinality of the
                  full alphabet or :math:`26` if QGrams.qval is 1, which
                  assumes the strings are English language strings and only
                  contain letters of a single case). Otherwise, the cardinality
                  of the complement of the total will be 0.
        normalizer : str
            This represents the normalization applied to the values in the
            2x2 contingency table prior to any of the cardinality (\*_card)
            methods returning a value. By default, no normalization is applied,
            but the following values are supported:

                - ``proportional`` : :math:`\frac{x}{n}`, where n is the total
                  population
                - ``log`` : :math:`log(1+x)`
                - ``exp`` : :math:`e^x`
                - ``laplace`` : :math:`x+1`
                - ``inverse`` : :math:`\frac{1}{x}`
                - ``complement`` : :math:`n-x`, where n is the total population

        .. versionadded:: 0.4.0

        """
        super(_TokenDistance, self).__init__(
            intersection_type=intersection_type, **kwargs
        )

        # Choose the tokenizer: an explicit instance wins; otherwise qval=0
        # selects whitespace tokenization and any other qval selects q-grams.
        qval = 2 if 'qval' not in self.params else self.params['qval']
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else WhitespaceTokenizer()
            if qval == 0
            else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None)
        )

        # Collect the q value(s) actually used by the tokenizer (a tokenizer
        # may carry a single int or an iterable of q values).
        if hasattr(self.params['tokenizer'], 'qval'):
            if isinstance(self.params['tokenizer'].qval, int):
                qvals = [self.params['tokenizer'].qval]
            else:
                qvals = list(self.params['tokenizer'].qval)
        else:
            qvals = []

        # Resolve the 'alphabet' parameter to either a Counter, an int
        # cardinality, or None, per the docstring rules above.
        if 'alphabet' in self.params:
            if isinstance(self.params['alphabet'], str):
                # A single string is a character alphabet; infer the q-gram
                # alphabet size from it (plus start/stop markers, if q-grams).
                alpha_set = set(self.params['alphabet'])
                if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)):
                    alpha_set |= set(self.params['tokenizer'].start_stop)
                    self.params['alphabet'] = sum(
                        len(alpha_set) ** qval for qval in qvals
                    )
            # A non-Counter collection collapses to its cardinality.
            if hasattr(self.params['alphabet'], '__len__') and not isinstance(
                self.params['alphabet'], Counter
            ):
                self.params['alphabet'] = len(self.params['alphabet'])
            elif self.params['alphabet'] is None and isinstance(
                self.params['tokenizer'], (QGrams, QSkipgrams)
            ):
                # 26 letters for unigrams; 26 letters + 2 markers for q>1.
                self.params['alphabet'] = sum(
                    28 ** qval if qval > 1 else 26 for qval in qvals
                )
        else:
            if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)):
                self.params['alphabet'] = sum(
                    28 ** qval if qval > 1 else 26 for qval in qvals
                )
            else:
                self.params['alphabet'] = None

        # Bind self._intersection to the implementation matching the
        # requested intersection type, filling in default metric/threshold.
        if intersection_type == 'soft':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = Levenshtein()
            self._lcprefix = LCPrefix()
            self._intersection = self._soft_intersection  # type: ignore
        elif intersection_type == 'fuzzy':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = Levenshtein()
            if 'threshold' not in self.params:
                self.params['threshold'] = 0.8
            self._intersection = self._fuzzy_intersection  # type: ignore
        elif intersection_type == 'linkage':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = DamerauLevenshtein()
            if 'threshold' not in self.params:
                self.params['threshold'] = 0.1
            self._intersection = (  # type: ignore
                self._group_linkage_intersection
            )
        else:
            self._intersection = self._crisp_intersection  # type: ignore

        # Token state is (re)populated by _tokenize().
        self._src_tokens = Counter()  # type: TCounter[str]
        self._tar_tokens = Counter()  # type: TCounter[str]
        self._population_card_value = 0  # type: float

        # initialize normalizer
        self.normalizer = (
            self._norm_none
        )  # type: Callable[[float, int, float], float]  # noqa: E501

        self._norm_dict = {
            'proportional': self._norm_proportional,
            'log': self._norm_log,
            'exp': self._norm_exp,
            'laplace': self._norm_laplace,
            'inverse': self._norm_inverse,
            'complement': self._norm_complement,
        }

        # initialize values for soft intersection
        self._soft_intersection_precalc = Counter()  # type: TCounter[str]
        self._soft_src_only = Counter()  # type: TCounter[str]
        self._soft_tar_only = Counter()  # type: TCounter[str]
260
261 1
    @staticmethod
    def _norm_none(x: float, _squares: int, _pop: float) -> float:
        # Identity normalizer: return the contingency-table value unchanged.
        return x
264 1
265 1
    @staticmethod
    def _norm_proportional(x: float, _squares: int, pop: float) -> float:
        """Normalize as a proportion of the population (guarding pop < 1)."""
        denominator = pop if pop > 1 else 1
        return x / denominator
268 1
269
    @staticmethod
    def _norm_log(x: float, _squares: int, _pop: float) -> float:
        """Normalize as log(1 + x)."""
        normalized = log1p(x)
        return normalized
272
273 1
    @staticmethod
    def _norm_exp(x: float, _squares: int, _pop: float) -> float:
        """Normalize as e**x."""
        normalized = exp(x)
        return normalized
276 1
277
    @staticmethod
    def _norm_laplace(x: float, squares: int, _pop: float) -> float:
        """Laplace-smooth by adding the number of squares x represents."""
        return squares + x
280
281
    @staticmethod
    def _norm_inverse(x: float, _squares: int, pop: float) -> float:
        """Normalize as 1/x, substituting the population size when x == 0."""
        if x:
            return 1 / x
        return pop
284
285
    @staticmethod
    def _norm_complement(x: float, _squares: int, pop: float) -> float:
        """Normalize as the complement with respect to the population."""
        complement = pop - x
        return complement
288
289
    def _tokenize(
        self, src: Union[str, TCounter[str]], tar: Union[str, TCounter[str]]
    ) -> '_TokenDistance':
        """Tokenize src & tar and store their token Counters on the instance.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        _TokenDistance
            self, to allow chaining with :meth:`_get_tokens` and the
            cardinality methods

        Examples
        --------
        >>> pe = _TokenDistance()
        >>> pe._tokenize('AT', 'TT')._get_tokens()
        (Counter({'$A': 1, 'AT': 1, 'T#': 1}),
         Counter({'$T': 1, 'TT': 1, 'T#': 1}))


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Retain the raw arguments for measures that need the originals.
        self._src_orig = src
        self._tar_orig = tar

        tokenizer = self.params['tokenizer']
        # A Counter argument is treated as already-tokenized input.
        if isinstance(src, Counter):
            self._src_tokens = src
        else:
            self._src_tokens = tokenizer.tokenize(src).get_counter()
        if isinstance(tar, Counter):
            self._tar_tokens = tar
        else:
            self._tar_tokens = tokenizer.tokenize(tar).get_counter()

        self._population_card_value = self._calc_population_card()

        # Select the normalizer, a function of (x, squares, population),
        # where x is the value in the contingency table square(s) and
        # squares is the number of squares that x represents.
        if (
            'normalizer' in self.params
            and self.params['normalizer'] in self._norm_dict
        ):
            self.normalizer = self._norm_dict[self.params['normalizer']]

        # Invalidate any cached soft-intersection results from a prior pair.
        self._soft_intersection_precalc = Counter()
        self._soft_src_only = Counter()
        self._soft_tar_only = Counter()

        return self
352
353
    def _get_tokens(self) -> Tuple[TCounter[str], TCounter[str]]:
        """Return the src and tar token Counters as a (src, tar) tuple."""
        return (self._src_tokens, self._tar_tokens)
356
357 1
    def _src_card(self) -> float:
        r"""Return the cardinality of the tokens in the source set."""
        if self.params['intersection_type'] == 'soft':
            # Soft cardinalities come from the precomputed soft partition;
            # compute it lazily on first use.
            if not len(self._soft_intersection_precalc):
                self._intersection()
            combined = self._soft_intersection_precalc + self._soft_src_only
            card = sum(abs(count) for count in combined.values())
            return self.normalizer(card, 2, self._population_card_value)
        card = sum(abs(count) for count in self._src_tokens.values())
        return self.normalizer(card, 2, self._population_card_value)
377 1
378
    def _src_only(self) -> TCounter[str]:
        r"""Return the src tokens minus the tar tokens.

        For (multi-)sets S and T, this is :math:`S \setminus T`.
        """
        if self.params['intersection_type'] == 'soft':
            # The soft partition stores src-only tokens directly.
            if not len(self._soft_intersection_precalc):
                self._intersection()
            return self._soft_src_only
        only = self._src_tokens - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            # Non-crisp intersections may exceed the crisp one; remove the
            # surplus so it is not double-counted here.
            only -= self._intersection() - self._crisp_intersection()
        return only
391
392
    def _src_only_card(self) -> float:
        """Return the cardinality of the tokens only in the source set."""
        card = sum(abs(count) for count in self._src_only().values())
        return self.normalizer(card, 1, self._population_card_value)
399
400 1
    def _tar_card(self) -> float:
        r"""Return the cardinality of the tokens in the target set."""
        if self.params['intersection_type'] == 'soft':
            # Soft cardinalities come from the precomputed soft partition;
            # compute it lazily on first use.
            if not len(self._soft_intersection_precalc):
                self._intersection()
            combined = self._soft_intersection_precalc + self._soft_tar_only
            card = sum(abs(count) for count in combined.values())
            return self.normalizer(card, 2, self._population_card_value)
        card = sum(abs(count) for count in self._tar_tokens.values())
        return self.normalizer(card, 2, self._population_card_value)
420 1
421
    def _tar_only(self) -> TCounter[str]:
        r"""Return the tar tokens minus the src tokens.

        For (multi-)sets S and T, this is :math:`T \setminus S`.
        """
        if self.params['intersection_type'] == 'soft':
            # The soft partition stores tar-only tokens directly.
            if not len(self._soft_intersection_precalc):
                self._intersection()
            return self._soft_tar_only
        only = self._tar_tokens - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            # Non-crisp intersections may exceed the crisp one; remove the
            # surplus so it is not double-counted here.
            only -= self._intersection() - self._crisp_intersection()
        return only
434
435 1
    def _tar_only_card(self) -> float:
        """Return the cardinality of the tokens only in the target set."""
        card = sum(abs(count) for count in self._tar_only().values())
        return self.normalizer(card, 1, self._population_card_value)
442
443 1
    def _symmetric_difference(self) -> TCounter[str]:
        r"""Return the symmetric difference of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \triangle T`.
        """
        combined = self._src_only() + self._tar_only()
        return combined
449
450
    def _symmetric_difference_card(self) -> float:
        """Return the cardinality of the symmetric difference."""
        card = sum(
            abs(count) for count in self._symmetric_difference().values()
        )
        return self.normalizer(card, 2, self._population_card_value)
457
458
    def _total(self) -> TCounter[str]:
        """Return the sum of the sets.

        For (multi-)sets S and T, this is :math:`S + T`.

        In the case of multisets, this counts values in the intersection
        twice. In the case of sets, this is identical to the union.
        """
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            # Intersection weights are added twice, mirroring S + T.
            total = self._soft_tar_only + self._soft_src_only
            total += self._soft_intersection_precalc
            total += self._soft_intersection_precalc
            return total
        return self._src_tokens + self._tar_tokens
476
477
    def _total_card(self) -> float:
        """Return the cardinality of the total (sum) of the two sets.

        This normalizes the cardinality of :meth:`_total` itself; the
        complement's cardinality is :meth:`_total_complement_card`.
        """
        return self.normalizer(
            sum(abs(val) for val in self._total().values()),
            3,
            self._population_card_value,
        )
484
485
    def _total_complement_card(self) -> float:
        """Return the cardinality of the complement of the total."""
        # No alphabet configured: the complement is undefined, so report 0.
        if self.params['alphabet'] is None:
            return self.normalizer(0, 1, self._population_card_value)
        elif isinstance(self.params['alphabet'], Counter):
            # Alphabet given as a multiset: complement is the alphabet minus
            # the total, clamped to be non-negative.
            return self.normalizer(
                max(
                    0,
                    sum(
                        abs(val)
                        for val in (
                            self.params['alphabet'] - self._total()
                        ).values()
                    ),
                ),
                1,
                self._population_card_value,
            )
        # Alphabet given as an int cardinality. NOTE(review): len(.values())
        # counts *distinct* tokens in the total, not their summed
        # multiplicities — presumably intentional for q-gram profiles where
        # most tokens occur once, but verify against callers.
        return self.normalizer(
            max(0, self.params['alphabet'] - len(self._total().values())),
            1,
            self._population_card_value,
        )
508
509
    def _calc_population_card(self) -> float:
        """Return the cardinality of the population.

        The population is computed with no normalization and a crisp
        intersection regardless of the configured settings, which are saved
        beforehand and restored afterward.
        """
        saved_normalizer = self.normalizer
        saved_intersection_type = self.params['intersection_type']
        self.normalizer = self._norm_none
        self.params['intersection_type'] = 'crisp'
        population = self._total_card() + self._total_complement_card()
        self.normalizer = saved_normalizer
        self.params['intersection_type'] = saved_intersection_type
        return population
519 1
520
    def _population_card(self) -> float:
        """Return the cardinality of the population."""
        pop = self._population_card_value
        return self.normalizer(pop, 4, pop)
525 1
526 1
    def _population_unique_card(self) -> float:
        """Return the cardinality of the population minus the intersection."""
        pop = self._population_card_value
        return self.normalizer(pop - self._intersection_card(), 4, pop)
533 1
534 1
    def _union(self) -> TCounter[str]:
        r"""Return the union of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \cup T`.
        """
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            # Union = both exclusive parts plus the intersection once.
            union = self._soft_tar_only + self._soft_src_only
            union += self._soft_intersection_precalc
            return union
        union = self._total() - self._intersection()
        if self.params['intersection_type'] != 'crisp':
            # Non-crisp intersections may exceed the crisp one; remove the
            # surplus so it is not double-counted in the union.
            union -= self._intersection() - self._crisp_intersection()
        return union
551 1
552 1
    def _union_card(self) -> float:
        """Return the cardinality of the union."""
        card = sum(abs(count) for count in self._union().values())
        return self.normalizer(card, 3, self._population_card_value)
559
560
    def _difference(self) -> TCounter[str]:
        """Return the difference of the tokens, supporting negative values."""
        # Counter.subtract (unlike the - operator) retains negative counts.
        if self.params['intersection_type'] == 'soft':
            if not len(self._soft_intersection_precalc):
                self._intersection()
            diff = Counter(self._soft_src_only)
            diff.subtract(self._soft_tar_only)
            return diff
        diff = Counter(self._src_tokens)
        diff.subtract(self._tar_tokens)
        return diff
571 1
572
    def _crisp_intersection(self) -> TCounter[str]:
        r"""Return the (ordinary) intersection of tokens from src and tar.

        For (multi-)sets S and T, this is :math:`S \cap T`.
        """
        common = self._src_tokens & self._tar_tokens
        return common
578 1
579
    def _soft_intersection(self) -> TCounter[str]:
        """Return the soft source, target, & intersection tokens & weights.

        This implements the soft intersection defined by :cite:`Russ:2014` in
        a way that can reproduce the results in the paper.

        Side effects: stores the resulting partition on the instance as
        ``_soft_src_only``, ``_soft_tar_only``, and
        ``_soft_intersection_precalc`` for retrieval by the cardinality
        methods.
        """
        # The soft intersection needs a character-level alignment, so the
        # metric must expose an alignment() method (e.g. Levenshtein).
        if not hasattr(self.params['metric'], 'alignment'):
            raise TypeError(
                "Soft similarity requires a 'metric' with an alignment \
member function, such as Levenshtein."
            )

        # Start from the crisp partition; only the exclusive tokens are
        # candidates for soft pairing.
        intersection = self._crisp_intersection()
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        # Residual (unmatched) fragments produced by alignment, added back
        # to the exclusive sets at the end.
        src_new = Counter()  # type: TCounter[str]
        tar_new = Counter()  # type: TCounter[str]

        def _membership(src: str, tar: str) -> float:
            # Pairing score in [0, 1]: the better of (length minus edit
            # distance) and the longest common prefix, relative to the
            # longer token's length.
            greater_length = max(len(src), len(tar))
            return cast(
                float,
                max(
                    greater_length - self.params['metric'].dist_abs(src, tar),
                    self._lcprefix.dist_abs(src, tar),
                )
                / greater_length,
            )

        def _token_src_tar_int(
            src: str, tar: str
        ) -> Tuple[str, float, str, float, str, float]:
            # Split an aligned token pair into three weighted fragments:
            # src-only residue, tar-only residue, and the matched (shared)
            # part, each weighted by its fraction of the alignment length.
            src_tok = []
            tar_tok = []
            int_tok = []

            src_val = 0.0
            tar_val = 0.0
            int_val = 0.0

            _cost, _src, _tar = self.params['metric'].alignment(src, tar)

            for i in range(len(_src)):
                if _src[i] == _tar[i]:
                    # Matched position: goes to the intersection fragment.
                    src_tok.append('-')
                    tar_tok.append('-')
                    int_tok.append(_src[i])
                    int_val += 1
                else:
                    # Mismatch/gap: goes to the respective residue fragment.
                    src_tok.append(_src[i])
                    if _src[i] != '-':
                        src_val += 1
                    tar_tok.append(_tar[i])
                    if _tar[i] != '-':
                        tar_val += 1
                    int_tok.append('-')

            # Convert counts to fractions of the alignment length.
            src_val /= len(_src)
            tar_val /= len(_src)
            int_val /= len(_src)

            # Strip gap characters from the fragment edges.
            src_tok_str = ''.join(src_tok).strip('-')
            tar_tok_str = ''.join(tar_tok).strip('-')
            int_tok_str = ''.join(int_tok).strip('-')

            return (
                src_tok_str,
                src_val,
                tar_tok_str,
                tar_val,
                int_tok_str,
                int_val,
            )

        # Dictionary ordering is important for reproducibility, so insertion
        # order needs to be controlled and retained.
        memberships = OrderedDict(
            ((src, tar), _membership(src, tar))
            for src, tar in sorted(product(src_only, tar_only))
        )

        # Greedily pair the highest-membership (src, tar) token pairs; ties
        # resolve to the earliest-inserted (lexicographically first) pair.
        while memberships:
            src_tok, tar_tok = max(memberships, key=memberships.get)
            if memberships[src_tok, tar_tok] > 0.0:
                pairings = min(src_only[src_tok], tar_only[tar_tok])
                if pairings:
                    (
                        src_ntok,
                        src_val,
                        tar_ntok,
                        tar_val,
                        int_ntok,
                        int_val,
                    ) = _token_src_tar_int(src_tok, tar_tok)

                    src_new[src_ntok] += src_val * pairings  # type: ignore
                    tar_new[tar_ntok] += tar_val * pairings  # type: ignore
                    intersection[int_ntok] += int_val * pairings  # type: ignore

                    # Remove pairings from src_only/tar_only
                    src_only[src_tok] -= pairings
                    tar_only[tar_tok] -= pairings

            del memberships[src_tok, tar_tok]

        # Add src_new/tar_new back into src_only/tar_only
        src_only += src_new
        tar_only += tar_new

        # Save src_only/tar_only to the instance for retrieval later.
        self._soft_src_only = src_only
        self._soft_tar_only = tar_only
        self._soft_intersection_precalc = intersection

        return intersection
695 1
696 1
    def _fuzzy_intersection(self) -> TCounter[str]:
        r"""Return the fuzzy intersection of the tokens in src and tar.

        This implements the fuzzy intersection defined by :cite:`Wang:2014`.

        For two sets X and Y, the intersection :cite:`Wang:2014` is the sum of
        similarities of all tokens in the two sets that are greater than or
        equal to some threshold value (:math:`\delta`).

        The lower bound of this intersection, and its value when
        :math:`\delta = 1.0`, is the crisp intersection. Tokens shorter than
        :math:`\frac{\delta}{1-\delta}`, 4 in the case of the default threshold
        :math:`\delta = 0.8`, must match exactly to be included in the
        intersection.


        .. versionadded:: 0.4.0

        """
        # Start from the crisp partition; only exclusive tokens are
        # candidates for fuzzy pairing.
        intersection = self._crisp_intersection()
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        # Score every cross pair whose similarity meets the threshold.
        # Iterating in sorted order keeps results reproducible.
        pair = {}
        for src_tok in sorted(src_only):
            for tar_tok in sorted(tar_only):
                sim = self.params['metric'].sim(src_tok, tar_tok)
                if sim >= self.params['threshold']:
                    pair[(src_tok, tar_tok)] = sim

        # Order pairs by similarity (descending), breaking ties by src then
        # tar token (ascending); the three stable sorts compose this key.
        ordered_keys = [(pair[_], _[0], _[1]) for _ in pair]
        ordered_keys.sort(key=lambda x: x[2])
        ordered_keys.sort(key=lambda x: x[1])
        ordered_keys.sort(key=lambda x: x[0], reverse=True)

        # Greedily consume pairings, splitting each pair's similarity
        # equally between the two tokens in the intersection.
        for _, src_tok, tar_tok in ordered_keys:
            pairings = min(src_only[src_tok], tar_only[tar_tok])
            if pairings:
                sim = pair[(src_tok, tar_tok)]

                intersection[src_tok] += sim / 2 * pairings
                intersection[tar_tok] += sim / 2 * pairings

                src_only[src_tok] -= pairings
                tar_only[tar_tok] -= pairings

        """
        # Here is a slightly different optimization method, which is even
        # greedier than the above.
        # ordered by sim*pairings rather than just sim

        pair = {}
        for src_tok in sorted(src_only):
            for tar_tok in sorted(tar_only):
                sim = self.params['metric'].sim(src_tok, tar_tok)
                if sim >= self.params['threshold']:
                    pairings = min(src_only[src_tok], tar_only[tar_tok])
                    pair[(src_tok, tar_tok)] = sim*pairings

        for src_tok, tar_tok in sorted(pair, key=pair.get, reverse=True):
            pairings = min(src_only[src_tok], tar_only[tar_tok])
            if pairings:
                sim = pair[(src_tok, tar_tok)]

                intersection[src_tok] += sim / 2
                intersection[tar_tok] += sim / 2

                src_only[src_tok] -= pairings
                tar_only[tar_tok] -= pairings
        """

        return intersection
768
769
    def _group_linkage_intersection(self) -> TCounter[str]:
        r"""Return the group linkage intersection of the tokens in src and tar.

        This is based on group linkage, as defined by :cite:`On:2007`.

        Most of this method is concerned with solving the assignment problem,
        in order to find the weight of the maximum weight bipartite matching.
        The Hungarian algorithm of Munkres :cite:`Munkres:1957`, implemented
        below in Python & Numpy is used to solve the assignment problem since
        it is roughly twice as fast as SciPy's implementation.

        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Corrected the Hungarian algorithm & optimized it so that SciPy's
            version is no longer needed.

        """
        # Start from the exact-match intersection; fuzzy-matched pairs of
        # leftover tokens are added to it below.
        intersection = self._crisp_intersection()
        src_only_tok = sorted(self._src_tokens - self._tar_tokens)
        tar_only_tok = sorted(self._tar_tokens - self._src_tokens)
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        # Quoted text below is from Munkres (1957), cited above.

        # Pre-preliminaries: create the square matrix of scores.
        # Rows are tar-only tokens, columns are src-only tokens; entries are
        # *distances* (the algorithm minimizes cost, i.e. maximizes
        # similarity). Padding rows/columns of zeros square the matrix.
        n = max(len(src_only_tok), len(tar_only_tok))
        arr = np.zeros((n, n), dtype=float)

        for col in range(len(src_only_tok)):
            for row in range(len(tar_only_tok)):
                arr[row, col] = self.params['metric'].dist(
                    src_only_tok[col], tar_only_tok[row]
                )

        # Pad the token lists with empty strings to match the square matrix.
        src_only_tok += [''] * (n - len(src_only_tok))
        tar_only_tok += [''] * (n - len(tar_only_tok))

        starred = np.zeros((n, n), dtype=np.bool_)
        primed = np.zeros((n, n), dtype=np.bool_)
        row_covered = np.zeros(n, dtype=np.bool_)
        col_covered = np.zeros(n, dtype=np.bool_)

        # Preserve similarities before arr is reduced in place below; these
        # are read back when scoring the final matching.
        orig_sim = 1 - np.copy(arr)
        # Preliminaries:
        # P: "No lines are covered; no zeros are starred or primed."
        # P: "Consider a row of matrix A; subtract from each element in
        # this row the smallest element of this row. Do the same for each
        # row of A."
        arr -= arr.min(axis=1, keepdims=True)
        # P: "Then consider each column of the resulting matrix and
        # subtract from each column its smallest entry."
        arr -= arr.min(axis=0, keepdims=True)

        # P: "Consider a zero Z of the matrix. If there is no starred zero
        # in its row and none in its column, star Z. Repeat, considering
        # each zero in the matrix in turn. Then cover every column
        # containing a starred zero.
        for col in range(n):
            for row in range(n):
                if arr[row, col] == 0:
                    if (
                        np.count_nonzero(starred[row, :]) == 0
                        and np.count_nonzero(starred[:, col]) == 0
                    ):
                        starred[row, col] = True
                        col_covered[col] = True

        step = 1
        # This is the simple case where independent assignments are obvious
        # and found without the rest of the algorithm.
        if np.count_nonzero(col_covered) == n:
            step = 4

        while step < 4:
            if step == 1:
                # Step 1:
                # 1: "Choose a non-covered zero and prime it. Consider the
                # row containing it. If there is no starred zero in this
                # row, go at once to Step 2. If there is a starred zero Z
                # in this row, cover this row and uncover the column of Z."
                # 1: Repeat until all zeros are covered. Go to Step 3."
                zeros = tuple(zip(*((arr == 0).nonzero())))
                while step == 1:
                    for row, col in zeros:
                        if not (col_covered[col] | row_covered[row]):
                            primed[row, col] = True
                            z_cols = (starred[row, :]).nonzero()[0]
                            if not z_cols.size:
                                step = 2
                                break
                            else:
                                row_covered[row] = True
                                col_covered[z_cols[0]] = False

                    if step != 1:
                        break

                    # If every zero is now covered, move on to Step 3;
                    # otherwise (the for-loop broke early) keep priming.
                    for row, col in zeros:
                        if not (col_covered[col] | row_covered[row]):
                            break
                    else:
                        step = 3

            if step == 2:
                # Step 2:
                # 2: "There is a sequence of alternating starred and primed
                # zeros, constructed as follows: Let Z_0 denote the
                # uncovered 0'. [There is only one.] Let Z_1 denote the 0*
                # in Z_0's column (if any). Let Z_2 denote the 0' in Z_1's
                # row (we must prove that it exists). Let Z_3 denote the 0*
                # in Z_2's column (if any). Similarly continue until the
                # sequence stops at a 0', Z_{2k}, which has no 0* in its
                # column."
                # NOTE: `zeros` is always bound here, since step 2 is only
                # reachable from step 1 within this while-loop.
                z_series = []
                for row, col in zeros:  # pragma: no branch
                    if primed[row, col] and not (
                        row_covered[row] | col_covered[col]
                    ):
                        z_series.append((row, col))
                        break
                col = z_series[-1][1]
                while True:
                    row_content = tuple(
                        set((arr[:, col] == 0).nonzero()[0])
                        & set((starred[:, col]).nonzero()[0])
                    )
                    if row_content:
                        row = row_content[0]
                        z_series.append((row, col))
                        col = tuple(
                            set((arr[row, :] == 0).nonzero()[0])
                            & set((primed[row, :]).nonzero()[0])
                        )[0]
                        z_series.append((row, col))
                    else:
                        break

                # 2: "Unstar each starred zero of the sequence and star
                # each primed zero of the sequence. Erase all primes,
                # uncover every row, and cover every column containing a
                # 0*."
                primed[:, :] = False
                row_covered[:] = False
                col_covered[:] = False
                for row, col in z_series:
                    starred[row, col] = not starred[row, col]
                for col in range(n):
                    if np.count_nonzero(starred[:, col]):
                        col_covered[col] = True
                # 2: "If all columns are covered, the starred zeros form
                # the desired independent set. Otherwise, return to Step
                # 1."
                if np.count_nonzero(col_covered) == n:
                    step = 4
                else:
                    step = 1

            if step == 3:
                # Step 3:
                # 3: "Let h denote the smallest non-covered element of the
                # matrix; it will be positive. Add h to each covered row;
                # then subtract h from each uncovered column."
                h_val = float('inf')
                for col in range(n):
                    if not (col_covered[col]):
                        for row in range(n):
                            if (
                                not (row_covered[row])
                                and arr[row, col] < h_val
                            ):
                                h_val = arr[row, col]
                for row in range(n):
                    if row_covered[row]:
                        arr[row, :] += h_val
                for col in range(n):
                    if not (col_covered[col]):
                        arr[:, col] -= h_val

                # 3: "Return to Step 1, without altering any asterisks,
                # primes, or covered lines."
                step = 1

        # The starred zeros now give the maximum-weight matching. Each
        # matched pair above the similarity threshold contributes half its
        # similarity (times multiplicity) to each of its two tokens.
        for row, col in tuple(zip(*(starred.nonzero()))):
            sim = orig_sim[row, col]
            if sim >= self.params['threshold']:
                score = float(
                    (sim / 2)
                    * min(
                        src_only[src_only_tok[col]],
                        tar_only[tar_only_tok[row]],
                    )
                )
                intersection[src_only_tok[col]] += score  # type: ignore
                intersection[tar_only_tok[row]] += score  # type: ignore

        return intersection
966
967
    def _intersection_card(self) -> float:
        """Return the cardinality of the intersection.

        The absolute values of the intersection counts are summed and then
        passed through the instance's normalizer.
        """
        magnitude = sum(
            abs(count) for count in self._intersection().values()
        )
        return self.normalizer(magnitude, 1, self._population_card_value)
974
975
    def _intersection(self) -> TCounter[str]:
        """Return the intersection of the src and tar token multisets.

        This function may be overridden by setting the intersection_type
        during initialization; the default is the crisp intersection.
        """
        crisp = self._crisp_intersection()  # pragma: no cover
        return crisp
982
983
    def _get_confusion_table(self) -> ConfusionTable:
        """Return the token counts as a ConfusionTable object."""
        # ConfusionTable's positional order is (tp, tn, fp, fn).
        true_pos = self._intersection_card()
        true_neg = self._total_complement_card()
        false_pos = self._src_only_card()
        false_neg = self._tar_only_card()
        return ConfusionTable(true_pos, true_neg, false_pos, false_neg)
991
992
993
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest

    doctest.testmod()
997