Completed
Push — master ( 643512...2b6b3e )
by Chris
20:40 queued 10:36
created

_TokenDistance._soft_intersection()   C

Complexity

Conditions 9

Size

Total Lines 106
Code Lines 72

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 58
CRAP Score 9

Importance

Changes 0
Metric Value
eloc 72
dl 0
loc 106
ccs 58
cts 58
cp 1
rs 5.583
c 0
b 0
f 0
cc 9
nop 1
crap 9

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method and Decompose Conditional.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018-2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.distance._token_distance.
20
21
The distance._token_distance._TokenDistance module implements abstract class
22
_TokenDistance.
23
"""
24
25 1
from __future__ import (
26
    absolute_import,
27
    division,
28
    print_function,
29
    unicode_literals,
30
)
31
32 1
from collections import Counter, OrderedDict
33 1
from itertools import product
34 1
from math import exp, log1p
35
36 1
import numpy as np
37 1
from numpy import zeros as np_zeros
38
39 1
from ._damerau_levenshtein import DamerauLevenshtein
40 1
from ._distance import _Distance
41 1
from ._lcprefix import LCPrefix
42 1
from ._levenshtein import Levenshtein
43 1
from ..stats import ConfusionTable
44 1
from ..tokenizer import QGrams, QSkipgrams, WhitespaceTokenizer
45
46 1
__all__ = ['_TokenDistance']
47
48
49 1
class _TokenDistance(_Distance):
50
    r"""Abstract Token Distance class.
51
52
    .. _confusion_table:
53
54
    +----------------+--------------+-----------------+-------+
55
    |                | |in| ``tar`` | |notin| ``tar`` |       |
56
    +----------------+--------------+-----------------+-------+
57
    | |in| ``src``   | |a|          | |b|             | |a+b| |
58
    +----------------+--------------+-----------------+-------+
59
    | |notin| ``src``| |c|          | |d|             | |c+d| |
60
    +----------------+--------------+-----------------+-------+
61
    |                | |a+c|        | |b+d|           | |n|   |
62
    +----------------+--------------+-----------------+-------+
63
64
    .. |in| replace:: :math:`x \in`
65
    .. |notin| replace:: :math:`x \notin`
66
67
    .. |a| replace:: :math:`a = |X \cap Y|`
68
    .. |b| replace:: :math:`b = |X\setminus Y|`
69
    .. |c| replace:: :math:`c = |Y \setminus X|`
70
    .. |d| replace:: :math:`d = |(N\setminus X)\setminus Y|`
71
    .. |n| replace:: :math:`n = |N|`
72
    .. |a+b| replace:: :math:`p_1 = a+b = |X|`
73
    .. |a+c| replace:: :math:`p_2 = a+c = |Y|`
74
    .. |c+d| replace:: :math:`q_1 = c+d = |N\setminus X|`
75
    .. |b+d| replace:: :math:`q_2 = b+d = |N\setminus Y|`
76
77
    .. versionadded:: 0.3.6
78
    """
79
80 1
    def __init__(self, tokenizer=None, intersection_type='crisp', **kwargs):
        r"""Initialize _TokenDistance instance.

        .. _intersection_type:

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:

                - ``crisp``: Ordinary intersection, wherein items are entirely
                  members or non-members of the intersection. (Default)
                - ``fuzzy``: Fuzzy intersection, defined by :cite:`Wang:2014`,
                  wherein items can be partially members of the intersection
                  if their similarity meets or exceeds a threshold value. This
                  also takes `metric` (by default :class:`Levenshtein()`) and
                  `threshold` (by default 0.8) parameters.
                - ``soft``: Soft intersection, defined by :cite:`Russ:2014`,
                  wherein items can be partially members of the intersection
                  depending on their similarity. This also takes a `metric`
                  (by default :class:`DamerauLevenshtein()`) parameter.
                - ``linkage``: Group linkage, defined by :cite:`On:2007`. Like
                  the soft intersection, items can be partially members of the
                  intersection, but the method of pairing similar members is
                  somewhat more complex. See the cited paper for details. This
                  also takes `metric`
                  (by default :class:`DamerauLevenshtein()`) and `threshold`
                  (by default 0.1) parameters.
        **kwargs
            Arbitrary keyword arguments


        .. _alphabet:

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.

                - If a Counter is supplied, it is used directly in computing
                  the complement of the tokens in both sets.
                - If a collection is supplied, it is converted to a Counter
                  and used directly. In the case of a single string being
                  supplied and the QGram tokenizer being used, the full
                  alphabet is inferred (i.e.
                  :math:`len(set(alphabet+QGrams.start\_stop))^{QGrams.qval}`
                  is used as the cardinality of the full alphabet).
                - If an int is supplied, it is used as the cardinality of the
                  full alphabet.
                - If None is supplied, the cardinality of the full alphabet
                  is inferred if QGram or QSkipgrams tokenization is used (i.e.
                  :math:`28^{QGrams.qval}` is used as the cardinality of the
                  full alphabet or :math:`26` if QGrams.qval is 1, which
                  assumes the strings are English language strings and only
                  contain letters of a single case). Otherwise, the cardinality
                  of the complement of the total will be 0.
        normalizer : str
            This represents the normalization applied to the values in the
            2x2 contingency table prior to any of the cardinality (\*_card)
            methods returning a value. By default, no normalization is applied,
            but the following values are supported:

                - ``proportional`` : :math:`\frac{x}{n}`, where n is the total
                  population
                - ``log`` : :math:`log(1+x)`
                - ``exp`` : :math:`e^x`
                - ``laplace`` : :math:`x+1`
                - ``inverse`` : :math:`\frac{1}{x}`
                - ``complement`` : :math:`n-x`, where n is the total population

        .. versionadded:: 0.4.0

        """
        super(_TokenDistance, self).__init__(
            intersection_type=intersection_type, **kwargs
        )

        # Default tokenizer: whitespace when qval == 0, otherwise q-grams.
        qval = 2 if 'qval' not in self.params else self.params['qval']
        self.params['tokenizer'] = (
            tokenizer
            if tokenizer is not None
            else WhitespaceTokenizer()
            if qval == 0
            else QGrams(qval=qval, start_stop='$#', skip=0, scaler=None)
        )

        # Collect the tokenizer's q value(s) as a list (a tokenizer may
        # carry a single int or an iterable of q values).
        if hasattr(self.params['tokenizer'], 'qval'):
            if isinstance(self.params['tokenizer'].qval, int):
                qvals = [self.params['tokenizer'].qval]
            else:
                qvals = list(self.params['tokenizer'].qval)
        else:
            qvals = []

        # Resolve the alphabet parameter to a Counter, an int cardinality,
        # or None, per the rules documented above.
        if 'alphabet' in self.params:
            if isinstance(self.params['alphabet'], str):
                self.params['alphabet'] = set(self.params['alphabet'])
                if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)):
                    # Include start/stop symbols, then use |alphabet|^q.
                    self.params['alphabet'] |= set(
                        self.params['tokenizer'].start_stop
                    )
                    self.params['alphabet'] = sum(
                        len(self.params['alphabet']) ** qval for qval in qvals
                    )
            if hasattr(self.params['alphabet'], '__len__') and not isinstance(
                self.params['alphabet'], Counter
            ):
                # Plain collections collapse to their cardinality.
                self.params['alphabet'] = len(self.params['alphabet'])
            elif self.params['alphabet'] is None and isinstance(
                self.params['tokenizer'], (QGrams, QSkipgrams)
            ):
                # 26 letters for unigrams; 28 (letters + start/stop) otherwise.
                self.params['alphabet'] = sum(
                    28 ** qval if qval > 1 else 26 for qval in qvals
                )
        else:
            if isinstance(self.params['tokenizer'], (QGrams, QSkipgrams)):
                self.params['alphabet'] = sum(
                    28 ** qval if qval > 1 else 26 for qval in qvals
                )
            else:
                self.params['alphabet'] = None

        # Bind the intersection strategy and its default metric/threshold.
        if intersection_type == 'soft':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = Levenshtein()
            self._lcprefix = LCPrefix()
            self._intersection = self._soft_intersection
        elif intersection_type == 'fuzzy':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = Levenshtein()
            if 'threshold' not in self.params:
                self.params['threshold'] = 0.8
            self._intersection = self._fuzzy_intersection
        elif intersection_type == 'linkage':
            if 'metric' not in self.params or self.params['metric'] is None:
                self.params['metric'] = DamerauLevenshtein()
            if 'threshold' not in self.params:
                self.params['threshold'] = 0.1
            self._intersection = self._group_linkage_intersection
        else:
            self._intersection = self._crisp_intersection

        self._src_tokens = Counter()
        self._tar_tokens = Counter()
        self._population_card_value = 0

        # initialize normalizer
        self.normalizer = self._norm_none

        self._norm_dict = {
            'proportional': self._norm_proportional,
            'log': self._norm_log,
            'exp': self._norm_exp,
            'laplace': self._norm_laplace,
            'inverse': self._norm_inverse,
            'complement': self._norm_complement,
        }

        # initialize values for soft intersection
        # (None is the sentinel for "not yet computed")
        self._soft_intersection_precalc = None
        self._soft_src_only = None
        self._soft_tar_only = None
254
255 1
    def _norm_none(self, x, _squares, _pop):
256 1
        return x
257
258 1
    def _norm_proportional(self, x, _squares, pop):
259 1
        return x / max(1, pop)
260
261 1
    def _norm_log(self, x, _squares, _pop):
262 1
        return log1p(x)
263
264 1
    def _norm_exp(self, x, _squares, _pop):
265 1
        return exp(x)
266
267 1
    def _norm_laplace(self, x, squares, _pop):
268 1
        return x + squares
269
270 1
    def _norm_inverse(self, x, _squares, pop):
271 1
        return 1 / x if x else pop
272
273 1
    def _norm_complement(self, x, _squares, pop):
274 1
        return pop - x
275
276 1
    def _tokenize(self, src, tar):
        """Return the Q-Grams in src & tar.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        tuple of Counters
            Q-Grams

        Examples
        --------
        >>> pe = _TokenDistance()
        >>> pe._tokenize('AT', 'TT')._get_tokens()
        (Counter({'$A': 1, 'AT': 1, 'T#': 1}),
         Counter({'$T': 1, 'TT': 1, 'T#': 1}))


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Keep the raw inputs for measures that need the original strings.
        self._src_orig = src
        self._tar_orig = tar

        # Counters pass through untouched; anything else is tokenized.
        if isinstance(src, Counter):
            self._src_tokens = src
        else:
            self._src_tokens = (
                self.params['tokenizer'].tokenize(src).get_counter()
            )
        if isinstance(tar, Counter):
            self._tar_tokens = tar
        else:
            self._tar_tokens = (
                self.params['tokenizer'].tokenize(tar).get_counter()
            )

        # The population cardinality is fixed per (src, tar) pair and is
        # computed once, before any normalizer is installed.
        self._population_card_value = self._calc_population_card()

        # Set up the normalizer, a function of two variables:
        # x is the value in the contingency table square(s)
        # n is the number of squares that x represents
        if (
            'normalizer' in self.params
            and self.params['normalizer'] in self._norm_dict
        ):
            self.normalizer = self._norm_dict[self.params['normalizer']]

        # clear values for soft intersection
        # (None marks them as not-yet-computed for this token pair)
        self._soft_intersection_precalc = None
        self._soft_src_only = None
        self._soft_tar_only = None

        return self
337
338 1
    def _get_tokens(self):
339
        """Return the src and tar tokens as a tuple."""
340 1
        return self._src_tokens, self._tar_tokens
341
342 1
    def _src_card(self):
343
        r"""Return the cardinality of the tokens in the source set."""
344 1
        if self.params['intersection_type'] == 'soft':
345 1
            if not self._soft_intersection_precalc:
346 1
                self._intersection()
347 1
            return self.normalizer(
348
                sum(
349
                    abs(val)
350
                    for val in (
351
                        self._soft_intersection_precalc + self._soft_src_only
352
                    ).values()
353
                ),
354
                2,
355
                self._population_card_value,
356
            )
357 1
        return self.normalizer(
358
            sum(abs(val) for val in self._src_tokens.values()),
359
            2,
360
            self._population_card_value,
361
        )
362
363 1
    def _src_only(self):
364
        r"""Return the src tokens minus the tar tokens.
365
366
        For (multi-)sets S and T, this is :math:`S \setminus T`.
367
        """
368 1
        if self.params['intersection_type'] == 'soft':
369 1
            if not self._soft_intersection_precalc:
370 1
                self._intersection()
371 1
            return self._soft_src_only
372 1
        src_only = self._src_tokens - self._intersection()
373 1
        if self.params['intersection_type'] != 'crisp':
374 1
            src_only -= self._intersection() - self._crisp_intersection()
375 1
        return src_only
376
377 1
    def _src_only_card(self):
378
        """Return the cardinality of the tokens only in the source set."""
379 1
        return self.normalizer(
380
            sum(abs(val) for val in self._src_only().values()),
381
            1,
382
            self._population_card_value,
383
        )
384
385 1
    def _tar_card(self):
386
        r"""Return the cardinality of the tokens in the target set."""
387 1
        if self.params['intersection_type'] == 'soft':
388 1
            if not self._soft_intersection_precalc:
389 1
                self._intersection()
390 1
            return self.normalizer(
391
                sum(
392
                    abs(val)
393
                    for val in (
394
                        self._soft_intersection_precalc + self._soft_tar_only
395
                    ).values()
396
                ),
397
                2,
398
                self._population_card_value,
399
            )
400 1
        return self.normalizer(
401
            sum(abs(val) for val in self._tar_tokens.values()),
402
            2,
403
            self._population_card_value,
404
        )
405
406 1
    def _tar_only(self):
407
        r"""Return the tar tokens minus the src tokens.
408
409
        For (multi-)sets S and T, this is :math:`T \setminus S`.
410
        """
411 1
        if self.params['intersection_type'] == 'soft':
412 1
            if not self._soft_intersection_precalc:
413 1
                self._intersection()
414 1
            return self._soft_tar_only
415 1
        tar_only = self._tar_tokens - self._intersection()
416 1
        if self.params['intersection_type'] != 'crisp':
417 1
            tar_only -= self._intersection() - self._crisp_intersection()
418 1
        return tar_only
419
420 1
    def _tar_only_card(self):
421
        """Return the cardinality of the tokens only in the target set."""
422 1
        return self.normalizer(
423
            sum(abs(val) for val in self._tar_only().values()),
424
            1,
425
            self._population_card_value,
426
        )
427
428 1
    def _symmetric_difference(self):
429
        r"""Return the symmetric difference of tokens from src and tar.
430
431
        For (multi-)sets S and T, this is :math:`S \triangle T`.
432
        """
433 1
        return self._src_only() + self._tar_only()
434
435 1
    def _symmetric_difference_card(self):
436
        """Return the cardinality of the symmetric difference."""
437 1
        return self.normalizer(
438
            sum(abs(val) for val in self._symmetric_difference().values()),
439
            2,
440
            self._population_card_value,
441
        )
442
443 1
    def _total(self):
444
        """Return the sum of the sets.
445
446
        For (multi-)sets S and T, this is :math:`S + T`.
447
448
        In the case of multisets, this counts values in the interesection
449
        twice. In the case of sets, this is identical to the union.
450
        """
451 1
        if self.params['intersection_type'] == 'soft':
452 1
            if not self._soft_intersection_precalc:
453 1
                self._intersection()
454 1
            return (
455
                self._soft_tar_only
456
                + self._soft_src_only
457
                + self._soft_intersection_precalc
458
                + self._soft_intersection_precalc
459
            )
460 1
        return self._src_tokens + self._tar_tokens
461
462 1
    def _total_card(self):
463
        """Return the cardinality of the complement of the total."""
464 1
        return self.normalizer(
465
            sum(abs(val) for val in self._total().values()),
466
            3,
467
            self._population_card_value,
468
        )
469
470 1
    def _total_complement_card(self):
        """Return the cardinality of the complement of the total."""
        # No alphabet configured: the complement is defined to be empty.
        if self.params['alphabet'] is None:
            return self.normalizer(0, 1, self._population_card_value)
        elif isinstance(self.params['alphabet'], Counter):
            # Counter alphabet: multiset difference, clamped at zero.
            return self.normalizer(
                max(
                    0,
                    sum(
                        abs(val)
                        for val in (
                            self.params['alphabet'] - self._total()
                        ).values()
                    ),
                ),
                1,
                self._population_card_value,
            )
        # Int alphabet: subtract the number of DISTINCT tokens present.
        # NOTE(review): len(...) counts distinct tokens, not occurrences —
        # presumably intentional since an int alphabet counts each possible
        # token once; confirm against the Counter branch above.
        return self.normalizer(
            max(0, self.params['alphabet'] - len(self._total().values())),
            1,
            self._population_card_value,
        )
493
494 1
    def _calc_population_card(self):
495
        """Return the cardinality of the population."""
496 1
        save_normalizer = self.normalizer
497 1
        self.normalizer = self._norm_none
498 1
        save_intersection = self.params['intersection_type']
499 1
        self.params['intersection_type'] = 'crisp'
500 1
        pop = self._total_card() + self._total_complement_card()
501 1
        self.normalizer = save_normalizer
502 1
        self.params['intersection_type'] = save_intersection
503 1
        return pop
504
505 1
    def _population_card(self):
506
        """Return the cardinality of the population."""
507 1
        return self.normalizer(
508
            self._population_card_value, 4, self._population_card_value
509
        )
510
511 1
    def _population_unique_card(self):
512
        """Return the cardinality of the population minus the intersection."""
513 1
        return self.normalizer(
514
            self._population_card_value - self._intersection_card(),
515
            4,
516
            self._population_card_value,
517
        )
518
519 1
    def _union(self):
520
        r"""Return the union of tokens from src and tar.
521
522
        For (multi-)sets S and T, this is :math:`S \cup T`.
523
        """
524 1
        if self.params['intersection_type'] == 'soft':
525 1
            if not self._soft_intersection_precalc:
526 1
                self._intersection()
527 1
            return (
528
                self._soft_tar_only
529
                + self._soft_src_only
530
                + self._soft_intersection_precalc
531
            )
532 1
        union = self._total() - self._intersection()
533 1
        if self.params['intersection_type'] != 'crisp':
534 1
            union -= self._intersection() - self._crisp_intersection()
535 1
        return union
536
537 1
    def _union_card(self):
538
        """Return the cardinality of the union."""
539 1
        return self.normalizer(
540
            sum(abs(val) for val in self._union().values()),
541
            3,
542
            self._population_card_value,
543
        )
544
545 1
    def _difference(self):
546
        """Return the difference of the tokens, supporting negative values."""
547 1
        if self.params['intersection_type'] == 'soft':
548 1
            if not self._soft_intersection_precalc:
549 1
                self._intersection()
550 1
            _src_copy = Counter(self._soft_src_only)
551 1
            _src_copy.subtract(self._soft_tar_only)
552 1
            return _src_copy
553 1
        _src_copy = Counter(self._src_tokens)
554 1
        _src_copy.subtract(self._tar_tokens)
555 1
        return _src_copy
556
557 1
    def _crisp_intersection(self):
558
        r"""Return the intersection of tokens from src and tar.
559
560
        For (multi-)sets S and T, this is :math:`S \cap T`.
561
        """
562 1
        return self._src_tokens & self._tar_tokens
563
564 1
    def _soft_intersection(self):
565
        """Return the soft source, target, & intersection tokens & weights.
566
567
        This implements the soft intersection defined by :cite:`Russ:2014` in
568
        a way that can reproduce the results in the paper.
569
        """
570 1
        if not hasattr(self.params['metric'], 'alignment'):
571 1
            raise TypeError(
572
                "Soft similarity requires a 'metric' with an alignment \
573
member function, such as Levenshtein."
574
            )
575
576 1
        intersection = self._crisp_intersection()
577 1
        src_only = self._src_tokens - self._tar_tokens
578 1
        tar_only = self._tar_tokens - self._src_tokens
579
580 1
        src_new = Counter()
581 1
        tar_new = Counter()
582
583 1
        def _membership(src, tar):
584 1
            greater_length = max(len(src), len(tar))
585 1
            return (
586
                max(
587
                    greater_length - self.params['metric'].dist_abs(src, tar),
588
                    self._lcprefix.dist_abs(src, tar),
589
                )
590
                / greater_length
591
            )
592
593 1
        def _token_src_tar_int(src, tar):
594 1
            src_tok = []
595 1
            tar_tok = []
596 1
            int_tok = []
597
598 1
            src_val = 0
599 1
            tar_val = 0
600 1
            int_val = 0
601
602 1
            _cost, _src, _tar = self.params['metric'].alignment(src, tar)
603
604 1
            for i in range(len(_src)):
605 1
                if _src[i] == _tar[i]:
606 1
                    src_tok.append('-')
607 1
                    tar_tok.append('-')
608 1
                    int_tok.append(_src[i])
609 1
                    int_val += 1
610
                else:
611 1
                    src_tok.append(_src[i])
612 1
                    if _src[i] != '-':
613 1
                        src_val += 1
614 1
                    tar_tok.append(_tar[i])
615 1
                    if _tar[i] != '-':
616 1
                        tar_val += 1
617 1
                    int_tok.append('-')
618
619 1
            src_val /= len(_src)
620 1
            tar_val /= len(_src)
621 1
            int_val /= len(_src)
622
623 1
            src_tok = ''.join(src_tok).strip('-')
624 1
            tar_tok = ''.join(tar_tok).strip('-')
625 1
            int_tok = ''.join(int_tok).strip('-')
626
627 1
            return src_tok, src_val, tar_tok, tar_val, int_tok, int_val
628
629
        # Dictionary ordering is important for reproducibility, so insertion
630
        # order needs to be controlled and retained.
631 1
        memberships = OrderedDict(
632
            ((src, tar), _membership(src, tar))
633
            for src, tar in sorted(product(src_only, tar_only))
634
        )
635
636 1
        while memberships:
637 1
            src_tok, tar_tok = max(memberships, key=memberships.get)
638 1
            if memberships[src_tok, tar_tok] > 0.0:
639 1
                pairings = min(src_only[src_tok], tar_only[tar_tok])
640 1
                if pairings:
641 1
                    (
642
                        src_ntok,
643
                        src_val,
644
                        tar_ntok,
645
                        tar_val,
646
                        int_ntok,
647
                        int_val,
648
                    ) = _token_src_tar_int(src_tok, tar_tok)
649
650 1
                    src_new[src_ntok] += src_val * pairings
651 1
                    tar_new[tar_ntok] += tar_val * pairings
652 1
                    intersection[int_ntok] += int_val * pairings
653
654
                    # Remove pairings from src_only/tar_only
655 1
                    src_only[src_tok] -= pairings
656 1
                    tar_only[tar_tok] -= pairings
657
658 1
            del memberships[src_tok, tar_tok]
659
660
        # Add src_new/tar_new back into src_only/tar_only
661 1
        src_only += src_new
662 1
        tar_only += tar_new
663
664
        # Save src_only/tar_only to the instance for retrieval later.
665 1
        self._soft_src_only = src_only
666 1
        self._soft_tar_only = tar_only
667 1
        self._soft_intersection_precalc = intersection
668
669 1
        return intersection
670
671 1
    def _fuzzy_intersection(self):
        r"""Return the fuzzy intersection of the tokens in src and tar.

        This implements the fuzzy intersection defined by :cite:`Wang:2014`.

        For two sets X and Y, the intersection :cite:`Wang:2014` is the sum of
        similarities of all tokens in the two sets that are greater than or
        equal to some threshold value (:math:`\delta`).

        The lower bound on this intersection and the value when
        :math:`\delta = 1.0`, is the crisp intersection. Tokens shorter than
        :math:`\frac{\delta}{1-\delta}`, 4 in the case of the default threshold
        :math:`\delta = 0.8`, must match exactly to be included in the
        intersection.


        .. versionadded:: 0.4.0

        """
        intersection = self._crisp_intersection()
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        # Score every cross pair whose similarity clears the threshold.
        pair = {}
        for src_tok in sorted(src_only):
            for tar_tok in sorted(tar_only):
                sim = self.params['metric'].sim(src_tok, tar_tok)
                if sim >= self.params['threshold']:
                    pair[(src_tok, tar_tok)] = sim

        # Three stable sorts give the order: similarity descending, then src
        # token ascending, then tar token ascending — deterministic pairing.
        ordered_keys = [(pair[_], _[0], _[1]) for _ in pair]
        ordered_keys.sort(key=lambda x: x[2])
        ordered_keys.sort(key=lambda x: x[1])
        ordered_keys.sort(key=lambda x: x[0], reverse=True)

        # Greedily consume pairings; each member contributes half the
        # similarity per pairing to the intersection.
        for _, src_tok, tar_tok in ordered_keys:
            pairings = min(src_only[src_tok], tar_only[tar_tok])
            if pairings:
                sim = pair[(src_tok, tar_tok)]

                intersection[src_tok] += sim / 2 * pairings
                intersection[tar_tok] += sim / 2 * pairings

                src_only[src_tok] -= pairings
                tar_only[tar_tok] -= pairings

        """
        # Here is a slightly different optimization method, which is even
        # greedier than the above.
        # ordered by sim*pairings rather than just sim

        pair = {}
        for src_tok in sorted(src_only):
            for tar_tok in sorted(tar_only):
                sim = self.params['metric'].sim(src_tok, tar_tok)
                if sim >= self.params['threshold']:
                    pairings = min(src_only[src_tok], tar_only[tar_tok])
                    pair[(src_tok, tar_tok)] = sim*pairings

        for src_tok, tar_tok in sorted(pair, key=pair.get, reverse=True):
            pairings = min(src_only[src_tok], tar_only[tar_tok])
            if pairings:
                sim = pair[(src_tok, tar_tok)]

                intersection[src_tok] += sim / 2
                intersection[tar_tok] += sim / 2

                src_only[src_tok] -= pairings
                tar_only[tar_tok] -= pairings
        """

        return intersection
743
744 1
    def _group_linkage_intersection(self):
        r"""Return the group linkage intersection of the tokens in src and tar.

        This is based on group linkage, as defined by :cite:`On:2007`.

        Most of this method is concerned with solving the assignment problem,
        in order to find the weight of the maximum weight bipartite matching.
        The Hungarian algorithm of Munkres :cite:`Munkres:1957`, implemented
        below in Python & Numpy is used to solve the assignment problem since
        it is roughly twice as fast as SciPy's implementation.

        Returns
        -------
        Counter
            The crisp intersection of the two token multisets, augmented by
            half-similarity scores for the optimally matched pairs of
            src-only and tar-only tokens whose similarity meets the
            configured threshold.

        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Corrected the Hungarian algorithm & optimized it so that SciPy's
            version is no longer needed.

        """
        intersection = self._crisp_intersection()
        src_only_tok = sorted(self._src_tokens - self._tar_tokens)
        tar_only_tok = sorted(self._tar_tokens - self._src_tokens)
        src_only = self._src_tokens - self._tar_tokens
        tar_only = self._tar_tokens - self._src_tokens

        # Quoted text below is from Munkres (1957), cited above.

        # Pre-preliminaries: create the square matrix of scores
        n = max(len(src_only_tok), len(tar_only_tok))
        # Use np.zeros consistently (the original mixed np_zeros & np.zeros).
        arr = np.zeros((n, n), dtype=float)

        for col in range(len(src_only_tok)):
            for row in range(len(tar_only_tok)):
                arr[row, col] = self.params['metric'].dist(
                    src_only_tok[col], tar_only_tok[row]
                )

        # Pad the shorter token list with empty strings so that both index
        # the square matrix.
        src_only_tok += [''] * (n - len(src_only_tok))
        tar_only_tok += [''] * (n - len(tar_only_tok))

        # dtype=bool rather than np.bool: the np.bool alias was deprecated
        # in NumPy 1.20 and removed in 1.24.
        starred = np.zeros((n, n), dtype=bool)
        primed = np.zeros((n, n), dtype=bool)
        row_covered = np.zeros(n, dtype=bool)
        col_covered = np.zeros(n, dtype=bool)

        # The matrix holds distances; keep the similarities for the final
        # scoring pass below.
        orig_sim = 1 - np.copy(arr)
        # Preliminaries:
        # P: "No lines are covered; no zeros are starred or primed."
        # P: "Consider a row of matrix A; subtract from each element in
        # this row the smallest element of this row. Do the same for each
        # row of A."
        arr -= arr.min(axis=1, keepdims=True)
        # P: "Then consider each column of the resulting matrix and
        # subtract from each column its smallest entry."
        arr -= arr.min(axis=0, keepdims=True)

        # P: "Consider a zero Z of the matrix. If there is no starred zero
        # in its row and none in its column, star Z. Repeat, considering
        # each zero in the matrix in turn. Then cover every column
        # containing a starred zero.
        for col in range(n):
            for row in range(n):
                if arr[row, col] == 0:
                    if (
                        np.count_nonzero(starred[row, :]) == 0
                        and np.count_nonzero(starred[:, col]) == 0
                    ):
                        starred[row, col] = True
                        col_covered[col] = True

        step = 1
        # This is the simple case where independent assignments are obvious
        # and found without the rest of the algorithm.
        if np.count_nonzero(col_covered) == n:
            step = 4

        # zeros is populated in Step 1 and read again in Step 2; bind it
        # here so the name is always defined (Step 1 necessarily runs
        # before Step 2, but this makes that invariant explicit).
        zeros = ()
        while step < 4:
            if step == 1:
                # Step 1:
                # 1: "Choose a non-covered zero and prime it. Consider the
                # row containing it. If there is no starred zero in this
                # row, go at once to Step 2. If there is a starred zero Z
                # in this row, cover this row and uncover the column of Z."
                # 1: Repeat until all zeros are covered. Go to Step 3."
                zeros = tuple(zip(*((arr == 0).nonzero())))
                while step == 1:
                    for row, col in zeros:
                        if not (col_covered[col] | row_covered[row]):
                            primed[row, col] = True
                            z_cols = (starred[row, :]).nonzero()[0]
                            if not z_cols.size:
                                step = 2
                                break
                            else:
                                row_covered[row] = True
                                col_covered[z_cols[0]] = False

                    if step != 1:
                        break

                    for row, col in zeros:
                        if not (col_covered[col] | row_covered[row]):
                            break
                    else:
                        step = 3

            if step == 2:
                # Step 2:
                # 2: "There is a sequence of alternating starred and primed
                # zeros, constructed as follows: Let Z_0 denote the
                # uncovered 0'. [There is only one.] Let Z_1 denote the 0*
                # in Z_0's column (if any). Let Z_2 denote the 0' in Z_1's
                # row (we must prove that it exists). Let Z_3 denote the 0*
                # in Z_2's column (if any). Similarly continue until the
                # sequence stops at a 0', Z_{2k}, which has no 0* in its
                # column."
                z_series = []
                for row, col in zeros:  # pragma: no branch
                    if primed[row, col] and not (
                        row_covered[row] | col_covered[col]
                    ):
                        z_series.append((row, col))
                        break
                col = z_series[-1][1]
                while True:
                    row = tuple(
                        set((arr[:, col] == 0).nonzero()[0])
                        & set((starred[:, col]).nonzero()[0])
                    )
                    if row:
                        row = row[0]
                        z_series.append((row, col))
                        col = tuple(
                            set((arr[row, :] == 0).nonzero()[0])
                            & set((primed[row, :]).nonzero()[0])
                        )[0]
                        z_series.append((row, col))
                    else:
                        break

                # 2: "Unstar each starred zero of the sequence and star
                # each primed zero of the sequence. Erase all primes,
                # uncover every row, and cover every column containing a
                # 0*."
                primed[:, :] = False
                row_covered[:] = False
                col_covered[:] = False
                for row, col in z_series:
                    starred[row, col] = not starred[row, col]
                for col in range(n):
                    if np.count_nonzero(starred[:, col]):
                        col_covered[col] = True
                # 2: "If all columns are covered, the starred zeros form
                # the desired independent set. Otherwise, return to Step
                # 1."
                if np.count_nonzero(col_covered) == n:
                    step = 4
                else:
                    step = 1

            if step == 3:
                # Step 3:
                # 3: "Let h denote the smallest non-covered element of the
                # matrix; it will be positive. Add h to each covered row;
                # then subtract h from each uncovered column."
                h_val = float('inf')
                for col in range(n):
                    if not (col_covered[col]):
                        for row in range(n):
                            if (
                                not (row_covered[row])
                                and arr[row, col] < h_val
                            ):
                                h_val = arr[row, col]
                for row in range(n):
                    if row_covered[row]:
                        arr[row, :] += h_val
                for col in range(n):
                    if not (col_covered[col]):
                        arr[:, col] -= h_val

                # 3: "Return to Step 1, without altering any asterisks,
                # primes, or covered lines."
                step = 1

        # The starred zeros are the optimal assignment. Credit each matched
        # pair that clears the similarity threshold with half the
        # similarity, weighted by the smaller of the two token counts.
        for row, col in tuple(zip(*(starred.nonzero()))):
            sim = orig_sim[row, col]
            if sim >= self.params['threshold']:
                score = float(
                    (sim / 2)
                    * min(
                        src_only[src_only_tok[col]],
                        tar_only[tar_only_tok[row]],
                    )
                )
                intersection[src_only_tok[col]] += score
                intersection[tar_only_tok[row]] += score

        return intersection
941
942 1
    def _intersection_card(self):
        """Return the cardinality of the intersection."""
        # Sum the absolute token counts of the (possibly fuzzy)
        # intersection, then run the total through the configured
        # normalizer along with the population cardinality.
        raw_card = sum(abs(count) for count in self._intersection().values())
        return self.normalizer(raw_card, 1, self._population_card_value)
949
950 1
    def _intersection(self):
        """Return the intersection.

        This function may be overridden by setting the intersection_type during
        initialization.

        Returns
        -------
        Counter
            The crisp intersection of the two token multisets (default
            dispatch target; a fuzzy variant may be substituted at
            initialization time).

        """
        return self._crisp_intersection()  # pragma: no cover
957
958 1
    def _get_confusion_table(self):
        """Return the token counts as a ConfusionTable object."""
        # Gather the four cardinalities first, then build the table in the
        # constructor's expected order.
        in_both = self._intersection_card()
        in_neither = self._total_complement_card()
        only_src = self._src_only_card()
        only_tar = self._tar_only_card()
        return ConfusionTable(in_both, in_neither, only_src, only_tar)
966
967
968
if __name__ == '__main__':
    # Run this module's doctests when it is executed as a script.
    import doctest

    doctest.testmod()
972