# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.distance._aline.

ALINE alignment, similarity, and distance
"""

from copy import deepcopy
from typing import Any, Callable, Dict, List, Tuple, Union, cast

from numpy import float_, inf, zeros

from ._distance import _Distance

__all__ = ['ALINE']


class ALINE(_Distance):
    r"""ALINE alignment, similarity, and distance.

    ALINE alignment was developed by
    :cite:`Kondrak:2000,Kondrak:2002,Downey:2008`, and establishes an
    alignment algorithm based on multivalued phonetic features and feature
    salience weights. Along with the alignment itself, the algorithm
    produces a term similarity score.

    :cite:`Downey:2008` develops ALINE's similarity score into a similarity
    measure and a distance measure:

        .. math::

            sim_{ALINE} = \frac{2 \cdot score_{ALINE}(src, tar)}
            {score_{ALINE}(src, src) + score_{ALINE}(tar, tar)}

    However, because the average of the two self-similarity scores is not
    guaranteed to be greater than or equal to the similarity score between
    the two strings, this formula is not used here by default, in order to
    guarantee that the similarity measure is bounded to [0, 1]. Instead,
    Kondrak's similarity measure is employed:

        .. math::

            sim_{ALINE} = \frac{score_{ALINE}(src, tar)}
            {max(score_{ALINE}(src, src), score_{ALINE}(tar, tar))}


    .. versionadded:: 0.4.0
    """

    # The three dicts below are mostly copied from NLTK's implementation
    # https://www.nltk.org/_modules/nltk/metrics/aline.html
    # But their values have been restored, as much as possible, to the
    # reference values supplied in Kondrak's paper.
    feature_weights = {
        # place
        'bilabial': 1.0,
        'labiodental': 0.95,
        'dental': 0.9,
        'alveolar': 0.85,
        'retroflex': 0.8,
        'palato-alveolar': 0.75,
        'palatal': 0.7,
        'velar': 0.6,
        'uvular': 0.5,
        'pharyngeal': 0.3,
        'glottal': 0.1,
        # manner
        'stop': 1.0,
        'affricate': 0.9,
        'fricative': 0.8,
        'approximant': 0.6,
        'trill': 0.55,  # not in original
        'tap': 0.5,  # not in original
        'high vowel': 0.4,
        'mid vowel': 0.2,
        'low vowel': 0.0,
        # high
        'high': 1.0,
        'mid': 0.5,
        'low': 0.0,
        # back
        'front': 1.0,
        'central': 0.5,
        'back': 0.0,
        # binary features
        'plus': 1.0,
        'minus': 0.0,
    }

    v_features = {
        'syllabic',
        'nasal',
        'retroflex',
        'high',
        'back',
        'round',
        'long',
    }
    c_features = {
        'syllabic',
        'manner',
        'voice',
        'nasal',
        'retroflex',
        'lateral',
        'aspirated',
        'place',
    }

    salience = {
        'syllabic': 5,
        'voice': 10,
        'lateral': 10,
        'high': 5,
        'manner': 50,
        'long': 1,
        'place': 40,
        'nasal': 10,
        'aspirated': 5,
        'back': 5,
        'retroflex': 10,
        'round': 5,
    }

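    # A worked reading of these tables (derived from the values above, not
    # from a published example): /p/ and /b/ are both bilabial stops and
    # differ only in voicing, so their feature difference is
    # |1.0 - 0.0| * salience['voice'] = 10, and substituting /p/ for /b/
    # scores c_sub - 10 = 25 under the default costs set in __init__ below.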
    phones_ipa = {
        'p': {
            'place': 'bilabial',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'b': {
            'place': 'bilabial',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        't': {
            'place': 'alveolar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'd': {
            'place': 'alveolar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʈ': {
            'place': 'retroflex',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'plus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɖ': {
            'place': 'retroflex',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'plus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'c': {
            'place': 'palatal',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɟ': {
            'place': 'palatal',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'k': {
            'place': 'velar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'g': {
            'place': 'velar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'q': {
            'place': 'uvular',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɢ': {
            'place': 'uvular',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʔ': {
            'place': 'glottal',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'm': {
            'place': 'bilabial',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɱ': {
            'place': 'labiodental',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'n': {
            'place': 'alveolar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɳ': {
            'place': 'retroflex',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'plus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɲ': {
            'place': 'palatal',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ŋ': {
            'place': 'velar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɴ': {
            'place': 'uvular',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʙ': {
            'place': 'bilabial',
            'manner': 'trill',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'r': {
            'place': 'alveolar',
            'manner': 'trill',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'plus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʀ': {
            'place': 'uvular',
            'manner': 'trill',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɾ': {
            'place': 'alveolar',
            'manner': 'tap',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɽ': {
            'place': 'retroflex',
            'manner': 'tap',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'plus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɸ': {
            'place': 'bilabial',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'β': {
            'place': 'bilabial',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'f': {
            'place': 'labiodental',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'v': {
            'place': 'labiodental',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'θ': {
            'place': 'dental',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ð': {
            'place': 'dental',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        's': {
            'place': 'alveolar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'z': {
            'place': 'alveolar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʃ': {
            'place': 'palato-alveolar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʒ': {
            'place': 'palato-alveolar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʂ': {
            'place': 'retroflex',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'plus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʐ': {
            'place': 'retroflex',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'plus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ç': {
            'place': 'palatal',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʝ': {
            'place': 'palatal',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'x': {
            'place': 'velar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɣ': {
            'place': 'velar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'χ': {
            'place': 'uvular',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʁ': {
            'place': 'uvular',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ħ': {
            'place': 'pharyngeal',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ʕ': {
            'place': 'pharyngeal',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'h': {
            'place': 'glottal',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɦ': {
            'place': 'glottal',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɬ': {
            'place': 'alveolar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'plus',
            'aspirated': 'minus',
        },
        'ɮ': {
            'place': 'alveolar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'plus',
            'aspirated': 'minus',
        },
        'ʋ': {
            'place': 'labiodental',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɹ': {
            'place': 'alveolar',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɻ': {
            'place': 'retroflex',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'plus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'j': {
            'place': 'palatal',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'ɰ': {
            'place': 'velar',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
        },
        'l': {
            'place': 'alveolar',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'plus',
            'aspirated': 'minus',
        },
        'w': {
            'place': 'velar',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'aspirated': 'minus',
            'double': 'bilabial',
        },
        'i': {
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'front',
            'round': 'minus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'y': {
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'front',
            'round': 'plus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'e': {
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'front',
            'round': 'minus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'ø': {
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'front',
            'round': 'plus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'ɛ': {
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'front',
            'round': 'minus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'œ': {
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'front',
            'round': 'plus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'æ': {
            'manner': 'low vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'low',
            'back': 'front',
            'round': 'minus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'a': {
            'manner': 'low vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'low',
            'back': 'front',
            'round': 'minus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'ɨ': {
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'central',
            'round': 'minus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'ʉ': {
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'central',
            'round': 'plus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'ə': {
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'central',
            'round': 'minus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'u': {
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'back',
            'round': 'plus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'o': {
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'back',
            'round': 'plus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'ɔ': {
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'back',
            'round': 'plus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'ɒ': {
            'manner': 'low vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'low',
            'back': 'back',
            'round': 'minus',
            'long': 'minus',
            'aspirated': 'minus',
        },
        'ː': {'long': 'plus', 'supplemental': 'True'},
        'ʰ': {'aspirated': 'plus', 'supplemental': 'True'},
    }  # type: Dict[str, Dict[str, str]]

    phones_kondrak = {
        'a': {
            'place': 'velar',
            'manner': 'low vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'low',
            'back': 'central',
            'round': 'minus',
        },
        'b': {
            'place': 'bilabial',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'c': {
            'place': 'alveolar',
            'manner': 'affricate',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'd': {
            'place': 'alveolar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'e': {
            'place': 'palatal',
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'front',
            'round': 'minus',
        },
        'f': {
            'place': 'labiodental',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'g': {
            'place': 'velar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'h': {
            'place': 'glottal',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'i': {
            'place': 'palatal',
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'front',
            'round': 'plus',
        },
        'j': {
            'place': 'alveolar',
            'manner': 'affricate',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'k': {
            'place': 'velar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'l': {
            'place': 'alveolar',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'plus',
        },
        'm': {
            'place': 'bilabial',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'n': {
            'place': 'alveolar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'plus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'o': {
            'place': 'velar',
            'manner': 'mid vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'mid',
            'back': 'back',
            'round': 'plus',
        },
        'p': {
            'place': 'bilabial',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'q': {
            'place': 'glottal',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'r': {
            'place': 'retroflex',
            'manner': 'approximant',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'plus',
            'lateral': 'minus',
        },
        's': {
            'place': 'alveolar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        't': {
            'place': 'alveolar',
            'manner': 'stop',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'u': {
            'place': 'velar',
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'back',
            'round': 'plus',
        },
        'v': {
            'place': 'labiodental',
            'manner': 'fricative',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'w': {
            'place': 'velar',
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'back',
            'round': 'plus',
            'double': 'bilabial',
        },
        'x': {
            'place': 'velar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'minus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'y': {
            'place': 'velar',
            'manner': 'high vowel',
            'syllabic': 'plus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
            'high': 'high',
            'back': 'front',
            'round': 'minus',
        },
        'z': {
            'place': 'alveolar',
            'manner': 'fricative',
            'syllabic': 'minus',
            'voice': 'plus',
            'nasal': 'minus',
            'retroflex': 'minus',
            'lateral': 'minus',
        },
        'A': {'aspirated': 'plus', 'supplemental': 'True'},
        'B': {'back': 'back', 'supplemental': 'True'},
        'C': {'back': 'central', 'supplemental': 'True'},
        'D': {'place': 'dental', 'supplemental': 'True'},
        'F': {'back': 'front', 'supplemental': 'True'},
        'H': {'long': 'plus', 'supplemental': 'True'},
        'N': {'nasal': 'plus', 'supplemental': 'True'},
        'P': {'place': 'palatal', 'supplemental': 'True'},
        'R': {'round': 'plus', 'supplemental': 'True'},
        'S': {'manner': 'fricative', 'supplemental': 'True'},
        'V': {'place': 'palato-alveolar', 'supplemental': 'True'},
    }  # type: Dict[str, Dict[str, str]]

    def __init__(
        self,
        epsilon: float = 0.0,
        c_skip: float = -10,
        c_sub: float = 35,
        c_exp: float = 45,
        c_vwl: float = 10,
        mode: str = 'local',
        phones: str = 'aline',
        normalizer: Callable[[List[float]], float] = max,
        **kwargs: Any
    ) -> None:
        """Initialize ALINE instance.

        Parameters
        ----------
        epsilon : float
            The portion (out of 1.0) of the maximum ALINE score, above which
            alignments are returned. If set to 0, only the alignments
            matching the maximum alignment score are returned. If set to 1,
            all alignments scoring 0 or higher are returned.
        c_skip : float
            The cost of an insertion or deletion
        c_sub : float
            The cost of a substitution
        c_exp : float
            The cost of an expansion or contraction
        c_vwl : float
            The additional cost of a vowel substitution, expansion, or
            contraction
        mode : str
            Alignment mode, which can be ``local`` (default), ``global``,
            ``half-local``, or ``semi-global``
        phones : str
            Phonetic symbol set, which can be:
                - ``aline`` selects Kondrak's original symbol set
                - ``ipa`` selects IPA symbols
        normalizer : function
            A function that takes a list and computes a normalization term
            by which the edit distance is divided (max by default). For the
            normalization proposed by Downey, et al. (2008), set this to:
            ``lambda x: sum(x)/len(x)``
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(ALINE, self).__init__(**kwargs)
        self._epsilon = epsilon
        self._c_skip = c_skip
        self._c_sub = c_sub
        self._c_exp = c_exp
        self._c_vwl = c_vwl
        self._mode = mode
        if self._mode not in {'local', 'global', 'half-local', 'semi-global'}:
            self._mode = 'local'
        if phones == 'ipa':
            self._phones = self.phones_ipa
        else:
            self._phones = self.phones_kondrak
        self._normalizer = normalizer

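    # A construction sketch (illustrative, not a doctest from the library):
    # the defaults reproduce Kondrak's local alignment with
    # max-normalization; the normalization of Downey et al. (2008) described
    # in the class docstring can be requested with
    #
    #     cmp = ALINE(normalizer=lambda x: sum(x) / len(x))
    #
    # and a global-alignment scorer over IPA input with
    #
    #     cmp = ALINE(mode='global', phones='ipa')
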
    def alignment(self, src: str, tar: str) -> Tuple[float, str, str]:
        """Return the top ALINE alignment of two strings.

        The `top` ALINE alignment is the first alignment with the best
        score. The purpose of this method is to offer a single tuple as a
        return value.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        tuple(float, str, str)
            ALINE alignment and its score

        Examples
        --------
        >>> cmp = ALINE()
        >>> cmp.alignment('cat', 'hat')
        (50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')
        >>> cmp.alignment('niall', 'neil')
        (90.0, '‖ n i a ll ‖', '‖ n e i l  ‖')
        >>> cmp.alignment('aluminum', 'catalan')
        (81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')
        >>> cmp.alignment('atcg', 'tagc')
        (65.0, '‖ a t c ‖ g', 't ‖ a g c ‖')


        .. versionadded:: 0.4.1

        """
        return cast(List[Tuple[float, str, str]], self.alignments(src, tar))[0]

    def alignments(
        self, src: str, tar: str, score_only: bool = False
    ) -> Union[float, List[Tuple[float, str, str]]]:
        """Return the ALINE alignments of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        score_only : bool
            Return the score only, not the alignments

        Returns
        -------
        list(tuple(float, str, str)) or float
            ALINE alignments and their scores, or the top score alone

        Examples
        --------
        >>> cmp = ALINE()
        >>> cmp.alignments('cat', 'hat')
        [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
        >>> cmp.alignments('niall', 'neil')
        [(90.0, '‖ n i a ll ‖', '‖ n e i l  ‖')]
        >>> cmp.alignments('aluminum', 'catalan')
        [(81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')]
        >>> cmp.alignments('atcg', 'tagc')
        [(65.0, '‖ a t c ‖ g', 't ‖ a g c ‖'), (65.0, 'a ‖ tc - g ‖',
        '‖ t  a g ‖ c')]


        .. versionadded:: 0.4.0
        .. versionchanged:: 0.4.1
            Renamed from .alignment to .alignments

        """

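        # The inner functions below realize ALINE's scoring model, read
        # against Kondrak's description: _sig_skip is the constant skip
        # (indel) score, _sig_sub the substitution score, _sig_exp the score
        # for expanding/compressing one segment against two, _sig_vwl the
        # additional vowel penalty, and _delta the salience-weighted feature
        # difference between two segments.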
        def _sig_skip(*args: Any) -> float:
            return self._c_skip

        def _sig_sub(seg1: Dict[str, float], seg2: Dict[str, float]) -> float:
            return (
                self._c_sub
                - _delta(seg1, seg2)
                - _sig_vwl(seg1)
                - _sig_vwl(seg2)
            )

        def _sig_exp(
            seg1: Dict[str, float],
            seg2a: Dict[str, float],
            seg2b: Dict[str, float],
        ) -> float:
            return (
                self._c_exp
                - _delta(seg1, seg2a)
                - _delta(seg1, seg2b)
                - _sig_vwl(seg1)
                - max(_sig_vwl(seg2a), _sig_vwl(seg2b))
            )

        def _sig_vwl(seg1: Dict[str, float]) -> float:
            return (
                0.0
                if seg1['manner'] > self.feature_weights['high vowel']
                else self._c_vwl
            )

        def _delta(seg1: Dict[str, float], seg2: Dict[str, float]) -> float:
            features = (
                self.c_features
                if max(seg1['manner'], seg2['manner'])
                > self.feature_weights['high vowel']
                else self.v_features
            )
            diff = 0.0
            for f in features:
                diff += (
                    abs(seg1.get(f, 0.0) - seg2.get(f, 0.0)) * self.salience[f]
                )
            return diff

        def _retrieve(
            i: int, j: int, score: float, out: List[Tuple[str, str]]
        ) -> None:
            def _record(score: float, out: List[Tuple[str, str]]) -> None:
                out.append(('‖', '‖'))
                for i1 in range(i - 1, -1, -1):
                    out.append((src_tok[i1], ''))
                for j1 in range(j - 1, -1, -1):
                    out.append(('', tar_tok[j1]))
                if self._mode == 'global':
                    score += (i + j) * _sig_skip('')

                out = out[::-1]

                src_alignment = []
                tar_alignment = []

                out.append(('‖', '‖'))
                part = 0
                s_segment = ''  # type: Union[str, List[str]]
                t_segment = ''  # type: Union[str, List[str]]
                for ss, ts in out:
                    if ss == '‖':
                        if part % 2 == 0:
                            src_alignment.append(s_segment)
                            tar_alignment.append(t_segment)
                            s_segment = []
                            t_segment = []
                        else:
                            src_alignment.append(' '.join(s_segment))
                            tar_alignment.append(' '.join(t_segment))
                            s_segment = ''
                            t_segment = ''
                        part += 1
                    else:
                        if part % 2 == 0:
                            s_segment = cast(str, s_segment) + ss
                            t_segment = cast(str, t_segment) + ts
                        else:
                            cast(List[str], s_segment).append(
                                ss + ' ' * (len(ts) - len(ss))
                            )
                            cast(List[str], t_segment).append(
                                ts + ' ' * (len(ss) - len(ts))
                            )

                src_alignment_str = ' ‖ '.join(
                    cast(List[str], src_alignment)
                ).strip()
                tar_alignment_str = ' ‖ '.join(
                    cast(List[str], tar_alignment)
                ).strip()

                alignments.append(
                    (score, src_alignment_str, tar_alignment_str)
                )
                return

            if s_mat[i, j] == 0:
                _record(score, out)
                return
            else:
                if (
                    i > 0
                    and j > 0
                    and s_mat[i - 1, j - 1]
                    + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1])
                    + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append((src_tok[i - 1], tar_tok[j - 1]))
                    _retrieve(
                        i - 1,
                        j - 1,
                        score
                        + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1]),
                        loc_out,
                    )
                    loc_out.pop()
                if (
                    j > 0
                    and s_mat[i, j - 1] + _sig_skip(tar_tok[j - 1]) + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append(('-', tar_tok[j - 1]))
                    _retrieve(
                        i, j - 1, score + _sig_skip(tar_tok[j - 1]), loc_out
                    )
                    loc_out.pop()
                if (
                    i > 0
                    and j > 1
                    and s_mat[i - 1, j - 2]
                    + _sig_exp(
                        src_feat_wt[i - 1],
                        tar_feat_wt[j - 2],
                        tar_feat_wt[j - 1],
                    )
                    + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append(
                        (src_tok[i - 1], tar_tok[j - 2] + tar_tok[j - 1],)
                    )
                    _retrieve(
                        i - 1,
                        j - 2,
                        score
                        + _sig_exp(
                            src_feat_wt[i - 1],
                            tar_feat_wt[j - 2],
                            tar_feat_wt[j - 1],
                        ),
                        loc_out,
                    )
                    loc_out.pop()
                if (
                    i > 0
                    and s_mat[i - 1, j] + _sig_skip(src_tok[i - 1]) + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append((src_tok[i - 1], '-'))
                    _retrieve(
                        i - 1, j, score + _sig_skip(src_tok[i - 1]), loc_out
                    )
                    loc_out.pop()
                if (
                    i > 1
                    and j > 0
                    and s_mat[i - 2, j - 1]
                    + _sig_exp(
                        tar_feat_wt[j - 1],
                        src_feat_wt[i - 2],
                        src_feat_wt[i - 1],
                    )
                    + score
                    >= threshold
                ):
                    loc_out = deepcopy(out)
                    loc_out.append(
                        (src_tok[i - 2] + src_tok[i - 1], tar_tok[j - 1],)
                    )
                    _retrieve(
                        i - 2,
                        j - 1,
                        score
                        + _sig_exp(
                            tar_feat_wt[j - 1],
                            src_feat_wt[i - 2],
                            src_feat_wt[i - 1],
                        ),
                        loc_out,
                    )
                    loc_out.pop()

        sg_max = 0.0

        src_tok = []  # type: List[str]
        src_feat = []  # type: List[Dict[str, str]]
        tar_tok = []  # type: List[str]
        tar_feat = []  # type: List[Dict[str, str]]

        for ch in src:
            if ch in self._phones:
                src_tok.append(ch)
                src_feat.append(dict(self._phones[ch]))
        for ch in tar:
            if ch in self._phones:
                tar_tok.append(ch)
                tar_feat.append(dict(self._phones[ch]))

        # Merge each supplemental symbol (e.g. a length or aspiration mark)
        # into the nearest preceding base phone, then drop the supplemental
        # entries from the token and feature lists.
        for i in range(1, len(src_feat)):
            if 'supplemental' in src_feat[i]:
                j = i - 1
                while j > -1:
                    if 'supplemental' not in src_feat[j]:
                        src_tok[j] += src_tok[i]
                        for key, value in src_feat[i].items():
                            if key != 'supplemental':
                                src_feat[j][key] = value
                        j = 0
                    j -= 1

        zipped = [
            fb for fb in zip(src_feat, src_tok) if 'supplemental' not in fb[0]
        ]
        if zipped:
            src_feat, src_tok = zip(*zipped)  # type: ignore
        else:
            src_feat, src_tok = [], []

        src_feat_wt = []  # type: List[Dict[str, float]]
        for f_dict in src_feat:
            src_feat_wt.append(
                {
                    key: self.feature_weights[f_dict[key]]
                    for key in f_dict.keys()
                }
            )

        src_len = len(src_tok)

        for i in range(1, len(tar_feat)):
            if 'supplemental' in tar_feat[i]:
                j = i - 1
                while j > -1:
                    if 'supplemental' not in tar_feat[j]:
                        tar_tok[j] += tar_tok[i]
                        for key, value in tar_feat[i].items():
                            if key != 'supplemental':
                                tar_feat[j][key] = value
                        j = 0
                    j -= 1

        zipped = [
            fb for fb in zip(tar_feat, tar_tok) if 'supplemental' not in fb[0]
        ]
        if zipped:
            tar_feat, tar_tok = zip(*zipped)  # type: ignore
        else:
            tar_feat, tar_tok = [], []

        tar_feat_wt = []  # type: List[Dict[str, float]]
        for f_dict in tar_feat:
            tar_feat_wt.append(
                {
                    key: self.feature_weights[f_dict[key]]
                    for key in f_dict.keys()
                }
            )

        tar_len = len(tar_tok)

        s_mat = zeros((src_len + 1, tar_len + 1), dtype=float_)

        if self._mode == 'global':
            for i in range(1, src_len + 1):
                s_mat[i, 0] = s_mat[i - 1, 0] + _sig_skip(src_tok[i - 1])
            for j in range(1, tar_len + 1):
                s_mat[0, j] = s_mat[0, j - 1] + _sig_skip(tar_tok[j - 1])

        # Fill the dynamic-programming score matrix; besides skips and
        # substitutions, expansions may align one segment with two adjacent
        # segments on the other side.
        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                s_mat[i, j] = max(
                    s_mat[i - 1, j] + _sig_skip(src_feat_wt[i - 1]),
                    s_mat[i, j - 1] + _sig_skip(tar_feat_wt[j - 1]),
                    s_mat[i - 1, j - 1]
                    + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1]),
                    s_mat[i - 1, j - 2]
                    + _sig_exp(
                        src_feat_wt[i - 1],
                        tar_feat_wt[j - 2],
                        tar_feat_wt[j - 1],
                    )
                    if j > 1
                    else -inf,
                    s_mat[i - 2, j - 1]
                    + _sig_exp(
                        tar_feat_wt[j - 1],
                        src_feat_wt[i - 2],
                        src_feat_wt[i - 1],
                    )
                    if i > 1
                    else -inf,
                    0 if self._mode in {'local', 'half-local'} else -inf,
                )

                if s_mat[i, j] > sg_max:
                    if self._mode == 'semi-global':
                        if i == src_len or j == tar_len:
                            sg_max = s_mat[i, j]
                    else:
                        sg_max = s_mat[i, j]

        if self._mode in {'global', 'half-local'}:
            dp_score = s_mat[src_len, tar_len]
        else:
            dp_score = s_mat.max()

        if score_only:
            return cast(float, dp_score)

        threshold = (1 - self._epsilon) * dp_score

        alignments = []  # type: List[Tuple[float, str, str]]

        # Start a traceback from every cell whose score meets the threshold,
        # subject to the mode's constraints on where alignments may end.
        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                if self._mode in {'global', 'half-local'} and (
                    i < src_len or j < tar_len
                ):
                    continue
                if self._mode == 'semi-global' and (
                    i < src_len and j < tar_len
                ):
                    continue
                if s_mat[i, j] >= threshold:
                    out = []
                    for j1 in range(tar_len - 1, j - 1, -1):
                        out.append(('', tar_tok[j1]))
                    for i1 in range(src_len - 1, i - 1, -1):
                        out.append((src_tok[i1], ''))
                    out.append(('‖', '‖'))
                    _retrieve(i, j, 0, out)

        return sorted(alignments, key=lambda _: _[0], reverse=True)

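    # A retrieval sketch (hypothetical call, not a doctest): because the
    # traceback keeps every alignment scoring at least
    # (1 - epsilon) * optimum, ALINE(epsilon=0.05).alignments(src, tar)
    # returns all alignments within 5% of the best score, sorted best-first.
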
    def sim_score(self, src: str, tar: str) -> float:
        """Return the ALINE alignment score of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            ALINE alignment score

        Examples
        --------
        >>> cmp = ALINE()
        >>> cmp.sim_score('cat', 'hat')
        50.0
        >>> cmp.sim_score('niall', 'neil')
        90.0
        >>> cmp.sim_score('aluminum', 'catalan')
        81.5
        >>> cmp.sim_score('atcg', 'tagc')
        65.0


        .. versionadded:: 0.4.0

        """
        if src == '' and tar == '':
            return 1.0
        return cast(float, self.alignments(src, tar, score_only=True))

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized ALINE similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized ALINE similarity

        Examples
        --------
        >>> cmp = ALINE()
        >>> cmp.sim('cat', 'hat')
        0.5882352941176471
        >>> cmp.sim('niall', 'neil')
        0.6666666666666666
        >>> cmp.sim('aluminum', 'catalan')
        0.4075
        >>> cmp.sim('atcg', 'tagc')
        0.5416666666666666


        .. versionadded:: 0.4.0

        """
        num = self.sim_score(src, tar)
        if num:
            return num / self._normalizer(
                [self.sim_score(src, src), self.sim_score(tar, tar)]
            )
        return 0.0

if __name__ == '__main__':
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)