Total Complexity | 80 |
Total Lines | 1748 |
Duplicated Lines | 4.46 % |
Coverage | 100% |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like abydos.distance._aline often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # Copyright 2019-2020 by Christopher C. Little. |
||
2 | # This file is part of Abydos. |
||
3 | # |
||
4 | # Abydos is free software: you can redistribute it and/or modify |
||
5 | # it under the terms of the GNU General Public License as published by |
||
6 | # the Free Software Foundation, either version 3 of the License, or |
||
7 | # (at your option) any later version. |
||
8 | # |
||
9 | # Abydos is distributed in the hope that it will be useful, |
||
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
12 | # GNU General Public License for more details. |
||
13 | # |
||
14 | # You should have received a copy of the GNU General Public License |
||
15 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
16 | |||
17 | """abydos.distance._aline. |
||
18 | |||
19 | 1 | ALINE alignment, similarity, and distance |
|
20 | """ |
||
21 | |||
22 | from copy import deepcopy |
||
23 | from typing import Any, Callable, Dict, List, Tuple, Union, cast |
||
24 | 1 | ||
25 | from numpy import float_, inf, zeros |
||
26 | |||
27 | from ._distance import _Distance |
||
28 | |||
29 | __all__ = ['ALINE'] |
||
30 | |||
31 | 1 | ||
32 | class ALINE(_Distance): |
||
33 | 1 | r"""ALINE alignment, similarity, and distance. |
|
34 | 1 | ||
35 | 1 | ALINE alignment was developed by |
|
36 | :cite:`Kondrak:2000,Kondrak:2002,Downey:2008`, and establishes an |
||
37 | 1 | alignment algorithm based on multivalued phonetic features and feature |
|
38 | salience weights. Along with the alignment itself, the algorithm produces a |
||
39 | 1 | term similarity score. |
|
40 | |||
41 | :cite:`Downey:2008` develops ALINE's similarity score into a similarity |
||
42 | 1 | measure & distance measure: |
|
43 | |||
44 | .. math:: |
||
45 | |||
46 | sim_{ALINE} = \frac{2 \cdot score_{ALINE}(src, tar)} |
||
47 | {score_{ALINE}(src, src) + score_{ALINE}(tar, tar)} |
||
48 | |||
49 | However, because the average of the two self-similarity scores is not |
||
50 | guaranteed to be greater than or equal to the similarity score between |
||
51 | the two strings, by default, this formula is not used here in order to |
||
52 | guarantee that the similarity measure is bounded to [0, 1]. Instead, |
||
53 | Kondrak's similarity measure is employed: |
||
54 | |||
55 | .. math:: |
||
56 | |||
57 | sim_{ALINE} = \frac{score_{ALINE}(src, tar)} |
||
58 | {\max(score_{ALINE}(src, src), score_{ALINE}(tar, tar))} |
||
59 | |||
60 | |||
61 | .. versionadded:: 0.4.0 |
||
62 | """ |
||
63 | |||
64 | # The three dicts below are mostly copied from NLTK's implementation |
||
65 | # https://www.nltk.org/_modules/nltk/metrics/aline.html |
||
66 | # But values have been returned, as much as possible, to the reference |
||
67 | # values supplied in Kondrak's paper. |
||
68 | feature_weights = { |
||
69 | # place |
||
70 | 'bilabial': 1.0, |
||
71 | 'labiodental': 0.95, |
||
72 | 'dental': 0.9, |
||
73 | 'alveolar': 0.85, |
||
74 | 'retroflex': 0.8, |
||
75 | 'palato-alveolar': 0.75, |
||
76 | 'palatal': 0.7, |
||
77 | 'velar': 0.6, |
||
78 | 1 | 'uvular': 0.5, |
|
79 | 'pharyngeal': 0.3, |
||
80 | 'glottal': 0.1, |
||
81 | # manner |
||
82 | 'stop': 1.0, |
||
83 | 'affricate': 0.9, |
||
84 | 'fricative': 0.8, |
||
85 | 'approximant': 0.6, |
||
86 | 'trill': 0.55, # not in original |
||
87 | 'tap': 0.5, # not in original |
||
88 | 'high vowel': 0.4, |
||
89 | 'mid vowel': 0.2, |
||
90 | 'low vowel': 0.0, |
||
91 | # high |
||
92 | 'high': 1.0, |
||
93 | 'mid': 0.5, |
||
94 | 'low': 0.0, |
||
95 | # back |
||
96 | 'front': 1.0, |
||
97 | 'central': 0.5, |
||
98 | 'back': 0.0, |
||
99 | # binary features |
||
100 | 'plus': 1.0, |
||
101 | 'minus': 0.0, |
||
102 | } |
||
103 | |||
104 | v_features = { |
||
105 | 'syllabic', |
||
106 | 'nasal', |
||
107 | 'retroflex', |
||
108 | 'high', |
||
109 | 'back', |
||
110 | 'round', |
||
111 | 'long', |
||
112 | } |
||
113 | c_features = { |
||
114 | 1 | 'syllabic', |
|
115 | 'manner', |
||
116 | 'voice', |
||
117 | 'nasal', |
||
118 | 'retroflex', |
||
119 | 'lateral', |
||
120 | 'aspirated', |
||
121 | 'place', |
||
122 | } |
||
123 | 1 | ||
124 | salience = { |
||
125 | 'syllabic': 5, |
||
126 | 'voice': 10, |
||
127 | 'lateral': 10, |
||
128 | 'high': 5, |
||
129 | 'manner': 50, |
||
130 | 'long': 1, |
||
131 | 'place': 40, |
||
132 | 'nasal': 10, |
||
133 | 'aspirated': 5, |
||
134 | 1 | 'back': 5, |
|
135 | 'retroflex': 10, |
||
136 | 'round': 5, |
||
137 | } |
||
138 | |||
139 | phones_ipa = { |
||
140 | 'p': { |
||
141 | 'place': 'bilabial', |
||
142 | 'manner': 'stop', |
||
143 | 'syllabic': 'minus', |
||
144 | 'voice': 'minus', |
||
145 | 'nasal': 'minus', |
||
146 | 'retroflex': 'minus', |
||
147 | 'lateral': 'minus', |
||
148 | 'aspirated': 'minus', |
||
149 | 1 | }, |
|
150 | 'b': { |
||
151 | 'place': 'bilabial', |
||
152 | 'manner': 'stop', |
||
153 | 'syllabic': 'minus', |
||
154 | 'voice': 'plus', |
||
155 | 'nasal': 'minus', |
||
156 | 'retroflex': 'minus', |
||
157 | 'lateral': 'minus', |
||
158 | 'aspirated': 'minus', |
||
159 | }, |
||
160 | 't': { |
||
161 | 'place': 'alveolar', |
||
162 | 'manner': 'stop', |
||
163 | 'syllabic': 'minus', |
||
164 | 'voice': 'minus', |
||
165 | 'nasal': 'minus', |
||
166 | 'retroflex': 'minus', |
||
167 | 'lateral': 'minus', |
||
168 | 'aspirated': 'minus', |
||
169 | }, |
||
170 | 'd': { |
||
171 | 'place': 'alveolar', |
||
172 | 'manner': 'stop', |
||
173 | 'syllabic': 'minus', |
||
174 | 'voice': 'plus', |
||
175 | 'nasal': 'minus', |
||
176 | 'retroflex': 'minus', |
||
177 | 'lateral': 'minus', |
||
178 | 'aspirated': 'minus', |
||
179 | }, |
||
180 | 'ʈ': { |
||
181 | 'place': 'retroflex', |
||
182 | 'manner': 'stop', |
||
183 | 'syllabic': 'minus', |
||
184 | 'voice': 'minus', |
||
185 | 'nasal': 'minus', |
||
186 | 'retroflex': 'plus', |
||
187 | 'lateral': 'minus', |
||
188 | 'aspirated': 'minus', |
||
189 | }, |
||
190 | 'ɖ': { |
||
191 | 'place': 'retroflex', |
||
192 | 'manner': 'stop', |
||
193 | 'syllabic': 'minus', |
||
194 | 'voice': 'plus', |
||
195 | 'nasal': 'minus', |
||
196 | 'retroflex': 'plus', |
||
197 | 'lateral': 'minus', |
||
198 | 'aspirated': 'minus', |
||
199 | }, |
||
200 | 'c': { |
||
201 | 'place': 'palatal', |
||
202 | 'manner': 'stop', |
||
203 | 'syllabic': 'minus', |
||
204 | 'voice': 'minus', |
||
205 | 'nasal': 'minus', |
||
206 | 'retroflex': 'minus', |
||
207 | 'lateral': 'minus', |
||
208 | 'aspirated': 'minus', |
||
209 | }, |
||
210 | 'ɟ': { |
||
211 | 'place': 'palatal', |
||
212 | 'manner': 'stop', |
||
213 | 'syllabic': 'minus', |
||
214 | 'voice': 'plus', |
||
215 | 'nasal': 'minus', |
||
216 | 'retroflex': 'minus', |
||
217 | 'lateral': 'minus', |
||
218 | 'aspirated': 'minus', |
||
219 | }, |
||
220 | 'k': { |
||
221 | 'place': 'velar', |
||
222 | 'manner': 'stop', |
||
223 | 'syllabic': 'minus', |
||
224 | 'voice': 'minus', |
||
225 | 'nasal': 'minus', |
||
226 | 'retroflex': 'minus', |
||
227 | 'lateral': 'minus', |
||
228 | 'aspirated': 'minus', |
||
229 | }, |
||
230 | 'g': { |
||
231 | 'place': 'velar', |
||
232 | 'manner': 'stop', |
||
233 | 'syllabic': 'minus', |
||
234 | 'voice': 'plus', |
||
235 | 'nasal': 'minus', |
||
236 | 'retroflex': 'minus', |
||
237 | 'lateral': 'minus', |
||
238 | 'aspirated': 'minus', |
||
239 | }, |
||
240 | 'q': { |
||
241 | 'place': 'uvular', |
||
242 | 'manner': 'stop', |
||
243 | 'syllabic': 'minus', |
||
244 | 'voice': 'minus', |
||
245 | 'nasal': 'minus', |
||
246 | 'retroflex': 'minus', |
||
247 | 'lateral': 'minus', |
||
248 | 'aspirated': 'minus', |
||
249 | }, |
||
250 | 'ɢ': { |
||
251 | 'place': 'uvular', |
||
252 | 'manner': 'stop', |
||
253 | 'syllabic': 'minus', |
||
254 | 'voice': 'plus', |
||
255 | 'nasal': 'minus', |
||
256 | 'retroflex': 'minus', |
||
257 | 'lateral': 'minus', |
||
258 | 'aspirated': 'minus', |
||
259 | }, |
||
260 | 'ʔ': { |
||
261 | 'place': 'glottal', |
||
262 | 'manner': 'stop', |
||
263 | 'syllabic': 'minus', |
||
264 | 'voice': 'minus', |
||
265 | 'nasal': 'minus', |
||
266 | 'retroflex': 'minus', |
||
267 | 'lateral': 'minus', |
||
268 | 'aspirated': 'minus', |
||
269 | }, |
||
270 | 'm': { |
||
271 | 'place': 'bilabial', |
||
272 | 'manner': 'stop', |
||
273 | 'syllabic': 'minus', |
||
274 | 'voice': 'plus', |
||
275 | 'nasal': 'plus', |
||
276 | 'retroflex': 'minus', |
||
277 | 'lateral': 'minus', |
||
278 | 'aspirated': 'minus', |
||
279 | }, |
||
280 | 'ɱ': { |
||
281 | 'place': 'labiodental', |
||
282 | 'manner': 'stop', |
||
283 | 'syllabic': 'minus', |
||
284 | 'voice': 'plus', |
||
285 | 'nasal': 'plus', |
||
286 | 'retroflex': 'minus', |
||
287 | 'lateral': 'minus', |
||
288 | 'aspirated': 'minus', |
||
289 | }, |
||
290 | 'n': { |
||
291 | 'place': 'alveolar', |
||
292 | 'manner': 'stop', |
||
293 | 'syllabic': 'minus', |
||
294 | 'voice': 'plus', |
||
295 | 'nasal': 'plus', |
||
296 | 'retroflex': 'minus', |
||
297 | 'lateral': 'minus', |
||
298 | 'aspirated': 'minus', |
||
299 | }, |
||
300 | 'ɳ': { |
||
301 | 'place': 'retroflex', |
||
302 | 'manner': 'stop', |
||
303 | 'syllabic': 'minus', |
||
304 | 'voice': 'plus', |
||
305 | 'nasal': 'plus', |
||
306 | 'retroflex': 'plus', |
||
307 | 'lateral': 'minus', |
||
308 | 'aspirated': 'minus', |
||
309 | }, |
||
310 | 'ɲ': { |
||
311 | 'place': 'palatal', |
||
312 | 'manner': 'stop', |
||
313 | 'syllabic': 'minus', |
||
314 | 'voice': 'plus', |
||
315 | 'nasal': 'plus', |
||
316 | 'retroflex': 'minus', |
||
317 | 'lateral': 'minus', |
||
318 | 'aspirated': 'minus', |
||
319 | }, |
||
320 | 'ŋ': { |
||
321 | 'place': 'velar', |
||
322 | 'manner': 'stop', |
||
323 | 'syllabic': 'minus', |
||
324 | 'voice': 'plus', |
||
325 | 'nasal': 'plus', |
||
326 | 'retroflex': 'minus', |
||
327 | 'lateral': 'minus', |
||
328 | 'aspirated': 'minus', |
||
329 | }, |
||
330 | 'ɴ': { |
||
331 | 'place': 'uvular', |
||
332 | 'manner': 'stop', |
||
333 | 'syllabic': 'minus', |
||
334 | 'voice': 'plus', |
||
335 | 'nasal': 'plus', |
||
336 | 'retroflex': 'minus', |
||
337 | 'lateral': 'minus', |
||
338 | 'aspirated': 'minus', |
||
339 | }, |
||
340 | 'ʙ': { |
||
341 | 'place': 'bilabial', |
||
342 | 'manner': 'trill', |
||
343 | 'syllabic': 'minus', |
||
344 | 'voice': 'plus', |
||
345 | 'nasal': 'minus', |
||
346 | 'retroflex': 'minus', |
||
347 | 'lateral': 'minus', |
||
348 | 'aspirated': 'minus', |
||
349 | }, |
||
350 | 'r': { |
||
351 | 'place': 'alveolar', |
||
352 | 'manner': 'trill', |
||
353 | 'syllabic': 'minus', |
||
354 | 'voice': 'plus', |
||
355 | 'nasal': 'minus', |
||
356 | 'retroflex': 'plus', |
||
357 | 'lateral': 'minus', |
||
358 | 'aspirated': 'minus', |
||
359 | }, |
||
360 | 'ʀ': { |
||
361 | 'place': 'uvular', |
||
362 | 'manner': 'trill', |
||
363 | 'syllabic': 'minus', |
||
364 | 'voice': 'plus', |
||
365 | 'nasal': 'minus', |
||
366 | 'retroflex': 'minus', |
||
367 | 'lateral': 'minus', |
||
368 | 'aspirated': 'minus', |
||
369 | }, |
||
370 | 'ɾ': { |
||
371 | 'place': 'alveolar', |
||
372 | 'manner': 'tap', |
||
373 | 'syllabic': 'minus', |
||
374 | 'voice': 'plus', |
||
375 | 'nasal': 'minus', |
||
376 | 'retroflex': 'minus', |
||
377 | 'lateral': 'minus', |
||
378 | 'aspirated': 'minus', |
||
379 | }, |
||
380 | 'ɽ': { |
||
381 | 'place': 'retroflex', |
||
382 | 'manner': 'tap', |
||
383 | 'syllabic': 'minus', |
||
384 | 'voice': 'plus', |
||
385 | 'nasal': 'minus', |
||
386 | 'retroflex': 'plus', |
||
387 | 'lateral': 'minus', |
||
388 | 'aspirated': 'minus', |
||
389 | }, |
||
390 | 'ɸ': { |
||
391 | 'place': 'bilabial', |
||
392 | 'manner': 'fricative', |
||
393 | 'syllabic': 'minus', |
||
394 | 'voice': 'minus', |
||
395 | 'nasal': 'minus', |
||
396 | 'retroflex': 'minus', |
||
397 | 'lateral': 'minus', |
||
398 | 'aspirated': 'minus', |
||
399 | }, |
||
400 | 'β': { |
||
401 | 'place': 'bilabial', |
||
402 | 'manner': 'fricative', |
||
403 | 'syllabic': 'minus', |
||
404 | 'voice': 'plus', |
||
405 | 'nasal': 'minus', |
||
406 | 'retroflex': 'minus', |
||
407 | 'lateral': 'minus', |
||
408 | 'aspirated': 'minus', |
||
409 | }, |
||
410 | 'f': { |
||
411 | 'place': 'labiodental', |
||
412 | 'manner': 'fricative', |
||
413 | 'syllabic': 'minus', |
||
414 | 'voice': 'minus', |
||
415 | 'nasal': 'minus', |
||
416 | 'retroflex': 'minus', |
||
417 | 'lateral': 'minus', |
||
418 | 'aspirated': 'minus', |
||
419 | }, |
||
420 | 'v': { |
||
421 | 'place': 'labiodental', |
||
422 | 'manner': 'fricative', |
||
423 | 'syllabic': 'minus', |
||
424 | 'voice': 'plus', |
||
425 | 'nasal': 'minus', |
||
426 | 'retroflex': 'minus', |
||
427 | 'lateral': 'minus', |
||
428 | 'aspirated': 'minus', |
||
429 | }, |
||
430 | 'θ': { |
||
431 | 'place': 'dental', |
||
432 | 'manner': 'fricative', |
||
433 | 'syllabic': 'minus', |
||
434 | 'voice': 'minus', |
||
435 | 'nasal': 'minus', |
||
436 | 'retroflex': 'minus', |
||
437 | 'lateral': 'minus', |
||
438 | 'aspirated': 'minus', |
||
439 | }, |
||
440 | 'ð': { |
||
441 | 'place': 'dental', |
||
442 | 'manner': 'fricative', |
||
443 | 'syllabic': 'minus', |
||
444 | 'voice': 'plus', |
||
445 | 'nasal': 'minus', |
||
446 | 'retroflex': 'minus', |
||
447 | 'lateral': 'minus', |
||
448 | 'aspirated': 'minus', |
||
449 | }, |
||
450 | 's': { |
||
451 | 'place': 'alveolar', |
||
452 | 'manner': 'fricative', |
||
453 | 'syllabic': 'minus', |
||
454 | 'voice': 'minus', |
||
455 | 'nasal': 'minus', |
||
456 | 'retroflex': 'minus', |
||
457 | 'lateral': 'minus', |
||
458 | 'aspirated': 'minus', |
||
459 | }, |
||
460 | 'z': { |
||
461 | 'place': 'alveolar', |
||
462 | 'manner': 'fricative', |
||
463 | 'syllabic': 'minus', |
||
464 | 'voice': 'plus', |
||
465 | 'nasal': 'minus', |
||
466 | 'retroflex': 'minus', |
||
467 | 'lateral': 'minus', |
||
468 | 'aspirated': 'minus', |
||
469 | }, |
||
470 | 'ʃ': { |
||
471 | 'place': 'palato-alveolar', |
||
472 | 'manner': 'fricative', |
||
473 | 'syllabic': 'minus', |
||
474 | 'voice': 'minus', |
||
475 | 'nasal': 'minus', |
||
476 | 'retroflex': 'minus', |
||
477 | 'lateral': 'minus', |
||
478 | 'aspirated': 'minus', |
||
479 | }, |
||
480 | 'ʒ': { |
||
481 | 'place': 'palato-alveolar', |
||
482 | 'manner': 'fricative', |
||
483 | 'syllabic': 'minus', |
||
484 | 'voice': 'plus', |
||
485 | 'nasal': 'minus', |
||
486 | 'retroflex': 'minus', |
||
487 | 'lateral': 'minus', |
||
488 | 'aspirated': 'minus', |
||
489 | }, |
||
490 | 'ʂ': { |
||
491 | 'place': 'retroflex', |
||
492 | 'manner': 'fricative', |
||
493 | 'syllabic': 'minus', |
||
494 | 'voice': 'minus', |
||
495 | 'nasal': 'minus', |
||
496 | 'retroflex': 'plus', |
||
497 | 'lateral': 'minus', |
||
498 | 'aspirated': 'minus', |
||
499 | }, |
||
500 | 'ʐ': { |
||
501 | 'place': 'retroflex', |
||
502 | 'manner': 'fricative', |
||
503 | 'syllabic': 'minus', |
||
504 | 'voice': 'plus', |
||
505 | 'nasal': 'minus', |
||
506 | 'retroflex': 'plus', |
||
507 | 'lateral': 'minus', |
||
508 | 'aspirated': 'minus', |
||
509 | }, |
||
510 | 'ç': { |
||
511 | 'place': 'palatal', |
||
512 | 'manner': 'fricative', |
||
513 | 'syllabic': 'minus', |
||
514 | 'voice': 'minus', |
||
515 | 'nasal': 'minus', |
||
516 | 'retroflex': 'minus', |
||
517 | 'lateral': 'minus', |
||
518 | 'aspirated': 'minus', |
||
519 | }, |
||
520 | 'ʝ': { |
||
521 | 'place': 'palatal', |
||
522 | 'manner': 'fricative', |
||
523 | 'syllabic': 'minus', |
||
524 | 'voice': 'plus', |
||
525 | 'nasal': 'minus', |
||
526 | 'retroflex': 'minus', |
||
527 | 'lateral': 'minus', |
||
528 | 'aspirated': 'minus', |
||
529 | }, |
||
530 | 'x': { |
||
531 | 'place': 'velar', |
||
532 | 'manner': 'fricative', |
||
533 | 'syllabic': 'minus', |
||
534 | 'voice': 'minus', |
||
535 | 'nasal': 'minus', |
||
536 | 'retroflex': 'minus', |
||
537 | 'lateral': 'minus', |
||
538 | 'aspirated': 'minus', |
||
539 | }, |
||
540 | 'ɣ': { |
||
541 | 'place': 'velar', |
||
542 | 'manner': 'fricative', |
||
543 | 'syllabic': 'minus', |
||
544 | 'voice': 'plus', |
||
545 | 'nasal': 'minus', |
||
546 | 'retroflex': 'minus', |
||
547 | 'lateral': 'minus', |
||
548 | 'aspirated': 'minus', |
||
549 | }, |
||
550 | 'χ': { |
||
551 | 'place': 'uvular', |
||
552 | 'manner': 'fricative', |
||
553 | 'syllabic': 'minus', |
||
554 | 'voice': 'minus', |
||
555 | 'nasal': 'minus', |
||
556 | 'retroflex': 'minus', |
||
557 | 'lateral': 'minus', |
||
558 | 'aspirated': 'minus', |
||
559 | }, |
||
560 | 'ʁ': { |
||
561 | 'place': 'uvular', |
||
562 | 'manner': 'fricative', |
||
563 | 'syllabic': 'minus', |
||
564 | 'voice': 'plus', |
||
565 | 'nasal': 'minus', |
||
566 | 'retroflex': 'minus', |
||
567 | 'lateral': 'minus', |
||
568 | 'aspirated': 'minus', |
||
569 | }, |
||
570 | 'ħ': { |
||
571 | 'place': 'pharyngeal', |
||
572 | 'manner': 'fricative', |
||
573 | 'syllabic': 'minus', |
||
574 | 'voice': 'minus', |
||
575 | 'nasal': 'minus', |
||
576 | 'retroflex': 'minus', |
||
577 | 'lateral': 'minus', |
||
578 | 'aspirated': 'minus', |
||
579 | }, |
||
580 | 'ʕ': { |
||
581 | 'place': 'pharyngeal', |
||
582 | 'manner': 'fricative', |
||
583 | 'syllabic': 'minus', |
||
584 | 'voice': 'plus', |
||
585 | 'nasal': 'minus', |
||
586 | 'retroflex': 'minus', |
||
587 | 'lateral': 'minus', |
||
588 | 'aspirated': 'minus', |
||
589 | }, |
||
590 | 'h': { |
||
591 | 'place': 'glottal', |
||
592 | 'manner': 'fricative', |
||
593 | 'syllabic': 'minus', |
||
594 | 'voice': 'minus', |
||
595 | 'nasal': 'minus', |
||
596 | 'retroflex': 'minus', |
||
597 | 'lateral': 'minus', |
||
598 | 'aspirated': 'minus', |
||
599 | }, |
||
600 | 'ɦ': { |
||
601 | 'place': 'glottal', |
||
602 | 'manner': 'fricative', |
||
603 | 'syllabic': 'minus', |
||
604 | 'voice': 'plus', |
||
605 | 'nasal': 'minus', |
||
606 | 'retroflex': 'minus', |
||
607 | 'lateral': 'minus', |
||
608 | 'aspirated': 'minus', |
||
609 | }, |
||
610 | 'ɬ': { |
||
611 | 'place': 'alveolar', |
||
612 | 'manner': 'fricative', |
||
613 | 'syllabic': 'minus', |
||
614 | 'voice': 'minus', |
||
615 | 'nasal': 'minus', |
||
616 | 'retroflex': 'minus', |
||
617 | 'lateral': 'plus', |
||
618 | 'aspirated': 'minus', |
||
619 | }, |
||
620 | 'ɮ': { |
||
621 | 'place': 'alveolar', |
||
622 | 'manner': 'fricative', |
||
623 | 'syllabic': 'minus', |
||
624 | 'voice': 'plus', |
||
625 | 'nasal': 'minus', |
||
626 | 'retroflex': 'minus', |
||
627 | 'lateral': 'plus', |
||
628 | 'aspirated': 'minus', |
||
629 | }, |
||
630 | 'ʋ': { |
||
631 | 'place': 'labiodental', |
||
632 | 'manner': 'approximant', |
||
633 | 'syllabic': 'minus', |
||
634 | 'voice': 'plus', |
||
635 | 'nasal': 'minus', |
||
636 | 'retroflex': 'minus', |
||
637 | 'lateral': 'minus', |
||
638 | 'aspirated': 'minus', |
||
639 | }, |
||
640 | 'ɹ': { |
||
641 | 'place': 'alveolar', |
||
642 | 'manner': 'approximant', |
||
643 | 'syllabic': 'minus', |
||
644 | 'voice': 'plus', |
||
645 | 'nasal': 'minus', |
||
646 | 'retroflex': 'minus', |
||
647 | 'lateral': 'minus', |
||
648 | 'aspirated': 'minus', |
||
649 | }, |
||
650 | 'ɻ': { |
||
651 | 'place': 'retroflex', |
||
652 | 'manner': 'approximant', |
||
653 | 'syllabic': 'minus', |
||
654 | 'voice': 'plus', |
||
655 | 'nasal': 'minus', |
||
656 | 'retroflex': 'plus', |
||
657 | 'lateral': 'minus', |
||
658 | 'aspirated': 'minus', |
||
659 | }, |
||
660 | 'j': { |
||
661 | 'place': 'palatal', |
||
662 | 'manner': 'approximant', |
||
663 | 'syllabic': 'minus', |
||
664 | 'voice': 'plus', |
||
665 | 'nasal': 'minus', |
||
666 | 'retroflex': 'minus', |
||
667 | 'lateral': 'minus', |
||
668 | 'aspirated': 'minus', |
||
669 | }, |
||
670 | 'ɰ': { |
||
671 | 'place': 'velar', |
||
672 | 'manner': 'approximant', |
||
673 | 'syllabic': 'minus', |
||
674 | 'voice': 'plus', |
||
675 | 'nasal': 'minus', |
||
676 | 'retroflex': 'minus', |
||
677 | 'lateral': 'minus', |
||
678 | 'aspirated': 'minus', |
||
679 | }, |
||
680 | 'l': { |
||
681 | 'place': 'alveolar', |
||
682 | 'manner': 'approximant', |
||
683 | 'syllabic': 'minus', |
||
684 | 'voice': 'plus', |
||
685 | 'nasal': 'minus', |
||
686 | 'retroflex': 'minus', |
||
687 | 'lateral': 'plus', |
||
688 | 'aspirated': 'minus', |
||
689 | }, |
||
690 | 'w': { |
||
691 | 'place': 'velar', |
||
692 | 'manner': 'approximant', |
||
693 | 'syllabic': 'minus', |
||
694 | 'voice': 'plus', |
||
695 | 'nasal': 'minus', |
||
696 | 'retroflex': 'minus', |
||
697 | 'lateral': 'minus', |
||
698 | 'aspirated': 'minus', |
||
699 | 'double': 'bilabial', |
||
700 | }, |
||
701 | 'i': { |
||
702 | 'manner': 'high vowel', |
||
703 | 'syllabic': 'plus', |
||
704 | 'voice': 'plus', |
||
705 | 'nasal': 'minus', |
||
706 | 'retroflex': 'minus', |
||
707 | 'lateral': 'minus', |
||
708 | 'high': 'high', |
||
709 | 'back': 'front', |
||
710 | 'round': 'minus', |
||
711 | 'long': 'minus', |
||
712 | 'aspirated': 'minus', |
||
713 | }, |
||
714 | 'y': { |
||
715 | 'manner': 'high vowel', |
||
716 | 'syllabic': 'plus', |
||
717 | 'voice': 'plus', |
||
718 | 'nasal': 'minus', |
||
719 | 'retroflex': 'minus', |
||
720 | 'lateral': 'minus', |
||
721 | 'high': 'high', |
||
722 | 'back': 'front', |
||
723 | 'round': 'plus', |
||
724 | 'long': 'minus', |
||
725 | 'aspirated': 'minus', |
||
726 | }, |
||
727 | 'e': { |
||
728 | 'manner': 'mid vowel', |
||
729 | 'syllabic': 'plus', |
||
730 | 'voice': 'plus', |
||
731 | 'nasal': 'minus', |
||
732 | 'retroflex': 'minus', |
||
733 | 'lateral': 'minus', |
||
734 | 'high': 'mid', |
||
735 | 'back': 'front', |
||
736 | 'round': 'minus', |
||
737 | 'long': 'minus', |
||
738 | 'aspirated': 'minus', |
||
739 | }, |
||
740 | 'ø': { |
||
741 | 'manner': 'mid vowel', |
||
742 | 'syllabic': 'plus', |
||
743 | 'voice': 'plus', |
||
744 | 'nasal': 'minus', |
||
745 | 'retroflex': 'minus', |
||
746 | 'lateral': 'minus', |
||
747 | 'high': 'mid', |
||
748 | 'back': 'front', |
||
749 | 'round': 'plus', |
||
750 | 'long': 'minus', |
||
751 | 'aspirated': 'minus', |
||
752 | }, |
||
753 | 'ɛ': { |
||
754 | 'manner': 'mid vowel', |
||
755 | 'syllabic': 'plus', |
||
756 | 'voice': 'plus', |
||
757 | 'nasal': 'minus', |
||
758 | 'retroflex': 'minus', |
||
759 | 'lateral': 'minus', |
||
760 | 'high': 'mid', |
||
761 | 'back': 'front', |
||
762 | 'round': 'minus', |
||
763 | 'long': 'minus', |
||
764 | 'aspirated': 'minus', |
||
765 | }, |
||
766 | 'œ': { |
||
767 | 'manner': 'mid vowel', |
||
768 | 'syllabic': 'plus', |
||
769 | 'voice': 'plus', |
||
770 | 'nasal': 'minus', |
||
771 | 'retroflex': 'minus', |
||
772 | 'lateral': 'minus', |
||
773 | 'high': 'mid', |
||
774 | 'back': 'front', |
||
775 | 'round': 'plus', |
||
776 | 'long': 'minus', |
||
777 | 'aspirated': 'minus', |
||
778 | }, |
||
779 | 'æ': { |
||
780 | 'manner': 'low vowel', |
||
781 | 'syllabic': 'plus', |
||
782 | 'voice': 'plus', |
||
783 | 'nasal': 'minus', |
||
784 | 'retroflex': 'minus', |
||
785 | 'lateral': 'minus', |
||
786 | 'high': 'low', |
||
787 | 'back': 'front', |
||
788 | 'round': 'minus', |
||
789 | 'long': 'minus', |
||
790 | 'aspirated': 'minus', |
||
791 | }, |
||
792 | 'a': { |
||
793 | 'manner': 'low vowel', |
||
794 | 'syllabic': 'plus', |
||
795 | 'voice': 'plus', |
||
796 | 'nasal': 'minus', |
||
797 | 'retroflex': 'minus', |
||
798 | 'lateral': 'minus', |
||
799 | 'high': 'low', |
||
800 | 'back': 'front', |
||
801 | 'round': 'minus', |
||
802 | 'long': 'minus', |
||
803 | 'aspirated': 'minus', |
||
804 | }, |
||
805 | 'ɨ': { |
||
806 | 'manner': 'high vowel', |
||
807 | 'syllabic': 'plus', |
||
808 | 'voice': 'plus', |
||
809 | 'nasal': 'minus', |
||
810 | 'retroflex': 'minus', |
||
811 | 'lateral': 'minus', |
||
812 | 'high': 'high', |
||
813 | 'back': 'central', |
||
814 | 'round': 'minus', |
||
815 | 'long': 'minus', |
||
816 | 'aspirated': 'minus', |
||
817 | }, |
||
818 | 'ʉ': { |
||
819 | 'manner': 'high vowel', |
||
820 | 'syllabic': 'plus', |
||
821 | 'voice': 'plus', |
||
822 | 'nasal': 'minus', |
||
823 | 'retroflex': 'minus', |
||
824 | 'lateral': 'minus', |
||
825 | 'high': 'high', |
||
826 | 'back': 'central', |
||
827 | 'round': 'plus', |
||
828 | 'long': 'minus', |
||
829 | 'aspirated': 'minus', |
||
830 | }, |
||
831 | 'ə': { |
||
832 | 'manner': 'mid vowel', |
||
833 | 'syllabic': 'plus', |
||
834 | 'voice': 'plus', |
||
835 | 'nasal': 'minus', |
||
836 | 'retroflex': 'minus', |
||
837 | 'lateral': 'minus', |
||
838 | 'high': 'mid', |
||
839 | 'back': 'central', |
||
840 | 'round': 'minus', |
||
841 | 'long': 'minus', |
||
842 | 'aspirated': 'minus', |
||
843 | }, |
||
844 | 'u': { |
||
845 | 'manner': 'high vowel', |
||
846 | 'syllabic': 'plus', |
||
847 | 'voice': 'plus', |
||
848 | 'nasal': 'minus', |
||
849 | 'retroflex': 'minus', |
||
850 | 'lateral': 'minus', |
||
851 | 'high': 'high', |
||
852 | 'back': 'back', |
||
853 | 'round': 'plus', |
||
854 | 'long': 'minus', |
||
855 | 'aspirated': 'minus', |
||
856 | }, |
||
857 | 'o': { |
||
858 | 'manner': 'mid vowel', |
||
859 | 'syllabic': 'plus', |
||
860 | 'voice': 'plus', |
||
861 | 'nasal': 'minus', |
||
862 | 'retroflex': 'minus', |
||
863 | 'lateral': 'minus', |
||
864 | 'high': 'mid', |
||
865 | 'back': 'back', |
||
866 | 'round': 'plus', |
||
867 | 'long': 'minus', |
||
868 | 'aspirated': 'minus', |
||
869 | }, |
||
870 | 'ɔ': { |
||
871 | 'manner': 'mid vowel', |
||
872 | 'syllabic': 'plus', |
||
873 | 'voice': 'plus', |
||
874 | 'nasal': 'minus', |
||
875 | 'retroflex': 'minus', |
||
876 | 'lateral': 'minus', |
||
877 | 'high': 'mid', |
||
878 | 'back': 'back', |
||
879 | 'round': 'plus', |
||
880 | 'long': 'minus', |
||
881 | 'aspirated': 'minus', |
||
882 | }, |
||
883 | 'ɒ': { |
||
884 | 'manner': 'low vowel', |
||
885 | 'syllabic': 'plus', |
||
886 | 'voice': 'plus', |
||
887 | 'nasal': 'minus', |
||
888 | 'retroflex': 'minus', |
||
889 | 'lateral': 'minus', |
||
890 | 'high': 'low', |
||
891 | 'back': 'back', |
||
892 | 'round': 'minus', |
||
893 | 'long': 'minus', |
||
894 | 'aspirated': 'minus', |
||
895 | }, |
||
896 | 'ː': {'long': 'plus', 'supplemental': 'True'}, |
||
897 | 'ʰ': {'aspirated': 'plus', 'supplemental': 'True'}, |
||
898 | } # type: Dict[str, Dict[str, str]] |
||
899 | |||
900 | phones_kondrak = { |
||
901 | 'a': { |
||
902 | 'place': 'velar', |
||
903 | 'manner': 'low vowel', |
||
904 | 'syllabic': 'plus', |
||
905 | 'voice': 'plus', |
||
906 | 'nasal': 'minus', |
||
907 | 'retroflex': 'minus', |
||
908 | 'lateral': 'minus', |
||
909 | 'high': 'low', |
||
910 | 1 | 'back': 'central', |
|
911 | 'round': 'minus', |
||
912 | }, |
||
913 | 'b': { |
||
914 | 'place': 'bilabial', |
||
915 | 'manner': 'stop', |
||
916 | 'syllabic': 'minus', |
||
917 | 'voice': 'plus', |
||
918 | 'nasal': 'minus', |
||
919 | 'retroflex': 'minus', |
||
920 | 'lateral': 'minus', |
||
921 | }, |
||
922 | 'c': { |
||
923 | 'place': 'alveolar', |
||
924 | 'manner': 'affricate', |
||
925 | 'syllabic': 'minus', |
||
926 | 'voice': 'minus', |
||
927 | 'nasal': 'minus', |
||
928 | 'retroflex': 'minus', |
||
929 | 'lateral': 'minus', |
||
930 | }, |
||
931 | 'd': { |
||
932 | 'place': 'alveolar', |
||
933 | 'manner': 'stop', |
||
934 | 'syllabic': 'minus', |
||
935 | 'voice': 'plus', |
||
936 | 'nasal': 'minus', |
||
937 | 'retroflex': 'minus', |
||
938 | 'lateral': 'minus', |
||
939 | }, |
||
940 | 'e': { |
||
941 | 'place': 'palatal', |
||
942 | 'manner': 'mid vowel', |
||
943 | 'syllabic': 'plus', |
||
944 | 'voice': 'plus', |
||
945 | 'nasal': 'minus', |
||
946 | 'retroflex': 'minus', |
||
947 | 'lateral': 'minus', |
||
948 | 'high': 'mid', |
||
949 | 'back': 'front', |
||
950 | 'round': 'minus', |
||
951 | }, |
||
952 | 'f': { |
||
953 | 'place': 'labiodental', |
||
954 | 'manner': 'fricative', |
||
955 | 'syllabic': 'minus', |
||
956 | 'voice': 'minus', |
||
957 | 'nasal': 'minus', |
||
958 | 'retroflex': 'minus', |
||
959 | 'lateral': 'minus', |
||
960 | }, |
||
961 | 'g': { |
||
962 | 'place': 'velar', |
||
963 | 'manner': 'stop', |
||
964 | 'syllabic': 'minus', |
||
965 | 'voice': 'plus', |
||
966 | 'nasal': 'minus', |
||
967 | 'retroflex': 'minus', |
||
968 | 'lateral': 'minus', |
||
969 | }, |
||
970 | 'h': { |
||
971 | 'place': 'glottal', |
||
972 | 'manner': 'fricative', |
||
973 | 'syllabic': 'minus', |
||
974 | 'voice': 'minus', |
||
975 | 'nasal': 'minus', |
||
976 | 'retroflex': 'minus', |
||
977 | 'lateral': 'minus', |
||
978 | }, |
||
979 | 'i': { |
||
980 | 'place': 'palatal', |
||
981 | 'manner': 'high vowel', |
||
982 | 'syllabic': 'plus', |
||
983 | 'voice': 'plus', |
||
984 | 'nasal': 'minus', |
||
985 | 'retroflex': 'minus', |
||
986 | 'lateral': 'minus', |
||
987 | 'high': 'high', |
||
988 | 'back': 'front', |
||
989 | 'round': 'plus', |
||
990 | }, |
||
991 | 'j': { |
||
992 | 'place': 'alveolar', |
||
993 | 'manner': 'affricate', |
||
994 | 'syllabic': 'minus', |
||
995 | 'voice': 'plus', |
||
996 | 'nasal': 'minus', |
||
997 | 'retroflex': 'minus', |
||
998 | 'lateral': 'minus', |
||
999 | }, |
||
1000 | 'k': { |
||
1001 | 'place': 'velar', |
||
1002 | 'manner': 'stop', |
||
1003 | 'syllabic': 'minus', |
||
1004 | 'voice': 'minus', |
||
1005 | 'nasal': 'minus', |
||
1006 | 'retroflex': 'minus', |
||
1007 | 'lateral': 'minus', |
||
1008 | }, |
||
1009 | 'l': { |
||
1010 | 'place': 'alveolar', |
||
1011 | 'manner': 'approximant', |
||
1012 | 'syllabic': 'minus', |
||
1013 | 'voice': 'plus', |
||
1014 | 'nasal': 'minus', |
||
1015 | 'retroflex': 'minus', |
||
1016 | 'lateral': 'plus', |
||
1017 | }, |
||
1018 | 'm': { |
||
1019 | 'place': 'bilabial', |
||
1020 | 'manner': 'stop', |
||
1021 | 'syllabic': 'minus', |
||
1022 | 'voice': 'plus', |
||
1023 | 'nasal': 'plus', |
||
1024 | 'retroflex': 'minus', |
||
1025 | 'lateral': 'minus', |
||
1026 | }, |
||
1027 | 'n': { |
||
1028 | 'place': 'alveolar', |
||
1029 | 'manner': 'stop', |
||
1030 | 'syllabic': 'minus', |
||
1031 | 'voice': 'plus', |
||
1032 | 'nasal': 'plus', |
||
1033 | 'retroflex': 'minus', |
||
1034 | 'lateral': 'minus', |
||
1035 | }, |
||
1036 | 'o': { |
||
1037 | 'place': 'velar', |
||
1038 | 'manner': 'mid vowel', |
||
1039 | 'syllabic': 'plus', |
||
1040 | 'voice': 'plus', |
||
1041 | 'nasal': 'minus', |
||
1042 | 'retroflex': 'minus', |
||
1043 | 'lateral': 'minus', |
||
1044 | 'high': 'mid', |
||
1045 | 'back': 'back', |
||
1046 | 'round': 'plus', |
||
1047 | }, |
||
1048 | 'p': { |
||
1049 | 'place': 'bilabial', |
||
1050 | 'manner': 'stop', |
||
1051 | 'syllabic': 'minus', |
||
1052 | 'voice': 'minus', |
||
1053 | 'nasal': 'minus', |
||
1054 | 'retroflex': 'minus', |
||
1055 | 'lateral': 'minus', |
||
1056 | }, |
||
1057 | 'q': { |
||
1058 | 'place': 'glottal', |
||
1059 | 'manner': 'stop', |
||
1060 | 'syllabic': 'minus', |
||
1061 | 'voice': 'minus', |
||
1062 | 'nasal': 'minus', |
||
1063 | 'retroflex': 'minus', |
||
1064 | 'lateral': 'minus', |
||
1065 | }, |
||
1066 | 'r': { |
||
1067 | 'place': 'retroflex', |
||
1068 | 'manner': 'approximant', |
||
1069 | 'syllabic': 'minus', |
||
1070 | 'voice': 'plus', |
||
1071 | 'nasal': 'minus', |
||
1072 | 'retroflex': 'plus', |
||
1073 | 'lateral': 'minus', |
||
1074 | }, |
||
1075 | 's': { |
||
1076 | 'place': 'alveolar', |
||
1077 | 'manner': 'fricative', |
||
1078 | 'syllabic': 'minus', |
||
1079 | 'voice': 'minus', |
||
1080 | 'nasal': 'minus', |
||
1081 | 'retroflex': 'minus', |
||
1082 | 'lateral': 'minus', |
||
1083 | }, |
||
1084 | 't': { |
||
1085 | 'place': 'alveolar', |
||
1086 | 'manner': 'stop', |
||
1087 | 'syllabic': 'minus', |
||
1088 | 'voice': 'minus', |
||
1089 | 'nasal': 'minus', |
||
1090 | 'retroflex': 'minus', |
||
1091 | 'lateral': 'minus', |
||
1092 | }, |
||
1093 | 'u': { |
||
1094 | 'place': 'velar', |
||
1095 | 'manner': 'high vowel', |
||
1096 | 'syllabic': 'plus', |
||
1097 | 'voice': 'plus', |
||
1098 | 'nasal': 'minus', |
||
1099 | 'retroflex': 'minus', |
||
1100 | 'lateral': 'minus', |
||
1101 | 'high': 'high', |
||
1102 | 'back': 'back', |
||
1103 | 'round': 'plus', |
||
1104 | }, |
||
1105 | 'v': { |
||
1106 | 'place': 'labiodental', |
||
1107 | 'manner': 'fricative', |
||
1108 | 'syllabic': 'plus', |
||
1109 | 'voice': 'plus', |
||
1110 | 'nasal': 'minus', |
||
1111 | 'retroflex': 'minus', |
||
1112 | 'lateral': 'minus', |
||
1113 | }, |
||
1114 | 'w': { |
||
1115 | 'place': 'velar', |
||
1116 | 'manner': 'high vowel', |
||
1117 | 'syllabic': 'plus', |
||
1118 | 'voice': 'plus', |
||
1119 | 'nasal': 'minus', |
||
1120 | 'retroflex': 'minus', |
||
1121 | 'lateral': 'minus', |
||
1122 | 'high': 'high', |
||
1123 | 'back': 'back', |
||
1124 | 'round': 'plus', |
||
1125 | 'double': 'bilabial', |
||
1126 | }, |
||
1127 | 'x': { |
||
1128 | 'place': 'velar', |
||
1129 | 'manner': 'fricative', |
||
1130 | 'syllabic': 'minus', |
||
1131 | 'voice': 'minus', |
||
1132 | 'nasal': 'minus', |
||
1133 | 'retroflex': 'minus', |
||
1134 | 'lateral': 'minus', |
||
1135 | }, |
||
1136 | 'y': { |
||
1137 | 'place': 'velar', |
||
1138 | 'manner': 'high vowel', |
||
1139 | 'syllabic': 'plus', |
||
1140 | 'voice': 'plus', |
||
1141 | 'nasal': 'minus', |
||
1142 | 'retroflex': 'minus', |
||
1143 | 'lateral': 'minus', |
||
1144 | 'high': 'high', |
||
1145 | 'back': 'front', |
||
1146 | 'round': 'minus', |
||
1147 | }, |
||
1148 | 'z': { |
||
1149 | 'place': 'alveolar', |
||
1150 | 'manner': 'fricative', |
||
1151 | 'syllabic': 'minus', |
||
1152 | 'voice': 'plus', |
||
1153 | 'nasal': 'minus', |
||
1154 | 'retroflex': 'minus', |
||
1155 | 'lateral': 'minus', |
||
1156 | }, |
||
1157 | 'A': {'aspirated': 'plus', 'supplemental': 'True'}, |
||
1158 | 'B': {'back': 'back', 'supplemental': 'True'}, |
||
1159 | 'C': {'back': 'central', 'supplemental': 'True'}, |
||
1160 | 'D': {'place': 'dental', 'supplemental': 'True'}, |
||
1161 | 'F': {'back': 'front', 'supplemental': 'True'}, |
||
1162 | 'H': {'long': 'plus', 'supplemental': 'True'}, |
||
1163 | 'N': {'nasal': 'plus', 'supplemental': 'True'}, |
||
1164 | 'P': {'place': 'palatal', 'supplemental': 'True'}, |
||
1165 | 'R': {'round': 'plus', 'supplemental': 'True'}, |
||
1166 | 'S': {'manner': 'fricative', 'supplemental': 'True'}, |
||
1167 | 'V': {'place': 'palato-alveolar', 'supplemental': 'True'}, |
||
1168 | } # type: Dict[str, Dict[str, str]] |
||
1169 | |||
def __init__(
    self,
    epsilon: float = 0.0,
    c_skip: float = -10,
    c_sub: float = 35,
    c_exp: float = 45,
    c_vwl: float = 10,
    mode: str = 'local',
    phones: str = 'aline',
    normalizer: Callable[[List[float]], float] = max,
    **kwargs: Any
) -> None:
    """Set up an ALINE aligner.

    Parameters
    ----------
    epsilon : float
        Fraction (between 0 and 1.0) of the best ALINE score that an
        alignment may fall short by and still be returned. With 0, only
        alignments that reach the best score are returned; with 1, every
        alignment scoring 0 or more is returned.
    c_skip : float
        Cost of an insertion or a deletion
    c_sub : float
        Cost of a substitution
    c_exp : float
        Cost of an expansion or a contraction
    c_vwl : float
        Extra cost applied to a vowel substitution, expansion, or
        contraction
    mode : str
        Alignment mode: ``local`` (default), ``global``, ``half-local``,
        or ``semi-global``. Any other value is replaced by ``local``.
    phones : str
        Phonetic symbol set:
            - ``ipa`` selects IPA symbols
            - ``aline`` (or any other value) selects Kondrak's original
              symbol set
    normalizer : function
        A function that takes a list and returns the normalization term
        by which the score is divided (max by default). For the
        normalization proposed by Downey, et al. (2008), set this to:
        ``lambda x: sum(x)/len(x)``
    **kwargs
        Arbitrary keyword arguments, forwarded to the parent initializer


    .. versionadded:: 0.4.0

    """
    super(ALINE, self).__init__(**kwargs)

    # Scoring constants consulted by the alignment routine.
    self._epsilon, self._c_skip = epsilon, c_skip
    self._c_sub, self._c_exp, self._c_vwl = c_sub, c_exp, c_vwl

    # Unknown alignment modes quietly fall back to local alignment.
    recognized_modes = {'local', 'global', 'half-local', 'semi-global'}
    self._mode = mode if mode in recognized_modes else 'local'

    # Only the exact string 'ipa' selects the IPA table.
    self._phones = (
        self.phones_ipa if phones == 'ipa' else self.phones_kondrak
    )
    self._normalizer = normalizer
|
1233 | 1 | ||
def alignment(self, src: str, tar: str) -> Tuple[float, str, str]:
    """Return the single top-ranked ALINE alignment of two strings.

    This is a convenience wrapper around :meth:`alignments` for callers
    that want one tuple rather than a list: it returns the first entry of
    the list that :meth:`alignments` produces.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    tuple(float, str, str)
        The alignment score followed by the aligned source and target

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignment('cat', 'hat')
    (50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')
    >>> cmp.alignment('niall', 'neil')
    (90.0, '‖ n i a ll ‖', '‖ n e i l ‖')
    >>> cmp.alignment('aluminum', 'catalan')
    (81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')
    >>> cmp.alignment('atcg', 'tagc')
    (65.0, '‖ a t c ‖ g', 't ‖ a g c ‖')


    .. versionadded:: 0.4.1

    """
    ranked = cast(
        List[Tuple[float, str, str]], self.alignments(src, tar)
    )
    top = ranked[0]
    return top
||
1270 | |||
def alignments(
    self, src: str, tar: str, score_only: bool = False
) -> Union[float, List[Tuple[float, str, str]]]:
    """Return the ALINE alignments of two strings.

    Both strings are tokenized character by character against the
    selected phone table (characters absent from the table are dropped),
    a dynamic-programming score matrix is filled, and every alignment
    whose score reaches ``(1 - epsilon) * best_score`` is reconstructed
    by a recursive traceback.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    score_only : bool
        Return the score only, not the alignments

    Returns
    -------
    list(tuple(float, str, str)) or float
        ALINE alignments and their scores, sorted by descending score,
        or the top score alone when ``score_only`` is set. The list is
        empty when either string yields no tokens.

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignments('cat', 'hat')
    [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
    >>> cmp.alignments('niall', 'neil')
    [(90.0, '‖ n i a ll ‖', '‖ n e i l ‖')]
    >>> cmp.alignments('aluminum', 'catalan')
    [(81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')]
    >>> cmp.alignments('atcg', 'tagc')
    [(65.0, '‖ a t c ‖ g', 't ‖ a g c ‖'), (65.0, 'a ‖ tc - g ‖',
    '‖ t a g ‖ c')]


    .. versionadded:: 0.4.0
    .. versionchanged:: 0.4.1
        Renamed from .alignment to .alignments

    """

    # Score of skipping (inserting/deleting) a segment; the arguments
    # are accepted but ignored, the result is always c_skip.
    def _sig_skip(*args: Any) -> float:
        return self._c_skip

    # Score of substituting seg1 with seg2: c_sub reduced by the feature
    # distance and by the vowel term of each segment.
    def _sig_sub(seg1: Dict[str, float], seg2: Dict[str, float]) -> float:
        return (
            self._c_sub
            - _delta(seg1, seg2)
            - _sig_vwl(seg1)
            - _sig_vwl(seg2)
        )

    # Score of aligning one segment (seg1) with two consecutive segments
    # (seg2a, seg2b) of the other string.
    def _sig_exp(
        seg1: Dict[str, float],
        seg2a: Dict[str, float],
        seg2b: Dict[str, float],
    ) -> float:
        return (
            self._c_exp
            - _delta(seg1, seg2a)
            - _delta(seg1, seg2b)
            - _sig_vwl(seg1)
            - max(_sig_vwl(seg2a), _sig_vwl(seg2b))
        )

    # Vowel term: 0.0 when the segment's 'manner' weight exceeds the
    # 'high vowel' weight, otherwise c_vwl.
    # NOTE(review): presumably weights above 'high vowel' denote
    # consonants -- confirm against feature_weights.
    def _sig_vwl(seg1: Dict[str, float]) -> float:
        return (
            0.0
            if seg1['manner'] > self.feature_weights['high vowel']
            else self._c_vwl
        )

    # Salience-weighted sum of absolute feature differences. c_features
    # is used when either 'manner' weight exceeds the 'high vowel'
    # weight, v_features otherwise; a missing feature counts as 0.0.
    def _delta(seg1: Dict[str, float], seg2: Dict[str, float]) -> float:
        features = (
            self.c_features
            if max(seg1['manner'], seg2['manner'])
            > self.feature_weights['high vowel']
            else self.v_features
        )
        diff = 0.0
        for f in features:
            diff += (
                abs(seg1.get(f, 0.0) - seg2.get(f, 0.0)) * self.salience[f]
            )
        return diff

    # Recursive traceback from matrix cell (i, j). `score` is the score
    # accumulated so far and `out` holds the (source, target) pairs
    # collected so far, in reverse (end-to-start) order.
    def _retrieve(
        i: int, j: int, score: float, out: List[Tuple[str, str]]
    ) -> None:
        # Finish one alignment: add the unaligned prefixes, render both
        # sides as strings and append the result to `alignments`.
        def _record(score: float, out: List[Tuple[str, str]]) -> None:
            out.append(('‖', '‖'))
            for i1 in range(i - 1, -1, -1):
                out.append((src_tok[i1], ''))
            for j1 in range(j - 1, -1, -1):
                out.append(('', tar_tok[j1]))
            if self._mode == 'global':
                score += (i + j) * _sig_skip('')

            # Pairs were collected end-to-start; flip to reading order.
            out = out[::-1]

            src_alignment = []
            tar_alignment = []

            # Trailing delimiter so the loop below flushes the last part.
            out.append(('‖', '‖'))
            # Even parts (before the first and after the second
            # delimiter) are concatenated as plain strings; the odd part
            # between the delimiters is kept as a list of padded cells.
            part = 0
            s_segment = ''  # type: Union[str, List[str]]
            t_segment = ''  # type: Union[str, List[str]]
            for ss, ts in out:
                if ss == '‖':
                    if part % 2 == 0:
                        src_alignment.append(s_segment)
                        tar_alignment.append(t_segment)
                        s_segment = []
                        t_segment = []
                    else:
                        src_alignment.append(' '.join(s_segment))
                        tar_alignment.append(' '.join(t_segment))
                        s_segment = ''
                        t_segment = ''
                    part += 1
                else:
                    if part % 2 == 0:
                        s_segment = cast(str, s_segment) + ss
                        t_segment = cast(str, t_segment) + ts
                    else:
                        # Pad the shorter cell so both sides line up.
                        cast(List[str], s_segment).append(
                            ss + ' ' * (len(ts) - len(ss))
                        )
                        cast(List[str], t_segment).append(
                            ts + ' ' * (len(ss) - len(ts))
                        )

            src_alignment_str = ' ‖ '.join(
                cast(List[str], src_alignment)
            ).strip()
            tar_alignment_str = ' ‖ '.join(
                cast(List[str], tar_alignment)
            ).strip()

            alignments.append(
                (score, src_alignment_str, tar_alignment_str)
            )
            return

        # A zero cell ends the traceback and records the alignment.
        if s_mat[i, j] == 0:
            _record(score, out)
            return
        else:
            # Each branch below follows one predecessor cell when the
            # predecessor's value plus the step score plus the score
            # accumulated so far still reaches the threshold.
            # NOTE(review): every branch pops from loc_out after the
            # recursive call, but loc_out is a private copy that is not
            # read again before being reassigned.

            # Substitution: src_tok[i - 1] against tar_tok[j - 1].
            if (
                i > 0
                and j > 0
                and s_mat[i - 1, j - 1]
                + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1])
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append((src_tok[i - 1], tar_tok[j - 1]))
                _retrieve(
                    i - 1,
                    j - 1,
                    score
                    + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1]),
                    loc_out,
                )
                loc_out.pop()

            # Skip a target token ('-' on the source side).
            if (
                j > 0
                and s_mat[i, j - 1] + _sig_skip(tar_tok[j - 1]) + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(('-', tar_tok[j - 1]))
                _retrieve(
                    i, j - 1, score + _sig_skip(tar_tok[j - 1]), loc_out
                )
                loc_out.pop()

            # One source token against two target tokens.
            if (
                i > 0
                and j > 1
                and s_mat[i - 1, j - 2]
                + _sig_exp(
                    src_feat_wt[i - 1],
                    tar_feat_wt[j - 2],
                    tar_feat_wt[j - 1],
                )
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(
                    (src_tok[i - 1], tar_tok[j - 2] + tar_tok[j - 1],)
                )
                _retrieve(
                    i - 1,
                    j - 2,
                    score
                    + _sig_exp(
                        src_feat_wt[i - 1],
                        tar_feat_wt[j - 2],
                        tar_feat_wt[j - 1],
                    ),
                    loc_out,
                )
                loc_out.pop()

            # Skip a source token ('-' on the target side).
            if (
                i > 0
                and s_mat[i - 1, j] + _sig_skip(src_tok[i - 1]) + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append((src_tok[i - 1], '-'))
                _retrieve(
                    i - 1, j, score + _sig_skip(src_tok[i - 1]), loc_out
                )
                loc_out.pop()

            # Two source tokens against one target token.
            if (
                i > 1
                and j > 0
                and s_mat[i - 2, j - 1]
                + _sig_exp(
                    tar_feat_wt[j - 1],
                    src_feat_wt[i - 2],
                    src_feat_wt[i - 1],
                )
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(
                    (src_tok[i - 2] + src_tok[i - 1], tar_tok[j - 1],)
                )
                _retrieve(
                    i - 2,
                    j - 1,
                    score
                    + _sig_exp(
                        tar_feat_wt[j - 1],
                        src_feat_wt[i - 2],
                        src_feat_wt[i - 1],
                    ),
                    loc_out,
                )
                loc_out.pop()

    # NOTE(review): sg_max is updated while the matrix is filled but is
    # never read afterwards in this method -- confirm whether dp_score
    # was meant to use it in semi-global mode.
    sg_max = 0.0

    src_tok = []  # type: List[str]
    src_feat = []  # type: List[Dict[str, str]]
    tar_tok = []  # type: List[str]
    tar_feat = []  # type: List[Dict[str, str]]

    # Tokenize: keep only characters known to the phone table and copy
    # their feature dicts so the merges below do not alter the table.
    for ch in src:
        if ch in self._phones:
            src_tok.append(ch)
            src_feat.append(dict(self._phones[ch]))
    for ch in tar:
        if ch in self._phones:
            tar_tok.append(ch)
            tar_feat.append(dict(self._phones[ch]))

    # Fold each supplemental (modifier) symbol into the nearest
    # preceding non-supplemental token: its text is appended and its
    # features overwrite those of the base token.
    for i in range(1, len(src_feat)):
        if 'supplemental' in src_feat[i]:
            j = i - 1
            while j > -1:
                if 'supplemental' not in src_feat[j]:
                    src_tok[j] += src_tok[i]
                    for key, value in src_feat[i].items():
                        if key != 'supplemental':
                            src_feat[j][key] = value
                    # With the decrement below this ends the while loop.
                    j = 0
                j -= 1

    # Drop the supplemental entries themselves, keeping tokens and
    # features in step.
    zipped = [
        fb for fb in zip(src_feat, src_tok) if 'supplemental' not in fb[0]
    ]
    if zipped:
        src_feat, src_tok = zip(*zipped)  # type: ignore
    else:
        src_feat, src_tok = [], []

    # Translate feature names into their numeric weights.
    src_feat_wt = []  # type: List[Dict[str, float]]
    for f_dict in src_feat:
        src_feat_wt.append(
            {
                key: self.feature_weights[f_dict[key]]
                for key in f_dict.keys()
            }
        )

    src_len = len(src_tok)

    # Same supplemental-symbol folding for the target string.
    for i in range(1, len(tar_feat)):
        if 'supplemental' in tar_feat[i]:
            j = i - 1
            while j > -1:
                if 'supplemental' not in tar_feat[j]:
                    tar_tok[j] += tar_tok[i]
                    for key, value in tar_feat[i].items():
                        if key != 'supplemental':
                            tar_feat[j][key] = value
                    # With the decrement below this ends the while loop.
                    j = 0
                j -= 1

    zipped = [
        fb for fb in zip(tar_feat, tar_tok) if 'supplemental' not in fb[0]
    ]
    if zipped:
        tar_feat, tar_tok = zip(*zipped)  # type: ignore
    else:
        tar_feat, tar_tok = [], []

    tar_feat_wt = []  # type: List[Dict[str, float]]
    for f_dict in tar_feat:
        tar_feat_wt.append(
            {
                key: self.feature_weights[f_dict[key]]
                for key in f_dict.keys()
            }
        )

    tar_len = len(tar_tok)

    # Score matrix with one extra row and column for the empty prefix.
    s_mat = zeros((src_len + 1, tar_len + 1), dtype=float_)

    # Only global mode charges skips along the first row and column;
    # in the other modes those cells stay at zero.
    if self._mode == 'global':
        for i in range(1, src_len + 1):
            s_mat[i, 0] = s_mat[i - 1, 0] + _sig_skip(src_tok[i - 1])
        for j in range(1, tar_len + 1):
            s_mat[0, j] = s_mat[0, j - 1] + _sig_skip(tar_tok[j - 1])

    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            # Best of: skip source, skip target, substitution, the two
            # expansion forms (only where two tokens are available), and
            # a floor of 0 in local and half-local modes.
            s_mat[i, j] = max(
                s_mat[i - 1, j] + _sig_skip(src_feat_wt[i - 1]),
                s_mat[i, j - 1] + _sig_skip(tar_feat_wt[j - 1]),
                s_mat[i - 1, j - 1]
                + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1]),
                s_mat[i - 1, j - 2]
                + _sig_exp(
                    src_feat_wt[i - 1],
                    tar_feat_wt[j - 2],
                    tar_feat_wt[j - 1],
                )
                if j > 1
                else -inf,
                s_mat[i - 2, j - 1]
                + _sig_exp(
                    tar_feat_wt[j - 1],
                    src_feat_wt[i - 2],
                    src_feat_wt[i - 1],
                )
                if i > 1
                else -inf,
                0 if self._mode in {'local', 'half-local'} else -inf,
            )

            # Running maximum; in semi-global mode only cells in the
            # last row or last column are considered.
            if s_mat[i, j] > sg_max:
                if self._mode == 'semi-global':
                    if i == src_len or j == tar_len:
                        sg_max = s_mat[i, j]
                else:
                    sg_max = s_mat[i, j]

    # Global and half-local alignments must end in the bottom-right
    # cell; the other modes take the largest value in the matrix.
    if self._mode in {'global', 'half-local'}:
        dp_score = s_mat[src_len, tar_len]
    else:
        dp_score = s_mat.max()

    if score_only:
        return cast(float, dp_score)

    # Minimum score an alignment needs in order to be reported.
    threshold = (1 - self._epsilon) * dp_score

    alignments = []  # type: List[Tuple[float, str, str]]

    # Start a traceback from every admissible end cell that reaches the
    # threshold.
    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            # Global/half-local: only the bottom-right cell qualifies.
            if self._mode in {'global', 'half-local'} and (
                i < src_len or j < tar_len
            ):
                continue
            # Semi-global: only the last row or last column qualifies.
            if self._mode == 'semi-global' and (
                i < src_len and j < tar_len
            ):
                continue
            if s_mat[i, j] >= threshold:
                # Seed the output with the unaligned suffixes (stored in
                # reverse order) followed by the closing delimiter.
                out = []
                for j1 in range(tar_len - 1, j - 1, -1):
                    out.append(('', tar_tok[j1]))
                for i1 in range(src_len - 1, i - 1, -1):
                    out.append((src_tok[i1], ''))
                out.append(('‖', '‖'))
                _retrieve(i, j, 0, out)

    return sorted(alignments, key=lambda _: _[0], reverse=True)
||
1669 | |||
def sim_score(self, src: str, tar: str) -> float:
    """Return the raw ALINE alignment score of two strings.

    Two empty strings score 1.0 without any alignment being computed;
    every other pair is scored by :meth:`alignments` in score-only mode.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        ALINE alignment score

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.sim_score('cat', 'hat')
    50.0
    >>> cmp.sim_score('niall', 'neil')
    90.0
    >>> cmp.sim_score('aluminum', 'catalan')
    81.5
    >>> cmp.sim_score('atcg', 'tagc')
    65.0


    .. versionadded:: 0.4.0

    """
    both_empty = src == '' and tar == ''
    if not both_empty:
        raw_score = self.alignments(src, tar, score_only=True)
        return cast(float, raw_score)
    return 1.0
||
1704 | |||
def sim(self, src: str, tar: str) -> float:
    """Return the normalized ALINE similarity of two strings.

    The raw score of the pair is divided by the value the configured
    normalizer computes from the two self-comparison scores (source
    against itself, target against itself). A raw score of zero gives
    0.0 directly.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Normalized ALINE similarity

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.dist('cat', 'hat')
    0.4117647058823529
    >>> cmp.dist('niall', 'neil')
    0.33333333333333337
    >>> cmp.dist('aluminum', 'catalan')
    0.5925
    >>> cmp.dist('atcg', 'tagc')
    0.45833333333333337


    .. versionadded:: 0.4.0

    """
    pair_score = self.sim_score(src, tar)
    if not pair_score:
        return 0.0

    normalize = self._normalizer
    self_scores = [self.sim_score(src, src), self.sim_score(tar, tar)]
    return pair_score / normalize(self_scores)
||
1742 | |||
1743 | |||
if __name__ == '__main__':
    # When executed as a script, run this module's doctests; whitespace
    # differences in expected output are tolerated.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)
||
1748 |