Total Complexity | 83 |
Total Lines | 1702 |
Duplicated Lines | 4 % |
Coverage | 100% |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like abydos.distance._aline often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
2 | |||
3 | # Copyright 2019 by Christopher C. Little. |
||
4 | # This file is part of Abydos. |
||
5 | # |
||
6 | # Abydos is free software: you can redistribute it and/or modify |
||
7 | # it under the terms of the GNU General Public License as published by |
||
8 | # the Free Software Foundation, either version 3 of the License, or |
||
9 | # (at your option) any later version. |
||
10 | # |
||
11 | # Abydos is distributed in the hope that it will be useful, |
||
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | # GNU General Public License for more details. |
||
15 | # |
||
16 | # You should have received a copy of the GNU General Public License |
||
17 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
18 | |||
19 | 1 | """abydos.distance._aline. |
|
20 | |||
21 | ALINE alignment, similarity, and distance |
||
22 | """ |
||
23 | |||
24 | 1 | from __future__ import ( |
|
25 | absolute_import, |
||
26 | division, |
||
27 | print_function, |
||
28 | unicode_literals, |
||
29 | ) |
||
30 | |||
31 | 1 | from copy import deepcopy |
|
32 | |||
33 | 1 | from numpy import NINF |
|
34 | 1 | from numpy import float as np_float |
|
35 | 1 | from numpy import zeros as np_zeros |
|
36 | |||
37 | 1 | from ._distance import _Distance |
|
38 | |||
39 | 1 | __all__ = ['ALINE'] |
|
40 | |||
41 | |||
42 | 1 | class ALINE(_Distance): |
|
43 | r"""ALINE alignment, similarity, and distance. |
||
44 | |||
45 | ALINE alignment was developed by |
||
46 | :cite:`Kondrak:2000,Kondrak:2002,Downey:2008`, and establishes an |
||
47 | alignment algorithm based on multivalued phonetic features and feature |
||
48 | salience weights. Along with the alignment itself, the algorithm produces a |
||
49 | term similarity score. |
||
50 | |||
51 | :cite:`Downey:2008` develops ALINE's similarity score into a similarity |
||
52 | measure & distance measure: |
||
53 | |||
54 | .. math:: |
||
55 | |||
56 | sim_{ALINE} = \frac{2 \cdot score_{ALINE}(src, tar)} |
||
57 | {score_{ALINE}(src, src) + score_{ALINE}(tar, tar)} |
||
58 | |||
59 | However, because the average of the two self-similarity scores is not |
||
60 | guaranteed to be greater than or equal to the similarity score between |
||
61 | the two strings, by default, this formula is not used here in order to |
||
62 | guarantee that the similarity measure is bounded to [0, 1]. Instead, |
||
63 | Kondrak's similarity measure is employed: |
||
64 | |||
65 | .. math:: |
||
66 | |||
67 | sim_{ALINE} = \frac{score_{ALINE}(src, tar)} |
||
68 | {max(score_{ALINE}(src, src), score_{ALINE}(tar, tar))} |
||
69 | |||
70 | |||
71 | .. versionadded:: 0.4.0 |
||
72 | """ |
||
73 | |||
74 | # The three dicts below are mostly copied from NLTK's implementation |
||
75 | # https://www.nltk.org/_modules/nltk/metrics/aline.html |
||
76 | # But values have been returned, as much as possible, to the reference |
||
77 | # values supplied in Kondrak's paper. |
||
78 | 1 | feature_weights = { |
|
79 | # place |
||
80 | 'bilabial': 1.0, |
||
81 | 'labiodental': 0.95, |
||
82 | 'dental': 0.9, |
||
83 | 'alveolar': 0.85, |
||
84 | 'retroflex': 0.8, |
||
85 | 'palato-alveolar': 0.75, |
||
86 | 'palatal': 0.7, |
||
87 | 'velar': 0.6, |
||
88 | 'uvular': 0.5, |
||
89 | 'pharyngeal': 0.3, |
||
90 | 'glottal': 0.1, |
||
91 | # manner |
||
92 | 'stop': 1.0, |
||
93 | 'affricate': 0.9, |
||
94 | 'fricative': 0.8, |
||
95 | 'approximant': 0.6, |
||
96 | 'trill': 0.55, # not in original |
||
97 | 'tap': 0.5, # not in original |
||
98 | 'high vowel': 0.4, |
||
99 | 'mid vowel': 0.2, |
||
100 | 'low vowel': 0.0, |
||
101 | # high |
||
102 | 'high': 1.0, |
||
103 | 'mid': 0.5, |
||
104 | 'low': 0.0, |
||
105 | # back |
||
106 | 'front': 1.0, |
||
107 | 'central': 0.5, |
||
108 | 'back': 0.0, |
||
109 | # binary features |
||
110 | 'plus': 1.0, |
||
111 | 'minus': 0.0, |
||
112 | } |
||
113 | |||
114 | 1 | v_features = { |
|
115 | 'syllabic', |
||
116 | 'nasal', |
||
117 | 'retroflex', |
||
118 | 'high', |
||
119 | 'back', |
||
120 | 'round', |
||
121 | 'long', |
||
122 | } |
||
123 | 1 | c_features = { |
|
124 | 'syllabic', |
||
125 | 'manner', |
||
126 | 'voice', |
||
127 | 'nasal', |
||
128 | 'retroflex', |
||
129 | 'lateral', |
||
130 | 'aspirated', |
||
131 | 'place', |
||
132 | } |
||
133 | |||
134 | 1 | salience = { |
|
135 | 'syllabic': 5, |
||
136 | 'voice': 10, |
||
137 | 'lateral': 10, |
||
138 | 'high': 5, |
||
139 | 'manner': 50, |
||
140 | 'long': 1, |
||
141 | 'place': 40, |
||
142 | 'nasal': 10, |
||
143 | 'aspirated': 5, |
||
144 | 'back': 5, |
||
145 | 'retroflex': 10, |
||
146 | 'round': 5, |
||
147 | } |
||
148 | |||
149 | 1 | phones_ipa = { |
|
150 | 'p': { |
||
151 | 'place': 'bilabial', |
||
152 | 'manner': 'stop', |
||
153 | 'syllabic': 'minus', |
||
154 | 'voice': 'minus', |
||
155 | 'nasal': 'minus', |
||
156 | 'retroflex': 'minus', |
||
157 | 'lateral': 'minus', |
||
158 | 'aspirated': 'minus', |
||
159 | }, |
||
160 | 'b': { |
||
161 | 'place': 'bilabial', |
||
162 | 'manner': 'stop', |
||
163 | 'syllabic': 'minus', |
||
164 | 'voice': 'plus', |
||
165 | 'nasal': 'minus', |
||
166 | 'retroflex': 'minus', |
||
167 | 'lateral': 'minus', |
||
168 | 'aspirated': 'minus', |
||
169 | }, |
||
170 | 't': { |
||
171 | 'place': 'alveolar', |
||
172 | 'manner': 'stop', |
||
173 | 'syllabic': 'minus', |
||
174 | 'voice': 'minus', |
||
175 | 'nasal': 'minus', |
||
176 | 'retroflex': 'minus', |
||
177 | 'lateral': 'minus', |
||
178 | 'aspirated': 'minus', |
||
179 | }, |
||
180 | 'd': { |
||
181 | 'place': 'alveolar', |
||
182 | 'manner': 'stop', |
||
183 | 'syllabic': 'minus', |
||
184 | 'voice': 'plus', |
||
185 | 'nasal': 'minus', |
||
186 | 'retroflex': 'minus', |
||
187 | 'lateral': 'minus', |
||
188 | 'aspirated': 'minus', |
||
189 | }, |
||
190 | 'ʈ': { |
||
191 | 'place': 'retroflex', |
||
192 | 'manner': 'stop', |
||
193 | 'syllabic': 'minus', |
||
194 | 'voice': 'minus', |
||
195 | 'nasal': 'minus', |
||
196 | 'retroflex': 'plus', |
||
197 | 'lateral': 'minus', |
||
198 | 'aspirated': 'minus', |
||
199 | }, |
||
200 | 'ɖ': { |
||
201 | 'place': 'retroflex', |
||
202 | 'manner': 'stop', |
||
203 | 'syllabic': 'minus', |
||
204 | 'voice': 'plus', |
||
205 | 'nasal': 'minus', |
||
206 | 'retroflex': 'plus', |
||
207 | 'lateral': 'minus', |
||
208 | 'aspirated': 'minus', |
||
209 | }, |
||
210 | 'c': { |
||
211 | 'place': 'palatal', |
||
212 | 'manner': 'stop', |
||
213 | 'syllabic': 'minus', |
||
214 | 'voice': 'minus', |
||
215 | 'nasal': 'minus', |
||
216 | 'retroflex': 'minus', |
||
217 | 'lateral': 'minus', |
||
218 | 'aspirated': 'minus', |
||
219 | }, |
||
220 | 'ɟ': { |
||
221 | 'place': 'palatal', |
||
222 | 'manner': 'stop', |
||
223 | 'syllabic': 'minus', |
||
224 | 'voice': 'plus', |
||
225 | 'nasal': 'minus', |
||
226 | 'retroflex': 'minus', |
||
227 | 'lateral': 'minus', |
||
228 | 'aspirated': 'minus', |
||
229 | }, |
||
230 | 'k': { |
||
231 | 'place': 'velar', |
||
232 | 'manner': 'stop', |
||
233 | 'syllabic': 'minus', |
||
234 | 'voice': 'minus', |
||
235 | 'nasal': 'minus', |
||
236 | 'retroflex': 'minus', |
||
237 | 'lateral': 'minus', |
||
238 | 'aspirated': 'minus', |
||
239 | }, |
||
240 | 'g': { |
||
241 | 'place': 'velar', |
||
242 | 'manner': 'stop', |
||
243 | 'syllabic': 'minus', |
||
244 | 'voice': 'plus', |
||
245 | 'nasal': 'minus', |
||
246 | 'retroflex': 'minus', |
||
247 | 'lateral': 'minus', |
||
248 | 'aspirated': 'minus', |
||
249 | }, |
||
250 | 'q': { |
||
251 | 'place': 'uvular', |
||
252 | 'manner': 'stop', |
||
253 | 'syllabic': 'minus', |
||
254 | 'voice': 'minus', |
||
255 | 'nasal': 'minus', |
||
256 | 'retroflex': 'minus', |
||
257 | 'lateral': 'minus', |
||
258 | 'aspirated': 'minus', |
||
259 | }, |
||
260 | 'ɢ': { |
||
261 | 'place': 'uvular', |
||
262 | 'manner': 'stop', |
||
263 | 'syllabic': 'minus', |
||
264 | 'voice': 'plus', |
||
265 | 'nasal': 'minus', |
||
266 | 'retroflex': 'minus', |
||
267 | 'lateral': 'minus', |
||
268 | 'aspirated': 'minus', |
||
269 | }, |
||
270 | 'ʔ': { |
||
271 | 'place': 'glottal', |
||
272 | 'manner': 'stop', |
||
273 | 'syllabic': 'minus', |
||
274 | 'voice': 'minus', |
||
275 | 'nasal': 'minus', |
||
276 | 'retroflex': 'minus', |
||
277 | 'lateral': 'minus', |
||
278 | 'aspirated': 'minus', |
||
279 | }, |
||
280 | 'm': { |
||
281 | 'place': 'bilabial', |
||
282 | 'manner': 'stop', |
||
283 | 'syllabic': 'minus', |
||
284 | 'voice': 'plus', |
||
285 | 'nasal': 'plus', |
||
286 | 'retroflex': 'minus', |
||
287 | 'lateral': 'minus', |
||
288 | 'aspirated': 'minus', |
||
289 | }, |
||
290 | 'ɱ': { |
||
291 | 'place': 'labiodental', |
||
292 | 'manner': 'stop', |
||
293 | 'syllabic': 'minus', |
||
294 | 'voice': 'plus', |
||
295 | 'nasal': 'plus', |
||
296 | 'retroflex': 'minus', |
||
297 | 'lateral': 'minus', |
||
298 | 'aspirated': 'minus', |
||
299 | }, |
||
300 | 'n': { |
||
301 | 'place': 'alveolar', |
||
302 | 'manner': 'stop', |
||
303 | 'syllabic': 'minus', |
||
304 | 'voice': 'plus', |
||
305 | 'nasal': 'plus', |
||
306 | 'retroflex': 'minus', |
||
307 | 'lateral': 'minus', |
||
308 | 'aspirated': 'minus', |
||
309 | }, |
||
310 | 'ɳ': { |
||
311 | 'place': 'retroflex', |
||
312 | 'manner': 'stop', |
||
313 | 'syllabic': 'minus', |
||
314 | 'voice': 'plus', |
||
315 | 'nasal': 'plus', |
||
316 | 'retroflex': 'plus', |
||
317 | 'lateral': 'minus', |
||
318 | 'aspirated': 'minus', |
||
319 | }, |
||
320 | 'ɲ': { |
||
321 | 'place': 'palatal', |
||
322 | 'manner': 'stop', |
||
323 | 'syllabic': 'minus', |
||
324 | 'voice': 'plus', |
||
325 | 'nasal': 'plus', |
||
326 | 'retroflex': 'minus', |
||
327 | 'lateral': 'minus', |
||
328 | 'aspirated': 'minus', |
||
329 | }, |
||
330 | 'ŋ': { |
||
331 | 'place': 'velar', |
||
332 | 'manner': 'stop', |
||
333 | 'syllabic': 'minus', |
||
334 | 'voice': 'plus', |
||
335 | 'nasal': 'plus', |
||
336 | 'retroflex': 'minus', |
||
337 | 'lateral': 'minus', |
||
338 | 'aspirated': 'minus', |
||
339 | }, |
||
340 | 'ɴ': { |
||
341 | 'place': 'uvular', |
||
342 | 'manner': 'stop', |
||
343 | 'syllabic': 'minus', |
||
344 | 'voice': 'plus', |
||
345 | 'nasal': 'plus', |
||
346 | 'retroflex': 'minus', |
||
347 | 'lateral': 'minus', |
||
348 | 'aspirated': 'minus', |
||
349 | }, |
||
350 | 'ʙ': { |
||
351 | 'place': 'bilabial', |
||
352 | 'manner': 'trill', |
||
353 | 'syllabic': 'minus', |
||
354 | 'voice': 'plus', |
||
355 | 'nasal': 'minus', |
||
356 | 'retroflex': 'minus', |
||
357 | 'lateral': 'minus', |
||
358 | 'aspirated': 'minus', |
||
359 | }, |
||
360 | 'r': { |
||
361 | 'place': 'alveolar', |
||
362 | 'manner': 'trill', |
||
363 | 'syllabic': 'minus', |
||
364 | 'voice': 'plus', |
||
365 | 'nasal': 'minus', |
||
366 | 'retroflex': 'plus', |
||
367 | 'lateral': 'minus', |
||
368 | 'aspirated': 'minus', |
||
369 | }, |
||
370 | 'ʀ': { |
||
371 | 'place': 'uvular', |
||
372 | 'manner': 'trill', |
||
373 | 'syllabic': 'minus', |
||
374 | 'voice': 'plus', |
||
375 | 'nasal': 'minus', |
||
376 | 'retroflex': 'minus', |
||
377 | 'lateral': 'minus', |
||
378 | 'aspirated': 'minus', |
||
379 | }, |
||
380 | 'ɾ': { |
||
381 | 'place': 'alveolar', |
||
382 | 'manner': 'tap', |
||
383 | 'syllabic': 'minus', |
||
384 | 'voice': 'plus', |
||
385 | 'nasal': 'minus', |
||
386 | 'retroflex': 'minus', |
||
387 | 'lateral': 'minus', |
||
388 | 'aspirated': 'minus', |
||
389 | }, |
||
390 | 'ɽ': { |
||
391 | 'place': 'retroflex', |
||
392 | 'manner': 'tap', |
||
393 | 'syllabic': 'minus', |
||
394 | 'voice': 'plus', |
||
395 | 'nasal': 'minus', |
||
396 | 'retroflex': 'plus', |
||
397 | 'lateral': 'minus', |
||
398 | 'aspirated': 'minus', |
||
399 | }, |
||
400 | 'ɸ': { |
||
401 | 'place': 'bilabial', |
||
402 | 'manner': 'fricative', |
||
403 | 'syllabic': 'minus', |
||
404 | 'voice': 'minus', |
||
405 | 'nasal': 'minus', |
||
406 | 'retroflex': 'minus', |
||
407 | 'lateral': 'minus', |
||
408 | 'aspirated': 'minus', |
||
409 | }, |
||
410 | 'β': { |
||
411 | 'place': 'bilabial', |
||
412 | 'manner': 'fricative', |
||
413 | 'syllabic': 'minus', |
||
414 | 'voice': 'plus', |
||
415 | 'nasal': 'minus', |
||
416 | 'retroflex': 'minus', |
||
417 | 'lateral': 'minus', |
||
418 | 'aspirated': 'minus', |
||
419 | }, |
||
420 | 'f': { |
||
421 | 'place': 'labiodental', |
||
422 | 'manner': 'fricative', |
||
423 | 'syllabic': 'minus', |
||
424 | 'voice': 'minus', |
||
425 | 'nasal': 'minus', |
||
426 | 'retroflex': 'minus', |
||
427 | 'lateral': 'minus', |
||
428 | 'aspirated': 'minus', |
||
429 | }, |
||
430 | 'v': { |
||
431 | 'place': 'labiodental', |
||
432 | 'manner': 'fricative', |
||
433 | 'syllabic': 'minus', |
||
434 | 'voice': 'plus', |
||
435 | 'nasal': 'minus', |
||
436 | 'retroflex': 'minus', |
||
437 | 'lateral': 'minus', |
||
438 | 'aspirated': 'minus', |
||
439 | }, |
||
440 | 'θ': { |
||
441 | 'place': 'dental', |
||
442 | 'manner': 'fricative', |
||
443 | 'syllabic': 'minus', |
||
444 | 'voice': 'minus', |
||
445 | 'nasal': 'minus', |
||
446 | 'retroflex': 'minus', |
||
447 | 'lateral': 'minus', |
||
448 | 'aspirated': 'minus', |
||
449 | }, |
||
450 | 'ð': { |
||
451 | 'place': 'dental', |
||
452 | 'manner': 'fricative', |
||
453 | 'syllabic': 'minus', |
||
454 | 'voice': 'plus', |
||
455 | 'nasal': 'minus', |
||
456 | 'retroflex': 'minus', |
||
457 | 'lateral': 'minus', |
||
458 | 'aspirated': 'minus', |
||
459 | }, |
||
460 | 's': { |
||
461 | 'place': 'alveolar', |
||
462 | 'manner': 'fricative', |
||
463 | 'syllabic': 'minus', |
||
464 | 'voice': 'minus', |
||
465 | 'nasal': 'minus', |
||
466 | 'retroflex': 'minus', |
||
467 | 'lateral': 'minus', |
||
468 | 'aspirated': 'minus', |
||
469 | }, |
||
470 | 'z': { |
||
471 | 'place': 'alveolar', |
||
472 | 'manner': 'fricative', |
||
473 | 'syllabic': 'minus', |
||
474 | 'voice': 'plus', |
||
475 | 'nasal': 'minus', |
||
476 | 'retroflex': 'minus', |
||
477 | 'lateral': 'minus', |
||
478 | 'aspirated': 'minus', |
||
479 | }, |
||
480 | 'ʃ': { |
||
481 | 'place': 'palato-alveolar', |
||
482 | 'manner': 'fricative', |
||
483 | 'syllabic': 'minus', |
||
484 | 'voice': 'minus', |
||
485 | 'nasal': 'minus', |
||
486 | 'retroflex': 'minus', |
||
487 | 'lateral': 'minus', |
||
488 | 'aspirated': 'minus', |
||
489 | }, |
||
490 | 'ʒ': { |
||
491 | 'place': 'palato-alveolar', |
||
492 | 'manner': 'fricative', |
||
493 | 'syllabic': 'minus', |
||
494 | 'voice': 'plus', |
||
495 | 'nasal': 'minus', |
||
496 | 'retroflex': 'minus', |
||
497 | 'lateral': 'minus', |
||
498 | 'aspirated': 'minus', |
||
499 | }, |
||
500 | 'ʂ': { |
||
501 | 'place': 'retroflex', |
||
502 | 'manner': 'fricative', |
||
503 | 'syllabic': 'minus', |
||
504 | 'voice': 'minus', |
||
505 | 'nasal': 'minus', |
||
506 | 'retroflex': 'plus', |
||
507 | 'lateral': 'minus', |
||
508 | 'aspirated': 'minus', |
||
509 | }, |
||
510 | 'ʐ': { |
||
511 | 'place': 'retroflex', |
||
512 | 'manner': 'fricative', |
||
513 | 'syllabic': 'minus', |
||
514 | 'voice': 'plus', |
||
515 | 'nasal': 'minus', |
||
516 | 'retroflex': 'plus', |
||
517 | 'lateral': 'minus', |
||
518 | 'aspirated': 'minus', |
||
519 | }, |
||
520 | 'ç': { |
||
521 | 'place': 'palatal', |
||
522 | 'manner': 'fricative', |
||
523 | 'syllabic': 'minus', |
||
524 | 'voice': 'minus', |
||
525 | 'nasal': 'minus', |
||
526 | 'retroflex': 'minus', |
||
527 | 'lateral': 'minus', |
||
528 | 'aspirated': 'minus', |
||
529 | }, |
||
530 | 'ʝ': { |
||
531 | 'place': 'palatal', |
||
532 | 'manner': 'fricative', |
||
533 | 'syllabic': 'minus', |
||
534 | 'voice': 'plus', |
||
535 | 'nasal': 'minus', |
||
536 | 'retroflex': 'minus', |
||
537 | 'lateral': 'minus', |
||
538 | 'aspirated': 'minus', |
||
539 | }, |
||
540 | 'x': { |
||
541 | 'place': 'velar', |
||
542 | 'manner': 'fricative', |
||
543 | 'syllabic': 'minus', |
||
544 | 'voice': 'minus', |
||
545 | 'nasal': 'minus', |
||
546 | 'retroflex': 'minus', |
||
547 | 'lateral': 'minus', |
||
548 | 'aspirated': 'minus', |
||
549 | }, |
||
550 | 'ɣ': { |
||
551 | 'place': 'velar', |
||
552 | 'manner': 'fricative', |
||
553 | 'syllabic': 'minus', |
||
554 | 'voice': 'plus', |
||
555 | 'nasal': 'minus', |
||
556 | 'retroflex': 'minus', |
||
557 | 'lateral': 'minus', |
||
558 | 'aspirated': 'minus', |
||
559 | }, |
||
560 | 'χ': { |
||
561 | 'place': 'uvular', |
||
562 | 'manner': 'fricative', |
||
563 | 'syllabic': 'minus', |
||
564 | 'voice': 'minus', |
||
565 | 'nasal': 'minus', |
||
566 | 'retroflex': 'minus', |
||
567 | 'lateral': 'minus', |
||
568 | 'aspirated': 'minus', |
||
569 | }, |
||
570 | 'ʁ': { |
||
571 | 'place': 'uvular', |
||
572 | 'manner': 'fricative', |
||
573 | 'syllabic': 'minus', |
||
574 | 'voice': 'plus', |
||
575 | 'nasal': 'minus', |
||
576 | 'retroflex': 'minus', |
||
577 | 'lateral': 'minus', |
||
578 | 'aspirated': 'minus', |
||
579 | }, |
||
580 | 'ħ': { |
||
581 | 'place': 'pharyngeal', |
||
582 | 'manner': 'fricative', |
||
583 | 'syllabic': 'minus', |
||
584 | 'voice': 'minus', |
||
585 | 'nasal': 'minus', |
||
586 | 'retroflex': 'minus', |
||
587 | 'lateral': 'minus', |
||
588 | 'aspirated': 'minus', |
||
589 | }, |
||
590 | 'ʕ': { |
||
591 | 'place': 'pharyngeal', |
||
592 | 'manner': 'fricative', |
||
593 | 'syllabic': 'minus', |
||
594 | 'voice': 'plus', |
||
595 | 'nasal': 'minus', |
||
596 | 'retroflex': 'minus', |
||
597 | 'lateral': 'minus', |
||
598 | 'aspirated': 'minus', |
||
599 | }, |
||
600 | 'h': { |
||
601 | 'place': 'glottal', |
||
602 | 'manner': 'fricative', |
||
603 | 'syllabic': 'minus', |
||
604 | 'voice': 'minus', |
||
605 | 'nasal': 'minus', |
||
606 | 'retroflex': 'minus', |
||
607 | 'lateral': 'minus', |
||
608 | 'aspirated': 'minus', |
||
609 | }, |
||
610 | 'ɦ': { |
||
611 | 'place': 'glottal', |
||
612 | 'manner': 'fricative', |
||
613 | 'syllabic': 'minus', |
||
614 | 'voice': 'plus', |
||
615 | 'nasal': 'minus', |
||
616 | 'retroflex': 'minus', |
||
617 | 'lateral': 'minus', |
||
618 | 'aspirated': 'minus', |
||
619 | }, |
||
620 | 'ɬ': { |
||
621 | 'place': 'alveolar', |
||
622 | 'manner': 'fricative', |
||
623 | 'syllabic': 'minus', |
||
624 | 'voice': 'minus', |
||
625 | 'nasal': 'minus', |
||
626 | 'retroflex': 'minus', |
||
627 | 'lateral': 'plus', |
||
628 | 'aspirated': 'minus', |
||
629 | }, |
||
630 | 'ɮ': { |
||
631 | 'place': 'alveolar', |
||
632 | 'manner': 'fricative', |
||
633 | 'syllabic': 'minus', |
||
634 | 'voice': 'plus', |
||
635 | 'nasal': 'minus', |
||
636 | 'retroflex': 'minus', |
||
637 | 'lateral': 'plus', |
||
638 | 'aspirated': 'minus', |
||
639 | }, |
||
640 | 'ʋ': { |
||
641 | 'place': 'labiodental', |
||
642 | 'manner': 'approximant', |
||
643 | 'syllabic': 'minus', |
||
644 | 'voice': 'plus', |
||
645 | 'nasal': 'minus', |
||
646 | 'retroflex': 'minus', |
||
647 | 'lateral': 'minus', |
||
648 | 'aspirated': 'minus', |
||
649 | }, |
||
650 | 'ɹ': { |
||
651 | 'place': 'alveolar', |
||
652 | 'manner': 'approximant', |
||
653 | 'syllabic': 'minus', |
||
654 | 'voice': 'plus', |
||
655 | 'nasal': 'minus', |
||
656 | 'retroflex': 'minus', |
||
657 | 'lateral': 'minus', |
||
658 | 'aspirated': 'minus', |
||
659 | }, |
||
660 | 'ɻ': { |
||
661 | 'place': 'retroflex', |
||
662 | 'manner': 'approximant', |
||
663 | 'syllabic': 'minus', |
||
664 | 'voice': 'plus', |
||
665 | 'nasal': 'minus', |
||
666 | 'retroflex': 'plus', |
||
667 | 'lateral': 'minus', |
||
668 | 'aspirated': 'minus', |
||
669 | }, |
||
670 | 'j': { |
||
671 | 'place': 'palatal', |
||
672 | 'manner': 'approximant', |
||
673 | 'syllabic': 'minus', |
||
674 | 'voice': 'plus', |
||
675 | 'nasal': 'minus', |
||
676 | 'retroflex': 'minus', |
||
677 | 'lateral': 'minus', |
||
678 | 'aspirated': 'minus', |
||
679 | }, |
||
680 | 'ɰ': { |
||
681 | 'place': 'velar', |
||
682 | 'manner': 'approximant', |
||
683 | 'syllabic': 'minus', |
||
684 | 'voice': 'plus', |
||
685 | 'nasal': 'minus', |
||
686 | 'retroflex': 'minus', |
||
687 | 'lateral': 'minus', |
||
688 | 'aspirated': 'minus', |
||
689 | }, |
||
690 | 'l': { |
||
691 | 'place': 'alveolar', |
||
692 | 'manner': 'approximant', |
||
693 | 'syllabic': 'minus', |
||
694 | 'voice': 'plus', |
||
695 | 'nasal': 'minus', |
||
696 | 'retroflex': 'minus', |
||
697 | 'lateral': 'plus', |
||
698 | 'aspirated': 'minus', |
||
699 | }, |
||
700 | 'w': { |
||
701 | 'place': 'velar', |
||
702 | 'manner': 'approximant', |
||
703 | 'syllabic': 'minus', |
||
704 | 'voice': 'plus', |
||
705 | 'nasal': 'minus', |
||
706 | 'retroflex': 'minus', |
||
707 | 'lateral': 'minus', |
||
708 | 'aspirated': 'minus', |
||
709 | 'double': 'bilabial', |
||
710 | }, |
||
711 | 'i': { |
||
712 | 'manner': 'high vowel', |
||
713 | 'syllabic': 'plus', |
||
714 | 'voice': 'plus', |
||
715 | 'nasal': 'minus', |
||
716 | 'retroflex': 'minus', |
||
717 | 'lateral': 'minus', |
||
718 | 'high': 'high', |
||
719 | 'back': 'front', |
||
720 | 'round': 'minus', |
||
721 | 'long': 'minus', |
||
722 | 'aspirated': 'minus', |
||
723 | }, |
||
724 | 'y': { |
||
725 | 'manner': 'high vowel', |
||
726 | 'syllabic': 'plus', |
||
727 | 'voice': 'plus', |
||
728 | 'nasal': 'minus', |
||
729 | 'retroflex': 'minus', |
||
730 | 'lateral': 'minus', |
||
731 | 'high': 'high', |
||
732 | 'back': 'front', |
||
733 | 'round': 'plus', |
||
734 | 'long': 'minus', |
||
735 | 'aspirated': 'minus', |
||
736 | }, |
||
737 | 'e': { |
||
738 | 'manner': 'mid vowel', |
||
739 | 'syllabic': 'plus', |
||
740 | 'voice': 'plus', |
||
741 | 'nasal': 'minus', |
||
742 | 'retroflex': 'minus', |
||
743 | 'lateral': 'minus', |
||
744 | 'high': 'mid', |
||
745 | 'back': 'front', |
||
746 | 'round': 'minus', |
||
747 | 'long': 'minus', |
||
748 | 'aspirated': 'minus', |
||
749 | }, |
||
750 | 'ø': { |
||
751 | 'manner': 'mid vowel', |
||
752 | 'syllabic': 'plus', |
||
753 | 'voice': 'plus', |
||
754 | 'nasal': 'minus', |
||
755 | 'retroflex': 'minus', |
||
756 | 'lateral': 'minus', |
||
757 | 'high': 'mid', |
||
758 | 'back': 'front', |
||
759 | 'round': 'plus', |
||
760 | 'long': 'minus', |
||
761 | 'aspirated': 'minus', |
||
762 | }, |
||
763 | 'ɛ': { |
||
764 | 'manner': 'mid vowel', |
||
765 | 'syllabic': 'plus', |
||
766 | 'voice': 'plus', |
||
767 | 'nasal': 'minus', |
||
768 | 'retroflex': 'minus', |
||
769 | 'lateral': 'minus', |
||
770 | 'high': 'mid', |
||
771 | 'back': 'front', |
||
772 | 'round': 'minus', |
||
773 | 'long': 'minus', |
||
774 | 'aspirated': 'minus', |
||
775 | }, |
||
776 | 'œ': { |
||
777 | 'manner': 'mid vowel', |
||
778 | 'syllabic': 'plus', |
||
779 | 'voice': 'plus', |
||
780 | 'nasal': 'minus', |
||
781 | 'retroflex': 'minus', |
||
782 | 'lateral': 'minus', |
||
783 | 'high': 'mid', |
||
784 | 'back': 'front', |
||
785 | 'round': 'plus', |
||
786 | 'long': 'minus', |
||
787 | 'aspirated': 'minus', |
||
788 | }, |
||
789 | 'æ': { |
||
790 | 'manner': 'low vowel', |
||
791 | 'syllabic': 'plus', |
||
792 | 'voice': 'plus', |
||
793 | 'nasal': 'minus', |
||
794 | 'retroflex': 'minus', |
||
795 | 'lateral': 'minus', |
||
796 | 'high': 'low', |
||
797 | 'back': 'front', |
||
798 | 'round': 'minus', |
||
799 | 'long': 'minus', |
||
800 | 'aspirated': 'minus', |
||
801 | }, |
||
802 | 'a': { |
||
803 | 'manner': 'low vowel', |
||
804 | 'syllabic': 'plus', |
||
805 | 'voice': 'plus', |
||
806 | 'nasal': 'minus', |
||
807 | 'retroflex': 'minus', |
||
808 | 'lateral': 'minus', |
||
809 | 'high': 'low', |
||
810 | 'back': 'front', |
||
811 | 'round': 'minus', |
||
812 | 'long': 'minus', |
||
813 | 'aspirated': 'minus', |
||
814 | }, |
||
815 | 'ɨ': { |
||
816 | 'manner': 'high vowel', |
||
817 | 'syllabic': 'plus', |
||
818 | 'voice': 'plus', |
||
819 | 'nasal': 'minus', |
||
820 | 'retroflex': 'minus', |
||
821 | 'lateral': 'minus', |
||
822 | 'high': 'high', |
||
823 | 'back': 'central', |
||
824 | 'round': 'minus', |
||
825 | 'long': 'minus', |
||
826 | 'aspirated': 'minus', |
||
827 | }, |
||
828 | 'ʉ': { |
||
829 | 'manner': 'high vowel', |
||
830 | 'syllabic': 'plus', |
||
831 | 'voice': 'plus', |
||
832 | 'nasal': 'minus', |
||
833 | 'retroflex': 'minus', |
||
834 | 'lateral': 'minus', |
||
835 | 'high': 'high', |
||
836 | 'back': 'central', |
||
837 | 'round': 'plus', |
||
838 | 'long': 'minus', |
||
839 | 'aspirated': 'minus', |
||
840 | }, |
||
841 | 'ə': { |
||
842 | 'manner': 'mid vowel', |
||
843 | 'syllabic': 'plus', |
||
844 | 'voice': 'plus', |
||
845 | 'nasal': 'minus', |
||
846 | 'retroflex': 'minus', |
||
847 | 'lateral': 'minus', |
||
848 | 'high': 'mid', |
||
849 | 'back': 'central', |
||
850 | 'round': 'minus', |
||
851 | 'long': 'minus', |
||
852 | 'aspirated': 'minus', |
||
853 | }, |
||
854 | 'u': { |
||
855 | 'manner': 'high vowel', |
||
856 | 'syllabic': 'plus', |
||
857 | 'voice': 'plus', |
||
858 | 'nasal': 'minus', |
||
859 | 'retroflex': 'minus', |
||
860 | 'lateral': 'minus', |
||
861 | 'high': 'high', |
||
862 | 'back': 'back', |
||
863 | 'round': 'plus', |
||
864 | 'long': 'minus', |
||
865 | 'aspirated': 'minus', |
||
866 | }, |
||
867 | 'o': { |
||
868 | 'manner': 'mid vowel', |
||
869 | 'syllabic': 'plus', |
||
870 | 'voice': 'plus', |
||
871 | 'nasal': 'minus', |
||
872 | 'retroflex': 'minus', |
||
873 | 'lateral': 'minus', |
||
874 | 'high': 'mid', |
||
875 | 'back': 'back', |
||
876 | 'round': 'plus', |
||
877 | 'long': 'minus', |
||
878 | 'aspirated': 'minus', |
||
879 | }, |
||
880 | 'ɔ': { |
||
881 | 'manner': 'mid vowel', |
||
882 | 'syllabic': 'plus', |
||
883 | 'voice': 'plus', |
||
884 | 'nasal': 'minus', |
||
885 | 'retroflex': 'minus', |
||
886 | 'lateral': 'minus', |
||
887 | 'high': 'mid', |
||
888 | 'back': 'back', |
||
889 | 'round': 'plus', |
||
890 | 'long': 'minus', |
||
891 | 'aspirated': 'minus', |
||
892 | }, |
||
893 | 'ɒ': { |
||
894 | 'manner': 'low vowel', |
||
895 | 'syllabic': 'plus', |
||
896 | 'voice': 'plus', |
||
897 | 'nasal': 'minus', |
||
898 | 'retroflex': 'minus', |
||
899 | 'lateral': 'minus', |
||
900 | 'high': 'low', |
||
901 | 'back': 'back', |
||
902 | 'round': 'minus', |
||
903 | 'long': 'minus', |
||
904 | 'aspirated': 'minus', |
||
905 | }, |
||
906 | 'ː': {'long': 'plus', 'supplemental': True}, |
||
907 | 'ʰ': {'aspirated': 'plus', 'supplemental': True}, |
||
908 | } |
||
909 | |||
910 | 1 | phones_kondrak = { |
|
911 | 'a': { |
||
912 | 'place': 'velar', |
||
913 | 'manner': 'low vowel', |
||
914 | 'syllabic': 'plus', |
||
915 | 'voice': 'plus', |
||
916 | 'nasal': 'minus', |
||
917 | 'retroflex': 'minus', |
||
918 | 'lateral': 'minus', |
||
919 | 'high': 'low', |
||
920 | 'back': 'central', |
||
921 | 'round': 'minus', |
||
922 | }, |
||
923 | 'b': { |
||
924 | 'place': 'bilabial', |
||
925 | 'manner': 'stop', |
||
926 | 'syllabic': 'minus', |
||
927 | 'voice': 'plus', |
||
928 | 'nasal': 'minus', |
||
929 | 'retroflex': 'minus', |
||
930 | 'lateral': 'minus', |
||
931 | }, |
||
932 | 'c': { |
||
933 | 'place': 'alveolar', |
||
934 | 'manner': 'affricate', |
||
935 | 'syllabic': 'minus', |
||
936 | 'voice': 'minus', |
||
937 | 'nasal': 'minus', |
||
938 | 'retroflex': 'minus', |
||
939 | 'lateral': 'minus', |
||
940 | }, |
||
941 | 'd': { |
||
942 | 'place': 'alveolar', |
||
943 | 'manner': 'stop', |
||
944 | 'syllabic': 'minus', |
||
945 | 'voice': 'plus', |
||
946 | 'nasal': 'minus', |
||
947 | 'retroflex': 'minus', |
||
948 | 'lateral': 'minus', |
||
949 | }, |
||
950 | 'e': { |
||
951 | 'place': 'palatal', |
||
952 | 'manner': 'mid vowel', |
||
953 | 'syllabic': 'plus', |
||
954 | 'voice': 'plus', |
||
955 | 'nasal': 'minus', |
||
956 | 'retroflex': 'minus', |
||
957 | 'lateral': 'minus', |
||
958 | 'high': 'mid', |
||
959 | 'back': 'front', |
||
960 | 'round': 'minus', |
||
961 | }, |
||
962 | 'f': { |
||
963 | 'place': 'labiodental', |
||
964 | 'manner': 'fricative', |
||
965 | 'syllabic': 'minus', |
||
966 | 'voice': 'minus', |
||
967 | 'nasal': 'minus', |
||
968 | 'retroflex': 'minus', |
||
969 | 'lateral': 'minus', |
||
970 | }, |
||
971 | 'g': { |
||
972 | 'place': 'velar', |
||
973 | 'manner': 'stop', |
||
974 | 'syllabic': 'minus', |
||
975 | 'voice': 'plus', |
||
976 | 'nasal': 'minus', |
||
977 | 'retroflex': 'minus', |
||
978 | 'lateral': 'minus', |
||
979 | }, |
||
980 | 'h': { |
||
981 | 'place': 'glottal', |
||
982 | 'manner': 'fricative', |
||
983 | 'syllabic': 'minus', |
||
984 | 'voice': 'minus', |
||
985 | 'nasal': 'minus', |
||
986 | 'retroflex': 'minus', |
||
987 | 'lateral': 'minus', |
||
988 | }, |
||
989 | 'i': { |
||
990 | 'place': 'palatal', |
||
991 | 'manner': 'high vowel', |
||
992 | 'syllabic': 'plus', |
||
993 | 'voice': 'plus', |
||
994 | 'nasal': 'minus', |
||
995 | 'retroflex': 'minus', |
||
996 | 'lateral': 'minus', |
||
997 | 'high': 'high', |
||
998 | 'back': 'front', |
||
999 | 'round': 'plus', |
||
1000 | }, |
||
1001 | 'j': { |
||
1002 | 'place': 'alveolar', |
||
1003 | 'manner': 'affricate', |
||
1004 | 'syllabic': 'minus', |
||
1005 | 'voice': 'plus', |
||
1006 | 'nasal': 'minus', |
||
1007 | 'retroflex': 'minus', |
||
1008 | 'lateral': 'minus', |
||
1009 | }, |
||
1010 | 'k': { |
||
1011 | 'place': 'velar', |
||
1012 | 'manner': 'stop', |
||
1013 | 'syllabic': 'minus', |
||
1014 | 'voice': 'minus', |
||
1015 | 'nasal': 'minus', |
||
1016 | 'retroflex': 'minus', |
||
1017 | 'lateral': 'minus', |
||
1018 | }, |
||
1019 | 'l': { |
||
1020 | 'place': 'alveolar', |
||
1021 | 'manner': 'approximant', |
||
1022 | 'syllabic': 'minus', |
||
1023 | 'voice': 'plus', |
||
1024 | 'nasal': 'minus', |
||
1025 | 'retroflex': 'minus', |
||
1026 | 'lateral': 'plus', |
||
1027 | }, |
||
1028 | 'm': { |
||
1029 | 'place': 'bilabial', |
||
1030 | 'manner': 'stop', |
||
1031 | 'syllabic': 'minus', |
||
1032 | 'voice': 'plus', |
||
1033 | 'nasal': 'plus', |
||
1034 | 'retroflex': 'minus', |
||
1035 | 'lateral': 'minus', |
||
1036 | }, |
||
1037 | 'n': { |
||
1038 | 'place': 'alveolar', |
||
1039 | 'manner': 'stop', |
||
1040 | 'syllabic': 'minus', |
||
1041 | 'voice': 'plus', |
||
1042 | 'nasal': 'plus', |
||
1043 | 'retroflex': 'minus', |
||
1044 | 'lateral': 'minus', |
||
1045 | }, |
||
1046 | 'o': { |
||
1047 | 'place': 'velar', |
||
1048 | 'manner': 'mid vowel', |
||
1049 | 'syllabic': 'plus', |
||
1050 | 'voice': 'plus', |
||
1051 | 'nasal': 'minus', |
||
1052 | 'retroflex': 'minus', |
||
1053 | 'lateral': 'minus', |
||
1054 | 'high': 'mid', |
||
1055 | 'back': 'back', |
||
1056 | 'round': 'plus', |
||
1057 | }, |
||
1058 | 'p': { |
||
1059 | 'place': 'bilabial', |
||
1060 | 'manner': 'stop', |
||
1061 | 'syllabic': 'minus', |
||
1062 | 'voice': 'minus', |
||
1063 | 'nasal': 'minus', |
||
1064 | 'retroflex': 'minus', |
||
1065 | 'lateral': 'minus', |
||
1066 | }, |
||
1067 | 'q': { |
||
1068 | 'place': 'glottal', |
||
1069 | 'manner': 'stop', |
||
1070 | 'syllabic': 'minus', |
||
1071 | 'voice': 'minus', |
||
1072 | 'nasal': 'minus', |
||
1073 | 'retroflex': 'minus', |
||
1074 | 'lateral': 'minus', |
||
1075 | }, |
||
1076 | 'r': { |
||
1077 | 'place': 'retroflex', |
||
1078 | 'manner': 'approximant', |
||
1079 | 'syllabic': 'minus', |
||
1080 | 'voice': 'plus', |
||
1081 | 'nasal': 'minus', |
||
1082 | 'retroflex': 'plus', |
||
1083 | 'lateral': 'minus', |
||
1084 | }, |
||
1085 | 's': { |
||
1086 | 'place': 'alveolar', |
||
1087 | 'manner': 'fricative', |
||
1088 | 'syllabic': 'minus', |
||
1089 | 'voice': 'minus', |
||
1090 | 'nasal': 'minus', |
||
1091 | 'retroflex': 'minus', |
||
1092 | 'lateral': 'minus', |
||
1093 | }, |
||
1094 | 't': { |
||
1095 | 'place': 'alveolar', |
||
1096 | 'manner': 'stop', |
||
1097 | 'syllabic': 'minus', |
||
1098 | 'voice': 'minus', |
||
1099 | 'nasal': 'minus', |
||
1100 | 'retroflex': 'minus', |
||
1101 | 'lateral': 'minus', |
||
1102 | }, |
||
1103 | 'u': { |
||
1104 | 'place': 'velar', |
||
1105 | 'manner': 'high vowel', |
||
1106 | 'syllabic': 'plus', |
||
1107 | 'voice': 'plus', |
||
1108 | 'nasal': 'minus', |
||
1109 | 'retroflex': 'minus', |
||
1110 | 'lateral': 'minus', |
||
1111 | 'high': 'high', |
||
1112 | 'back': 'back', |
||
1113 | 'round': 'plus', |
||
1114 | }, |
||
1115 | 'v': { |
||
1116 | 'place': 'labiodental', |
||
1117 | 'manner': 'fricative', |
||
1118 | 'syllabic': 'plus', |
||
1119 | 'voice': 'plus', |
||
1120 | 'nasal': 'minus', |
||
1121 | 'retroflex': 'minus', |
||
1122 | 'lateral': 'minus', |
||
1123 | }, |
||
1124 | 'w': { |
||
1125 | 'place': 'velar', |
||
1126 | 'manner': 'high vowel', |
||
1127 | 'syllabic': 'plus', |
||
1128 | 'voice': 'plus', |
||
1129 | 'nasal': 'minus', |
||
1130 | 'retroflex': 'minus', |
||
1131 | 'lateral': 'minus', |
||
1132 | 'high': 'high', |
||
1133 | 'back': 'back', |
||
1134 | 'round': 'plus', |
||
1135 | 'double': 'bilabial', |
||
1136 | }, |
||
1137 | 'x': { |
||
1138 | 'place': 'velar', |
||
1139 | 'manner': 'fricative', |
||
1140 | 'syllabic': 'minus', |
||
1141 | 'voice': 'minus', |
||
1142 | 'nasal': 'minus', |
||
1143 | 'retroflex': 'minus', |
||
1144 | 'lateral': 'minus', |
||
1145 | }, |
||
1146 | 'y': { |
||
1147 | 'place': 'velar', |
||
1148 | 'manner': 'high vowel', |
||
1149 | 'syllabic': 'plus', |
||
1150 | 'voice': 'plus', |
||
1151 | 'nasal': 'minus', |
||
1152 | 'retroflex': 'minus', |
||
1153 | 'lateral': 'minus', |
||
1154 | 'high': 'high', |
||
1155 | 'back': 'front', |
||
1156 | 'round': 'minus', |
||
1157 | }, |
||
1158 | 'z': { |
||
1159 | 'place': 'alveolar', |
||
1160 | 'manner': 'fricative', |
||
1161 | 'syllabic': 'minus', |
||
1162 | 'voice': 'plus', |
||
1163 | 'nasal': 'minus', |
||
1164 | 'retroflex': 'minus', |
||
1165 | 'lateral': 'minus', |
||
1166 | }, |
||
1167 | 'A': {'aspirated': 'plus', 'supplemental': True}, |
||
1168 | 'B': {'back': 'back', 'supplemental': True}, |
||
1169 | 'C': {'back': 'central', 'supplemental': True}, |
||
1170 | 'D': {'place': 'dental', 'supplemental': True}, |
||
1171 | 'F': {'back': 'front', 'supplemental': True}, |
||
1172 | 'H': {'long': 'plus', 'supplemental': True}, |
||
1173 | 'N': {'nasal': 'plus', 'supplemental': True}, |
||
1174 | 'P': {'place': 'palatal', 'supplemental': True}, |
||
1175 | 'R': {'round': 'plus', 'supplemental': True}, |
||
1176 | 'S': {'manner': 'fricative', 'supplemental': True}, |
||
1177 | 'V': {'place': 'palato-alveolar', 'supplemental': True}, |
||
1178 | } |
||
1179 | |||
def __init__(
    self,
    epsilon=0,
    c_skip=-10,
    c_sub=35,
    c_exp=45,
    c_vwl=10,
    mode='local',
    phones='aline',
    normalizer=max,
    **kwargs
):
    """Set up an ALINE aligner.

    Parameters
    ----------
    epsilon : float
        Fraction (between 0 and 1) by which the retrieval threshold is
        lowered below the best alignment score. With 0, only alignments
        that reach the best score are returned; with 1, every alignment
        scoring 0 or higher is returned.
    c_skip : int
        Score contribution of an insertion or deletion
    c_sub : int
        Base score of a substitution
    c_exp : int
        Base score of an expansion or contraction
    c_vwl : int
        Amount subtracted for each vowel taking part in a substitution,
        expansion, or contraction
    mode : str
        Alignment mode: ``local`` (default), ``global``, ``half-local``,
        or ``semi-global``. Any other value silently selects ``local``.
    phones : str
        Phonetic symbol set:
            - ``ipa`` selects IPA symbols
            - anything else (``aline`` by default) selects Kondrak's
              original symbol set
    normalizer : function
        A function that takes a list of the two self-alignment scores and
        returns the term by which the alignment score is divided in
        :py:meth:`sim` (``max`` by default). For the normalization
        proposed by Downey, et al. (2008), set this to:
        ``lambda x: sum(x)/len(x)``
    **kwargs
        Arbitrary keyword arguments, forwarded to the parent class


    .. versionadded:: 0.4.0

    """
    super(ALINE, self).__init__(**kwargs)

    # Scoring constants
    self._epsilon = epsilon
    self._c_skip = c_skip
    self._c_sub = c_sub
    self._c_exp = c_exp
    self._c_vwl = c_vwl

    # Unknown modes are not an error; they fall back to local alignment.
    known_modes = {'local', 'global', 'half-local', 'semi-global'}
    self._mode = mode if mode in known_modes else 'local'

    # Only 'ipa' switches away from Kondrak's symbol table.
    self._phones = (
        self.phones_ipa if phones == 'ipa' else self.phones_kondrak
    )
    self._normalizer = normalizer
1243 | |||
def alignment(self, src, tar):
    """Return the single best ALINE alignment of two strings.

    This is a convenience wrapper around :py:meth:`alignments` that picks
    the first entry of its (best-score-first) result, so that callers get
    one tuple rather than a list.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    tuple(float, str, str)
        The alignment score, followed by the aligned source and target

    Raises
    ------
    IndexError
        If :py:meth:`alignments` returns no alignment at all

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignment('cat', 'hat')
    (50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')
    >>> cmp.alignment('niall', 'neil')
    (90.0, '‖ n i a ll ‖', '‖ n e i l ‖')
    >>> cmp.alignment('aluminum', 'catalan')
    (81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')
    >>> cmp.alignment('atcg', 'tagc')
    (65.0, '‖ a t c ‖ g', 't ‖ a g c ‖')


    .. versionadded:: 0.4.1

    """
    ranked = self.alignments(src, tar)
    return ranked[0]
1280 | |||
def alignments(self, src, tar, score_only=False):
    """Return the ALINE alignments of two strings.

    Symbols of both strings that are found in the selected phone table are
    turned into weighted feature bundles (symbols that are not in the
    table are dropped; supplemental symbols are folded into the nearest
    preceding ordinary segment). A score matrix is then filled by dynamic
    programming over skips, substitutions and expansions, and every
    alignment whose score reaches ``(1 - epsilon) * best score`` is
    retrieved by backtracking.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    score_only : bool
        Return the score only, not the alignments

    Returns
    -------
    list(tuple(float, str, str)) or float
        ALINE alignments and their scores, best score first (ties keep
        their discovery order), or just the top score if ``score_only``
        is set. The list is empty when no cell qualifies, e.g. when one
        of the strings contains no known symbols.

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignments('cat', 'hat')
    [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
    >>> cmp.alignments('niall', 'neil')
    [(90.0, '‖ n i a ll ‖', '‖ n e i l ‖')]
    >>> cmp.alignments('aluminum', 'catalan')
    [(81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')]
    >>> cmp.alignments('atcg', 'tagc')
    [(65.0, '‖ a t c ‖ g', 't ‖ a g c ‖'), (65.0, 'a ‖ tc - g ‖',
    '‖ t a g ‖ c')]


    .. versionadded:: 0.4.0
    .. versionchanged:: 0.4.1
        Renamed from .alignment to .alignments

    """
    mode = self._mode
    neg_inf = float('-inf')
    marker = ('‖', '‖')
    # Segments whose manner weight exceeds this value count as consonants.
    vowel_ceiling = self.feature_weights['high vowel']

    def _sig_skip(seg):
        # Skips cost the same regardless of the skipped segment.
        return self._c_skip

    def _sig_vwl(seg):
        # Vowels (manner weight at or below the ceiling) pay c_vwl.
        return 0.0 if seg['manner'] > vowel_ceiling else self._c_vwl

    def _delta(seg1, seg2):
        # Salience-weighted feature distance. The consonant feature set is
        # used as soon as either segment is a consonant.
        if max(seg1['manner'], seg2['manner']) > vowel_ceiling:
            features = self.c_features
        else:
            features = self.v_features
        diff = 0.0
        for feat in features:
            diff += (
                abs(seg1.get(feat, 0.0) - seg2.get(feat, 0.0))
                * self.salience[feat]
            )
        return diff

    def _sig_sub(seg1, seg2):
        return (
            self._c_sub
            - _delta(seg1, seg2)
            - _sig_vwl(seg1)
            - _sig_vwl(seg2)
        )

    def _sig_exp(seg1, seg2a, seg2b):
        return (
            self._c_exp
            - _delta(seg1, seg2a)
            - _delta(seg1, seg2b)
            - _sig_vwl(seg1)
            - max(_sig_vwl(seg2a), _sig_vwl(seg2b))
        )

    def _featurize(word):
        # Turn a string into a list of weighted feature bundles, one per
        # ordinary segment; each bundle keeps its text under 'segment'.
        bundles = []
        for symbol in word:
            if symbol in self._phones:
                bundle = dict(self._phones[symbol])
                bundle['segment'] = symbol
                bundles.append(bundle)

        # Fold every supplemental symbol into the closest preceding
        # ordinary segment: its features override that segment's and its
        # text is appended. A supplemental with no ordinary segment
        # before it is simply discarded below.
        for pos in range(1, len(bundles)):
            if 'supplemental' not in bundles[pos]:
                continue
            for prev in range(pos - 1, -1, -1):
                if 'supplemental' in bundles[prev]:
                    continue
                for key, value in bundles[pos].items():
                    if key == 'supplemental':
                        continue
                    if key == 'segment':
                        bundles[prev]['segment'] += value
                    else:
                        bundles[prev][key] = value
                break

        # Drop the supplementals and replace feature names by weights.
        return [
            {
                key: (
                    value
                    if key == 'segment'
                    else self.feature_weights[value]
                )
                for key, value in bundle.items()
            }
            for bundle in bundles
            if 'supplemental' not in bundle
        ]

    def _record(i, j, score, path):
        # `path` runs from the end of the words towards the start: the
        # unaligned tail, a marker, then the aligned pairs. Close the
        # aligned region and add whatever is left in front of cell (i, j).
        trail = list(path)
        trail.append(marker)
        trail.extend((src[k]['segment'], '') for k in range(i - 1, -1, -1))
        trail.extend(('', tar[k]['segment']) for k in range(j - 1, -1, -1))
        if mode == 'global':
            # Global alignments pay for the unaligned leading segments.
            score += (i + j) * _sig_skip('')

        trail.reverse()
        trail.append(marker)  # sentinel that flushes the tail

        src_parts = []
        tar_parts = []
        inside = False  # True between the two alignment markers
        s_buf = ''
        t_buf = ''
        for s_seg, t_seg in trail:
            if s_seg == '‖':
                if inside:
                    src_parts.append(' '.join(s_buf))
                    tar_parts.append(' '.join(t_buf))
                    s_buf = ''
                    t_buf = ''
                else:
                    src_parts.append(s_buf)
                    tar_parts.append(t_buf)
                    s_buf = []
                    t_buf = []
                inside = not inside
            elif inside:
                # Pad the shorter side so aligned columns line up.
                s_buf.append(s_seg + ' ' * (len(t_seg) - len(s_seg)))
                t_buf.append(t_seg + ' ' * (len(s_seg) - len(t_seg)))
            else:
                s_buf += s_seg
                t_buf += t_seg

        found.append(
            (
                score,
                ' ‖ '.join(src_parts).strip(),
                ' ‖ '.join(tar_parts).strip(),
            )
        )

    def _retrieve(i, j, score, path):
        # Backtrack from cell (i, j), following every move that can still
        # reach the threshold. The order of the moves is significant: it
        # fixes the order in which equally scored alignments are found.
        if s_mat[i, j] == 0:
            _record(i, j, score, path)
            return

        # Substitution
        if i > 0 and j > 0:
            gain = _sig_sub(src[i - 1], tar[j - 1])
            if s_mat[i - 1, j - 1] + gain + score >= threshold:
                _retrieve(
                    i - 1,
                    j - 1,
                    score + gain,
                    path + [(src[i - 1]['segment'], tar[j - 1]['segment'])],
                )

        # Skip a target segment
        if j > 0:
            gain = _sig_skip(tar[j - 1])
            if s_mat[i, j - 1] + gain + score >= threshold:
                _retrieve(
                    i,
                    j - 1,
                    score + gain,
                    path + [('-', tar[j - 1]['segment'])],
                )

        # One source segment against two target segments
        if i > 0 and j > 1:
            gain = _sig_exp(src[i - 1], tar[j - 2], tar[j - 1])
            if s_mat[i - 1, j - 2] + gain + score >= threshold:
                _retrieve(
                    i - 1,
                    j - 2,
                    score + gain,
                    path
                    + [
                        (
                            src[i - 1]['segment'],
                            tar[j - 2]['segment'] + tar[j - 1]['segment'],
                        )
                    ],
                )

        # Skip a source segment
        if i > 0:
            gain = _sig_skip(src[i - 1])
            if s_mat[i - 1, j] + gain + score >= threshold:
                _retrieve(
                    i - 1,
                    j,
                    score + gain,
                    path + [(src[i - 1]['segment'], '-')],
                )

        # Two source segments against one target segment
        if i > 1 and j > 0:
            gain = _sig_exp(tar[j - 1], src[i - 2], src[i - 1])
            if s_mat[i - 2, j - 1] + gain + score >= threshold:
                _retrieve(
                    i - 2,
                    j - 1,
                    score + gain,
                    path
                    + [
                        (
                            src[i - 2]['segment'] + src[i - 1]['segment'],
                            tar[j - 1]['segment'],
                        )
                    ],
                )

    src = _featurize(src)
    tar = _featurize(tar)
    src_len = len(src)
    tar_len = len(tar)

    s_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)

    if mode == 'global':
        # Leading gaps are charged only in global mode.
        for i in range(1, src_len + 1):
            s_mat[i, 0] = s_mat[i - 1, 0] + _sig_skip(src[i - 1])
        for j in range(1, tar_len + 1):
            s_mat[0, j] = s_mat[0, j - 1] + _sig_skip(tar[j - 1])

    # Local and half-local alignments may restart from zero anywhere.
    floor = 0 if mode in {'local', 'half-local'} else neg_inf
    # Best score on the last row/column (never below 0.0).
    sg_max = 0.0

    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            s_mat[i, j] = max(
                s_mat[i - 1, j] + _sig_skip(src[i - 1]),
                s_mat[i, j - 1] + _sig_skip(tar[j - 1]),
                s_mat[i - 1, j - 1] + _sig_sub(src[i - 1], tar[j - 1]),
                s_mat[i - 1, j - 2]
                + _sig_exp(src[i - 1], tar[j - 2], tar[j - 1])
                if j > 1
                else neg_inf,
                s_mat[i - 2, j - 1]
                + _sig_exp(tar[j - 1], src[i - 2], src[i - 1])
                if i > 1
                else neg_inf,
                floor,
            )
            if (i == src_len or j == tar_len) and s_mat[i, j] > sg_max:
                sg_max = s_mat[i, j]

    if mode in {'global', 'half-local'}:
        dp_score = s_mat[src_len, tar_len]
    elif mode == 'semi-global':
        # Semi-global alignments must end on the last row or column, so
        # the best score is the best border cell. Using the overall
        # matrix maximum here would overstate the score and could leave
        # no border cell at or above the retrieval threshold.
        dp_score = sg_max
    else:
        dp_score = s_mat.max()

    if score_only:
        return dp_score

    threshold = (1 - self._epsilon) * dp_score
    found = []

    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            # Global and half-local alignments end in the final cell only.
            if mode in {'global', 'half-local'} and (
                i < src_len or j < tar_len
            ):
                continue
            # Semi-global alignments end on the last row or column.
            if mode == 'semi-global' and i < src_len and j < tar_len:
                continue
            if s_mat[i, j] >= threshold:
                # Start the path with the unaligned tail of both words.
                path = [
                    ('', tar[k]['segment'])
                    for k in range(tar_len - 1, j - 1, -1)
                ]
                path.extend(
                    (src[k]['segment'], '')
                    for k in range(src_len - 1, i - 1, -1)
                )
                path.append(marker)
                _retrieve(i, j, 0, path)

    # Stable sort: equally scored alignments keep their discovery order.
    return sorted(found, key=lambda item: item[0], reverse=True)
1623 | |||
def sim_score(self, src, tar):
    """Return the ALINE alignment score of two strings.

    The score is the best alignment score reported by
    :py:meth:`alignments`; it is not normalized. Two empty strings are
    treated as a special case and score 1.0.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        ALINE alignment score

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.sim_score('cat', 'hat')
    50.0
    >>> cmp.sim_score('niall', 'neil')
    90.0
    >>> cmp.sim_score('aluminum', 'catalan')
    81.5
    >>> cmp.sim_score('atcg', 'tagc')
    65.0


    .. versionadded:: 0.4.0

    """
    both_empty = src == '' and tar == ''
    if both_empty:
        return 1.0
    best = self.alignments(src, tar, score_only=True)
    return best
1658 | |||
def sim(self, src, tar):
    """Return the normalized ALINE similarity of two strings.

    The alignment score of the pair is divided by the value the
    normalizer (``max`` unless configured otherwise) computes from the
    two self-alignment scores, ``sim_score(src, src)`` and
    ``sim_score(tar, tar)``. A pair whose alignment score is zero has
    similarity 0.0.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Normalized ALINE similarity

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.sim('cat', 'hat')
    0.5882352941176471
    >>> cmp.sim('niall', 'neil')
    0.6666666666666666
    >>> cmp.sim('aluminum', 'catalan')
    0.4075
    >>> cmp.sim('atcg', 'tagc')
    0.5416666666666666


    .. versionadded:: 0.4.0

    """
    raw = self.sim_score(src, tar)
    if not raw:
        # Nothing aligned; also avoids dividing for a zero score.
        return 0.0
    self_scores = [self.sim_score(src, src), self.sim_score(tar, tar)]
    return raw / self._normalizer(self_scores)
1696 | |||
1697 | |||
if __name__ == '__main__':
    # Running the module as a script executes its docstring examples.
    # Whitespace is normalized so that expected output wrapped over
    # several lines in a docstring still matches.
    from doctest import NORMALIZE_WHITESPACE, testmod

    testmod(optionflags=NORMALIZE_WHITESPACE)
1702 |