1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
1 |
|
"""abydos.phonetic._nrl. |
20
|
|
|
|
21
|
|
|
NRL English-to-phoneme algorithm |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import ( |
25
|
|
|
absolute_import, |
26
|
|
|
division, |
27
|
|
|
print_function, |
28
|
|
|
unicode_literals, |
29
|
|
|
) |
30
|
|
|
|
31
|
1 |
|
from re import match as re_match |
32
|
|
|
|
33
|
1 |
|
from ._phonetic import _Phonetic |
34
|
|
|
|
35
|
1 |
|
# Names exported by ``from <this module> import *``.
__all__ = ['NRL', 'nrl']
36
|
|
|
|
37
|
|
|
|
38
|
1 |
|
class NRL(_Phonetic):
    """Naval Research Laboratory English-to-phoneme encoder.

    The letter-to-sound rules used here are those defined by
    :cite:`Elovitz:1976`.
    """

    # Letter-to-sound rules. Each key is the first character of the text a
    # rule consumes; the ' ' entry is the fallback table used by ``encode``
    # for any character that has no table of its own (punctuation etc.).
    #
    # Every rule is a 4-tuple:
    #     (left context, text to match, right context, output phonemes)
    # Rules are tried in order and the first one that applies wins, so more
    # specific rules must precede the catch-all at the end of each table.
    #
    # Context strings are literal text plus these symbols (translated to
    # regular expressions inside ``encode``):
    #     '#'  one or more of A E I O U
    #     ':'  zero or more consonant letters
    #     '^'  exactly one consonant letter
    #     '.'  one of B D V G J L M N T W Z
    #     '%'  one of the suffixes ER, E, ES, ED, ING, ELY
    #     '+'  one of E, I, Y
    #     ' '  edge of the input (start for a left context, end for a right)
    _rules = {
        ' ': (
            ('', ' ', '', ' '),
            ('', '-', '', ''),
            ('.', '\'S', '', 'z'),
            ('#:.E', '\'S', '', 'z'),
            ('#', '\'S', '', 'z'),
            ('', '\'', '', ''),
            ('', ',', '', ' '),
            ('', '.', '', ' '),
            ('', '?', '', ' '),
            ('', '!', '', ' '),
        ),
        'A': (
            ('', 'A', ' ', 'AX'),
            (' ', 'ARE', ' ', 'AAr'),
            (' ', 'AR', 'O', 'AXr'),
            ('', 'AR', '#', 'EHr'),
            ('^', 'AS', '#', 'EYs'),
            ('', 'A', 'WA', 'AX'),
            ('', 'AW', '', 'AO'),
            (' :', 'ANY', '', 'EHnIY'),
            ('', 'A', '^+#', 'EY'),
            ('#:', 'ALLY', '', 'AXlIY'),
            (' ', 'AL', '#', 'AXl'),
            ('', 'AGAIN', '', 'AXgEHn'),
            ('#:', 'AG', 'E', 'IHj'),
            ('', 'A', '^+:#', 'AE'),
            (' :', 'A', '^+ ', 'EY'),
            ('', 'A', '^%', 'EY'),
            (' ', 'ARR', '', 'AXr'),
            ('', 'ARR', '', 'AEr'),
            (' :', 'AR', ' ', 'AAr'),
            ('', 'AR', ' ', 'ER'),
            ('', 'AR', '', 'AAr'),
            ('', 'AIR', '', 'EHr'),
            ('', 'AI', '', 'EY'),
            ('', 'AY', '', 'EY'),
            ('', 'AU', '', 'AO'),
            ('#:', 'AL', ' ', 'AXl'),
            ('#:', 'ALS', ' ', 'AXlz'),
            ('', 'ALK', '', 'AOk'),
            ('', 'AL', '^', 'AOl'),
            (' :', 'ABLE', '', 'EYbAXl'),
            ('', 'ABLE', '', 'AXbAXl'),
            ('', 'ANG', '+', 'EYnj'),
            ('', 'A', '', 'AE'),
        ),
        'B': (
            (' ', 'BE', '^#', 'bIH'),
            ('', 'BEING', '', 'bIYIHNG'),
            (' ', 'BOTH', ' ', 'bOWTH'),
            (' ', 'BUS', '#', 'bIHz'),
            ('', 'BUIL', '', 'bIHl'),
            ('', 'B', '', 'b'),
        ),
        'C': (
            (' ', 'CH', '^', 'k'),
            ('^E', 'CH', '', 'k'),
            ('', 'CH', '', 'CH'),
            (' S', 'CI', '#', 'sAY'),
            ('', 'CI', 'A', 'SH'),
            ('', 'CI', 'O', 'SH'),
            ('', 'CI', 'EN', 'SH'),
            ('', 'C', '+', 's'),
            ('', 'CK', '', 'k'),
            ('', 'COM', '%', 'kAHm'),
            ('', 'C', '', 'k'),
        ),
        'D': (
            ('#:', 'DED', ' ', 'dIHd'),
            ('.E', 'D', ' ', 'd'),
            ('#:^E', 'D', ' ', 't'),
            (' ', 'DE', '^#', 'dIH'),
            (' ', 'DO', ' ', 'dUW'),
            (' ', 'DOES', '', 'dAHz'),
            (' ', 'DOING', '', 'dUWIHNG'),
            (' ', 'DOW', '', 'dAW'),
            ('', 'DU', 'A', 'jUW'),
            ('', 'D', '', 'd'),
        ),
        'E': (
            ('#:', 'E', ' ', ''),
            ('\':^', 'E', ' ', ''),
            (' :', 'E', ' ', 'IY'),
            ('#', 'ED', ' ', 'd'),
            ('#:', 'E', 'D ', ''),
            ('', 'EV', 'ER', 'EHv'),
            ('', 'E', '^%', 'IY'),
            ('', 'ERI', '#', 'IYrIY'),
            ('', 'ERI', '', 'EHrIH'),
            ('#:', 'ER', '#', 'ER'),
            ('', 'ER', '#', 'EHr'),
            ('', 'ER', '', 'ER'),
            (' ', 'EVEN', '', 'IYvEHn'),
            ('#:', 'E', 'W', ''),
            ('T', 'EW', '', 'UW'),
            ('S', 'EW', '', 'UW'),
            ('R', 'EW', '', 'UW'),
            ('D', 'EW', '', 'UW'),
            ('L', 'EW', '', 'UW'),
            ('Z', 'EW', '', 'UW'),
            ('N', 'EW', '', 'UW'),
            ('J', 'EW', '', 'UW'),
            ('TH', 'EW', '', 'UW'),
            ('CH', 'EW', '', 'UW'),
            ('SH', 'EW', '', 'UW'),
            ('', 'EW', '', 'yUW'),
            ('', 'E', 'O', 'IY'),
            ('#:S', 'ES', ' ', 'IHz'),
            ('#:C', 'ES', ' ', 'IHz'),
            ('#:G', 'ES', ' ', 'IHz'),
            ('#:Z', 'ES', ' ', 'IHz'),
            ('#:X', 'ES', ' ', 'IHz'),
            ('#:J', 'ES', ' ', 'IHz'),
            ('#:CH', 'ES', ' ', 'IHz'),
            ('#:SH', 'ES', ' ', 'IHz'),
            ('#:', 'E', 'S ', ''),
            ('#:', 'ELY', ' ', 'lIY'),
            ('#:', 'EMENT', '', 'mEHnt'),
            ('', 'EFUL', '', 'fUHl'),
            ('', 'EE', '', 'IY'),
            ('', 'EARN', '', 'ERn'),
            (' ', 'EAR', '^', 'ER'),
            ('', 'EAD', '', 'EHd'),
            ('#:', 'EA', ' ', 'IYAX'),
            ('', 'EA', 'SU', 'EH'),
            ('', 'EA', '', 'IY'),
            ('', 'EIGH', '', 'EY'),
            ('', 'EI', '', 'IY'),
            (' ', 'EYE', '', 'AY'),
            ('', 'EY', '', 'IY'),
            ('', 'EU', '', 'yUW'),
            ('', 'E', '', 'EH'),
        ),
        'F': (('', 'FUL', '', 'fUHl'), ('', 'F', '', 'f')),
        'G': (
            ('', 'GIV', '', 'gIHv'),
            (' ', 'G', 'I^', 'g'),
            ('', 'GE', 'T', 'gEH'),
            ('SU', 'GGES', '', 'gjEHs'),
            ('', 'GG', '', 'g'),
            (' B#', 'G', '', 'g'),
            ('', 'G', '+', 'j'),
            ('', 'GREAT', '', 'grEYt'),
            ('#', 'GH', '', ''),
            ('', 'G', '', 'g'),
        ),
        'H': (
            (' ', 'HAV', '', 'hAEv'),
            (' ', 'HERE', '', 'hIYr'),
            (' ', 'HOUR', '', 'AWER'),
            ('', 'HOW', '', 'hAW'),
            ('', 'H', '#', 'h'),
            ('', 'H', '', ''),
        ),
        'I': (
            (' ', 'IN', '', 'IHn'),
            (' ', 'I', ' ', 'AY'),
            ('', 'IN', 'D', 'AYn'),
            ('', 'IER', '', 'IYER'),
            ('#:R', 'IED', '', 'IYd'),
            ('', 'IED', ' ', 'AYd'),
            ('', 'IEN', '', 'IYEHn'),
            ('', 'IE', 'T', 'AYEH'),
            (' :', 'I', '%', 'AY'),
            ('', 'I', '%', 'IY'),
            ('', 'IE', '', 'IY'),
            ('', 'I', '^+:#', 'IH'),
            ('', 'IR', '#', 'AYr'),
            ('', 'IZ', '%', 'AYz'),
            ('', 'IS', '%', 'AYz'),
            ('', 'I', 'D%', 'AY'),
            ('+^', 'I', '^+', 'IH'),
            ('', 'I', 'T%', 'AY'),
            ('#:^', 'I', '^+', 'IH'),
            ('', 'I', '^+', 'AY'),
            ('', 'IR', '', 'ER'),
            ('', 'IGH', '', 'AY'),
            ('', 'ILD', '', 'AYld'),
            ('', 'IGN', ' ', 'AYn'),
            ('', 'IGN', '^', 'AYn'),
            ('', 'IGN', '%', 'AYn'),
            ('', 'IQUE', '', 'IYk'),
            ('', 'I', '', 'IH'),
        ),
        'J': (('', 'J', '', 'j'),),
        'K': ((' ', 'K', 'N', ''), ('', 'K', '', 'k')),
        'L': (
            ('', 'LO', 'C#', 'lOW'),
            ('L', 'L', '', ''),
            ('#:^', 'L', '%', 'AXl'),
            ('', 'LEAD', '', 'lIYd'),
            ('', 'L', '', 'l'),
        ),
        'M': (('', 'MOV', '', 'mUWv'), ('', 'M', '', 'm')),
        'N': (
            ('E', 'NG', '+', 'nj'),
            ('', 'NG', 'R', 'NGg'),
            ('', 'NG', '#', 'NGg'),
            ('', 'NGL', '%', 'NGgAXl'),
            ('', 'NG', '', 'NG'),
            ('', 'NK', '', 'NGk'),
            (' ', 'NOW', ' ', 'nAW'),
            ('', 'N', '', 'n'),
        ),
        'O': (
            ('', 'OF', ' ', 'AXv'),
            ('', 'OROUGH', '', 'EROW'),
            ('#:', 'OR', ' ', 'ER'),
            ('#:', 'ORS', ' ', 'ERz'),
            ('', 'OR', '', 'AOr'),
            (' ', 'ONE', '', 'wAHn'),
            ('', 'OW', '', 'OW'),
            (' ', 'OVER', '', 'OWvER'),
            ('', 'OV', '', 'AHv'),
            ('', 'O', '^%', 'OW'),
            ('', 'O', '^EN', 'OW'),
            ('', 'O', '^I#', 'OW'),
            ('', 'OL', 'D', 'OWl'),
            ('', 'OUGHT', '', 'AOt'),
            ('', 'OUGH', '', 'AHf'),
            (' ', 'OU', '', 'AW'),
            ('H', 'OU', 'S#', 'AW'),
            ('', 'OUS', '', 'AXs'),
            ('', 'OUR', '', 'AOr'),
            ('', 'OULD', '', 'UHd'),
            ('^', 'OU', '^L', 'AH'),
            ('', 'OUP', '', 'UWp'),
            ('', 'OU', '', 'AW'),
            ('', 'OY', '', 'OY'),
            ('', 'OING', '', 'OWIHNG'),
            ('', 'OI', '', 'OY'),
            ('', 'OOR', '', 'AOr'),
            ('', 'OOK', '', 'UHk'),
            ('', 'OOD', '', 'UHd'),
            ('', 'OO', '', 'UW'),
            ('', 'O', 'E', 'OW'),
            ('', 'O', ' ', 'OW'),
            ('', 'OA', '', 'OW'),
            (' ', 'ONLY', '', 'OWnlIY'),
            (' ', 'ONCE', '', 'wAHns'),
            ('', 'ON\'T', '', 'OWnt'),
            ('C', 'O', 'N', 'AA'),
            ('', 'O', 'NG', 'AO'),
            (' :^', 'O', 'N', 'AH'),
            ('I', 'ON', '', 'AXn'),
            ('#:', 'ON', ' ', 'AXn'),
            ('#^', 'ON', '', 'AXn'),
            ('', 'O', 'ST ', 'OW'),
            ('', 'OF', '^', 'AOf'),
            ('', 'OTHER', '', 'AHDHER'),
            ('', 'OSS', ' ', 'AOs'),
            ('#:^', 'OM', '', 'AHm'),
            ('', 'O', '', 'AA'),
        ),
        'P': (
            ('', 'PH', '', 'f'),
            ('', 'PEOP', '', 'pIYp'),
            ('', 'POW', '', 'pAW'),
            ('', 'PUT', ' ', 'pUHt'),
            ('', 'P', '', 'p'),
        ),
        'Q': (
            ('', 'QUAR', '', 'kwAOr'),
            ('', 'QU', '', 'kw'),
            ('', 'Q', '', 'k'),
        ),
        'R': ((' ', 'RE', '^#', 'rIY'), ('', 'R', '', 'r')),
        'S': (
            ('', 'SH', '', 'SH'),
            ('#', 'SION', '', 'ZHAXn'),
            ('', 'SOME', '', 'sAHm'),
            ('#', 'SUR', '#', 'ZHER'),
            ('', 'SUR', '#', 'SHER'),
            ('#', 'SU', '#', 'ZHUW'),
            ('#', 'SSU', '#', 'SHUW'),
            ('#', 'SED', ' ', 'zd'),
            ('#', 'S', '#', 'z'),
            ('', 'SAID', '', 'sEHd'),
            ('^', 'SION', '', 'SHAXn'),
            ('', 'S', 'S', ''),
            ('.', 'S', ' ', 'z'),
            ('#:.E', 'S', ' ', 'z'),
            ('#:^##', 'S', ' ', 'z'),
            ('#:^#', 'S', ' ', 's'),
            ('U', 'S', ' ', 's'),
            (' :#', 'S', ' ', 'z'),
            (' ', 'SCH', '', 'sk'),
            ('', 'S', 'C+', ''),
            ('#', 'SM', '', 'zm'),
            ('#', 'SN', '\'', 'zAXn'),
            ('', 'S', '', 's'),
        ),
        'T': (
            (' ', 'THE', ' ', 'DHAX'),
            ('', 'TO', ' ', 'tUW'),
            ('', 'THAT', ' ', 'DHAEt'),
            (' ', 'THIS', ' ', 'DHIHs'),
            (' ', 'THEY', '', 'DHEY'),
            (' ', 'THERE', '', 'DHEHr'),
            ('', 'THER', '', 'DHER'),
            ('', 'THEIR', '', 'DHEHr'),
            (' ', 'THAN', ' ', 'DHAEn'),
            (' ', 'THEM', ' ', 'DHEHm'),
            ('', 'THESE', ' ', 'DHIYz'),
            (' ', 'THEN', '', 'DHEHn'),
            ('', 'THROUGH', '', 'THrUW'),
            ('', 'THOSE', '', 'DHOWz'),
            ('', 'THOUGH', ' ', 'DHOW'),
            (' ', 'THUS', '', 'DHAHs'),
            ('', 'TH', '', 'TH'),
            ('#:', 'TED', ' ', 'tIHd'),
            ('S', 'TI', '#N', 'CH'),
            ('', 'TI', 'O', 'SH'),
            ('', 'TI', 'A', 'SH'),
            ('', 'TIEN', '', 'SHAXn'),
            ('', 'TUR', '#', 'CHER'),
            ('', 'TU', 'A', 'CHUW'),
            (' ', 'TWO', '', 'tUW'),
            ('', 'T', '', 't'),
        ),
        'U': (
            (' ', 'UN', 'I', 'yUWn'),
            (' ', 'UN', '', 'AHn'),
            (' ', 'UPON', '', 'AXpAOn'),
            ('T', 'UR', '#', 'UHr'),
            ('S', 'UR', '#', 'UHr'),
            ('R', 'UR', '#', 'UHr'),
            ('D', 'UR', '#', 'UHr'),
            ('L', 'UR', '#', 'UHr'),
            ('Z', 'UR', '#', 'UHr'),
            ('N', 'UR', '#', 'UHr'),
            ('J', 'UR', '#', 'UHr'),
            ('TH', 'UR', '#', 'UHr'),
            ('CH', 'UR', '#', 'UHr'),
            ('SH', 'UR', '#', 'UHr'),
            ('', 'UR', '#', 'yUHr'),
            ('', 'UR', '', 'ER'),
            ('', 'U', '^ ', 'AH'),
            ('', 'U', '^^', 'AH'),
            ('', 'UY', '', 'AY'),
            (' G', 'U', '#', ''),
            ('G', 'U', '%', ''),
            ('G', 'U', '#', 'w'),
            ('#N', 'U', '', 'yUW'),
            ('T', 'U', '', 'UW'),
            ('S', 'U', '', 'UW'),
            ('R', 'U', '', 'UW'),
            ('D', 'U', '', 'UW'),
            ('L', 'U', '', 'UW'),
            ('Z', 'U', '', 'UW'),
            ('N', 'U', '', 'UW'),
            ('J', 'U', '', 'UW'),
            ('TH', 'U', '', 'UW'),
            ('CH', 'U', '', 'UW'),
            ('SH', 'U', '', 'UW'),
            ('', 'U', '', 'yUW'),
        ),
        'V': (('', 'VIEW', '', 'vyUW'), ('', 'V', '', 'v')),
        'W': (
            (' ', 'WERE', '', 'wER'),
            ('', 'WA', 'S', 'wAA'),
            ('', 'WA', 'T', 'wAA'),
            ('', 'WHERE', '', 'WHEHr'),
            ('', 'WHAT', '', 'WHAAt'),
            ('', 'WHOL', '', 'hOWl'),
            ('', 'WHO', '', 'hUW'),
            ('', 'WH', '', 'WH'),
            ('', 'WAR', '', 'wAOr'),
            ('', 'WOR', '^', 'wER'),
            ('', 'WR', '', 'r'),
            ('', 'W', '', 'w'),
        ),
        'X': (('', 'X', '', 'ks'),),
        'Y': (
            ('', 'YOUNG', '', 'yAHNG'),
            (' ', 'YOU', '', 'yUW'),
            (' ', 'YES', '', 'yEHs'),
            (' ', 'Y', '', 'y'),
            ('#:^', 'Y', ' ', 'IY'),
            ('#:^', 'Y', 'I', 'IY'),
            (' :', 'Y', ' ', 'AY'),
            (' :', 'Y', '#', 'AY'),
            (' :', 'Y', '^+:#', 'IH'),
            (' :', 'Y', '^#', 'AY'),
            ('', 'Y', '', 'IH'),
        ),
        'Z': (('', 'Z', '', 'z'),),
    }

    def encode(self, word):
        """Encode a word with the Naval Research Laboratory rule set.

        The word is upper-cased and scanned left to right. At each position
        the rule table for the current character is searched in order; the
        first rule whose text, left context and right context all match
        contributes its phonemes and advances past the matched text. A
        character for which no rule applies is copied to the output as-is.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The NRL phonetic encoding

        Examples
        --------
        >>> pe = NRL()
        >>> pe.encode('the')
        'DHAX'
        >>> pe.encode('round')
        'rAWnd'
        >>> pe.encode('quick')
        'kwIHk'
        >>> pe.encode('eaten')
        'IYtEHn'
        >>> pe.encode('Smith')
        'smIHTH'
        >>> pe.encode('Larsen')
        'lAArsEHn'

        """

        def _context_regex(pattern, is_left=True):
            """Translate an NRL context string into a regular expression."""
            symbols = {
                '#': '[AEIOU]+',
                ':': '[BCDFGHJKLMNPQRSTVWXYZ]*',
                '^': '[BCDFGHJKLMNPQRSTVWXYZ]',
                '.': '[BDVGJLMNTWZ]',
                '%': '(ER|E|ES|ED|ING|ELY)',
                '+': '[EIY]',
                ' ': '^',
            }
            regex = ''.join(symbols.get(sym, sym) for sym in pattern)

            if is_left:
                # A left context has to end where the matched text begins.
                regex += '$'
                # NOTE(review): this checks the *untranslated* pattern, in
                # which '^' is the single-consonant symbol rather than the
                # regex anchor produced from ' '. Patterns containing that
                # symbol therefore get no '^.*' prefix and, since the
                # result is used with re.match, must match the preceding
                # text in full -- confirm this is intended.
                if '^' in pattern:
                    return regex
                return '^.*' + regex

            # In a right context the only '^' in the translated text comes
            # from a ' ' symbol; there it means end of input, hence '$'.
            regex = '^' + regex.replace('^', '$')
            if '$' in regex:
                return regex
            return regex + '.*$'

        word = word.upper()
        length = len(word)

        pieces = []
        idx = 0
        while idx < length:
            before = word[:idx]
            rest = word[idx:]
            current = word[idx]
            # Characters without a table of their own use the ' ' table.
            table_key = current if current in self._rules else ' '

            for left_ctx, target, right_ctx, phonemes in self._rules[
                table_key
            ]:
                if not rest.startswith(target):
                    continue
                if left_ctx and not re_match(
                    _context_regex(left_ctx, is_left=True), before
                ):
                    continue
                if right_ctx and not re_match(
                    _context_regex(right_ctx, is_left=False),
                    rest[len(target) :],
                ):
                    continue
                pieces.append(phonemes)
                idx += len(target)
                break
            else:
                # No rule applied: pass the character through unchanged.
                pieces.append(current)
                idx += 1

        return ''.join(pieces)
520
|
|
|
|
521
|
|
|
|
522
|
1 |
|
def nrl(word):
    """Encode a word with the Naval Research Laboratory rule set.

    Convenience function that builds an :py:class:`NRL` encoder and calls
    :py:meth:`NRL.encode` on it.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The NRL phonetic encoding

    Examples
    --------
    >>> nrl('the')
    'DHAX'
    >>> nrl('round')
    'rAWnd'
    >>> nrl('quick')
    'kwIHk'
    >>> nrl('eaten')
    'IYtEHn'
    >>> nrl('Smith')
    'smIHTH'
    >>> nrl('Larsen')
    'lAArsEHn'

    """
    encoder = NRL()
    return encoder.encode(word)
554
|
|
|
|
555
|
|
|
|
556
|
|
|
# When run as a script, execute the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
560
|
|
|
|