Completed
Pull Request — master (#225)
by Chris
09:15
created

abydos.phonetic._ainsworth   A

Complexity

Total Complexity 5

Size/Duplication

Total Lines 300
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 5
eloc 222
dl 0
loc 300
ccs 20
cts 20
cp 1
rs 10
c 0
b 0
f 0

1 Method

Rating   Name   Duplication   Size   Complexity  
A Ainsworth.encode() 0 45 5
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2019 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._ainsworth.
20
21
Ainsworth's grapheme to phoneme converter
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
import re
32
33 1
from ._phonetic import _Phonetic
34
35 1
__all__ = ['Ainsworth']
36
37
38 1
class Ainsworth(_Phonetic):
    """Ainsworth's grapheme to phoneme converter.

    Based on the ruleset listed in :cite:`Ainsworth:1973`.

    The converter is a table of context-sensitive rewrite rules. Each rule
    is a ``(pattern, replacement, length)`` tuple:

    - ``pattern`` is a compiled regular expression that is matched at the
      current position of the (lowercased) word; left and right context
      are expressed as lookbehind and lookahead assertions, so they are
      inspected but not consumed,
    - ``replacement`` is the IPA string emitted when the rule fires,
    - ``length`` is the number of input characters the rule consumes.

    Rules are tried in list order and the first match wins, so for any
    given letter the more specific rules must precede its catch-all rule.

    .. versionadded:: 0.4.1
    """

    # Alternation of common English suffixes, used as right context by the
    # 'a(?=ng...)' and 'g(?=...)' rules below.
    _suffixes = (
        '('
        + '|'.join(
            [
                'able',
                'ance',
                'ence',
                'less',
                'ment',
                'ness',
                'ship',
                'sion',
                'tion',
                'age',
                'ant',
                'ate',
                'ent',
                'ery',
                'ful',
                'ify',
                'ise',
                'ism',
                'ity',
                'ive',
                'ize',
                'ous',
                'al',
                'cy',
                'en',
                'er',
                'es',
                'fy',
                'ry',
                's',
                'y',
            ]
        )
        + ')'
    )

    # Ordered rewrite rules; see the class docstring for the tuple layout.
    #
    # Anchoring conventions (the patterns are applied with
    # ``pattern.match(word, pos)``):
    # - '^' only matches at the real start of the word, so "word-initial
    #   left context" has to be written *inside* the lookbehind,
    #   i.e. '(?<=^x)y' rather than '^(?<=x)y' (which can never match).
    # - "right context that ends the word" has to be written inside the
    #   lookahead, i.e. 'x(?=y$)' rather than 'x(?=y)$' (which can never
    #   match, because the lookahead requires more text where '$' requires
    #   the end of the word).
    _rules = [
        (re.compile('^a$'), 'ə', 1),
        (re.compile('^are'), 'ɑ', 3),
        # NOTE(review): this rule fires for every 'ai', so the later 'ai'
        # rule never matches. Order kept as transcribed -- confirm the
        # intended context against Ainsworth (1973).
        (re.compile('a(?=[ei])'), 'ɛi', 1),
        (re.compile('ar'), 'ɑ', 2),
        (re.compile('a(?=sk)'), 'ɑ', 1),
        (re.compile('a(?=st)'), 'ɑ', 1),
        (re.compile('a(?=th)'), 'ɑ', 1),
        (re.compile('a(?=ft)'), 'ɑ', 1),
        (re.compile('ai'), 'ɛi', 2),
        (re.compile('ay'), 'ɛi', 2),
        (re.compile('aw'), 'ɔ', 2),
        (re.compile('au'), 'ɔ', 2),
        (re.compile('al(?=l)'), 'ɔ', 2),
        (re.compile('a(?=ble)'), 'ɛi', 1),
        (re.compile('a(?=ng' + _suffixes + ')'), 'ɛi', 1),
        (re.compile('a'), 'æ', 1),
        (re.compile('b'), 'b', 1),
        (re.compile('ch'), 'tʃ', 2),
        (re.compile('ck'), 'k', 2),
        (re.compile('c(?=y)'), 's', 1),
        (re.compile('c(?=e)'), 's', 1),
        (re.compile('c(?=i)'), 's', 1),
        (re.compile('c'), 'k', 1),
        (re.compile('d'), 'd', 1),
        (re.compile('(?<=[aeiou][bcdfghjklmnpqrstvwxyz])e$'), '', 1),
        (re.compile('(?<=th)e$'), 'ə', 1),
        # Word-initial consonant + 'e' (anchor moved inside the lookbehind).
        (re.compile('(?<=^[bcdfghjklmnpqrstvwxyz])e$'), 'i', 1),
        (re.compile('(?<=^[bcdfghjklmnpqrstvwxyz])e(?=d)'), 'ɛ', 1),
        (re.compile('o(?=ld)'), 'əʊ', 1),
        (re.compile('oy'), 'ɔi', 2),
        (re.compile('o(?=ing)'), 'əʊ', 1),
        (re.compile('oi'), 'ɔi', 2),
        (re.compile('(?<=y)ou'), 'u', 2),
        (re.compile('ou(?=s)'), 'ʌ', 2),
        (re.compile('ough(?=t)'), 'ɔ', 4),
        (re.compile('(?<=b)ough'), 'aʊ', 4),
        (re.compile('(?<=t)ough'), 'ʌf', 4),
        (re.compile('(?<=c)ough'), 'of', 4),
        # Word-initial 'r' + 'ough' (anchor moved inside the lookbehind);
        # must precede the unanchored '(?<=r)ough' rule.
        (re.compile('(?<=^r)ough'), 'ʌf', 4),
        (re.compile('(?<=r)ough'), 'ʊ', 4),
        (re.compile('ough'), 'əʊ', 4),
        (re.compile('oul(?=d)'), 'ʊ', 3),
        (re.compile('ou'), 'aʊ', 2),
        (re.compile('oor'), 'ɔ', 3),
        (re.compile('oo(?=k)'), 'ʊ', 2),
        (re.compile('(?<=f)oo(?=d)'), 'u', 2),
        (re.compile('oo(?=d)'), 'ʊ', 2),
        (re.compile('(?<=f)oo(?=t)'), 'ʊ', 2),
        (re.compile('(?<=s)oo(?=t)'), 'ʊ', 2),
        (re.compile('(?<=w)oo'), 'ʊ', 2),
        (re.compile('oo'), 'u', 2),
        (re.compile('(?<=sh)oe'), 'u', 2),
        (re.compile('oe'), 'əʊ', 2),
        # Word-final '-ed' / '-er' ('$' moved inside the lookahead).
        (re.compile('(?<=[aeiou][bcdfghjklmnpqrstvwxyz]d)e(?=d$)'), 'ə', 1),
        (re.compile('(?<=[aeiou][bcdfghjklmnpqrstvwxyz]t)e(?=d$)'), 'ə', 1),
        (re.compile('(?<=[aeiou][bcdfghjklmnpqrstvwxyz])e(?=d$)'), '', 1),
        (re.compile('e(?=r$)'), 'ə', 1),
        (re.compile('(?<=wh)ere'), 'ɛə', 3),
        (re.compile('(?<=h)ere'), 'iə', 3),
        (re.compile('(?<=w)ere'), 'ɜ', 3),
        (re.compile('ere'), 'ir', 3),
        (re.compile('ee'), 'i', 2),
        (re.compile('ear'), 'ir', 3),
        (re.compile('ea'), 'i', 2),
        (re.compile('e(?=ver)'), 'ɛ', 1),
        (re.compile('eye'), 'ɑi', 3),
        # NOTE(review): this rule fires for every 'ei', so the two 'ei'
        # rules that follow never match. Order kept as transcribed --
        # confirm the intended context against Ainsworth (1973).
        (re.compile('e(?=[ei])'), 'i', 1),
        (re.compile('(?<=c)ei'), 'i', 2),
        (re.compile('ei'), 'ɑi', 2),
        (re.compile('e(?=r)'), 'ɜ', 1),
        (re.compile('eo'), 'i', 2),
        (re.compile('ew'), 'ju', 2),
        (re.compile('e(?=u)'), '', 1),
        (re.compile('e'), 'ɛ', 1),
        (re.compile('f$'), 'v', 1),
        (re.compile('f'), 'f', 1),
        # Word-final '-ge' / '-ges' ('$' moved inside the lookahead); these
        # must precede the suffix rule, whose 'es' alternative would
        # otherwise claim '-ges'.
        (re.compile('g(?=e$)'), 'dʒ', 1),
        (re.compile('g(?=es$)'), 'dʒ', 1),
        (re.compile('g(?=' + _suffixes + ')'), 'g', 1),
        (re.compile('g(?=i)'), 'dʒ', 1),
        (re.compile('g(?=et)'), 'g', 1),
        (re.compile('(?<=c)ow'), 'ɑʊ', 2),
        (re.compile('(?<=h)ow'), 'ɑʊ', 2),
        (re.compile('(?<=n)ow'), 'ɑʊ', 2),
        (re.compile('(?<=v)ow'), 'ɑʊ', 2),
        (re.compile('(?<=r)ow'), 'ɑʊ', 2),
        (re.compile('ow'), 'əʊ', 2),
        (re.compile('(?<=g)o$'), 'əʊ', 1),
        (re.compile('(?<=n)o$'), 'əʊ', 1),
        (re.compile('(?<=s)o$'), 'əʊ', 1),
        (re.compile('o$'), 'u', 1),
        # These three rules were previously listed after the catch-all 'o'
        # rule and therefore could never fire; they now precede it.
        # NOTE(review): 'o(?=[ei])' is still shadowed by the earlier 'oi',
        # 'oe' and 'o(?=ing)' rules -- confirm the intended context against
        # Ainsworth (1973).
        (re.compile('or'), 'ɔ', 2),
        (re.compile('o(?=[ei])'), 'əʊ', 1),
        (re.compile('oa'), 'əʊ', 2),
        (re.compile('o'), 'o', 1),
        (re.compile('ph'), 'f', 2),
        (re.compile('psy'), 'sɑi', 3),
        (re.compile('p'), 'p', 1),
        (re.compile('q'), 'kw', 1),
        (re.compile('r$'), '', 1),
        (re.compile('rho'), 'rəʊ', 3),
        (re.compile('r'), 'r', 1),
        (re.compile('sh'), 'ʃ', 2),
        (re.compile('ss'), 's', 2),
        (re.compile('sch'), 'sk', 3),
        # NOTE(review): the word is lowercased before matching, so the
        # literal 'X' in this lookbehind can never match; 'Xv' looks like
        # an untranslated placeholder from the source table -- confirm.
        (re.compile('(?<=Xv)s'), 'z', 1),
        (re.compile('(?<=[aeiou])s$'), 'z', 1),
        (re.compile('s'), 's', 1),
        (re.compile('there'), 'ðɛə', 5),
        (re.compile('g(?=e)'), 'dʒ', 1),
        (re.compile('gh'), 'g', 2),
        (re.compile('g'), 'g', 1),
        (re.compile('(?<=w)h'), '', 1),
        (re.compile('ha(?=v)'), 'hæ', 2),
        (re.compile('h'), 'h', 1),
        (re.compile('^i$'), 'ɑi', 1),
        (re.compile('i(?=ty)'), 'ɪ', 1),
        # NOTE(review): this rule fires for every 'ie', so the two 'ie'
        # rules further down never match. Order kept as transcribed --
        # confirm the intended context against Ainsworth (1973).
        (re.compile('i(?=[ei])'), 'ɑi', 1),
        (re.compile('ir'), 'ɜ', 2),
        (re.compile('igh'), 'ɑi', 3),
        (re.compile('(?<=t)io(?=n)'), 'ʌ', 2),
        (re.compile('i(?=nd)'), 'ɑi', 1),
        (re.compile('i(?=ld)'), 'ɑi', 1),
        # Word-initial consonant + 'ie' (anchor moved inside the lookbehind).
        (re.compile('(?<=^[bcdfghjklmnpqrstvwxyz])ie'), 'ɑi', 2),
        (re.compile('(?<=[aeiou][bcdfghjklmnpqrstvwxyz])ie'), 'i', 2),
        (re.compile('i'), 'ɪ', 1),
        (re.compile('j'), 'dʒ', 1),
        (re.compile('^k(?=n)'), '', 1),
        (re.compile('k'), 'k', 1),
        (re.compile('le$'), 'əl', 2),
        (re.compile('l'), 'l', 1),
        (re.compile('m'), 'm', 1),
        (re.compile('n(?=g)'), 'ŋ', 1),
        (re.compile('n'), 'n', 1),
        (re.compile('their'), 'ðɛə', 5),
        (re.compile('th(?=r)'), 'θ', 2),
        (re.compile('th'), 'ð', 2),
        (re.compile('t(?=ion)'), 'ʃ', 1),
        (re.compile('t'), 't', 1),
        (re.compile('u(?=pon)'), 'ʌ', 1),
        (re.compile('u(?=[aeiou])'), 'u', 1),
        # 'u' before a word-final consonant ('$' moved inside the lookahead).
        (re.compile('u(?=[bcdfghjklmnpqrstvwxyz]$)'), 'ʌ', 1),
        (re.compile('(?<=r)u'), 'u', 1),
        (re.compile('(?<=l)u'), 'u', 1),
        (re.compile('u'), 'ju', 1),
        (re.compile('v'), 'v', 1),
        (re.compile('w(?=r)'), '', 1),
        (re.compile('wh(?=o)'), 'h', 2),
        (re.compile('wha(?=t)'), 'wo', 3),
        (re.compile('wa'), 'wo', 2),
        (re.compile('wo(?=r)'), 'wɜ', 2),
        (re.compile('w'), 'w', 1),
        (re.compile('x'), 'ks', 1),
        (re.compile('^y'), 'j', 1),
        (re.compile('(?<=[aeiou][bcdfghjklmnpqrstvwxyz])y'), 'ɪ', 1),
        # Word-initial consonant + 'y' (anchor moved inside the lookbehind).
        (re.compile('(?<=^[bcdfghjklmnpqrstvwxyz])y'), 'ɑi', 1),
        (re.compile('y(?=[ei])'), 'ɑi', 1),
        (re.compile('y'), 'ɪ', 1),
        (re.compile('z'), 'z', 1),
    ]

    def encode(self, word):
        """Return the phonemic representation of a word.

        The word is lowercased and scanned left to right. At each position
        the rules in ``_rules`` are tried in order; the first rule whose
        pattern matches there contributes its replacement and advances the
        scan by that rule's consumed length.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The phonemic representation in IPA

        Examples
        --------
        >>> pe = Ainsworth()
        >>> pe.encode('Christopher')
        'tʃrɪstofə'
        >>> pe.encode('Niall')
        'nɪɔl'
        >>> pe.encode('Smith')
        'smɪð'
        >>> pe.encode('Schmidt')
        'skmɪdt'


        .. versionadded:: 0.4.1

        """
        # The rule patterns are written in lowercase only.
        word = word.lower()
        pron = []

        pos = 0
        while pos < len(word):
            for rule, repl, matchlen in self._rules:
                # match(word, pos) anchors the attempt at pos while still
                # letting lookbehind assertions see the preceding text.
                if rule.match(word, pos):
                    pron.append(repl)
                    pos += matchlen
                    break
            else:
                # Every letter a-z has a catch-all rule, so only characters
                # outside a-z reach this branch; skip them without output.
                pos += 1

        return ''.join(pron)
294
295
296
if __name__ == '__main__':
    # Run as a script: execute the usage examples embedded in the
    # docstrings of this module as doctests.
    from doctest import testmod

    testmod()
300