1
|
|
|
"""Primary techniques for the core functionality of namebot.""" |
2
|
|
|
|
3
|
|
|
from __future__ import absolute_import |
4
|
|
|
from __future__ import division |
5
|
|
|
|
6
|
|
|
from random import choice |
7
|
|
|
from string import ascii_uppercase |
8
|
|
|
from collections import defaultdict |
9
|
|
|
import re |
10
|
|
|
import nltk |
11
|
|
|
|
12
|
|
|
from . import settings as namebot_settings |
13
|
|
|
from . import normalization |
14
|
|
|
|
15
|
|
|
|
16
|
|
|
# Module-level shortcuts for the configurable settings used by the
# techniques below (prefix/suffix tables, alphabet, vowel/consonant
# sets and the shared compiled-regex map).
_prefixes = namebot_settings.PREFIXES
_suffixes = namebot_settings.SUFFIXES
_alphabet = namebot_settings.ALPHABET
_consonants = namebot_settings.CONSONANTS
_vowels = namebot_settings.VOWELS
_regexes = namebot_settings.regexes
22
|
|
|
|
23
|
|
|
|
24
|
|
|
class InsufficientWordsError(Exception):
    """Raised when too few words are supplied for a combination technique."""

    def __init__(self, msg):
        # Delegate to Exception.__init__ so str(exc) and exc.args carry
        # the message; the original stored it on `msg` only, leaving
        # str(exc) empty.
        super(InsufficientWordsError, self).__init__(msg)
        self.msg = msg
27
|
|
|
|
28
|
|
|
|
29
|
|
|
def domainify(words, tld='com'):
    """Convert words into a domain format for testing domains.

    Args:
        words (list): List of words.
        tld (str, optional): The TLD (top-level domain) to use.

    Returns:
        list: The modified list of words.
    """
    _words = []
    if tld.startswith('.'):
        tld = tld.replace('.', '')
    for word in words:
        if tld and word.endswith(tld):
            # Insert the dot before the *trailing* TLD only. The original
            # used str.replace, which rewrites the first occurrence
            # anywhere in the word (e.g. 'comcom' became '.comcom').
            word = '{}.{}'.format(word[:-len(tld)], tld)
        _words.append(word)
    return _words
47
|
|
|
|
48
|
|
|
|
49
|
|
|
def spoonerism(words):
    """Convert a list of words formatted with the spoonerism technique.

    Swaps the first letters of each adjacent pair of words.

    Args:
        words (list) - The list of words to operate on

    Returns:
        words (list) - The updated list of words

    Raises:
        InsufficientWordsError: if fewer than two words are given.

    >>> spoonerism(['foo', 'bar'])
    >>> ['boo far']
    """
    if len(words) < 2:
        raise InsufficientWordsError('Need more than one word to combine')
    results = []
    # First-letter swap: [f]oo [b]ar => boo far
    for pos, first in enumerate(words):
        try:
            second = words[pos + 1]
            results.append('{}{} {}{}'.format(
                second[0],   # 2nd word, 1st letter
                first[1:],   # 1st word, remainder
                first[0],    # 1st word, 1st letter
                second[1:])) # 2nd word, remainder
        except IndexError:
            # Last word has no pair (or a word was empty); skip it.
            continue
    return results
75
|
|
|
|
76
|
|
|
|
77
|
|
|
def kniferism(words):
    """Convert a list of words formatted with the kniferism technique.

    Swaps the middle letters of each adjacent pair of words.

    Args:
        words (list) - The list of words to operate on

    Returns:
        words (list) - The updated list of words

    Raises:
        InsufficientWordsError: if fewer than two words are given.

    >>> kniferism(['foo', 'bar'])
    >>> ['fao bor']
    """
    # Mid-letter swap: f[o]o b[a]r => fao bor
    if len(words) < 2:
        raise InsufficientWordsError('Need more than one word to combine')
    results = []
    for pos, first in enumerate(words):
        try:
            second = words[pos + 1]
            mid_first = len(first) // 2
            mid_second = len(second) // 2
            results.append('{}{}{} {}{}{}'.format(
                first[:mid_first],
                second[mid_second],
                first[mid_first + 1:],
                second[:mid_second],
                first[mid_first],
                second[mid_second + 1:]))
        except IndexError:
            # No next word, or a word too short to index; skip.
            continue
    return results
107
|
|
|
|
108
|
|
|
|
109
|
|
|
def forkerism(words):
    """Convert a list of words formatted with the forkerism technique.

    Swaps the last letters of each adjacent pair of words.

    Args:
        words (list) - The list of words to operate on

    Returns:
        words (list) - The updated list of words

    Raises:
        InsufficientWordsError: if fewer than two words are given.

    >>> forkerism(['foo', 'bar'])
    >>> ['for bao']
    """
    # Last-letter swap: fo[o] ba[r] => for bao
    if len(words) < 2:
        raise InsufficientWordsError('Need more than one word to combine')
    results = []
    for pos, first in enumerate(words):
        try:
            second = words[pos + 1]
            results.append('{}{} {}{}'.format(
                first[:-1],   # 1st word, all but last letter
                second[-1],   # 2nd word, last letter
                second[:-1],  # 2nd word, all but last letter
                first[-1]))   # 1st word, last letter
        except IndexError:
            # No next word, or an empty word; skip this pair.
            continue
    return results
140
|
|
|
|
141
|
|
|
|
142
|
|
|
def reduplication_ablaut(words, count=1, random=True, vowel='e'):
    """A technique to combine words and altering the vowels.

    e.g ch[i]t-ch[a]t, d[i]lly, d[a]lly.
    See http://phrases.org.uk/meanings/reduplication.html.

    Args:
        words (list) - The list of words to operate on.
        count (int, optional) - How many vowels to substitute per word.
        random (bool, optional) - Pick the replacement vowel at random.
        vowel (str, optional) - Replacement vowel when random is False.

    Raises:
        InsufficientWordsError: if fewer than two words are given.
    """
    if len(words) < 2:
        raise InsufficientWordsError('Need more than one word to combine')
    results = []
    replacement = choice(_vowels) if random else vowel
    for original in words:
        ablauted = re.sub(r'a|e|i|o|u', replacement, original, count=count)
        # Skip words where the substitution was a no-op.
        if ablauted != original:
            results.append('{} {}'.format(original, ablauted))
    return results
158
|
|
|
|
159
|
|
|
|
160
|
|
|
def prefixify(words):
    """Apply a prefix technique to a set of words.

    Args:
        words (list) - The list of words to operate on.

    Returns:
        new_arr (list): the updated *fixed words
    """
    results = []
    for word in words:
        if not word:
            continue
        for prefix in _prefixes:
            # Require a consonant at the start of the word or the prefix.
            word_no_vowel = re.search(_regexes['no_vowels'], word[0])
            prefix_no_vowel = re.search(_regexes['no_vowels'], prefix[0])
            if not (word_no_vowel or prefix_no_vowel):
                continue
            # Then require a vowel at the end of the prefix, or at the
            # beginning of the word.
            prefix_ends_vowel = re.search(r'a|e|i|o|u', prefix[-1:])
            word_starts_vowel = re.search(r'^a|e|i|o|u', word[:1])
            if prefix_ends_vowel or word_starts_vowel:
                results.append('{}{}'.format(prefix, word))
    return results
185
|
|
|
|
186
|
|
|
|
187
|
|
|
def suffixify(words):
    """Apply a suffix technique to a set of words.

    Args:
        words (list) - The list of words to operate on.
            (e.g -> chard + ard = chardard -> chard)

    Returns:
        new_arr (list): the updated *fixed words
    """
    new_arr = []
    for word in words:
        if not word:
            continue
        for suffix in _suffixes:
            word_start_vowel = re.search(_regexes['all_vowels'], word[0])
            suffix_start_vowel = re.search(_regexes['all_vowels'], suffix[0])
            if word_start_vowel or suffix_start_vowel:
                # Compare strings with `==`/`!=`, not `is`/`is not`:
                # identity checks on strings only worked by accident of
                # CPython interning.
                if suffix == 'ify':
                    if word[-1] == 'e':
                        if word[-2] != 'i':
                            # Drop a trailing 'e' pair before 'ify'.
                            new_arr.append('{}{}'.format(word[:-2], suffix))
                        else:
                            new_arr.append('{}{}'.format(word[:-1], suffix))
                    new_arr.append(word + suffix)
                else:
                    new_arr.append(word + suffix)
    return new_arr
215
|
|
|
|
216
|
|
|
|
217
|
|
|
def duplifixify(words):
    """Apply a duplifix technique to a set of words (e.g: teeny weeny, etc...).

    Args:
        words (list) - The list of words to operate on.

    Returns:
        new_arr (list): the updated *fixed words
    """
    new_arr = []
    for word in words:
        if not word:
            continue
        for letter in _alphabet:
            # Check that the first letter is NOT the same as the
            # substituted letter, and that the combined word is not a
            # duplicate of the original. Uses `!=`, not `is not`:
            # string identity only worked via CPython interning.
            duplicate_word = '{}{}'.format(letter, word[1:]) == word
            if word[0] != letter and not duplicate_word:
                new_arr.append('{} {}{}'.format(word, letter, word[1:]))
    return new_arr
237
|
|
|
|
238
|
|
|
|
239
|
|
|
def disfixify(words):
    """Apply a disfix technique to a set of words.

    TODO: implement

    Args:
        words (list) - The list of words to operate on.

    Returns:
        new_arr (list): the updated *fixed words
    """
    # Placeholder: no disfix logic exists yet, so no words are produced.
    return []
252
|
|
|
|
253
|
|
|
|
254
|
|
|
def infixify(words):
    """Apply a disfix technique to a set of words.

    TODO: implement

    Args:
        words (list) - The list of words to operate on.

    Returns:
        new_arr (list): the updated *fixed words
    """
    # Placeholder: no infix logic exists yet, so no words are produced.
    return []
267
|
|
|
|
268
|
|
|
|
269
|
|
|
def simulfixify(words, pairs=None, max=5):
    """Generate simulfixed words.

    Args:
        words (list) - List of words to operate on.
        pairs (list, optional) - Simulfix pairs to use for each word.
                                 If not specified, these will be generated
                                 randomly as vowel + consonant strings.
        max (int, optional): The number of simulfix pairs to generate
                             (if pairs is not specified.)
                             NOTE: shadows the builtin `max`; kept for
                             interface compatibility.

    Returns:
        results (list) - The simulfix version of each word,
                         for each simulfix pair.
    """
    results = []
    if pairs is None:
        # Random vowel+consonant pairs, `max` of them.
        pairs = []
        for _ in range(max):
            pairs.append(choice(_vowels) + choice(_consonants))
    for word in words:
        midpoint = len(word) // 2
        for combo in pairs:
            results.append('{}{}{}'.format(
                word[:midpoint], combo, word[midpoint:]))
    return results
294
|
|
|
|
295
|
|
|
|
296
|
|
|
def palindrome(word):
    """Return the word fused with its own mirror image (e.g. 'cat' -> 'cattac')."""
    reversed_word = word[::-1]
    return word + reversed_word
298
|
|
|
|
299
|
|
|
|
300
|
|
|
def palindromes(words):
    """Return the palindrome of each word in `words`."""
    results = []
    for word in words:
        results.append(palindrome(word))
    return results
302
|
|
|
|
303
|
|
|
|
304
|
|
|
def make_founder_product_name(founder1, founder2, product):
    """Get the name of two people forming a company and combine it."""
    first_initial = founder1[0].upper()
    second_initial = founder2[0].upper()
    return '{} & {} {}'.format(first_initial, second_initial, product)
310
|
|
|
|
311
|
|
|
|
312
|
|
|
def make_name_alliteration(word_array, divider=' '):
    """Make an alliterative pairing of words sharing a first letter.

    e.g.
        java jacket
        singing sally
        earth engines
        ...etc

    1. Loop through a given array of words
    2. group by words with the same first letter
    3. combine them and return to new array

    Args:
        word_array (list) - The list of words to operate on.
        divider (str, optional) - Separator placed between the pair.

    Returns:
        new_arr (list): the combined pairs.
    """
    new_arr = []
    word_array = sorted(word_array)
    for word1 in word_array:
        for word2 in word_array:
            # Compare values with `==`/`!=` -- the original used
            # `is`/`is not`, which only worked via string interning and
            # wrongly paired duplicate equal words with themselves.
            if word1[:1] == word2[:1] and word1 != word2:
                new_arr.append(word1 + divider + word2)
    return new_arr
332
|
|
|
|
333
|
|
|
|
334
|
|
|
def make_name_abbreviation(words):
    """Will make some kind of company acronym.

    eg: BASF, AT&T, A&W
    Returns a single string of the new word combined.
    """
    initials = [word[:1].upper() for word in words]
    return ''.join(initials)
341
|
|
|
|
342
|
|
|
|
343
|
|
|
def make_vowel(words, vowel_type, vowel_index):
    """Primary for all Portmanteau generators.

    This creates the portmanteau based on :vowel_index, and :vowel_type.

    The algorithm works as following:

    It looks for the first occurrence of a specified vowel in the first word,
    then gets the matching occurrence (if any) of the second word,
    then determines which should be first or second position, based on
    the ratio of letters (for each word) divided by the position of the vowel
    in question (e.g. c[a]t (2/3) vs. cr[a]te (3/5)).

    The higher number is ordered first, and the two words are then fused
    together by the single matching vowel.
    """
    new_arr = []
    for i in words:
        for j in words:
            is_match_i = re.search(vowel_type, i)
            is_match_j = re.search(vowel_type, j)
            # `!=` rather than `is not`: equal duplicate words must also
            # be skipped, not just the identical object.
            if i != j and is_match_i and is_match_j:
                # get the indices and lengths to use in finding the ratio
                pos_i = i.index(vowel_index)
                len_i = len(i)
                pos_j = j.index(vowel_index)
                len_j = len(j)

                # If starting index is 0,
                # add 1 to it so we're not dividing by zero
                # (`== 0`, not `is 0`: small-int identity is an
                # implementation detail).
                if pos_i == 0:
                    pos_i = 1
                if pos_j == 0:
                    pos_j = 1

                # Decide which word should be the
                # prefix and which should be suffix
                if round(pos_i / len_i) > round(pos_j / len_j):
                    p = i[0: pos_i + 1]
                    p2 = j[pos_j: len(j)]
                    if len(p) + len(p2) > 2:
                        if re.search(
                                _regexes['all_vowels'], p) or re.search(
                                    _regexes['all_vowels'], p2):
                            # Avoid doubling the shared vowel at the seam.
                            if p[-1] == p2[0]:
                                new_arr.append(p[:-1] + p2)
                            else:
                                new_arr.append(p + p2)
    return new_arr
392
|
|
|
|
393
|
|
|
|
394
|
|
|
def make_portmanteau_default_vowel(words):
    """Make a portmanteau based on vowel matches.

    E.g. (ala Brad+Angelina = Brangelina)
    Only matches for second to last letter
    in first word and matching vowel in second word.

    This defers to the make_vowel function for all the internal
    magic, but is a helper in that it provides all types of vowel
    combinations in one function.
    """
    results = []
    # Same patterns as before: one single-vowel regex per vowel.
    for vowel in ('a', 'e', 'i', 'o', 'u'):
        pattern = re.compile(vowel + '{1}')
        results += make_vowel(words, pattern, vowel)
    return results
418
|
|
|
|
419
|
|
|
|
420
|
|
|
def make_portmanteau_split(words):
    """Make a portmeanteau, split by vowel/consonant combos.

    Based on the word formation of nikon: [ni]pp[on] go[k]aku,
    which is comprised of Nippon + Gokaku.

    We get the first C+V in the first word,
    then the last V+C in the second word,
    then join them around each consonant run of the first word.
    """
    new_arr = []
    for i in words:
        for j in words:
            # `!=`, not `is not`: equal duplicate words must be skipped.
            if i != j:
                # The original character classes were written as
                # [a|e|i|o|u{1}], which wrongly matched '|', '{', '1'
                # and '}' as literal characters; [aeiou] is the intent.
                l1 = re.search(r'[^aeiou]+[aeiou]', i)
                l2 = re.search(r'[aeiou]+[^aeiou]$', j)
                if l1 and l2:
                    l1 = l1.group(0)
                    l2 = l2.group(0)
                    # Middle pieces: consonant runs from splitting the
                    # first word on its vowels.
                    middles = re.split(r'[aeiou]', i)
                    if middles:
                        for middle in middles:
                            new_arr.append(l1 + middle + l2)
                    else:
                        # Defensive fallback; re.split never returns an
                        # empty list, so this branch is unreachable in
                        # practice but kept from the original.
                        new_arr.append('{}{}{}'.format(l1, 't', l2))
                        new_arr.append('{}{}{}'.format(l1, 's', l2))
                        new_arr.append('{}{}{}'.format(l1, 'z', l2))
                        new_arr.append('{}{}{}'.format(l1, 'x', l2))
    return new_arr
451
|
|
|
|
452
|
|
|
|
453
|
|
|
def make_punctuator(words, replace):
    """Put some hyphens or dots, or a given punctutation.

    Works via :replace in the word, but only around vowels ala "del.ic.ious"
    """
    def _punctuate(symbol):
        # Insert `symbol` after every occurrence of `replace`.
        return [word.replace(replace, replace + symbol) for word in words]

    return _punctuate('-') + _punctuate('.')
465
|
|
|
|
466
|
|
|
|
467
|
|
|
def make_punctuator_vowels(words):
    """Helper function that combines all possible combinations for vowels."""
    results = []
    for vowel in ('a', 'e', 'i', 'o', 'u'):
        results += make_punctuator(words, vowel)
    return results
476
|
|
|
|
477
|
|
|
|
478
|
|
|
def make_vowelify(words):
    """Chop off consonant ala nautica if second to last letter is a vowel."""
    results = []
    for word in words:
        # NOTE(review): the search actually scans all but the last two
        # characters for any vowel -- confirm against intent.
        if re.search(_regexes['all_vowels'], word[:-2]):
            results.append(word[:-1])
    return results
485
|
|
|
|
486
|
|
|
|
487
|
|
|
def make_misspelling(words):
    """Misspell a word in numerous ways, to create interesting results.

    Each (old, new) token pair below is applied independently to a fresh
    copy of every word via str.replace, producing one candidate per
    pair; unchanged copies are collapsed by uniquify() at the end.
    """
    # Ordered replacement pairs. Order matters only for the final list
    # ordering, since each pair operates on the original word.
    token_groups = (
        ('ics', 'ix'),
        ('ph', 'f'),
        ('kew', 'cue'),
        ('f', 'ph'),
        ('o', 'ough'),
        # these seem to have
        # sucked in practice
        ('o', 'off'),
        ('ow', 'o'),
        ('x', 'ecks'),
        ('za', 'xa'),
        ('xa', 'za'),
        ('ze', 'xe'),
        ('xe', 'ze'),
        ('zi', 'xi'),
        ('xi', 'zi'),
        ('zo', 'xo'),
        ('xo', 'zo'),
        ('zu', 'xu'),
        ('xu', 'zu'),
        # number based
        ('one', '1'),
        ('1', 'one'),
        ('two', '2'),
        ('2', 'two'),
        ('three', '3'),
        ('3', 'three'),
        ('four', '4'),
        ('4', 'four'),
        ('five', '5'),
        ('5', 'five'),
        ('six', '6'),
        ('6', 'six'),
        ('seven', '7'),
        ('7', 'seven'),
        ('eight', '8'),
        ('8', 'eight'),
        ('nine', '9'),
        ('9', 'nine'),
        ('ten', '10'),
        ('10', 'ten'),
        ('ecks', 'x'),
        ('spir', 'speer'),
        ('speer', 'spir'),
        ('x', 'ex'),
        ('on', 'awn'),
        ('ow', 'owoo'),
        ('awn', 'on'),
        ('awf', 'off'),
        ('s', 'z'),
        ('ce', 'ze'),
        ('ss', 'zz'),
        ('ku', 'koo'),
        ('trate', 'trait'),
        ('trait', 'trate'),
        ('ance', 'anz'),
        ('il', 'yll'),
        ('ice', 'ize'),
        ('chr', 'kr'),
        # These should only be at end of word!
        ('er', 'r'),
        ('lee', 'ly'),
    )
    new_arr = []
    for word in words:
        for tokens in token_groups:
            # One substitution pass per pair, always from the original word.
            new_arr.append(word.replace(*tokens))
    return normalization.uniquify(new_arr)
558
|
|
|
|
559
|
|
|
|
560
|
|
|
def _pig_latinize(word, postfix='ay'): |
561
|
|
|
"""Generates standard pig latin style, |
562
|
|
|
with customizeable postfix argument""" |
563
|
|
|
# Common postfixes: ['ay', 'yay', 'way'] |
564
|
|
|
if not type(postfix) is str: |
565
|
|
|
raise TypeError('Must use a string for postfix.') |
566
|
|
|
|
567
|
|
|
piggified = None |
568
|
|
|
|
569
|
|
|
vowel_re = re.compile(r'(a|e|i|o|u)') |
570
|
|
|
first_letter = word[0:1] |
571
|
|
|
|
572
|
|
|
# clean up non letters |
573
|
|
|
word = word.replace(r'[^a-zA-Z]', '') |
574
|
|
|
|
575
|
|
|
if vowel_re.match(first_letter): |
576
|
|
|
piggified = word + 'way' |
577
|
|
|
else: |
578
|
|
|
piggified = ''.join([word[1: len(word)], first_letter, postfix]) |
579
|
|
|
return piggified |
580
|
|
|
|
581
|
|
|
|
582
|
|
|
def pig_latinize(words, postfix='ay'):
    """Pig-latinize each word in `words`, using `postfix` as the suffix."""
    latinized = []
    for word in words:
        latinized.append(_pig_latinize(word, postfix=postfix))
    return latinized
584
|
|
|
|
585
|
|
|
|
586
|
|
|
def acronym_lastname(description, lastname):
    """Inspiration: ALFA Romeo"""
    # Acronym of the description's significant words + the last name.
    significant = normalization.remove_stop_words(description.split(' '))
    initials = ''.join([token[0].upper() for token in significant])
    return '{} {}'.format(initials, lastname)
591
|
|
|
|
592
|
|
|
|
593
|
|
|
def get_descriptors(words):
    """Group words by their part-of-speech tag.

    Uses NLTK to tokenize the joined words, then part-of-speech tags
    them (in isolation), returning a dict mapping each POS tag to the
    list of words that carried it, for later retrieval and usage.
    """
    pos_groups = defaultdict(list)
    tagged = nltk.pos_tag(nltk.word_tokenize(' '.join(words)))
    # Push each word into the bucket for its tag.
    for token, tag in tagged:
        pos_groups[tag].append(token)
    return pos_groups
607
|
|
|
|
608
|
|
|
|
609
|
|
|
def _add_pos_subtypes(nouns, verbs): |
610
|
|
|
"""Combine alternating verbs and nouns into a new list. |
611
|
|
|
|
612
|
|
|
Args: |
613
|
|
|
nouns (list) - List of nouns, noun phrases, etc... |
614
|
|
|
verbs (list) - List of verbs, verb phrases, etc... |
615
|
|
|
|
616
|
|
|
Returns: |
617
|
|
|
words (list) - The newly combined list |
618
|
|
|
""" |
619
|
|
|
words = [] |
620
|
|
|
try: |
621
|
|
|
for noun in nouns: |
622
|
|
|
for verb in verbs: |
623
|
|
|
words.append('{} {}'.format(noun, verb)) |
624
|
|
|
words.append('{} {}'.format(verb, noun)) |
625
|
|
|
except KeyError: |
626
|
|
|
pass |
627
|
|
|
return words |
628
|
|
|
|
629
|
|
|
|
630
|
|
|
def _create_pos_subtypes(words):
    """Combine noun-phrases with verb/adverb groups when present.

    When the POS map contains 'NNP', combinations of words['NNP'] with
    each of the VBP, VB and RB groups (those that exist) are added.

    Args:
        words (dict) - Mapping of POS tag -> list of words.

    Returns:
        new_words (list) - The newly combined list
    """
    new_words = []
    available = words.keys()
    if 'NNP' in available:
        for tag in ('VBP', 'VB', 'RB'):
            if tag in available:
                new_words += _add_pos_subtypes(words['NNP'], words[tag])
    return new_words
651
|
|
|
|
652
|
|
|
|
653
|
|
|
def make_descriptors(words):
    """Make descriptor names.

    Based from a verb + noun, adjective + noun combination.
    Examples:
        -Pop Cap,
        -Big Fish,
        -Red Fin,
        -Cold Water (grill), etc...
    Combines VBP/VB/RB, with NN/NNS
    """
    combos = _create_pos_subtypes(words)
    # De-duplicate; ordering is not preserved.
    return list(set(combos))
665
|
|
|
|
666
|
|
|
|
667
|
|
|
def all_prefix_first_vowel(word, letters=None):
    """Find the first vowel in a word, remove all letters before it, and
    prefix the remainder with each consonant in `letters`.

    Args:
        word (str) - the word to update
        letters (list, optional) - the letters to use for prefixing;
            defaults to all uppercase ASCII letters.

    Returns:
        words (list) - All prefixed words
    """
    if letters is None:
        # Build the default per call: a mutable default argument would
        # share one list across all calls.
        letters = list(ascii_uppercase)
    re_vowels = re.compile(r'[aeiouy]')
    matches = re.search(re_vowels, word)
    if matches is None:
        # No vowel at all: return the word untouched.
        return [word]
    words = []
    vowels = frozenset(['A', 'E', 'I', 'O', 'U'])
    first_match = matches.start(0)
    for letter in letters:
        if letter not in vowels:
            # If beginning letter is a vowel, don't offset the index
            if first_match == 0:
                words.append('{}{}'.format(letter, word))
            else:
                words.append('{}{}'.format(letter, word[first_match:]))
    return words
694
|
|
|
|
695
|
|
|
|
696
|
|
|
def recycle(words, func, times=2):
    """Run a set of words through `func` repeatedly, `times` times,
    feeding each output back in as the next input.

    `words` must be a list, and `func` must return a list.
    """
    result = words
    for _ in range(times):
        result = func(result)
    return result
703
|
|
|
|
704
|
|
|
|
705
|
|
|
def super_scrub(data):
    """Run words through a comprehensive list of filtering functions.

    Expects a dictionary with key "words"
    """
    for technique, candidates in data['words'].items():
        # Sort/clean, drop odd-sounding words, then de-duplicate.
        cleaned = normalization.clean_sort(candidates)
        cleaned = normalization.remove_odd_sounding_words(cleaned)
        data['words'][technique] = normalization.uniquify(cleaned)
    return data
716
|
|
|
|
717
|
|
|
|
718
|
|
|
def generate_all_techniques(words):
    """Generate all techniques across the library in one place.

    Args:
        words (list) - The list of words to operate on.

    Returns:
        dict: The scrubbed {'words': {technique: [results]}} mapping.
    """
    data = {
        'words': {
            # NOTE: the original dict literal listed 'alliterations'
            # twice; duplicate dict keys silently collapse, so it is
            # declared once here.
            'alliterations': make_name_alliteration(words),
            'portmanteau': make_portmanteau_default_vowel(words),
            'vowels': make_vowelify(words),
            'suffix': suffixify(words),
            'prefix': prefixify(words),
            'duplifix': duplifixify(words),
            'disfix': disfixify(words),
            'infix': infixify(words),
            'simulfix': simulfixify(words),
            'founder_product_name': make_founder_product_name(
                'Lindsey', 'Chris', 'Widgets'),
            'punctuator': make_punctuator_vowels(words),
            'name_abbreviation': make_name_abbreviation(words),
            'make_portmanteau_split': make_portmanteau_split(words),
            'forkerism': forkerism(words),
            'kniferism': kniferism(words),
            'spoonerism': spoonerism(words),
            'palindrome': palindromes(words),
            'reduplication_ablaut': reduplication_ablaut(words),
            'misspelling': make_misspelling(words),
            'descriptors': make_descriptors(
                get_descriptors(words))
        }
    }
    return super_scrub(data)
748
|
|
|
|