1
|
|
|
"""Provides various scoring methods for word strength.""" |
2
|
|
|
|
3
|
|
|
import fuzzy |
4
|
|
|
import re |
5
|
|
|
|
6
|
|
|
|
7
|
|
|
def score_dmetaphone(words):
    """Pair each word with its double metaphone encoding.

    Args:
        words (list) - the list of words.
    Returns:
        scores (list) - one '<lowercased word>: <encoding>' string per word,
            where the encoding is whatever calling a `fuzzy.DMetaphone`
            instance on the original (non-lowercased) word returns.
    """
    # A single encoder instance is shared by every word.
    encode = fuzzy.DMetaphone()
    return ['{}: {}'.format(entry.lower(), encode(entry)) for entry in words]
21
|
|
|
|
22
|
|
|
|
23
|
|
|
def score_soundex(words):
    """Pair each word with its soundex encoding.

    Args:
        words (list) - the list of words.
    Returns:
        scores (list) - one '<lowercased word>: <encoding>' string per word,
            where the encoding is whatever calling a `fuzzy.Soundex`
            instance on the original (non-lowercased) word returns.
    """
    # The encoder is built once with the argument 4
    # (presumably the soundex code length -- confirm against the fuzzy docs).
    encode = fuzzy.Soundex(4)
    return ['{}: {}'.format(entry.lower(), encode(entry)) for entry in words]
37
|
|
|
|
38
|
|
|
|
39
|
|
|
def score_nysiis(words):
    """Pair each word with its nysiis encoding.

    Args:
        words (list) - the list of words.
    Returns:
        scores (list) - one '<lowercased word>: <encoding>' string per word,
            where the encoding is the result of `fuzzy.nysiis` applied to
            the original (non-lowercased) word.
    """
    return [
        '{}: {}'.format(entry.lower(), fuzzy.nysiis(entry))
        for entry in words
    ]
52
|
|
|
|
53
|
|
|
|
54
|
|
|
def score_length(word):
    """Grade the length of a word on a 1-5 scale (0 when there is no word).

    Mid-length words get the best grade; very short and very long
    words are graded lower:

        5-9 characters    -> 5
        10-15 characters  -> 4
        1-4 characters    -> 3
        16-20 characters  -> 2
        21+ characters    -> 1

    Args:
        word (string) - the name
    Returns:
        score (int) - the length score, or 0 for an empty or missing word
    """
    # Falsy input (None, '') is treated as having no length at all.
    size = len(word) if word else 0
    if size == 0:
        return 0
    # Walk the thresholds in ascending order; each test only needs an
    # upper bound because the previous ones already excluded shorter words.
    if size <= 4:
        return 3
    if size < 10:
        return 5
    if size <= 15:
        return 4
    if size <= 20:
        return 2
    return 1
79
|
|
|
|
80
|
|
|
|
81
|
|
|
def bounded(num, start, end):
    """Tell whether `num` lies in the closed interval [`start`, `end`].

    Both endpoints are inclusive.
    """
    return start <= num <= end
84
|
|
|
|
85
|
|
|
|
86
|
|
|
def score_pronounceability(word):
    """Get the ratio of vowels to consonants, a very basic measurement.

    Half vowels and half consonants indicates a highly pronounceable word.
    For example, 0.5 / 0.5 = 1.0, so one is perfect, and lower is worse.

    The ratio is always the smaller count divided by the larger one, so it
    lies between 0.0 and 1.0. Only ASCII letters and digits are considered;
    matching is case-insensitive, and every kept character that is not one
    of a/e/i/o/u (digits included) counts on the consonant side.

    The 1-5 scale translation (boundaries belong to the band shown):
    ---------------------------------------------------------------
    ratio   0.0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9  1.0
    ---------------------------------------------------------------
    score    0    1    2    3    4    5    4    3    2    1    5
    ---------------------------------------------------------------

    Args:
        word (string) - the name
    Returns:
        score (int) - the final pronounceability score; 0 when the word is
            empty, or has no vowels or no consonants after filtering.
    """
    if not word:
        return 0
    # Lowercase after filtering so 'A', 'E', ... are counted as vowels.
    cleaned = re.sub(r'[^a-zA-Z0-9]', '', word).lower()
    vowels = float(len(re.findall(r'[aeiou]', cleaned)))
    consonants = float(len(cleaned)) - vowels
    # Compare by value: this guard also keeps input such as '---' (nothing
    # left after filtering) away from the 0.0 / 0.0 division below.
    if vowels == 0.0 or consonants == 0.0:
        return 0
    ratio = min(vowels, consonants) / max(vowels, consonants)
    if ratio == 1.0:
        return 5
    # From here on 0.0 < ratio < 1.0. Bands are tested from the outside in,
    # so each check only needs its outer limits.
    # NOTE(review): the documented scale is not monotonic in the ratio
    # (0.9 scores 1 while 1.0 scores 5); it is kept exactly as documented.
    if ratio <= 0.1 or ratio >= 0.9:
        return 1
    if ratio <= 0.2 or ratio >= 0.8:
        return 2
    if ratio <= 0.3 or ratio >= 0.7:
        return 3
    if ratio <= 0.4 or ratio >= 0.6:
        return 4
    return 5
132
|
|
|
|
133
|
|
|
|
134
|
|
|
def score_simplicity(word):
    """Determine how simple the name is.

    Simplicity is judged by the number of separate words in the name:
    the fewer the words, the higher (better) the score. A word is a run
    of ASCII letters and/or digits; anything else is a separator, and
    leading, trailing or repeated separators do not create extra words.

        1 word   -> 5
        2 words  -> 4
        3 words  -> 3
        4 words  -> 2
        5+ words -> 1

    Args:
        word (string) - the name
    Returns:
        score (int) - the final simplicity score; 0 when the name is empty
            or contains no letters or digits at all.

    >>> score_simplicity('the cat in the hat')
    1
    >>> score_simplicity('facebook')
    5
    """
    if not word:
        return 0
    # Count runs of word characters instead of splitting on separators, so
    # capitals and digits stay inside their word and no empty pieces appear.
    word_count = len(re.findall(r'[a-zA-Z0-9]+', word))
    if word_count == 0:
        return 0
    # Each extra word costs one point; five or more words bottom out at 1.
    return max(1, 6 - word_count)
163
|
|
|
|
164
|
|
|
|
165
|
|
|
def score_name_overall(word):
    """Combine the individual name scores into one overall grade.

    The length, pronounceability and simplicity scores are added
    together, multiplied by 10 and rounded; anything above 100 is
    reported as 100. The closer to 100, the better.

    Args:
        word (string) - the name
    Returns:
        score (number) - the final name score, never greater than 100
    """
    parts = (
        score_length(word),
        score_pronounceability(word),
        score_simplicity(word),
    )
    total = round(sum(parts) * 10)
    # cut off at 100%
    return 100 if total > 100 else total
185
|
|
|
|
186
|
|
|
|
187
|
|
|
def score_names_overall(words):
    """Compute the overall score of every name.

    Args:
        words (list) - the list of words.
    Returns:
        words (list) - a list of (score, word) tuples, in input order.
    """
    return [(score_name_overall(name), name) for name in words]
199
|
|
|
|
200
|
|
|
|
201
|
|
|
def generate_all_scoring(words):
    """Run every scoring method over a set of words.

    Args:
        words (list) - the list of words.
    Returns:
        words (dict) - the scores, keyed by scoring name: 'dmetaphone',
            'soundex', 'nysiis' and 'grade' (the overall name scores).
    """
    # Scorers run in this order, each over the same `words` argument.
    scorers = (
        ('dmetaphone', score_dmetaphone),
        ('soundex', score_soundex),
        ('nysiis', score_nysiis),
        ('grade', score_names_overall),
    )
    return {label: scorer(words) for label, scorer in scorers}
215
|
|
|
|