Completed
Push — master ( 4d1361...6dc915 )
by Chris
03:58
created

namebot.get_keyword_relevancy_map()   A

Complexity

Conditions 2

Size

Total Lines 19

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 2
dl 0
loc 19
rs 9.4285
1
"""Metrics for measuring various aspects of words.
2
3
Conventions used in this utility:
4
1.  All functions return a dictionary,
5
    with key 'data' and/or 'summary':
6
    return {
7
        'data': data,
8
        'summary': summary or None
9
    }
10
"""
11
12
from __future__ import division
13
import re
14
from pattern.en import parse
15
from nltk import pos_tag
16
17
18
def prep_file(file_name):
    """Read a text file and return its non-empty lines as a list of items.

    Line terminators are stripped so that they do not leak into the
    length and special-character metrics computed from these items, and
    lines that are empty once stripped are skipped.

    Args:
        file_name (str): The file name to open

    Returns:
        items (list): A list of items extracted from the file
    """
    with open(file_name) as handle:
        # Only the line terminator is removed; other whitespace is data.
        stripped = (line.rstrip('\r\n') for line in handle)
        return [item for item in stripped if item]
32
33
34
def get_named_numbers_1_10(words):
    """Return the words that contain a spelled-out number (one to ten).

    A number only counts when it appears as a whole word, in any letter
    case, anywhere in the entry (e.g. 'Three Sixty' matches on 'Three',
    but 'tenant' does not match on 'ten').

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of
            matching words, in their original order.
    """
    # \b on both sides applies to every alternative via the group.
    numbers = re.compile(
        r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b',
        re.IGNORECASE)
    matches = [word for word in words if numbers.search(word)]
    return {
        'data': matches,
        'summary': 'Of {} words, {} matched'.format(len(words), len(matches))
    }
54
55
56
def name_length(words):
    """Measure the length of each word and the rounded average length.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of word
            lengths; the average reported in 'summary' is 0 when no
            words are given.
    """
    names_length = [len(word) for word in words]
    total = len(names_length)
    # Guard the division so an empty list yields a summary, not an error.
    average = round(sum(names_length) / total) if total else 0
    summary = 'Of {} words, the average length of names is...{}'.format(
        len(words), average)
    return {
        'data': names_length,
        'summary': summary
    }
75
76
77
def name_vowel_count(words):
    """Total the occurrences of each lowercase vowel across all words.

    Counting is case-sensitive: only 'a', 'e', 'i', 'o' and 'u' are
    counted. Items that cannot be counted (e.g. None) are skipped
    without affecting the remaining words.

    Args:
        words (list): A list of words. None is treated as an empty list.

    Returns:
        dict: The data and summary results. 'data' maps each vowel to
            its total count; 'summary' is always None.
    """
    vowels = 'aeiou'
    num_count = dict.fromkeys(vowels, 0)
    for word in words or ():
        try:
            # Count first, then commit, so a bad item never adds a
            # partial tally.
            tallies = [word.count(vowel) for vowel in vowels]
        except (AttributeError, TypeError):
            continue
        for vowel, tally in zip(vowels, tallies):
            num_count[vowel] += tally
    return {
        'data': num_count,
        'summary': None
    }
101
102
103
def name_starts_with_vowel(words):
    """Count how many words start with a lowercase vowel.

    Matching is case-sensitive: only a leading 'a', 'e', 'i', 'o' or
    'u' counts.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is always None; the
            percentage in 'summary' is 0 when no words are given.
    """
    vowels = re.compile(r'\A[aeiou]')
    total = len(words)
    vowelcount = sum(1 for name in words if vowels.match(name))
    # Guard the division so an empty list yields a summary, not an error.
    percent = round(float(vowelcount) / total * 100) if total else 0
    summary = 'Of {} words, {} or {}% are vowels as the first letter.'.format(
        total, vowelcount, percent)
    return {
        'data': None,
        'summary': summary
    }
124
125
126
def get_digits_frequency(words):
    """Look for and count the digits in names, e.g. 7-11, 3M, etc...

    Each run of consecutive digits counts as one number, so '7-11'
    contributes '7' and '11'.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of all
            digit runs found, in order of appearance.
    """
    digits = re.compile(r'[0-9]+')
    new_words = []
    count = 0
    for name in words:
        # Scan each name once and reuse the result.
        found = digits.findall(name)
        if found:
            count += 1
            new_words.extend(found)
    return {
        'data': new_words,
        'summary': ('Of {} words, {} have numbers in them, '
                    'with a total of {} numbers found.').format(
                        len(words), count, len(new_words))
    }
149
150
151
def get_first_letter_frequency(words):
    """Add the frequency of first letters e.g. [C]at, [C]law, C = 2.

    Keys are case-sensitive ('C' and 'c' are counted separately).
    Empty strings have no first letter and are skipped.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' maps each first
            character to the number of words starting with it;
            'summary' is always None.
    """
    letters = {}
    for name in words:
        if not name:
            # Nothing to index; avoids an IndexError on ''.
            continue
        first = name[0]
        letters[first] = letters.get(first, 0) + 1
    return {
        'data': letters,
        'summary': None
    }
171
172
173
def get_special_chars(words):
    """Find occurrences of special characters (non-alphabetical characters).

    Anything outside a-z / A-Z counts, which includes digits and
    spaces as well as punctuation.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of every
            special character found, in order of appearance.
    """
    chars = re.compile(r'[^a-z]', re.IGNORECASE)
    data = []
    for word in words:
        # findall returns [] when nothing matches, so extend is a no-op.
        data.extend(chars.findall(word))
    return {
        'data': data,
        'summary': ('{} occurrences of special characters were'
                    ' found in {} words.').format(len(data), len(words))
    }
192
193
194
def get_word_types(words):
    """Determine the occurrences of pos types.

    Each word is passed to ``parse`` (imported from ``pattern.en``)
    with tagging enabled and tokenizing, chunking, relations and
    lemmata disabled; the parse results are collected in order.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of
            values returned by ``parse``; 'summary' is always None.
    """
    tagged = []
    for word in words:
        try:
            result = parse(
                word,
                encoding='utf-8',
                tokenize=False,
                light=False,
                tags=True,
                chunks=False,
                relations=False,
                lemmata=False)
        except IndexError:
            # Words that make parse raise IndexError are left out.
            # NOTE(review): presumably raised for inputs parse cannot
            # handle (e.g. empty strings) -- confirm.
            continue
        tagged.append(result)
    return {
        'data': tagged,
        'summary': None
    }
222
223
224
def get_name_spaces(words):
    """Check number of spaces for a given set of words.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is a list with one
            ``{'word': ..., 'spaces': ...}`` dict per word, where
            'spaces' is the number of space characters in the word;
            'summary' is always None.
    """
    # Count the spaces themselves; counting the pieces produced by
    # splitting on ' ' would report one more than the number of spaces.
    results = [{'word': word, 'spaces': word.count(' ')}
               for word in words]
    return {
        'data': results,
        'summary': None
    }
239
240
241
def get_consonant_repeat_frequency(words):
    """Check for repeating consonant frequency for a given set of words.

    As written, the pattern is a single negated character class tested
    at the start of each word, so this counts the words whose first
    character is none of: a, e, i, o, u, '|', '{', '6', '}'.

    NOTE(review): the '|' separators and the '{6}' sit inside the
    brackets and are therefore literal characters, not alternation or a
    repeat count. The name suggests a run of consonants was intended --
    confirm the intended rule before changing the pattern.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is always None.
    """
    cons = re.compile(r'[^a|e|i|o|u{6}]')
    count = sum(1 for word in words if cons.match(word))
    return {
        'data': count,
        'summary': None
    }
259
260
261
def get_consonant_duplicate_repeat_frequency(words):
    """Check for duplicate repeating consonant frequency.

    As written, the pattern is tested at the start of each word and
    needs only one matching character, so this counts the words whose
    first character is none of: a, e, i, o, u, '|'.

    NOTE(review): the '|' separators sit inside the brackets and are
    therefore literal characters, and '{1,}' accepts a single
    character. The name suggests doubled consonants were intended --
    confirm the intended rule before changing the pattern.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is always None.
    """
    cons_dup = re.compile(r'[^a|e|i|o|u]{1,}')
    count = sum(1 for name in words if cons_dup.match(name))
    return {
        'data': count,
        'summary': None
    }
279
280
281
def get_vowel_repeat_frequency(words):
    """Check for repeating vowel frequency for a given set of words.

    As written, the pattern is a single character class tested at the
    start of each word, so this counts the words whose first character
    is one of: a, e, i, o, u, '{', '3', '}'.

    NOTE(review): the '{3}' sits inside the brackets and is therefore
    three literal characters, not a repeat count. The name suggests a
    run of vowels was intended -- confirm the intended rule before
    changing the pattern.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is always None.
    """
    cons_vowel = re.compile(r'[aeiou{3}]')
    count = sum(1 for word in words if cons_vowel.match(word))
    return {
        'data': count,
        'summary': None
    }
299
300
301
def get_adjective_verb_or_noun(words):
    """Get the number of words that are classified as verbs or nouns.

    The words are tagged with ``pos_tag`` (imported from ``nltk``).
    Tags 'NN' and 'NNP' count as nouns; tags 'VBP', 'VB', 'RB' and
    'VBG' count as verbs. Every other tag falls into the remainder
    reported in the summary.

    NOTE(review): despite the function name, no adjective tags are
    counted, and 'RB' is grouped with the verbs -- confirm that this
    is intended.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is a dict with the
            keys 'verbs' and 'nouns'.
    """
    verby = ['VBP', 'VB', 'RB', 'VBG']
    nouns = ['NN', 'NNP']
    data = {'verbs': 0, 'nouns': 0}
    for _, tag in pos_tag(words):
        if tag in nouns:
            data['nouns'] += 1
        elif tag in verby:
            data['verbs'] += 1
    total = len(words)
    remainder = total - (data['verbs'] + data['nouns'])
    return {
        'data': data,
        'summary': ('Of {0} words, {1} were nouns, {2} were verbs, '
                    'and {3} were everything else.').format(
                        total, data['nouns'], data['verbs'], remainder)
    }
326
327
328
def categorize_word_type(words):
    """Get the common naming strategy 'category' of a name, based on precedence.

    Categories are derived from
    http://www.thenameinspector.com/10-name-types/,
    so it is important to note there is no agreed upon standard,
    meaning it is ultimately a little arbitrary.

    Since it is a bit challenging to actually determine its type,
    we give a weighting for each word based on a few known metrics.
    This can be updated in the future so that weightings are binary
    (e.g. 0.0 and 100.0), giving traditional False/True.

    Only three weightings are currently assigned: 'real' (50) for
    entries without a space, 'phrase' (50) for entries with one, and
    'fake_obscure' (75, with 'real' reset to 0) when ``pos_tag``
    (imported from ``nltk``) tags the entry as '-NONE-'. All other
    categories stay at 0.

    Categories ====

    1. Real Words
     1a. Misspelled words
     1b. Foreign words
    2. Compounds
    3. Phrases
    4. Blends
    5. Tweaked
    6. Affixed
    7. Fake/obscure
    8. Puns
    9. People's names
    10. Initials and Acronyms

    Args:
        words (list): A list of words

    Returns:
        new_words (list) - A list of lists, with each word
                           and its distribution by word "type"
    """
    def _get_distribution(word):
        """Return the distribution for all categories, given a single word."""
        # TODO:
        # misspelled, foreign, tweaked, affixed, fake_obscure,
        # initials_acronym, blend, puns, person, compound
        categories = {
            'real': 0,
            'misspelled': 0,
            'foreign': 0,
            'compound': 0,
            'phrase': 0,
            'blend': 0,
            'tweaked': 0,
            'affixed': 0,
            'fake_obscure': 0,
            'puns': 0,
            'person': 0,
            'initials_acronym': 0,
        }
        # Real words are single; anything containing a space is a phrase.
        if ' ' in word:
            categories['phrase'] = 50
        else:
            categories['real'] = 50
        # If word cannot be tagged,
        # it's very likely fake_obscure
        tag = pos_tag([word])[0][1]
        if tag == '-NONE-':
            categories['real'] = 0
            categories['fake_obscure'] = 75
        return categories

    return [[word, _get_distribution(word)] for word in words]
400
401
402
def generate_all_metrics(filename=None, words=None):
    """Generate all metrics in this module in one place.

    When both arguments are given, the file takes precedence and
    ``words`` is ignored.

    Args:
        filename (str, optional): A filename to load words from.
        words (list, optional): Words to use, if file is not specified.

    Returns:
        dict: The analysed names under 'names' and all metrics results,
            keyed by name, under 'metrics'. None is returned when
            neither argument is given or when there are no names to
            analyse (e.g. the file is empty).
    """
    if not filename and not words:
        return None
    allnames = prep_file(filename) if filename else words
    if not allnames:
        # An empty file yields no names; the averaging metrics cannot
        # be computed for an empty list, so report "nothing to analyse".
        return None
    return {
        'names': allnames,
        'metrics': {
            'digits_freq': get_digits_frequency(allnames),
            # 'length' and 'name_length' expose the same metric.
            'length': name_length(allnames),
            'vowel_beginning': name_starts_with_vowel(allnames),
            'vowel_count': name_vowel_count(allnames),
            'name_length': name_length(allnames),
            'name_spaces': get_name_spaces(allnames),
            'consonant_repeat_freq': get_consonant_repeat_frequency(allnames),
            'consonant_dup_repeat_freq':
                get_consonant_duplicate_repeat_frequency(allnames),
            'vowel_repeat_freq': get_vowel_repeat_frequency(allnames),
            'special_characters': get_special_chars(allnames),
            'name_numbers': get_named_numbers_1_10(allnames),
            'adj_verb_noun': get_adjective_verb_or_noun(allnames),
            'first_letter_freq': get_first_letter_frequency(allnames),
            'word_types': get_word_types(allnames)
        }
    }
437