Completed
Push — master ( 1cff91...86fea4 )
by Chris
04:08
created

namebot.get_adjective_verb_or_noun()   B

Complexity

Conditions 4

Size

Total Lines 24

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 4
dl 0
loc 24
rs 8.6845
1
"""Metrics for measuring various aspects of words.
2
3
Conventions used in this utility:
4
1.  All functions return a dictionary,
5
    with key 'data' and/or 'summary':
6
    return {
7
        'data': data,
8
        'summary': summary or None
9
    }
10
"""
11
12
from __future__ import division
13
import re
14
from pattern.en import parse
15
from pattern.web import sort
16
from nltk import pos_tag
17
18
19
def prep_file(file_name):
    """Read a file and return its lines as a list of items.

    Each line of the file becomes one item. The trailing line terminator is
    removed so that the metrics computed from these items (length, special
    characters, ...) measure the name itself and not the newline that
    happened to follow it in the file.

    Args:
        file_name (str): The file name to open

    Returns:
        items (list): A list of items extracted from the file
    """
    with open(file_name) as handle:
        # Only the line ending is stripped; inner/leading whitespace is kept.
        return [line.rstrip('\r\n') for line in handle]
33
34
35
def get_named_numbers_1_10(words):
    """Return the words that contain a spelled-out number (one .. ten).

    A word matches when any of the number names appears in it as a whole
    word, in any letter case (e.g. 'Three Sixty', 'network two').
    Number names that are merely part of a longer word ('often',
    'someone') do not count.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results.
    """
    # The group keeps the word boundaries applied to every alternative,
    # not only to the first one.
    numbers = re.compile(
        r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b',
        re.IGNORECASE)
    matches = [word for word in words if numbers.search(word)]
    return {
        'data': matches,
        'summary': 'Of {} words, {} matched'.format(len(words), len(matches))
    }
55
56
57
def name_length(words):
    """Check the length of each word and compute the average length.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (list of lengths, in input order) and summary
            (a sentence with the word count and rounded average length).
            An empty input yields an average of 0 instead of failing.
    """
    names_length = [len(word) for word in words]
    # Guard the division: an empty list has no average to compute.
    if names_length:
        average = sum(names_length) / len(names_length)
    else:
        average = 0.0
    summary = 'Of {} words, the average length of names is...{}'.format(
        len(words),
        round(average))
    return {
        'data': names_length,
        'summary': summary
    }
76
77
78
def name_vowel_count(words):
    """Count how often each vowel occurs across all words.

    Counting is case-insensitive. Items that are not strings are skipped
    individually, so one bad entry no longer stops the remaining words
    from being counted. ``None`` is treated as an empty list.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (totals keyed by 'a', 'e', 'i', 'o', 'u') and
            summary (always None).
    """
    num_count = {'a': 0, 'e': 0, 'i': 0, 'o': 0, 'u': 0}
    for word in words or []:
        try:
            lowered = word.lower()
            found = [(vowel, lowered.count(vowel)) for vowel in 'aeiou']
        except (AttributeError, TypeError):
            # Best effort: ignore entries that do not behave like text.
            continue
        for vowel, occurrences in found:
            num_count[vowel] += occurrences
    return {
        'data': num_count,
        'summary': None
    }
102
103
104
def name_starts_with_vowel(words):
    """Check how many words in a list start with a vowel.

    The first letter is compared case-insensitively, so 'Echo' counts
    just like 'echo'. An empty list reports 0% instead of failing on the
    percentage calculation.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (always None) and summary (a sentence with the
            total, the match count and the rounded percentage).
    """
    vowels = re.compile(r'\A[aeiou]', re.IGNORECASE)
    total = len(words)
    vowelcount = sum(1 for name in words if vowels.match(name))
    # Guard the division: no words means no percentage to compute.
    percent = float(vowelcount) / total * 100 if total else 0.0
    summary = 'Of {} words, {} or {}% are vowels as the first letter.'.format(
        total, vowelcount, round(percent))
    return {
        'data': None,
        'summary': summary
    }
125
126
127
def get_digits_frequency(words):
    """Look for and count the digits in names, e.g. 7-11, 3M, etc...

    Every maximal run of digits is one match, so '7-11' contributes
    '7' and '11'.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (all digit runs found, in order) and summary
            (how many words contained digits and how many runs were found).
    """
    digits = re.compile(r'[0-9]+')
    new_words = []
    count = 0
    for name in words:
        # Scan each name once and reuse the result for both tallies.
        matches = digits.findall(name)
        if matches:
            count += 1
            new_words.extend(matches)
    return {
        'data': new_words,
        'summary': ('Of {} words, {} have numbers in them, '
                    'with a total of {} numbers found.').format(
                        len(words), count, len(new_words))
    }
150
151
152
def get_first_letter_frequency(words):
    """Add the frequency of first letters e.g. [C]at, [C]law, C = 2.

    The first character is used exactly as it appears (case-sensitive).
    Empty strings have no first character and are skipped.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (count per first character) and summary
            (always None).
    """
    letters = {}
    for name in words:
        if not name:
            # Nothing to index; avoids an IndexError on ''.
            continue
        first = name[0]
        letters[first] = letters.get(first, 0) + 1
    return {
        'data': letters,
        'summary': None
    }
174
175
176
def get_special_chars(words):
    """Find occurrences of special characters (non-alphabetical characters).

    Anything outside a-z / A-Z counts, which includes digits, spaces and
    punctuation.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (every special character found, in order) and
            summary (how many were found in how many words).
    """
    chars = re.compile(r'[^a-z]', re.IGNORECASE)
    data = []
    for word in words:
        # One scan per word; extending with an empty result is a no-op.
        data.extend(chars.findall(word))
    return {
        'data': data,
        'summary': ('{} occurrences of special characters were'
                    ' found in {} words.').format(len(data), len(words))
    }
195
196
197
def get_word_types(words):
    """Run every word through the ``parse`` tagger and collect the output.

    ``parse`` is called with tagging enabled and tokenizing, chunking,
    relation and lemma extraction disabled. A word for which ``parse``
    raises IndexError is left out of the result.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (one parse result per successfully parsed word)
            and summary (always None).
    """
    parse_options = {
        'encoding': 'utf-8',
        'tokenize': False,
        'light': False,
        'tags': True,
        'chunks': False,
        'relations': False,
        'lemmata': False,
    }
    tagged = []
    for entry in words:
        try:
            tagged.append(parse(entry, **parse_options))
        except IndexError:
            # Skip entries the parser cannot handle.
            continue
    return {
        'data': tagged,
        'summary': None
    }
225
226
227
def get_name_spaces(words):
    """Check number of spaces for a given set of words.

    The 'spaces' value is the number of space characters in the word,
    so a single word without spaces reports 0.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (one ``{'word': ..., 'spaces': ...}`` entry per
            word, in input order) and summary (always None).
    """
    # Count the separators themselves; splitting and measuring the parts
    # would report one more than the number of spaces.
    results = [{'word': word, 'spaces': word.count(' ')}
               for word in words]
    return {
        'data': results,
        'summary': None
    }
242
243
244
def get_consonant_repeat_frequency(words):
    """Check for repeating consonant frequency for a given set of words.

    As written, the pattern is a single negated character class applied
    with ``match``: a word is counted when its first character is none of
    ``a e i o u | { 6 }``.

    NOTE(review): the ``|`` and ``{6}`` inside the brackets are literal
    characters, not alternation or repetition, so this does not look for
    repeated consonants - confirm the intended rule before relying on it.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (number of counted words) and summary (always None).
    """
    pattern = re.compile(r'[^a|e|i|o|u{6}]')
    hits = sum(1 for candidate in words if pattern.match(candidate))
    return {
        'data': hits,
        'summary': None
    }
262
263
264
def get_consonant_duplicate_repeat_frequency(words):
    """Check for duplicate repeating consonant frequency.

    As written, a word is counted when it begins with at least one
    character that is none of ``a e i o u |`` (the pattern is applied
    with ``match``, so only the start of the word is examined).

    NOTE(review): ``|`` inside the brackets is a literal character and
    ``{1,}`` only requires one such character, so duplicated consonants
    are not actually detected - confirm the intended rule.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (number of counted words) and summary (always None).
    """
    pattern = re.compile(r'[^a|e|i|o|u]{1,}')
    hits = sum(1 for candidate in words if pattern.match(candidate))
    return {
        'data': hits,
        'summary': None
    }
282
283
284
def get_vowel_repeat_frequency(words):
    """Check for repeating vowel frequency for a given set of words.

    As written, the pattern is a single character class applied with
    ``match``: a word is counted when its first character is one of
    ``a e i o u { 3 }``.

    NOTE(review): ``{3}`` inside the brackets is three literal
    characters, not a repetition count, so repeated vowels are not
    actually detected - confirm the intended rule.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (number of counted words) and summary (always None).
    """
    pattern = re.compile(r'[aeiou{3}]')
    hits = sum(1 for candidate in words if pattern.match(candidate))
    return {
        'data': hits,
        'summary': None
    }
302
303
304
def get_adjective_verb_or_noun(words):
    """Tally how many words are tagged as nouns or as verbs.

    The words are tagged with ``pos_tag``. Tags 'NN' and 'NNP' count as
    nouns; tags 'VBP', 'VB', 'RB' and 'VBG' count as verbs. Every other
    tag falls into the "everything else" remainder of the summary.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (``{'verbs': n, 'nouns': n}``) and summary
            (a sentence with the total, noun, verb and remainder counts).
    """
    total = len(words)
    noun_tags = ['NN', 'NNP']
    verb_tags = ['VBP', 'VB', 'RB', 'VBG']
    tags = [tag for _, tag in pos_tag(words)]
    noun_total = sum(1 for tag in tags if tag in noun_tags)
    verb_total = sum(1 for tag in tags if tag in verb_tags)
    data = {'verbs': verb_total, 'nouns': noun_total}
    remainder = total - (verb_total + noun_total)
    summary = ('Of {0} words, {1} were nouns, {2} were verbs, '
               'and {3} were everything else.').format(
                   total, noun_total, verb_total, remainder)
    return {
        'data': data,
        'summary': summary
    }
329
330
331
def get_keyword_relevancy_map(words, n_list, terms, sortcontext,
                              enginetype='BING',
                              license=None):
    """Rank terms by their relevancy to a context via pattern.web ``sort``.

    See http://www.clips.ua.ac.be/pages/pattern-web#sort.

    Args:
        words (list): Unused; kept for interface compatibility.
        n_list (list): Unused; kept for interface compatibility.
        terms (list): The terms to rank.
        sortcontext (str): The term used for sorting.
        enginetype (str, optional): Search service (GOOGLE, YAHOO, BING...).
        license (str, optional): API license key for the search service.

    Returns:
        dict: The data (a list of ``(percentage, term)`` tuples, where
            percentage is a string such as '84.00%') and summary
            (always None).
    """
    results = sort(
        terms=terms,           # The caller's terms, not an empty list.
        context=sortcontext,   # Term used for sorting.
        service=enginetype,    # GOOGLE, YAHOO, BING, ...
        license=license,       # You should supply your own API license key
        strict=True,           # Wraps query in quotes: 'mac sweet'.
        reverse=True,          # Reverse: 'sweet mac' <=> 'mac sweet'.
        cached=True)
    # Collect into the list that is returned; each entry is one tuple.
    results_list = [
        ("%5.2f" % (weight * 100) + "%", term) for weight, term in results]
    return {
        'data': results_list,
        'summary': None
    }
351
352
353
def check_trademark_registration(words):
    """Search the USTM office and return the number of results.

    TODO: the lookup itself is not implemented yet. ``words`` is ignored
    and both result fields are None.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results (both None for now).
    """
    placeholder = dict.fromkeys(('data', 'summary'))
    return placeholder
360
361
362
def check_domain_searches(words):
    """Check domain search results for each name.

    TODO: not implemented yet.

    Args:
        words (list): A list of words

    Raises:
        NotImplementedError: Always, until the lookup is written.
    """
    # NotImplemented is a comparison sentinel, not an exception class;
    # NotImplementedError is the exception meant for unfinished code.
    raise NotImplementedError
366
367
368
def get_search_result_count(words):
    """Check google results and return the number of results.

    http://www.clips.ua.ac.be/pages/pattern-web#DOM

    TODO: not implemented yet.

    Args:
        words (list): A list of words

    Raises:
        NotImplementedError: Always, until the lookup is written.
    """
    # NotImplemented is a comparison sentinel, not an exception class;
    # NotImplementedError is the exception meant for unfinished code.
    raise NotImplementedError
376
377
378
def categorize_word_type(words):
    """Get the common naming strategy 'category' of a name, based on precedence.

    Categories are derived from
    http://www.thenameinspector.com/10-name-types/,
    so it is important to note there is no agreed upon standard,
    meaning it is ultimately a little arbitrary.

    Since it is a bit challenging to actually determine a name's type,
    each word receives a weighting per category based on a few known
    metrics. This can be updated in the future so that weightings are
    binary (e.g. 0.0 and 100.0), giving traditional False/True.

    Categories ====

    1. Real Words
     1a. Misspelled words
     1b. Foreign words
    2. Compounds
    3. Phrases
    4. Blends
    5. Tweaked
    6. Affixed
    7. Fake/obscure
    8. Puns
    9. People's names
    10. Initials and Acronyms

    Args:
        words (list): A list of words

    Returns:
        new_words (list) - A list of lists, with each word
                           and its distribution by word "type"
    """
    category_names = (
        'real',
        'misspelled',
        'foreign',
        'compound',
        'phrase',
        'blend',
        'tweaked',
        'affixed',
        'fake_obscure',
        'puns',
        'person',
        'initials_acronym',
    )

    def _score(word):
        """Return the weighting of every category for a single word."""
        # TODO:
        # misspelled, foreign, tweaked, affixed, fake_obscure,
        # initials_acronym, blend, puns, person, compound
        scores = dict.fromkeys(category_names, 0)
        # A single token is treated as a real word, anything with a
        # space in it as a phrase.
        is_single = len(word.split(' ')) == 1
        scores['real' if is_single else 'phrase'] = 50
        # If the word cannot be tagged, it's very likely fake_obscure.
        if pos_tag([word])[0][1] == '-NONE-':
            scores['real'] = 0
            scores['fake_obscure'] = 75
        return scores

    return [[word, _score(word)] for word in words]
450
451
452
def get_word_ranking(words):
    """Use google results and get a quality of ranking.

    This is based on other metrics such as domain name availability,
    google results and others. For every name, the search result count
    is divided by the domain search result and the quotient is collected.

    NOTE(review): get_search_result_count and check_domain_searches are
    still unimplemented stubs, so this cannot produce a result yet. The
    division assumes both will return plain numbers, and each name is
    passed as a one-item list because both take ``words`` - confirm once
    they are written.

    Args:
        words (list): A list of words

    Returns:
        dict: The data (one ranking per name, in input order) and
            summary (always None).
    """
    rankings = []
    for name in words:
        # Query per name, and keep the accumulator separate from the
        # per-name values so it is never overwritten.
        hits = get_search_result_count([name])
        domains = check_domain_searches([name])
        rankings.append(hits / domains)
    return {
        'data': rankings,
        'summary': None
    }
467
468
469
def generate_all_metrics(filename=None, words=None):
    """Run every metric in this module over one set of names.

    The names come from ``filename`` when it is given, otherwise from
    ``words``. When neither is provided there is nothing to measure and
    None is returned.

    Args:
        filename (str, optional): A filename to load words from.
        words (list, optional): Words to use, if file is not specified.

    Returns:
        dict: ``{'names': <the names used>, 'metrics': {<key>: <result>}}``
            where each result is the dict returned by the metric function,
            or None when no input was supplied.
    """
    if filename:
        allnames = prep_file(filename)
    elif words:
        allnames = words
    else:
        return None

    # (result key, metric function) pairs, evaluated in this order.
    collectors = [
        ('digits_freq', get_digits_frequency),
        ('length', name_length),
        ('vowel_beginning', name_starts_with_vowel),
        ('vowel_count', name_vowel_count),
        ('name_length', name_length),
        ('name_spaces', get_name_spaces),
        ('consonant_repeat_freq', get_consonant_repeat_frequency),
        ('consonant_dup_repeat_freq',
         get_consonant_duplicate_repeat_frequency),
        ('vowel_repeat_freq', get_vowel_repeat_frequency),
        ('special_characters', get_special_chars),
        ('name_numbers', get_named_numbers_1_10),
        ('adj_verb_noun', get_adjective_verb_or_noun),
        ('first_letter_freq', get_first_letter_frequency),
        ('word_types', get_word_types),
    ]
    metrics = {}
    for key, collect in collectors:
        metrics[key] = collect(allnames)
    return {
        'names': allnames,
        'metrics': metrics
    }
504