namebot.get_keyword_relevancy_map() - Code Metrics - Inspection of "Update version" - christabor/namebot - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 4d1361...6dc915 )

by Chris

created 2016-03-09 06:46 UTC

namebot.get_keyword_relevancy_map() A

↳ Parent: namebot.categorize_word_type()

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	2
dl	0
loc	19
rs	9.4285

"""Metrics for measuring various aspects of words.

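Conventions used in this utility:
1.  Each individual metric function returns a dictionary
    with the keys 'data' and 'summary' (either value may be None):
    return {
        'data': data,
        'summary': summary or None
    }
2.  prep_file, categorize_word_type and generate_all_metrics are
    the exceptions; see their docstrings for what they return.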
"""

from __future__ import division
import re
from pattern.en import parse
from nltk import pos_tag


def prep_file(file_name):
    """Read a file line-by-line and return its non-empty lines as a list.

    The trailing line terminator is removed from every line so that it is
    not treated as part of the name by the metrics in this module (it would
    otherwise add one to each name's length and be reported as a special
    character). Lines that are empty once the terminator is removed are
    skipped, since an empty name has nothing to measure.

    Args:
        file_name (str): The file name to open

    Returns:
        items (list): A list of items extracted from the file
    """
    items = []
    with open(file_name) as handle:
        for line in handle:
            # Only strip the line ending; inner/leading spaces are kept.
            item = line.rstrip('\r\n')
            if item:
                items.append(item)
    return items


def get_named_numbers_1_10(words):
    """Return the words that contain a spelled-out number from one to ten.

    A number is matched case-insensitively as a whole word anywhere in the
    item (e.g. "one stop", "Studio Two", "three"), so a substring such as
    the "ten" in "often" is not reported.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of matching
            words, in their original order.
    """
    # The alternatives are grouped so the word boundaries apply to every
    # number, rather than anchoring only the first alternative.
    numbers = re.compile(
        r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b',
        re.IGNORECASE)
    matches = [word for word in words if numbers.search(word)]
    return {
        'data': matches,
        'summary': 'Of {} words, {} matched'.format(len(words), len(matches))
    }


def name_length(words):
    """Measure the length of each word and report the rounded average.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of lengths,
            one per word; the summary reports the rounded average, which
            is 0 when no words are given.
    """
    names_length = [len(word) for word in words]
    total = len(names_length)
    # Guard the division so an empty list yields an average of 0 instead
    # of raising ZeroDivisionError.
    average = round(sum(names_length) / float(total)) if total else 0
    summary = 'Of {} words, the average length of names is...{}'.format(
        total, average)
    return {
        'data': names_length,
        'summary': summary
    }


def name_vowel_count(words):
    """Total the occurrences of each vowel across all words.

    Counting is case-insensitive ('A' and 'a' both count towards 'a').
    Items that are not string-like (they have no ``lower`` method) are
    skipped individually, so one bad item does not discard the counts of
    the words that follow it.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' maps each of the vowels
            'a', 'e', 'i', 'o', 'u' to its total count; 'summary' is None.

    Raises:
        TypeError: If ``words`` is not iterable.
    """
    num_count = {'a': 0, 'e': 0, 'i': 0, 'o': 0, 'u': 0}
    for word in words:
        try:
            lowered = word.lower()
        except AttributeError:
            # Not a string; ignore it but keep counting the rest.
            continue
        for vowel in num_count:
            num_count[vowel] += lowered.count(vowel)
    # Returning here (rather than from a ``finally`` clause) lets
    # unexpected errors propagate instead of being silently swallowed.
    return {
        'data': num_count,
        'summary': None
    }


def name_starts_with_vowel(words):
    """Report how many words start with a vowel, as a count and percentage.

    The first letter is compared case-insensitively, so "Apple" and
    "apple" both count.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is always None; the
            summary holds the count and the rounded percentage, which is
            0 when no words are given.
    """
    vowels = re.compile(r'\A[aeiou]', re.IGNORECASE)
    total = len(words)
    vowelcount = sum(1 for name in words if vowels.match(name))
    # Guard the division so an empty list does not raise ZeroDivisionError.
    percent = round(float(vowelcount) / total * 100) if total else 0
    summary = 'Of {} words, {} or {}% are vowels as the first letter.'.format(
        total, vowelcount, percent)
    return {
        'data': None,
        'summary': summary
    }


def get_digits_frequency(words):
    """Collect the runs of digits found in names, e.g. 7-11, 3M, etc...

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is a flat list of every
            digit run (as strings) in the order found; the summary reports
            how many words contained at least one run.
    """
    digit_run = re.compile(r'[0-9]+')
    found = []
    names_with_digits = 0
    for name in words:
        runs = digit_run.findall(name)
        if not runs:
            continue
        names_with_digits += 1
        found.extend(runs)
    return {
        'data': found,
        'summary': ('Of {} words, {} have numbers in them, '
                    'with a total of {} numbers found.').format(
                        len(words), names_with_digits, len(found))
    }


def get_first_letter_frequency(words):
    """Count how often each first character occurs, e.g. [C]at, [C]law: C = 2.

    The first character is used as-is, so 'C' and 'c' are counted
    separately. Empty words are skipped because they have no first
    character.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' maps each first
            character to the number of words starting with it; 'summary'
            is None.
    """
    letters = {}
    for name in words:
        if not name:
            # Indexing an empty name would raise IndexError.
            continue
        first = name[0]
        letters[first] = letters.get(first, 0) + 1
    return {
        'data': letters,
        'summary': None
    }


def get_special_chars(words):
    """Collect every non-alphabetical character found in the words.

    Anything that is not a letter a-z (matched case-insensitively) is
    reported, which includes spaces and digits.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is a flat list of the
            matched characters in the order found.
    """
    non_alpha = re.compile(r'[^a-z]', re.IGNORECASE)
    data = [char for word in words for char in non_alpha.findall(word)]
    summary = ('{} occurrences of special characters were'
               ' found in {} words.').format(len(data), len(words))
    return {
        'data': data,
        'summary': summary
    }


def get_word_types(words):
    """Run each word through the ``pattern.en`` parser with tagging enabled.

    Words for which the parser raises IndexError are left out of the
    result.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of values
            returned by ``parse`` (presumably tagged strings; verify
            against the pattern documentation); 'summary' is None.
    """
    tagged = []
    for word in words:
        try:
            parsed = parse(
                word,
                encoding='utf-8',
                tokenize=False,
                light=False,
                tags=True,
                chunks=False,
                relations=False,
                lemmata=False)
        except IndexError:
            # Skip words the parser cannot handle.
            continue
        tagged.append(parsed)
    return {
        'data': tagged,
        'summary': None
    }


def get_name_spaces(words):
    """Count the number of space characters in each word.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is a list with one
            ``{'word': ..., 'spaces': ...}`` entry per word, where
            'spaces' is 0 for a word that contains no space; 'summary'
            is None.
    """
    # Count the spaces directly; the number of space-separated parts would
    # be one higher than the number of spaces.
    results = [{'word': word, 'spaces': word.count(' ')}
               for word in words]
    return {
        'data': results,
        'summary': None
    }


def get_consonant_repeat_frequency(words):
    """Count the words whose first character passes the consonant pattern.

    NOTE(review): the name suggests a "repeating consonant" check, but the
    pattern is a single negated character class, so the ``|`` and ``{6}``
    inside it are literal characters rather than alternation and a repeat
    count. As written, a word is counted when its first character is
    anything other than one of ``a e i o u | { 6 }`` (case-sensitive).
    Confirm the intended rule before relying on this metric.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is None.
    """
    cons = re.compile(r'[^a|e|i|o|u{6}]')
    # ``match`` only tests the start of each word.
    matching = [val for val in words if cons.match(val)]
    return {
        'data': len(matching),
        'summary': None
    }


def get_consonant_duplicate_repeat_frequency(words):
    """Count the words that start with a run accepted by the pattern.

    NOTE(review): the name suggests a "duplicate repeating consonant"
    check, but the ``|`` characters sit inside a character class and are
    therefore literal. As written, a word is counted when it starts with
    one or more characters other than ``a e i o u |`` (case-sensitive),
    which is the same as its first character not being one of those.
    Confirm the intended rule before relying on this metric.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is None.
    """
    cons_dup = re.compile(r'[^a|e|i|o|u]{1,}')
    # ``match`` only tests the start of each word.
    matching = [name for name in words if cons_dup.match(name)]
    return {
        'data': len(matching),
        'summary': None
    }


def get_vowel_repeat_frequency(words):
    """Count the words whose first character passes the vowel pattern.

    NOTE(review): the name suggests a "repeating vowel" check, but the
    ``{3}`` sits inside the character class and is therefore three literal
    characters, not a repeat count. As written, a word is counted when
    its first character is one of ``a e i o u { 3 }`` (case-sensitive).
    Confirm the intended rule before relying on this metric.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is None.
    """
    cons_vowel = re.compile(r'[aeiou{3}]')
    # ``match`` only tests the start of each word.
    matching = [val for val in words if cons_vowel.match(val)]
    return {
        'data': len(matching),
        'summary': None
    }


def get_adjective_verb_or_noun(words):
    """Count how many words are tagged as nouns or as verbs by ``pos_tag``.

    A word counts as a noun when its tag is 'NN' or 'NNP', and as a verb
    when its tag is 'VBP', 'VB', 'RB' or 'VBG'. Every other tag falls
    into the "everything else" remainder; adjectives are not counted
    separately despite the function name.

    NOTE(review): 'RB' is presumably the Penn Treebank adverb tag, and
    plural nouns ('NNS') are not in the noun list - confirm both are
    intentional.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' holds the 'verbs' and
            'nouns' counts; the summary also reports the remainder.
    """
    total = len(words)
    noun_tags = ['NN', 'NNP']
    verb_tags = ['VBP', 'VB', 'RB', 'VBG']
    tags = [tag for _, tag in pos_tag(words)]
    # The two tag lists do not overlap, so each can be counted on its own.
    data = {
        'verbs': sum(1 for tag in tags if tag in verb_tags),
        'nouns': sum(1 for tag in tags if tag in noun_tags),
    }
    remainder = total - (data['verbs'] + data['nouns'])
    summary = ('Of {0} words, {1} were nouns, {2} were verbs, '
               'and {3} were everything else.').format(
                   total, data['nouns'], data['verbs'], remainder)
    return {
        'data': data,
        'summary': summary
    }


def categorize_word_type(words):
    """Weight each name against a set of common naming-strategy categories.

    Categories are derived from
    http://www.thenameinspector.com/10-name-types/,
    so it is important to note there is no agreed upon standard,
    meaning it is ultimately a little arbitrary.

    Because the real type of a name is hard to determine, each word gets
    a weighting per category based on a few known metrics. This could be
    changed in the future so that weightings are binary
    (e.g. 0.0 and 100.0), giving traditional False/True.

    Categories ====

    1. Real Words
     1a. Misspelled words
     1b. Foreign words
    2. Compounds
    3. Phrases
    4. Blends
    5. Tweaked
    6. Affixed
    7. Fake/obscure
    8. Puns
    9. People's names
    10. Initials and Acronyms

    Only 'real', 'phrase' and 'fake_obscure' are currently ever given a
    non-zero weighting; the remaining categories are placeholders.

    Args:
        words (list): A list of words

    Returns:
        new_words (list) - A list of lists, with each word
                           and its distribution by word "type"
    """
    category_names = (
        'real',
        'misspelled',
        'foreign',
        'compound',
        'phrase',
        'blend',
        'tweaked',
        'affixed',
        'fake_obscure',
        'puns',
        'person',
        'initials_acronym',
    )

    def _get_distribution(word):
        """Return the distribution for all categories, given a single word."""
        # TODO:
        # misspelled, foreign, tweaked, affixed, fake_obscure,
        # initials_acronym, blend, puns, person, compound
        categories = dict.fromkeys(category_names, 0)
        parts = word.split(' ')
        # A single token is weighted as a real word, anything with a
        # space in it as a phrase.
        primary = 'real' if len(parts) == 1 else 'phrase'
        categories[primary] = 50
        # If word cannot be tagged,
        # it's very likely fake_obscure
        tag = pos_tag([word])[0][1]
        if tag == '-NONE-':
            categories['real'] = 0
            categories['fake_obscure'] = 75
        return categories

    return [[word, _get_distribution(word)] for word in words]


def generate_all_metrics(filename=None, words=None):
    """Run every metric in this module over one list of names.

    The names are loaded from ``filename`` when it is given; otherwise
    ``words`` is used as-is.

    Args:
        filename (str, optional): A filename to load words from.
        words (list, optional): Words to use, if file is not specified.

    Returns:
        dict: The names under 'names' and all metrics results, keyed by
            name, under 'metrics'. None is returned when neither a
            filename nor any words are supplied.
    """
    if not (filename or words):
        return None
    allnames = prep_file(filename) if filename else words
    # (result key, metric function) pairs, run in this order.
    # 'length' and 'name_length' both report name_length().
    metric_functions = (
        ('digits_freq', get_digits_frequency),
        ('length', name_length),
        ('vowel_beginning', name_starts_with_vowel),
        ('vowel_count', name_vowel_count),
        ('name_length', name_length),
        ('name_spaces', get_name_spaces),
        ('consonant_repeat_freq', get_consonant_repeat_frequency),
        ('consonant_dup_repeat_freq',
         get_consonant_duplicate_repeat_frequency),
        ('vowel_repeat_freq', get_vowel_repeat_frequency),
        ('special_characters', get_special_chars),
        ('name_numbers', get_named_numbers_1_10),
        ('adj_verb_noun', get_adjective_verb_or_noun),
        ('first_letter_freq', get_first_letter_frequency),
        ('word_types', get_word_types),
    )
    metrics = {}
    for key, metric in metric_functions:
        metrics[key] = metric(allnames)
    return {
        'names': allnames,
        'metrics': metrics
    }


1			"""Metrics for measuring various aspects of words.
2
3			Conventions used in this utility:
4			1. All functions return a dictionary,
5			with key 'data' and/or 'summary':
6			return {
7			'data': data,
8			'summary': summary or None
9			}
10			"""
11
12			from __future__ import division
13			import re
14			from pattern.en import parse
15			from nltk import pos_tag
16
17
18			def prep_file(file_name):
19			"""Take a file, extracts items line-by-line, and returns a list of them.
20
21			Args:
22			file_name (str): The file name to open
23
24			Returns:
25			items (list): A list of items extracted from the file
26			"""
27			items = []
28			with open(file_name) as files:
29			for newline in files:
30			items.append(newline)
31			return items
32
33
34			def get_named_numbers_1_10(words):
35			"""Return a summary of words spelled out (e.g. one, two).
36
37			Args:
38			words (list): A list of words
39
40			Returns:
41			dict: The data and summary results.
42			"""
43			matches = []
44			numbers = re.compile(
45			r'\Aone \|two \|three \|four \|five \|six \|seven \|eight \|nine \|ten',
46			re.IGNORECASE)
47			for word in words:
48			if re.findall(numbers, word):
49			matches.append(word)
50			return {
51			'data': matches,
52			'summary': 'Of {} words, {} matched'.format(len(words), len(matches))
53			}
54
55
56			def name_length(words):
57			"""Check the length of each word and an average.
58
59			Args:
60			words (list): A list of words
61
62			Returns:
63			dict: The data and summary results.
64			"""
65			names_length = []
66			for val in words:
67			names_length.append(len(val))
68			summary = 'Of {} words, the average length of names is...{}'.format(
69			len(words),
70			round(sum(names_length) / len(names_length)))
71			return {
72			'data': names_length,
73			'summary': summary
74			}
75
76
77			def name_vowel_count(words):
78			"""Check the number of times vowels occurs, and total the results.
79
80			Args:
81			words (list): A list of words
82
83			Returns:
84			dict: The data and summary results.
85			"""
86			num_count = {'a': 0, 'e': 0, 'i': 0, 'o': 0, 'u': 0}
87			try:
88			for word in words:
89			num_count['a'] += word.count('a')
90			num_count['e'] += word.count('e')
91			num_count['i'] += word.count('i')
92			num_count['o'] += word.count('o')
93			num_count['u'] += word.count('u')
94			except AttributeError:
95			pass
96			finally:
97			return {
98			'data': num_count,
99			'summary': None
100			}
101
102
103			def name_starts_with_vowel(words):
104			"""Check the number of times a list of words starts with a vowel.
105
106			Args:
107			words (list): A list of words
108
109			Returns:
110			dict: The data and summary results.
111			"""
112			vowelcount = 0
113			vowels = re.compile(r'\A[aeiou]')
114			for name in words:
115			if re.match(vowels, name):
116			vowelcount += 1
117			summary = 'Of {} words, {} or {}% are vowels as the first letter.'.format(
118			len(words), vowelcount,
119			round(float(vowelcount) / len(words) * 100))
120			return {
121			'data': None,
122			'summary': summary
123			}
124
125
126			def get_digits_frequency(words):
127			"""Look for and count the digits in names, e.g. 7-11, 3M, etc...
128
129			Args:
130			words (list): A list of words
131
132			Returns:
133			dict: The data and summary results.
134			"""
135			new_words = []
136			count = 0
137			digits = re.compile(r'[0-9]+')
138			for name in words:
139			if re.findall(digits, name):
140			count += 1
141			matches = re.findall(digits, name)
142			new_words += matches
143			return {
144			'data': new_words,
145			'summary': ('Of {} words, {} have numbers in them, '
146			'with a total of {} numbers found.').format(
147			len(words), count, len(new_words))
148			}
149
150
151			def get_first_letter_frequency(words):
152			"""Add the frequency of first letters e.g. [C]at, [C]law, c = 2.
153
154			Args:
155			words (list): A list of words
156
157			Returns:
158			dict: The data and summary results.
159			"""
160			letters = {}
161			# populate keys
162			for name in words:
163			letters[name[0]] = 0
164			# add counts
165			for name in words:
166			letters[name[0]] += 1
167			return {
168			'data': letters,
169			'summary': None
170			}
171
172
173			def get_special_chars(words):
174			"""Find occurrences of special characters (non-alphabetical characters).
175
176			Args:
177			words (list): A list of words
178
179			Returns:
180			dict: The data and summary results.
181			"""
182			data = []
183			chars = re.compile(r'[^a-z]', re.IGNORECASE)
184			for word in words:
185			if re.findall(chars, word):
186			data += re.findall(chars, word)
187			return {
188			'data': data,
189			'summary': ('{} occurrences of special characters were'
190			' found in {} words.').format(len(data), len(words))
191			}
192
193
194			def get_word_types(words):
195			"""Determine the occurrences of pos types.
196
197			Args:
198			words (list): A list of words
199
200			Returns:
201			dict: The data and summary results.
202			"""
203			new_arr = []
204			for val in words:
205			try:
206			val = parse(
207			val,
208			encoding='utf-8',
209			tokenize=False,
210			light=False,
211			tags=True,
212			chunks=False,
213			relations=False,
214			lemmata=False)
215			new_arr.append(val)
216			except IndexError:
217			continue
218			return {
219			'data': new_arr,
220			'summary': None
221			}
222
223
224			def get_name_spaces(words):
225			"""Check number of spaces for a given set of words.
226
227			Args:
228			words (list): A list of words
229
230			Returns:
231			dict: The data and summary results.
232			"""
233			results = [{'word': word, 'spaces': len(word.split(r' '))}
234			for word in words]
235			return {
236			'data': results,
237			'summary': None
238			}
239
240
241			def get_consonant_repeat_frequency(words):
242			"""Check for repeating consonant frequency for a given set of words.
243
244			Args:
245			words (list): A list of words
246
247			Returns:
248			dict: The data and summary results.
249			"""
250			count = 0
251			cons = re.compile(r'[^a\|e\|i\|o\|u{6}]')
252			for val in words:
253			if re.match(cons, val):
254			count += 1
255			return {
256			'data': count,
257			'summary': None
258			}
259
260
261			def get_consonant_duplicate_repeat_frequency(words):
262			"""Check for duplicate repeating consonant frequency.
263
264			Args:
265			words (list): A list of words
266
267			Returns:
268			dict: The data and summary results.
269			"""
270			count = 0
271			cons_dup = re.compile(r'[^a\|e\|i\|o\|u]{1,}')
272			for name in words:
273			if re.match(cons_dup, name):
274			count += 1
275			return {
276			'data': count,
277			'summary': None
278			}
279
280
281			def get_vowel_repeat_frequency(words):
282			"""Check for repeating vowel frequency for a given set of words.
283
284			Args:
285			words (list): A list of words
286
287			Returns:
288			dict: The data and summary results.
289			"""
290			count = 0
291			cons_vowel = re.compile(r'[aeiou{3}]')
292			for val in words:
293			if re.match(cons_vowel, val):
294			count += 1
295			return {
296			'data': count,
297			'summary': None
298			}
299
300
301			def get_adjective_verb_or_noun(words):
302			"""Get the number of words that are classified as verbs or nouns.
303
304			Args:
305			words (TYPE): Description
306
307			Returns:
308			dict: The data and summary results.
309			"""
310			total = len(words)
311			data = {'verbs': 0, 'nouns': 0}
312			verby = ['VBP', 'VB', 'RB', 'VBG']
313			nouns = ['NN', 'NNP']
314			for word, tag in pos_tag(words):
315			if tag in nouns:
316			data['nouns'] += 1
317			elif tag in verby:
318			data['verbs'] += 1
319			remainder = total - (data['verbs'] + data['nouns'])
320			return {
321			'data': data,
322			'summary': ('Of {0} words, {1} were nouns, {2} were verbs, '
323			'and {3} were everything else.').format(
324			total, data['nouns'], data['verbs'], remainder)
325			}
326
327
328			def categorize_word_type(words):
329			"""Get the common naming strategy 'category' of a name, based on precedence.
330
331			Categories are derived from
332			http://www.thenameinspector.com/10-name-types/,
333			so it is important to note there is no agreed upon standard,
334			meaning it is ultimately a little arbitrary.
335
336			Since it is a bit challenging to actually determine its type,
337			we give a weighting for each word based on a few known metrics.
338			This can be updated in the future so that weightings are binary
339			(e.g. 0.0 and 100.0), giving traditional False/True.
340
341			Categories ====
342
343			1. Real Words
344			1a. Misspelled words
345			1b. Foreign words
346			2. Compounds
347			3. Phrases
348			4. Blends
349			5. Tweaked
350			6. Affixed
351			7. Fake/obscure
352			8. Puns
353			9. People's names
354			10. Initials and Acronyms
355
356			Args:
357			words (list): A list of words
358
359			Returns:
360			new_words (list) - A list of lists, with each word
361			and its distribution by word "type"
362			"""
363			new_words = []
364
365			def _get_distribution(word):
366			# TODO:
367			# misspelled, foreign, tweaked, affixed, fake_obscure,
368			# initials_acronym, blend, puns, person, compound
369			"""Return the distribution for all categories, given a single word."""
370			categories = {
371			'real': 0,
372			'misspelled': 0,
373			'foreign': 0,
374			'compound': 0,
375			'phrase': 0,
376			'blend': 0,
377			'tweaked': 0,
378			'affixed': 0,
379			'fake_obscure': 0,
380			'puns': 0,
381			'person': 0,
382			'initials_acronym': 0,
383			}
384			if len(word.split(' ')) == 1:
385			# Real words are single
386			categories['real'] = 50
387			else:
388			# Phrases are not
389			categories['phrase'] = 50
390			# If word cannot be tagged,
391			# it's very likely fake_obscure
392			if pos_tag([word])[0][1] == '-NONE-':
393			categories['real'] = 0
394			categories['fake_obscure'] = 75
395			return categories
396
397			for word in words:
398			new_words.append([word, _get_distribution(word)])
399			return new_words
400
401
402			def generate_all_metrics(filename=None, words=None):
403			"""Generate all metrics in this module in one place.
404
405			Args:
406			filename (str, optional): A filename to load words from.
407			words (TYPE, optional): Words to use, if file is not specified.
408
409			Returns:
410			dict: All metrics results, keyed by name.
411			"""
412			if not filename and not words:
413			return None
414			if filename:
415			allnames = prep_file(filename)
416			else:
417			allnames = words
418			return {
419			'names': allnames,
420			'metrics': {
421			'digits_freq': get_digits_frequency(allnames),
422			'length': name_length(allnames),
423			'vowel_beginning': name_starts_with_vowel(allnames),
424			'vowel_count': name_vowel_count(allnames),
425			'name_length': name_length(allnames),
426			'name_spaces': get_name_spaces(allnames),
427			'consonant_repeat_freq': get_consonant_repeat_frequency(allnames),
428			'consonant_dup_repeat_freq': get_consonant_duplicate_repeat_frequency(allnames),
429			'vowel_repeat_freq': get_vowel_repeat_frequency(allnames),
430			'special_characters': get_special_chars(allnames),
431			'name_numbers': get_named_numbers_1_10(allnames),
432			'adj_verb_noun': get_adjective_verb_or_noun(allnames),
433			'first_letter_freq': get_first_letter_frequency(allnames),
434			'word_types': get_word_types(allnames)
435			}
436			}
437

christabor / namebot

Push — master ( 4d1361...6dc915 )

namebot.get_keyword_relevancy_map() A

Complexity

Size

Duplication

Duplication Side-by-Side

Filter issues like