1
|
|
|
#!/usr/bin/env python |
2
|
|
|
# -*- coding: utf-8 -*- |
3
|
|
|
|
4
|
|
|
""" |
5
|
|
|
flask_jsondash.data_utils.wordcloud |
6
|
|
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
7
|
|
|
|
8
|
|
|
Utilities for working with wordcloud formatted data. |
9
|
|
|
|
10
|
|
|
:copyright: (c) 2016 by Chris Tabor. |
11
|
|
|
:license: MIT, see LICENSE for more details. |
12
|
|
|
""" |
13
|
|
|
|
14
|
|
|
from collections import Counter |
15
|
|
|
from string import punctuation |
16
|
|
|
import re |
17
|
|
|
|
18
|
|
|
import requests |
19
|
|
|
from pyquery import PyQuery as Pq |
20
|
|
|
|
21
|
|
|
# Py2/3 compat.
try:
    # Python 2: the builtin ``unicode`` type exists.
    _unicode = unicode
except NameError:
    # Python 3: there is no ``unicode`` builtin; ``str`` is the text type.
    _unicode = str
26
|
|
|
|
27
|
|
|
|
28
|
|
|
# NLTK stopwords: common English words excluded from frequency counts.
stopwords = (
    'i me my myself we our ours ourselves you your '
    'yours yourself yourselves he him his himself she '
    'her hers herself it its itself they them their '
    'theirs themselves what which who whom this that '
    'these those am is are was were be been being '
    'have has had having do does did doing a an '
    'the and but if or because as until while of '
    'at by for with about against between into '
    'through during before after above below to from '
    'up down in out on off over under again '
    'further then once here there when where why '
    'how all any both each few more most other '
    'some such no nor not only own same so than '
    'too very s t can will just don should now'
).split()
45
|
|
|
|
46
|
|
|
|
47
|
|
|
def get_word_freq_distribution(words):
    """Get the counted word frequency distribution of all words.

    Stopwords are excluded from the distribution.

    Arg:
        words (list): A list of strings indicating words.

    Returns:
        collections.Counter: The Counter object with word frequencies.
    """
    # Build a set once so each membership test is O(1) instead of an
    # O(n) scan of the stopwords list; feed Counter a generator to
    # avoid materializing an intermediate list.
    stops = set(stopwords)
    return Counter(w for w in words if w not in stops)
57
|
|
|
|
58
|
|
|
|
59
|
|
|
def format_4_wordcloud(words, size_multiplier=2):
    """Format words in a way suitable for wordcloud plugin.

    Args:
        words (list): A list 2-tuples of format: (word-string, occurences).
        size_multiplier (int, optional): The size multiplier to scale
            word sizing. Can improve visual display of word cloud.

    Returns:
        list: A list of dicts w/ appropriate keys.
    """
    formatted = []
    for word, count in words:
        # Skip empty/falsy word strings entirely.
        if not word:
            continue
        formatted.append({'text': word, 'size': count * size_multiplier})
    return formatted
74
|
|
|
|
75
|
|
|
|
76
|
|
|
def url2wordcloud(url, requests_kwargs=None,
                  exclude_punct=True,
                  normalized=True,
                  limit=None,
                  size=1,
                  min_len=None):
    """Convert the text content of a urls' html to a wordcloud config.

    Args:
        url (str): The url to load.
        requests_kwargs (dict, optional): The kwargs to pass to the
            requests library. (e.g. auth, headers, mimetypes)
        exclude_punct (bool, optional): exclude punctuation
        normalized (bool, optional): normalize data by
            lowercasing and stripping whitespace
        limit (int, optional): the number of items to limit
            (by most common), if any
        size (int, optional): the size multiplier passed on to
            :func:`~format_4_wordcloud`
        min_len (int, optional): the minimum required word length, if any

    Returns:
        same value as :func:`~format_4_wordcloud`
    """
    # Mutable default arguments are shared across calls; default to None
    # and substitute a fresh dict here instead.
    resp = requests.get(url, **(requests_kwargs or {}))
    if resp.status_code != 200:
        # Best-effort: an unreachable/错误-free empty config on failure.
        return []
    words = Pq(resp.content).find('body').text().split(' ')
    if exclude_punct:
        # Strip all non-alphanumeric characters from each remaining word.
        words = [
            re.sub(r'[^a-zA-Z0-9]+', '', w) for w
            in words if w not in punctuation
        ]
    if min_len is not None:
        words = [w for w in words if len(w) >= min_len]
    if normalized:
        words = [w.lower() for w in words]
    freq = get_word_freq_distribution(words)
    if limit is not None:
        pairs = freq.most_common(limit)
    else:
        pairs = list(freq.items())
    return format_4_wordcloud(pairs, size_multiplier=size)
117
|
|
|
|