Completed
Push — master ( fe329d...7d09bc )
by Chris
39s
created

url2wordcloud()   F

Complexity

Conditions 12

Size

Total Lines 41

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 12
c 1
b 0
f 0
dl 0
loc 41
rs 2.7855

How to fix   Complexity   

Complexity

Complex classes like url2wordcloud() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
flask_jsondash.data_utils.wordcloud
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Utilities for working with wordcloud formatted data.

:copyright: (c) 2016 by Chris Tabor.
:license: MIT, see LICENSE for more details.
"""
13
14
from collections import Counter
15
from string import punctuation
16
import re
17
18
import requests
19
from pyquery import PyQuery as Pq
20
21
# Py2/3 compat: ``unicode`` only exists on Python 2, so fall back to
# ``str`` when looking it up raises NameError.
try:
    _unicode = unicode
except NameError:
    _unicode = str


# NLTK stopwords. Kept as a plain list so existing callers that index,
# extend or otherwise mutate it keep working.
stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
    'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
    'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
    'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
]
45
46
47
def get_word_freq_distribution(words):
    """Get the counted word frequency distribution of all words.

    Words that appear in the module-level ``stopwords`` list are skipped.
    The comparison is an exact (case-sensitive) match, so callers should
    lowercase their words first if they want stopwords removed reliably.

    Args:
        words (list): A list of strings indicating words.

    Returns:
        collections.Counter: The Counter object with word frequencies.
    """
    # Build the lookup set once per call: membership tests become O(1)
    # instead of scanning the whole stopwords list for every word, while
    # still honouring any changes made to ``stopwords`` before the call.
    excluded = frozenset(stopwords)
    return Counter(w for w in words if w not in excluded)
57
58
59
def format_4_wordcloud(words, size_multiplier=2):
    """Format words in a way suitable for wordcloud plugin.

    Args:
        words (list): A list of 2-tuples of format:
            (word-string, occurrences).
        size_multiplier (int, optional): The size multiplier to scale
            word sizing. Can improve visual display of word cloud.

    Returns:
        list: A list of dicts with the keys ``text`` (the word) and
            ``size`` (occurrences multiplied by ``size_multiplier``).
            Entries whose word is empty are left out.
    """
    return [
        {'text': word, 'size': size * size_multiplier}
        for (word, size) in words if word
    ]
74
75
76
def url2wordcloud(url, requests_kwargs=None,
                  exclude_punct=True,
                  normalized=True,
                  limit=None,
                  size=1,
                  min_len=None):
    """Convert the text content of a url's html to a wordcloud config.

    Args:
        url (str): The url to load.
        requests_kwargs (dict, optional): The kwargs to pass to the
            requests library. (e.g. auth, headers, timeout)
        exclude_punct (bool, optional): exclude punctuation
        normalized (bool, optional): normalize data by lowercasing
        limit (int, optional): the number of items to limit
            (by most common), if any
        size (int, optional): the size multiplier handed to
            :func:`~format_4_wordcloud`
        min_len (int, optional): the minimum required length, if any

    Returns:
        same value as :func:`~format_4_wordcloud`; an empty list when the
        response status code is not 200.
    """
    # ``None`` replaces the former mutable ``{}`` default; both mean
    # "no extra keyword arguments".
    resp = requests.get(url, **(requests_kwargs or {}))
    if resp.status_code != 200:
        return []
    body_text = Pq(resp.content).find('body').text()
    tokens = _prepare_words(
        body_text,
        exclude_punct=exclude_punct,
        normalized=normalized,
        min_len=min_len,
    )
    counts = get_word_freq_distribution(tokens)
    if limit is not None:
        words = counts.most_common(limit)
    else:
        words = list(counts.items())
    return format_4_wordcloud(words, size_multiplier=size)


def _prepare_words(text, exclude_punct=True, normalized=True, min_len=None):
    """Split raw text into the cleaned list of words to be counted."""
    # Split on any run of whitespace, not just single spaces: otherwise
    # words separated only by a newline or tab stay in one token and the
    # punctuation stripping below would glue them together.
    # ``or ''`` is a guard in case no text was extracted at all.
    tokens = (text or '').split()
    if exclude_punct:
        # NOTE(review): this pattern also removes non-ASCII letters
        # (e.g. accented characters); kept as-is to preserve output.
        tokens = [re.sub(r'[^a-zA-Z0-9]+', '', w) for w in tokens]
    if min_len is not None:
        tokens = [w for w in tokens if len(w) >= min_len]
    if normalized:
        tokens = [w.lower() for w in tokens]
    # Drop tokens that ended up empty (pure punctuation) so they are not
    # counted and cannot take up one of the ``limit`` slots.
    return [w for w in tokens if w]
117