load_csv()   B
last analyzed

Complexity

Conditions 5

Size

Total Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 5
c 1
b 0
f 0
dl 0
loc 11
rs 8.5454
1
#!/usr/bin/env python
2
# encoding=utf8
3
"""
4
Get top word charts from a CSV file of logged tweets:
5
    word_charts.py -n 100
6
Get top in a given year:
7
    word_charts.py -y 2014 -n 100
8
Get top in a given year that weren't in the previous year's top:
9
    word_charts.py -y 2014 -n 100 --diff
10
Get top for a search term:
11
    word_charts.py  -s favorite
12
    word_charts.py  -s favourite
13
"""
14
from __future__ import print_function
15
from most_frequent_words import most_frequent_words_and_counts, commafy
16
import argparse
17
import csv
18
19
20
def filter_year(tweets, desired_year):
    """
    Keep only the tweets created in desired_year.

    The year is read as an int from the last four characters of each
    tweet's "created_at" value. Prints the number of matching tweets.

    param tweets: an iterable of dict-like tweets with a "created_at" key.
    param desired_year: the year to keep, as an int.
    return: a new list of the matching tweets, in their original order.
    """
    matching = [
        entry for entry in tweets
        if desired_year == int(entry["created_at"][-4:])
    ]
    label = "Total in " + str(desired_year) + ":\t"
    print(label + commafy(len(matching)))
    return matching
28
29
30
def filter_search_term(tweets, desired_search_term):
    """
    Keep only the tweets whose "search_term" contains desired_search_term.

    The match is a containment test (`in`), not an equality test.
    Prints the number of matching tweets.

    param tweets: an iterable of dict-like tweets with a "search_term" key.
    param desired_search_term: the term to look for.
    return: a new list of the matching tweets, in their original order.
    """
    matching = [
        entry for entry in tweets
        if desired_search_term in entry["search_term"]
    ]
    label = "Total " + desired_search_term + ":\t"
    print(label + commafy(len(matching)))
    return matching
37
38
39
def print_chart(top, as_html=None):
    """
    Print a chart of words and their counts, one entry per line.

    Plain output is numbered from 1: "1. word (count)". HTML output is an
    <ol> list with one <li> per entry.

    param top: a list of (word, count) tuples.
    param as_html: True for HTML markup, False for plain text. When None
        (the default), the module-level args.html set by the command-line
        parser is used, so existing one-argument calls keep working.
    """
    if as_html is None:
        as_html = args.html

    if not as_html:
        for position, (word, count) in enumerate(top, 1):
            print(str(position) + ". " + word + " (" + commafy(count) + ")")
        return

    # Words come from logged tweets, so escape them before embedding in
    # markup. quote=False: only &, < and > matter inside element text.
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2

    print("<ol>")
    for word, count in top:
        print("<li>{0} ({1})</li>".format(
            escape(word, quote=False), commafy(count)))
    print("</ol>")
54
55
56
def print_top(tweets, number=10, year=None, search_term=None):
    """
    Print a titled chart of the most frequent words and return the chart.

    The tweets are first narrowed by search_term, then by year, when those
    are given (truthy). The "word" value of each remaining tweet is then
    passed to most_frequent_words_and_counts.

    param tweets: an iterable of dict-like tweets with a "word" key.
    param number: how many entries to request for the chart.
    param year: if given, only use tweets from this year.
    param search_term: if given, only use tweets for this search term.
    return: whatever most_frequent_words_and_counts returned for the words.
    """
    selected = tweets
    if search_term:
        selected = filter_search_term(selected, search_term)
    if year:
        selected = filter_year(selected, year)

    word_list = [entry['word'] for entry in selected]

    heading = "# Top " + str(number)
    if year:
        heading += " (" + str(year) + ")"

    print()
    print(heading)
    print()

    chart = most_frequent_words_and_counts(word_list, number)
    print_chart(chart)
    return chart
77
78
79
# cmd.exe cannot do Unicode so encode first
def print_it(text):
    """
    Print text to stdout as UTF-8 encoded bytes, followed by a newline.

    param text: the text to print.
    """
    import sys  # local import: only this function needs it

    encoded = text.encode('utf-8')

    if sys.version_info[0] < 3:
        # Python 2: print writes a byte string to stdout unchanged.
        print(encoded)
        return

    byte_stream = getattr(sys.stdout, 'buffer', None)
    if byte_stream is None:
        # stdout was replaced by a text-only stream with no byte layer;
        # let that stream handle the text itself.
        print(text)
        return

    # Python 3: print(bytes) would emit the b'...' repr rather than the
    # text, so write the encoded bytes to the underlying binary stream.
    # Flush first so earlier print() output keeps its place in the order.
    sys.stdout.flush()
    byte_stream.write(encoded + b"\n")
    byte_stream.flush()
82
83
84
def load_csv(filename, encoding='utf-8'):
    """
    Load logged tweets from a CSV file as a list of dicts, one per row.

    Rows are keyed by the CSV header. A row is skipped when its 'id_str'
    has already been seen (duplicate) or when its 'word' is 'actually'.

    param filename: path of the CSV file to read.
    param encoding: encoding used to decode the file on Python 3. Ignored
        on Python 2, where the file is read as bytes.
        NOTE(review): UTF-8 is an assumption; an older commented-out
        variant of this code used cp1252 - confirm against the data.
    return: a list of the kept rows, in file order.
    """
    import sys  # local import: only needed for the version switch

    if sys.version_info[0] >= 3:
        # Python 3's csv module needs a text-mode file; newline='' lets the
        # csv reader handle line endings inside quoted fields itself.
        fd = open(filename, mode='r', newline='', encoding=encoding)
    else:
        # Python 2's csv module needs a binary-mode file.
        fd = open(filename, mode='rb')

    excluded_words = ('actually',)
    rows = []
    seen = set()  # avoid duplicates
    with fd:
        for row in csv.DictReader(fd):
            if row['id_str'] in seen or row['word'] in excluded_words:
                continue
            seen.add(row['id_str'])
            rows.append(row)
    return rows
95
96
97
if __name__ == "__main__":
    # Command-line entry point: parse the options, load the CSV and print
    # either a single chart or a year-on-year comparison.
    parser = argparse.ArgumentParser(
        description="Get top word charts from a CSV file of logged tweets.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-c', '--csv', default='M:/bin/data/favibot.csv',
        help='Input CSV file')
    parser.add_argument(
        '-n', '--top-number',  type=int, default=10,
        help="Show top X")
    parser.add_argument(
        '-y', '--year',  type=int, default=None,
        help="Only from this year")
    parser.add_argument(
        '-s', '--search_term',
        help="Only for this search term")
    parser.add_argument(
        '--diff', action='store_true',
        help="Compare a year to the previous year")
    parser.add_argument(
        '--html', action='store_true',
        help="Output with html markup")
    # Must stay a module-level name: print_chart reads args.html.
    args = parser.parse_args()

    tweets = load_csv(args.csv)
    print("Total tweets:\t" + commafy(len(tweets)))

    # --diff only has an effect together with --year; otherwise a single
    # chart is printed.
    if args.diff and args.year:
        last_year = print_top(tweets, number=args.top_number, year=args.year-1,
                              search_term=args.search_term)
        this_year = print_top(tweets, number=args.top_number, year=args.year,
                              search_term=args.search_term)

        # Keep this year's entries, in chart order, whose word was not in
        # last year's chart. Each entry is indexed as (word, ...).
        last_years_words = set(e[0] for e in last_year)
        top = [e for e in this_year if e[0] not in last_years_words]

        print()
        top_x = " top " + str(args.top_number)
        print("# New entries in the " + str(args.year) + top_x +
              " which weren't in the " + str(args.year-1) + top_x)
        print()
        print_chart(top)

    else:
        print_top(tweets, number=args.top_number, year=args.year,
                  search_term=args.search_term)
151
152
153
# End of file
154