| 1 |  |  | #!/usr/bin/env python | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | # encoding=utf8 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | Get top word charts from a CSV file of logged tweets: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |     word_charts.py -n 100 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | Get top in a given year: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |     word_charts.py -y 2014 -n 100 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | Get top in a given year that aren't in the previous year. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |     word_charts.py -y 2014 -n 100 --diff | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | Get top for a search term: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |     word_charts.py  -s favorite | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |     word_charts.py  -s favourite | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | from __future__ import print_function | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | from most_frequent_words import most_frequent_words_and_counts, commafy | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | import argparse | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | import csv | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | def filter_year(tweets, desired_year): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |     found = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |     for tweet in tweets: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |         year = int(tweet["created_at"][-4:]) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |         if desired_year == year: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |             found.append(tweet) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |     print("Total in " + str(desired_year) + ":\t" + commafy(len(found))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |     return found | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  | def filter_search_term(tweets, desired_search_term): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |     found = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |     for tweet in tweets: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |         if desired_search_term in tweet["search_term"]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  |             found.append(tweet) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |     print("Total " + desired_search_term + ":\t" + commafy(len(found))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |     return found | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  | def print_chart(top): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |     """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |     param top: a list of (word, count) tuples. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |     """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |     if args.html: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |         print("<ol>") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |     for i, (word, count) in enumerate(top): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |         if args.html: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |             print("<li>{0} ({1})</li>".format(word, commafy(count))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |             print(str(i+1) + ". " + word + " (" + commafy(count) + ")") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |     if args.html: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |         print("</ol>") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 55 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 56 |  |  | def print_top(tweets, number=10, year=None, search_term=None): | 
            
                                                                        
                            
            
                                    
            
            
                | 57 |  |  |     words = [] | 
            
                                                                        
                            
            
                                    
            
            
                | 58 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 59 |  |  |     if search_term: | 
            
                                                                        
                            
            
                                    
            
            
                | 60 |  |  |         tweets = filter_search_term(tweets, search_term) | 
            
                                                                        
                            
            
                                    
            
            
                | 61 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 62 |  |  |     if year: | 
            
                                                                        
                            
            
                                    
            
            
                | 63 |  |  |         tweets = filter_year(tweets, year) | 
            
                                                                        
                            
            
                                    
            
            
                | 64 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 65 |  |  |     for tweet in tweets: | 
            
                                                                        
                            
            
                                    
            
            
                | 66 |  |  |         words.append(tweet['word']) | 
            
                                                                        
                            
            
                                    
            
            
                | 67 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 68 |  |  |     print() | 
            
                                                                        
                            
            
                                    
            
            
                | 69 |  |  |     title = "# Top " + str(number) | 
            
                                                                        
                            
            
                                    
            
            
                | 70 |  |  |     if year: | 
            
                                                                        
                            
            
                                    
            
            
                | 71 |  |  |         title += " (" + str(year) + ")" | 
            
                                                                        
                            
            
                                    
            
            
                | 72 |  |  |     print(title) | 
            
                                                                        
                            
            
                                    
            
            
                | 73 |  |  |     print() | 
            
                                                                        
                            
            
                                    
            
            
                | 74 |  |  |     top = most_frequent_words_and_counts(words, number) | 
            
                                                                        
                            
            
                                    
            
            
                | 75 |  |  |     print_chart(top) | 
            
                                                                        
                            
            
                                    
            
            
                | 76 |  |  |     return top | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  | # cmd.exe cannot do Unicode so encode first | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  | def print_it(text): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |     print(text.encode('utf-8')) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  | def load_csv(filename): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |     # with codecs.open(filename, mode='rb', encoding='cp1252') as fd: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |     with open(filename, mode='rb') as fd: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |         data = csv.DictReader(fd) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |         rows = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |         seen = set()  # avoid duplicates | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |         for row in data: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |             if row['id_str'] not in seen and row['word'] not in ['actually']: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |                 seen.add(row['id_str']) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |                 rows.append(row) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |     return rows | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  | if __name__ == "__main__": | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |     parser = argparse.ArgumentParser( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |         description="Get top word charts from a CSV file of logged tweets.", | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |         formatter_class=argparse.ArgumentDefaultsHelpFormatter) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |     parser.add_argument( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |         '-c', '--csv', default='M:/bin/data/favibot.csv', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |         help='Input CSV file') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |     parser.add_argument( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |         '-n', '--top-number',  type=int, default=10, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |         help="Show top X") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |     parser.add_argument( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |         '-y', '--year',  type=int, default=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |         help="Only from this year") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |     parser.add_argument( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |         '-s', '--search_term', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |         help="Only for this search term") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |     parser.add_argument( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |         '--diff', action='store_true', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |         help="Compare a year to the previous year") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |     parser.add_argument( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |         '--html', action='store_true', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |         help="Output with html markup") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |     args = parser.parse_args() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |     tweets = load_csv(args.csv) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |     print("Total tweets:\t" + commafy(len(tweets))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  |     if args.diff and args.year: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |         last_year = print_top(tweets, number=args.top_number, year=args.year-1, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |                               search_term=args.search_term) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |         this_year = print_top(tweets, number=args.top_number, year=args.year, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |                               search_term=args.search_term) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |         last_years_words = [e[0] for e in last_year] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  |         this_years_words = [e[0] for e in this_year] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |         set1 = set(last_years_words) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |         diff = [x for x in this_years_words if x not in set1] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |         top = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |         for word in diff: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 |  |  |             for e in this_year: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |                 if e[0] == word: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |                     top.append(e) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 140 |  |  |                     continue | 
            
                                                                                                            
                            
            
                                    
            
            
                | 141 |  |  |         print() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 142 |  |  |         top_x = " top " + str(args.top_number) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 143 |  |  |         print("# New entries in the " + str(args.year) + top_x + | 
            
                                                                                                            
                            
            
                                    
            
            
                | 144 |  |  |               " which weren't in the " + str(args.year-1) + top_x) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 145 |  |  |         print() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 146 |  |  |         print_chart(top) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 147 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 148 |  |  |     else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 149 |  |  |         print_top(tweets, number=args.top_number, year=args.year, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 150 |  |  |                   search_term=args.search_term) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 151 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 152 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 153 |  |  | # End of file | 
            
                                                        
            
                                    
            
            
                | 154 |  |  |  |