1
|
|
|
#! /usr/bin/python3 |
2
|
|
|
# vim: set expandtab tabstop=4 shiftwidth=4 : |
3
|
|
|
"""Module to define logic common to all comics.""" |
4
|
|
|
|
5
|
|
|
import json |
6
|
|
|
import time |
7
|
|
|
import os |
8
|
|
|
from datetime import date |
9
|
|
|
from urlfunctions import get_filename_from_url, get_file_at_url |
10
|
|
|
import inspect |
11
|
|
|
import logging |
12
|
|
|
|
13
|
|
|
|
14
|
|
|
def get_date_for_comic(comic):
    """Build the `date` object describing when a comic was published.

    `comic` is a mapping providing the 'year', 'month' and 'day' entries
    (looked up in that order); a missing entry raises KeyError.
    """
    parts = (comic[field] for field in ('year', 'month', 'day'))
    return date(*parts)
17
|
|
|
|
18
|
|
|
|
19
|
|
|
def get_info_before_comic(comic):
    """Yield the pieces of text to be displayed before the images.

    At the moment this is only the author line, and only when the comic
    has a non-empty 'author' entry.
    """
    author = comic.get('author')
    if not author:
        return
    yield 'by ' + author
24
|
|
|
|
25
|
|
|
|
26
|
|
|
def get_info_after_comic(comic):
    """Yield the pieces of information to be displayed after the images.

    The known descriptive fields are looked up in a fixed order and only
    the ones with a truthy value are produced.
    """
    fields = ('alt', 'title', 'title2', 'texts', 'name', 'description')
    # filter(None, ...) keeps truthy values only; map/filter stay lazy.
    yield from filter(None, map(comic.get, fields))
32
|
|
|
|
33
|
|
|
|
34
|
|
|
class GenericComic(object):
    """Generic class to handle the logic common to all comics.

    The class is used through its class methods only: concrete comics
    subclass it, set the attributes below and implement get_next_comic().

    Attributes :
        name        Name of the comic (for logging, CLI and default output dir)
        long_name   Long name of the comic (to be added in the comic info)
        url         Base url for the comic (without trailing slash)
        _categories Categories added by the class (see get_categories()).
    """
    name = None
    long_name = None
    url = None
    _categories = ('ALL', )

    @classmethod
    def log(cls, string):
        """Log `string` at DEBUG level.

        The message is prefixed with the name of the calling function and
        the name of the comic."""
        # Looking at the parent frame is much cheaper than inspect.stack(),
        # which builds a record for every frame on each call.
        frame = inspect.currentframe()
        try:
            caller = frame.f_back.f_code.co_name
        except AttributeError:
            # currentframe() returned None (no frame support): slow path.
            caller = inspect.stack()[1][3]
        finally:
            del frame  # do not keep a reference cycle on the frame alive
        # Lazy %-formatting: works even when cls.name is None (base class).
        logging.debug("%s %s %s", caller, cls.name, string)

    @classmethod
    def _get_output_dir(cls):
        """Returns the name of the output directory (for comics and JSON file).
        To be overridden if needed."""
        return cls.name

    @classmethod
    def _create_output_dir(cls):
        """Create output directory for the comic on the file system."""
        cls.log("start")
        os.makedirs(cls._get_output_dir(), exist_ok=True)
        cls.log("done")

    @classmethod
    def _get_json_file_path(cls):
        """Get the full path to the JSON file."""
        return os.path.join(cls._get_output_dir(), cls.name + '.json')

    @classmethod
    def _load_db(cls):
        """Load the JSON file to return the list of comics.

        An empty list is returned when the file cannot be opened."""
        cls.log("start")
        try:
            with open(cls._get_json_file_path()) as file:
                return json.load(file)
        except IOError:
            return []

    @classmethod
    def get_comics(cls):
        """Return the list of (non-deleted) comics."""
        return [c for c in cls._load_db() if 'deleted' not in c]

    @classmethod
    def get_last_comic(cls, comics):
        """Return the last (non-deleted) comic of `comics` or None."""
        return next((c for c in reversed(comics) if 'deleted' not in c), None)

    @classmethod
    def _save_db(cls, data):
        """Save the list of comics in the JSON file.

        The data is first written to a temporary file next to the JSON file
        which is then swapped in place: if serialization or writing fails,
        the exception is propagated and the previous database is untouched
        (it used to be truncated before anything was written)."""
        cls.log("start")
        json_path = cls._get_json_file_path()
        tmp_path = json_path + '.tmp'
        try:
            with open(tmp_path, 'w') as file:
                json.dump(data, file, indent=4, sort_keys=True)
            os.replace(tmp_path, json_path)
        finally:
            # Only still there when something went wrong before the swap.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        cls.log("done")

    @classmethod
    def get_file_in_output_dir(cls, url, prefix=None, referer=None):
        """Download file from URL and save it in output folder.

        The local file name is the one extracted from the URL, preceded by
        `prefix` if any. Returns whatever get_file_at_url() returns."""
        cls.log("start (url:%s)" % url)
        filename = os.path.join(
            cls._get_output_dir(),
            ('' if prefix is None else prefix) + get_filename_from_url(url))
        return get_file_at_url(url, filename, referer)

    @classmethod
    def check_everything_is_ok(cls):
        """Perform tests on the database to check that everything is ok.

        Raises AssertionError on the first inconsistency found."""
        cls.log("start")
        print(cls.name, ': about to check')
        comics = cls.get_comics()  # cls._load_db()
        imgs_paths = {}  # local path -> indices of the comics using it
        imgs_urls = {}  # image url -> indices of the comics using it
        prev_date, prev_num = None, None
        today = date.today()
        for i, comic in enumerate(comics):
            cls.print_comic(comic)
            url = comic.get('url')
            assert isinstance(url, str), "Url %s not a string" % url
            assert comic.get('comic') == cls.long_name
            assert all(isinstance(comic.get(k), int)
                       for k in ['day', 'month', 'year']), \
                "Invalid date data (%s)" % url
            curr_date = get_date_for_comic(comic)
            assert curr_date <= today
            curr_num = comic.get('num', 0)
            assert isinstance(curr_num, int)
            # Comics must be sorted by date or, failing that, by number.
            assert prev_date is None or prev_date <= curr_date or \
                prev_num < curr_num, \
                "Comics are not in order (%s)" % url
            prev_date, prev_num = curr_date, curr_num
            img = comic.get('img')
            local_img = comic.get('local_img')
            assert isinstance(img, list)
            assert isinstance(local_img, list)
            assert len(local_img) == len(img)
            for path in local_img:
                # None stands for an image that could not be downloaded.
                if path is not None:
                    assert os.path.isfile(path)
                    imgs_paths.setdefault(path, set()).add(i)
            for img_url in img:
                imgs_urls.setdefault(img_url, set()).add(i)
        print()
        if False:  # To check if imgs are not overriding themselves
            for path, nums in imgs_paths.items():
                if len(nums) > 1:
                    print("Image used multiple times", path, nums)
            for img_url, nums in imgs_urls.items():
                if len(nums) > 1:
                    print("Url used multiple times", img_url, nums)
        if False:  # To check that all files in folder are useful
            # Not called `json` so that the module is not shadowed.
            json_path = cls._get_json_file_path()
            output_dir = cls._get_output_dir()
            for file_ in os.listdir(output_dir):
                file_path = os.path.join(output_dir, file_)
                if file_path not in imgs_paths and file_path != json_path:
                    print("Unused image", file_path)
        cls.log("done")

    @classmethod
    def get_next_comic(cls, _):
        """Generator to get the next comic.

        First argument is the last properly downloaded comic which gives
        a starting point to download more.

        This is the method called by update(). It should yield comics which
        are basically dictionaries with the following properties :
            - 'url' is linked to a string
            - 'img' is linked to a list of url (that will get downloaded)
            - 'day'/'month'/'year' are self explicit. They are linked to
                integers corresponding to the comic dates. There should be
                all of them or none of them
            - more fields can be provided."""
        raise NotImplementedError

    @classmethod
    def print_text(cls, text):
        """Print text by returning to the beginning of the line every time."""
        print(cls.name, ':', text, ' ' * 10, '\r', end='')

    @classmethod
    def print_comic(cls, comic):
        """Print information about a comic."""
        cls.print_text(comic['url'])

    @classmethod
    def update(cls):
        """Update the database : get the latest comics and save in the DB.

        This is a wrapper around get_next_comic() providing the following
        generic features :
            - logging
            - database handling (open and save)
            - exception handling (properly retrieved data are always saved)
            - file download
            - data management (adds current date if no date is provided)."""
        cls.log("start")
        print(cls.name, ': about to update')
        cls._create_output_dir()
        comics = cls._load_db()
        new_comics = []
        start = time.time()
        try:
            last_comic = cls.get_last_comic(comics)
            cls.log("last comic is %s" %
                    ('None' if last_comic is None else last_comic['url']))
            for comic in cls.get_next_comic(last_comic):
                cls.log("got %s" % str(comic))
                assert 'url' in comic
                assert 'img' in comic
                if 'day' in comic:
                    assert all(isinstance(comic.get(k), int)
                               for k in ['day', 'month', 'year'])
                else:
                    # No date at all : use the download date instead.
                    assert all(k not in comic
                               for k in ['day', 'month', 'year'])
                    day = date.today()
                    comic['day'], comic['month'], comic['year'] = \
                        day.day, day.month, day.year
                prefix = comic.get('prefix', '')
                comic['local_img'] = [
                    cls.get_file_in_output_dir(i, prefix, referer=comic['url'])
                    for i in comic['img']]
                comic['comic'] = cls.long_name
                comic['new'] = None  # "'new' in comic" to check if new
                new_comics.append(comic)
                cls.print_comic(comic)
        finally:
            # Whatever happened, keep what has been properly retrieved.
            end = time.time()
            if new_comics:
                print()
                cls._save_db(comics + new_comics)
                print(cls.name, ": added", len(new_comics),
                      "comics in", end - start, "seconds")
            else:
                print(cls.name, ": nothing new")
        cls.log("done")

    @classmethod
    def try_to_get_missing_resources(cls):
        """Download images that might not have been downloaded properly in
        the first place.

        Images whose local path is None are downloaded again; the database
        is saved only if at least one of them could be retrieved."""
        cls.log("start")
        print(cls.name, ': about to try to get missing resources')
        cls._create_output_dir()
        comics = cls._load_db()
        change = False
        for comic in comics:
            comicurl = comic['url']
            local = comic['local_img']
            prefix = comic.get('prefix', '')
            for i, (path, url) in enumerate(zip(local, comic['img'])):
                if path is not None:
                    continue
                new_path = cls.get_file_in_output_dir(
                    url, prefix, referer=comicurl)
                if new_path is None:
                    print(cls.name, ': failed to get', url)
                else:
                    print(cls.name, ': got', url, 'at', new_path)
                    local[i] = new_path
                    change = True
                    comic['new'] = None
        if change:
            cls._save_db(comics)
            print(cls.name, ": some missing resources have been downloaded")
        cls.log("done")

    @classmethod
    def reset_new(cls):
        """Remove the 'new' flag on comics in the DB."""
        cls.log("start")
        cls._create_output_dir()
        cls._save_db([{key: val for key, val in c.items() if key != 'new'}
                      for c in cls._load_db()])
        cls.log("done")

    @classmethod
    def delete_last(cls):
        """Delete last (non-deleted) comic.

        The comic is only flagged as deleted, it stays in the JSON file."""
        cls.log("start")
        comics = cls._load_db()
        last_comic = cls.get_last_comic(comics)
        if last_comic is None:
            cls.log("no comic to delete")
        else:
            cls.log("about to delete %s" % last_comic['url'])
            last_comic['deleted'] = None  # "'deleted' in comic" to check if deleted
            cls._save_db(comics)
        cls.log("done")

    @classmethod
    def delete_all(cls):
        """Delete all comics.

        The comics are only flagged as deleted, they stay in the JSON file."""
        cls.log("start")
        comics = cls._load_db()
        if comics:
            for c in comics:
                c['deleted'] = None  # "'deleted' in comic" to check if deleted
            cls._save_db(comics)
        cls.log("done")

    @classmethod
    def print_name(cls):
        """Print name."""
        cls.log("start")
        print(cls.name)
        cls.log("end")

    @classmethod
    def info(cls):
        """Print information about the comics."""
        cls.log("start")
        print("%s (%s) : " % (cls.long_name, cls.url))
        print("In " + ', '.join(cls.get_categories()))
        cls._create_output_dir()
        comics = cls.get_comics()  # cls._load_db()
        dates = [get_date_for_comic(c) for c in comics]
        print("%d comics (%d new)" %
              (len(comics), sum(1 for c in comics if 'new' in c)))
        print("%d images" % sum(len(c['img']) for c in comics))
        if dates:
            date_min, date_max = min(dates), max(dates)
            print("from %s to %s (%d days)" %
                  (date_min, date_max, (date_max - date_min).days))
        print()
        cls.log("done")

    @classmethod
    def readme(cls):
        """Return information to generate README."""
        return ' * [%s](%s)\n' % (cls.long_name, cls.url)

    @classmethod
    def gitignore(cls):
        """Return information to generate gitignore."""
        return '%s\n' % (cls.name)

    @classmethod
    def get_categories(cls):
        """Return categories to be able to group comics.

        Categories are such that all classes have their ancestors'
        categories and their own (provided as an iterable in the
        `_categories` class member). The result is a sorted list without
        duplicates."""
        return sorted({cat
                       for klass in cls.__mro__
                       for cat in getattr(klass, '_categories', [])})
344
|
|
|
|