|
1
|
|
|
#! /usr/bin/python3 |
|
2
|
|
|
# vim: set expandtab tabstop=4 shiftwidth=4 : |
|
3
|
|
|
"""Module to define logic common to all comics.""" |
|
4
|
|
|
|
|
5
|
|
|
import json |
|
6
|
|
|
import time |
|
7
|
|
|
import os |
|
8
|
|
|
from datetime import date |
|
9
|
|
|
from urlfunctions import get_filename_from_url, get_file_at_url |
|
10
|
|
|
import inspect |
|
11
|
|
|
import logging |
|
12
|
|
|
|
|
13
|
|
|
|
|
14
|
|
|
def get_date_for_comic(comic):
    """Build the publication date of a comic.

    `comic` is a mapping holding the integer fields 'year', 'month' and
    'day'; a KeyError is raised if any of them is missing.
    """
    return date(year=comic['year'], month=comic['month'], day=comic['day'])
|
17
|
|
|
|
|
18
|
|
|
|
|
19
|
|
|
def get_info_before_comic(comic):
    """Yield the text lines to display before the images of a comic.

    Only the 'author' field is used: when it is present and non-empty, a
    single 'by <author>' line is produced; otherwise nothing is yielded.
    """
    writer = comic.get('author')
    if not writer:
        return
    yield 'by ' + writer
|
24
|
|
|
|
|
25
|
|
|
|
|
26
|
|
|
def get_info_after_comic(comic):
    """Yield the pieces of information to display after the images.

    The fields are looked up in a fixed order and only the ones that are
    present with a truthy value are produced, unchanged.
    """
    fields = ('alt', 'title', 'title2', 'texts', 'name', 'description')
    values = (comic.get(field) for field in fields)
    yield from (value for value in values if value)
|
32
|
|
|
|
|
33
|
|
|
|
|
34
|
|
|
class GenericComic(object):
    """Generic class to handle the logic common to all comics.

    Everything is exposed through class attributes and class methods, so the
    class is used directly without being instantiated. get_next_comic() is
    the hook that has to be implemented to actually retrieve comics.

    Attributes :
        name        Name of the comic (for logging, CLI and default output dir)
        long_name   Long name of the comic (to be added in the comic info)
        url         Base url for the comic (without trailing slash)
        _categories Categories added by this class (see get_categories())."""
    name = None
    long_name = None
    url = None
    _categories = ('ALL', )

    @classmethod
    def log(cls, string):
        """Log `string` at DEBUG level, prefixed by the names of the calling
        function and of the comic."""
        # TODO: https://docs.python.org/2/library/logging.html#logrecord-attributes
        # we do not need to retrieve the function name manually
        # Only the direct caller is needed: walking one frame up is much
        # cheaper than building the whole inspect.stack().
        current = inspect.currentframe()
        caller_name = '<unknown>' if current is None else current.f_back.f_code.co_name
        del current  # do not keep a reference to our own frame
        # Lazy %-formatting also copes with `name` still being None.
        logging.debug("%s %s %s", caller_name, cls.name, string)

    @classmethod
    def _get_output_dir(cls):
        """Returns the name of the output directory (for comics and JSON file).
        To be overridden if needed."""
        return cls.name

    @classmethod
    def _create_output_dir(cls):
        """Create output directory for the comic on the file system.

        Nothing happens if the directory already exists."""
        cls.log("start")
        os.makedirs(cls._get_output_dir(), exist_ok=True)
        cls.log("done")

    @classmethod
    def _get_json_file_path(cls):
        """Get the path to the JSON file: '<name>.json' in the output dir."""
        return os.path.join(cls._get_output_dir(), cls.name + '.json')

    @classmethod
    def _load_db(cls):
        """Load the JSON file to return the list of comics.

        The list contains every stored comic, including the ones flagged as
        deleted. An empty list is returned if the file cannot be opened."""
        cls.log("start")
        try:
            with open(cls._get_json_file_path(), encoding='utf-8') as db_file:
                return json.load(db_file)
        except IOError:
            return []

    @classmethod
    def get_comics(cls):
        """Return the list of (non-deleted) comics stored in the JSON file."""
        return [c for c in cls._load_db() if 'deleted' not in c]

    @classmethod
    def get_last_comic(cls, comics):
        """Return the last (non-deleted) comic of `comics`, None if there is
        no such comic."""
        return next((c for c in reversed(comics) if 'deleted' not in c), None)

    @classmethod
    def _save_db(cls, data):
        """Save the list of comics in the JSON file.

        The previous content of the file is replaced. The output directory
        is not created here."""
        cls.log("start")
        with open(cls._get_json_file_path(), 'w', encoding='utf-8') as db_file:
            json.dump(data, db_file, indent=4, sort_keys=True)
        cls.log("done")

    @classmethod
    def get_file_in_output_dir(cls, url, prefix=None, referer=None):
        """Download file from URL and save it in output folder.

        The local file name is the optional `prefix` followed by the file
        name extracted from `url`. The value returned by get_file_at_url()
        is passed through; the other methods of this class handle None as
        "the file could not be retrieved"."""
        cls.log("start (url:%s)" % url)
        filename = os.path.join(
            cls._get_output_dir(),
            ('' if prefix is None else prefix) +
            get_filename_from_url(url))
        return get_file_at_url(url, filename, referer)

    @classmethod
    def check_everything_is_ok(cls, check_duplicates=False, check_unused_files=False):
        """Perform tests on the database to check that everything is ok.

        Every non-deleted comic is checked with assertions (types of the
        fields, date not in the future, ordering, local files existing).

        Optional diagnostics, disabled by default :
            check_duplicates    report local paths and image urls used by
                                more than one comic
            check_unused_files  report files of the output directory that
                                are neither the JSON file nor a known image.
        """
        cls.log("start")
        print(cls.name, ': about to check')
        comics = cls.get_comics()  # cls._load_db()
        # Map each local path / image url to the indices of the comics using it
        imgs_paths = {}
        imgs_urls = {}
        prev_date, prev_num = None, None
        today = date.today()
        for i, comic in enumerate(comics):
            cls.print_comic(comic)
            url = comic.get('url')
            assert isinstance(url, str), "Url %s not a string" % url
            assert comic.get('comic') == cls.long_name
            assert all(isinstance(comic.get(k), int)
                       for k in ['day', 'month', 'year']), \
                "Invalid date data (%s)" % url
            curr_date = get_date_for_comic(comic)
            assert curr_date <= today
            curr_num = comic.get('num', 0)
            assert isinstance(curr_num, int)
            # Comics must be sorted by date or, failing that, by number
            assert prev_date is None or prev_date <= curr_date or \
                prev_num < curr_num, \
                "Comics are not in order (%s)" % url
            prev_date, prev_num = curr_date, curr_num
            img = comic.get('img')
            local_img = comic.get('local_img')
            assert isinstance(img, list)
            assert isinstance(local_img, list)
            assert len(local_img) == len(img)
            for path in local_img:
                # None stands for an image that could not be downloaded
                if path is not None:
                    assert os.path.isfile(path)
                    imgs_paths.setdefault(path, set()).add(i)
            for img_url in img:
                imgs_urls.setdefault(img_url, set()).add(i)
        print()
        if check_duplicates:  # To check if imgs are not overriding themselves
            for path, nums in imgs_paths.items():
                if len(nums) > 1:
                    print("Image used multiple times", path, nums)
            for img_url, nums in imgs_urls.items():
                if len(nums) > 1:
                    print("Url used multiple times", img_url, nums)
        if check_unused_files:  # To check that all files in folder are useful
            json_path = cls._get_json_file_path()
            output_dir = cls._get_output_dir()
            for file_ in os.listdir(output_dir):
                file_path = os.path.join(output_dir, file_)
                if file_path not in imgs_paths and file_path != json_path:
                    print("Unused image", file_path)
        cls.log("done")

    @classmethod
    def get_next_comic(cls, _):
        """Generator to get the next comic.

        First argument is the last properly downloaded comic which gives
        a starting point to download more.

        This is the method called by update(). It should yield comics which
        are basically dictionaries with the following properties :
            - 'url' is linked to a string
            - 'img' is linked to a list of url (that will get downloaded)
            - 'day'/'month'/'year' are self explicit. They are linked to
                integers corresponding to the comic dates. There should be
                all of them or none of them
            - more fields can be provided."""
        raise NotImplementedError

    @classmethod
    def print_text(cls, text):
        """Print text by returning to the beginning of the line every time."""
        # Trailing spaces are there to cover a longer text printed before
        print(cls.name, ':', text, ' ' * 10, '\r', end='')

    @classmethod
    def print_comic(cls, comic):
        """Print information about a comic (its url)."""
        cls.print_text(comic['url'])

    @classmethod
    def update(cls):
        """Update the database : get the latest comics and save in the DB.

        This is a wrapper around get_next_comic() providing the following
        generic features :
            - logging
            - database handling (open and save)
            - exception handling (properly retrieved data are always saved)
            - file download
            - data management (adds current date if no date is provided)."""
        cls.log("start")
        print(cls.name, ': about to update')
        cls._create_output_dir()
        comics = cls._load_db()
        new_comics = []
        start = time.time()
        try:
            last_comic = cls.get_last_comic(comics)
            cls.log("last comic is %s" % ('None' if last_comic is None else last_comic['url']))
            for comic in cls.get_next_comic(last_comic):
                cls.log("got %s" % str(comic))
                assert 'url' in comic
                assert 'img' in comic
                if 'day' in comic:
                    assert all(isinstance(comic.get(k), int) for k in ['day', 'month', 'year'])
                else:
                    # No date provided: use the date of retrieval
                    assert all(k not in comic for k in ['day', 'month', 'year'])
                    day = date.today()
                    comic['day'], comic['month'], comic['year'] = \
                        day.day, day.month, day.year
                prefix = comic.get('prefix', '')
                comic['local_img'] = [cls.get_file_in_output_dir(i, prefix, referer=comic['url'])
                                      for i in comic['img']]
                comic['comic'] = cls.long_name
                comic['new'] = None  # "'new' in comic" to check if new
                new_comics.append(comic)
                cls.print_comic(comic)
        finally:
            # Reached even on error so that what has been retrieved is saved
            end = time.time()
            if new_comics:
                print()
                cls._save_db(comics + new_comics)
                print(cls.name, ": added", len(new_comics),
                      "comics in", end - start, "seconds")
            else:
                print(cls.name, ": nothing new")
        cls.log("done")

    @classmethod
    def try_to_get_missing_resources(cls):
        """Download images that might not have been downloaded properly in
        the first place.

        Images whose local path is None are downloaded again. Comics for
        which an image is retrieved get flagged as new and the database is
        saved if at least one image was retrieved."""
        cls.log("start")
        print(cls.name, ': about to try to get missing resources')
        cls._create_output_dir()
        comics = cls._load_db()
        change = False
        for comic in comics:
            comicurl = comic['url']
            local = comic['local_img']
            prefix = comic.get('prefix', '')
            for i, (path, url) in enumerate(zip(local, comic['img'])):
                if path is None:
                    new_path = cls.get_file_in_output_dir(url, prefix, referer=comicurl)
                    if new_path is None:
                        print(cls.name, ': failed to get', url)
                    else:
                        print(cls.name, ': got', url, 'at', new_path)
                        local[i] = new_path
                        change = True
                        comic['new'] = None
        if change:
            cls._save_db(comics)
            print(cls.name, ": some missing resources have been downloaded")
        cls.log("done")

    @classmethod
    def reset_new(cls):
        """Remove the 'new' flag on comics in the DB."""
        cls.log("start")
        cls._create_output_dir()
        cls._save_db([{key: val for key, val in c.items() if key != 'new'} for c in cls._load_db()])
        cls.log("done")

    @classmethod
    def delete_last(cls):
        """Delete last (non-deleted) comic.

        The comic is kept in the DB and only flagged as deleted."""
        cls.log("start")
        comics = cls._load_db()
        last_comic = cls.get_last_comic(comics)
        if last_comic is None:
            cls.log("no comic to delete")
        else:
            cls.log("about to delete %s" % last_comic['url'])
            last_comic['deleted'] = None  # "'deleted' in comic" to check if deleted
            cls._save_db(comics)
        cls.log("done")

    @classmethod
    def delete_all(cls):
        """Delete all comics.

        The comics are kept in the DB and only flagged as deleted."""
        cls.log("start")
        comics = cls._load_db()
        if comics:
            for c in comics:
                c['deleted'] = None  # "'deleted' in comic" to check if deleted
            cls._save_db(comics)
        cls.log("done")

    @classmethod
    def print_name(cls):
        """Print name."""
        cls.log("start")
        print(cls.name)
        cls.log("end")

    @classmethod
    def info(cls):
        """Print information about the comics.

        Gives the names, categories, number of (new) comics, number of
        images and the period covered by the non-deleted comics."""
        cls.log("start")
        print("%s (%s) : " % (cls.long_name, cls.url))
        print("In " + ', '.join(cls.get_categories()))
        cls._create_output_dir()
        comics = cls.get_comics()  # cls._load_db()
        dates = [get_date_for_comic(c) for c in comics]
        print("%d comics (%d new)" % (len(comics), sum(1 for c in comics if 'new' in c)))
        print("%d images" % sum(len(c['img']) for c in comics))
        if dates:
            date_min, date_max = min(dates), max(dates)
            print("from %s to %s (%d days)" % (date_min, date_max, (date_max - date_min).days))
        print()
        cls.log("done")

    @classmethod
    def readme(cls):
        """Return information to generate README (markdown list item)."""
        return ' * [%s](%s)\n' % (cls.long_name, cls.url)

    @classmethod
    def gitignore(cls):
        """Return information to generate gitignore (name followed by a
        newline)."""
        return '%s\n' % (cls.name)

    @classmethod
    def get_categories(cls):
        """Return categories to be able to group comics.

        Categories are such that all classes have their ancestors'
        categories and their own (provided as an iterable in the
        `_categories` class member). They are returned as a sorted list
        without duplicates."""
        return sorted({cat
                       for klass in cls.__mro__
                       for cat in getattr(klass, '_categories', [])})
|
344
|
|
|
|