|
1
|
|
|
import gzip |
|
|
|
|
|
|
2
|
|
|
import logging |
|
3
|
|
|
import shutil |
|
4
|
|
|
import zipfile |
|
5
|
|
|
from datetime import datetime |
|
6
|
|
|
from pathlib import Path |
|
7
|
|
|
from typing import Mapping, Optional |
|
8
|
|
|
from urllib import request |
|
9
|
|
|
|
|
10
|
|
|
from pocketutils.core import PathLike |
|
|
|
|
|
|
11
|
|
|
|
|
12
|
|
|
# Module-level logger; the fixed name "pocketutils" (rather than __name__) means
# records from this module are emitted under the package-wide logger.
logger = logging.getLogger("pocketutils")
|
13
|
|
|
|
|
14
|
|
|
|
|
15
|
|
|
class WebResource:
    """
    A file downloaded from a URL and kept at a local path.

    Useful for extracting files from ZIP and GZIPing them.
    Alongside the file, a sidecar ``<local_path>.info`` text file records the URL,
    the download time, and the response headers as ``key=value`` lines.
    """

    def __init__(self, url: str, archive_member: Optional[str], local_path: PathLike):
        """
        Args:
            url: Location to download from.
            archive_member: Name of the member to take out of the downloaded ZIP archive,
                or ``None`` to use the downloaded file itself.
            local_path: Where the final file is stored. If its suffix is ``.gz`` and the
                content is not already gzip data, the content is gzip-compressed.
        """
        self._url = url
        self._archive_member = archive_member
        self._local_path = Path(local_path)

    def download(self, redownload: bool = False) -> None:
        """
        Download the resource to ``path`` and write its ``.info`` sidecar file.

        Does nothing if the local file already exists and ``redownload`` is false.

        Args:
            redownload: Download again even if the local file already exists.

        Raises:
            KeyError: If ``archive_member`` is not present in the downloaded archive.
        """
        to_path = Path(self._local_path)
        if to_path.exists() and not redownload:
            return
        now = datetime.now()
        # All temporary files are siblings of the destination: nothing is written to the
        # current working directory, and the final rename stays on one filesystem.
        prefix = str(to_path) + now.strftime("%Y%m%d-%H%M%S-%f")
        dled = Path(prefix + ".tmp")
        unzipped = Path(prefix + ".member.tmp")
        packed = Path(prefix + ".gz.tmp")
        to_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            logger.info("Downloading %s...", self._url)
            _, response = request.urlretrieve(self._url, str(dled))
            if self._archive_member is None:
                ready = dled
            else:
                # Stream the member out rather than ZipFile.extract(), which would
                # recreate the member's directory tree under the working directory.
                with zipfile.ZipFile(dled, "r") as zfile:
                    with zfile.open(self._archive_member) as f_in, unzipped.open("wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
                ready = unzipped
            if to_path.suffix == ".gz" and not self.__is_gzip(ready):
                with ready.open("rb") as f_in, gzip.open(packed, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
                ready = packed
            # Rename the finished file into place only at the end, so a failure above
            # never leaves a partial file that exists() would report as downloaded.
            ready.replace(to_path)
            fields = {
                "url": self._url,
                "datetime_downloaded": now.isoformat(),
                # Header lines are folded onto one line so each field stays one line.
                "response": str(response).replace("\n", " |"),
            }
            self._info_path.write_text(
                "".join(f"{key}={value}\n" for key, value in fields.items()),
                encoding="utf-8",
            )
        finally:
            for leftover in (dled, unzipped, packed):
                leftover.unlink(missing_ok=True)

    def datetime_downloaded(self) -> datetime:
        """
        Return the download time recorded in the ``.info`` file.

        Raises:
            FileNotFoundError: If the ``.info`` file does not exist.
            KeyError: If the ``.info`` file has no ``datetime_downloaded`` entry.
        """
        return datetime.fromisoformat(self.metadata()["datetime_downloaded"])

    def metadata(self) -> Mapping[str, str]:
        """
        Parse the ``.info`` file into a mapping of stripped keys to stripped values.

        Each line is split at its first ``=``; lines without one (such as blank lines)
        are skipped.

        Raises:
            FileNotFoundError: If the ``.info`` file does not exist.
        """
        entries = {}
        for line in self._info_path.read_text(encoding="utf-8").splitlines():
            key, sep, value = line.partition("=")
            if sep:
                entries[key.strip()] = value.strip()
        return entries

    @property
    def _info_path(self) -> Path:
        """Path of the sidecar file: the local path with ``.info`` appended to its suffix."""
        return self._local_path.with_suffix(self._local_path.suffix + ".info")

    def exists(self) -> bool:
        """Return whether the local file exists."""
        return self._local_path.exists()

    @property
    def path(self) -> Path:
        """The local path of the resource."""
        return self._local_path

    def delete(self) -> None:
        """Delete the local file and its ``.info`` sidecar, ignoring whichever is missing."""
        self.path.unlink(missing_ok=True)
        # Remove the sidecar too, so metadata() cannot describe a file that is gone.
        self._info_path.unlink(missing_ok=True)

    def __is_gzip(self, path):
        """
        Return whether the file at ``path`` starts with the gzip magic bytes.

        Checks the two-byte signature directly rather than relying on the wording of
        an exception message from the ``gzip`` module; an empty file is not gzip.
        """
        with Path(path).open("rb") as f:
            return f.read(2) == b"\x1f\x8b"
|
97
|
|
|
|
|
98
|
|
|
|
|
99
|
|
|
# Public API of this module: only WebResource is exported by a star-import.
__all__ = ["WebResource"]
|
100
|
|
|
|