1
|
|
|
import gzip |
|
|
|
|
2
|
|
|
import logging |
3
|
|
|
import shutil |
4
|
|
|
import zipfile |
5
|
|
|
from datetime import datetime |
6
|
|
|
from pathlib import Path |
7
|
|
|
from typing import Mapping, Optional |
8
|
|
|
from urllib import request |
9
|
|
|
|
10
|
|
|
from pocketutils.core import PathLike |
|
|
|
|
11
|
|
|
|
12
|
|
|
# Module-level logger registered under the fixed name "pocketutils";
# WebResource.download uses it to report each download it starts.
logger = logging.getLogger("pocketutils")
13
|
|
|
|
14
|
|
|
|
15
|
|
|
class WebResource:
    """
    A remote file that is downloaded on demand and kept at a local path.

    Useful for extracting files from ZIP and GZIPing them: when an archive
    member is given, the download is opened as a ZIP archive and only that
    member is kept; when the local path ends in ``.gz`` and the payload is not
    already gzip data, the payload is gzip-compressed while being stored.

    Each successful download also writes a sidecar file, ``<local_path>.info``,
    holding ``url=``, ``datetime_downloaded=`` and ``response=`` lines.
    """

    def __init__(self, url: str, archive_member: Optional[str], local_path: "PathLike"):
        """
        Args:
            url: Location to fetch; handed unchanged to ``urllib.request.urlretrieve``.
            archive_member: Name of the ZIP member to keep, or ``None`` to keep
                the downloaded file itself.
            local_path: Where the final file is stored.
        """
        self._url = url
        self._archive_member = archive_member
        self._local_path = Path(local_path)

    def download(self, redownload: bool = False) -> None:
        """
        Fetch the resource to the local path and write the ``.info`` sidecar.

        Does nothing when the local file already exists, unless ``redownload``
        is true. All intermediate files are created next to the destination
        and removed again whether or not the download succeeds.

        Args:
            redownload: Fetch again even if the local file already exists.

        Raises:
            Whatever ``urlretrieve``, ``zipfile``, ``gzip`` or the file system
            raise; nothing is caught here.
        """
        now = datetime.now()
        to_path = self._local_path
        if to_path.exists() and not redownload:
            return
        # The temporary files live beside the destination, so it must exist first.
        to_path.parent.mkdir(parents=True, exist_ok=True)
        stamp = now.strftime("%Y%m%d-%H%M%S-%f")
        dled = Path(str(to_path) + stamp + ".tmp")
        # Private scratch directory for the ZIP member. Extracting into the
        # current working directory could overwrite (and then delete) an
        # unrelated file and would leave the member's parent folders behind.
        unzip_dir = Path(str(to_path) + stamp + ".unzip")
        try:
            logger.info("Downloading %s...", self._url)
            _, response = request.urlretrieve(self._url, str(dled))
            if self._archive_member is None:
                extracted = dled
            else:
                with zipfile.ZipFile(dled, "r") as zfile:
                    extracted = Path(zfile.extract(self._archive_member, path=unzip_dir))
            if to_path.suffix == ".gz" and not self.__is_gzip(extracted):
                # Destination wants gzip but the payload is raw: compress it.
                with extracted.open("rb") as f_in, gzip.open(to_path, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
            else:
                shutil.move(str(extracted), str(to_path))
            # Newlines in the header dump are flattened so the sidecar stays
            # one "key=value" pair per line.
            headers = str(response).replace("\n", " |")
            self._info_path.write_text(
                f"url={self._url}\n"
                f"datetime_downloaded={now.isoformat()}\n"
                f"response={headers}\n",
                encoding="utf-8",
            )
        finally:
            # Covers the extracted member (if any) and a partially written download.
            shutil.rmtree(unzip_dir, ignore_errors=True)
            dled.unlink(missing_ok=True)

    def datetime_downloaded(self) -> datetime:
        """
        Return the download time recorded in the ``.info`` sidecar.

        Raises:
            KeyError: If the sidecar has no ``datetime_downloaded`` entry.
        """
        return datetime.fromisoformat(self.metadata()["datetime_downloaded"])

    def metadata(self) -> Mapping[str, str]:
        """
        Parse the ``.info`` sidecar into a dict of stripped keys and values.

        Each line is split at its first ``=``, so values may themselves contain
        ``=`` (URLs with query strings do). Lines without any ``=``, including
        blank lines, carry no pair and are skipped.
        """
        entries = {}
        for line in self._info_path.read_text(encoding="utf-8").splitlines():
            key, sep, value = line.partition("=")
            if sep:
                entries[key.strip()] = value.strip()
        return entries

    @property
    def _info_path(self) -> Path:
        """Path of the sidecar file: the local path with ``.info`` appended to its suffix."""
        return self._local_path.with_suffix(self._local_path.suffix + ".info")

    def exists(self) -> bool:
        """Return whether the local file is present."""
        return self._local_path.exists()

    @property
    def path(self) -> Path:
        """The local path the resource is stored at."""
        return self._local_path

    def delete(self) -> None:
        """Remove the local file if it is present; the ``.info`` sidecar is left untouched."""
        self.path.unlink(missing_ok=True)

    def __is_gzip(self, path) -> bool:
        """
        Report whether the file at ``path`` holds gzip data.

        The decision is made on the two gzip magic bytes rather than on the
        wording of an exception message. A file that does start with the magic
        is then probed with the gzip reader, so a damaged archive still raises
        instead of being stored silently.
        """
        with open(path, "rb") as raw:
            if raw.read(2) != b"\x1f\x8b":
                return False
        with gzip.open(path, "rb") as f:
            f.read(20)  # forces the header (and the start of the stream) to be decoded
        return True
97
|
|
|
|
98
|
|
|
|
99
|
|
|
# Public API of this module: a wildcard import exposes only WebResource.
__all__ = ["WebResource"]
100
|
|
|
|