"""Resolver for downloading files and creating workspaces from METS documents."""
import tempfile
from pathlib import Path

import requests

from ocrd.constants import TMP_PREFIX
from ocrd_utils import (
    getLogger,
    is_local_filename,
    get_local_filename,
    remove_non_path_from_url,
    nth_url_segment
)
from ocrd.workspace import Workspace
from ocrd_models import OcrdMets, OcrdMetsFilter

# Module-level logger shared by the Resolver methods that do not create their own.
log = getLogger('ocrd.resolver')


class Resolver:
    """
    Handle uploads, downloads, repository access and manage temporary directories.
    """

    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
        """
        Download (or copy) a file to a directory.

        Early shortcut: if ``url`` is a local file and its resolved path is
        identical to the target path, nothing is copied and the relative
        filename is returned right away.

        If ``basename`` is not given, the last segment of ``url`` (as returned
        by ``nth_url_segment``) is used as the basename, regardless of whether
        ``subdir`` is set.

        Args:
            directory (string): Directory to download files to. Created
                (including parents) if it does not exist.
            url (string): URL or local filename to fetch the file from.
            basename (string, None): basename part of the filename on disk.
            if_exists (string, "skip"): What to do if the target file already
                exists. ``skip`` (default) keeps the existing file, ``raise``
                raises ``FileExistsError``; any other value (conventionally
                ``overwrite``) replaces the existing file.
            subdir (string, None): Subdirectory to create within the
                directory. Think fileGrp.

        Returns:
            Local filename, __relative__ to directory

        Raises:
            ValueError: if ``url`` or ``directory`` is empty/``None``.
            FileNotFoundError: if ``url`` is a local filename that does not exist.
            FileExistsError: if the target exists and ``if_exists == 'raise'``.
            Exception: if the HTTP request does not answer with status 200.
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        # ValueError is a subclass of Exception, so callers catching the
        # previously raised bare Exception keep working.
        if not url:
            raise ValueError("'url' must be a string")
        if not directory:
            raise ValueError("'directory' must be a string")  # actually Path would also work

        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        directory = str(directory.resolve())

        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        # 'ret' is the path relative to 'directory', 'dst_path' the full target path
        ret = str(Path(subdir_path, basename_path))
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            local_filename = get_local_filename(url)
            try:
                # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                # Log the filename we tried to resolve; src_path is still unset here.
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
            if src_path == dst_path:
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return ret

        # Respect 'if_exists' arg
        if dst_path.exists():
            if if_exists == 'skip':
                return ret
            if if_exists == 'raise':
                raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            # NOTE(review): no timeout is passed, so a stalled server can block
            # this call indefinitely -- consider adding one.
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
            dst_path.write_bytes(response.content)

        return ret

    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone it).

        Fetches the METS file into ``dst_dir`` and wraps the directory in a
        ``Workspace``.

        Arguments:
            mets_url (string): Source mets URL. A relative local filename is
                made absolute against the current working directory.
            dst_dir (string, None): Target directory for the workspace. If
                not given, the parent directory of a local ``mets_url`` is
                used; for non-local URLs a temporary directory is created.
            clobber_mets (boolean, False): Whether to overwrite an existing
                METS file in ``dst_dir``. By default an existing file is kept
                as-is (``if_exists='skip'``).
            mets_basename (string, None): Basename of the METS file in the
                workspace. Defaults to the last URL segment of ``mets_url``.
            download (boolean or dict, False): Whether to download the files
                referenced in the METS. A dict (including an empty one) is
                passed as keyword arguments to ``OcrdMetsFilter`` to select
                the files; any other truthy value downloads with an
                unparameterized filter.
            src_baseurl (string, None): Base URL for resolving relative file
                locations. Defaults to ``mets_url`` without its last segment.

        Returns:
            Workspace

        Raises:
            ValueError: if ``mets_url`` is ``None``.
        """

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            url_path = remove_non_path_from_url(mets_url)
            # Guard against an empty segment: s[:-0] would yield '' and drop the whole URL
            if last_segment:
                url_path = url_path[:-len(last_segment)]
            src_baseurl = remove_non_path_from_url(url_path)

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                dst_dir = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", dst_dir, mets_url)
            else:
                # Create the directory first so the log line shows its actual name
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
                  mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        # XXX an empty dict is false-y but valid in this context
        if download or download == {}:
            if not isinstance(download, dict):
                download = {}
            mets_filter = OcrdMetsFilter(**download)
            for f in mets_filter.find_files(workspace):
                workspace.download_file(f)

        return workspace

    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Writes a fresh, empty METS file into ``directory`` and returns a
        ``Workspace`` for it.

        Arguments:
            directory (string, None): Directory of the new workspace. Created
                (including parents) if missing; if ``None``, a temporary
                directory is created.
            mets_basename (string, 'mets.xml'): Basename of the METS file to write.
            clobber_mets (boolean, False): Whether to overwrite an existing
                METS file of that name.

        Returns:
            Workspace

        Raises:
            FileExistsError: if the METS file already exists and
                ``clobber_mets`` is not set.
        """
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)