1
|
|
|
from tempfile import mkdtemp |
2
|
|
|
from pathlib import Path |
3
|
|
|
from warnings import warn |
4
|
|
|
|
5
|
|
|
import requests |
6
|
|
|
from requests.adapters import HTTPAdapter, Retry |
7
|
|
|
|
8
|
|
|
from ocrd.constants import TMP_PREFIX |
9
|
|
|
from ocrd_utils import ( |
10
|
|
|
config, |
11
|
|
|
DEFAULT_METS_BASENAME, |
12
|
|
|
getLogger, |
13
|
|
|
is_local_filename, |
14
|
|
|
get_local_filename, |
15
|
|
|
remove_non_path_from_url, |
16
|
|
|
is_file_in_directory, |
17
|
|
|
nth_url_segment |
18
|
|
|
) |
19
|
|
|
from ocrd.workspace import Workspace |
20
|
|
|
from ocrd_models import OcrdMets |
21
|
|
|
from ocrd_models.utils import handle_oai_response |
22
|
|
|
|
23
|
|
|
class Resolver():
    """
    Handle uploads, downloads, repository access, and manage temporary directories
    """

    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None):
        """
        Download a URL ``url`` to a local file in ``directory``.

        If ``url`` looks like a file path, check whether that exists.
        If it does exist and already is the target file within ``directory``, return early.
        If it does exist but is a different file, copy it.
        If ``url`` does not appear to be a file path, try downloading via HTTP,
        retrying up to ``retries`` times and passing ``timeout`` to each request.

        If ``basename`` is not given, set ``basename`` to the last path segment of ``url``.

        If the target file already exists within ``directory``, behavior depends on ``if_exists``:
            - ``skip`` (default): do nothing and return early. Note that this
              returns the existing file as-is, without checking that its
              content matches ``url``.
            - ``overwrite``: overwrite the existing file
            - ``raise``: raise a ``FileExistsError``

        Args:
            directory (string): Directory to download files to
            url (string): URL to download from

        Keyword Args:
            basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset.
            if_exists (string, "skip"): What to do if target file already exists.
                One of ``skip`` (default), ``overwrite`` or ``raise``
            subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``.
            retries (int, None): Number of retries to attempt on network failure.
                If ``None``, fall back to ``OCRD_DOWNLOAD_RETRIES`` from the configuration (if set), else 0.
            timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data.
                If ``None``, fall back to ``OCRD_DOWNLOAD_TIMEOUT`` from the configuration (if set).

        Returns:
            Local filename string, *relative* to directory

        Raises:
            ValueError: if ``url`` or ``directory`` is empty
            FileNotFoundError: if ``url`` is a local path that does not exist
            FileExistsError: if the target exists and ``if_exists`` is ``raise``
            requests.HTTPError: if the HTTP download ends with an error status
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok
        if not directory:
            raise ValueError(f"'directory' must be a non-empty string, not '{directory}'") # actually Path would also work

        url = str(url)
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)

        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        # ``ret`` is the return value (relative to ``directory``), ``dst_path`` the actual target
        ret = Path(subdir_path, basename_path)
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            try:
                src_path = Path(get_local_filename(url)).resolve()
            except FileNotFoundError:
                log.error("Failed to resolve URL locally: %s which does not exist", url)
                raise
            if not src_path.exists():
                raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}'")
            # src_path is resolved, so dst_path must be resolved as well,
            # otherwise a relative ``directory`` would never compare equal
            if src_path == dst_path.resolve():
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return str(ret)

        # Respect 'if_exists' kwarg
        if dst_path.exists():
            if if_exists == 'skip':
                log.debug("File already exists but if_exists == %s, skipping.", if_exists)
                return str(ret)
            if if_exists == 'raise':
                raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}")
            log.debug("File already exists but if_exists == %s, overwriting.", if_exists)

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            # src_path set, so it is a file source, we can copy directly
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            # src_path not set, it's an http URL, try to download
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            # only fall back to the configuration if the caller did not choose
            # a value (an explicit ``retries=0`` must be respected)
            if retries is None and config.is_set('OCRD_DOWNLOAD_RETRIES'):
                retries = config.OCRD_DOWNLOAD_RETRIES
            if timeout is None and config.is_set('OCRD_DOWNLOAD_TIMEOUT'):
                timeout = config.OCRD_DOWNLOAD_TIMEOUT
            retry_policy = Retry(
                total=retries or 0,
                status_forcelist=[
                    # probably too wide (only transient failures):
                    408, # Request Timeout
                    409, # Conflict
                    412, # Precondition Failed
                    417, # Expectation Failed
                    423, # Locked
                    424, # Fail
                    425, # Too Early
                    426, # Upgrade Required
                    428, # Precondition Required
                    429, # Too Many Requests
                    440, # Login Timeout
                    500, # Internal Server Error
                    503, # Service Unavailable
                    504, # Gateway Timeout
                    509, # Bandwidth Limit Exceeded
                    529, # Site Overloaded
                    598, # Proxy Read Timeout
                    599, # Proxy Connect Timeout
                ])
            # use the session as a context manager so its connections get released
            with requests.Session() as session:
                adapter = HTTPAdapter(max_retries=retry_policy)
                session.mount('http://', adapter)
                session.mount('https://', adapter)
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                contents = handle_oai_response(response)
            dst_path.write_bytes(contents)

        return str(ret)

    def workspace_from_url(
        self,
        mets_url,
        dst_dir=None,
        clobber_mets=False,
        mets_basename=None,
        download=False,
        src_baseurl=None,
        mets_server_url=None,
        **kwargs
    ):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace. \
                By default create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \
                By default existing ``mets.xml`` will raise an exception.
            mets_basename (string, None): Basename of the METS file in the workspace. \
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations
            mets_server_url (string, None): URI of TCP or local path of UDS for METS server handling
                the `OcrdMets` of the workspace. By default the METS will be read from and written to
                the filesystem directly.
            **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True

        Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
        the former is already local and the latter is ``none`` or already identical to its directory name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' to workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path(Path.cwd() / mets_url))

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # create the directory first, so the log message can show its name
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
                  mets_basename, mets_url, src_baseurl, dst_dir)
        # an existing METS in dst_dir is only replaced if clobber_mets was requested
        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)

        if download:
            for f in workspace.mets.find_files(**kwargs):
                workspace.download_file(f)

        return workspace

    def workspace_from_nothing(self, directory, mets_basename=DEFAULT_METS_BASENAME, clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace. \
                If ``none``, create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string): Basename of the METS file to create in the directory.
            clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \
                By default existing ``mets.xml`` will raise an exception.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: if the METS file already exists and :py:attr:`clobber_mets` is not set
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')
        if directory is None:
            directory = mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError(f"METS '{mets_basename}' already exists in '{directory}' and clobber_mets not set.")
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)

    def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None):
        """
        Resolve the ``--mets``, ``--mets-basename``, ``--directory``,
        ``--mets-server-url`` arguments into a coherent set of arguments
        according to https://github.com/OCR-D/core/issues/517

        Returns:
            a tuple ``(directory, mets_url, mets_basename, mets_server_url)``;
            the first three are strings, the last one is a string or whatever
            falsy value was passed in

        Raises:
            ValueError: if ``--mets`` is an http(s) URL but no ``--directory`` was given,
                if both ``--mets`` and ``--mets-basename`` are given, or if the
                directory part of ``--mets`` is inconsistent with ``--directory``
        """
        # NOTE(review): this logger is not used below; kept in case getLogger
        # has initialization side effects -- confirm before removing
        log = getLogger('ocrd.resolver.resolve_mets_arguments')

        mets_is_remote = mets_url and mets_url.startswith(('http://', 'https://'))

        # XXX we might want to be more strict like this but it might break # legacy code
        # Allow --mets and --directory together iff --mets is a remote URL
        # if directory and mets_url and not mets_is_remote:
        #     raise ValueError("Use either --mets or --directory, not both")

        # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior)
        if not directory and mets_is_remote:
            raise ValueError("--mets is an http(s) URL but no --directory was given")

        # Determine --mets-basename
        if not mets_basename and mets_url:
            mets_basename = Path(mets_url).name
        elif not mets_basename and not mets_url:
            mets_basename = DEFAULT_METS_BASENAME
        elif mets_basename and mets_url:
            raise ValueError("Use either --mets or --mets-basename, not both")
        else:
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)

        # Determine --directory and --mets-url
        if not directory and not mets_url:
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)
        elif directory and not mets_url:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif not directory and mets_url:
            mets_url = Path(mets_url).resolve()
            directory = mets_url.parent
        else: # == directory and mets_url:
            directory = Path(directory).resolve()
            if not mets_is_remote:
                # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
                if Path(mets_url).parent == Path('.'):
                    mets_url = directory / mets_url
                else:
                    mets_url = Path(mets_url).resolve()
                    if not is_file_in_directory(directory, mets_url):
                        raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))

        # anything that is not an http(s) URL is treated as the path of a UDS socket
        if mets_server_url and not mets_server_url.startswith(('http://', 'https://')):
            # UDS socket
            mets_server_url = str(Path(mets_server_url).resolve())

        return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
316
|
|
|
|