|
1
|
|
|
from tempfile import mkdtemp |
|
2
|
|
|
from pathlib import Path |
|
3
|
|
|
from warnings import warn |
|
4
|
|
|
|
|
5
|
|
|
import requests |
|
6
|
|
|
|
|
7
|
|
|
from ocrd.constants import TMP_PREFIX |
|
8
|
|
|
from ocrd_utils import ( |
|
9
|
|
|
getLogger, |
|
10
|
|
|
is_local_filename, |
|
11
|
|
|
get_local_filename, |
|
12
|
|
|
remove_non_path_from_url, |
|
13
|
|
|
is_file_in_directory, |
|
14
|
|
|
nth_url_segment |
|
15
|
|
|
) |
|
16
|
|
|
from ocrd.workspace import Workspace |
|
17
|
|
|
from ocrd_models import OcrdMets |
|
18
|
|
|
from ocrd_models.constants import NAMESPACES as NS |
|
19
|
|
|
from ocrd_models.utils import handle_oai_response |
|
20
|
|
|
|
|
21
|
|
|
class Resolver():
    """
    Handle downloads and the creation of workspaces, including temporary directories.
    """

    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
        """
        Download (or copy) a file into a directory.

        Early shortcut: if `url` is a local file that already is the target file,
        nothing is copied and the relative filename is returned right away.

        If `basename` is not given, the last URL segment of `url` is used as the
        basename, regardless of whether `subdir` is given.

        Args:
            directory (string): Directory to download files to. Created if missing.
            url (string): URL or local filename to fetch the file from
            basename (string, None): basename part of the filename on disk.
            if_exists (string, "skip"): What to do if target file already exists. \
                One of ``skip`` (default), ``overwrite`` or ``raise``
            subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``.

        Raises:
            Exception: if `url` or `directory` is empty, or the HTTP request
                does not return status 200
            FileNotFoundError: if `url` is a local filename that does not exist
            FileExistsError: if the target exists and `if_exists` is ``raise``

        Returns:
            Local filename string, *relative* to directory
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise Exception("'url' must be a string")
        if not directory:
            raise Exception("'directory' must be a string")  # actually Path would also work

        # Make sure the target directory exists before resolving it to an absolute path
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        directory = str(directory.resolve())

        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        # `ret` is the return value: the target filename relative to `directory`
        ret = str(Path(subdir_path, basename_path))
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            local_filename = get_local_filename(url)
            try:
                # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                # Log the filename that failed to resolve (src_path is still unset here)
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
            if src_path == dst_path:
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return ret

        # Respect 'if_exists' arg; any other value falls through and overwrites
        if dst_path.exists():
            if if_exists == 'skip':
                return ret
            if if_exists == 'raise':
                raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            # NOTE(review): no timeout is passed, so a stalled server can block
            # this call indefinitely -- consider adding one.
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
            # presumably unwraps OAI-PMH responses into the payload bytes -- verify against ocrd_models.utils
            contents = handle_oai_response(response)
            dst_path.write_bytes(contents)

        return ret

    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace. \
                By default, use the directory of :py:attr:`mets_url` if that is a local file, \
                otherwise create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite an existing METS file in \
                :py:attr:`dst_dir`. By default an existing METS file is kept as is.
            mets_basename (string, None): Basename of the METS file in the workspace. \
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations. \
                By default :py:attr:`mets_url` without its last segment.

        Download (clone) :py:attr:`mets_url` into :py:attr:`dst_dir`, unless
        the former is already local and already is the target file.

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                dst_dir = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", dst_dir, mets_url)
            else:
                # Create the directory first so that the log line shows the real path
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
                  mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace

    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace. \
                If ``None``, create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string, "mets.xml"): Basename of the METS file to write.
            clobber_mets (boolean, False): Whether to overwrite an existing METS file. \
                By default an existing METS file will raise an exception.

        Raises:
            FileExistsError: if the METS file exists and :py:attr:`clobber_mets` is not set

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')
        if directory is None:
            directory = mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)

    def resolve_mets_arguments(self, directory, mets_url, mets_basename):
        """
        Resolve the ``--mets``, ``--mets-basename`` and ``--directory`` arguments
        into a coherent set of arguments according to https://github.com/OCR-D/core/issues/517

        Arguments:
            directory (string, None): value of ``--directory``
            mets_url (string, None): value of ``--mets``
            mets_basename (string, None): value of ``--mets-basename`` (deprecated)

        Raises:
            ValueError: if ``--mets`` is an http(s) URL without ``--directory``,
                if both ``--mets`` and ``--mets-basename`` are given, or if a
                local ``--mets`` does not lie within ``--directory``

        Returns:
            a tuple of strings ``(directory, mets_url, mets_basename)``
        """
        log = getLogger('ocrd.resolver.resolve_mets_arguments')
        mets_is_remote = bool(mets_url) and mets_url.startswith(('http://', 'https://'))

        # XXX we might want to be more strict like this but it might break # legacy code
        # Allow --mets and --directory together iff --mets is a remote URL
        # if directory and mets_url and not mets_is_remote:
        #     raise ValueError("Use either --mets or --directory, not both")

        # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior)
        if not directory and mets_is_remote:
            raise ValueError("--mets is an http(s) URL but no --directory was given")

        # Determine --mets-basename
        if mets_basename and mets_url:
            raise ValueError("Use either --mets or --mets-basename, not both")
        if mets_basename:
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)
        elif mets_url:
            mets_basename = Path(mets_url).name
        else:
            mets_basename = 'mets.xml'

        # Determine --directory and --mets-url
        if not directory and not mets_url:
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)
        elif directory and not mets_url:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif not directory and mets_url:
            # mets_url is known to be local here, remote URLs were rejected above
            mets_url = Path(mets_url).resolve()
            directory = mets_url.parent
        else:  # == directory and mets_url:
            directory = Path(directory).resolve()
            if not mets_is_remote:
                # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
                if Path(mets_url).parent == Path('.'):
                    mets_url = directory / mets_url
                else:
                    mets_url = Path(mets_url).resolve()
                    if not is_file_in_directory(directory, mets_url):
                        raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))

        return str(Path(directory).resolve()), str(mets_url), str(mets_basename)
|
253
|
|
|
|
|
254
|
|
|
|
|
255
|
|
|
|