1
|
|
|
from tempfile import mkdtemp |
2
|
|
|
from pathlib import Path |
3
|
|
|
from warnings import warn |
4
|
|
|
|
5
|
|
|
import requests |
6
|
|
|
|
7
|
|
|
from ocrd.constants import TMP_PREFIX |
8
|
|
|
from ocrd_utils import ( |
9
|
|
|
getLogger, |
10
|
|
|
is_local_filename, |
11
|
|
|
get_local_filename, |
12
|
|
|
remove_non_path_from_url, |
13
|
|
|
is_file_in_directory, |
14
|
|
|
nth_url_segment |
15
|
|
|
) |
16
|
|
|
from ocrd.workspace import Workspace |
17
|
|
|
from ocrd_models import OcrdMets |
18
|
|
|
from ocrd_models.constants import NAMESPACES as NS |
19
|
|
|
from ocrd_models.utils import handle_oai_response |
20
|
|
|
|
21
|
|
|
class Resolver():
    """
    Handle uploads, downloads, repository access, and manage temporary directories
    """

    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
        """
        Download (or copy) a file to a directory.

        Early Shortcut: If `url` is a local file and that file is already the
        target file within the directory, keep it there.

        If `basename` is not given, the URL segment returned by
        ``nth_url_segment(url)`` is used as the basename, whether or not
        `subdir` is set.

        Args:
            directory (string): Directory to download files to. Created if missing.
            url (string): URL or local filename to download/copy from
            basename (string, None): basename part of the filename on disk.
            if_exists (string, "skip"): What to do if target file already exists. \
                One of ``skip`` (default), ``overwrite`` or ``raise``. \
                Any value other than ``skip`` or ``raise`` overwrites.
            subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``.

        Returns:
            Local filename string, *relative* to directory

        Raises:
            Exception: if `url` or `directory` is empty, or if the HTTP request
                does not return status 200.
            FileNotFoundError: if `url` is a local filename that does not exist.
            FileExistsError: if the target exists and `if_exists` is ``raise``.
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise Exception("'url' must be a string")
        if not directory:
            raise Exception("'directory' must be a string")  # actually Path would also work

        # Make sure the target directory exists before resolving it to an absolute path
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        directory = str(directory.resolve())

        # `ret` is the directory-relative result, `dst_path` its absolute counterpart
        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        ret = str(Path(subdir_path, basename_path))
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            local_filename = None
            try:
                local_filename = get_local_filename(url)
                # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                # src_path is still unset here, so report the unresolved local filename
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
            if src_path == dst_path:
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return ret

        # Respect 'if_exists' arg
        if dst_path.exists():
            if if_exists == 'skip':
                return ret
            if if_exists == 'raise':
                raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
            # The response is passed through handle_oai_response before being written
            contents = handle_oai_response(response)
            dst_path.write_bytes(contents)

        return ret

    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace. \
                If not given and :py:attr:`mets_url` is local, the directory containing the METS is used. \
                If not given and :py:attr:`mets_url` is remote, a temporary directory is created \
                with the prefix :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite an existing METS file in the target directory. \
                By default an existing file is kept and the download is skipped.
            mets_basename (string, None): Basename of the METS file in the workspace. \
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations. \
                By default :py:attr:`mets_url` without its last segment.

        Download (clone) :py:attr:`mets_url` to :py:attr:`mets_basename` in :py:attr:`dst_dir`, unless
        the former is already local and the latter is ``None`` or already identical to its directory name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``.
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path(Path.cwd() / mets_url))

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            mets_path_url = remove_non_path_from_url(mets_url)
            # Guard the slice: `[:-0]` would drop the whole string for an empty segment
            base = mets_path_url[:-len(last_segment)] if last_segment else mets_path_url
            src_baseurl = remove_non_path_from_url(base)

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # Create the directory first so the log line shows the actual path
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
                  mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace

    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace. \
                If ``None``, create a temporary directory with the prefix :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string, "mets.xml"): Basename of the METS file to write into the directory.
            clobber_mets (boolean, False): Whether to overwrite an existing METS file. \
                By default an existing METS file will raise an exception.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: if the METS file already exists and `clobber_mets` is not set.
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')
        if directory is None:
            directory = mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)

    def resolve_mets_arguments(self, directory, mets_url, mets_basename):
        """
        Resolve the ``--mets``, ``--mets-basename`` and ``--directory`` arguments
        into a coherent set of arguments according to https://github.com/OCR-D/core/issues/517

        Arguments:
            directory (string, None): Value of ``--directory``
            mets_url (string, None): Value of ``--mets`` (local path or http(s) URL)
            mets_basename (string, None): Value of the deprecated ``--mets-basename``

        Returns:
            tuple ``(directory, mets_url, mets_basename)`` of strings, with
            ``directory`` resolved to an absolute path

        Raises:
            ValueError: if ``--mets`` is an http(s) URL without ``--directory``,
                if both ``--mets`` and ``--mets-basename`` are given, or if a local
                ``--mets`` with a directory part is not within ``--directory``.
        """
        mets_is_remote = bool(mets_url) and mets_url.startswith(('http://', 'https://'))

        # XXX A stricter policy (allow --mets together with --directory only if
        # --mets is a remote URL) is conceivable but might break legacy code.

        # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior)
        if not directory and mets_is_remote:
            raise ValueError("--mets is an http(s) URL but no --directory was given")

        # Determine --mets-basename
        if not mets_basename and mets_url:
            mets_basename = Path(mets_url).name
        elif not mets_basename and not mets_url:
            mets_basename = 'mets.xml'
        elif mets_basename and mets_url:
            raise ValueError("Use either --mets or --mets-basename, not both")
        else:
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)

        # Determine --directory and --mets-url
        if not directory and not mets_url:
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)
        elif directory and not mets_url:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif not directory and mets_url:
            # Cannot be remote here: remote without --directory was rejected above
            mets_url = Path(mets_url).resolve()
            directory = mets_url.parent
        else:  # == directory and mets_url:
            directory = Path(directory).resolve()
            if not mets_is_remote:
                # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
                if Path(mets_url).parent == Path('.'):
                    mets_url = directory / mets_url
                else:
                    mets_url = Path(mets_url).resolve()
                    if not is_file_in_directory(directory, mets_url):
                        raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))

        return str(Path(directory).resolve()), str(mets_url), str(mets_basename)
253
|
|
|
|
254
|
|
|
|
255
|
|
|
|