ocrd.resolver   C
last analyzed

Complexity

Total Complexity 53

Size/Duplication

Total Lines 326
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 53
eloc 178
dl 0
loc 326
rs 6.96
c 0
b 0
f 0

4 Methods

Rating   Name   Duplication   Size   Complexity  
F Resolver.resolve_mets_arguments() 0 59 20
A Resolver.workspace_from_nothing() 0 27 4
F Resolver.download_to_directory() 0 129 17
D Resolver.workspace_from_url() 0 79 12

How to fix   Complexity   

Complexity

Complex classes like ocrd.resolver often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from tempfile import mkdtemp
2
from pathlib import Path
3
from warnings import warn
4
5
import requests
6
from requests.adapters import HTTPAdapter, Retry
7
8
from ocrd.constants import TMP_PREFIX
9
from ocrd_utils import (
10
    config,
11
    DEFAULT_METS_BASENAME,
12
    getLogger,
13
    is_local_filename,
14
    get_local_filename,
15
    remove_non_path_from_url,
16
    is_file_in_directory,
17
    nth_url_segment
18
)
19
from ocrd.workspace import Workspace
20
from ocrd_models import OcrdMets
21
from ocrd_models.utils import handle_oai_response
22
23
24
class Resolver():
    """
    Handle uploads, downloads, repository access, and manage temporary directories
    """

    # HTTP status codes on which a failed download is retried.
    # Probably too wide (ideally only transient failures would be listed).
    _RETRY_STATUS_CODES = (
        408,  # Request Timeout
        409,  # Conflict
        412,  # Precondition Failed
        417,  # Expectation Failed
        423,  # Locked
        424,  # Failed Dependency
        425,  # Too Early
        426,  # Upgrade Required
        428,  # Precondition Required
        429,  # Too Many Requests
        440,  # Login Timeout
        500,  # Internal Server Error
        503,  # Service Unavailable
        504,  # Gateway Timeout
        509,  # Bandwidth Limit Exceeded
        529,  # Site Overloaded
        598,  # Proxy Read Timeout
        599,  # Proxy Connect Timeout
    )

    def _fetch_remote(self, url, retries=None, timeout=None):
        """
        Fetch ``url`` via HTTP(S) and return the payload to be written to disk.

        Args:
            url (string): URL to request.

        Keyword Args:
            retries (int, None): Total number of retries. If ``None`` and
                ``OCRD_DOWNLOAD_RETRIES`` is set in ``config``, that value is used;
                otherwise no retries are attempted.
            timeout (tuple, None): Passed as ``timeout`` to the request. If ``None`` and
                ``OCRD_DOWNLOAD_TIMEOUT`` is set in ``config``, that value is used.

        Returns:
            Whatever ``handle_oai_response`` returns for the response.

        Raises:
            requests.HTTPError: if the final response has an error status.
        """
        # Only fall back to the configured value when the caller passed nothing,
        # so that an explicit ``retries=0`` really disables retrying.
        if retries is None and config.is_set('OCRD_DOWNLOAD_RETRIES'):
            retries = config.OCRD_DOWNLOAD_RETRIES
        if timeout is None and config.is_set('OCRD_DOWNLOAD_TIMEOUT'):
            timeout = config.OCRD_DOWNLOAD_TIMEOUT
        retry_policy = Retry(total=retries or 0,
                             status_forcelist=list(self._RETRY_STATUS_CODES))
        # The context manager closes the session (and its pooled connections)
        # even if the request fails.
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=retry_policy)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            return handle_oai_response(response)

    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None):
        """
        Download a URL ``url`` to a local file in ``directory``.

        If ``url`` looks like a file path, check whether that exists.
        If it does exist and is already the target file within ``directory``, return early.
        If it does exist but is a different file, copy it.
        If ``url`` does not appear to be a file path, try downloading via HTTP,
        retrying up to ``retries`` times, with ``timeout`` as the request timeout.

        If ``basename`` is not given, it is derived from ``url`` via ``nth_url_segment``.

        If the target file already exists within ``directory``, behavior depends on ``if_exists``:
            - ``skip`` (default): do nothing and return early
            - ``raise``: raise a ``FileExistsError``
            - ``overwrite`` (or any other value): overwrite the existing file

        Args:
            directory (string): Directory to download files to (created if missing)
            url (string): URL to download from

        Keyword Args:
            basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset.
            if_exists (string, "skip"): What to do if target file already exists.
                One of ``skip`` (default), ``overwrite`` or ``raise``
            subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``.
            retries (int, None): Number of retries to attempt on network failure.
                Falls back to ``OCRD_DOWNLOAD_RETRIES`` if ``None`` and configured.
            timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data.
                Falls back to ``OCRD_DOWNLOAD_TIMEOUT`` if ``None`` and configured.

        Returns:
            Local filename string, *relative* to directory

        Raises:
            ValueError: if ``url`` or ``directory`` is empty.
            FileNotFoundError: if ``url`` is a local path that does not exist.
            FileExistsError: if the target exists and ``if_exists == 'raise'``.
        """
        log = getLogger('ocrd.resolver.download_to_directory')  # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|",
                  directory, url, basename, if_exists, subdir)

        if not url:
            raise ValueError(f"'url' must be a non-empty string, not '{url}'")  # actually Path also ok
        if not directory:
            raise ValueError(f"'directory' must be a non-empty string, not '{directory}'")  # actually Path would also work

        url = str(url)
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)

        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        ret = Path(subdir_path, basename_path)   # result, relative to directory
        dst_path = Path(directory, ret)          # actual target on disk

        src_path = None
        if is_local_filename(url):
            local_filename = None
            try:
                local_filename = get_local_filename(url)
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}'")
            # src_path is resolved, so resolve dst_path as well; otherwise a
            # relative ``directory`` would never compare equal.
            if src_path == dst_path.resolve():
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return str(ret)

        # Respect 'if_exists' kwarg
        if dst_path.exists():
            if if_exists == 'skip':
                log.debug("File already exists but if_exists == %s, skipping.", if_exists)
                return str(ret)
            if if_exists == 'raise':
                raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}")
            log.debug("File already exists but if_exists == %s, overwriting.", if_exists)

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path is not None:
            # src_path set, so it is a file source, we can copy directly
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            # src_path not set, it's an http URL, try to download
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            dst_path.write_bytes(self._fetch_remote(url, retries=retries, timeout=timeout))

        return str(ret)

    def workspace_from_url(
        self,
        mets_url,
        dst_dir=None,
        clobber_mets=False,
        mets_basename=None,
        download=False,
        src_baseurl=None,
        mets_server_url=None,
        **kwargs
    ):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace. \
                By default, use the directory of a local :py:attr:`mets_url`, or create a \
                temporary directory under :py:data:`ocrd.constants.TMP_PREFIX` for a remote one. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \
                By default existing ``mets.xml`` will raise an exception.
            mets_basename (string, None): Basename of the METS file in the workspace. \
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations. \
                By default :py:attr:`mets_url` without its last segment.
            mets_server_url (string, None): URI of TCP or local path of UDS for METS server handling
                the `OcrdMets` of the workspace. By default the METS will be read from and written to
                the filesystem directly.
            **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True

        Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
        the former is already local and the latter is ``none`` or already identical to its directory name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``.
            FileExistsError: if the METS already exists in the target and ``clobber_mets`` is not set.
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' to workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                dst_dir = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", dst_dir, mets_url)
            else:
                # create first, so the log message shows the actual directory
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # exist_ok avoids a race between checking for and creating the directory
        Path(dst_dir).mkdir(parents=True, exist_ok=True)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("mets_basename='%s' mets_url='%s' src_baseurl='%s' dst_dir='%s'",
                  mets_basename, mets_url, src_baseurl, dst_dir)
        self.download_to_directory(dst_dir, mets_url, basename=mets_basename,
                                   if_exists='overwrite' if clobber_mets else 'raise')

        workspace = Workspace(self, dst_dir,
                              mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)

        if download:
            for f in workspace.mets.find_files(**kwargs):
                workspace.download_file(f)

        return workspace

    def workspace_from_nothing(self, directory, mets_basename=DEFAULT_METS_BASENAME, clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace. \
                If ``none``, create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string): Basename of the METS file to write into ``directory``.
            clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \
                By default existing ``mets.xml`` will raise an exception.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: if the METS file already exists and ``clobber_mets`` is not set.
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')
        if directory is None:
            directory = mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)

    def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None):
        """
        Resolve the ``--mets``, ``--mets-basename``, ``--directory``,
        ``--mets-server-url``, arguments into a coherent set of arguments
        according to https://github.com/OCR-D/core/issues/517

        Arguments:
            directory (string, None): Value of ``--directory``.
            mets_url (string, None): Value of ``--mets`` (http(s) URL or filesystem path).
        Keyword Arguments:
            mets_basename (string): Value of ``--mets-basename`` (deprecated). \
                NOTE: a truthy value combined with :py:attr:`mets_url` raises, so callers \
                passing :py:attr:`mets_url` must pass a falsy ``mets_basename`` explicitly.
            mets_server_url (string, None): Value of ``--mets-server-url``. Anything that is \
                not an http(s) URL is treated as a filesystem path (UDS socket) and resolved.

        Returns:
            tuple ``(directory, mets_url, mets_basename, mets_server_url)``; the first
            three are strings, ``directory`` being an absolute path.

        Raises:
            ValueError: if ``--mets`` is remote but no ``--directory`` was given, if both
                ``--mets`` and ``--mets-basename`` are set, or if a local ``--mets`` lies
                outside ``--directory``.
        """
        log = getLogger('ocrd.resolver.resolve_mets_arguments')

        # str() so that Path arguments are tolerated; bool() so the flag is a real boolean
        mets_is_remote = bool(mets_url) and str(mets_url).startswith(('http://', 'https://'))

        # XXX we might want to be more strict like this but it might break # legacy code
        # Allow --mets and --directory together iff --mets is a remote URL
        # if directory and mets_url and not mets_is_remote:
        #     raise ValueError("Use either --mets or --directory, not both")

        # If --mets is a URL, a directory must be explicitly provided
        # (not strictly necessary, but retained for legacy behavior)
        if not directory and mets_is_remote:
            raise ValueError("--mets is an http(s) URL but no --directory was given")

        # Determine --mets-basename
        if not mets_basename and mets_url:
            mets_basename = Path(mets_url).name
        elif not mets_basename and not mets_url:
            mets_basename = DEFAULT_METS_BASENAME
        elif mets_basename and mets_url:
            raise ValueError("Use either --mets or --mets-basename, not both")
        else:
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)

        # Determine --directory and --mets-url
        if not directory and not mets_url:
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)
        elif directory and not mets_url:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif not directory and mets_url:
            mets_url = Path(mets_url).resolve()
            directory = mets_url.parent
        else:  # == directory and mets_url:
            directory = Path(directory).resolve()
            if not mets_is_remote:
                # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
                if Path(mets_url).parent == Path('.'):
                    mets_url = directory / mets_url
                else:
                    mets_url = Path(mets_url).resolve()
                    if not is_file_in_directory(directory, mets_url):
                        raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (
                            mets_url, directory))

        # Only a non-URL value is a UDS socket path; an https:// URL must not be
        # mangled into a filesystem path.
        if mets_server_url and not mets_server_url.startswith(('http://', 'https://')):
            # UDS socket
            mets_server_url = str(Path(mets_server_url).resolve())

        log.debug("directory='%s' mets_url='%s', mets_basename='%s', mets_server_url='%s'",
                  directory, str(mets_url), str(mets_basename), mets_server_url)
        return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
326