Passed
Push — master ( 769919...66bb7b )
by Konstantin
03:06
created

ocrd.resolver   C

Complexity

Total Complexity 53

Size/Duplication

Total Lines 316
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 53
eloc 172
dl 0
loc 316
rs 6.96
c 0
b 0
f 0

4 Methods

Rating   Name   Duplication   Size   Complexity  
A Resolver.workspace_from_nothing() 0 27 4
F Resolver.download_to_directory() 0 126 17
D Resolver.workspace_from_url() 0 77 12
F Resolver.resolve_mets_arguments() 0 55 20

How to fix   Complexity   

Complexity

Complex classes like ocrd.resolver often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from tempfile import mkdtemp
2
from pathlib import Path
3
from warnings import warn
4
5
import requests
6
from requests.adapters import HTTPAdapter, Retry
7
8
from ocrd.constants import TMP_PREFIX
9
from ocrd_utils import (
10
    config,
11
    DEFAULT_METS_BASENAME,
12
    getLogger,
13
    is_local_filename,
14
    get_local_filename,
15
    remove_non_path_from_url,
16
    is_file_in_directory,
17
    nth_url_segment
18
)
19
from ocrd.workspace import Workspace
20
from ocrd_models import OcrdMets
21
from ocrd_models.utils import handle_oai_response
22
23
class Resolver():
24
    """
25
    Handle uploads, downloads, repository access, and manage temporary directories
26
    """
27
28
    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None):
        """
        Download a URL ``url`` to a local file in ``directory``.

        If ``url`` looks like a file path, check whether that exists.
        If it does exist and is within ``directory`` already (i.e. it is the target file itself), return early.
        If it does exist but is outside of ``directory``, copy it.
        If ``url`` does not appear to be a file path, try downloading via HTTP, retrying up to
        ``retries`` times and passing ``timeout`` on to the HTTP request.

        If ``basename`` is not given, it is derived from ``url`` via ``nth_url_segment``
        (regardless of whether ``subdir`` is set).

        If the target file already exists within ``directory``, behavior depends on ``if_exists``:
            - ``skip`` (default): do nothing and return early. Note that the existing file is
              kept as-is, without comparing its contents against ``url``.
            - ``raise``: raise a ``FileExistsError``
            - ``overwrite``: overwrite the existing file (any other value is treated the same way)

        Args:
            directory (string): Directory to download files to (created if missing)
            url (string): URL to download from

        Keyword Args:
            basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset.
            if_exists (string, "skip"): What to do if target file already exists.
                One of ``skip`` (default), ``overwrite`` or ``raise``
            subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``.
            retries (int, None): Number of retries to attempt on network failure.
                If falsy, ``OCRD_DOWNLOAD_RETRIES`` from the configuration is used when set.
            timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data.
                If ``None``, ``OCRD_DOWNLOAD_TIMEOUT`` from the configuration is used when set.

        Returns:
            Local filename string, *relative* to directory

        Raises:
            ValueError: if ``url`` or ``directory`` is empty
            FileNotFoundError: if ``url`` is a local path that does not exist
            FileExistsError: if the target exists and ``if_exists == 'raise'``
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok
        if not directory:
            # report the offending 'directory' value (not 'url')
            raise ValueError(f"'directory' must be a non-empty string, not '{directory}'")  # actually Path would also work

        url = str(url)
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)

        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        # 'ret' is the path relative to 'directory' that is handed back to the caller
        ret = Path(subdir_path, basename_path)
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            local_filename = None
            try:
                local_filename = get_local_filename(url)
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                # log the name we tried to resolve (src_path is still unset here)
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}'")
            # src_path is resolved, so resolve dst_path as well; otherwise a relative
            # 'directory' would never be recognised as already containing the file
            if src_path == dst_path.resolve():
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return str(ret)

        # Respect 'if_exists' kwarg
        if dst_path.exists():
            if if_exists == 'skip':
                log.debug("File already exists but if_exists == %s, skipping.", if_exists)
                return str(ret)
            elif if_exists == 'raise':
                raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}")
            else:
                log.debug("File already exists but if_exists == %s, overwriting.", if_exists)

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            # src_path set, so it is a file source, we can copy directly
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            # src_path not set, it's an http URL, try to download
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            if not retries and config.is_set('OCRD_DOWNLOAD_RETRIES'):
                retries = config.OCRD_DOWNLOAD_RETRIES
            if timeout is None and config.is_set('OCRD_DOWNLOAD_TIMEOUT'):
                timeout = config.OCRD_DOWNLOAD_TIMEOUT
            retry_policy = Retry(total=retries or 0,
                                 status_forcelist=[
                                     # probably too wide (only transient failures):
                                     408, # Request Timeout
                                     409, # Conflict
                                     412, # Precondition Failed
                                     417, # Expectation Failed
                                     423, # Locked
                                     424, # Failed Dependency
                                     425, # Too Early
                                     426, # Upgrade Required
                                     428, # Precondition Required
                                     429, # Too Many Requests
                                     440, # Login Timeout
                                     500, # Internal Server Error
                                     503, # Service Unavailable
                                     504, # Gateway Timeout
                                     509, # Bandwidth Limit Exceeded
                                     529, # Site Overloaded
                                     598, # Proxy Read Timeout
                                     599, # Proxy Connect Timeout
                                 ])
            adapter = HTTPAdapter(max_retries=retry_policy)
            # use the session as a context manager so its connections are released
            with requests.Session() as session:
                session.mount('http://', adapter)
                session.mount('https://', adapter)
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
                contents = handle_oai_response(response)
            dst_path.write_bytes(contents)

        return str(ret)
154
155
    def workspace_from_url(
        self,
        mets_url,
        dst_dir=None,
        clobber_mets=False,
        mets_basename=None,
        download=False,
        src_baseurl=None,
        mets_server_url=None,
        **kwargs
    ):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace.
                By default, use the parent directory of :py:attr:`mets_url` if that is a local file,
                otherwise create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`.
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``.
                By default existing ``mets.xml`` will raise an exception.
            mets_basename (string, None): Basename of the METS file in the workspace.
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations.
                By default derived from :py:attr:`mets_url` by removing its last segment.
            mets_server_url (string, None): URI of TCP or local path of UDS for METS server handling
                the `OcrdMets` of the workspace. By default the METS will be read from and written to
                the filesystem directly.
            **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True

        Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
        the former is already local and the latter is ``None`` or already identical to its directory name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            url_path = remove_non_path_from_url(mets_url)
            # guard against an empty segment: s[:-0] would yield '' instead of s
            if last_segment:
                url_path = url_path[:-len(last_segment)]
            src_baseurl = remove_non_path_from_url(url_path)

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                dst_dir = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", dst_dir, mets_url)
            else:
                # create the directory first so the log line shows its actual name
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)
        # fetch/copy the METS itself; an existing METS is only replaced if clobber_mets is set
        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)

        if download:
            for f in workspace.mets.find_files(**kwargs):
                workspace.download_file(f)

        return workspace
232
233
    def workspace_from_nothing(self, directory, mets_basename=DEFAULT_METS_BASENAME, clobber_mets=False):
        """
        Set up a fresh workspace that contains nothing but an empty METS file.

        Arguments:
            directory (string): Target directory for the workspace (created if missing).
                If ``None``, a temporary directory is created with prefix
                :py:data:`ocrd.constants.TMP_PREFIX`.
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string): File name of the METS inside ``directory``.
            clobber_mets (boolean, False): Whether to overwrite an existing METS file.
                By default an existing METS file raises an exception.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: if the METS file already exists and ``clobber_mets`` is not set
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')

        # fall back to a throw-away directory when the caller did not pick one
        target_dir = mkdtemp(prefix=TMP_PREFIX) if directory is None else directory
        Path(target_dir).mkdir(parents=True, exist_ok=True)

        mets_file = Path(target_dir) / mets_basename
        already_present = mets_file.exists()
        if already_present and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, target_dir))

        # serialize a blank METS to disk and hand the same object to the workspace
        blank_mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_file)
        serialized = blank_mets.to_xml(xmllint=True)
        mets_file.write_bytes(serialized)

        return Workspace(self, target_dir, blank_mets, mets_basename=mets_basename)
260
261
    def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None):
        """
        Resolve the ``--mets``, ``--mets-basename``, ``--directory``,
        ``--mets-server-url`` arguments into a coherent set of arguments
        according to https://github.com/OCR-D/core/issues/517

        Arguments:
            directory (string, None): value of ``--directory``
            mets_url (string, None): value of ``--mets`` (http(s) URL or filesystem path)
        Keyword Arguments:
            mets_basename (string, None): value of ``--mets-basename`` (deprecated).
                Note that any truthy value (including the default) combined with
                ``mets_url`` raises, and without ``mets_url`` emits a ``DeprecationWarning``.
            mets_server_url (string, None): value of ``--mets-server-url``; anything that is
                not an http(s) URL is treated as a UDS socket path and made absolute.

        Returns:
            tuple ``(directory, mets_url, mets_basename, mets_server_url)`` where the first
            three are strings (``directory`` resolved to an absolute path) and
            ``mets_server_url`` is a string or the falsy value that was passed in.

        Raises:
            ValueError: if ``mets_url`` is an http(s) URL but ``directory`` is not given,
                if both ``mets_url`` and ``mets_basename`` are given, or if a local
                ``mets_url`` has a directory part that is not within ``directory``.
        """
        log = getLogger('ocrd.resolver.resolve_mets_arguments')

        mets_is_remote = mets_url and mets_url.startswith(('http://', 'https://'))

        # XXX we might want to be more strict like this but it might break # legacy code
        # Allow --mets and --directory together iff --mets is a remote URL
        # if directory and mets_url and not mets_is_remote:
        #     raise ValueError("Use either --mets or --directory, not both")

        # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior)
        if not directory and mets_is_remote:
            raise ValueError("--mets is an http(s) URL but no --directory was given")

        # Determine --mets-basename
        if not mets_basename and mets_url:
            mets_basename = Path(mets_url).name
        elif not mets_basename and not mets_url:
            mets_basename = DEFAULT_METS_BASENAME
        elif mets_basename and mets_url:
            raise ValueError("Use either --mets or --mets-basename, not both")
        else:
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)

        # Determine --directory and --mets-url
        if not directory and not mets_url:
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)
        elif directory and not mets_url:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif not directory and mets_url:
            # cannot be remote here (rejected above), so it is safe to treat as a path
            mets_url = Path(mets_url).resolve()
            directory = mets_url.parent
        else: # == directory and mets_url:
            directory = Path(directory).resolve()
            if not mets_is_remote:
                # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
                if Path(mets_url).parent == Path('.'):
                    mets_url = directory / mets_url
                else:
                    mets_url = Path(mets_url).resolve()
                    if not is_file_in_directory(directory, mets_url):
                        raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))

        # Only non-URL values are socket paths; https:// must not be mangled into a
        # filesystem path (kept consistent with the http/https check for --mets above)
        if mets_server_url and not mets_server_url.startswith(('http://', 'https://')):
            # UDS socket
            mets_server_url = str(Path(mets_server_url).resolve())

        return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url
316