Passed
Pull Request — master (#966)
by Konstantin
04:11 queued 01:47
created

ocrd.resolver.Resolver.resolve_mets_arguments()   F

Complexity

Conditions 25

Size

Total Lines 58
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 34
dl 0
loc 58
rs 0
c 0
b 0
f 0
cc 25
nop 7

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

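Commonly applied refactorings include: Extract Method. If many parameters or temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object.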

Complexity

Complex methods like ocrd.resolver.Resolver.resolve_mets_arguments() often do a lot of different things. To break such a method (and the class it belongs to) down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from tempfile import mkdtemp
2
from pathlib import Path
3
from warnings import warn
4
5
import requests
6
7
from ocrd.constants import TMP_PREFIX
8
from ocrd_utils import (
9
    getLogger,
10
    is_local_filename,
11
    get_local_filename,
12
    remove_non_path_from_url,
13
    is_file_in_directory,
14
    nth_url_segment
15
)
16
from ocrd.workspace import Workspace
17
from ocrd_models import OcrdMets
18
from ocrd_models.constants import NAMESPACES as NS
19
from ocrd_models.utils import handle_oai_response
20
21
class Resolver():
    """
    Handle uploads, downloads, repository access, and manage temporary directories
    """

    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
        """
        Download a file to a directory.

        Early Shortcut: If `url` is a local file and that file is already in the directory, keep it there.

        If `basename` is not given, the URL segment returned by ``nth_url_segment(url)``
        is used as the basename, regardless of whether `subdir` is given.

        Args:
            directory (string): Directory to download files to
            basename (string, None): basename part of the filename on disk.
            url (string): URL to download from
            if_exists (string, "skip"): What to do if target file already exists.
                ``skip`` (default) keeps the existing file, ``raise`` raises
                :py:class:`FileExistsError`; any other value (e.g. ``overwrite``)
                replaces the file.
            subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``.

        Returns:
            Local filename string, *relative* to directory

        Raises:
            Exception: If `url` or `directory` is empty, or the HTTP response status is not 200.
            FileNotFoundError: If `url` is a local filename that does not exist.
            FileExistsError: If the target file exists and `if_exists` is ``raise``.
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise Exception("'url' must be a string")
        if not directory:
            raise Exception("'directory' must be a string")  # actually Path would also work

        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        directory = str(directory.resolve())

        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        # `ret` is the path relative to `directory`, `dst_path` the absolute target
        ret = str(Path(subdir_path, basename_path))
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            # Keep the unresolved name around so the error message below can
            # report it (src_path is still None if resolve() fails).
            local_filename = get_local_filename(url)
            try:
                # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
            if src_path == dst_path:
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return ret

        # Respect 'if_exists' arg
        if dst_path.exists():
            if if_exists == 'skip':
                return ret
            if if_exists == 'raise':
                raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
            contents = handle_oai_response(response)
            dst_path.write_bytes(contents)

        return ret

    def workspace_from_url(
        self,
        mets_url,
        dst_dir=None,
        clobber_mets=False,
        mets_basename=None,
        download=False,
        src_baseurl=None,
        mets_server_host=None,
        mets_server_port=None,
        mets_server_socket=None,
    ):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace.
                By default, use the parent directory of a local :py:attr:`mets_url`, or
                create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`
                for a non-local one.
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite an existing METS file
                in :py:attr:`dst_dir`. By default an existing file is kept as is.
            mets_basename (string, None): Basename of the METS file in the workspace.
                By default, the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations.
                By default, derived from :py:attr:`mets_url` by removing its last segment.
            mets_server_host (string, None): Passed through unchanged to the :py:class:`~ocrd.workspace.Workspace`
            mets_server_port (int, None): Passed through unchanged to the :py:class:`~ocrd.workspace.Workspace`
            mets_server_socket (string, None): Passed through unchanged to the :py:class:`~ocrd.workspace.Workspace`

        Download (clone) :py:attr:`mets_url` to :py:attr:`mets_basename` in :py:attr:`dst_dir`, unless
        the former is already local and the latter is ``None`` or already identical to its directory name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: If :py:attr:`mets_url` is ``None``.
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            mets_location = remove_non_path_from_url(mets_url)
            # Guard against an empty segment: `s[:-0]` would yield '' instead of `s`
            if last_segment:
                mets_location = mets_location[:-len(last_segment)]
            src_baseurl = remove_non_path_from_url(mets_location)

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                dst_dir = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", dst_dir, mets_url)
            else:
                # Create the directory first so the log message shows its actual path
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        Path(dst_dir).mkdir(parents=True, exist_ok=True)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl,
                mets_server_host=mets_server_host,
                mets_server_port=mets_server_port,
                mets_server_socket=mets_server_socket,
                )

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace

    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace.
                If ``None``, create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`.
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string, 'mets.xml'): Basename of the METS file to write into the directory.
            clobber_mets (boolean, False): Whether to overwrite an existing METS file.
                By default an existing METS file will raise an exception.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: If the METS file already exists and :py:attr:`clobber_mets` is not set.
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')
        if directory is None:
            directory = mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)

    def resolve_mets_arguments(self, directory, mets_url, mets_basename, mets_server_host, mets_server_port, mets_server_socket):
        """
        Resolve the ``--mets``, ``--mets-basename``, ``--directory``, ``--mets-server-host``,
        ``--mets-server-port``, and ``--mets-server-socket`` arguments
        into a coherent set of arguments according to https://github.com/OCR-D/core/issues/517

        Arguments:
            directory (string, None): Value of ``--directory``
            mets_url (string, None): Value of ``--mets`` (filesystem path or http(s) URL)
            mets_basename (string, None): Value of the deprecated ``--mets-basename``
            mets_server_host (string, None): Value of ``--mets-server-host``
            mets_server_port (int, None): Value of ``--mets-server-port``
            mets_server_socket (string, None): Value of ``--mets-server-socket``

        Returns:
            A tuple ``(directory, mets_url, mets_basename, mets_server_host,
            mets_server_port, mets_server_socket)``. The first three are strings
            (``directory`` is an absolute path); the server values are returned unchanged.

        Raises:
            ValueError: If the arguments are mutually inconsistent.
        """
        # Determine --mets-server-*
        if mets_server_socket and (mets_server_host or mets_server_port):
            raise ValueError('--mets-server-socket incompatible with --mets-server-host/--mets-server-port')
        if bool(mets_server_host) != bool(mets_server_port):
            raise ValueError('--mets-server-host and --mets-server-port must both be set or unset')

        mets_is_remote = bool(mets_url) and mets_url.startswith(('http://', 'https://'))

        # XXX we might want to be more strict like this but it might break # legacy code
        # Allow --mets and --directory together iff --mets is a remote URL
        # if directory and mets_url and not mets_is_remote:
        #     raise ValueError("Use either --mets or --directory, not both")

        # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior)
        if not directory and mets_is_remote:
            raise ValueError("--mets is an http(s) URL but no --directory was given")

        # Determine --mets-basename
        if mets_basename:
            if mets_url:
                raise ValueError("Use either --mets or --mets-basename, not both")
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)
        elif mets_url:
            mets_basename = Path(mets_url).name
        else:
            mets_basename = 'mets.xml'

        # Determine --directory and --mets-url
        if not directory and not mets_url:
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)
        elif directory and not mets_url:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif not directory and mets_url:
            # mets_url is known to be local here; remote URLs without --directory were rejected above
            mets_url = Path(mets_url).resolve()
            directory = mets_url.parent
        else: # == directory and mets_url:
            directory = Path(directory).resolve()
            # A remote mets_url is passed through untouched
            if not mets_is_remote:
                # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
                if Path(mets_url).parent == Path('.'):
                    mets_url = directory / mets_url
                else:
                    mets_url = Path(mets_url).resolve()
                    if not is_file_in_directory(directory, mets_url):
                        raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))

        return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_host, mets_server_port, mets_server_socket
277
278
279