Passed
Pull Request — master (#696)
by Konstantin
02:13
created

ocrd.resolver.Resolver.download_to_directory()   F

Complexity

Conditions 14

Size

Total Lines 84
Code Lines 42

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 42
dl 0
loc 84
rs 3.6
c 0
b 0
f 0
cc 14
nop 6

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex methods like ocrd.resolver.Resolver.download_to_directory() often do a lot of different things. To break such a method down, we need to identify a cohesive component within it or within its enclosing class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from tempfile import mkdtemp
2
from pathlib import Path
3
from warnings import warn
4
5
import requests
6
7
from ocrd.constants import TMP_PREFIX
8
from ocrd_utils import (
9
    getLogger,
10
    is_local_filename,
11
    get_local_filename,
12
    remove_non_path_from_url,
13
    is_file_in_directory,
14
    nth_url_segment
15
)
16
from ocrd.workspace import Workspace
17
from ocrd_models import OcrdMets
18
from ocrd_models.constants import NAMESPACES as NS
19
from ocrd_models.utils import handle_oai_response
20
21
class Resolver():
22
    """
23
    Handle uploads, downloads, repository access, and manage temporary directories
24
    """
25
26
    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
        """
        Download (or copy) a file into a directory.

        Early shortcut: if `url` is a local file whose resolved path is already
        the destination path, nothing is copied and the relative name is returned.

        If `basename` is not given, the result of ``nth_url_segment(url)`` is
        used as the basename, regardless of whether `subdir` is given.

        Args:
            directory (string): Directory to download files to. Created
                (including parents) if it does not exist yet.
            url (string): URL or local filename to fetch
            basename (string, None): basename part of the filename on disk.
            if_exists (string, "skip"): What to do if the target file already exists.
                ``skip`` (default) keeps the existing file, ``raise`` raises
                :py:class:`FileExistsError`; any other value (e.g. ``overwrite``)
                replaces the file.
            subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``.

        Returns:
            Local filename string, *relative* to directory

        Raises:
            Exception: if `url` or `directory` is empty, or if the HTTP request
                does not return status 200
            FileNotFoundError: if `url` is a local filename that does not exist
            FileExistsError: if the target exists and ``if_exists == 'raise'``
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise Exception("'url' must be a string")
        if not directory:
            raise Exception("'directory' must be a string")  # actually Path would also work

        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        directory = str(directory.resolve())

        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        # 'ret' is the path relative to 'directory'; 'dst_path' is the absolute target
        ret = str(Path(subdir_path, basename_path))
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            local_filename = None
            try:
                # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
                local_filename = get_local_filename(url)
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                # src_path is still unset here, so report the filename we tried to resolve
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
            if src_path == dst_path:
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return ret

        # Respect 'if_exists' arg
        if dst_path.exists():
            if if_exists == 'skip':
                return ret
            if if_exists == 'raise':
                raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            # NOTE(review): no timeout is passed to requests.get here — confirm
            # whether callers rely on unbounded waits before adding one
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
            contents = handle_oai_response(response)
            dst_path.write_bytes(contents)

        return ret
110
111
    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace.
                If not given and :py:attr:`mets_url` is local, the parent directory of the METS is used;
                otherwise a temporary directory is created under :py:data:`ocrd.constants.TMP_PREFIX`.
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite an existing METS file in :py:attr:`dst_dir`.
                By default an existing file is kept as-is (``if_exists='skip'``).
            mets_basename (string, None): Basename of the METS file in :py:attr:`dst_dir`.
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations.
                By default derived from :py:attr:`mets_url` by removing its last segment.

        Download (clone) :py:attr:`mets_url` to :py:attr:`mets_basename` in :py:attr:`dst_dir`, unless
        the former is already local and the latter is ``None`` or already identical to its directory name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                dst_dir = Path(mets_url).parent
                log.debug("Deriving dst_dir %s from %s", dst_dir, mets_url)
            else:
                # create the directory first so the log message shows the real path
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace
175
176
    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Set up a fresh workspace containing only an empty METS file.

        Arguments:
            directory (string): Where the workspace should live. \
                Passing ``None`` makes a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string, 'mets.xml'): File name of the METS inside the directory.
            clobber_mets (boolean, False): Whether an already existing METS file may be overwritten. \
                Without it, an existing METS file leads to an exception.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: the METS file is already present and ``clobber_mets`` is not set
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')

        # Fall back to a temporary directory when the caller passed None
        target_dir = mkdtemp(prefix=TMP_PREFIX) if directory is None else directory
        Path(target_dir).mkdir(parents=True, exist_ok=True)

        mets_file = Path(target_dir, mets_basename)
        already_present = mets_file.exists()
        if already_present and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, target_dir))

        empty_mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_file)
        serialized = empty_mets.to_xml(xmllint=True)
        mets_file.write_bytes(serialized)

        return Workspace(self, target_dir, empty_mets, mets_basename=mets_basename)
203
204
    def resolve_mets_arguments(self, directory, mets_url, mets_basename):
        """
        Resolve the ``--mets``, ``--mets-basename`` and ``--directory`` arguments
        into a coherent set of arguments according to https://github.com/OCR-D/core/issues/517

        Arguments:
            directory (string, None): value of ``--directory``
            mets_url (string, None): value of ``--mets`` (local path or http(s) URL)
            mets_basename (string, None): value of the deprecated ``--mets-basename``

        Returns:
            tuple ``(directory, mets_url, mets_basename)``, all converted to strings;
            ``directory`` is always resolved to an absolute path

        Raises:
            ValueError: if ``--mets`` is an http(s) URL but no ``--directory`` was given,
                if both ``--mets`` and ``--mets-basename`` were given, or if a local
                ``--mets`` with a directory part is not located in ``--directory``
        """
        mets_is_remote = bool(mets_url) and mets_url.startswith(('http://', 'https://'))

        # XXX we might want to be more strict and allow --mets together with
        # --directory only if --mets is a remote URL, but that might break legacy code

        # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior)
        if not directory and mets_is_remote:
            raise ValueError("--mets is an http(s) URL but no --directory was given")

        # Determine --mets-basename
        if not mets_basename and mets_url:
            mets_basename = Path(mets_url).name
        elif not mets_basename and not mets_url:
            mets_basename = 'mets.xml'
        elif mets_basename and mets_url:
            raise ValueError("Use either --mets or --mets-basename, not both")
        else:
            # stacklevel=2 attributes the warning to the caller instead of this method
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning, stacklevel=2)

        # Determine --directory and --mets-url
        if not directory and not mets_url:
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)
        elif directory and not mets_url:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif not directory and mets_url:
            # mets_url cannot be remote here, that case was rejected above
            mets_url = Path(mets_url).resolve()
            directory = mets_url.parent
        else: # == directory and mets_url:
            directory = Path(directory).resolve()
            if not mets_is_remote:
                # --mets is just a basename and --directory is set, so treat --mets as --mets-basename
                if Path(mets_url).parent == Path('.'):
                    mets_url = directory / mets_url
                else:
                    mets_url = Path(mets_url).resolve()
                    if not is_file_in_directory(directory, mets_url):
                        raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))

        return str(Path(directory).resolve()), str(mets_url), str(mets_basename)
253
254
255