Passed
Pull Request — master (#696)
by Konstantin
02:16
created

ocrd.resolver   B

Complexity

Total Complexity 50

Size/Duplication

Total Lines 240
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 50
eloc 128
dl 0
loc 240
rs 8.4
c 0
b 0
f 0

4 Methods

Rating   Name   Duplication   Size   Complexity  
F Resolver.resolve_mets_arguments() 0 36 20
A Resolver.workspace_from_nothing() 0 27 4
F Resolver.download_to_directory() 0 84 14
D Resolver.workspace_from_url() 0 64 12

How to fix   Complexity   

Complexity

Complex classes like ocrd.resolver often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from tempfile import mkdtemp
2
from pathlib import Path
3
from warnings import warn
4
5
import requests
6
7
from ocrd.constants import TMP_PREFIX
8
from ocrd_utils import (
9
    getLogger,
10
    is_local_filename,
11
    get_local_filename,
12
    remove_non_path_from_url,
13
    is_file_in_directory,
14
    nth_url_segment
15
)
16
from ocrd.workspace import Workspace
17
from ocrd_models import OcrdMets
18
from ocrd_models.constants import NAMESPACES as NS
19
from ocrd_models.utils import handle_oai_response
20
21
class Resolver():
    """
    Handle uploads, downloads, repository access, and manage temporary directories
    """

    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
        """
        Download a file to a directory.

        Early Shortcut: If `url` is a local file and that file is already in the directory, keep it there.

        If `basename` is not given, ``nth_url_segment(url)`` (the last URL segment) is used as
        the basename, whether or not `subdir` is given.

        Args:
            directory (string): Directory to download files to. Created if it does not exist.
            url (string): URL (or local filename) to download from
            basename (string, None): basename part of the filename on disk.
            if_exists (string, "skip"): What to do if target file already exists. \
                One of ``skip`` (default), ``overwrite`` or ``raise``. \
                Any value other than ``skip`` or ``raise`` results in the file being overwritten.
            subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp``.

        Returns:
            Local filename string, *relative* to directory

        Raises:
            Exception: if `url` or `directory` is empty, or if the HTTP request
                does not return status 200.
            FileNotFoundError: if `url` is a local filename that does not exist.
            FileExistsError: if the target exists and ``if_exists == 'raise'``.
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise Exception("'url' must be a string")
        if not directory:
            raise Exception("'directory' must be a string")  # actually Path would also work

        # Make sure the target directory exists before resolving it
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        directory = str(directory.resolve())

        # ``ret`` is the path relative to ``directory``, ``dst_path`` the full target path
        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        ret = str(Path(subdir_path, basename_path))
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            local_filename = None
            try:
                # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
                local_filename = get_local_filename(url)
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                # src_path is still unset here, so report the unresolved local filename
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
            if src_path == dst_path:
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return ret

        # Respect 'if_exists' arg
        if dst_path.exists():
            if if_exists == 'skip':
                return ret
            if if_exists == 'raise':
                raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            # NOTE(review): no timeout is passed, so this call can block indefinitely
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
            contents = handle_oai_response(response)
            dst_path.write_bytes(contents)

        return ret

    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

        Arguments:
            mets_url (string): Source METS URL or filesystem path
        Keyword Arguments:
            dst_dir (string, None): Target directory for the workspace. \
                By default, use the directory of :py:attr:`mets_url` if that is a local file, \
                otherwise create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
            clobber_mets (boolean, False): Whether to overwrite an existing METS file in \
                :py:attr:`dst_dir`. By default an existing file is kept as-is (``if_exists='skip'``).
            mets_basename (string, None): Basename of the METS file within :py:attr:`dst_dir`. \
                By default the last URL segment of :py:attr:`mets_url`.
            download (boolean, False): Whether to also download all the files referenced by the METS
            src_baseurl (string, None): Base URL for resolving relative file locations. \
                By default derived from :py:attr:`mets_url` by removing its last URL segment.

        Download (clone) :py:attr:`mets_url` to :py:attr:`mets_basename` in :py:attr:`dst_dir`, unless
        the former is already local and the latter is ``None`` or already identical to its directory name.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            ValueError: if :py:attr:`mets_url` is ``None``.
        """
        log = getLogger('ocrd.resolver.workspace_from_url')

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path.cwd() / mets_url)

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # Create the directory first so the log line shows the actual path
                dst_dir = mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        if not Path(dst_dir).exists():
            Path(dst_dir).mkdir(parents=True, exist_ok=False)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        # Fetch (or copy) the METS itself; an existing file is only replaced when clobber_mets is set
        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        if download:
            for f in workspace.mets.find_files():
                workspace.download_file(f)

        return workspace

    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string): Target directory for the workspace. \
                If ``None``, create a temporary directory under :py:data:`ocrd.constants.TMP_PREFIX`. \
                (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
        Keyword Arguments:
            mets_basename (string, "mets.xml"): Basename of the METS file to write into \
                :py:attr:`directory`.
            clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``. \
                By default existing ``mets.xml`` will raise an exception.

        Returns:
            a new :py:class:`~ocrd.workspace.Workspace`

        Raises:
            FileExistsError: if the METS file already exists and :py:attr:`clobber_mets` is not set.
        """
        log = getLogger('ocrd.resolver.workspace_from_nothing')
        if directory is None:
            directory = mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)

    def resolve_mets_arguments(self, directory, mets_url, mets_basename):
        """
        Resolve the ``--mets``, ``--mets-basename`` and ``--directory`` argument
        into a coherent set of arguments according to https://github.com/OCR-D/core/issues/517

        Arguments:
            directory (string, None): Value of ``--directory``
            mets_url (string, None): Value of ``--mets``
            mets_basename (string, None): Value of ``--mets-basename`` (deprecated)

        Returns:
            a tuple ``(directory, mets_url, mets_basename)`` of strings

        Raises:
            ValueError: if both ``--mets`` and ``--mets-basename`` are given, if ``--mets``
                has a directory part inconsistent with ``--directory``, or if ``--mets``
                is an http(s) URL and no ``--directory`` was given.
        """
        log = getLogger('ocrd.resolver.resolve_mets_arguments')
        if not mets_basename and mets_url:
            mets_basename = Path(mets_url).name
        elif not mets_basename and not mets_url:
            mets_basename = 'mets.xml'
        elif mets_basename and mets_url:
            raise ValueError("Use either --mets or --mets-basename, not both")
        else:
            # only --mets-basename was given
            warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning)

        if directory and mets_url:
            # XXX check whether mets_url has no parents, i.e. is actually the mets_basename
            if Path(mets_url).parent == Path('.'):
                log.warning('Treating --mets_url as --mets-basename because it is just a basename "%s"', mets_url)
                mets_basename, mets_url = mets_url, None
            elif not is_file_in_directory(directory, mets_url):
                raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory))

        if directory and not mets_url:
            directory = Path(directory).resolve()
            mets_url = directory / mets_basename
        elif not directory and mets_url:
            # Match the URL scheme exactly, so that local paths which merely
            # begin with the letters "http" are not mistaken for remote URLs
            if mets_url.startswith(('http://', 'https://')):
                raise ValueError("--mets is an http(s) URL but no --directory was given")
            mets_url = Path(mets_url).resolve()
            directory = Path.cwd() if mets_url.parent == Path('.') else mets_url.parent
        elif not directory:
            directory = Path.cwd()
            mets_url = Path(directory, mets_basename)

        return str(directory), str(mets_url), str(mets_basename)
240
241
242