Passed
Pull Request — master (#582)
by Konstantin
01:59
created

ocrd.resolver.Resolver.workspace_from_nothing()   A

Complexity

Conditions 4

Size

Total Lines 15
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 15
rs 9.85
c 0
b 0
f 0
cc 4
nop 4
1
import tempfile
2
from pathlib import Path
3
4
import requests
5
6
from ocrd.constants import TMP_PREFIX
7
from ocrd_utils import (
8
    getLogger,
9
    is_local_filename,
10
    get_local_filename,
11
    remove_non_path_from_url,
12
    nth_url_segment
13
)
14
from ocrd.workspace import Workspace
15
from ocrd_models import OcrdMets, OcrdMetsFilter
16
17
# Module-level logger for the 'ocrd.resolver' channel.
log = getLogger('ocrd.resolver')
18
19
class Resolver():
    """
    Handle Uploads, Downloads, Repository access and manage temporary directories
    """

    def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None):
        """
        Download a file to a directory.

        Early Shortcut: If url is a local file and that file is already in the directory, keep it there.

        If basename is not given, the URL segment returned by ``nth_url_segment(url)`` is used as the
        basename, regardless of whether subdir is given.

        Args:
            directory (string): Directory to download files to. Created if it does not exist.
            url (string): URL to download from
            basename (string, None): basename part of the filename on disk.
            if_exists (string, "skip"): What to do if target file already exists. One of ``skip`` (default),
                ``overwrite`` or ``raise``. Any value other than ``skip`` or ``raise`` overwrites.
            subdir (string, None): Subdirectory to create within the directory. Think fileGrp.

        Returns:
            Local filename, __relative__ to directory

        Raises:
            Exception: If ``url`` or ``directory`` is empty, or if the HTTP request does not return status 200.
            FileNotFoundError: If ``url`` is a local filename that does not exist.
            FileExistsError: If the target file exists and ``if_exists`` is ``raise``.
        """
        log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name
        log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir)

        if not url:
            raise Exception("'url' must be a string")
        if not directory:
            raise Exception("'directory' must be a string")  # actually Path would also work

        # Make sure the target directory exists before resolving it to an absolute path
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        directory = str(directory.resolve())

        # 'ret' is the path relative to 'directory', 'dst_path' the absolute target
        subdir_path = Path(subdir if subdir else '')
        basename_path = Path(basename if basename else nth_url_segment(url))
        ret = str(Path(subdir_path, basename_path))
        dst_path = Path(directory, ret)

        src_path = None
        if is_local_filename(url):
            local_filename = None
            try:
                # XXX this raises FNFE in Python 3.5 if src_path doesn't exist but not 3.6+
                local_filename = get_local_filename(url)
                src_path = Path(local_filename).resolve()
            except FileNotFoundError:
                # Report the path that failed to resolve; src_path is still unset here
                log.error("Failed to resolve URL locally: %s --> '%s' which does not exist", url, local_filename)
                raise
            if not src_path.exists():
                raise FileNotFoundError("File path passed as 'url' to download_to_directory does not exist: %s" % url)
            if src_path == dst_path:
                log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')", src_path, url)
                return ret

        # Respect 'if_exists' arg
        if dst_path.exists():
            if if_exists == 'skip':
                return ret
            if if_exists == 'raise':
                raise FileExistsError("File already exists and if_exists == 'raise': %s" % (dst_path))

        # Create dst_path parent dir
        dst_path.parent.mkdir(parents=True, exist_ok=True)

        # Copy files or download remote assets
        if src_path:
            log.debug("Copying file '%s' to '%s'", src_path, dst_path)
            dst_path.write_bytes(src_path.read_bytes())
        else:
            log.debug("Downloading URL '%s' to '%s'", url, dst_path)
            response = requests.get(url)
            if response.status_code != 200:
                raise Exception("HTTP request failed: %s (HTTP %d)" % (url, response.status_code))
            dst_path.write_bytes(response.content)

        return ret

    def workspace_from_url(self, mets_url, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, src_baseurl=None):
        """
        Create a workspace from a METS by URL (i.e. clone it).

        Sets the mets.xml file

        Arguments:
            mets_url (string): Source mets URL
            dst_dir (string, None): Target directory for the workspace. If not given, the parent directory
                of a local ``mets_url`` is used, otherwise a temporary directory is created.
            clobber_mets (boolean, False): Whether to overwrite existing mets.xml. If false, an existing
                METS file in ``dst_dir`` is kept as-is (``if_exists='skip'``).
            mets_basename (string, None): Basename of the METS file in the workspace. Defaults to the
                last URL segment of ``mets_url``.
            download (boolean or dict, False): Whether to download all the files. A dict (even an empty
                one) is passed as keyword arguments to ``OcrdMetsFilter`` to select the files.
            src_baseurl (string, None): Base URL for resolving relative file locations

        Returns:
            Workspace

        Raises:
            ValueError: If ``mets_url`` is None.
        """

        if mets_url is None:
            raise ValueError("Must pass 'mets_url' workspace_from_url")

        # if mets_url is a relative filename, make it absolute
        if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
            mets_url = str(Path(Path.cwd() / mets_url))

        # if mets_basename is not given, use the last URL segment of the mets_url
        if mets_basename is None:
            mets_basename = nth_url_segment(mets_url, -1)

        # If src_baseurl wasn't given, determine from mets_url by removing last url segment
        if not src_baseurl:
            last_segment = nth_url_segment(mets_url)
            src_baseurl = remove_non_path_from_url(remove_non_path_from_url(mets_url)[:-len(last_segment)])

        # resolve dst_dir
        if not dst_dir:
            if is_local_filename(mets_url):
                log.debug("Deriving dst_dir %s from %s", Path(mets_url).parent, mets_url)
                dst_dir = Path(mets_url).parent
            else:
                # Create the directory first so the log message shows its actual name
                dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
                log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
        # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
        Path(dst_dir).mkdir(parents=True, exist_ok=True)
        dst_dir = str(Path(dst_dir).resolve())

        log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
            mets_basename, mets_url, src_baseurl, dst_dir)

        self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'skip')

        workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl)

        # XXX an empty dict is false-y but valid in this context
        if download or download == {}:
            # Anything truthy that is not a dict (e.g. True) means "no filter criteria"
            filter_kwargs = download if isinstance(download, dict) else {}
            mets_filter = OcrdMetsFilter(**filter_kwargs)
            for f in mets_filter.find_files(workspace):
                workspace.download_file(f)

        return workspace

    def workspace_from_nothing(self, directory, mets_basename='mets.xml', clobber_mets=False):
        """
        Create an empty workspace.

        Arguments:
            directory (string, None): Directory for the workspace. Created if it does not exist.
                If None, a temporary directory is created.
            mets_basename (string, "mets.xml"): Basename of the METS file to write into ``directory``.
            clobber_mets (boolean, False): Whether to overwrite an existing METS file.

        Returns:
            Workspace

        Raises:
            FileExistsError: If the METS file already exists and ``clobber_mets`` is not set.
        """
        if directory is None:
            directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
        Path(directory).mkdir(parents=True, exist_ok=True)
        mets_path = Path(directory, mets_basename)
        if mets_path.exists() and not clobber_mets:
            raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, directory))
        mets = OcrdMets.empty_mets()
        log.info("Writing METS to %s", mets_path)
        mets_path.write_bytes(mets.to_xml(xmllint=True))

        return Workspace(self, directory, mets, mets_basename=mets_basename)
183