Passed
Pull Request — master (#797)
by Konstantin
04:57
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 48
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 48
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex code units like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break the enclosing class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
from pathlib import Path
2
from os.path import join
3
from os import environ, listdir, getcwd, path, unlink
4
from shutil import copytree as copytree_, rmtree
5
from datetime import datetime
6
from tarfile import open as open_tarfile
7
from urllib.parse import urlparse, unquote
8
9
import requests
10
from yaml import safe_load, safe_dump
11
12
from ocrd_validators import OcrdResourceListValidator
13
from ocrd_utils import getLogger
14
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd
15
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
16
17
def copytree(src, dst, *args, overwrite=False, **kwargs):
    """
    Recursively copy directory ``src`` to ``dst``.

    Thin wrapper around :py:func:`shutil.copytree` that adds an ``overwrite``
    switch. All other positional and keyword arguments are passed through
    unchanged.

    Args:
        src: directory to copy from
        dst: directory to copy to
        overwrite (bool): if true and ``dst`` is an existing directory,
            remove it before copying. A ``dst`` that does not exist yet is
            simply created by the copy.

    Returns:
        whatever :py:func:`shutil.copytree` returns (the destination)
    """
    # Only remove what is actually there: rmtree() on a missing path raises
    # FileNotFoundError, which used to break a first-time install with
    # overwrite=True.
    if overwrite and Path(dst).is_dir():
        rmtree(dst)
    return copytree_(src, dst, *args, **kwargs)
21
22
class OcrdResourceManager():

    """
    Managing processor resources

    Keeps a registry (``self.database``) mapping processor executable names
    to lists of resource description dicts. The registry is filled from the
    bundled resource list (``RESOURCE_LIST_FILENAME``) and from a per-user
    list at ``<xdg_config_home>/ocrd/resources.yml``. Resources can be
    looked up, listed and downloaded/copied into a resource directory.
    """
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Load the bundled and the user resource list.

        Args:
            userdir (str): override for the user's home directory
            xdg_config_home (str): override for ``$XDG_CONFIG_HOME``
            xdg_data_home (str): override for ``$XDG_DATA_HOME``

        Side effect: creates the user resource list (and its parent
        directories) if it does not exist yet.
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._xdg_data_home = xdg_data_home
        self._xdg_config_home = xdg_config_home
        self._userdir = userdir
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            self.user_list.parent.mkdir(parents=True, exist_ok=True)
            with open(str(self.user_list), 'w', encoding='utf-8') as f:
                f.write(RESOURCE_USER_LIST_COMMENT)
        # Loaded last because load_resource_list() prepends: user entries
        # end up before the bundled ones.
        self.load_resource_list(self.user_list)

    @property
    def userdir(self):
        """
        Home directory of the user: the constructor override if given,
        else ``$HOME`` if set, else ``~`` expanded.
        """
        if not self._userdir:
            self._userdir = environ.get('HOME', path.expanduser('~'))
        return self._userdir

    @property
    def xdg_data_home(self):
        """
        Base directory for user data: the constructor override if given,
        else ``$XDG_DATA_HOME``, else ``<userdir>/.local/share``.
        """
        if not self._xdg_data_home:
            # An empty variable counts as unset (XDG Base Directory spec)
            self._xdg_data_home = environ.get('XDG_DATA_HOME') or join(self.userdir, '.local', 'share')
        return self._xdg_data_home

    @property
    def xdg_config_home(self):
        """
        Base directory for user configuration: the constructor override if
        given, else ``$XDG_CONFIG_HOME``, else ``<userdir>/.config``.
        """
        if not self._xdg_config_home:
            # An empty variable counts as unset (XDG Base Directory spec)
            self._xdg_config_home = environ.get('XDG_CONFIG_HOME') or join(self.userdir, '.config')
        return self._xdg_config_home

    def load_resource_list(self, list_filename, database=None):
        """
        Load a YAML resource list and merge it into a database.

        Args:
            list_filename (pathlib.Path): resource list to load; a path that
                is not a file is silently skipped
            database (dict): database to merge into, defaults to
                ``self.database``. It is modified in place.

        Returns:
            the database that was merged into

        Raises:
            ValueError: if the loaded list fails validation
        """
        # Compare against None so that an explicitly passed empty dict is
        # filled instead of being silently swapped for self.database.
        if database is None:
            database = self.database
        if list_filename.is_file():
            with open(list_filename, 'r', encoding='utf-8') as f:
                list_loaded = safe_load(f) or {}
            report = OcrdResourceListValidator.validate(list_loaded)
            if not report.is_valid:
                self.log.error('\n'.join(report.errors))
                raise ValueError("Resource list %s is invalid!" % (list_filename))
            # NOTE(review): loading the same file twice into the same
            # database prepends its entries twice - confirm whether callers
            # rely on that or whether entries should be de-duplicated.
            for executable, resource_list in list_loaded.items():
                # Prepend, so user provided is sorted before builtin
                database[executable] = resource_list + database.get(executable, [])
        return database

    def list_available(self, executable=None):
        """
        List models available for download by processor

        Args:
            executable (str): restrict the listing to this processor

        Returns:
            ``(executable, resource_list)`` pairs

        Raises:
            KeyError: if ``executable`` is given but not in the database
        """
        if executable:
            return [(executable, self.database[executable])]
        return self.database.items()

    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Resources found in the file system but not in the registry are
        added to the user resource list as stub entries.

        Args:
            executable (str): restrict the listing to this processor

        Returns:
            list of ``(executable, resource_list)`` pairs; every resource
            dict carries the location it was found at under ``path``
        """
        ret = []
        if executable:
            all_executables = [executable]
        else:
            # resources we know about
            all_executables = list(self.database.keys())
            # resources in the file system
            parent_dirs = [join(x, 'ocrd-resources') for x in [self.xdg_data_home, '/usr/local/share']]
            for parent_dir in parent_dirs:
                if Path(parent_dir).exists():
                    all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')]
        for this_executable in set(all_executables):
            reslist = []
            has_dirs, has_files = get_processor_resource_types(this_executable)
            for res_filename in list_all_resources(this_executable):
                res_path = Path(res_filename)
                # Skip resource kinds the processor does not accept
                if res_path.is_dir() and not has_dirs:
                    continue
                if res_path.is_file() and not has_files:
                    continue
                res_name = res_path.name
                known = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if known:
                    resdict = known[0]
                else:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, res_name, res_filename, self.user_list)
                    resdict = self.add_to_user_database(this_executable, res_filename)
                resdict['path'] = res_filename
                reslist.append(resdict)
            ret.append((this_executable, reslist))
        return ret

    def add_to_user_database(self, executable, res_filename, url=None):
        """
        Add a stub entry to the user resource.yml

        If the user list already has a resource of that name for
        ``executable``, that entry is kept. In both cases the user list is
        rewritten and reloaded into ``self.database``.

        Args:
            executable (str): processor the resource belongs to
            res_filename (str): path of the existing resource
            url (str): download URL to record, ``'???'`` if not given

        Returns:
            dict: the resource description (newly created or pre-existing)
        """
        res_name = Path(res_filename).name
        res_size = Path(res_filename).stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            # find_resources() yields (executable, resdict) tuples; callers
            # of this method expect the bare dict.
            resdict = resources_found[0][1]
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(user_database))
        self.load_resource_list(self.user_list)
        return resdict

    def find_resources(self, executable=None, name=None, url=None, database=None):
        """
        Find resources in the registry

        Args:
            executable (str): only search resources of this processor
            name (str): match resources with this ``name``
            url (str): match resources with this ``url``
            database (dict): registry to search, defaults to ``self.database``

        Without ``name`` and ``url`` every resource matches; with both, a
        resource matching either one is returned.

        Returns:
            list of ``(executable, resdict)`` tuples, empty if
            ``executable`` is unknown
        """
        # Compare against None so that an explicitly passed empty database
        # is searched (and yields nothing) rather than self.database.
        if database is None:
            database = self.database
        if executable and executable not in database:
            return []
        match_all = not name and not url
        ret = []
        for this_executable in [executable] if executable else database.keys():
            for resdict in database[this_executable]:
                if match_all or (url and url == resdict['url']) or (name and name == resdict['name']):
                    ret.append((this_executable, resdict))
        return ret

    @property
    def default_resource_dir(self):
        """
        Resource directory of the ``data`` location.
        """
        return self.location_to_resource_dir('data')

    def location_to_resource_dir(self, location):
        """
        Map a location keyword to a directory.

        ``system`` maps to ``/usr/local/share/ocrd-resources``, ``data`` to
        ``<xdg_data_home>/ocrd-resources``, anything else to the current
        working directory.
        """
        if location == 'system':
            return '/usr/local/share/ocrd-resources'
        if location == 'data':
            return join(self.xdg_data_home, 'ocrd-resources')
        return getcwd()

    def resource_dir_to_location(self, resource_path):
        """
        Inverse of :py:meth:`location_to_resource_dir`: map a path to
        ``system``, ``data`` or ``cwd`` by prefix, or return the path itself
        (as a string) if it is in none of these locations.
        """
        resource_path = str(resource_path)
        if resource_path.startswith('/usr/local/share/ocrd-resources'):
            return 'system'
        if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')):
            return 'data'
        if resource_path.startswith(getcwd()):
            return 'cwd'
        return resource_path

    def parameter_usage(self, name, usage='as-is'):
        """
        Derive the parameter value from a resource name.

        Args:
            name (str): resource name
            usage (str): ``as-is`` returns ``name`` unchanged,
                ``without-extension`` strips the last file extension

        Raises:
            ValueError: for any other ``usage``
        """
        if usage == 'as-is':
            return name
        if usage == 'without-extension':
            return Path(name).stem
        raise ValueError("No such usage '%s'" % usage)

    def _download_impl(self, url, filename, progress_cb=None, size=None, overwrite=False):
        """
        Stream ``url`` into ``filename``.

        Args:
            url (str): HTTP(S) URL to fetch
            filename: file to write
            progress_cb (callable): called with the byte count of every
                chunk received
            size (int): accepted for interface compatibility, not used
            overwrite (bool): unlink an existing ``filename`` first

        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        if Path(filename).exists() and overwrite:
            unlink(filename)
        with requests.get(url, stream=True) as r:
            # Fail before touching the target, so an error page is never
            # stored as if it were the resource.
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)

    def _copy_impl(self, src_filename, filename, progress_cb=None, overwrite=False):
        """
        Copy the local file ``src_filename`` to ``filename`` in chunks.

        Args:
            src_filename: file to read
            filename: file to write
            progress_cb (callable): called with the byte count of every
                chunk written
            overwrite (bool): unlink an existing ``filename`` first
        """
        log = getLogger('ocrd.resource_manager._copy_impl')
        log.info("Copying %s", src_filename)
        if Path(filename).exists() and overwrite:
            unlink(filename)
        with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
            # read() returns b'' at end of file, which ends the iteration
            for chunk in iter(lambda: f_in.read(4096), b''):
                f_out.write(chunk)
                if progress_cb:
                    progress_cb(len(chunk))

    def _fetch(self, src, filename, is_url, progress_cb=None, size=None, overwrite=False):
        """
        Download ``src`` if it is a URL, otherwise copy it as a local file.
        """
        if is_url:
            self._download_impl(src, filename, progress_cb, size, overwrite=overwrite)
        else:
            self._copy_impl(src, filename, progress_cb, overwrite=overwrite)

    # TODO Proper caching (make head request for size, If-Modified etc)
    def download(
        self,
        executable,
        url,
        basedir,
        overwrite=False,
        no_subdir=False,
        name=None,
        resource_type='file',
        path_in_archive='.',
        progress_cb=None,
        size=None,
    ):
        """
        Download a resource by URL

        Args:
            executable (str): processor the resource belongs to
            url (str): ``http://``/``https://`` URL, or a local path
            basedir (str): directory to install below
            overwrite (bool): replace an already existing resource
            no_subdir (bool): install directly into ``basedir`` instead of
                ``basedir/executable``
            name (str): name of the installed resource, defaults to the
                last path segment of ``url``
            resource_type (str): ``file`` stores the payload as is,
                ``tarball`` unpacks it and installs ``path_in_archive``.
                Any other value creates the destination directory only.
            path_in_archive (str): directory inside the tarball to install
            progress_cb (callable): called with the size of every chunk
            size (int): expected size, passed on to the downloader

        Returns:
            pathlib.Path: location of the resource
        """
        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith(('https://', 'http://'))
        if fpath.exists() and not overwrite:
            log.info("%s to be %s to %s which already exists and overwrite is False", url, 'downloaded' if is_url else 'copied', fpath)
            return fpath
        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type == 'file':
            self._fetch(url, fpath, is_url, progress_cb, size, overwrite)
        elif resource_type == 'tarball':
            # The working directory changes below, so pin down relative
            # paths before that happens.
            dst = fpath.absolute()
            src = url if is_url else str(Path(url).absolute())
            with pushd_popd(tempdir=True):
                self._fetch(src, 'download.tar.xx', is_url, progress_cb, size, overwrite)
                Path('out').mkdir()
                with pushd_popd('out'):
                    log.info("Extracting tarball")
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
                        # NOTE(review): extractall() trusts member paths in
                        # the archive - only safe for trusted tarballs.
                        tar.extractall()
                    log.info("Copying '%s' from tarball to %s", path_in_archive, fpath)
                    # Only ask for removal of a directory that is there
                    copytree(path_in_archive, str(dst), overwrite=overwrite and dst.is_dir())
        # TODO
        # elif resource_type == 'github-dir':
        return fpath
276