Passed
Pull Request — master (#559)
by Konstantin
02:15
created

OcrdResourceManager.download()   D

Complexity

Conditions 12

Size

Total Lines 46
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 38
dl 0
loc 46
rs 4.8
c 0
b 0
f 0
cc 12
nop 9

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break such a method (or the class that owns it) down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

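There are several approaches to avoid long parameter lists, for example Introduce Parameter Object, Preserve Whole Object, or Replace Parameter with Method Call.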

1
from pathlib import Path
2
from os.path import join
3
from os import environ, listdir, getcwd
4
import re
5
from shutil import copytree
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
10
import requests
11
from yaml import safe_load, safe_dump
12
13
from ocrd_validators import OcrdResourceListValidator
14
from ocrd_utils import getLogger
15
from ocrd_utils.constants import HOME, XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME, VIRTUAL_ENV
16
from ocrd_utils.os import list_all_resources, pushd_popd
17
18
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
19
from .config import load_config_file
20
21
class OcrdResourceManager():
22
23
    """
24
    Managing processor resources
25
    """
26
    def __init__(self):
        """
        Load the bundled resource list, then the per-user list on top of it.

        The user list is ``XDG_CONFIG_HOME/ocrd/resources.yml``. If that file
        does not exist yet, it is created (together with any missing parent
        directories) containing only ``RESOURCE_USER_LIST_COMMENT``.
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}
        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        self.user_list = Path(XDG_CONFIG_HOME, 'ocrd', 'resources.yml')
        if not self.user_list.exists():
            # parents=True: XDG_CONFIG_HOME itself may not exist yet on a fresh
            # account; exist_ok=True: avoids the exists()/mkdir() race.
            self.user_list.parent.mkdir(parents=True, exist_ok=True)
            with open(str(self.user_list), 'w', encoding='utf-8') as f:
                f.write(RESOURCE_USER_LIST_COMMENT)
        self.load_resource_list(self.user_list)
37
38
    def load_resource_list(self, list_filename, database=None):
39
        if not database:
40
            database = self.database
41
        if list_filename.is_file():
42
            with open(list_filename, 'r', encoding='utf-8') as f:
43
                list_loaded = safe_load(f) or {}
44
            report = OcrdResourceListValidator.validate(list_loaded)
45
            if not report.is_valid:
46
                self.log.error('\n'.join(report.errors))
47
                raise ValueError("Resource list %s is invalid!" % (list_filename))
48
            for executable, resource_list in list_loaded.items():
49
                if executable not in database:
50
                    database[executable] = []
51
                # Prepend, so user provided is sorted before builtin
52
                database[executable] = list_loaded[executable] + database[executable]
53
        return database
54
55
    def list_available(self, executable=None):
56
        """
57
        List models available for download by processor
58
        """
59
        if executable:
60
            return [(executable, self.database[executable])]
61
        return [(x, y) for x, y in self.database.items()]
62
63
    def list_installed(self, executable=None):
        """
        List resources found on disk, matched to registry entries by ``name``.

        With ``executable`` given, only that executable is inspected.
        Otherwise every executable in the database is inspected, plus every
        ``ocrd-*`` entry found in the ``ocrd-resources`` directories below the
        XDG cache/config/data homes and, if set, ``VIRTUAL_ENV/share``.

        A file that has no registry entry gets a stub entry written to the
        user list via :meth:`add_to_user_database`.

        Returns a list of ``(executable, [resource_dict, ...])`` tuples.
        """
        if executable:
            candidates = [executable]
        else:
            # executables the registry knows about ...
            candidates = list(self.database.keys())
            # ... plus executables that have a directory in the file system
            search_roots = [
                join(base, 'ocrd-resources')
                for base in (XDG_CACHE_HOME, XDG_CONFIG_HOME, XDG_DATA_HOME)
            ]
            if VIRTUAL_ENV:
                search_roots.append(join(VIRTUAL_ENV, 'share', 'ocrd-resources'))
            for root in search_roots:
                if not Path(root).exists():
                    continue
                candidates.extend(entry for entry in listdir(root) if entry.startswith('ocrd-'))
        installed = []
        # set(): an executable can be both registered and present on disk
        for this_executable in set(candidates):
            found = []
            for res_filename in list_all_resources(this_executable):
                res_name = Path(res_filename).name
                matches = [entry for entry in self.database.get(this_executable, []) if entry['name'] == res_name]
                if matches:
                    found.append(matches[0])
                    continue
                self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'" % (this_executable, res_name, res_filename, self.user_list))
                found.append(self.add_to_user_database(this_executable, res_filename))
            installed.append((this_executable, found))
        return installed
91
92
    def add_to_user_database(self, executable, res_filename, url=None):
        """
        Add a stub entry to the user resource.yml

        Args:
            executable (str): processor the resource belongs to.
            res_filename: path of the resource file; its basename becomes the
                entry's ``name`` and its size on disk the entry's ``size``.
            url (str): recorded as the entry's ``url``; ``'???'`` if not given.

        Returns:
            The resource dict for ``res_filename``: the newly created stub,
            or the entry already present in the user list under that name.
        """
        res_name = Path(res_filename).name
        res_size = Path(res_filename).stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        # find_resources() returns (executable, resdict) tuples
        existing = self.find_resources(executable=executable, name=res_name, database=user_database)
        if existing:
            # Already listed: hand back that entry instead of falling through
            # to the return with ``resdict`` unbound (UnboundLocalError).
            resdict = existing[0][1]
        else:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (res_filename, datetime.now()),
                'version_range': '???',
                'size': res_size
            }
            user_database[executable].append(resdict)
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(user_database))
        return resdict
0 ignored issues
show
introduced by
The variable resdict does not seem to be defined in case the condition `not self.find_resources(...)` on line 102 is False. Are you sure this can never be the case?
Loading history...
116
117
    def find_resources(self, executable=None, name=None, url=None, database=None):
118
        """
119
        Find resources in the registry
120
        """
121
        if not database:
122
            database = self.database
123
        ret = []
124
        if executable and executable not in database.keys():
125
            return ret
126
        for executable in [executable] if executable else database.keys():
127
            for resdict in database[executable]:
128
                if not name and not url:
129
                    ret.append((executable, resdict))
130
                elif url and url == resdict['url']:
131
                    ret.append((executable, resdict))
132
                elif name and name == resdict['name']:
133
                    ret.append((executable, resdict))
134
        return ret
135
136
    def get_resource_dir(self, location):
        """
        Map a symbolic ``location`` to the directory resources are kept in.

        ``'virtualenv'`` (only if ``VIRTUAL_ENV`` is set), ``'cache'``,
        ``'data'`` and ``'config'`` resolve to an ``ocrd-resources``
        directory below the respective base; anything else resolves to the
        current working directory.
        """
        if location == 'virtualenv' and VIRTUAL_ENV:
            return join(VIRTUAL_ENV, 'ocrd-resources')
        if location == 'cache':
            return join(XDG_CACHE_HOME, 'ocrd-resources')
        if location == 'data':
            return join(XDG_DATA_HOME, 'ocrd-resources')
        if location == 'config':
            return join(XDG_CONFIG_HOME, 'ocrd-resources')
        return getcwd()
142
143
    @property
    def default_resource_dir(self):
        """
        Resource directory for the ``resource_location`` of the loaded config.

        The config file is re-read via ``load_config_file()`` on every access.
        """
        return self.get_resource_dir(load_config_file().resource_location)
147
148
    def parameter_usage(self, name, usage='as-is'):
        """
        Return how the resource ``name`` is to be written in a parameter value.

        Args:
            name (str): resource (file) name.
            usage (str): ``'as-is'`` returns ``name`` unchanged;
                ``'without-extension'`` returns ``name`` minus its last suffix.

        Raises:
            ValueError: if ``usage`` is neither of the two supported values
                (previously this fell through and returned ``None``).
        """
        if usage == 'as-is':
            return name
        if usage == 'without-extension':
            return Path(name).stem
        raise ValueError("No such usage '%s'" % usage)
153
154
    def _download_impl(self, url, filename, progress_cb=None):
        """
        Stream ``url`` into ``filename`` in 4096-byte chunks.

        Args:
            url (str): HTTP(S) URL to fetch.
            filename: path of the file to write (overwritten if present).
            progress_cb (callable): if given, called with the byte length of
                each chunk before it is written.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s" % url)
        with requests.get(url, stream=True) as r:
            # Fail before touching the target, so an error page is never
            # stored as the resource and no empty file is left behind.
            r.raise_for_status()
            # The Content-Length header is deliberately not read: it was
            # unused, and int(None) crashed on responses that omit it.
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
164
165
    def _copy_impl(self, src_filename, filename, progress_cb=None):
        """
        Copy the local file ``src_filename`` to ``filename`` in 4096-byte chunks.

        If ``progress_cb`` is given, it is called with the byte length of each
        chunk after that chunk has been written.
        """
        log = getLogger('ocrd.resource_manager._copy_impl')
        log.info("Copying %s" % src_filename)
        with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
            # iter() with a sentinel stops at the first empty read (EOF)
            for chunk in iter(lambda: f_in.read(4096), b''):
                f_out.write(chunk)
                if progress_cb:
                    progress_cb(len(chunk))
177
178
    # TODO Proper caching (make head request for size, If-Modified etc)
179
    def download(
        self,
        executable,
        url,
        overwrite=False,
        basedir=XDG_CACHE_HOME,
        name=None,
        resource_type='file',
        path_in_archive='.',
        progress_cb=None,
    ):
        """
        Download a resource by URL

        Args:
            executable (str): processor the resource belongs to; used as the
                subdirectory of ``basedir``.
            url (str): ``http://``/``https://`` URL to download, or otherwise
                a local path to copy from.
            overwrite (bool): replace an existing target. If false and the
                target exists, nothing is fetched.
            basedir: directory below which ``executable/name`` is created.
            name (str): target name; defaults to the last path segment of
                ``url`` (percent-decoded).
            resource_type (str): ``'file'`` stores the fetched file itself;
                ``'tarball'`` unpacks it in a temporary directory and copies
                ``path_in_archive`` to the target. Any other value fetches
                nothing.
            path_in_archive (str): directory inside the unpacked tarball to
                copy.
            progress_cb (callable): passed through to the download/copy
                helper, which calls it with each chunk's byte length.

        Returns:
            Path: the target path ``basedir/executable/name``.
        """
        # Local import: only needed for the overwrite-a-directory case below
        from shutil import rmtree
        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith(('https://', 'http://'))
        if fpath.exists() and not overwrite:
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
            return fpath
        destdir.mkdir(parents=True, exist_ok=True)
        # Remote sources are streamed over HTTP, everything else is copied
        fetch = self._download_impl if is_url else self._copy_impl
        if resource_type == 'file':
            fetch(url, fpath, progress_cb)
        elif resource_type == 'tarball':
            # NOTE(review): pushd_popd presumably changes the working
            # directory, so a relative local ``url`` or ``basedir`` would be
            # resolved against the temporary directory - confirm.
            with pushd_popd(tempdir=True):
                fetch(url, 'download.tar.xx', progress_cb)
                Path('out').mkdir()
                with pushd_popd('out'):
                    log.info("Extracting tarball")
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
                        # SECURITY NOTE: extractall() trusts the member paths
                        # in the archive; only use with trusted tarballs.
                        tar.extractall()
                    log.info("Copying '%s' from tarball to %s" % (path_in_archive, fpath))
                    # Only reachable for an existing target when overwrite is
                    # set; copytree() refuses to write into an existing
                    # destination, so clear it first.
                    if fpath.is_dir():
                        rmtree(str(fpath))
                    elif fpath.exists():
                        fpath.unlink()
                    copytree(path_in_archive, str(fpath))
        # TODO
        # elif resource_type == 'github-dir':
        return fpath
225