Passed
Pull Request — master (#770)
by Konstantin
02:25
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 48
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 48
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break such a method (and the class around it) down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
from pathlib import Path
2
from os.path import join
3
from os import environ, listdir, getcwd, path
4
import re
5
from shutil import copytree
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
10
import requests
11
from yaml import safe_load, safe_dump
12
13
from ocrd_validators import OcrdResourceListValidator
14
from ocrd_utils import getLogger
15
from ocrd_utils.os import list_all_resources, pushd_popd
16
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
17
18
class OcrdResourceManager():
19
20
    """
21
    Managing processor resources
22
    """
23
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Build the resource registry from the bundled and the user resource list.

        Args:
            userdir: home directory override; resolved lazily when falsy
            xdg_config_home: config base directory override; resolved lazily when falsy
            xdg_data_home: data base directory override; resolved lazily when falsy
        """
        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home

        self.log = getLogger('ocrd.resource_manager')
        self.database = {}
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        # Bundled list first; the user list is loaded afterwards and therefore
        # ends up in front of it (load_resource_list prepends)
        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            # Seed a fresh user list that contains only the explanatory comment
            self.user_list.write_text(RESOURCE_USER_LIST_COMMENT, encoding='utf-8')
        self.load_resource_list(self.user_list)
39
40
    @property
41
    def userdir(self):
42
        if not self._userdir:
43
            self._userdir = path.expanduser('~')
44
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
45
                self._userdir = environ['HOME']
46
        return self._userdir
47
48
    @property
49
    def xdg_data_home(self):
50
        if not self._xdg_data_home:
51
            if 'XDG_DATA_HOME' in environ:
52
                self._xdg_data_home = environ['XDG_DATA_HOME']
53
            else:
54
                self._xdg_data_home = join(self.userdir, '.local', 'share')
55
        return self._xdg_data_home
56
57
    @property
58
    def xdg_config_home(self):
59
        if not self._xdg_config_home:
60
            if 'XDG_CONFIG_HOME' in environ:
61
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
62
            else:
63
                self._xdg_config_home = join(self.userdir, '.config')
64
        return self._xdg_config_home
65
66
    def load_resource_list(self, list_filename, database=None):
67
        if not database:
68
            database = self.database
69
        if list_filename.is_file():
70
            with open(list_filename, 'r', encoding='utf-8') as f:
71
                list_loaded = safe_load(f) or {}
72
            report = OcrdResourceListValidator.validate(list_loaded)
73
            if not report.is_valid:
74
                self.log.error('\n'.join(report.errors))
75
                raise ValueError("Resource list %s is invalid!" % (list_filename))
76
            for executable, resource_list in list_loaded.items():
77
                if executable not in database:
78
                    database[executable] = []
79
                # Prepend, so user provided is sorted before builtin
80
                database[executable] = list_loaded[executable] + database[executable]
81
        return database
82
83
    def list_available(self, executable=None):
84
        """
85
        List models available for download by processor
86
        """
87
        if executable:
88
            return [(executable, self.database[executable])]
89
        return self.database.items()
90
91
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Args:
            executable: restrict the listing to this processor. When falsy,
                every processor in the registry is considered, plus every
                ``ocrd-*`` entry found in the ``data`` and ``system``
                resource directories.

        Returns:
            A list of ``(executable, resources)`` pairs, ordered by
            executable name. Each resource dict carries the location of the
            installed file under the ``path`` key.
        """
        if executable:
            all_executables = [executable]
        else:
            # resources we know about
            all_executables = list(self.database.keys())
            # resources in the file system
            for location in ('data', 'system'):
                parent_dir = self.location_to_resource_dir(location)
                if Path(parent_dir).exists():
                    all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')]
        ret = []
        # sorted() makes the order of the result reproducible between runs
        for this_executable in sorted(set(all_executables)):
            reslist = []
            for res_filename in list_all_resources(this_executable):
                res_name = Path(res_filename).name
                known = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if known:
                    resdict = known[0]
                else:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s", this_executable, res_name, res_filename, self.user_list)
                    resdict = self.add_to_user_database(this_executable, res_filename)
                # Record where the resource was found on disk
                resdict['path'] = res_filename
                reslist.append(resdict)
            ret.append((this_executable, reslist))
        return ret
118
119
    def add_to_user_database(self, executable, res_filename, url=None):
        """
        Add a stub entry to the user resource.yml

        An entry is only appended when the user list has none of that name for
        the executable yet. The user list file is rewritten and reloaded in
        either case.

        Args:
            executable: processor the resource belongs to
            res_filename: path of the resource file; it must exist, because
                its size is read from the file system
            url: source URL to record; ``'???'`` is stored when falsy

        Returns:
            dict: the new stub entry, or the entry already present in the
            user list.
        """
        res_name = Path(res_filename).name
        res_size = Path(res_filename).stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            # find_resources yields (executable, resdict) pairs; unpack so that
            # both branches hand a plain dict back to the caller
            _, resdict = resources_found[0]
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(user_database))
        # NOTE(review): load_resource_list prepends the whole list, so entries
        # loaded from the user list earlier show up in self.database a second
        # time after this reload - confirm whether that is intended
        self.load_resource_list(self.user_list)
        return resdict
147
148
    def find_resources(self, executable=None, name=None, url=None, database=None):
149
        """
150
        Find resources in the registry
151
        """
152
        if not database:
153
            database = self.database
154
        ret = []
155
        if executable and executable not in database.keys():
156
            return ret
157
        for executable in [executable] if executable else database.keys():
158
            for resdict in database[executable]:
159
                if not name and not url:
160
                    ret.append((executable, resdict))
161
                elif url and url == resdict['url']:
162
                    ret.append((executable, resdict))
163
                elif name and name == resdict['name']:
164
                    ret.append((executable, resdict))
165
        return ret
166
167
    @property
168
    def default_resource_dir(self):
169
        return self.location_to_resource_dir('data')
170
171
    def location_to_resource_dir(self, location):
172
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
173
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
174
                getcwd()
175
176
    def resource_dir_to_location(self, resource_path):
177
        resource_path = str(resource_path)
178
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
179
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
180
               'cwd' if resource_path.startswith(getcwd()) else \
181
               resource_path
182
183
    def parameter_usage(self, name, usage='as-is'):
184
        if usage == 'as-is':
185
            return name
186
        elif usage == 'without-extension':
187
            return Path(name).stem
188
        raise ValueError("No such usage '%s'" % usage)
189
190
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into the file ``filename``.

        Args:
            url: HTTP(S) URL to fetch
            filename: path of the file to create or overwrite
            progress_cb: optional callable, invoked with the byte count of
                every received chunk
            size: accepted for backward compatibility; not used. The total
                was never consumed, and deriving it from a missing
                ``content-length`` header raised ``TypeError``.

        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        with requests.get(url, stream=True) as r:
            # Fail before the target is opened, so that an HTTP error page is
            # never stored as the resource and no empty file is left behind
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
201
    def _copy_impl(self, src_filename, filename, progress_cb=None):
        """
        Copy the file ``src_filename`` to ``filename`` in 4096 byte chunks.

        ``progress_cb``, when given, is invoked with the byte count of every
        chunk after it has been written.
        """
        log = getLogger('ocrd.resource_manager._copy_impl')
        log.info("Copying %s" % src_filename)
        with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
            # iter() with a sentinel stops at the empty read that marks EOF
            for chunk in iter(lambda: f_in.read(4096), b''):
                f_out.write(chunk)
                if progress_cb:
                    progress_cb(len(chunk))
213
214
    # TODO Proper caching (make head request for size, If-Modified etc)
215
    def download(
        self,
        executable,
        url,
        basedir,
        overwrite=False,
        no_subdir=False,
        name=None,
        resource_type='file',
        path_in_archive='.',
        progress_cb=None,
        size=None,
    ):
        """
        Download a resource by URL

        Args:
            executable: processor the resource belongs to; used as the
                sub-directory of ``basedir`` unless ``no_subdir`` is set
            url: ``http(s)://`` URL to download, or a local path to copy from
            basedir: directory to store the resource in
            overwrite: replace a resource that already exists at the target;
                when false, an existing target is returned untouched
            no_subdir: store directly in ``basedir``
            name: file name of the resource; derived from the last path
                segment of ``url`` when falsy
            resource_type: ``'file'`` stores the content as is, ``'tarball'``
                extracts it and stores ``path_in_archive``. Any other value
                only creates the target directory.
            path_in_archive: path inside the extracted tarball to store
            progress_cb: optional callable, invoked with the byte count of
                every transferred chunk
            size: expected size, forwarded to the download implementation

        Returns:
            pathlib.Path: location of the resource
        """
        # Local import: only needed to replace an existing directory
        from shutil import rmtree

        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith(('https://', 'http://'))
        if fpath.exists() and not overwrite:
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
            return fpath
        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type == 'file':
            if is_url:
                self._download_impl(url, fpath, progress_cb, size)
            else:
                self._copy_impl(url, fpath, progress_cb)
        elif resource_type == 'tarball':
            # The relative paths used below show that pushd_popd changes the
            # working directory, so anchor caller-supplied paths beforehand
            target = fpath.absolute()
            local_source = url if is_url else str(Path(url).absolute())
            with pushd_popd(tempdir=True):
                if is_url:
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
                else:
                    self._copy_impl(local_source, 'download.tar.xx', progress_cb)
                Path('out').mkdir()
                with pushd_popd('out'):
                    log.info("Extracting tarball")
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
                        # NOTE(review): extractall() trusts the member paths of
                        # the archive; a hostile tarball can write outside of
                        # this directory - consider validating the members
                        tar.extractall()
                    log.info("Copying '%s' from tarball to %s" % (path_in_archive, fpath))
                    # Reaching this point with an existing target implies
                    # overwrite=True; copytree refuses an existing destination,
                    # so clear it only after the extraction has succeeded
                    if target.is_dir() and not target.is_symlink():
                        rmtree(str(target))
                    elif target.exists() or target.is_symlink():
                        target.unlink()
                    copytree(path_in_archive, str(target))
        # TODO
        # elif resource_type == 'github-dir':
        return fpath
263