Passed
Pull Request — master (#800)
by Konstantin
02:21
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 48
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 48
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break the enclosing class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from shutil import copytree
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
from subprocess import run, PIPE
10
11
import requests
12
from yaml import safe_load, safe_dump
13
14
from ocrd_validators import OcrdResourceListValidator
15
from ocrd_utils import getLogger
16
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd
17
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
18
19
class OcrdResourceManager():
20
21
    """
22
    Managing processor resources
23
    """
24
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Set up the logger, remember the directory overrides and load the
        resource lists.

        The list named by ``RESOURCE_LIST_FILENAME`` is loaded first, then the
        user list ``<xdg_config_home>/ocrd/resources.yml``. If the user list
        does not exist yet it is created (together with its parent directory)
        containing only ``RESOURCE_USER_LIST_COMMENT``.
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            with open(str(self.user_list), 'w', encoding='utf-8') as user_list_file:
                user_list_file.write(RESOURCE_USER_LIST_COMMENT)
        # Loaded last: load_resource_list prepends, so user entries sort first
        self.load_resource_list(self.user_list)
40
41
    @property
42
    def userdir(self):
43
        if not self._userdir:
44
            self._userdir = path.expanduser('~')
45
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
46
                self._userdir = environ['HOME']
47
        return self._userdir
48
49
    @property
50
    def xdg_data_home(self):
51
        if not self._xdg_data_home:
52
            if 'XDG_DATA_HOME' in environ:
53
                self._xdg_data_home = environ['XDG_DATA_HOME']
54
            else:
55
                self._xdg_data_home = join(self.userdir, '.local', 'share')
56
        return self._xdg_data_home
57
58
    @property
59
    def xdg_config_home(self):
60
        if not self._xdg_config_home:
61
            if 'XDG_CONFIG_HOME' in environ:
62
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
63
            else:
64
                self._xdg_config_home = join(self.userdir, '.config')
65
        return self._xdg_config_home
66
67
    def load_resource_list(self, list_filename, database=None):
68
        if not database:
69
            database = self.database
70
        if list_filename.is_file():
71
            with open(list_filename, 'r', encoding='utf-8') as f:
72
                list_loaded = safe_load(f) or {}
73
            report = OcrdResourceListValidator.validate(list_loaded)
74
            if not report.is_valid:
75
                self.log.error('\n'.join(report.errors))
76
                raise ValueError("Resource list %s is invalid!" % (list_filename))
77
            for executable, resource_list in list_loaded.items():
78
                if executable not in database:
79
                    database[executable] = []
80
                # Prepend, so user provided is sorted before builtin
81
                database[executable] = list_loaded[executable] + database[executable]
82
        return database
83
84
    def discover(self, dry_run=False, glob='ocrd-*'):
85
        """
86
        Discover resources by checking all the executables matching the
87
        ``glob`` glob and add them to the user resource_list.yml
88
        unless ``dry_run`` is ``True``.
89
        """
90
        ret = []
91
        if not dry_run:
92
            with open(self.user_list, 'r', encoding='utf-8') as f:
93
                user_database = safe_load(f) or {}
94
        for exec_dir in environ['PATH'].split(':'):
95
            for exec_path in Path(exec_dir).glob(glob):
96
                self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
97
                result = run([exec_path, '--dump-json'], stdout=PIPE)
98
                ocrd_tool = loads(result.stdout)
99
                if not dry_run:
100
                    if exec_path.name not in user_database:
0 ignored issues
show
introduced by
The variable user_database does not seem to be defined in case BooleanNotNode on line 91 is False. Are you sure this can never be the case?
Loading history...
101
                        user_database[exec_path.name] = []
102
                    user_database[exec_path.name] += ocrd_tool.get('resources', ())
103
                ret.append((exec_path.name, ocrd_tool.get('resources', ())))
104
        if not dry_run:
105
            with open(self.user_list, 'w', encoding='utf-8') as f:
106
                f.write(RESOURCE_USER_LIST_COMMENT)
107
                f.write('\n')
108
                f.write(safe_dump(user_database))
109
        return ret
110
111
    def list_available(self, executable=None):
112
        """
113
        List models available for download by processor
114
        """
115
        if executable:
116
            return [(executable, self.database[executable])]
117
        return self.database.items()
118
119
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Without ``executable``, all executables known to the database are
        considered, plus every ``ocrd-*`` entry found in the
        ``ocrd-resources`` directories below ``xdg_data_home`` and
        ``/usr/local/share``. Resources found on disk that are not in the
        database get a stub entry via ``add_to_user_database``.

        Returns a list of ``(executable, resources)`` tuples, where every
        resource entry has its ``path`` set to the file found.
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            for base in (self.xdg_data_home, '/usr/local/share'):
                resource_root = join(base, 'ocrd-resources')
                if Path(resource_root).exists():
                    candidates += [entry for entry in listdir(resource_root) if entry.startswith('ocrd-')]
        installed = []
        for this_executable in set(candidates):
            has_dirs, has_files = get_processor_resource_types(this_executable)
            found = []
            for res_filename in list_all_resources(this_executable):
                res_path = Path(res_filename)
                # skip kinds of resources this processor does not use
                if (res_path.is_dir() and not has_dirs) or (res_path.is_file() and not has_files):
                    continue
                res_name = res_path.name
                matches = [entry for entry in self.database.get(this_executable, []) if entry['name'] == res_name]
                if not matches:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, res_name, res_filename, self.user_list)
                    matches = [self.add_to_user_database(this_executable, res_filename)]
                first_match = matches[0]
                first_match['path'] = res_filename
                found.append(first_match)
            installed.append((this_executable, found))
        return installed
151
152
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        If the user list already has an entry named like the file for
        ``executable``, that entry is reused; otherwise a new one is appended
        with placeholder (``???``) values for the unknown fields. The user
        list is rewritten and loaded again in both cases.

        Args:
            executable: name of the processor the resource belongs to.
            res_filename: path of the resource; must exist, as its size is
                read with ``stat``.
            url: recorded as the entry's ``url``; ``???`` if falsy.
            resource_type: recorded as the entry's ``type``.

        Returns:
            The resource dict (new or already present in the user list).
        """
        res_name = Path(res_filename).name
        res_size = Path(res_filename).stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            # find_resources yields (executable, resdict) tuples; callers of
            # this method expect the dict only
            resdict = resources_found[0][1]
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(user_database))
        # NOTE(review): load_resource_list prepends to what is already in
        # self.database, so entries of the user list loaded earlier end up
        # in there more than once - confirm whether that is intended
        self.load_resource_list(self.user_list)
        return resdict
181
182
    def find_resources(self, executable=None, name=None, url=None, database=None):
183
        """
184
        Find resources in the registry
185
        """
186
        if not database:
187
            database = self.database
188
        ret = []
189
        if executable and executable not in database.keys():
190
            return ret
191
        for executable in [executable] if executable else database.keys():
192
            for resdict in database[executable]:
193
                if not name and not url:
194
                    ret.append((executable, resdict))
195
                elif url and url == resdict['url']:
196
                    ret.append((executable, resdict))
197
                elif name and name == resdict['name']:
198
                    ret.append((executable, resdict))
199
        return ret
200
201
    @property
202
    def default_resource_dir(self):
203
        return self.location_to_resource_dir('data')
204
205
    def location_to_resource_dir(self, location):
206
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
207
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
208
                getcwd()
209
210
    def resource_dir_to_location(self, resource_path):
211
        resource_path = str(resource_path)
212
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
213
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
214
               'cwd' if resource_path.startswith(getcwd()) else \
215
               resource_path
216
217
    def parameter_usage(self, name, usage='as-is'):
        """
        Derive the parameter value for a resource name.

        ``as-is`` returns ``name`` unchanged, ``without-extension`` returns
        it with the last suffix stripped (``Path(name).stem``).

        Raises ``ValueError`` for any other ``usage``.
        """
        if usage == 'without-extension':
            return Path(name).stem
        if usage != 'as-is':
            raise ValueError("No such usage '%s'" % usage)
        return name
223
224
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream ``url`` into the file ``filename``.

        Args:
            url: address to fetch with an HTTP GET request.
            filename: file to write; opened only once the server has
                answered with a non-error status.
            progress_cb: if given, called with the length of every chunk
                before that chunk is written.
            size: accepted for backward compatibility and not used; the
                download no longer needs a ``Content-Length`` header.

        Raises:
            requests.HTTPError: if the server answers with an error status,
                instead of saving the error page as the resource.
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s" % (url, filename))
        with requests.get(url, stream=True) as r:
            # Fail before touching the target, so no empty or bogus file is
            # left behind for a request that did not succeed
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
234
235
    def _copy_impl(self, src_filename, filename, progress_cb=None):
236
        log = getLogger('ocrd.resource_manager._copy_impl')
237
        log.info("Copying %s" % src_filename)
238
        with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
239
            while True:
240
                chunk = f_in.read(4096)
241
                if chunk:
242
                    f_out.write(chunk)
243
                    if progress_cb:
244
                        progress_cb(len(chunk))
245
                else:
246
                    break
247
248
    # TODO Proper caching (make head request for size, If-Modified etc)
249
    def download(
250
        self,
251
        executable,
252
        url,
253
        basedir,
254
        overwrite=False,
255
        no_subdir=False,
256
        name=None,
257
        resource_type='file',
258
        path_in_archive='.',
259
        progress_cb=None,
260
        size=None,
261
    ):
262
        """
263
        Download a resource by URL
264
        """
265
        log = getLogger('ocrd.resource_manager.download')
266
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
267
        if not name:
268
            url_parsed = urlparse(url)
269
            name = Path(unquote(url_parsed.path)).name
270
        fpath = Path(destdir, name)
271
        is_url = url.startswith('https://') or url.startswith('http://')
272
        if fpath.exists() and not overwrite:
273
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
274
            return fpath
275
        destdir.mkdir(parents=True, exist_ok=True)
276
        if resource_type == 'file':
277
            if is_url:
278
                self._download_impl(url, fpath, progress_cb)
279
            else:
280
                self._copy_impl(url, fpath, progress_cb)
281
        elif resource_type == 'tarball':
282
            with pushd_popd(tempdir=True):
283
                if is_url:
284
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
285
                else:
286
                    self._copy_impl(url, 'download.tar.xx', progress_cb)
287
                Path('out').mkdir()
288
                with pushd_popd('out'):
289
                    log.info("Extracting tarball")
290
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
291
                        tar.extractall()
292
                    log.info("Copying '%s' from tarball to %s" % (path_in_archive, fpath))
293
                    copytree(path_in_archive, str(fpath))
294
        # TODO
295
        # elif resource_type == 'github-dir':
296
        return fpath
297