Passed
Pull Request — master (#800)
by Konstantin
02:15
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 46
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 46
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break such a method (or the class that contains it) down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

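There are several approaches to avoid long parameter lists: introduce a parameter object that bundles related arguments, pass the whole object instead of several of its fields, or replace a parameter with a method call on data the method already has.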

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from fnmatch import filter as apply_glob
6
from shutil import copytree
7
from datetime import datetime
8
from tarfile import open as open_tarfile
9
from urllib.parse import urlparse, unquote
10
from subprocess import run, PIPE
11
12
import requests
13
from yaml import safe_load, safe_dump
14
15
from ocrd_validators import OcrdResourceListValidator
16
from ocrd_utils import getLogger, directory_size
17
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
18
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
19
20
class OcrdResourceManager():
21
22
    """
23
    Managing processor resources
24
    """
25
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Set up the resource registry from the bundled list and the user list

        The bundled list (``RESOURCE_LIST_FILENAME``) is loaded first. If the
        user list ``<xdg_config_home>/ocrd/resources.yml`` does not exist yet,
        it is created (including its parent directory) from the registry
        loaded so far. Finally the user list itself is loaded.

        Args:
            userdir: home directory to use; a falsy value means auto-detect
            xdg_config_home: config directory to use; a falsy value means
                auto-detect
            xdg_data_home: data directory to use; a falsy value means
                auto-detect

        Raises:
            ValueError: if one of the resource lists fails validation
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._xdg_data_home = xdg_data_home
        self._xdg_config_home = xdg_config_home
        self._userdir = userdir
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            # exist_ok=True instead of a separate existence check: the
            # directory may be created by someone else in between
            self.user_list.parent.mkdir(parents=True, exist_ok=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)
40
41
    @property
    def userdir(self):
        """
        Home directory of the user, determined on first access and then cached

        Uses the value passed to the constructor if there was one. Otherwise
        ``~`` is expanded; if ``HOME`` is set in the environment and differs
        from that expansion, ``HOME`` takes precedence.
        """
        if self._userdir:
            return self._userdir
        expanded_home = path.expanduser('~')
        env_home = environ.get('HOME')
        if env_home is not None and env_home != expanded_home:
            self._userdir = env_home
        else:
            self._userdir = expanded_home
        return self._userdir
48
49
    @property
    def xdg_data_home(self):
        """
        Base directory for user data, determined on first access and cached

        Uses the value passed to the constructor if there was one, otherwise
        the ``XDG_DATA_HOME`` environment variable, otherwise
        ``<userdir>/.local/share``.
        """
        if not self._xdg_data_home:
            # An empty XDG_DATA_HOME is treated like an unset one, as the XDG
            # Base Directory Specification requires; previously the empty
            # string itself was used as the directory.
            self._xdg_data_home = environ.get('XDG_DATA_HOME') or join(self.userdir, '.local', 'share')
        return self._xdg_data_home
57
58
    @property
    def xdg_config_home(self):
        """
        Base directory for user configuration, determined on first access and cached

        Uses the value passed to the constructor if there was one, otherwise
        the ``XDG_CONFIG_HOME`` environment variable, otherwise
        ``<userdir>/.config``.
        """
        if not self._xdg_config_home:
            # An empty XDG_CONFIG_HOME is treated like an unset one, as the
            # XDG Base Directory Specification requires; previously the empty
            # string itself was used as the directory.
            self._xdg_config_home = environ.get('XDG_CONFIG_HOME') or join(self.userdir, '.config')
        return self._xdg_config_home
66
67
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file

        The file starts with ``RESOURCE_USER_LIST_COMMENT`` and a newline,
        followed by the YAML dump of the database.

        Args:
            database: mapping to write; ``None`` means ``self.database``.
                An explicitly passed empty mapping is written as such.
        """
        # Compare against None: an empty dict is a legitimate database and
        # must not be silently replaced by self.database.
        if database is None:
            database = self.database
        # Serialize before opening, so a dump failure does not leave a
        # truncated user list behind.
        serialized = safe_dump(database)
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(serialized)
74
75
    def load_resource_list(self, list_filename, database=None):
        """
        Merge a YAML resource list file into a resource database

        Entries from the file are placed *before* the entries already in the
        database for the same executable. Loading the same file twice
        therefore adds its entries twice; no de-duplication happens here.

        Args:
            list_filename: path of the YAML file (``str`` or ``Path``);
                if it is not an existing file, nothing is loaded
            database: mapping to merge into; ``None`` means ``self.database``.
                An explicitly passed empty mapping is used as given.

        Returns:
            the database that was merged into

        Raises:
            ValueError: if the file content fails validation
        """
        # Compare against None: an empty dict passed by the caller is the
        # target to fill, not a request to fall back to self.database.
        if database is None:
            database = self.database
        list_filename = Path(list_filename)
        if not list_filename.is_file():
            return database
        with open(list_filename, 'r', encoding='utf-8') as f:
            list_loaded = safe_load(f) or {}
        report = OcrdResourceListValidator.validate(list_loaded)
        if not report.is_valid:
            self.log.error('\n'.join(report.errors))
            raise ValueError("Resource list %s is invalid!" % (list_filename))
        for executable, resource_list in list_loaded.items():
            # Prepend, so user provided is sorted before builtin
            database[executable] = resource_list + database.get(executable, [])
        return database
91
92
    def list_available(self, executable=None, dynamic=True):
        """
        List models available for download by processor

        Args:
            executable: glob pattern for processor executable names; if
                falsy, all entries of the registry are returned
            dynamic: if true, every file in a ``PATH`` directory whose name
                starts with ``executable`` is asked for its ocrd-tool JSON
                (via ``get_ocrd_tool_json``), and the resources listed there
                are added to the registry under the name of that file,
                replacing entries of the same ``name``

        Returns:
            ``self.database.items()`` if no executable was given, otherwise a
            list of ``(executable, resource_list)`` tuples for all registry
            keys matching the pattern
        """
        if not executable:
            return self.database.items()
        if dynamic:
            # Tolerate an unset PATH and use the platform's separator
            search_path = environ.get('PATH')
            exec_dirs = search_path.split(path.pathsep) if search_path else []
            for exec_dir in exec_dirs:
                for exec_path in Path(exec_dir).glob(f'{executable}*'):
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
                    ocrd_tool = get_ocrd_tool_json(exec_path)
                    for resdict in ocrd_tool.get('resources', ()):
                        # Build a new list instead of removing while
                        # iterating, and use the same key for removing and
                        # adding; the key may not exist yet.
                        known = self.database.get(exec_path.name, [])
                        self.database[exec_path.name] = [
                            res for res in known if res['name'] != resdict['name']
                        ] + [resdict]
        # fnmatch.filter expects a list of names; handing it the bare string
        # would match the pattern against each single character.
        return [(k, self.database[k]) for k in self.database if apply_glob([k], executable)]
112
113
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Without ``executable``, all executables in the registry are
        considered, plus every ``ocrd-*`` entry found in the
        ``ocrd-resources`` directories below ``xdg_data_home`` and
        ``/usr/local/share``.

        Files found on disk that have no registry entry of the same name get
        a stub entry in the user list. Each returned resource dict has its
        ``path`` key set to the location found on disk.

        Args:
            executable: restrict the listing to this executable

        Returns:
            list of ``(executable, resource_list)`` tuples, sorted by
            executable name
        """
        if executable:
            all_executables = {executable}
        else:
            # resources we know about
            all_executables = set(self.database)
            # resources in the file system
            parent_dirs = [join(x, 'ocrd-resources') for x in [self.xdg_data_home, '/usr/local/share']]
            for parent_dir in parent_dirs:
                # is_dir() rather than exists(): listdir() fails on a plain file
                if Path(parent_dir).is_dir():
                    all_executables.update(x for x in listdir(parent_dir) if x.startswith('ocrd-'))
        ret = []
        # sorted() makes the result order reproducible; plain set iteration
        # order of strings differs between interpreter runs
        for this_executable in sorted(all_executables):
            reslist = []
            mimetypes = get_processor_resource_types(this_executable)
            for res_filename in list_all_resources(this_executable, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if '*/*' not in mimetypes:
                    # skip directories unless the processor accepts them ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and skip files if it accepts nothing but directories
                    if res_filename.is_file() and ['text/directory'] == mimetypes:
                        continue
                res_name = res_filename.name
                resdict = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if not resdict:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s", this_executable, res_name, str(res_filename), self.user_list)
                    resdict = [self.add_to_user_database(this_executable, res_filename)]
                resdict[0]['path'] = str(res_filename)
                reslist.append(resdict[0])
            ret.append((this_executable, reslist))
        return ret
147
148
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        If the user list already has a resource of the same name for this
        executable, that entry is reused. In either case the user list is
        written back and then loaded into the registry again.

        Args:
            executable: processor the resource belongs to
            res_filename: path of the resource on disk; its base name is the
                resource name, its size on disk is recorded
            url: URL to record; ``'???'`` if falsy
            resource_type: value for the ``type`` field of a new entry

        Returns:
            the resource dict that was found or created
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        matches = self.find_resources(executable=executable, name=res_name, database=user_database)
        if matches:
            resdict = matches[0][1]
        else:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
177
178
    def find_resources(self, executable=None, name=None, url=None, database=None):
        """
        Find resources in the registry

        Args:
            executable: only look at this executable; all executables if falsy
            name: resource name to match
            url: resource URL to match
            database: mapping to search; ``None`` means ``self.database``.
                An explicitly passed empty mapping is searched as such.

        Returns:
            list of ``(executable, resource_dict)`` tuples. Without ``name``
            and ``url`` every resource matches; otherwise a resource matches
            if its URL equals ``url`` or its name equals ``name``.
        """
        # Compare against None: an empty dict is a valid (empty) database and
        # must not trigger a search of self.database instead.
        if database is None:
            database = self.database
        if executable and executable not in database:
            return []
        candidates = [executable] if executable else list(database)
        match_all = not name and not url
        ret = []
        for this_executable in candidates:
            for resdict in database[this_executable]:
                # .get(): entries lacking a key simply do not match
                if (match_all
                        or (url and url == resdict.get('url'))
                        or (name and name == resdict.get('name'))):
                    ret.append((this_executable, resdict))
        return ret
196
197
    @property
    def default_resource_dir(self):
        """
        Resource directory of the ``'data'`` location
        """
        default_location = 'data'
        return self.location_to_resource_dir(default_location)
200
201
    def location_to_resource_dir(self, location):
        """
        Translate a location keyword into a resource directory

        ``'system'`` maps to ``/usr/local/share/ocrd-resources``, ``'data'``
        to ``<xdg_data_home>/ocrd-resources``, and anything else to the
        current working directory.
        """
        if location == 'system':
            return '/usr/local/share/ocrd-resources'
        if location == 'data':
            return join(self.xdg_data_home, 'ocrd-resources')
        return getcwd()
205
206
    def resource_dir_to_location(self, resource_path):
        """
        Translate a resource path into a location keyword

        Args:
            resource_path: path of a resource (``str`` or ``Path``)

        Returns:
            ``'system'`` if the path is ``/usr/local/share/ocrd-resources``
            or lies below it, ``'data'`` likewise for
            ``<xdg_data_home>/ocrd-resources``, ``'cwd'`` likewise for the
            current working directory; otherwise the path itself as a string.
        """
        resource_path = str(resource_path)
        candidate = Path(resource_path)

        def is_inside(base):
            # Compare whole path components. A plain string prefix test would
            # also accept siblings such as '<base>-old/...'.
            base = Path(base)
            return candidate == base or base in candidate.parents

        if is_inside('/usr/local/share/ocrd-resources'):
            return 'system'
        if is_inside(join(self.xdg_data_home, 'ocrd-resources')):
            return 'data'
        if is_inside(getcwd()):
            return 'cwd'
        return resource_path
212
213
    def parameter_usage(self, name, usage='as-is'):
        """
        Derive the parameter value to use for a resource name

        Args:
            name: resource name
            usage: ``'as-is'`` returns the name unchanged,
                ``'without-extension'`` returns it without its last suffix

        Raises:
            ValueError: for any other ``usage``
        """
        if usage == 'without-extension':
            return Path(name).stem
        if usage != 'as-is':
            raise ValueError("No such usage '%s'" % usage)
        return name
219
220
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content at a URL into a local file

        Args:
            url: HTTP(S) URL to fetch
            filename: path of the file to write
            progress_cb: optional callable, invoked with the byte count of
                every chunk received
            size: accepted for compatibility with existing callers; not used

        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        with requests.get(url, stream=True) as r:
            # Fail before touching the target, so an error page is never
            # stored as if it were the resource.
            r.raise_for_status()
            try:
                with open(filename, 'wb') as f:
                    for data in r.iter_content(chunk_size=4096):
                        if progress_cb:
                            progress_cb(len(data))
                        f.write(data)
            except BaseException:
                # Remove the partial download: a truncated file left behind
                # would look like a finished download to anyone checking for
                # the file's existence.
                if Path(filename).is_file():
                    Path(filename).unlink()
                raise
230
231
    def _copy_impl(self, src_filename, filename, progress_cb=None):
232
        log = getLogger('ocrd.resource_manager._copy_impl')
233
        log.info("Copying %s to %s", src_filename, filename)
234
        if Path(src_filename).is_dir():
235
            log.info(f"Copying recursively from {src_filename} to {filename}")
236
            for child in Path(src_filename).rglob('*'):
237
                child_dst = Path(filename) / child.relative_to(src_filename)
238
                child_dst.parent.mkdir(parents=True, exist_ok=True)
239
                with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in:
240
                    while True:
241
                        chunk = f_in.read(4096)
242
                        if chunk:
243
                            f_out.write(chunk)
244
                            if progress_cb:
245
                                progress_cb(len(chunk))
246
                        else:
247
                            break
248
        else:
249
            with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
250
                while True:
251
                    chunk = f_in.read(4096)
252
                    if chunk:
253
                        f_out.write(chunk)
254
                        if progress_cb:
255
                            progress_cb(len(chunk))
256
                    else:
257
                        break
258
259
    # TODO Proper caching (make head request for size, If-Modified etc)
260
    def download(
        self,
        executable,
        url,
        basedir,
        overwrite=False,
        no_subdir=False,
        name=None,
        resource_type='file',
        path_in_archive='.',
        progress_cb=None,
        size=None,
    ):
        """
        Download a resource by URL

        Args:
            executable: processor the resource is for; also the name of the
                subdirectory of ``basedir`` unless ``no_subdir`` is set
            url: ``http://`` / ``https://`` URL, or a local path to copy from
            basedir: directory to store the resource in
            overwrite: fetch again even if the target already exists
            no_subdir: store directly in ``basedir``
            name: target name; defaults to the last path segment of ``url``
            resource_type: ``'file'``, ``'directory'`` or ``'archive'``
            path_in_archive: for archives, the path inside the extracted
                archive that is copied to the target
            progress_cb: optional callable, invoked with the byte count of
                every chunk transferred
            size: expected size, forwarded to the download implementation

        Returns:
            path of the resource below ``basedir``

        Raises:
            ValueError: if ``resource_type`` is not one of the known types
        """
        # Function-scope imports: these names are not imported by the module
        import tarfile
        from shutil import rmtree

        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith(('https://', 'http://'))
        if fpath.exists() and not overwrite:
            log.info("%s to be %s to %s which already exists and overwrite is False",
                     url, 'downloaded' if is_url else 'copied', fpath)
            return fpath
        destdir.mkdir(parents=True, exist_ok=True)

        def fetch(source, target):
            # One place to choose between remote download and local copy,
            # so ``size`` is forwarded for every resource type
            if is_url:
                self._download_impl(source, target, progress_cb, size)
            else:
                self._copy_impl(source, target, progress_cb)

        if resource_type in ('file', 'directory'):
            fetch(url, fpath)
        elif resource_type == 'archive':
            # The code below works with paths relative to a temporary
            # directory, so relative source and target paths are anchored to
            # the current directory first.
            start_dir = getcwd()
            abs_fpath = Path(start_dir, fpath)
            source = url if is_url else str(Path(start_dir, url))
            with pushd_popd(tempdir=True):
                fetch(source, 'download.tar.xx')
                Path('out').mkdir()
                with pushd_popd('out'):
                    log.info("Extracting archive")
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
                        if hasattr(tarfile, 'data_filter'):
                            # Refuse members that would escape the
                            # extraction directory
                            tar.extractall(filter='data')
                        else:
                            # NOTE(review): no extraction filter on this
                            # Python version; archive members are trusted
                            tar.extractall()
                    log.info("Copying '%s' from archive to %s", path_in_archive, fpath)
                    # Only reachable with overwrite=True if the target
                    # exists; copytree refuses an existing destination
                    if abs_fpath.is_dir() and not abs_fpath.is_symlink():
                        rmtree(str(abs_fpath))
                    elif abs_fpath.exists() or abs_fpath.is_symlink():
                        abs_fpath.unlink()
                    copytree(path_in_archive, str(abs_fpath))
        else:
            # Previously an unknown type fell through and a path to a
            # resource that was never fetched was returned
            raise ValueError("No such resource type '%s'" % resource_type)
        return fpath
306