Passed
Pull Request — master (#800)
by Konstantin
02:22
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 46
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 46
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent when you need more, or different, data.

There are several approaches to avoid long parameter lists:

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from fnmatch import filter as apply_glob
6
from shutil import copytree
7
from datetime import datetime
8
from tarfile import open as open_tarfile
9
from urllib.parse import urlparse, unquote
10
from subprocess import run, PIPE
11
12
import requests
13
from yaml import safe_load, safe_dump
14
15
# Register the plain-string constructor for the YAML timestamp tag, so that the
# safe loader hands timestamp-tagged scalars back through the string constructor.
# https://github.com/OCR-D/core/issues/867
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
import yaml.constructor
_safe_yaml_constructors = yaml.constructor.SafeConstructor.yaml_constructors
_safe_yaml_constructors[u'tag:yaml.org,2002:timestamp'] = _safe_yaml_constructors[u'tag:yaml.org,2002:str']
20
21
from ocrd_validators import OcrdResourceListValidator
22
from ocrd_utils import getLogger, directory_size
23
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
24
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
25
26
class OcrdResourceManager():
27
28
    """
29
    Managing processor resources
30
    """
31
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
32
        self.log = getLogger('ocrd.resource_manager')
33
        self.database = {}
34
35
        self._xdg_data_home = xdg_data_home
36
        self._xdg_config_home = xdg_config_home
37
        self._userdir = userdir
38
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')
39
40
        if not skip_init:
41
            self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
42
            if not self.user_list.exists():
43
                if not self.user_list.parent.exists():
44
                    self.user_list.parent.mkdir(parents=True)
45
                self.save_user_list()
46
            self.load_resource_list(self.user_list)
47
48
    @property
49
    def userdir(self):
50
        if not self._userdir:
51
            self._userdir = path.expanduser('~')
52
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
53
                self._userdir = environ['HOME']
54
        return self._userdir
55
56
    @property
57
    def xdg_data_home(self):
58
        if not self._xdg_data_home:
59
            if 'XDG_DATA_HOME' in environ:
60
                self._xdg_data_home = environ['XDG_DATA_HOME']
61
            else:
62
                self._xdg_data_home = join(self.userdir, '.local', 'share')
63
        return self._xdg_data_home
64
65
    @property
66
    def xdg_config_home(self):
67
        if not self._xdg_config_home:
68
            if 'XDG_CONFIG_HOME' in environ:
69
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
70
            else:
71
                self._xdg_config_home = join(self.userdir, '.config')
72
        return self._xdg_config_home
73
74
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file.

        The file starts with the user list comment, followed by the YAML dump
        of the database. An existing file is overwritten.

        Args:
            database (dict): Mapping to serialize. Only ``None`` selects the
                in-memory database of this manager.
        """
        # Compare against None, so an explicitly passed empty mapping is
        # written as such instead of being replaced by self.database
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
81
82
    def load_resource_list(self, list_filename, database=None):
83
        if not database:
84
            database = self.database
85
        if list_filename.is_file():
86
            with open(list_filename, 'r', encoding='utf-8') as f:
87
                list_loaded = safe_load(f) or {}
88
            report = OcrdResourceListValidator.validate(list_loaded)
89
            if not report.is_valid:
90
                self.log.error('\n'.join(report.errors))
91
                raise ValueError("Resource list %s is invalid!" % (list_filename))
92
            for executable, resource_list in list_loaded.items():
93
                if executable not in database:
94
                    database[executable] = []
95
                # Prepend, so user provided is sorted before builtin
96
                database[executable] = list_loaded[executable] + database[executable]
97
        return database
98
99
    def list_available(self, executable=None, dynamic=True):
100
        """
101
        List models available for download by processor
102
        """
103
        if not executable:
104
            return self.database.items()
105
        if dynamic:
106
            for exec_dir in environ['PATH'].split(':'):
107
                for exec_path in Path(exec_dir).glob(f'{executable}'):
108
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
109
                    ocrd_tool = get_ocrd_tool_json(exec_path)
110
                    for resdict in ocrd_tool.get('resources', ()):
111
                        if exec_path.name not in self.database:
112
                            self.database[exec_path.name] = []
113
                        for res_remove in (res for res in self.database.get(executable, []) if res['name'] == resdict['name']):
114
                            self.database.get(executable).remove(res_remove)
115
                        self.database[exec_path.name].append(resdict)
116
        ret = []
117
        found = False
118
        for k in self.database:
119
            if apply_glob([k], executable):
120
                found = True
121
                ret.append((k, self.database[k]))
122
        if not found:
123
            ret = [(executable, [])]
124
        return ret
125
126
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Without ``executable``, all executables of the database are considered,
        plus every ``ocrd-*`` entry found in the ``ocrd-resources`` directories
        below the data home and ``/usr/local/share``. Resources found in the
        file system that are missing from the database get a stub entry in the
        user resource list. Every returned resource dict has its ``path`` set.

        Returns:
            list: ``(executable, resources)`` tuples.
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            parent_dirs = [join(base, 'ocrd-resources') for base in [self.xdg_data_home, '/usr/local/share']]
            for parent_dir in parent_dirs:
                if Path(parent_dir).exists():
                    candidates.extend(entry for entry in listdir(parent_dir) if entry.startswith('ocrd-'))
        installed = []
        for this_executable in set(candidates):
            mimetypes = get_processor_resource_types(this_executable)
            accepts_anything = '*/*' in mimetypes
            found = []
            for res_filename in list_all_resources(this_executable, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if not accepts_anything:
                    # directories only count if directories are accepted ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and files do not count if only directories are accepted
                    if res_filename.is_file() and ['text/directory'] == mimetypes:
                        continue
                res_name = res_filename.name
                matches = [entry for entry in self.database.get(this_executable, []) if entry['name'] == res_name]
                if matches:
                    resdict = matches[0]
                else:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, res_name, str(res_filename), self.user_list)
                    resdict = self.add_to_user_database(this_executable, res_filename)
                resdict['path'] = str(res_filename)
                found.append(resdict)
            installed.append((this_executable, found))
        return installed
160
161
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        If the user resource list already has a resource of the same file name
        for ``executable``, that entry is returned instead of a new stub. In
        both cases the user list is written back and loaded again.

        Args:
            executable (str): Executable the resource belongs to.
            res_filename: Path of the resource in the file system.
            url (str): URL to record; ``'???'`` is recorded if falsy.
            resource_type (str): Value for the ``type`` field of a new stub.

        Returns:
            dict: The existing or newly created resource entry.
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if resources_found:
            resdict = resources_found[0][1]
        else:
            resdict = {
                'name': res_name,
                'url': url or '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
190
191
    def find_resources(self, executable=None, name=None, url=None, database=None):
192
        """
193
        Find resources in the registry
194
        """
195
        if not database:
196
            database = self.database
197
        ret = []
198
        if executable and executable not in database.keys():
199
            return ret
200
        for executable in [executable] if executable else database.keys():
201
            for resdict in database[executable]:
202
                if not name and not url:
203
                    ret.append((executable, resdict))
204
                elif url and url == resdict['url']:
205
                    ret.append((executable, resdict))
206
                elif name and name == resdict['name']:
207
                    ret.append((executable, resdict))
208
        return ret
209
210
    @property
211
    def default_resource_dir(self):
212
        return self.location_to_resource_dir('data')
213
214
    def location_to_resource_dir(self, location):
215
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
216
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
217
                getcwd()
218
219
    def resource_dir_to_location(self, resource_path):
220
        resource_path = str(resource_path)
221
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
222
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
223
               'cwd' if resource_path.startswith(getcwd()) else \
224
               resource_path
225
226
    def parameter_usage(self, name, usage='as-is'):
227
        if usage == 'as-is':
228
            return name
229
        elif usage == 'without-extension':
230
            return Path(name).stem
231
        raise ValueError("No such usage '%s'" % usage)
232
233
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content behind ``url`` into the file ``filename``.

        Args:
            url (str): URL to request.
            filename: Path of the file to write. It is opened for binary
                writing, so an existing file is overwritten.
            progress_cb (callable): If given, called with the number of bytes
                of every received chunk.
            size (int): Not used; accepted so that existing callers keep
                working.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        with requests.get(url, stream=True) as r:
            # Fail before touching the target, instead of storing the body of
            # an error response as if it were the resource
            r.raise_for_status()
            # No total size is computed here: it was never used, and deriving
            # it from a missing Content-Length header made the download fail
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
243
244
    def _copy_impl(self, src_filename, filename, progress_cb=None):
245
        log = getLogger('ocrd.resource_manager._copy_impl')
246
        log.info("Copying %s to %s", src_filename, filename)
247
        if Path(src_filename).is_dir():
248
            log.info(f"Copying recursively from {src_filename} to {filename}")
249
            for child in Path(src_filename).rglob('*'):
250
                child_dst = Path(filename) / child.relative_to(src_filename)
251
                child_dst.parent.mkdir(parents=True, exist_ok=True)
252
                with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in:
253
                    while True:
254
                        chunk = f_in.read(4096)
255
                        if chunk:
256
                            f_out.write(chunk)
257
                            if progress_cb:
258
                                progress_cb(len(chunk))
259
                        else:
260
                            break
261
        else:
262
            with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
263
                while True:
264
                    chunk = f_in.read(4096)
265
                    if chunk:
266
                        f_out.write(chunk)
267
                        if progress_cb:
268
                            progress_cb(len(chunk))
269
                    else:
270
                        break
271
272
    # TODO Proper caching (make head request for size, If-Modified etc)
273
    def download(
274
        self,
275
        executable,
276
        url,
277
        basedir,
278
        overwrite=False,
279
        no_subdir=False,
280
        name=None,
281
        resource_type='file',
282
        path_in_archive='.',
283
        progress_cb=None,
284
        size=None,
285
    ):
286
        """
287
        Download a resource by URL
288
        """
289
        log = getLogger('ocrd.resource_manager.download')
290
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
291
        if not name:
292
            url_parsed = urlparse(url)
293
            name = Path(unquote(url_parsed.path)).name
294
        fpath = Path(destdir, name)
295
        is_url = url.startswith('https://') or url.startswith('http://')
296
        if fpath.exists() and not overwrite:
297
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
298
            return fpath
299
        destdir.mkdir(parents=True, exist_ok=True)
300
        if resource_type in ('file', 'directory'):
301
            if is_url:
302
                self._download_impl(url, fpath, progress_cb)
303
            else:
304
                self._copy_impl(url, fpath, progress_cb)
305
        elif resource_type == 'archive':
306
            with pushd_popd(tempdir=True) as tempdir:
307
                if is_url:
308
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
309
                else:
310
                    self._copy_impl(url, 'download.tar.xx', progress_cb)
311
                Path('out').mkdir()
312
                with pushd_popd('out'):
313
                    log.info("Extracting archive to %s/out" % tempdir)
314
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
315
                        tar.extractall()
316
                    log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
317
                    copytree(path_in_archive, str(fpath))
318
        return fpath
319