Passed
Pull Request — master (#904)
by Konstantin
05:38
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 46
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 46
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break the enclosing class down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

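There are several approaches to avoid long parameter lists: replace a parameter with a method call, pass a whole object instead of several of its values, or group related parameters into a parameter object.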

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from fnmatch import filter as apply_glob
6
from shutil import copytree
7
from datetime import datetime
8
from tarfile import open as open_tarfile
9
from urllib.parse import urlparse, unquote
10
from subprocess import run, PIPE
11
12
import requests
13
from yaml import safe_load, safe_dump
14
15
# https://github.com/OCR-D/core/issues/867
16
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
17
import yaml.constructor
18
yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \
19
    yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str']
20
21
from ocrd_validators import OcrdResourceListValidator
22
from ocrd_utils import getLogger, directory_size, get_moduledir
23
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
24
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
25
26
class OcrdResourceManager():
27
28
    """
29
    Managing processor resources
30
    """
31
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
32
        self.log = getLogger('ocrd.resource_manager')
33
        self.database = {}
34
35
        self._xdg_data_home = xdg_data_home
36
        self._xdg_config_home = xdg_config_home
37
        self._userdir = userdir
38
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')
39
40
        if not skip_init:
41
            self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
42
            if not self.user_list.exists():
43
                if not self.user_list.parent.exists():
44
                    self.user_list.parent.mkdir(parents=True)
45
                self.save_user_list()
46
            self.load_resource_list(self.user_list)
47
48
    @property
49
    def userdir(self):
50
        if not self._userdir:
51
            self._userdir = path.expanduser('~')
52
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
53
                self._userdir = environ['HOME']
54
        return self._userdir
55
56
    @property
57
    def xdg_data_home(self):
58
        if not self._xdg_data_home:
59
            if 'XDG_DATA_HOME' in environ:
60
                self._xdg_data_home = environ['XDG_DATA_HOME']
61
            else:
62
                self._xdg_data_home = join(self.userdir, '.local', 'share')
63
        return self._xdg_data_home
64
65
    @property
66
    def xdg_config_home(self):
67
        if not self._xdg_config_home:
68
            if 'XDG_CONFIG_HOME' in environ:
69
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
70
            else:
71
                self._xdg_config_home = join(self.userdir, '.config')
72
        return self._xdg_config_home
73
74
    def save_user_list(self, database=None):
        """
        Write a resource database to the user list file.

        The file starts with ``RESOURCE_USER_LIST_COMMENT``, followed by a
        newline and the YAML dump of the database.

        Args:
            database (dict): database to serialize; defaults to
                ``self.database`` when omitted. An explicitly passed empty
                dict is written as such.
        """
        # `is None` rather than truthiness, so that an empty database handed
        # in by the caller is not silently replaced with self.database
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
81
82
    def load_resource_list(self, list_filename, database=None):
83
        if not database:
84
            database = self.database
85
        if list_filename.is_file():
86
            with open(list_filename, 'r', encoding='utf-8') as f:
87
                list_loaded = safe_load(f) or {}
88
            report = OcrdResourceListValidator.validate(list_loaded)
89
            if not report.is_valid:
90
                self.log.error('\n'.join(report.errors))
91
                raise ValueError("Resource list %s is invalid!" % (list_filename))
92
            for executable, resource_list in list_loaded.items():
93
                if executable not in database:
94
                    database[executable] = []
95
                # Prepend, so user provided is sorted before builtin
96
                database[executable] = list_loaded[executable] + database[executable]
97
        return database
98
99
    def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
100
        """
101
        List models available for download by processor
102
        """
103
        if not database:
104
            database = self.database
105
        if not executable:
106
            return database.items()
107
        if dynamic:
108
            for exec_dir in environ['PATH'].split(':'):
109
                for exec_path in Path(exec_dir).glob(f'{executable}'):
110
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
111
                    ocrd_tool = get_ocrd_tool_json(exec_path)
112
                    for resdict in ocrd_tool.get('resources', ()):
113
                        if exec_path.name not in database:
114
                            database[exec_path.name] = []
115
                        database[exec_path.name].append(resdict)
116
            database = self._dedup_database(database)
117
        found = False
118
        ret = []
119
        for k in database:
120
            if apply_glob([k], executable):
121
                found = True
122
                restuple = (k, [])
123
                ret.append(restuple)
124
                for resdict in database[k]:
125
                    if name and resdict['name'] != name:
126
                        continue
127
                    if url and resdict['url'] != url:
128
                        continue
129
                    restuple[1].append(resdict)
130
        if not found:
131
            ret = [(executable, [])]
132
        return ret
133
134
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Args:
            executable (str): restrict the listing to this executable. If
                empty, every executable in the database is considered, plus
                every ``ocrd-*`` entry found in the ``ocrd-resources``
                directories below the XDG data home and ``/usr/local/share``.

        Returns:
            list of ``(executable, [resource dict, ...])`` tuples; every
            resource dict carries the file system location under ``path``.
            Resources without a database entry get a stub entry in the user
            list.
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            for base_dir in (self.xdg_data_home, '/usr/local/share'):
                parent_dir = join(base_dir, 'ocrd-resources')
                if Path(parent_dir).exists():
                    candidates.extend([entry for entry in listdir(parent_dir) if entry.startswith('ocrd-')])
        installed = []
        for this_executable in set(candidates):
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            found = []
            for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if '*/*' not in mimetypes:
                    # directories only count if the processor accepts them ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and files do not if it accepts nothing but directories
                    if res_filename.is_file() and ['text/directory'] == mimetypes:
                        continue
                res_name = res_filename.name
                known = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                resdict = known[0] if known else self.add_to_user_database(this_executable, res_filename)
                resdict['path'] = str(res_filename)
                found.append(resdict)
            installed.append((this_executable, found))
        return installed
170
171
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list is read from disk, a placeholder entry for
        ``res_filename`` is appended under ``executable`` unless an entry of
        that name is already listed there, and the list is written back and
        loaded into the in-memory database again.

        Args:
            executable (str): executable the resource belongs to
            res_filename (str or pathlib.Path): location of the resource
            url (str): URL to record in the stub (``'???'`` if not given)
            resource_type (str): value recorded as ``type`` in the stub

        Returns:
            dict: the newly created stub, or the entry that was already listed
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s", executable, res_name, str(res_filename), self.user_list)
        if res_path.is_dir():
            res_size = directory_size(res_filename)
        else:
            res_size = res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        # NOTE(review): list_available() runs with its default dynamic=True here,
        # so resources reported by the executable itself may get appended to
        # user_database before it is saved -- confirm this is intended
        resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            resdict = resources_found[0]
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
201
202
    @property
203
    def default_resource_dir(self):
204
        return self.location_to_resource_dir('data')
205
206
    def location_to_resource_dir(self, location):
207
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
208
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
209
                getcwd()
210
211
    def resource_dir_to_location(self, resource_path):
212
        resource_path = str(resource_path)
213
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
214
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
215
               'cwd' if resource_path.startswith(getcwd()) else \
216
               resource_path
217
218
    def parameter_usage(self, name, usage='as-is'):
219
        if usage == 'as-is':
220
            return name
221
        elif usage == 'without-extension':
222
            return Path(name).stem
223
        raise ValueError("No such usage '%s'" % usage)
224
225
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into the local file ``filename``.

        Args:
            url (str): URL to request
            filename (str or pathlib.Path): file to (over)write
            progress_cb (callable): if set, called with the byte length of
                every chunk received
            size (int): kept for interface compatibility; the transfer
                itself does not need it

        Raises:
            requests.HTTPError: if the server responds with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        with requests.get(url, stream=True) as r:
            # Check the status before the destination is created, so neither
            # an error page nor an empty file is left behind as the "resource".
            # (No Content-Length lookup: the header is optional and its value
            # was never used.)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
235
236
    def _copy_impl(self, src_filename, filename, progress_cb=None):
237
        log = getLogger('ocrd.resource_manager._copy_impl')
238
        log.info("Copying %s to %s", src_filename, filename)
239
        if Path(src_filename).is_dir():
240
            log.info(f"Copying recursively from {src_filename} to {filename}")
241
            for child in Path(src_filename).rglob('*'):
242
                child_dst = Path(filename) / child.relative_to(src_filename)
243
                child_dst.parent.mkdir(parents=True, exist_ok=True)
244
                with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in:
245
                    while True:
246
                        chunk = f_in.read(4096)
247
                        if chunk:
248
                            f_out.write(chunk)
249
                            if progress_cb:
250
                                progress_cb(len(chunk))
251
                        else:
252
                            break
253
        else:
254
            with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
255
                while True:
256
                    chunk = f_in.read(4096)
257
                    if chunk:
258
                        f_out.write(chunk)
259
                        if progress_cb:
260
                            progress_cb(len(chunk))
261
                    else:
262
                        break
263
264
    # TODO Proper caching (make head request for size, If-Modified etc)
265
    def download(
266
        self,
267
        executable,
268
        url,
269
        basedir,
270
        overwrite=False,
271
        no_subdir=False,
272
        name=None,
273
        resource_type='file',
274
        path_in_archive='.',
275
        progress_cb=None,
276
        size=None,
277
    ):
278
        """
279
        Download a resource by URL
280
        """
281
        log = getLogger('ocrd.resource_manager.download')
282
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
283
        if not name:
284
            url_parsed = urlparse(url)
285
            name = Path(unquote(url_parsed.path)).name
286
        fpath = Path(destdir, name)
287
        is_url = url.startswith('https://') or url.startswith('http://')
288
        if fpath.exists() and not overwrite:
289
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
290
            return fpath
291
        destdir.mkdir(parents=True, exist_ok=True)
292
        if resource_type in ('file', 'directory'):
293
            if is_url:
294
                self._download_impl(url, fpath, progress_cb)
295
            else:
296
                self._copy_impl(url, fpath, progress_cb)
297
        elif resource_type == 'archive':
298
            with pushd_popd(tempdir=True) as tempdir:
299
                if is_url:
300
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
301
                else:
302
                    self._copy_impl(url, 'download.tar.xx', progress_cb)
303
                Path('out').mkdir()
304
                with pushd_popd('out'):
305
                    log.info("Extracting archive to %s/out" % tempdir)
306
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
307
                        tar.extractall()
308
                    log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
309
                    copytree(path_in_archive, str(fpath))
310
        return fpath
311
312
    def _dedup_database(self, database=None, dedup_key='name'):
313
        """
314
        Deduplicate resources by name
315
        """
316
        if not database:
317
            database = self.database
318
        for executable, reslist in database.items():
319
            reslist_dedup = []
320
            for resdict in reslist:
321
                if not any(r[dedup_key] == resdict[dedup_key] for r in reslist_dedup):
322
                    reslist_dedup.append(resdict)
323
            database[executable] = reslist_dedup
324
        return database
325