Passed
Pull Request — master (#904)
by Konstantin
04:52
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 46
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 46
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from fnmatch import filter as apply_glob
6
from shutil import copytree
7
from datetime import datetime
8
from tarfile import open as open_tarfile
9
from urllib.parse import urlparse, unquote
10
from subprocess import run, PIPE
11
12
import requests
13
from yaml import safe_load, safe_dump
14
15
# https://github.com/OCR-D/core/issues/867
16
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
17
import yaml.constructor
18
# Map the YAML timestamp tag onto the constructor registered for plain
# strings, so timestamp-tagged scalars are constructed like ``str`` values.
yaml.constructor.SafeConstructor.yaml_constructors.update({
    'tag:yaml.org,2002:timestamp':
        yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str'],
})
20
21
from ocrd_validators import OcrdResourceListValidator
22
from ocrd_utils import getLogger, directory_size, get_moduledir
23
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
24
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
25
26
class OcrdResourceManager():
27
28
    """
29
    Managing processor resources
30
    """
31
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
        """
        Set up the resource registry.

        Args:
            userdir: explicit home directory; resolved lazily by the
                ``userdir`` property when not given
            xdg_config_home: explicit XDG config directory; resolved lazily
                by the ``xdg_config_home`` property when not given
            xdg_data_home: explicit XDG data directory; resolved lazily by
                the ``xdg_data_home`` property when not given
            skip_init: if true, neither the bundled nor the user resource
                list is loaded and no user list file is created
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        if skip_init:
            return

        # Bundled list first, user list second: load_resource_list() prepends,
        # so the user entries end up in front of the bundled ones.
        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)
47
48
    @property
49
    def userdir(self):
50
        if not self._userdir:
51
            self._userdir = path.expanduser('~')
52
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
53
                self._userdir = environ['HOME']
54
        return self._userdir
55
56
    @property
57
    def xdg_data_home(self):
58
        if not self._xdg_data_home:
59
            if 'XDG_DATA_HOME' in environ:
60
                self._xdg_data_home = environ['XDG_DATA_HOME']
61
            else:
62
                self._xdg_data_home = join(self.userdir, '.local', 'share')
63
        return self._xdg_data_home
64
65
    @property
66
    def xdg_config_home(self):
67
        if not self._xdg_config_home:
68
            if 'XDG_CONFIG_HOME' in environ:
69
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
70
            else:
71
                self._xdg_config_home = join(self.userdir, '.config')
72
        return self._xdg_config_home
73
74
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file.

        The file starts with ``RESOURCE_USER_LIST_COMMENT``, followed by the
        YAML dump of the database.

        Args:
            database: mapping to serialize; only when omitted (``None``) is
                the in-memory ``self.database`` written. An explicitly passed
                empty mapping is written as such.
        """
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
81
82
    def load_resource_list(self, list_filename, database=None):
83
        if not database:
84
            database = self.database
85
        if list_filename.is_file():
86
            with open(list_filename, 'r', encoding='utf-8') as f:
87
                list_loaded = safe_load(f) or {}
88
            report = OcrdResourceListValidator.validate(list_loaded)
89
            if not report.is_valid:
90
                self.log.error('\n'.join(report.errors))
91
                raise ValueError("Resource list %s is invalid!" % (list_filename))
92
            for executable, resource_list in list_loaded.items():
93
                if executable not in database:
94
                    database[executable] = []
95
                # Prepend, so user provided is sorted before builtin
96
                database[executable] = list_loaded[executable] + database[executable]
97
        return database
98
99
    def list_available(self, executable=None, dynamic=True):
100
        """
101
        List models available for download by processor
102
        """
103
        if not executable:
104
            return self.database.items()
105
        if dynamic:
106
            for exec_dir in environ['PATH'].split(':'):
107
                for exec_path in Path(exec_dir).glob(f'{executable}'):
108
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
109
                    ocrd_tool = get_ocrd_tool_json(exec_path)
110
                    for resdict in ocrd_tool.get('resources', ()):
111
                        if exec_path.name not in self.database:
112
                            self.database[exec_path.name] = []
113
                        self.database[exec_path.name].append(resdict)
114
                    self.database = self._dedup_database(self.database)
115
        ret = []
116
        found = False
117
        for k in self.database:
118
            if apply_glob([k], executable):
119
                found = True
120
                ret.append((k, self.database[k]))
121
        if not found:
122
            ret = [(executable, [])]
123
        return ret
124
125
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Args:
            executable: restrict the listing to this executable; when falsy,
                all executables of the database are considered, plus every
                ``ocrd-*`` entry found in the ``ocrd-resources`` directory
                below the XDG data home and below ``/usr/local/share``

        Returns:
            list of ``(executable, resource_list)`` tuples; every resource
            dict carries the location of the resource under ``path``
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            for base_dir in (self.xdg_data_home, '/usr/local/share'):
                resources_root = join(base_dir, 'ocrd-resources')
                if not Path(resources_root).exists():
                    continue
                candidates.extend([entry for entry in listdir(resources_root) if entry.startswith('ocrd-')])
        installed = []
        for this_executable in set(candidates):
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            found = []
            for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if '*/*' not in mimetypes:
                    # Skip directories unless directories are accepted ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and skip files if only directories are accepted
                    if res_filename.is_file() and mimetypes == ['text/directory']:
                        continue
                res_name = res_filename.name
                known = [entry for entry in self.database.get(this_executable, []) if entry['name'] == res_name]
                # Resources missing from the registry get a stub in the user list
                resdict = known[0] if known else self.add_to_user_database(this_executable, res_filename)
                resdict['path'] = str(res_filename)
                found.append(resdict)
            installed.append((this_executable, found))
        return installed
161
162
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list is read from disk, extended by a stub for the resource
        unless ``find_resources`` already reports an entry of that name for
        the executable, written back and reloaded into the database.

        Args:
            executable: name of the processor the resource belongs to
            res_filename: path of the resource file or directory
            url: URL to record in the stub, ``'???'`` when not given
            resource_type: value recorded under ``type`` in the stub

        Returns:
            the resource dict, either the existing one or the new stub
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", executable, res_name, str(res_filename), self.user_list)
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if resources_found:
            # Matches are (executable, resdict) pairs, reuse the first one
            resdict = resources_found[0][1]
        else:
            resdict = {
                'name': res_name,
                'url': url or '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
192
193
    @property
194
    def default_resource_dir(self):
195
        return self.location_to_resource_dir('data')
196
197
    def location_to_resource_dir(self, location):
198
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
199
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
200
                getcwd()
201
202
    def resource_dir_to_location(self, resource_path):
203
        resource_path = str(resource_path)
204
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
205
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
206
               'cwd' if resource_path.startswith(getcwd()) else \
207
               resource_path
208
209
    def parameter_usage(self, name, usage='as-is'):
210
        if usage == 'as-is':
211
            return name
212
        elif usage == 'without-extension':
213
            return Path(name).stem
214
        raise ValueError("No such usage '%s'" % usage)
215
216
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into the file ``filename``.

        Args:
            url: HTTP(S) URL to fetch
            filename: path of the file to write
            progress_cb: optional callable, invoked with the length of every
                chunk before it is written
            size: accepted for backwards compatibility, not used

        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        # Issue the request before touching the destination, so a failing
        # request does not leave an empty file behind.
        with requests.get(url, stream=True) as r:
            # Do not store the body of an error page as the resource
            r.raise_for_status()
            try:
                with open(filename, 'wb') as f:
                    for data in r.iter_content(chunk_size=4096):
                        if progress_cb:
                            progress_cb(len(data))
                        f.write(data)
            except BaseException:
                # A truncated file would later be mistaken for a finished
                # download, so remove it before propagating (also on Ctrl-C).
                target = Path(filename)
                if target.is_file():
                    target.unlink()
                raise
226
227
    def _copy_impl(self, src_filename, filename, progress_cb=None):
        """
        Copy a local file or directory tree to ``filename``.

        Files are copied in chunks so that progress can be reported. For a
        directory, the tree is recreated below ``filename``, including empty
        directories.

        Args:
            src_filename: path of the file or directory to copy
            filename: destination path
            progress_cb: optional callable, invoked with the length of every
                chunk after it was written
        """
        log = getLogger('ocrd.resource_manager._copy_impl')
        log.info("Copying %s to %s", src_filename, filename)

        def copy_file(src, dst):
            # Open the source first, so an unreadable source does not
            # truncate or create the destination.
            with open(src, 'rb') as f_in, open(dst, 'wb') as f_out:
                for chunk in iter(lambda: f_in.read(4096), b''):
                    f_out.write(chunk)
                    if progress_cb:
                        progress_cb(len(chunk))

        src_root = Path(src_filename)
        if not src_root.is_dir():
            copy_file(src_filename, filename)
            return

        log.info(f"Copying recursively from {src_filename} to {filename}")
        dst_root = Path(filename)
        # Create the root explicitly, so an empty source directory is copied too
        dst_root.mkdir(parents=True, exist_ok=True)
        for child in src_root.rglob('*'):
            child_dst = dst_root / child.relative_to(src_root)
            if child.is_dir():
                # rglob() also yields directories, which cannot be opened as files
                child_dst.mkdir(parents=True, exist_ok=True)
                continue
            child_dst.parent.mkdir(parents=True, exist_ok=True)
            copy_file(child, child_dst)
254
255
    # TODO Proper caching (make head request for size, If-Modified etc)
256
    def download(
        self,
        executable,
        url,
        basedir,
        overwrite=False,
        no_subdir=False,
        name=None,
        resource_type='file',
        path_in_archive='.',
        progress_cb=None,
        size=None,
    ):
        """
        Download a resource by URL

        Args:
            executable: name of the processor, used as subdirectory of
                ``basedir`` unless ``no_subdir`` is set
            url: ``http(s)://`` URL to download; anything else is treated as
                a local path to copy from
            basedir: directory below which the resource is stored
            overwrite: if false, an existing target is kept and returned
            no_subdir: store directly in ``basedir``
            name: file name of the target, defaults to the last segment of
                the (unquoted) URL path
            resource_type: ``'file'`` or ``'directory'`` store the source as
                is, ``'archive'`` extracts a tar archive and stores
                ``path_in_archive`` from it; other values store nothing
            path_in_archive: file or directory within the archive to store
            progress_cb: optional callable, passed on to the download/copy
                implementation
            size: size of the download, passed on to ``_download_impl``

        Returns:
            ``pathlib.Path`` of the target
        """
        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith(('https://', 'http://'))
        if fpath.exists() and not overwrite:
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
            return fpath

        def fetch(target):
            # Same source handling for all resource types, including ``size``
            if is_url:
                self._download_impl(url, target, progress_cb, size)
            else:
                self._copy_impl(url, target, progress_cb)

        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type in ('file', 'directory'):
            fetch(fpath)
        elif resource_type == 'archive':
            with pushd_popd(tempdir=True) as tempdir:
                fetch('download.tar.xx')
                Path('out').mkdir()
                with pushd_popd('out'):
                    log.info("Extracting archive to %s/out" % tempdir)
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
                        # NOTE(security): extractall() trusts the member paths
                        # of the archive; a malicious archive can write outside
                        # of the extraction directory. Only use trusted sources.
                        tar.extractall()
                    log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
                    if Path(path_in_archive).is_dir():
                        copytree(path_in_archive, str(fpath))
                    else:
                        # copytree() only handles directories, but the wanted
                        # archive member may be a single file
                        from shutil import copy as copy_file
                        copy_file(path_in_archive, str(fpath))
        return fpath
302
303
    def _dedup_database(self, database=None):
304
        """
305
        Deduplicate resources by name
306
        """
307
        if not database:
308
            database = self.database
309
        for executable, reslist in database.items():
310
            reslist_dedup = []
311
            for resdict in reslist:
312
                if any(r['name'] == resdict['name'] for r in reslist_dedup):
313
                    continue
314
                else:
315
                    reslist_dedup.append(resdict)
316
            database[executable] = reslist_dedup
317
        return database
318