Passed
Pull Request — master (#800)
by Konstantin
02:30
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 46
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 46
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break the enclosing class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from shutil import copytree
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
from subprocess import run, PIPE
10
11
import requests
12
from yaml import safe_load, safe_dump
13
14
from ocrd_validators import OcrdResourceListValidator
15
from ocrd_utils import getLogger, directory_size
16
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
17
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
18
19
class OcrdResourceManager():
20
21
    """
22
    Managing processor resources
23
    """
24
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Set up the registry from the bundled and the user resource list.

        The bundled list (``RESOURCE_LIST_FILENAME``) is loaded first. The
        user list at ``<xdg_config_home>/ocrd/resources.yml`` is created if
        it does not exist yet and is then loaded on top.

        Args:
            userdir (str): Overrides the home directory lookup.
            xdg_config_home (str): Overrides the XDG config directory lookup.
            xdg_data_home (str): Overrides the XDG data directory lookup.
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        # Explicit overrides; the properties of the same name (without the
        # underscore) fill in defaults lazily when these are unset.
        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        self.user_list = Path(self.xdg_config_home) / 'ocrd' / 'resources.yml'

        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        user_list_missing = not self.user_list.exists()
        if user_list_missing:
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)
39
40
    @property
41
    def userdir(self):
42
        if not self._userdir:
43
            self._userdir = path.expanduser('~')
44
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
45
                self._userdir = environ['HOME']
46
        return self._userdir
47
48
    @property
49
    def xdg_data_home(self):
50
        if not self._xdg_data_home:
51
            if 'XDG_DATA_HOME' in environ:
52
                self._xdg_data_home = environ['XDG_DATA_HOME']
53
            else:
54
                self._xdg_data_home = join(self.userdir, '.local', 'share')
55
        return self._xdg_data_home
56
57
    @property
58
    def xdg_config_home(self):
59
        if not self._xdg_config_home:
60
            if 'XDG_CONFIG_HOME' in environ:
61
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
62
            else:
63
                self._xdg_config_home = join(self.userdir, '.config')
64
        return self._xdg_config_home
65
66
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file.

        The file at ``self.user_list`` is overwritten with the
        ``RESOURCE_USER_LIST_COMMENT`` header, a newline and the YAML dump of
        the database.

        Args:
            database (dict): Mapping to serialize. When ``None``,
                ``self.database`` is written.
        """
        # Fall back only when no database was passed at all: an explicitly
        # passed empty mapping must be written as-is, not silently replaced
        # by the full in-memory registry.
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
73
74
    def load_resource_list(self, list_filename, database=None):
75
        if not database:
76
            database = self.database
77
        if list_filename.is_file():
78
            with open(list_filename, 'r', encoding='utf-8') as f:
79
                list_loaded = safe_load(f) or {}
80
            report = OcrdResourceListValidator.validate(list_loaded)
81
            if not report.is_valid:
82
                self.log.error('\n'.join(report.errors))
83
                raise ValueError("Resource list %s is invalid!" % (list_filename))
84
            for executable, resource_list in list_loaded.items():
85
                if executable not in database:
86
                    database[executable] = []
87
                # Prepend, so user provided is sorted before builtin
88
                database[executable] = list_loaded[executable] + database[executable]
89
        return database
90
91
    def list_available(self, executable=None, dynamic=True):
92
        """
93
        List models available for download by processor
94
        """
95
        if dynamic:
96
            for exec_dir in environ['PATH'].split(':'):
97
                for exec_path in Path(exec_dir).glob(f'{executable}*'):
98
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
99
                    ocrd_tool = get_ocrd_tool_json(exec_path)
100
                    for resdict in ocrd_tool.get('resources', ()):
101
                        for res_remove in (res for res in self.database.get(executable, []) if res['name'] == resdict['name']):
102
                            self.database.get(executable).remove(res_remove)
103
                        self.database[exec_path.name].append(resdict)
104
        if executable:
105
            ret = []
106
            for k in self.database:
107
                if k.startswith(executable):
108
                    ret.append((k, self.database[k]))
109
            return ret
110
        return self.database.items()
111
112
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Resources found on disk that are not in the registry are registered
        as stubs in the user resource list via ``add_to_user_database``.

        Args:
            executable (str): Restrict the listing to this processor. When
                omitted, all processors in the registry plus all ``ocrd-*``
                entries of the ``ocrd-resources`` directories below
                ``xdg_data_home`` and ``/usr/local/share`` are considered.

        Returns:
            list: ``(executable, resources)`` tuples. Every resource dict has
            its ``path`` key set to the location found on disk.
        """
        if executable:
            all_executables = [executable]
        else:
            # resources we know about
            all_executables = list(self.database.keys())
            # resources in the file system
            for base_dir in (self.xdg_data_home, '/usr/local/share'):
                parent_dir = join(base_dir, 'ocrd-resources')
                if Path(parent_dir).exists():
                    all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')]
        ret = []
        for this_executable in set(all_executables):
            reslist = []
            mimetypes = get_processor_resource_types(this_executable)
            for res_filename in list_all_resources(this_executable, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                # Unless the processor accepts anything, skip directories
                # when directories are not accepted, and files when only
                # directories are accepted.
                if '*/*' not in mimetypes:
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    if res_filename.is_file() and ['text/directory'] == mimetypes:
                        continue
                res_name = res_filename.name
                resdict = next((x for x in self.database.get(this_executable, []) if x['name'] == res_name), None)
                if resdict is None:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s", this_executable, res_name, str(res_filename), self.user_list)
                    # Record directories as such, not with the default
                    # type 'file'.
                    resdict = self.add_to_user_database(
                        this_executable,
                        res_filename,
                        resource_type='directory' if res_filename.is_dir() else 'file',
                    )
                resdict['path'] = str(res_filename)
                reslist.append(resdict)
            ret.append((this_executable, reslist))
        return ret
146
147
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list is read from disk. Unless it already has an entry with
        the same name for ``executable``, a stub describing ``res_filename``
        is appended. The list is then written back and reloaded into the
        registry.

        Args:
            executable (str): Processor the resource belongs to.
            res_filename: Path of the resource on disk; its last component
                becomes the resource name.
            url (str): URL to record; ``'???'`` when not given.
            resource_type (str): Value for the ``type`` key of the stub.

        Returns:
            dict: The existing or newly created resource entry.
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        matches = self.find_resources(executable=executable, name=res_name, database=user_database)
        if matches:
            resdict = matches[0][1]
        else:
            found_at = self.resource_dir_to_location(res_filename)
            resdict = {
                'name': res_name,
                'url': url or '???',
                'description': 'Found at %s on %s' % (found_at, datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
176
177
    def find_resources(self, executable=None, name=None, url=None, database=None):
178
        """
179
        Find resources in the registry
180
        """
181
        if not database:
182
            database = self.database
183
        ret = []
184
        if executable and executable not in database.keys():
185
            return ret
186
        for executable in [executable] if executable else database.keys():
187
            for resdict in database[executable]:
188
                if not name and not url:
189
                    ret.append((executable, resdict))
190
                elif url and url == resdict['url']:
191
                    ret.append((executable, resdict))
192
                elif name and name == resdict['name']:
193
                    ret.append((executable, resdict))
194
        return ret
195
196
    @property
197
    def default_resource_dir(self):
198
        return self.location_to_resource_dir('data')
199
200
    def location_to_resource_dir(self, location):
201
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
202
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
203
                getcwd()
204
205
    def resource_dir_to_location(self, resource_path):
206
        resource_path = str(resource_path)
207
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
208
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
209
               'cwd' if resource_path.startswith(getcwd()) else \
210
               resource_path
211
212
    def parameter_usage(self, name, usage='as-is'):
213
        if usage == 'as-is':
214
            return name
215
        elif usage == 'without-extension':
216
            return Path(name).stem
217
        raise ValueError("No such usage '%s'" % usage)
218
219
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into the local file ``filename``.

        Args:
            url (str): URL to fetch with ``requests.get``.
            filename: Path of the file to write.
            progress_cb (callable): Called with the byte count of every chunk
                before the chunk is written.
            size (int): Expected size in bytes. Kept for interface
                compatibility; the download does not depend on it.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s" % (url, filename))
        # Issue the request before touching the target, so a failing request
        # does not leave an empty file behind.
        with requests.get(url, stream=True) as r:
            # Fail on 4xx/5xx instead of storing the error page as resource.
            r.raise_for_status()
            try:
                with open(filename, 'wb') as f:
                    for data in r.iter_content(chunk_size=4096):
                        if progress_cb:
                            progress_cb(len(data))
                        f.write(data)
            except BaseException:
                # Remove the partial file (also on interruption) so that it
                # is not mistaken for a complete download later on.
                partial = Path(filename)
                if partial.exists():
                    partial.unlink()
                raise
229
230
    def _copy_impl(self, src_filename, filename, progress_cb=None):
231
        log = getLogger('ocrd.resource_manager._copy_impl')
232
        log.info("Copying %s to %s", src_filename, filename)
233
        if Path(src_filename).is_dir():
234
            log.info(f"Copying recursively from {src_filename} to {filename}")
235
            for child in Path(src_filename).rglob('*'):
236
                child_dst = Path(filename) / child.relative_to(src_filename)
237
                child_dst.parent.mkdir(parents=True, exist_ok=True)
238
                with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in:
239
                    while True:
240
                        chunk = f_in.read(4096)
241
                        if chunk:
242
                            f_out.write(chunk)
243
                            if progress_cb:
244
                                progress_cb(len(chunk))
245
                        else:
246
                            break
247
        else:
248
            with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
249
                while True:
250
                    chunk = f_in.read(4096)
251
                    if chunk:
252
                        f_out.write(chunk)
253
                        if progress_cb:
254
                            progress_cb(len(chunk))
255
                    else:
256
                        break
257
258
    # TODO Proper caching (make head request for size, If-Modified etc)
259
    def download(
260
        self,
261
        executable,
262
        url,
263
        basedir,
264
        overwrite=False,
265
        no_subdir=False,
266
        name=None,
267
        resource_type='file',
268
        path_in_archive='.',
269
        progress_cb=None,
270
        size=None,
271
    ):
272
        """
273
        Download a resource by URL
274
        """
275
        log = getLogger('ocrd.resource_manager.download')
276
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
277
        if not name:
278
            url_parsed = urlparse(url)
279
            name = Path(unquote(url_parsed.path)).name
280
        fpath = Path(destdir, name)
281
        is_url = url.startswith('https://') or url.startswith('http://')
282
        if fpath.exists() and not overwrite:
283
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
284
            return fpath
285
        destdir.mkdir(parents=True, exist_ok=True)
286
        if resource_type in ('file', 'directory'):
287
            if is_url:
288
                self._download_impl(url, fpath, progress_cb)
289
            else:
290
                self._copy_impl(url, fpath, progress_cb)
291
        elif resource_type == 'archive':
292
            with pushd_popd(tempdir=True):
293
                if is_url:
294
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
295
                else:
296
                    self._copy_impl(url, 'download.tar.xx', progress_cb)
297
                Path('out').mkdir()
298
                with pushd_popd('out'):
299
                    log.info("Extracting archive")
300
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
301
                        tar.extractall()
302
                    log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
303
                    copytree(path_in_archive, str(fpath))
304
        return fpath
305