Passed
Pull Request — master (#800)
by Konstantin
02:41
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 46
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 46
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from shutil import copytree
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
from subprocess import run, PIPE
10
11
import requests
12
from yaml import safe_load, safe_dump
13
14
from ocrd_validators import OcrdResourceListValidator
15
from ocrd_utils import getLogger, directory_size
16
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
17
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
18
19
class OcrdResourceManager():
20
21
    """
22
    Managing processor resources
23
    """
24
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Set up the resource registry.

        Loads the resource list at ``RESOURCE_LIST_FILENAME`` first, then the
        per-user list ``<xdg_config_home>/ocrd/resources.yml`` (which is
        created, together with its parent directory, if it does not exist yet).

        Args:
            userdir (str): override for the user's home directory
            xdg_config_home (str): override for the XDG config directory
            xdg_data_home (str): override for the XDG data directory
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        # Explicit overrides; the like-named properties fall back to the
        # environment whenever one of these is falsy.
        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        self.user_list = Path(self.xdg_config_home) / 'ocrd' / 'resources.yml'

        builtin_list = Path(RESOURCE_LIST_FILENAME)
        self.load_resource_list(builtin_list)
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)
39
40
    @property
41
    def userdir(self):
42
        if not self._userdir:
43
            self._userdir = path.expanduser('~')
44
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
45
                self._userdir = environ['HOME']
46
        return self._userdir
47
48
    @property
49
    def xdg_data_home(self):
50
        if not self._xdg_data_home:
51
            if 'XDG_DATA_HOME' in environ:
52
                self._xdg_data_home = environ['XDG_DATA_HOME']
53
            else:
54
                self._xdg_data_home = join(self.userdir, '.local', 'share')
55
        return self._xdg_data_home
56
57
    @property
58
    def xdg_config_home(self):
59
        if not self._xdg_config_home:
60
            if 'XDG_CONFIG_HOME' in environ:
61
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
62
            else:
63
                self._xdg_config_home = join(self.userdir, '.config')
64
        return self._xdg_config_home
65
66
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file.

        The file at ``self.user_list`` is overwritten with
        ``RESOURCE_USER_LIST_COMMENT``, a newline and the YAML dump of the
        database.

        Args:
            database (dict): database to write; ``self.database`` when omitted
        """
        # Compare against None explicitly: an empty dict is a legitimate
        # database to save and must not be swapped for self.database.
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
73
74
    def load_resource_list(self, list_filename, database=None):
75
        if not database:
76
            database = self.database
77
        if list_filename.is_file():
78
            with open(list_filename, 'r', encoding='utf-8') as f:
79
                list_loaded = safe_load(f) or {}
80
            report = OcrdResourceListValidator.validate(list_loaded)
81
            if not report.is_valid:
82
                self.log.error('\n'.join(report.errors))
83
                raise ValueError("Resource list %s is invalid!" % (list_filename))
84
            for executable, resource_list in list_loaded.items():
85
                if executable not in database:
86
                    database[executable] = []
87
                # Prepend, so user provided is sorted before builtin
88
                database[executable] = list_loaded[executable] + database[executable]
89
        return database
90
91
    def list_available(self, executable=None, dynamic=True):
92
        """
93
        List models available for download by processor
94
        """
95
        if dynamic:
96
            for exec_dir in environ['PATH'].split(':'):
97
                for exec_path in Path(exec_dir).glob(f'{executable}*'):
98
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
99
                    ocrd_tool = get_ocrd_tool_json(exec_path)
100
                    for resdict in ocrd_tool.get('resources', ()):
101
                        if not any(x['name'] == resdict['name'] for x in self.database.get(executable, [])):
102
                            self.database[exec_path.name].append(resdict)
103
        if executable:
104
            ret = []
105
            for k in self.database:
106
                if k.startswith(executable):
107
                    ret.append((k, self.database[k]))
108
            return ret
109
        return self.database.items()
110
111
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        When no executable is given, all executables in the database are
        considered, plus every ``ocrd-*`` entry found in the ``ocrd-resources``
        directory below ``xdg_data_home`` and below ``/usr/local/share``.
        Resources found on disk but missing from the database get a stub entry
        in the user list. Every returned resource dict has its ``path`` set.

        Returns:
            list: ``(executable, resources)`` tuples
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            for base_dir in (self.xdg_data_home, '/usr/local/share'):
                resource_root = join(base_dir, 'ocrd-resources')
                if Path(resource_root).exists():
                    candidates.extend(entry for entry in listdir(resource_root) if entry.startswith('ocrd-'))
        installed = []
        for this_executable in set(candidates):
            found = []
            mimetypes = get_processor_resource_types(this_executable)
            for res_filename in map(Path, list_all_resources(this_executable, xdg_data_home=self.xdg_data_home)):
                if '*/*' not in mimetypes:
                    # directories only count if the processor accepts them ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and files do not count if it accepts nothing but directories
                    if res_filename.is_file() and ['text/directory'] == mimetypes:
                        continue
                res_name = res_filename.name
                matches = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if matches:
                    resdict = matches[0]
                else:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, res_name, str(res_filename), self.user_list)
                    resdict = self.add_to_user_database(this_executable, res_filename)
                resdict['path'] = str(res_filename)
                found.append(resdict)
            installed.append((this_executable, found))
        return installed
145
146
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list is re-read from disk; if it has no resource named like
        ``res_filename`` for ``executable``, a stub with placeholder metadata
        is appended. The list is then written back and loaded into the
        database again.

        Returns:
            dict: the existing or newly created resource entry
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if resources_found:
            # entries are (executable, resdict) tuples
            resdict = resources_found[0][1]
        else:
            resdict = {
                'name': res_name,
                'url': url or '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
175
176
    def find_resources(self, executable=None, name=None, url=None, database=None):
177
        """
178
        Find resources in the registry
179
        """
180
        if not database:
181
            database = self.database
182
        ret = []
183
        if executable and executable not in database.keys():
184
            return ret
185
        for executable in [executable] if executable else database.keys():
186
            for resdict in database[executable]:
187
                if not name and not url:
188
                    ret.append((executable, resdict))
189
                elif url and url == resdict['url']:
190
                    ret.append((executable, resdict))
191
                elif name and name == resdict['name']:
192
                    ret.append((executable, resdict))
193
        return ret
194
195
    @property
196
    def default_resource_dir(self):
197
        return self.location_to_resource_dir('data')
198
199
    def location_to_resource_dir(self, location):
200
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
201
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
202
                getcwd()
203
204
    def resource_dir_to_location(self, resource_path):
205
        resource_path = str(resource_path)
206
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
207
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
208
               'cwd' if resource_path.startswith(getcwd()) else \
209
               resource_path
210
211
    def parameter_usage(self, name, usage='as-is'):
212
        if usage == 'as-is':
213
            return name
214
        elif usage == 'without-extension':
215
            return Path(name).stem
216
        raise ValueError("No such usage '%s'" % usage)
217
218
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into ``filename``.

        Args:
            url (str): URL to fetch with ``requests``
            filename (str|Path): file to (over)write
            progress_cb (callable): called with the byte length of every chunk
                before it is written
            size (int): currently unused; kept for interface compatibility

        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        with requests.get(url, stream=True) as r:
            # Fail before touching the destination, so that an error page is
            # never stored as if it were the resource.
            r.raise_for_status()
            try:
                with open(filename, 'wb') as f:
                    for data in r.iter_content(chunk_size=4096):
                        if progress_cb:
                            progress_cb(len(data))
                        f.write(data)
            except BaseException:
                # Do not leave a truncated file behind: callers that test for
                # existence would take it for a finished download. This
                # deliberately covers KeyboardInterrupt too; always re-raised.
                if Path(filename).is_file():
                    Path(filename).unlink()
                raise
228
229
    def _copy_impl(self, src_filename, filename, progress_cb=None):
230
        log = getLogger('ocrd.resource_manager._copy_impl')
231
        log.info("Copying %s to %s", src_filename, filename)
232
        if Path(src_filename).is_dir():
233
            log.info(f"Copying recursively from {src_filename} to {filename}")
234
            for child in Path(src_filename).rglob('*'):
235
                child_dst = Path(filename) / child.relative_to(src_filename)
236
                child_dst.parent.mkdir(parents=True, exist_ok=True)
237
                with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in:
238
                    while True:
239
                        chunk = f_in.read(4096)
240
                        if chunk:
241
                            f_out.write(chunk)
242
                            if progress_cb:
243
                                progress_cb(len(chunk))
244
                        else:
245
                            break
246
        else:
247
            with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
248
                while True:
249
                    chunk = f_in.read(4096)
250
                    if chunk:
251
                        f_out.write(chunk)
252
                        if progress_cb:
253
                            progress_cb(len(chunk))
254
                    else:
255
                        break
256
257
    # TODO Proper caching (make head request for size, If-Modified etc)
258
    def download(
259
        self,
260
        executable,
261
        url,
262
        basedir,
263
        overwrite=False,
264
        no_subdir=False,
265
        name=None,
266
        resource_type='file',
267
        path_in_archive='.',
268
        progress_cb=None,
269
        size=None,
270
    ):
271
        """
272
        Download a resource by URL
273
        """
274
        log = getLogger('ocrd.resource_manager.download')
275
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
276
        if not name:
277
            url_parsed = urlparse(url)
278
            name = Path(unquote(url_parsed.path)).name
279
        fpath = Path(destdir, name)
280
        is_url = url.startswith('https://') or url.startswith('http://')
281
        if fpath.exists() and not overwrite:
282
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
283
            return fpath
284
        destdir.mkdir(parents=True, exist_ok=True)
285
        if resource_type in ('file', 'directory'):
286
            if is_url:
287
                self._download_impl(url, fpath, progress_cb)
288
            else:
289
                self._copy_impl(url, fpath, progress_cb)
290
        elif resource_type == 'archive':
291
            with pushd_popd(tempdir=True):
292
                if is_url:
293
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
294
                else:
295
                    self._copy_impl(url, 'download.tar.xx', progress_cb)
296
                Path('out').mkdir()
297
                with pushd_popd('out'):
298
                    log.info("Extracting archive")
299
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
300
                        tar.extractall()
301
                    log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
302
                    copytree(path_in_archive, str(fpath))
303
        return fpath
304