Passed
Pull Request — master (#800)
by Konstantin
02:46
created

OcrdResourceManager.download()   D

Complexity

Conditions 13

Size

Total Lines 46
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 40
dl 0
loc 46
rs 4.2
c 0
b 0
f 0
cc 13
nop 11

How to fix   Complexity    Many Parameters   

Complexity

Complex code units like ocrd.resource_manager.OcrdResourceManager.download() (here a method) often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

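There are several approaches to avoid long parameter lists, for example grouping related parameters into a parameter object, passing a whole object instead of several of its values, or letting the method obtain a value itself instead of receiving it as a parameter.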

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from json.decoder import JSONDecodeError
5
from os import environ, listdir, getcwd, path
6
from fnmatch import filter as apply_glob
7
from shutil import copytree
8
from datetime import datetime
9
from tarfile import open as open_tarfile
10
from urllib.parse import urlparse, unquote
11
from subprocess import run, PIPE
12
13
import requests
14
from yaml import safe_load, safe_dump
15
16
from ocrd_validators import OcrdResourceListValidator
17
from ocrd_utils import getLogger, directory_size
18
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
19
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
20
21
class OcrdResourceManager():
22
23
    """
24
    Managing processor resources
25
    """
26
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Set up the resource registry from the bundled and the per-user list.

        Args:
            userdir (str): home directory override, see ``userdir``
            xdg_config_home (str): config directory override, see ``xdg_config_home``
            xdg_data_home (str): data directory override, see ``xdg_data_home``

        If the user list (``<xdg_config_home>/ocrd/resources.yml``) does not
        exist yet, it is created (including its parent directory) before
        being loaded.
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        # Overrides must be in place before the properties are first read
        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        # Bundled list first, user list second
        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)
41
42
    @property
43
    def userdir(self):
44
        if not self._userdir:
45
            self._userdir = path.expanduser('~')
46
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
47
                self._userdir = environ['HOME']
48
        return self._userdir
49
50
    @property
51
    def xdg_data_home(self):
52
        if not self._xdg_data_home:
53
            if 'XDG_DATA_HOME' in environ:
54
                self._xdg_data_home = environ['XDG_DATA_HOME']
55
            else:
56
                self._xdg_data_home = join(self.userdir, '.local', 'share')
57
        return self._xdg_data_home
58
59
    @property
60
    def xdg_config_home(self):
61
        if not self._xdg_config_home:
62
            if 'XDG_CONFIG_HOME' in environ:
63
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
64
            else:
65
                self._xdg_config_home = join(self.userdir, '.config')
66
        return self._xdg_config_home
67
68
    def save_user_list(self, database=None):
        """
        Write a resource registry to the user resource list ``self.user_list``.

        The file consists of ``RESOURCE_USER_LIST_COMMENT``, a newline and the
        YAML dump of the registry; an existing file is overwritten.

        Args:
            database (dict): registry to serialize. Only when omitted
                (``None``) is ``self.database`` written; an explicitly
                passed empty registry is written as such.
        """
        # ``is None`` rather than truthiness: ``{}`` is a legitimate registry
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
75
76
    def load_resource_list(self, list_filename, database=None):
77
        if not database:
78
            database = self.database
79
        if list_filename.is_file():
80
            with open(list_filename, 'r', encoding='utf-8') as f:
81
                list_loaded = safe_load(f) or {}
82
            report = OcrdResourceListValidator.validate(list_loaded)
83
            if not report.is_valid:
84
                self.log.error('\n'.join(report.errors))
85
                raise ValueError("Resource list %s is invalid!" % (list_filename))
86
            for executable, resource_list in list_loaded.items():
87
                if executable not in database:
88
                    database[executable] = []
89
                # Prepend, so user provided is sorted before builtin
90
                database[executable] = list_loaded[executable] + database[executable]
91
        return database
92
93
    def list_available(self, executable=None, dynamic=True):
94
        """
95
        List models available for download by processor
96
        """
97
        if not executable:
98
            return self.database.items()
99
        if dynamic:
100
            for exec_dir in environ['PATH'].split(':'):
101
                for exec_path in Path(exec_dir).glob(f'{executable}'):
102
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
103
                    try:
104
                        ocrd_tool = get_ocrd_tool_json(exec_path)
105
                    except JSONDecodeError:
106
                        self.log.info(f"Failed to parse {exec_path} --dump-json output - not an OCR-D processor?")
107
                    for resdict in ocrd_tool.get('resources', ()):
108
                        for res_remove in (res for res in self.database.get(executable, []) if res['name'] == resdict['name']):
109
                            self.database.get(executable).remove(res_remove)
110
                        self.database[exec_path.name].append(resdict)
111
        ret = []
112
        for k in self.database:
113
            if apply_glob([k], executable):
114
                ret.append((k, self.database[k]))
115
        return ret
116
117
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Args:
            executable (str): restrict the listing to this executable. If
                empty, all executables in the registry are considered, plus
                every ``ocrd-*`` entry found in the ``ocrd-resources``
                directory below ``xdg_data_home`` and ``/usr/local/share``.

        Returns:
            list: ``(executable, resources)`` tuples; every resource dict
            carries the location it was found at under ``path``. Resources
            not in the registry are added to the user list as stubs.
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            for base_dir in (self.xdg_data_home, '/usr/local/share'):
                parent_dir = join(base_dir, 'ocrd-resources')
                if Path(parent_dir).exists():
                    candidates.extend(x for x in listdir(parent_dir) if x.startswith('ocrd-'))
        installed = []
        for this_executable in set(candidates):
            mimetypes = get_processor_resource_types(this_executable)
            found = []
            for res_filename in list_all_resources(this_executable, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if '*/*' not in mimetypes:
                    # skip directories unless the processor accepts them ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and files if it accepts nothing but directories
                    if res_filename.is_file() and mimetypes == ['text/directory']:
                        continue
                res_name = res_filename.name
                registered = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if registered:
                    entry = registered[0]
                else:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, res_name, str(res_filename), self.user_list)
                    entry = self.add_to_user_database(this_executable, res_filename)
                entry['path'] = str(res_filename)
                found.append(entry)
            installed.append((this_executable, found))
        return installed
151
152
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        Args:
            executable (str): processor executable the resource belongs to
            res_filename: path of the resource in the file system; its
                basename becomes the resource ``name``
            url (str): URL to record, ``'???'`` if not given
            resource_type (str): value for the ``type`` field of the stub

        Returns:
            dict: the entry of that name in the user list, which is the new
            stub unless an entry of that name already existed.
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        matches = self.find_resources(executable=executable, name=res_name, database=user_database)
        if matches:
            resdict = matches[0][1]
        else:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        self.save_user_list(user_database)
        # NOTE(review): load_resource_list prepends the loaded entries to the
        # ones already in self.database, so reloading the user list here
        # presumably duplicates previously loaded user entries - TODO confirm
        self.load_resource_list(self.user_list)
        return resdict
181
182
    def find_resources(self, executable=None, name=None, url=None, database=None):
183
        """
184
        Find resources in the registry
185
        """
186
        if not database:
187
            database = self.database
188
        ret = []
189
        if executable and executable not in database.keys():
190
            return ret
191
        for executable in [executable] if executable else database.keys():
192
            for resdict in database[executable]:
193
                if not name and not url:
194
                    ret.append((executable, resdict))
195
                elif url and url == resdict['url']:
196
                    ret.append((executable, resdict))
197
                elif name and name == resdict['name']:
198
                    ret.append((executable, resdict))
199
        return ret
200
201
    @property
202
    def default_resource_dir(self):
203
        return self.location_to_resource_dir('data')
204
205
    def location_to_resource_dir(self, location):
206
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
207
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
208
                getcwd()
209
210
    def resource_dir_to_location(self, resource_path):
211
        resource_path = str(resource_path)
212
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
213
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
214
               'cwd' if resource_path.startswith(getcwd()) else \
215
               resource_path
216
217
    def parameter_usage(self, name, usage='as-is'):
        """
        Convert a resource name to the form used as a parameter value.

        Args:
            name (str): resource name
            usage (str): ``'as-is'`` returns ``name`` unchanged,
                ``'without-extension'`` returns it without its last suffix

        Raises:
            ValueError: for any other ``usage``
        """
        if usage == 'without-extension':
            return Path(name).stem
        if usage != 'as-is':
            raise ValueError("No such usage '%s'" % usage)
        return name
223
224
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into the file ``filename``.

        Args:
            url (str): HTTP(S) URL to fetch
            filename: path of the file to write (overwritten if it exists)
            progress_cb (callable): if given, called with the length of every
                chunk received
            size (int): expected size; accepted for interface compatibility,
                not needed for the transfer itself

        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        with requests.get(url, stream=True) as r:
            # Fail on 4xx/5xx instead of storing the error page as the resource
            r.raise_for_status()
            # Open the target only once the request succeeded, so a failed
            # request does not leave an empty file behind. The content-length
            # header is deliberately not consulted: it may be absent.
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
234
235
    def _copy_impl(self, src_filename, filename, progress_cb=None):
236
        log = getLogger('ocrd.resource_manager._copy_impl')
237
        log.info("Copying %s to %s", src_filename, filename)
238
        if Path(src_filename).is_dir():
239
            log.info(f"Copying recursively from {src_filename} to {filename}")
240
            for child in Path(src_filename).rglob('*'):
241
                child_dst = Path(filename) / child.relative_to(src_filename)
242
                child_dst.parent.mkdir(parents=True, exist_ok=True)
243
                with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in:
244
                    while True:
245
                        chunk = f_in.read(4096)
246
                        if chunk:
247
                            f_out.write(chunk)
248
                            if progress_cb:
249
                                progress_cb(len(chunk))
250
                        else:
251
                            break
252
        else:
253
            with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
254
                while True:
255
                    chunk = f_in.read(4096)
256
                    if chunk:
257
                        f_out.write(chunk)
258
                        if progress_cb:
259
                            progress_cb(len(chunk))
260
                    else:
261
                        break
262
263
    # TODO Proper caching (make head request for size, If-Modified etc)
264
    def download(
265
        self,
266
        executable,
267
        url,
268
        basedir,
269
        overwrite=False,
270
        no_subdir=False,
271
        name=None,
272
        resource_type='file',
273
        path_in_archive='.',
274
        progress_cb=None,
275
        size=None,
276
    ):
277
        """
278
        Download a resource by URL
279
        """
280
        log = getLogger('ocrd.resource_manager.download')
281
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
282
        if not name:
283
            url_parsed = urlparse(url)
284
            name = Path(unquote(url_parsed.path)).name
285
        fpath = Path(destdir, name)
286
        is_url = url.startswith('https://') or url.startswith('http://')
287
        if fpath.exists() and not overwrite:
288
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
289
            return fpath
290
        destdir.mkdir(parents=True, exist_ok=True)
291
        if resource_type in ('file', 'directory'):
292
            if is_url:
293
                self._download_impl(url, fpath, progress_cb)
294
            else:
295
                self._copy_impl(url, fpath, progress_cb)
296
        elif resource_type == 'archive':
297
            with pushd_popd(tempdir=True) as tempdir:
298
                if is_url:
299
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
300
                else:
301
                    self._copy_impl(url, 'download.tar.xx', progress_cb)
302
                Path('out').mkdir()
303
                with pushd_popd('out'):
304
                    log.info("Extracting archive to %s/out" % tempdir)
305
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
306
                        tar.extractall()
307
                    log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
308
                    copytree(path_in_archive, str(fpath))
309
        return fpath
310