Passed
Pull Request — master (#1248)
by
unknown
03:14
created

OcrdResourceManager.download()   F

Complexity

Conditions 19

Size

Total Lines 58
Code Lines 49

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 49
dl 0
loc 58
rs 0.5999
c 0
b 0
f 0
cc 19
nop 10

How to fix   Long Method    Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Complexity

Complex classes such as ocrd.resource_manager.OcrdResourceManager, which contains download(), often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
from pathlib import Path
2
from os.path import join
3
from os import environ, listdir, makedirs, getcwd, path, unlink
4
from shutil import copytree, rmtree, copy
5
from fnmatch import filter as apply_glob
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
from zipfile import ZipFile
10
11
import requests
12
from gdown.parse_url import parse_url as gparse_url
13
from gdown.download import get_url_from_gdrive_confirmation
14
from yaml import safe_load, safe_dump
15
16
# https://github.com/OCR-D/core/issues/867
17
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
18
import yaml.constructor
19
# Register the plain-string constructor for the YAML timestamp tag, so the safe
# loader hands back timestamp-looking scalars as str (see the issue linked above).
yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = (
    yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str']
)
21
22
from ocrd_validators import OcrdResourceListValidator
23
from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config
24
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
25
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
26
27
28
class OcrdResourceManager:
29
30
    """
31
    Managing processor resources
32
    """
33
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
        """
        Set up the manager and, unless ``skip_init`` is set, load the resource lists.

        Args:
            userdir: override for the user directory (the ``userdir`` property
                falls back to ``config.HOME`` when unset)
            xdg_config_home: override for the configuration base directory
                (falls back to ``config.XDG_CONFIG_HOME``)
            xdg_data_home: override for the data base directory
                (falls back to ``config.XDG_DATA_HOME``)
            skip_init: when true, no resource list is loaded and the user
                list file is neither created nor read
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        # The user list lives at <xdg_config_home>/ocrd/resources.yml
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        if skip_init:
            return

        # Bundled list first, then the user list (created on first use)
        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)
49
50
    @property
51
    def userdir(self):
52
        if not self._userdir:
53
            self._userdir = config.HOME
54
        return self._userdir
55
56
    @property
57
    def xdg_data_home(self):
58
        if not self._xdg_data_home:
59
            self._xdg_data_home = config.XDG_DATA_HOME
60
        return self._xdg_data_home
61
62
    @property
63
    def xdg_config_home(self):
64
        if self._xdg_config_home:
65
            return self._xdg_config_home
66
        return config.XDG_CONFIG_HOME
67
68
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file (``self.user_list``).

        The file consists of ``RESOURCE_USER_LIST_COMMENT``, a newline, and the
        YAML dump of ``database``. An existing file is overwritten.

        Args:
            database (dict): mapping to serialize. ``None`` (the default)
                means ``self.database``. An explicitly passed empty dict is
                written as-is rather than being replaced by ``self.database``.
        """
        # Test for None, not truthiness: an empty dict is a legitimate database
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
75
76
    def load_resource_list(self, list_filename, database=None):
77
        if not database:
78
            database = self.database
79
        if list_filename.is_file():
80
            with open(list_filename, 'r', encoding='utf-8') as f:
81
                list_loaded = safe_load(f) or {}
82
            report = OcrdResourceListValidator.validate(list_loaded)
83
            if not report.is_valid:
84
                self.log.error('\n'.join(report.errors))
85
                raise ValueError(f"Resource list {list_filename} is invalid!")
86
            for executable, resource_list in list_loaded.items():
87
                if executable not in database:
88
                    database[executable] = []
89
                # Prepend, so user provided is sorted before builtin
90
                database[executable] = list_loaded[executable] + database[executable]
91
        return database
92
93
    def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
94
        """
95
        List models available for download by processor
96
        """
97
        if not database:
98
            database = self.database
99
        if not executable:
100
            return database.items()
101
        if dynamic:
102
            for exec_dir in environ['PATH'].split(':'):
103
                for exec_path in Path(exec_dir).glob(f'{executable}'):
104
                    self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
105
                    ocrd_tool = get_ocrd_tool_json(exec_path)
106
                    for resdict in ocrd_tool.get('resources', ()):
107
                        if exec_path.name not in database:
108
                            database[exec_path.name] = []
109
                        database[exec_path.name].insert(0, resdict)
110
            database = self._dedup_database(database)
111
        found = False
112
        ret = []
113
        for k in database:
114
            if apply_glob([k], executable):
115
                found = True
116
                restuple = (k, [])
117
                ret.append(restuple)
118
                for resdict in database[k]:
119
                    if name and resdict['name'] != name:
120
                        continue
121
                    if url and resdict['url'] != url:
122
                        continue
123
                    restuple[1].append(resdict)
124
        if not found:
125
            ret = [(executable, [])]
126
        return ret
127
128
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Without ``executable``, every executable known to ``self.database``
        plus every ``ocrd-*`` entry below the ``ocrd-resources`` directories
        (in ``xdg_data_home`` and ``/usr/local/share``) is inspected.

        Resources that are neither registered nor located in the module
        directory get a stub entry via ``add_to_user_database``. Each
        returned resource dict has its ``path`` key set to the file found.

        Returns:
            list of ``(executable_name, [resource_dict, ...])`` tuples
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            for base_dir in (self.xdg_data_home, '/usr/local/share'):
                parent_dir = join(base_dir, 'ocrd-resources')
                if Path(parent_dir).exists():
                    candidates.extend(x for x in listdir(parent_dir) if x.startswith('ocrd-'))

        installed = []
        for this_executable in set(candidates):
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            entries = []
            for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                # Unless the processor accepts anything, skip entries of the wrong kind
                if '*/*' not in mimetypes:
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    if res_filename.is_file() and mimetypes == ['text/directory']:
                        continue
                res_name = res_filename.name
                if res_filename.is_file():
                    res_type = 'file'
                    res_size = res_filename.stat().st_size
                else:
                    res_type = 'directory'
                    res_size = directory_size(res_filename)
                registered = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if registered:
                    resdict = registered[0]
                elif str(res_filename.parent) == moduledir:
                    resdict = {
                        'name': res_name,
                        'url': str(res_filename),
                        'description': 'Found at module',
                        'type': res_type,
                        'size': res_size
                    }
                else:
                    resdict = self.add_to_user_database(this_executable, res_filename, resource_type=res_type)
                resdict['path'] = str(res_filename)
                entries.append(resdict)
            installed.append((this_executable, entries))
        return installed
174
175
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list file is re-read, a stub for the resource is appended
        under ``executable`` unless an entry of the same name is already
        found there, and the file is written back and reloaded into
        ``self.database``.

        Args:
            executable (str): name of the processor the resource belongs to
            res_filename (str or pathlib.Path): location of the resource
            url (str): URL to record in the stub; ``'???'`` if not given
            resource_type (str): value for the stub's ``type`` field

        Returns:
            dict: the newly created stub, or the already existing entry
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
                      f"creating stub in {self.user_list}")
        if res_path.is_dir():
            res_size = directory_size(res_filename)
        else:
            res_size = res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            # An empty file loads as None
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        # NOTE(review): list_available defaults to dynamic=True, which may also insert
        # resources reported by the executable into user_database - confirm this is intended
        resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            resdict = resources_found[0]
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
206
207
    @property
208
    def default_resource_dir(self):
209
        return self.location_to_resource_dir('data')
210
211
    def location_to_resource_dir(self, location):
212
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
213
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
214
                getcwd()
215
216
    def resource_dir_to_location(self, resource_path):
217
        resource_path = str(resource_path)
218
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
219
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
220
               'cwd' if resource_path.startswith(getcwd()) else \
221
               resource_path
222
223
    @staticmethod
224
    def parameter_usage(name, usage='as-is'):
225
        if usage == 'as-is':
226
            return name
227
        elif usage == 'without-extension':
228
            return Path(name).stem
229
        raise ValueError(f"No such usage '{usage}'")
230
231
    @staticmethod
    def _download_impl(url, filename, progress_cb=None, size=None):
        """
        Stream the content behind ``url`` into the file ``filename``.

        For Google Drive URLs the download link is unwrapped first. If that
        fails with a ``RuntimeError``, a warning is logged and the download
        is attempted with the URL as it stands.

        Args:
            url (str): HTTP(S) URL to fetch
            filename (str or pathlib.Path): file to write; it is only opened
                once the server has answered with a non-error status
            progress_cb (callable): if given, called with the byte length of
                every chunk received
            size: accepted for interface compatibility; not used here

        Raises:
            requests.HTTPError: if the final request returns an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info(f"Downloading {url} to {filename}")
        gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
        if gdrive_file_id:
            if not is_gdrive_download_link:
                url = f"https://drive.google.com/uc?id={gdrive_file_id}"
            try:
                with requests.get(url, stream=True) as r:
                    # No Content-Disposition header: not the file itself but a confirmation page
                    if "Content-Disposition" not in r.headers:
                        url = get_url_from_gdrive_confirmation(r.text)
            except RuntimeError as e:
                # Exception goes in as a lazy %-argument; without a placeholder
                # the logging module reports a formatting error instead
                log.warning("Cannot unwrap Google Drive URL: %s", e)
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # Open the target only after the status check, so a failed request
            # does not leave an empty file (or truncate an existing one)
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
252
253
    @staticmethod
    def _copy_file(src, dst, progress_cb=None):
        """
        Copy the file ``src`` to ``dst`` in 4096-byte chunks.

        If ``progress_cb`` is given, it is called with the byte length of
        every chunk after that chunk has been written.
        """
        log = getLogger('ocrd.resource_manager._copy_file')
        log.info(f"Copying file {src} to {dst}")
        with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
            # read() returns b'' at end of file, which ends the iteration
            for chunk in iter(lambda: f_in.read(4096), b''):
                f_out.write(chunk)
                if progress_cb:
                    progress_cb(len(chunk))
266
267
    @staticmethod
    def _copy_dir(src, dst, progress_cb=None):
        """
        Copy the directory ``src`` recursively to ``dst``.

        ``dst`` is created if necessary. Every file is copied exactly once,
        so ``progress_cb`` (if given) sees each byte only once.

        Raises:
            ValueError: if ``src`` is not a directory
        """
        log = getLogger('ocrd.resource_manager._copy_dir')
        log.info(f"Copying dir recursively from {src} to {dst}")
        if not Path(src).is_dir():
            raise ValueError(f"The source is not a directory: {src}")
        makedirs(name=dst, exist_ok=True)
        # Only direct children here: the recursive call descends into
        # subdirectories. Walking the whole tree (rglob) *and* recursing
        # would copy nested files once per ancestor level.
        for child in Path(src).iterdir():
            child_dst = Path(dst) / child.name
            if child.is_dir():
                OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
            else:
                OcrdResourceManager._copy_file(child, child_dst, progress_cb)
281
282
    @staticmethod
    def _copy_impl(src_filename, filename, progress_cb=None):
        """
        Copy ``src_filename`` to ``filename``, as a directory tree or as a single file.

        ``progress_cb`` is handed through to the copy routine that is used.
        """
        log = getLogger('ocrd.resource_manager._copy_impl')
        log.info(f"Copying {src_filename} to {filename}")
        if Path(src_filename).is_dir():
            copier = OcrdResourceManager._copy_dir
        else:
            copier = OcrdResourceManager._copy_file
        copier(src_filename, filename, progress_cb)
290
291
    # TODO Proper caching (make head request for size, If-Modified etc)
292
    def download(
        self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
        path_in_archive='.', progress_cb=None,
    ):
        """
        Download a resource by URL

        Args:
            executable (str): processor name; used as subdirectory of
                ``basedir`` unless ``no_subdir`` is set
            url (str): ``http://`` / ``https://`` URL to download, or
                otherwise a local path to copy from
            basedir (str or pathlib.Path): base of the target directory
            overwrite (bool): remove an already existing target first
                instead of raising
            no_subdir (bool): place the resource directly in ``basedir``
            name (str): target name; defaults to the last path segment of ``url``
            resource_type (str): ``'file'`` or ``'directory'`` to fetch/copy
                the resource as-is, ``'archive'`` to fetch and unpack a
                zip/gzip/xz archive and copy ``path_in_archive`` out of it
            path_in_archive (str): path inside the unpacked archive to install
            progress_cb (callable): handed through to the download/copy routines

        Returns:
            pathlib.Path: the target path (``basedir[/executable]/name``)

        Raises:
            FileExistsError: if the target exists and ``overwrite`` is not set
            RuntimeError: if the archive's media type is not supported
        """
        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            url_parsed = urlparse(url)
            name = Path(unquote(url_parsed.path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith(('https://', 'http://'))
        if fpath.exists():
            if not overwrite:
                fpath_type = 'Directory' if fpath.is_dir() else 'File'
                raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set")
            if fpath.is_dir():
                log.info(f"Removing existing target directory {fpath}")
                rmtree(str(fpath))
            else:
                log.info(f"Removing existing target file {fpath}")
                unlink(str(fpath))
        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type in ('file', 'directory'):
            if is_url:
                self._download_impl(url, fpath, progress_cb)
            else:
                self._copy_impl(url, fpath, progress_cb)
        elif resource_type == 'archive':
            archive_fname = 'download.tar.xx'
            # The steps below run with the temporary directory (and then its
            # 'out' subdirectory) as working directory - the archive is
            # addressed as '../download.tar.xx'. Relative local paths must
            # therefore be made absolute before entering it.
            source = url if is_url else str(Path(url).absolute())
            target = fpath.absolute()
            with pushd_popd(tempdir=True) as tempdir:
                if is_url:
                    self._download_impl(source, archive_fname, progress_cb)
                else:
                    self._copy_impl(source, archive_fname, progress_cb)
                Path('out').mkdir()
                with pushd_popd('out'):
                    mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
                    log.info(f"Extracting {mimetype} archive to {tempdir}/out")
                    # NOTE(security): extractall() trusts the member paths stored in the
                    # archive; an archive from an untrusted source could write outside 'out'.
                    if mimetype == 'application/zip':
                        with ZipFile(f'../{archive_fname}', 'r') as zipf:
                            zipf.extractall()
                    elif mimetype in ('application/gzip', 'application/x-xz'):
                        with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
                            tar.extractall()
                    else:
                        raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
                    log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
                    if Path(path_in_archive).is_dir():
                        copytree(path_in_archive, str(target))
                    else:
                        copy(path_in_archive, str(target))
            if Path(tempdir).exists():
                log.info(f"Removing temp dir {tempdir}")
                rmtree(tempdir)
        return fpath
350
351
    def _dedup_database(self, database=None, dedup_key='name'):
352
        """
353
        Deduplicate resources by name
354
        """
355
        if not database:
356
            database = self.database
357
        for executable, reslist in database.items():
358
            reslist_dedup = []
359
            for resdict in reslist:
360
                if not any(r[dedup_key] == resdict[dedup_key] for r in reslist_dedup):
361
                    reslist_dedup.append(resdict)
362
            database[executable] = reslist_dedup
363
        return database
364