Passed
Push — master ( 648be5...719bbc )
by Konstantin
02:45
created

OcrdResourceManager.download()   F

Complexity

Conditions 18

Size

Total Lines 57
Code Lines 47

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 47
dl 0
loc 57
rs 1.2
c 0
b 0
f 0
cc 18
nop 10

How to fix: Long Method · Complexity · Many Parameters

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

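Commonly applied refactorings include:

- Extract Method
- If many parameters or temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object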

Complexity

Complex code like ocrd.resource_manager.OcrdResourceManager.download() often does a lot of different things. To break it down, we need to identify a cohesive component within the enclosing class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

- Replace Parameter with Method Call
- Preserve Whole Object
- Introduce Parameter Object

1
from pathlib import Path
2
from os.path import join
3
from os import environ, listdir, makedirs, getcwd, path, unlink
4
from shutil import copytree, rmtree, copy
5
from fnmatch import filter as apply_glob
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
from zipfile import ZipFile
10
11
import requests
12
from gdown.parse_url import parse_url as gparse_url
13
from gdown.download import get_url_from_gdrive_confirmation
14
from yaml import safe_load, safe_dump
15
16
# https://github.com/OCR-D/core/issues/867
17
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
18
import yaml.constructor
19
# Register the plain-string constructor for the YAML timestamp tag as well, so
# that timestamp-tagged scalars are handled by the same constructor as strings.
yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = (
    yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str']
)
21
22
from ocrd_validators import OcrdResourceListValidator
23
from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config
24
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
25
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
26
27
28
class OcrdResourceManager:
29
30
    """
31
    Managing processor resources
32
    """
33
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
34
        self.log = getLogger('ocrd.resource_manager')
35
        self.database = {}
36
37
        self._xdg_data_home = xdg_data_home
38
        self._xdg_config_home = xdg_config_home
39
        self._userdir = userdir
40
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')
41
42
        if not skip_init:
43
            self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
44
            if not self.user_list.exists():
45
                if not self.user_list.parent.exists():
46
                    self.user_list.parent.mkdir(parents=True)
47
                self.save_user_list()
48
            self.load_resource_list(self.user_list)
49
50
    @property
51
    def userdir(self):
52
        if not self._userdir:
53
            self._userdir = config.HOME
54
        return self._userdir
55
56
    @property
57
    def xdg_data_home(self):
58
        if not self._xdg_data_home:
59
            self._xdg_data_home = config.XDG_DATA_HOME
60
        return self._xdg_data_home
61
62
    @property
63
    def xdg_config_home(self):
64
        if self._xdg_config_home:
65
            return self._xdg_config_home
66
        return config.XDG_CONFIG_HOME
67
68
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file.

        The file starts with ``RESOURCE_USER_LIST_COMMENT``, followed by the
        YAML dump of the database.

        Args:
            database (dict, optional): database to write; ``self.database``
                when omitted. An explicitly passed empty dict is written as-is.
        """
        if database is None:
            database = self.database
        # Serialize before opening, so a failing dump cannot leave a truncated file behind
        dumped = safe_dump(database)
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(dumped)
75
76
    def load_resource_list(self, list_filename, database=None):
77
        if not database:
78
            database = self.database
79
        if list_filename.is_file():
80
            with open(list_filename, 'r', encoding='utf-8') as f:
81
                list_loaded = safe_load(f) or {}
82
            report = OcrdResourceListValidator.validate(list_loaded)
83
            if not report.is_valid:
84
                self.log.error('\n'.join(report.errors))
85
                raise ValueError(f"Resource list {list_filename} is invalid!")
86
            for executable, resource_list in list_loaded.items():
87
                if executable not in database:
88
                    database[executable] = []
89
                # Prepend, so user provided is sorted before builtin
90
                database[executable] = list_loaded[executable] + database[executable]
91
        return database
92
93
    def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
94
        """
95
        List models available for download by processor
96
        """
97
        if not database:
98
            database = self.database
99
        if not executable:
100
            return database.items()
101
        if dynamic:
102
            skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
103
            for exec_dir in environ['PATH'].split(':'):
104
                for exec_path in Path(exec_dir).glob(f'{executable}'):
105
                    if not exec_path.name.startswith('ocrd-'):
106
                        self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
107
                    if exec_path.name in skip_executables:
108
                        self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
109
                        continue
110
                    self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
111
                    ocrd_tool = get_ocrd_tool_json(exec_path)
112
                    for resdict in ocrd_tool.get('resources', ()):
113
                        if exec_path.name not in database:
114
                            database[exec_path.name] = []
115
                        database[exec_path.name].insert(0, resdict)
116
            database = self._dedup_database(database)
117
        found = False
118
        ret = []
119
        for k in database:
120
            if apply_glob([k], executable):
121
                found = True
122
                restuple = (k, [])
123
                ret.append(restuple)
124
                for resdict in database[k]:
125
                    if name and resdict['name'] != name:
126
                        continue
127
                    if url and resdict['url'] != url:
128
                        continue
129
                    restuple[1].append(resdict)
130
        if not found:
131
            ret = [(executable, [])]
132
        return ret
133
134
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Args:
            executable (str, optional): restrict the listing to this executable.
                Otherwise all executables in ``self.database`` are used, plus every
                ``ocrd-*`` entry found in the ``ocrd-resources`` directories below
                ``self.xdg_data_home`` and ``/usr/local/share``.

        Returns:
            list: ``(executable, [resdict, ...])`` tuples; every resdict carries
            an additional ``path`` key with the location on disk.
        """
        if executable:
            all_executables = [executable]
        else:
            # resources we know about
            all_executables = list(self.database.keys())
            # resources in the file system
            share_dirs = [self.xdg_data_home, '/usr/local/share']
            for parent_dir in [join(share_dir, 'ocrd-resources') for share_dir in share_dirs]:
                if Path(parent_dir).exists():
                    all_executables += [entry for entry in listdir(parent_dir) if entry.startswith('ocrd-')]
        ret = []
        for this_executable in set(all_executables):
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            installed = []
            for found in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_path = Path(found)
                if '*/*' not in mimetypes:
                    # skip directories unless accepted, and files if only directories are accepted
                    if res_path.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    if res_path.is_file() and mimetypes == ['text/directory']:
                        continue
                res_name = res_path.name
                if res_path.is_file():
                    res_type = 'file'
                    res_size = res_path.stat().st_size
                else:
                    res_type = 'directory'
                    res_size = directory_size(res_path)
                known = [entry for entry in self.database.get(this_executable, []) if entry['name'] == res_name]
                if known:
                    resdict = known[0]
                elif str(res_path.parent) == moduledir:
                    resdict = {
                        'name': res_name,
                        'url': str(res_path),
                        'description': 'Found at module',
                        'type': res_type,
                        'size': res_size
                    }
                else:
                    resdict = self.add_to_user_database(this_executable, res_path, resource_type=res_type)
                resdict['path'] = str(res_path)
                installed.append(resdict)
            ret.append((this_executable, installed))
        return ret
180
181
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list is re-read from disk, extended by a stub for
        ``res_filename`` unless an entry of that name is already found for
        ``executable``, written back and re-loaded into ``self.database``.

        Args:
            executable (str): executable the resource belongs to
            res_filename (str or Path): path of the resource on disk
            url (str, optional): URL to record in the stub, ``'???'`` if not given
            resource_type (str): value for the stub's ``type`` field

        Returns:
            dict: the existing or newly created resource entry
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
                      f"creating stub in {self.user_list}")
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        # NOTE(review): list_available() defaults to dynamic=True, which may also insert
        # resources advertised by executables on PATH into user_database before it is
        # saved below - confirm this is intended.
        resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            resdict = resources_found[0]
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
212
213
    @property
214
    def default_resource_dir(self):
215
        return self.location_to_resource_dir('data')
216
217
    def location_to_resource_dir(self, location):
218
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
219
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
220
                getcwd()
221
222
    def resource_dir_to_location(self, resource_path):
223
        resource_path = str(resource_path)
224
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
225
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
226
               'cwd' if resource_path.startswith(getcwd()) else \
227
               resource_path
228
229
    @staticmethod
230
    def parameter_usage(name, usage='as-is'):
231
        if usage == 'as-is':
232
            return name
233
        elif usage == 'without-extension':
234
            return Path(name).stem
235
        raise ValueError(f"No such usage '{usage}'")
236
237
    @staticmethod
    def _download_impl(url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into the local file ``filename``.

        URLs for which ``gdown`` reports a Google Drive file id are first
        rewritten to the ``uc?id=`` form and probed once; if the probe response
        has no ``Content-Disposition`` header, the URL is replaced by the one
        ``gdown`` derives from the response text.

        Args:
            url (str): URL to fetch
            filename (str or Path): file to write the response body to
            progress_cb (callable, optional): called with the length of every
                chunk before it is written
            size: not used by this implementation; kept for backward compatibility

        Raises:
            Exception: any error from the request or from writing the file is
                re-raised after ``filename`` has been removed again
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info(f"Downloading {url} to {filename}")
        try:
            gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
            if gdrive_file_id:
                if not is_gdrive_download_link:
                    url = f"https://drive.google.com/uc?id={gdrive_file_id}"
                try:
                    with requests.get(url, stream=True) as r:
                        if "Content-Disposition" not in r.headers:
                            url = get_url_from_gdrive_confirmation(r.text)
                except RuntimeError as e:
                    # Best effort: keep going with the URL we have
                    log.warning("Cannot unwrap Google Drive URL: %s", e)
            with open(filename, 'wb') as f:
                with requests.get(url, stream=True) as r:
                    r.raise_for_status()
                    for data in r.iter_content(chunk_size=4096):
                        if progress_cb:
                            progress_cb(len(data))
                        f.write(data)
        except Exception:
            # Do not leave a partial download behind, whatever kind of entry it is
            rmtree(filename, ignore_errors=True)
            Path(filename).unlink(missing_ok=True)
            raise
263
264
    @staticmethod
265
    def _copy_file(src, dst, progress_cb=None):
266
        log = getLogger('ocrd.resource_manager._copy_file')
267
        log.info(f"Copying file {src} to {dst}")
268
        with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
269
            while True:
270
                chunk = f_in.read(4096)
271
                if chunk:
272
                    f_out.write(chunk)
273
                    if progress_cb:
274
                        progress_cb(len(chunk))
275
                else:
276
                    break
277
278
    @staticmethod
279
    def _copy_dir(src, dst, progress_cb=None):
280
        log = getLogger('ocrd.resource_manager._copy_dir')
281
        log.info(f"Copying dir recursively from {src} to {dst}")
282
        if not Path(src).is_dir():
283
            raise ValueError(f"The source is not a directory: {src}")
284
        Path(dst).mkdir(parents=True, exist_ok=True)
285
        for child in Path(src).rglob('*'):
286
            child_dst = Path(dst) / child.relative_to(src)
287
            if Path(child).is_dir():
288
                OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
289
            else:
290
                OcrdResourceManager._copy_file(child, child_dst, progress_cb)
291
292
    @staticmethod
293
    def _copy_impl(src_filename, filename, progress_cb=None):
294
        log = getLogger('ocrd.resource_manager._copy_impl')
295
        log.info(f"Copying {src_filename} to {filename}")
296
        if Path(src_filename).is_dir():
297
            OcrdResourceManager._copy_dir(src_filename, filename, progress_cb)
298
        else:
299
            OcrdResourceManager._copy_file(src_filename, filename, progress_cb)
300
301
    # TODO Proper caching (make head request for size, If-Modified etc)
302
    def download(
303
        self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
304
        path_in_archive='.', progress_cb=None,
305
    ):
306
        """
307
        Download a resource by URL
308
        """
309
        log = getLogger('ocrd.resource_manager.download')
310
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
311
        if not name:
312
            url_parsed = urlparse(url)
313
            name = Path(unquote(url_parsed.path)).name
314
        fpath = Path(destdir, name)
315
        is_url = url.startswith('https://') or url.startswith('http://')
316
        if fpath.exists():
317
            if not overwrite:
318
                fpath_type = 'Directory' if fpath.is_dir() else 'File'
319
                log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
320
                # raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set")
321
                return fpath
322
            if fpath.is_dir():
323
                log.info(f"Removing existing target directory {fpath}")
324
                rmtree(str(fpath))
325
            else:
326
                log.info(f"Removing existing target file {fpath}")
327
                unlink(str(fpath))
328
        destdir.mkdir(parents=True, exist_ok=True)
329
        if resource_type in ('file', 'directory'):
330
            if is_url:
331
                self._download_impl(url, fpath, progress_cb)
332
            else:
333
                self._copy_impl(url, fpath, progress_cb)
334
        elif resource_type == 'archive':
335
            archive_fname = 'download.tar.xx'
336
            with pushd_popd(tempdir=True) as tempdir:
337
                if is_url:
338
                    self._download_impl(url, archive_fname, progress_cb)
339
                else:
340
                    self._copy_impl(url, archive_fname, progress_cb)
341
                Path('out').mkdir()
342
                with pushd_popd('out'):
343
                    mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
344
                    log.info(f"Extracting {mimetype} archive to {tempdir}/out")
345
                    if mimetype == 'application/zip':
346
                        with ZipFile(f'../{archive_fname}', 'r') as zipf:
347
                            zipf.extractall()
348
                    elif mimetype in ('application/gzip', 'application/x-xz'):
349
                        with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
350
                            tar.extractall()
351
                    else:
352
                        raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
353
                    log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
354
                    if Path(path_in_archive).is_dir():
355
                        copytree(path_in_archive, str(fpath))
356
                    else:
357
                        copy(path_in_archive, str(fpath))
358
        return fpath
359
360
    def _dedup_database(self, database=None, dedup_key='name'):
361
        """
362
        Deduplicate resources by name
363
        """
364
        if not database:
365
            database = self.database
366
        for executable, reslist in database.items():
367
            reslist_dedup = []
368
            for resdict in reslist:
369
                if not any(r[dedup_key] == resdict[dedup_key] for r in reslist_dedup):
370
                    reslist_dedup.append(resdict)
371
            database[executable] = reslist_dedup
372
        return database
373