Passed
Pull Request — master (#1309)
by
unknown
08:02
created

OcrdResourceManager.download()   F

Complexity

Conditions 18

Size

Total Lines 57
Code Lines 47

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 47
dl 0
loc 57
rs 1.2
c 0
b 0
f 0
cc 18
nop 10

How to fix   Long Method    Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

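Commonly applied refactorings include Extract Method; if many parameters or temporary variables are present, also consider Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.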

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.download() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists, such as Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
from logging import Logger
2
from pathlib import Path
3
from os.path import join
4
from os import environ, listdir, getcwd, unlink
5
from shutil import copytree, rmtree, copy
6
from fnmatch import filter as apply_glob
7
from datetime import datetime
8
from tarfile import open as open_tarfile
9
from urllib.parse import urlparse, unquote
10
from zipfile import ZipFile
11
12
import requests
13
from gdown.parse_url import parse_url as gparse_url
14
from gdown.download import get_url_from_gdrive_confirmation
15
from yaml import safe_load, safe_dump
16
17
# pylint: disable=wrong-import-position
18
19
# https://github.com/OCR-D/core/issues/867
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
# Map the YAML timestamp tag onto the constructor registered for plain strings,
# so that SafeConstructor-based loading builds timestamp-tagged scalars the
# same way as ordinary strings. This is a process-wide change to PyYAML.
import yaml.constructor
yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \
    yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str']
24
25
# pylint: enable=wrong-import-position
26
27
# pylint: enable=wrong-import-position
28
29
# pylint: enable=wrong-import-position
30
31
from ocrd_validators import OcrdResourceListValidator
32
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
33
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
34
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
35
36
37
class OcrdResourceManager:
38
39
    """
40
    Managing processor resources
41
    """
42
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
        """
        Set up the manager and, unless ``skip_init`` is set, fill the database.

        Args:
            userdir: value returned by :py:attr:`userdir` (``config.HOME`` if falsy)
            xdg_config_home: value returned by :py:attr:`xdg_config_home`
                (``config.XDG_CONFIG_HOME`` if falsy)
            xdg_data_home: value returned by :py:attr:`xdg_data_home`
                (``config.XDG_DATA_HOME`` if falsy)
            skip_init: if true, no resource list is read or created
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        if skip_init:
            return

        # The list at RESOURCE_LIST_FILENAME goes in first; the user list is
        # loaded last and therefore ends up prepended per executable.
        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            user_list_dir = self.user_list.parent
            if not user_list_dir.exists():
                user_list_dir.mkdir(parents=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)
58
59
    @property
60
    def userdir(self):
61
        if not self._userdir:
62
            self._userdir = config.HOME
63
        return self._userdir
64
65
    @property
66
    def xdg_data_home(self):
67
        if not self._xdg_data_home:
68
            self._xdg_data_home = config.XDG_DATA_HOME
69
        return self._xdg_data_home
70
71
    @property
72
    def xdg_config_home(self):
73
        if self._xdg_config_home:
74
            return self._xdg_config_home
75
        return config.XDG_CONFIG_HOME
76
77
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file.

        The file at ``self.user_list`` is overwritten with
        ``RESOURCE_USER_LIST_COMMENT``, a newline and the YAML dump of the database.

        Args:
            database: mapping to serialize; ``self.database`` when ``None``.
                An explicitly passed empty mapping is written as such.
        """
        # `is None` instead of truthiness: an empty dict is a legitimate database
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
84
85
    def load_resource_list(self, list_filename, database=None):
86
        if not database:
87
            database = self.database
88
        if list_filename.is_file():
89
            with open(list_filename, 'r', encoding='utf-8') as f:
90
                list_loaded = safe_load(f) or {}
91
            report = OcrdResourceListValidator.validate(list_loaded)
92
            if not report.is_valid:
93
                self.log.error('\n'.join(report.errors))
94
                raise ValueError(f"Resource list {list_filename} is invalid!")
95
            for executable, resource_list in list_loaded.items():
96
                if executable not in database:
97
                    database[executable] = []
98
                # Prepend, so user provided is sorted before builtin
99
                database[executable] = list_loaded[executable] + database[executable]
100
        return database
101
102
    def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
103
        """
104
        List models available for download by processor
105
        """
106
        if not database:
107
            database = self.database
108
        if not executable:
109
            return database.items()
110
        if dynamic:
111
            skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
112
            for exec_dir in environ['PATH'].split(':'):
113
                for exec_path in Path(exec_dir).glob(f'{executable}'):
114
                    if not exec_path.name.startswith('ocrd-'):
115
                        self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
116
                    if exec_path.name in skip_executables:
117
                        self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
118
                        continue
119
                    self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
120
                    ocrd_tool = get_ocrd_tool_json(exec_path)
121
                    for resdict in ocrd_tool.get('resources', ()):
122
                        if exec_path.name not in database:
123
                            database[exec_path.name] = []
124
                        database[exec_path.name].insert(0, resdict)
125
            database = self._dedup_database(database)
126
        found = False
127
        ret = []
128
        for k in database:
129
            if apply_glob([k], executable):
130
                found = True
131
                restuple = (k, [])
132
                ret.append(restuple)
133
                for resdict in database[k]:
134
                    if name and resdict['name'] != name:
135
                        continue
136
                    if url and resdict['url'] != url:
137
                        continue
138
                    restuple[1].append(resdict)
139
        if not found:
140
            ret = [(executable, [])]
141
        return ret
142
143
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Args:
            executable: restrict the listing to this executable; if falsy, all
                executables in the database plus every ``ocrd-*`` entry found
                under the ``ocrd-resources`` directories are covered

        Returns:
            list of ``(executable, [resdict, ...])`` tuples; every ``resdict``
            carries the location of the resource under its ``path`` key.
            Resources found neither in the database nor directly in the module
            directory are registered through :py:meth:`add_to_user_database`.
        """
        if executable:
            all_executables = [executable]
        else:
            # resources we know about
            all_executables = list(self.database.keys())
            # resources in the file system
            parent_dirs = [join(base, 'ocrd-resources') for base in (self.xdg_data_home, '/usr/local/share')]
            for parent_dir in parent_dirs:
                if Path(parent_dir).exists():
                    all_executables.extend(x for x in listdir(parent_dir) if x.startswith('ocrd-'))

        installed = []
        for this_executable in set(all_executables):
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            reslist = []
            for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if '*/*' not in mimetypes:
                    # directories only count if the processor accepts them ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and plain files are skipped if it accepts nothing but directories
                    if res_filename.is_file() and mimetypes == ['text/directory']:
                        continue
                res_name = res_filename.name
                if res_filename.is_file():
                    res_type = 'file'
                else:
                    res_type = 'directory'
                if res_filename.is_file():
                    res_size = res_filename.stat().st_size
                else:
                    res_size = directory_size(res_filename)
                known = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if known:
                    resdict = known[0]
                elif str(res_filename.parent) == moduledir:
                    resdict = {
                        'name': res_name,
                        'url': str(res_filename),
                        'description': 'Found at module',
                        'type': res_type,
                        'size': res_size,
                    }
                else:
                    resdict = self.add_to_user_database(this_executable, res_filename, resource_type=res_type)
                resdict['path'] = str(res_filename)
                reslist.append(resdict)
            installed.append((this_executable, reslist))
        return installed
189
190
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list file is re-read, a stub for the resource is appended
        unless an entry of the same name is already found for the executable,
        and the file is written back and merged into ``self.database`` again.

        Args:
            executable: name of the executable the resource belongs to
            res_filename: path of the resource file or directory
            url: URL to record in the stub (``'???'`` if falsy)
            resource_type: value recorded under ``type`` in the stub

        Returns:
            the resource dict: either the pre-existing entry or the new stub
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        # (message fixed: it used to end in a stray, unbalanced quote)
        self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
                      f"creating stub in {self.user_list}")
        if res_path.is_dir():
            res_size = directory_size(res_filename)
        else:
            res_size = res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            resdict = resources_found[0]
        # persist first, then merge the file back so self.database sees the stub
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
221
222
    @property
223
    def default_resource_dir(self):
224
        return self.location_to_resource_dir('data')
225
226
    def location_to_resource_dir(self, location):
227
        if location == 'data':
228
            return join(self.xdg_data_home, 'ocrd-resources')
229
        if location == 'system':
230
            return '/usr/local/share/ocrd-resources'
231
        return getcwd()
232
233
    def resource_dir_to_location(self, resource_path):
234
        resource_path = str(resource_path)
235
        if resource_path.startswith('/usr/local/share/ocrd-resources'):
236
            return 'system'
237
        if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')):
238
            return 'data'
239
        if resource_path.startswith(getcwd()):
240
            return 'cwd'
241
        return resource_path
242
243
    @staticmethod
244
    def parameter_usage(name, usage='as-is'):
245
        if usage == 'as-is':
246
            return name
247
        elif usage == 'without-extension':
248
            return Path(name).stem
249
        raise ValueError(f"No such usage '{usage}'")
250
251
    @staticmethod
    def _download_impl(log: Logger, url, filename, progress_cb=None, size=None):
        """
        Stream the content behind ``url`` into the file ``filename``.

        If gdown's URL parser yields a Google Drive file id, the URL is first
        rewritten to its ``uc?id=`` form (unless it already is a download link)
        and, if the response lacks a ``Content-Disposition`` header, replaced
        by the URL extracted from the confirmation page.

        Args:
            log: logger to report to
            url: URL to fetch
            filename: target path, opened for binary writing
            progress_cb: if set, called with the length of each received chunk
            size: not used by this implementation

        Raises:
            whatever the download raised, after ``filename`` has been removed
        """
        log.info(f"Downloading {url} to {filename}")
        try:
            gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
            if gdrive_file_id:
                if not is_gdrive_download_link:
                    url = f"https://drive.google.com/uc?id={gdrive_file_id}"
                try:
                    with requests.get(url, stream=True) as probe:
                        if "Content-Disposition" not in probe.headers:
                            url = get_url_from_gdrive_confirmation(probe.text)
                except RuntimeError as e:
                    # best effort: carry on with the URL as it is
                    log.warning(f"Cannot unwrap Google Drive URL: {e}")
            with open(filename, 'wb') as f, requests.get(url, stream=True) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(chunk))
                    f.write(chunk)
        except Exception:
            # do not leave a partial target behind, be it a directory or a file
            rmtree(filename, ignore_errors=True)
            Path(filename).unlink(missing_ok=True)
            raise
276
277
    @staticmethod
278
    def _copy_file(log: Logger, src, dst, progress_cb=None):
279
        log.info(f"Copying file {src} to {dst}")
280
        with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
281
            while True:
282
                chunk = f_in.read(4096)
283
                if chunk:
284
                    f_out.write(chunk)
285
                    if progress_cb:
286
                        progress_cb(len(chunk))
287
                else:
288
                    break
289
290
    @staticmethod
291
    def _copy_dir(log: Logger, src, dst, progress_cb=None):
292
        log.info(f"Copying dir recursively from {src} to {dst}")
293
        if not Path(src).is_dir():
294
            raise ValueError(f"The source is not a directory: {src}")
295
        Path(dst).mkdir(parents=True, exist_ok=True)
296
        for child in Path(src).rglob('*'):
297
            child_dst = Path(dst) / child.relative_to(src)
298
            if Path(child).is_dir():
299
                OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
300
            else:
301
                OcrdResourceManager._copy_file(child, child_dst, progress_cb)
302
303
    @staticmethod
304
    def _copy_impl(log: Logger, src_filename, filename, progress_cb=None):
305
        log.info(f"Copying {src_filename} to {filename}")
306
        if Path(src_filename).is_dir():
307
            OcrdResourceManager._copy_dir(log, src_filename, filename, progress_cb)
308
        else:
309
            OcrdResourceManager._copy_file(log, src_filename, filename, progress_cb)
310
311
    def _download_archive(self, log: Logger, url: str, path_in_archive: str, fpath: Path, progress_cb=None):
        """
        Fetch an archive, unpack it and copy one of its members to ``fpath``.

        The archive is downloaded (``http://`` / ``https://`` URLs) or copied
        (anything else) into a temporary directory and extracted into an
        ``out`` subdirectory of it.

        Args:
            log: logger to report to
            url: URL or local path of the archive
            path_in_archive: member (file or directory) to copy, relative to
                the extraction directory
            fpath: where to copy the member to
            progress_cb: passed on to the download / copy implementation

        Raises:
            RuntimeError: if the detected media type is neither zip, gzip nor xz
        """
        archive_fname = 'download.tar.xx'
        with pushd_popd(tempdir=True) as tempdir:
            if url.startswith(('https://', 'http://')):
                self._download_impl(log, url, archive_fname, progress_cb)
            else:
                self._copy_impl(log, url, archive_fname, progress_cb)
            Path('out').mkdir()
            with pushd_popd('out'):
                archive_ref = f'../{archive_fname}'
                mimetype = guess_media_type(archive_ref, fallback='application/octet-stream')
                log.info(f"Extracting {mimetype} archive to {tempdir}/out")
                if mimetype not in ('application/zip', 'application/gzip', 'application/x-xz'):
                    raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
                # NOTE(review): extractall() takes member paths from the archive
                # as they are - confirm that archives are trusted or restrict
                # extraction accordingly
                if mimetype == 'application/zip':
                    with ZipFile(archive_ref, 'r') as zipf:
                        zipf.extractall()
                else:
                    with open_tarfile(archive_ref, 'r:*') as tar:
                        tar.extractall()
                log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
                if Path(path_in_archive).is_dir():
                    copytree(path_in_archive, str(fpath))
                else:
                    copy(path_in_archive, str(fpath))
335
336
    # TODO Proper caching (make head request for size, If-Modified etc)
    def download_resource(
        self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
        path_in_archive='.', progress_cb=None,
    ):
        """
        Download a resource by URL

        Args:
            executable: executable the resource is for; used as subdirectory of
                ``basedir`` unless ``no_subdir`` is set
            url: ``http(s)://`` URL to download, or a local path to copy
            basedir: base directory to place the resource in
            overwrite: replace an existing target instead of skipping
            no_subdir: store directly in ``basedir``
            name: file name of the target; defaults to the last component of
                the (unquoted) URL path
            resource_type: ``'file'`` or ``'directory'`` fetch the URL as is,
                ``'archive'`` extracts ``path_in_archive`` from it; any other
                value fetches nothing
            path_in_archive: member to extract for archives
            progress_cb: passed on to the download / copy implementation

        Returns:
            Path of the target, also if the download was skipped
        """
        log = getLogger('ocrd.resource_manager.download')
        if no_subdir:
            destdir = Path(basedir)
        else:
            destdir = Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)

        if fpath.exists():
            if not overwrite:
                # deliberately a warning and not a FileExistsError
                fpath_type = 'Directory' if fpath.is_dir() else 'File'
                log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
                return fpath
            if fpath.is_dir():
                log.info(f"Removing existing target directory {fpath}")
                rmtree(str(fpath))
            else:
                log.info(f"Removing existing target file {fpath}")
                unlink(str(fpath))

        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type == 'archive':
            self._download_archive(log, url, path_in_archive, fpath, progress_cb)
        elif resource_type in ('file', 'directory'):
            if url.startswith(('https://', 'http://')):
                self._download_impl(log, url, fpath, progress_cb)
            else:
                self._copy_impl(log, url, fpath, progress_cb)
        return fpath
371
372
    def _dedup_database(self, database=None, dedup_key='name'):
373
        """
374
        Deduplicate resources by name
375
        """
376
        if not database:
377
            database = self.database
378
        for executable, reslist in database.items():
379
            reslist_dedup = []
380
            for resdict in reslist:
381
                if not any(r[dedup_key] == resdict[dedup_key] for r in reslist_dedup):
382
                    reslist_dedup.append(resdict)
383
            database[executable] = reslist_dedup
384
        return database
385