Passed
Push — master ( 648be5...719bbc )
by Konstantin
02:45
created

ocrd.resource_manager   F

Complexity

Total Complexity 115

Size/Duplication

Total Lines 373
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 115
eloc 307
dl 0
loc 373
rs 2
c 0
b 0
f 0

19 Methods

Rating   Name   Duplication   Size   Complexity  
F OcrdResourceManager.list_installed() 0 46 15
A OcrdResourceManager.default_resource_dir() 0 3 1
A OcrdResourceManager.resource_dir_to_location() 0 6 4
A OcrdResourceManager.location_to_resource_dir() 0 4 3
B OcrdResourceManager.load_resource_list() 0 16 7
A OcrdResourceManager.save_user_list() 0 7 3
A OcrdResourceManager.__init__() 0 16 4
A OcrdResourceManager.userdir() 0 5 2
A OcrdResourceManager.xdg_data_home() 0 5 2
A OcrdResourceManager.xdg_config_home() 0 5 2
A OcrdResourceManager._copy_impl() 0 8 2
A OcrdResourceManager._copy_dir() 0 13 4
A OcrdResourceManager._dedup_database() 0 13 5
A OcrdResourceManager._copy_file() 0 13 5
F OcrdResourceManager.download() 0 57 18
B OcrdResourceManager.add_to_user_database() 0 31 6
F OcrdResourceManager.list_available() 0 40 18
A OcrdResourceManager.parameter_usage() 0 7 3
C OcrdResourceManager._download_impl() 0 26 11

How to fix   Complexity   

Complexity

Complex classes like ocrd.resource_manager often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from pathlib import Path
2
from os.path import join
3
from os import environ, listdir, makedirs, getcwd, path, unlink
4
from shutil import copytree, rmtree, copy
5
from fnmatch import filter as apply_glob
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
from zipfile import ZipFile
10
11
import requests
12
from gdown.parse_url import parse_url as gparse_url
13
from gdown.download import get_url_from_gdrive_confirmation
14
from yaml import safe_load, safe_dump
15
16
# https://github.com/OCR-D/core/issues/867
17
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
18
import yaml.constructor
19
yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \
20
    yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str']
21
22
from ocrd_validators import OcrdResourceListValidator
23
from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config
24
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
25
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
26
27
28
class OcrdResourceManager:
29
30
    """
31
    Managing processor resources
32
    """
33
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
34
        self.log = getLogger('ocrd.resource_manager')
35
        self.database = {}
36
37
        self._xdg_data_home = xdg_data_home
38
        self._xdg_config_home = xdg_config_home
39
        self._userdir = userdir
40
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')
41
42
        if not skip_init:
43
            self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
44
            if not self.user_list.exists():
45
                if not self.user_list.parent.exists():
46
                    self.user_list.parent.mkdir(parents=True)
47
                self.save_user_list()
48
            self.load_resource_list(self.user_list)
49
50
    @property
51
    def userdir(self):
52
        if not self._userdir:
53
            self._userdir = config.HOME
54
        return self._userdir
55
56
    @property
57
    def xdg_data_home(self):
58
        if not self._xdg_data_home:
59
            self._xdg_data_home = config.XDG_DATA_HOME
60
        return self._xdg_data_home
61
62
    @property
63
    def xdg_config_home(self):
64
        if self._xdg_config_home:
65
            return self._xdg_config_home
66
        return config.XDG_CONFIG_HOME
67
68
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file.

        The file starts with ``RESOURCE_USER_LIST_COMMENT`` followed by the
        YAML dump of ``database``.

        Args:
            database: mapping to serialize; only ``None`` selects
                ``self.database``, so an explicitly passed empty mapping
                is written as-is.
        """
        # Compare against None: an empty dict is a legitimate value to save
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
75
76
    def load_resource_list(self, list_filename, database=None):
77
        if not database:
78
            database = self.database
79
        if list_filename.is_file():
80
            with open(list_filename, 'r', encoding='utf-8') as f:
81
                list_loaded = safe_load(f) or {}
82
            report = OcrdResourceListValidator.validate(list_loaded)
83
            if not report.is_valid:
84
                self.log.error('\n'.join(report.errors))
85
                raise ValueError(f"Resource list {list_filename} is invalid!")
86
            for executable, resource_list in list_loaded.items():
87
                if executable not in database:
88
                    database[executable] = []
89
                # Prepend, so user provided is sorted before builtin
90
                database[executable] = list_loaded[executable] + database[executable]
91
        return database
92
93
    def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
94
        """
95
        List models available for download by processor
96
        """
97
        if not database:
98
            database = self.database
99
        if not executable:
100
            return database.items()
101
        if dynamic:
102
            skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
103
            for exec_dir in environ['PATH'].split(':'):
104
                for exec_path in Path(exec_dir).glob(f'{executable}'):
105
                    if not exec_path.name.startswith('ocrd-'):
106
                        self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
107
                    if exec_path.name in skip_executables:
108
                        self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
109
                        continue
110
                    self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
111
                    ocrd_tool = get_ocrd_tool_json(exec_path)
112
                    for resdict in ocrd_tool.get('resources', ()):
113
                        if exec_path.name not in database:
114
                            database[exec_path.name] = []
115
                        database[exec_path.name].insert(0, resdict)
116
            database = self._dedup_database(database)
117
        found = False
118
        ret = []
119
        for k in database:
120
            if apply_glob([k], executable):
121
                found = True
122
                restuple = (k, [])
123
                ret.append(restuple)
124
                for resdict in database[k]:
125
                    if name and resdict['name'] != name:
126
                        continue
127
                    if url and resdict['url'] != url:
128
                        continue
129
                    restuple[1].append(resdict)
130
        if not found:
131
            ret = [(executable, [])]
132
        return ret
133
134
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Without ``executable``, every executable known to the database plus
        every ``ocrd-*`` entry below the ``ocrd-resources`` directories of
        ``xdg_data_home`` and ``/usr/local/share`` is inspected.

        Returns:
            list of ``(executable, [resdict, ...])`` tuples; every resdict
            gets a ``path`` key pointing at the file or directory found
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            resource_roots = [join(base, 'ocrd-resources') for base in [self.xdg_data_home, '/usr/local/share']]
            for resource_root in resource_roots:
                if Path(resource_root).exists():
                    candidates.extend([entry for entry in listdir(resource_root) if entry.startswith('ocrd-')])
        installed = []
        for this_executable in set(candidates):
            found = []
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                # Unless the processor accepts anything, skip entries whose
                # kind (file vs. directory) it does not declare
                if '*/*' not in mimetypes:
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    if res_filename.is_file() and ['text/directory'] == mimetypes:
                        continue
                res_name = res_filename.name
                if res_filename.is_file():
                    res_type = 'file'
                else:
                    res_type = 'directory'
                if res_filename.is_file():
                    res_size = res_filename.stat().st_size
                else:
                    res_size = directory_size(res_filename)
                known = [entry for entry in self.database.get(this_executable, []) if entry['name'] == res_name]
                if known:
                    # Registered resource: reuse the registry entry
                    resdict = known[0]
                elif str(res_filename.parent) == moduledir:
                    # Unregistered but shipped in the module directory
                    resdict = {
                        'name': res_name,
                        'url': str(res_filename),
                        'description': 'Found at module',
                        'type': res_type,
                        'size': res_size
                    }
                else:
                    # Unknown resource: create a stub in the user list
                    resdict = self.add_to_user_database(this_executable, res_filename, resource_type=res_type)
                resdict['path'] = str(res_filename)
                found.append(resdict)
            installed.append((this_executable, found))
        return installed
180
181
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list file is read, searched (via ``list_available``) for a
        resource of ``executable`` with the same file name, extended by a
        stub if none exists, written back and re-loaded into the database.

        Args:
            executable: processor the resource belongs to
            res_filename: path of the resource file or directory
            url: URL to record in the stub (``'???'`` if not given)
            resource_type: value for the stub's ``type`` field

        Returns:
            the existing or the newly created resource dict
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
                      f"creating stub in {self.user_list}'")
        res_size = directory_size(res_filename) if res_path.is_dir() else Path(res_filename).stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
        if resources_found:
            resdict = resources_found[0]
        else:
            resdict = {
                'name': res_name,
                'url': url or '???',
                'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        # Persist, then refresh the in-memory database from the file
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
212
213
    @property
214
    def default_resource_dir(self):
215
        return self.location_to_resource_dir('data')
216
217
    def location_to_resource_dir(self, location):
218
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
219
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
220
                getcwd()
221
222
    def resource_dir_to_location(self, resource_path):
223
        resource_path = str(resource_path)
224
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
225
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
226
               'cwd' if resource_path.startswith(getcwd()) else \
227
               resource_path
228
229
    @staticmethod
230
    def parameter_usage(name, usage='as-is'):
231
        if usage == 'as-is':
232
            return name
233
        elif usage == 'without-extension':
234
            return Path(name).stem
235
        raise ValueError(f"No such usage '{usage}'")
236
237
    @staticmethod
    def _download_impl(url, filename, progress_cb=None, size=None):
        """
        Stream ``url`` into ``filename`` in chunks of 4096 bytes.

        If ``gparse_url`` recognizes a Google Drive file id, the URL is
        rewritten to a ``uc?id=`` link where needed, and a response without
        ``Content-Disposition`` header is treated as a confirmation page from
        which the actual URL is extracted.

        Args:
            url: URL to download
            filename: target path, opened for binary writing
            progress_cb: optional callable, invoked with the length of each
                chunk before it is written
            size: not used by this implementation

        Raises:
            Exception: whatever the download raised; ``filename`` is removed
                (file or directory) before the exception is re-raised
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info(f"Downloading {url} to {filename}")
        try:
            gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
            if gdrive_file_id:
                if not is_gdrive_download_link:
                    url = f"https://drive.google.com/uc?id={gdrive_file_id}"
                try:
                    with requests.get(url, stream=True) as r:
                        if "Content-Disposition" not in r.headers:
                            url = get_url_from_gdrive_confirmation(r.text)
                except RuntimeError as e:
                    # Best effort: keep the URL we have; the exception needs
                    # a %s placeholder, otherwise logging fails to format it
                    log.warning("Cannot unwrap Google Drive URL: %s", e)
            with open(filename, 'wb') as f:
                with requests.get(url, stream=True) as r:
                    r.raise_for_status()
                    for data in r.iter_content(chunk_size=4096):
                        if progress_cb:
                            progress_cb(len(data))
                        f.write(data)
        except Exception:
            # Do not leave a partial download behind
            rmtree(filename, ignore_errors=True)
            Path(filename).unlink(missing_ok=True)
            raise
263
264
    @staticmethod
265
    def _copy_file(src, dst, progress_cb=None):
266
        log = getLogger('ocrd.resource_manager._copy_file')
267
        log.info(f"Copying file {src} to {dst}")
268
        with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
269
            while True:
270
                chunk = f_in.read(4096)
271
                if chunk:
272
                    f_out.write(chunk)
273
                    if progress_cb:
274
                        progress_cb(len(chunk))
275
                else:
276
                    break
277
278
    @staticmethod
279
    def _copy_dir(src, dst, progress_cb=None):
280
        log = getLogger('ocrd.resource_manager._copy_dir')
281
        log.info(f"Copying dir recursively from {src} to {dst}")
282
        if not Path(src).is_dir():
283
            raise ValueError(f"The source is not a directory: {src}")
284
        Path(dst).mkdir(parents=True, exist_ok=True)
285
        for child in Path(src).rglob('*'):
286
            child_dst = Path(dst) / child.relative_to(src)
287
            if Path(child).is_dir():
288
                OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
289
            else:
290
                OcrdResourceManager._copy_file(child, child_dst, progress_cb)
291
292
    @staticmethod
293
    def _copy_impl(src_filename, filename, progress_cb=None):
294
        log = getLogger('ocrd.resource_manager._copy_impl')
295
        log.info(f"Copying {src_filename} to {filename}")
296
        if Path(src_filename).is_dir():
297
            OcrdResourceManager._copy_dir(src_filename, filename, progress_cb)
298
        else:
299
            OcrdResourceManager._copy_file(src_filename, filename, progress_cb)
300
301
    # TODO Proper caching (make head request for size, If-Modified etc)
302
    def download(
303
        self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
304
        path_in_archive='.', progress_cb=None,
305
    ):
306
        """
307
        Download a resource by URL
308
        """
309
        log = getLogger('ocrd.resource_manager.download')
310
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
311
        if not name:
312
            url_parsed = urlparse(url)
313
            name = Path(unquote(url_parsed.path)).name
314
        fpath = Path(destdir, name)
315
        is_url = url.startswith('https://') or url.startswith('http://')
316
        if fpath.exists():
317
            if not overwrite:
318
                fpath_type = 'Directory' if fpath.is_dir() else 'File'
319
                log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
320
                # raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set")
321
                return fpath
322
            if fpath.is_dir():
323
                log.info(f"Removing existing target directory {fpath}")
324
                rmtree(str(fpath))
325
            else:
326
                log.info(f"Removing existing target file {fpath}")
327
                unlink(str(fpath))
328
        destdir.mkdir(parents=True, exist_ok=True)
329
        if resource_type in ('file', 'directory'):
330
            if is_url:
331
                self._download_impl(url, fpath, progress_cb)
332
            else:
333
                self._copy_impl(url, fpath, progress_cb)
334
        elif resource_type == 'archive':
335
            archive_fname = 'download.tar.xx'
336
            with pushd_popd(tempdir=True) as tempdir:
337
                if is_url:
338
                    self._download_impl(url, archive_fname, progress_cb)
339
                else:
340
                    self._copy_impl(url, archive_fname, progress_cb)
341
                Path('out').mkdir()
342
                with pushd_popd('out'):
343
                    mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
344
                    log.info(f"Extracting {mimetype} archive to {tempdir}/out")
345
                    if mimetype == 'application/zip':
346
                        with ZipFile(f'../{archive_fname}', 'r') as zipf:
347
                            zipf.extractall()
348
                    elif mimetype in ('application/gzip', 'application/x-xz'):
349
                        with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
350
                            tar.extractall()
351
                    else:
352
                        raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
353
                    log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
354
                    if Path(path_in_archive).is_dir():
355
                        copytree(path_in_archive, str(fpath))
356
                    else:
357
                        copy(path_in_archive, str(fpath))
358
        return fpath
359
360
    def _dedup_database(self, database=None, dedup_key='name'):
361
        """
362
        Deduplicate resources by name
363
        """
364
        if not database:
365
            database = self.database
366
        for executable, reslist in database.items():
367
            reslist_dedup = []
368
            for resdict in reslist:
369
                if not any(r[dedup_key] == resdict[dedup_key] for r in reslist_dedup):
370
                    reslist_dedup.append(resdict)
371
            database[executable] = reslist_dedup
372
        return database
373