Passed
Pull Request — master (#904)
by Konstantin
04:52
created

OcrdResourceManager.find_resources()   D

Complexity

Conditions 13

Size

Total Lines 18
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 15
dl 0
loc 18
rs 4.2
c 0
b 0
f 0
cc 13
nop 5

How to fix   Complexity   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager.find_resources() often do a lot of different things. To break such a method down, we need to identify a cohesive component within its class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from fnmatch import filter as apply_glob
6
from shutil import copytree
7
from datetime import datetime
8
from tarfile import open as open_tarfile
9
from urllib.parse import urlparse, unquote
10
from subprocess import run, PIPE
11
12
import requests
13
from yaml import safe_load, safe_dump
14
15
# https://github.com/OCR-D/core/issues/867
16
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
17
import yaml.constructor
18
# Register the plain string constructor under the timestamp tag as well, so
# that the safe loader hands timestamps back as strings (see the issue links above).
yaml.constructor.SafeConstructor.yaml_constructors.update({
    'tag:yaml.org,2002:timestamp':
        yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str'],
})
20
21
from ocrd_validators import OcrdResourceListValidator
22
from ocrd_utils import getLogger, directory_size, get_moduledir
23
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
24
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
25
26
class OcrdResourceManager():
27
28
    """
29
    Managing processor resources
30
    """
31
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
        """
        Set up the manager and, unless ``skip_init`` is true, fill the database.

        Args:
            userdir (str): explicit value for the ``userdir`` property
            xdg_config_home (str): explicit value for the ``xdg_config_home`` property
            xdg_data_home (str): explicit value for the ``xdg_data_home`` property
            skip_init (bool): if true, no resource list is read and no user list is created
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        # Resolved through the properties, so the overrides above must be in place first
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        if skip_init:
            return

        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            # Writes whatever the database holds at this point to the new user list
            self.save_user_list()
        self.load_resource_list(self.user_list)
47
48
    @property
49
    def userdir(self):
50
        if not self._userdir:
51
            self._userdir = path.expanduser('~')
52
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
53
                self._userdir = environ['HOME']
54
        return self._userdir
55
56
    @property
57
    def xdg_data_home(self):
58
        if not self._xdg_data_home:
59
            if 'XDG_DATA_HOME' in environ:
60
                self._xdg_data_home = environ['XDG_DATA_HOME']
61
            else:
62
                self._xdg_data_home = join(self.userdir, '.local', 'share')
63
        return self._xdg_data_home
64
65
    @property
66
    def xdg_config_home(self):
67
        if not self._xdg_config_home:
68
            if 'XDG_CONFIG_HOME' in environ:
69
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
70
            else:
71
                self._xdg_config_home = join(self.userdir, '.config')
72
        return self._xdg_config_home
73
74
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file.

        The file consists of ``RESOURCE_USER_LIST_COMMENT``, a newline and the
        YAML dump of the database. An existing file is overwritten.

        Args:
            database (dict): the database to write. ``None`` (the default)
                means ``self.database``; an empty dict is written as such.
        """
        # Compare with None: an explicitly passed empty database must not be
        # replaced by the full in-memory database.
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))
81
82
    def load_resource_list(self, list_filename, database=None):
        """
        Merge the resource list stored in a YAML file into a database.

        Nothing is merged if ``list_filename`` is not an existing file.

        Args:
            list_filename (pathlib.Path): the YAML resource list to read
            database (dict): the database to merge into. ``None`` (the
                default) means ``self.database``; an empty dict is filled in place.

        Returns:
            The database that was merged into.

        Raises:
            ValueError: if ``OcrdResourceListValidator`` reports the list as invalid
        """
        # Compare with None: an empty dict passed in by the caller is a valid
        # target and must not be swapped for self.database.
        if database is None:
            database = self.database
        if list_filename.is_file():
            with open(list_filename, 'r', encoding='utf-8') as f:
                list_loaded = safe_load(f) or {}
            report = OcrdResourceListValidator.validate(list_loaded)
            if not report.is_valid:
                self.log.error('\n'.join(report.errors))
                raise ValueError("Resource list %s is invalid!" % (list_filename))
            for executable, resource_list in list_loaded.items():
                # Prepend, so user provided is sorted before builtin
                database[executable] = resource_list + database.get(executable, [])
        return database
98
99
    def list_available(self, executable=None, dynamic=True):
100
        """
101
        List models available for download by processor
102
        """
103
        if not executable:
104
            return self.database.items()
105
        if dynamic:
106
            for exec_dir in environ['PATH'].split(':'):
107
                for exec_path in Path(exec_dir).glob(f'{executable}'):
108
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
109
                    ocrd_tool = get_ocrd_tool_json(exec_path)
110
                    for resdict in ocrd_tool.get('resources', ()):
111
                        if exec_path.name not in self.database:
112
                            self.database[exec_path.name] = []
113
                        self.database[exec_path.name].append(resdict)
114
                    self.database = self._dedup_database(self.database)
115
        ret = []
116
        found = False
117
        for k in self.database:
118
            if apply_glob([k], executable):
119
                found = True
120
                ret.append((k, self.database[k]))
121
        if not found:
122
            ret = [(executable, [])]
123
        return ret
124
125
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Args:
            executable (str): restrict the listing to this executable. If
                empty, all executables in the database are used, plus all
                ``ocrd-*`` entries of the ``ocrd-resources`` directories below
                ``xdg_data_home`` and ``/usr/local/share``.

        Returns:
            A list of ``(executable, resources)`` tuples. Every resource dict
            gets a ``path`` entry; resources without a database entry of the
            same name are registered via ``add_to_user_database``.
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            for base_dir in (self.xdg_data_home, '/usr/local/share'):
                parent_dir = join(base_dir, 'ocrd-resources')
                if Path(parent_dir).exists():
                    candidates.extend(x for x in listdir(parent_dir) if x.startswith('ocrd-'))
        installed = []
        for this_executable in set(candidates):
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            reslist = []
            for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if '*/*' not in mimetypes:
                    # Skip directories unless directories are accepted ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and files if directories are the only thing accepted
                    if res_filename.is_file() and mimetypes == ['text/directory']:
                        continue
                res_name = res_filename.name
                known = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                resdict = known[0] if known else self.add_to_user_database(this_executable, res_filename)
                resdict['path'] = str(res_filename)
                reslist.append(resdict)
            installed.append((this_executable, reslist))
        return installed
161
162
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list is read from disk, extended by a stub for the resource
        unless ``find_resources`` already finds one of that name, written back
        and loaded into the database again.

        Args:
            executable (str): the processor the resource belongs to
            res_filename: path of the resource file or directory
            url (str): URL to record in the stub, ``'???'`` if not given
            resource_type (str): value for the ``type`` entry of the stub

        Returns:
            The resource dict, either newly created or found in the user list.
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", executable, res_name, str(res_filename), self.user_list)
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if resources_found:
            resdict = resources_found[0][1]
        else:
            resdict = {
                'name': res_name,
                'url': url or '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
192
193
    @property
194
    def default_resource_dir(self):
195
        return self.location_to_resource_dir('data')
196
197
    def location_to_resource_dir(self, location):
198
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
199
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
200
                getcwd()
201
202
    def resource_dir_to_location(self, resource_path):
203
        resource_path = str(resource_path)
204
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
205
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
206
               'cwd' if resource_path.startswith(getcwd()) else \
207
               resource_path
208
209
    def parameter_usage(self, name, usage='as-is'):
210
        if usage == 'as-is':
211
            return name
212
        elif usage == 'without-extension':
213
            return Path(name).stem
214
        raise ValueError("No such usage '%s'" % usage)
215
216
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content behind a URL into a file.

        Args:
            url (str): the URL to fetch
            filename: path of the file to write (opened in binary mode)
            progress_cb (callable): if given, called with the length of every
                chunk before the chunk is written
            size (int): accepted for backward compatibility, not used

        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        with requests.get(url, stream=True) as r:
            # Fail on an error status instead of saving the error page as the
            # resource. Only open the destination afterwards, so that a failed
            # request does not leave an empty file behind.
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
226
227
    def _copy_impl(self, src_filename, filename, progress_cb=None):
228
        log = getLogger('ocrd.resource_manager._copy_impl')
229
        log.info("Copying %s to %s", src_filename, filename)
230
        if Path(src_filename).is_dir():
231
            log.info(f"Copying recursively from {src_filename} to {filename}")
232
            for child in Path(src_filename).rglob('*'):
233
                child_dst = Path(filename) / child.relative_to(src_filename)
234
                child_dst.parent.mkdir(parents=True, exist_ok=True)
235
                with open(child_dst, 'wb') as f_out, open(child, 'rb') as f_in:
236
                    while True:
237
                        chunk = f_in.read(4096)
238
                        if chunk:
239
                            f_out.write(chunk)
240
                            if progress_cb:
241
                                progress_cb(len(chunk))
242
                        else:
243
                            break
244
        else:
245
            with open(filename, 'wb') as f_out, open(src_filename, 'rb') as f_in:
246
                while True:
247
                    chunk = f_in.read(4096)
248
                    if chunk:
249
                        f_out.write(chunk)
250
                        if progress_cb:
251
                            progress_cb(len(chunk))
252
                    else:
253
                        break
254
255
    # TODO Proper caching (make head request for size, If-Modified etc)
256
    def download(
        self,
        executable,
        url,
        basedir,
        overwrite=False,
        no_subdir=False,
        name=None,
        resource_type='file',
        path_in_archive='.',
        progress_cb=None,
        size=None,
    ):
        """
        Download a resource by URL

        Args:
            executable (str): processor name, used as subdirectory of
                ``basedir`` unless ``no_subdir`` is true
            url (str): ``http://``/``https://`` URL to download; anything
                else is treated as a local path to copy from
            basedir: directory to store the resource under
            overwrite (bool): if false, an existing target is left untouched
            no_subdir (bool): store directly in ``basedir``
            name (str): name of the target; defaults to the last segment of
                the (unquoted) URL path
            resource_type (str): ``file`` or ``directory`` store the source
                as is, ``archive`` unpacks it and stores ``path_in_archive``
            path_in_archive (str): path inside the unpacked archive to store
            progress_cb (callable): passed on to the download/copy routine
            size (int): passed on to ``_download_impl``

        Returns:
            The path of the target as a ``pathlib.Path``.
        """
        # Imported here because only copytree is imported at module level
        from shutil import rmtree

        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            url_parsed = urlparse(url)
            name = Path(unquote(url_parsed.path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith(('https://', 'http://'))
        if fpath.exists() and not overwrite:
            log.info("%s to be %s to %s which already exists and overwrite is False", url, 'downloaded' if is_url else 'copied', fpath)
            return fpath
        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type in ('file', 'directory'):
            if is_url:
                self._download_impl(url, fpath, progress_cb, size)
            else:
                self._copy_impl(url, fpath, progress_cb)
        elif resource_type == 'archive':
            with pushd_popd(tempdir=True) as tempdir:
                if is_url:
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
                else:
                    self._copy_impl(url, 'download.tar.xx', progress_cb)
                Path('out').mkdir()
                with pushd_popd('out'):
                    log.info("Extracting archive to %s/out", tempdir)
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
                        # NOTE(review): extractall() trusts the member paths of
                        # the archive; a malicious archive could write outside
                        # of the extraction directory. Only use trusted sources.
                        tar.extractall()
                    log.info("Copying '%s' from archive to %s", path_in_archive, fpath)
                    # An existing target can only be reached with overwrite=True.
                    # copytree() refuses to copy onto an existing path, so the
                    # old resource has to go first.
                    if fpath.is_dir() and not fpath.is_symlink():
                        rmtree(str(fpath))
                    elif fpath.exists() or fpath.is_symlink():
                        fpath.unlink()
                    copytree(path_in_archive, str(fpath))
        return fpath
302
303
    def _dedup_database(self, database=None):
304
        """
305
        Deduplicate resources by name
306
        """
307
        if not database:
308
            database = self.database
309
        for executable, reslist in database.items():
310
            reslist_dedup = []
311
            for resdict in reslist:
312
                if any(r['name'] == resdict['name'] for r in reslist_dedup):
313
                    continue
314
                else:
315
                    reslist_dedup.append(resdict)
316
            database[executable] = reslist_dedup
317
        return database
318