Passed
Pull Request — master (#800)
by Konstantin
02:21
created

OcrdResourceManager.discover()   C

Complexity

Conditions 9

Size

Total Lines 26
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 21
dl 0
loc 26
rs 6.6666
c 0
b 0
f 0
cc 9
nop 3
1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from shutil import copytree
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
from subprocess import run, PIPE
10
11
import requests
12
from yaml import safe_load, safe_dump
13
14
from ocrd_validators import OcrdResourceListValidator
15
from ocrd_utils import getLogger
16
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd
17
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
18
19
class OcrdResourceManager():
20
21
    """
22
    Managing processor resources
23
    """
24
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Remember the directory overrides, then populate ``self.database``
        from the list at ``RESOURCE_LIST_FILENAME`` and from the user list.

        The user list lives at ``<xdg_config_home>/ocrd/resources.yml``; if
        it does not exist yet it is created (including its parent
        directories) containing just ``RESOURCE_USER_LIST_COMMENT``.
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        # Goes through the ``xdg_config_home`` property, so the override
        # attributes above have to be assigned first.
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            self.user_list.write_text(RESOURCE_USER_LIST_COMMENT, encoding='utf-8')
        # Loaded second: load_resource_list() prepends, so user entries
        # end up in front of the ones loaded first.
        self.load_resource_list(self.user_list)
40
41
    @property
42
    def userdir(self):
43
        if not self._userdir:
44
            self._userdir = path.expanduser('~')
45
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
46
                self._userdir = environ['HOME']
47
        return self._userdir
48
49
    @property
50
    def xdg_data_home(self):
51
        if not self._xdg_data_home:
52
            if 'XDG_DATA_HOME' in environ:
53
                self._xdg_data_home = environ['XDG_DATA_HOME']
54
            else:
55
                self._xdg_data_home = join(self.userdir, '.local', 'share')
56
        return self._xdg_data_home
57
58
    @property
59
    def xdg_config_home(self):
60
        if not self._xdg_config_home:
61
            if 'XDG_CONFIG_HOME' in environ:
62
                self._xdg_config_home = environ['XDG_CONFIG_HOME']
63
            else:
64
                self._xdg_config_home = join(self.userdir, '.config')
65
        return self._xdg_config_home
66
67
    def load_resource_list(self, list_filename, database=None):
68
        if not database:
69
            database = self.database
70
        if list_filename.is_file():
71
            with open(list_filename, 'r', encoding='utf-8') as f:
72
                list_loaded = safe_load(f) or {}
73
            report = OcrdResourceListValidator.validate(list_loaded)
74
            if not report.is_valid:
75
                self.log.error('\n'.join(report.errors))
76
                raise ValueError("Resource list %s is invalid!" % (list_filename))
77
            for executable, resource_list in list_loaded.items():
78
                if executable not in database:
79
                    database[executable] = []
80
                # Prepend, so user provided is sorted before builtin
81
                database[executable] = list_loaded[executable] + database[executable]
82
        return database
83
84
    def discover(self, dry_run=False, glob='ocrd-*'):
85
        """
86
        Discover resources by checking all the executables matching the
87
        ``glob`` glob and add them to the user resource_list.yml
88
        unless ``dry_run`` is ``True``.
89
        """
90
        ret = []
91
        if not dry_run:
92
            with open(self.user_list, 'r', encoding='utf-8') as f:
93
                user_database = safe_load(f) or {}
94
        for exec_dir in environ['PATH'].split(':'):
95
            for exec_path in Path(exec_dir).glob(glob):
96
                self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
97
                result = run([exec_path, '--dump-json'], stdout=PIPE)
98
                ocrd_tool = loads(result.stdout)
99
                if not dry_run:
100
                    if exec_path.name not in user_database:
0 ignored issues
show
introduced by
The variable user_database does not seem to be defined in case BooleanNotNode on line 91 is False. Are you sure this can never be the case?
Loading history...
101
                        user_database[exec_path.name] = []
102
                    user_database[exec_path.name] += ocrd_tool.get('resources', ())
103
                ret.append((exec_path.name, ocrd_tool.get('resources', ())))
104
        if not dry_run:
105
            with open(self.user_list, 'w', encoding='utf-8') as f:
106
                f.write(RESOURCE_USER_LIST_COMMENT)
107
                f.write('\n')
108
                f.write(safe_dump(user_database))
109
        return ret
110
111
    def list_available(self, executable=None):
112
        """
113
        List models available for download by processor
114
        """
115
        if executable:
116
            return [(executable, self.database[executable])]
117
        return self.database.items()
118
119
    def list_installed(self, executable=None):
        """
        List installed resources, matching them with the registry by
        ``name``.

        With ``executable`` given only that processor is inspected.
        Otherwise every processor in ``self.database`` is, plus every
        ``ocrd-*`` entry found below the ``ocrd-resources`` directories of
        ``xdg_data_home`` and ``/usr/local/share``.

        A resource found on disk but missing from the registry gets a stub
        entry via ``add_to_user_database``. Each returned resource dict has
        its ``path`` key set to the file it was found at.

        Returns a list of ``(executable, resources)`` tuples.
        """
        if executable:
            candidates = [executable]
        else:
            # processors the registry knows about ...
            candidates = list(self.database.keys())
            # ... and processors that have a resource directory on disk
            for share_dir in (self.xdg_data_home, '/usr/local/share'):
                resource_root = join(share_dir, 'ocrd-resources')
                if Path(resource_root).exists():
                    candidates.extend(entry for entry in listdir(resource_root) if entry.startswith('ocrd-'))
        installed = []
        # set(): a processor may be both registered and present on disk
        for this_executable in set(candidates):
            has_dirs, has_files = get_processor_resource_types(this_executable)
            found = []
            for res_filename in list_all_resources(this_executable):
                res_path = Path(res_filename)
                # Skip resource kinds this processor does not use
                if res_path.is_dir() and not has_dirs:
                    continue
                if res_path.is_file() and not has_files:
                    continue
                res_name = res_path.name
                matches = [entry for entry in self.database.get(this_executable, []) if entry['name'] == res_name]
                if not matches:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, res_name, res_filename, self.user_list)
                    matches = [self.add_to_user_database(this_executable, res_filename)]
                resource = matches[0]
                resource['path'] = res_filename
                found.append(resource)
            installed.append((this_executable, found))
        return installed
151
152
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry for ``res_filename`` to the user resource list
        (``self.user_list``), unless ``executable`` already has an entry of
        the same name there.

        The user list is rewritten and reloaded in either case.

        Returns the resource dict, whether newly created or already
        present.
        """
        res_name = Path(res_filename).name
        res_size = Path(res_filename).stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if resources_found:
            # find_resources() yields (executable, resdict) tuples, whereas
            # this method returns the resource dict itself
            resdict = resources_found[0][1]
        else:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(user_database))
        # NOTE(review): load_resource_list() prepends to self.database, so
        # user entries loaded before are listed again after this reload --
        # confirm whether that duplication is intended.
        self.load_resource_list(self.user_list)
        return resdict
181
182
    def find_resources(self, executable=None, name=None, url=None, database=None):
183
        """
184
        Find resources in the registry
185
        """
186
        if not database:
187
            database = self.database
188
        ret = []
189
        if executable and executable not in database.keys():
190
            return ret
191
        for executable in [executable] if executable else database.keys():
192
            for resdict in database[executable]:
193
                if not name and not url:
194
                    ret.append((executable, resdict))
195
                elif url and url == resdict['url']:
196
                    ret.append((executable, resdict))
197
                elif name and name == resdict['name']:
198
                    ret.append((executable, resdict))
199
        return ret
200
201
    @property
202
    def default_resource_dir(self):
203
        return self.location_to_resource_dir('data')
204
205
    def location_to_resource_dir(self, location):
206
        return '/usr/local/share/ocrd-resources' if location == 'system' else \
207
                join(self.xdg_data_home, 'ocrd-resources') if location == 'data' else \
208
                getcwd()
209
210
    def resource_dir_to_location(self, resource_path):
211
        resource_path = str(resource_path)
212
        return 'system' if resource_path.startswith('/usr/local/share/ocrd-resources') else \
213
               'data' if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')) else \
214
               'cwd' if resource_path.startswith(getcwd()) else \
215
               resource_path
216
217
    def parameter_usage(self, name, usage='as-is'):
218
        if usage == 'as-is':
219
            return name
220
        elif usage == 'without-extension':
221
            return Path(name).stem
222
        raise ValueError("No such usage '%s'" % usage)
223
224
    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into the local file ``filename``.

        ``progress_cb``, if given, is called with the byte count of every
        chunk received. ``size`` is accepted for interface compatibility
        only; the download does not depend on it, nor on the server
        sending a ``Content-Length`` header.

        Raises ``requests.HTTPError`` if the server answers with an error
        status.
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s" % (url, filename))
        with requests.get(url, stream=True) as r:
            # Fail instead of saving an HTTP error page as the resource
            r.raise_for_status()
            # Opened only once the response is known to be good, so a
            # failed request does not leave an empty file behind
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)
234
235
    def _copy_impl(self, src_filename, filename, progress_cb=None):
        """
        Copy the local file ``src_filename`` to ``filename`` in chunks of
        4096 bytes.

        ``progress_cb``, if given, is called with the byte count of every
        chunk written.
        """
        log = getLogger('ocrd.resource_manager._copy_impl')
        log.info("Copying %s" % src_filename)
        # Source first: if it cannot be opened, the destination must not be
        # created or truncated
        with open(src_filename, 'rb') as f_in, open(filename, 'wb') as f_out:
            # read() returns b'' at end of file, which ends the iteration
            for chunk in iter(lambda: f_in.read(4096), b''):
                f_out.write(chunk)
                if progress_cb:
                    progress_cb(len(chunk))
247
248
    # TODO Proper caching (make head request for size, If-Modified etc)
249
    def download(
        self,
        executable,
        url,
        basedir,
        overwrite=False,
        no_subdir=False,
        name=None,
        resource_type='file',
        path_in_archive='.',
        progress_cb=None,
        size=None,
    ):
        """
        Download a resource by URL, or copy it if ``url`` is a local path.

        The resource is stored as ``<basedir>/<executable>/<name>``, or as
        ``<basedir>/<name>`` if ``no_subdir`` is set. ``name`` defaults to
        the last component of the URL path. An existing target is left
        alone unless ``overwrite`` is set.

        For ``resource_type`` ``'file'`` the content is stored as is. For
        ``'tarball'`` the archive is unpacked in a temporary directory and
        the directory ``path_in_archive`` inside it is copied to the
        target. Any other ``resource_type`` only creates the destination
        directory.

        ``progress_cb`` and ``size`` are handed on to the download/copy
        helpers.

        Returns the target path.
        """
        # Local imports of names the module does not import at file level
        import tarfile
        from shutil import rmtree

        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            url_parsed = urlparse(url)
            name = Path(unquote(url_parsed.path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith('https://') or url.startswith('http://')
        if fpath.exists() and not overwrite:
            log.info("%s to be %s to %s which already exists and overwrite is False" % (url, 'downloaded' if is_url else 'copied', fpath))
            return fpath
        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type == 'file':
            if is_url:
                self._download_impl(url, fpath, progress_cb, size)
            else:
                self._copy_impl(url, fpath, progress_cb)
        elif resource_type == 'tarball':
            # The work below happens after pushd_popd() has switched the
            # working directory (the relative '../download.tar.xx' relies on
            # that), so relative paths must be made absolute beforehand.
            target = fpath.absolute()
            source = url if is_url else str(Path(url).absolute())
            with pushd_popd(tempdir=True):
                if is_url:
                    self._download_impl(source, 'download.tar.xx', progress_cb, size)
                else:
                    self._copy_impl(source, 'download.tar.xx', progress_cb)
                Path('out').mkdir()
                with pushd_popd('out'):
                    log.info("Extracting tarball")
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
                        # The archive is external input: where the
                        # interpreter offers extraction filters, refuse
                        # members that would escape the directory.
                        if hasattr(tarfile, 'data_filter'):
                            tar.extractall(filter='data')
                        else:
                            tar.extractall()
                    log.info("Copying '%s' from tarball to %s" % (path_in_archive, fpath))
                    # Only reachable with overwrite=True when the target
                    # exists; copytree() refuses an existing destination.
                    if target.is_dir() and not target.is_symlink():
                        rmtree(str(target))
                    elif target.exists() or target.is_symlink():
                        target.unlink()
                    copytree(path_in_archive, str(target))
        # TODO
        # elif resource_type == 'github-dir':
        return fpath
297