Passed
Pull Request — master (#800)
by Konstantin
02:17
created

OcrdResourceManager._copy_impl()   C

Complexity

Conditions 11

Size

Total Lines 27
Code Lines 24

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 24
dl 0
loc 27
rs 5.4
c 0
b 0
f 0
cc 11
nop 4

How to fix   Complexity   

Complexity

Complex methods like ocrd.resource_manager.OcrdResourceManager._copy_impl() often do a lot of different things. To break such a method (and its enclosing class) down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from pathlib import Path
2
from os.path import join
3
from json import loads
4
from os import environ, listdir, getcwd, path
5
from shutil import copytree
6
from datetime import datetime
7
from tarfile import open as open_tarfile
8
from urllib.parse import urlparse, unquote
9
from subprocess import run, PIPE
10
11
import requests
12
from yaml import safe_load, safe_dump
13
14
from ocrd_validators import OcrdResourceListValidator
15
from ocrd_utils import getLogger, directory_size
16
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
17
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
18
19
class OcrdResourceManager():

    """
    Managing processor resources

    Keeps an in-memory registry (``self.database``) that maps executable names
    to lists of resource descriptions (dicts with at least a ``name``). The
    registry is filled from the bundled resource list and from the user's
    ``resources.yml``; user entries are sorted before bundled ones.
    """
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None):
        """
        Set up the registry and make sure the user resource list exists.

        Args:
            userdir: home directory override (falls back to ``~`` / ``$HOME``)
            xdg_config_home: override for ``$XDG_CONFIG_HOME``
            xdg_data_home: override for ``$XDG_DATA_HOME``
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._xdg_data_home = xdg_data_home
        self._xdg_config_home = xdg_config_home
        self._userdir = userdir
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            # exist_ok avoids a race between the existence check and mkdir
            self.user_list.parent.mkdir(parents=True, exist_ok=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)

    @property
    def userdir(self):
        """
        Home directory of the user, lazily resolved and cached.
        """
        if not self._userdir:
            self._userdir = path.expanduser('~')
            if 'HOME' in environ and environ['HOME'] != path.expanduser('~'):
                self._userdir = environ['HOME']
        return self._userdir

    @property
    def xdg_data_home(self):
        """
        Base directory for user data (``$XDG_DATA_HOME`` or ``~/.local/share``).

        An empty ``$XDG_DATA_HOME`` is treated like an unset one.
        """
        if not self._xdg_data_home:
            self._xdg_data_home = environ.get('XDG_DATA_HOME') or join(self.userdir, '.local', 'share')
        return self._xdg_data_home

    @property
    def xdg_config_home(self):
        """
        Base directory for user configuration (``$XDG_CONFIG_HOME`` or ``~/.config``).

        An empty ``$XDG_CONFIG_HOME`` is treated like an unset one.
        """
        if not self._xdg_config_home:
            self._xdg_config_home = environ.get('XDG_CONFIG_HOME') or join(self.userdir, '.config')
        return self._xdg_config_home

    def save_user_list(self, database=None):
        """
        Write ``database`` (default: ``self.database``) to the user resource list.

        An explicitly passed empty dict is written as such; only ``None``
        selects ``self.database``.
        """
        if database is None:
            database = self.database
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(safe_dump(database))

    def load_resource_list(self, list_filename, database=None):
        """
        Merge the resource list in ``list_filename`` into ``database``.

        Args:
            list_filename (Path): YAML file to load; silently skipped if it is
                not a file
            database (dict): registry to merge into (default: ``self.database``)

        Returns:
            the updated ``database``

        Raises:
            ValueError: if the loaded list does not validate
        """
        if database is None:
            database = self.database
        if list_filename.is_file():
            with open(list_filename, 'r', encoding='utf-8') as f:
                list_loaded = safe_load(f) or {}
            report = OcrdResourceListValidator.validate(list_loaded)
            if not report.is_valid:
                self.log.error('\n'.join(report.errors))
                raise ValueError("Resource list %s is invalid!" % (list_filename))
            for executable, resource_list in list_loaded.items():
                loaded_names = {resdict.get('name') for resdict in resource_list}
                # Prepend, so user provided is sorted before builtin. Entries
                # already registered under the same name are dropped so that
                # re-loading a list does not accumulate duplicates.
                database[executable] = resource_list + [
                    resdict for resdict in database.get(executable, [])
                    if resdict.get('name') not in loaded_names
                ]
        return database

    def list_available(self, executable=None, dynamic=True):
        """
        List models available for download by processor

        Args:
            executable (str): only list executables whose name starts with this
            dynamic (bool): additionally ask matching executables found on
                ``PATH`` for the resources declared in their ocrd-tool JSON.
                Only done when ``executable`` is given.

        Returns:
            list of ``(executable, resources)`` tuples if ``executable`` is
            given, otherwise the ``items()`` view of the registry
        """
        if dynamic and executable:
            exec_dirs = [d for d in environ.get('PATH', '').split(path.pathsep) if d]
            for exec_dir in exec_dirs:
                for exec_path in Path(exec_dir).glob(f'{executable}*'):
                    self.log.info(f"Inspecting '{exec_path} --dump-json' for resources")
                    ocrd_tool = get_ocrd_tool_json(exec_path)
                    for resdict in ocrd_tool.get('resources', ()):
                        known = self.database.setdefault(exec_path.name, [])
                        # Replace (rather than duplicate) same-named entries;
                        # rebuild the list instead of removing while iterating.
                        known[:] = [res for res in known if res['name'] != resdict['name']]
                        known.append(resdict)
        if executable:
            return [(k, reslist) for k, reslist in self.database.items() if k.startswith(executable)]
        return self.database.items()

    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Resources found on disk but missing from the registry get a stub entry
        in the user resource list.

        Returns:
            list of ``(executable, resources)`` tuples, sorted by executable;
            every resource dict carries the ``path`` it was found at
        """
        ret = []
        if executable:
            all_executables = [executable]
        else:
            # resources we know about
            all_executables = list(self.database.keys())
            # resources in the file system
            parent_dirs = [join(x, 'ocrd-resources') for x in [self.xdg_data_home, '/usr/local/share']]
            for parent_dir in parent_dirs:
                if Path(parent_dir).is_dir():
                    all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')]
        # sorted() makes the result order reproducible across runs
        for this_executable in sorted(set(all_executables)):
            reslist = []
            mimetypes = get_processor_resource_types(this_executable)
            for res_filename in list_all_resources(this_executable, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if '*/*' not in mimetypes:
                    # skip directories unless the processor accepts them ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and files if the processor accepts directories only
                    if res_filename.is_file() and ['text/directory'] == mimetypes:
                        continue
                res_name = res_filename.name
                resdict = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if not resdict:
                    self.log.info("%s resource '%s' (%s) not a known resource, creating stub in %s'", this_executable, res_name, str(res_filename), self.user_list)
                    resdict = [self.add_to_user_database(this_executable, res_filename)]
                resdict[0]['path'] = str(res_filename)
                reslist.append(resdict[0])
            ret.append((this_executable, reslist))
        return ret

    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        If the user list already has an entry of that name for ``executable``,
        that entry is returned unchanged.

        Returns:
            the (new or existing) resource dict
        """
        res_name = Path(res_filename).name
        if Path(res_filename).is_dir():
            res_size = directory_size(res_filename)
        else:
            res_size = Path(res_filename).stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        resources_found = self.find_resources(executable=executable, name=res_name, database=user_database)
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': 'Found at %s on %s' % (self.resource_dir_to_location(res_filename), datetime.now()),
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            resdict = resources_found[0][1]
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict

    def find_resources(self, executable=None, name=None, url=None, database=None):
        """
        Find resources in the registry

        Args:
            executable (str): restrict the search to this executable
            name (str): match resources with this name
            url (str): match resources with this URL
            database (dict): registry to search (default: ``self.database``);
                an explicitly passed empty dict is searched as such

        Returns:
            list of ``(executable, resdict)`` tuples; without ``name`` and
            ``url`` every resource of the selected executable(s) matches
        """
        if database is None:
            database = self.database
        if executable and executable not in database:
            return []
        executables = [executable] if executable else list(database)
        ret = []
        for this_executable in executables:
            for resdict in database[this_executable]:
                matches = (
                    (not name and not url)
                    or (url and url == resdict.get('url'))
                    or (name and name == resdict.get('name'))
                )
                if matches:
                    ret.append((this_executable, resdict))
        return ret

    @property
    def default_resource_dir(self):
        """
        Resource directory for the ``data`` location.
        """
        return self.location_to_resource_dir('data')

    def location_to_resource_dir(self, location):
        """
        Map a location keyword (``system``, ``data``, anything else = cwd) to a directory.
        """
        if location == 'system':
            return '/usr/local/share/ocrd-resources'
        if location == 'data':
            return join(self.xdg_data_home, 'ocrd-resources')
        return getcwd()

    def resource_dir_to_location(self, resource_path):
        """
        Map a path to its location keyword (``system``, ``data``, ``cwd``).

        Paths below none of the known resource directories are returned
        unchanged (as a string).
        """
        resource_path = str(resource_path)
        if resource_path.startswith('/usr/local/share/ocrd-resources'):
            return 'system'
        if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')):
            return 'data'
        if resource_path.startswith(getcwd()):
            return 'cwd'
        return resource_path

    def parameter_usage(self, name, usage='as-is'):
        """
        Derive the parameter value to use for resource ``name``.

        Raises:
            ValueError: if ``usage`` is neither ``as-is`` nor ``without-extension``
        """
        if usage == 'as-is':
            return name
        if usage == 'without-extension':
            return Path(name).stem
        raise ValueError("No such usage '%s'" % usage)

    def _download_impl(self, url, filename, progress_cb=None, size=None):
        """
        Stream ``url`` to ``filename``.

        Args:
            url (str): HTTP(S) URL to fetch
            filename: destination file
            progress_cb (callable): called with the byte count of every chunk
            size: accepted for interface compatibility; not used

        Raises:
            requests.HTTPError: if the server answers with an error status
        """
        log = getLogger('ocrd.resource_manager._download_impl')
        log.info("Downloading %s to %s", url, filename)
        with requests.get(url, stream=True) as r:
            # Fail before touching the destination instead of saving an
            # error page as if it were the resource.
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for data in r.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(data))
                    f.write(data)

    @staticmethod
    def _copy_file(src_filename, filename, progress_cb=None, chunk_size=4096):
        """
        Copy one file chunk-wise, reporting every chunk's size to ``progress_cb``.
        """
        # Open the source first so a missing source does not leave an empty
        # destination file behind.
        with open(src_filename, 'rb') as f_in, open(filename, 'wb') as f_out:
            for chunk in iter(lambda: f_in.read(chunk_size), b''):
                f_out.write(chunk)
                if progress_cb:
                    progress_cb(len(chunk))

    def _copy_impl(self, src_filename, filename, progress_cb=None):
        """
        Copy a file, or recursively a directory, from ``src_filename`` to ``filename``.

        Args:
            src_filename: source file or directory
            filename: destination path
            progress_cb (callable): called with the byte count of every chunk
        """
        log = getLogger('ocrd.resource_manager._copy_impl')
        log.info("Copying %s to %s", src_filename, filename)
        src_root = Path(src_filename)
        if not src_root.is_dir():
            self._copy_file(src_filename, filename, progress_cb)
            return
        log.info(f"Copying recursively from {src_filename} to {filename}")
        dst_root = Path(filename)
        dst_root.mkdir(parents=True, exist_ok=True)
        for child in src_root.rglob('*'):
            child_dst = dst_root / child.relative_to(src_root)
            if child.is_dir():
                # rglob also yields directories: recreate them (keeps empty
                # ones) instead of trying to open them as files
                child_dst.mkdir(parents=True, exist_ok=True)
                continue
            child_dst.parent.mkdir(parents=True, exist_ok=True)
            self._copy_file(child, child_dst, progress_cb)

    # TODO Proper caching (make head request for size, If-Modified etc)
    def download(
        self,
        executable,
        url,
        basedir,
        overwrite=False,
        no_subdir=False,
        name=None,
        resource_type='file',
        path_in_archive='.',
        progress_cb=None,
        size=None,
    ):
        """
        Download a resource by URL

        Args:
            executable (str): processor the resource belongs to
            url (str): ``http(s)://`` URL, or a local path to copy from
            basedir: directory to store resources in
            overwrite (bool): replace an already existing resource
            no_subdir (bool): store directly in ``basedir`` instead of
                ``basedir/executable``
            name (str): resource name (default: last path segment of ``url``)
            resource_type (str): ``file``, ``directory`` or ``archive``
            path_in_archive (str): for archives, the member to keep
            progress_cb (callable): called with the byte count of every chunk
            size: passed on to the downloader

        Returns:
            Path of the resource
        """
        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)
        is_url = url.startswith(('https://', 'http://'))
        if fpath.exists() and not overwrite:
            log.info("%s to be %s to %s which already exists and overwrite is False",
                     url, 'downloaded' if is_url else 'copied', fpath)
            return fpath
        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type in ('file', 'directory'):
            if is_url:
                self._download_impl(url, fpath, progress_cb, size)
            else:
                self._copy_impl(url, fpath, progress_cb)
        elif resource_type == 'archive':
            # pushd_popd changes the working directory, so relative paths
            # must be made absolute before entering it.
            fpath_abs = fpath.absolute()
            src_abs = None if is_url else str(Path(url).absolute())
            with pushd_popd(tempdir=True):
                if is_url:
                    self._download_impl(url, 'download.tar.xx', progress_cb, size)
                else:
                    self._copy_impl(src_abs, 'download.tar.xx', progress_cb)
                Path('out').mkdir()
                with pushd_popd('out'):
                    log.info("Extracting archive")
                    # NOTE(review): extractall() trusts member paths; an
                    # archive from an untrusted source can write outside of
                    # 'out' -- consider an extraction filter.
                    with open_tarfile('../download.tar.xx', 'r:*') as tar:
                        tar.extractall()
                    log.info("Copying '%s' from archive to %s" % (path_in_archive, fpath))
                    if Path(path_in_archive).is_dir() and not fpath_abs.exists():
                        copytree(path_in_archive, str(fpath_abs))
                    else:
                        # copytree handles neither a single file nor an
                        # already existing destination (overwrite=True)
                        self._copy_impl(path_in_archive, str(fpath_abs))
        return fpath
305