Passed
Pull Request — master (#1309)
by
unknown
08:02
created

OcrdResourceManager._copy_dir()   A

Complexity

Conditions 4

Size

Total Lines 12
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 12
rs 9.85
c 0
b 0
f 0
cc 4
nop 4
1
from logging import Logger
2
from pathlib import Path
3
from os.path import join
4
from os import environ, listdir, getcwd, unlink
5
from shutil import copytree, rmtree, copy
6
from fnmatch import filter as apply_glob
7
from datetime import datetime
8
from tarfile import open as open_tarfile
9
from urllib.parse import urlparse, unquote
10
from zipfile import ZipFile
11
12
import requests
13
from gdown.parse_url import parse_url as gparse_url
14
from gdown.download import get_url_from_gdrive_confirmation
15
from yaml import safe_load, safe_dump
16
17
# pylint: disable=wrong-import-position
18
19
# https://github.com/OCR-D/core/issues/867
20
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
21
import yaml.constructor
22
# Register the plain-string constructor for the YAML timestamp tag, so scalars
# tagged as timestamps are constructed the same way as strings.
yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = (
    yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str']
)
24
25
# pylint: enable=wrong-import-position
26
27
# pylint: enable=wrong-import-position
28
29
# pylint: enable=wrong-import-position
30
31
from ocrd_validators import OcrdResourceListValidator
32
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
33
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
34
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
35
36
37
class OcrdResourceManager:
38
39
    """
40
    Managing processor resources
41
    """
42
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
        """
        Set up the manager and, unless ``skip_init`` is set, load the resource lists.

        Args:
            userdir: value returned by :py:attr:`userdir` (``config.HOME`` if falsy)
            xdg_config_home: value returned by :py:attr:`xdg_config_home`
                (``config.XDG_CONFIG_HOME`` if falsy)
            xdg_data_home: value returned by :py:attr:`xdg_data_home`
                (``config.XDG_DATA_HOME`` if falsy)
            skip_init: if true, no resource list is read and no user list is created
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._xdg_data_home = xdg_data_home
        self._xdg_config_home = xdg_config_home
        self._userdir = userdir
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        if skip_init:
            return

        # First the list shipped at RESOURCE_LIST_FILENAME ...
        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            self.save_user_list()
        # ... then the user list, whose entries get prepended
        self.load_resource_list(self.user_list)
58
59
    @property
60
    def userdir(self):
61
        if not self._userdir:
62
            self._userdir = config.HOME
63
        return self._userdir
64
65
    @property
66
    def xdg_data_home(self):
67
        if not self._xdg_data_home:
68
            self._xdg_data_home = config.XDG_DATA_HOME
69
        return self._xdg_data_home
70
71
    @property
72
    def xdg_config_home(self):
73
        if self._xdg_config_home:
74
            return self._xdg_config_home
75
        return config.XDG_CONFIG_HOME
76
77
    def save_user_list(self, database=None):
        """
        Write a resource database to the user resource list file (``self.user_list``).

        The file consists of ``RESOURCE_USER_LIST_COMMENT``, a newline and the
        YAML dump of the database.

        Args:
            database: mapping to serialize. Only ``None`` selects ``self.database``;
                an explicitly passed empty mapping is written as such.
        """
        if database is None:
            database = self.database
        # Serialize before opening the file, so a failing dump cannot leave a truncated list behind
        dumped = safe_dump(database)
        with open(self.user_list, 'w', encoding='utf-8') as f:
            f.write(RESOURCE_USER_LIST_COMMENT)
            f.write('\n')
            f.write(dumped)
84
85
    def load_resource_list(self, list_filename, database=None):
86
        if not database:
87
            database = self.database
88
        if list_filename.is_file():
89
            with open(list_filename, 'r', encoding='utf-8') as f:
90
                list_loaded = safe_load(f) or {}
91
            report = OcrdResourceListValidator.validate(list_loaded)
92
            if not report.is_valid:
93
                self.log.error('\n'.join(report.errors))
94
                raise ValueError(f"Resource list {list_filename} is invalid!")
95
            for executable, resource_list in list_loaded.items():
96
                if executable not in database:
97
                    database[executable] = []
98
                # Prepend, so user provided is sorted before builtin
99
                database[executable] = list_loaded[executable] + database[executable]
100
        return database
101
102
    def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
103
        """
104
        List models available for download by processor
105
        """
106
        if not database:
107
            database = self.database
108
        if not executable:
109
            return database.items()
110
        if dynamic:
111
            skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
112
            for exec_dir in environ['PATH'].split(':'):
113
                for exec_path in Path(exec_dir).glob(f'{executable}'):
114
                    if not exec_path.name.startswith('ocrd-'):
115
                        self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
116
                    if exec_path.name in skip_executables:
117
                        self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
118
                        continue
119
                    self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
120
                    ocrd_tool = get_ocrd_tool_json(exec_path)
121
                    for resdict in ocrd_tool.get('resources', ()):
122
                        if exec_path.name not in database:
123
                            database[exec_path.name] = []
124
                        database[exec_path.name].insert(0, resdict)
125
            database = self._dedup_database(database)
126
        found = False
127
        ret = []
128
        for k in database:
129
            if apply_glob([k], executable):
130
                found = True
131
                restuple = (k, [])
132
                ret.append(restuple)
133
                for resdict in database[k]:
134
                    if name and resdict['name'] != name:
135
                        continue
136
                    if url and resdict['url'] != url:
137
                        continue
138
                    restuple[1].append(resdict)
139
        if not found:
140
            ret = [(executable, [])]
141
        return ret
142
143
    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Args:
            executable: restrict the listing to this executable. If falsy, all
                executables in ``self.database`` are covered, plus every ``ocrd-*``
                entry found in the ``ocrd-resources`` directories below
                ``self.xdg_data_home`` and ``/usr/local/share``.

        Returns:
            list of ``(executable, [resdict, ...])`` tuples. Every resdict carries the
            resource's location under ``path``; note that this key is also set on the
            dicts taken from ``self.database``.
        """
        if executable:
            candidates = [executable]
        else:
            # resources we know about
            candidates = list(self.database.keys())
            # resources in the file system
            for base_dir in [self.xdg_data_home, '/usr/local/share']:
                parent_dir = join(base_dir, 'ocrd-resources')
                if Path(parent_dir).exists():
                    candidates.extend([entry for entry in listdir(parent_dir) if entry.startswith('ocrd-')])

        installed = []
        for this_executable in set(candidates):
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            found = []
            for res_path in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_path = Path(res_path)
                if '*/*' not in mimetypes:
                    # directories only count if the processor accepts them ...
                    if res_path.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and files are dropped if directories are all it accepts
                    if res_path.is_file() and ['text/directory'] == mimetypes:
                        continue
                res_name = res_path.name
                if res_path.is_file():
                    res_type = 'file'
                else:
                    res_type = 'directory'
                if res_path.is_file():
                    res_size = res_path.stat().st_size
                else:
                    res_size = directory_size(res_path)
                registered = [entry for entry in self.database.get(this_executable, []) if entry['name'] == res_name]
                if registered:
                    resdict = registered[0]
                elif str(res_path.parent) == moduledir:
                    resdict = dict(
                        name=res_name,
                        url=str(res_path),
                        description='Found at module',
                        type=res_type,
                        size=res_size,
                    )
                else:
                    resdict = self.add_to_user_database(this_executable, res_path, resource_type=res_type)
                resdict['path'] = str(res_path)
                found.append(resdict)
            installed.append((this_executable, found))
        return installed
189
190
    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list is re-read from ``self.user_list``. If it has no resource
        named like ``res_filename`` for ``executable``, a stub entry is appended.
        In either case the list is written back and loaded into ``self.database``
        again.

        Args:
            executable: name of the processor the resource belongs to
            res_filename: path of the resource file or directory
            url: URL to record in the stub (``'???'`` if falsy)
            resource_type: value for the stub's ``type`` field

        Returns:
            the resource dict found in, or newly added to, the user list
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        self.log.info(f"{executable} resource '{res_name}' ({res_filename}) not a known resource, "
                      f"creating stub in {self.user_list}")
        if res_path.is_dir():
            res_size = directory_size(res_filename)
        else:
            res_size = res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        if executable not in user_database:
            user_database[executable] = []
        resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
        if not resources_found:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        else:
            resdict = resources_found[0]
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict
221
222
    @property
223
    def default_resource_dir(self):
224
        return self.location_to_resource_dir('data')
225
226
    def location_to_resource_dir(self, location):
227
        if location == 'data':
228
            return join(self.xdg_data_home, 'ocrd-resources')
229
        if location == 'system':
230
            return '/usr/local/share/ocrd-resources'
231
        return getcwd()
232
233
    def resource_dir_to_location(self, resource_path):
234
        resource_path = str(resource_path)
235
        if resource_path.startswith('/usr/local/share/ocrd-resources'):
236
            return 'system'
237
        if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')):
238
            return 'data'
239
        if resource_path.startswith(getcwd()):
240
            return 'cwd'
241
        return resource_path
242
243
    @staticmethod
244
    def parameter_usage(name, usage='as-is'):
245
        if usage == 'as-is':
246
            return name
247
        elif usage == 'without-extension':
248
            return Path(name).stem
249
        raise ValueError(f"No such usage '{usage}'")
250
251
    @staticmethod
    def _download_impl(log: Logger, url, filename, progress_cb=None, size=None):
        """
        Stream the content at ``url`` into ``filename``.

        If ``gparse_url`` recognizes a Google Drive file id in ``url``, the URL is
        first rewritten to a direct download link and, when the response lacks a
        ``Content-Disposition`` header, replaced by the URL taken from the
        confirmation page.

        Args:
            log: logger for progress and warning messages
            url: URL to fetch
            filename: path of the file to write
            progress_cb: optional callable invoked with the byte length of every chunk
            size: not used by this implementation

        Raises:
            Exception: whatever the request or the file I/O raises; before re-raising,
                ``filename`` is removed again (as directory tree and as file)
        """
        log.info(f"Downloading {url} to {filename}")
        try:
            gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
            if gdrive_file_id:
                if not is_gdrive_download_link:
                    url = f"https://drive.google.com/uc?id={gdrive_file_id}"
                try:
                    with requests.get(url, stream=True) as probe:
                        needs_confirmation = "Content-Disposition" not in probe.headers
                        if needs_confirmation:
                            url = get_url_from_gdrive_confirmation(probe.text)
                except RuntimeError as gdrive_error:
                    log.warning(f"Cannot unwrap Google Drive URL: {gdrive_error}")
            # The target file is opened (and thereby created) before the request is sent
            with open(filename, 'wb') as target, requests.get(url, stream=True) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(chunk))
                    target.write(chunk)
        except Exception as error:
            # Do not leave a partial download behind
            rmtree(filename, ignore_errors=True)
            Path(filename).unlink(missing_ok=True)
            raise error
276
277
    @staticmethod
278
    def _copy_file(log: Logger, src, dst, progress_cb=None):
279
        log.info(f"Copying file {src} to {dst}")
280
        with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
281
            while True:
282
                chunk = f_in.read(4096)
283
                if chunk:
284
                    f_out.write(chunk)
285
                    if progress_cb:
286
                        progress_cb(len(chunk))
287
                else:
288
                    break
289
290
    @staticmethod
291
    def _copy_dir(log: Logger, src, dst, progress_cb=None):
292
        log.info(f"Copying dir recursively from {src} to {dst}")
293
        if not Path(src).is_dir():
294
            raise ValueError(f"The source is not a directory: {src}")
295
        Path(dst).mkdir(parents=True, exist_ok=True)
296
        for child in Path(src).rglob('*'):
297
            child_dst = Path(dst) / child.relative_to(src)
298
            if Path(child).is_dir():
299
                OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
300
            else:
301
                OcrdResourceManager._copy_file(child, child_dst, progress_cb)
302
303
    @staticmethod
304
    def _copy_impl(log: Logger, src_filename, filename, progress_cb=None):
305
        log.info(f"Copying {src_filename} to {filename}")
306
        if Path(src_filename).is_dir():
307
            OcrdResourceManager._copy_dir(log, src_filename, filename, progress_cb)
308
        else:
309
            OcrdResourceManager._copy_file(log, src_filename, filename, progress_cb)
310
311
    def _download_archive(self, log: Logger, url: str, path_in_archive: str, fpath: Path, progress_cb=None):
        """
        Fetch an archive, unpack it in a temporary directory and copy one member to ``fpath``.

        Args:
            log: logger for progress messages
            url: ``http(s)://`` URL to download, or anything else to copy as a local path
            path_in_archive: path of the file or directory to take, relative to the
                root of the unpacked archive
            fpath: target path of the copy
            progress_cb: optional callable, passed on to the download/copy step

        Raises:
            RuntimeError: if the media type guessed for the archive is neither zip,
                gzip nor xz

        NOTE(review): the copy runs while ``pushd_popd`` is active, so a relative
        ``fpath`` (or local ``url``) is presumably resolved against the temporary
        directory - confirm against ``pushd_popd``.
        NOTE(review): ``extractall()`` is called without member filtering; confirm
        that archives come from trusted sources only.
        """
        archive_fname = 'download.tar.xx'
        with pushd_popd(tempdir=True) as tempdir:
            is_remote = url.startswith(('https://', 'http://'))
            fetch = self._download_impl if is_remote else self._copy_impl
            fetch(log, url, archive_fname, progress_cb)
            Path('out').mkdir()
            with pushd_popd('out'):
                # seen from within ``out``, the archive sits one level up
                archive_path = f'../{archive_fname}'
                mimetype = guess_media_type(archive_path, fallback='application/octet-stream')
                log.info(f"Extracting {mimetype} archive to {tempdir}/out")
                if mimetype == 'application/zip':
                    with ZipFile(archive_path, 'r') as zipf:
                        zipf.extractall()
                elif mimetype in ('application/gzip', 'application/x-xz'):
                    with open_tarfile(archive_path, 'r:*') as tar:
                        tar.extractall()
                else:
                    raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
                log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
                copy_member = copytree if Path(path_in_archive).is_dir() else copy
                copy_member(path_in_archive, str(fpath))
335
336
    # TODO Proper caching (make head request for size, If-Modified etc)
337
    def download_resource(
        self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
        path_in_archive='.', progress_cb=None,
    ):
        """
        Download a resource by URL

        Args:
            executable: name of the processor; used as subdirectory of ``basedir``
                unless ``no_subdir`` is set
            url: ``http(s)://`` URL to download, or anything else to copy as a local path
            basedir: directory to place the resource in
            overwrite: if true, an existing target is removed and fetched again
            no_subdir: if true, place the resource directly in ``basedir``
            name: file name of the target; defaults to the last component of the
                (unquoted) URL path
            resource_type: ``'file'`` or ``'directory'`` fetch ``url`` as is,
                ``'archive'`` unpacks it and takes ``path_in_archive``
            path_in_archive: path inside the archive (for ``resource_type='archive'``)
            progress_cb: optional callable, passed on to the download/copy step

        Returns:
            Path of the target
        """
        log = getLogger('ocrd.resource_manager.download')
        if no_subdir:
            destdir = Path(basedir)
        else:
            destdir = Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)
        if fpath.exists():
            target_is_dir = fpath.is_dir()
            if not overwrite:
                fpath_type = 'Directory' if target_is_dir else 'File'
                # NOTE: an existing target is only logged here, no FileExistsError is raised
                log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
                return fpath
            if target_is_dir:
                log.info(f"Removing existing target directory {fpath}")
                rmtree(str(fpath))
            else:
                log.info(f"Removing existing target file {fpath}")
                unlink(str(fpath))
        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type == 'archive':
            self._download_archive(log, url, path_in_archive, fpath, progress_cb)
        elif resource_type in ('file', 'directory'):
            if url.startswith(('https://', 'http://')):
                self._download_impl(log, url, fpath, progress_cb)
            else:
                self._copy_impl(log, url, fpath, progress_cb)
        return fpath
371
372
    def _dedup_database(self, database=None, dedup_key='name'):
373
        """
374
        Deduplicate resources by name
375
        """
376
        if not database:
377
            database = self.database
378
        for executable, reslist in database.items():
379
            reslist_dedup = []
380
            for resdict in reslist:
381
                if not any(r[dedup_key] == resdict[dedup_key] for r in reslist_dedup):
382
                    reslist_dedup.append(resdict)
383
            database[executable] = reslist_dedup
384
        return database
385