ocrd.resource_manager.OcrdResourceManager._copy_file() - Code Metrics - Inspection of "Implementation of the resource manager server (iss..." - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#1309)

unknown

created 2025-03-05 16:22 UTC

OcrdResourceManager._copy_file() A

↳ Parent: ocrd.resource_manager

Complexity

Conditions

Size

Total Lines	12
Code Lines	11

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	11
dl	0
loc	12
rs	9.3333
c	0
b	0
f	0
cc	5
nop	4

from logging import Logger
from pathlib import Path
from os.path import join
from os import environ, listdir, getcwd, unlink
from shutil import copytree, rmtree, copy
from fnmatch import filter as apply_glob
from datetime import datetime
from tarfile import open as open_tarfile
from urllib.parse import urlparse, unquote
from zipfile import ZipFile

import requests
from gdown.parse_url import parse_url as gparse_url
from gdown.download import get_url_from_gdrive_confirmation
from yaml import safe_load, safe_dump

# pylint: disable=wrong-import-position

# https://github.com/OCR-D/core/issues/867
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
# Register the plain-string constructor for the YAML timestamp tag on
# SafeConstructor, so that safe_load hands timestamp-tagged scalars back as
# strings (background in the two links above).
import yaml.constructor
yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \
    yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str']

# pylint: enable=wrong-import-position

# pylint: enable=wrong-import-position

# pylint: enable=wrong-import-position

from ocrd_validators import OcrdResourceListValidator
from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT


class OcrdResourceManager:

    """
    Managing processor resources
    """
    def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
        """
        Set up the in-memory resource database and the path of the user list.

        Args:
            userdir: overrides ``config.HOME`` for :py:attr:`userdir`
            xdg_config_home: overrides ``config.XDG_CONFIG_HOME``
            xdg_data_home: overrides ``config.XDG_DATA_HOME``
            skip_init: if true, neither the bundled nor the user resource
                list is loaded (and the user list file is not created)
        """
        self.log = getLogger('ocrd.resource_manager')
        self.database = {}

        self._userdir = userdir
        self._xdg_config_home = xdg_config_home
        self._xdg_data_home = xdg_data_home
        # must come after _xdg_config_home is set, since the property reads it
        self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')

        if skip_init:
            return

        # bundled list first, then the user list (which gets prepended per executable)
        self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
        if not self.user_list.exists():
            config_dir = self.user_list.parent
            if not config_dir.exists():
                config_dir.mkdir(parents=True)
            self.save_user_list()
        self.load_resource_list(self.user_list)

    @property
    def userdir(self):
        if not self._userdir:
            self._userdir = config.HOME
        return self._userdir

    @property
    def xdg_data_home(self):
        if not self._xdg_data_home:
            self._xdg_data_home = config.XDG_DATA_HOME
        return self._xdg_data_home

    @property
    def xdg_config_home(self):
        if self._xdg_config_home:
            return self._xdg_config_home
        return config.XDG_CONFIG_HOME

    def save_user_list(self, database=None):
        """
        Write a resource database to the user list file as YAML.

        The file starts with ``RESOURCE_USER_LIST_COMMENT`` and a newline,
        followed by the dumped database. When ``database`` is empty or not
        given, ``self.database`` is written instead.
        """
        to_write = database or self.database
        with open(self.user_list, 'w', encoding='utf-8') as out:
            out.write(RESOURCE_USER_LIST_COMMENT)
            out.write('\n')
            out.write(safe_dump(to_write))

    def load_resource_list(self, list_filename, database=None):
        if not database:
            database = self.database
        if list_filename.is_file():
            with open(list_filename, 'r', encoding='utf-8') as f:
                list_loaded = safe_load(f) or {}
            report = OcrdResourceListValidator.validate(list_loaded)
            if not report.is_valid:
                self.log.error('\n'.join(report.errors))
                raise ValueError(f"Resource list {list_filename} is invalid!")
            for executable, resource_list in list_loaded.items():
                if executable not in database:
                    database[executable] = []
                # Prepend, so user provided is sorted before builtin
                database[executable] = list_loaded[executable] + database[executable]
        return database

    def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
        """
        List models available for download by processor
        """
        if not database:
            database = self.database
        if not executable:
            return database.items()
        if dynamic:
            skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
            for exec_dir in environ['PATH'].split(':'):
                for exec_path in Path(exec_dir).glob(f'{executable}'):
                    if not exec_path.name.startswith('ocrd-'):
                        self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
                    if exec_path.name in skip_executables:
                        self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
                        continue
                    self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
                    ocrd_tool = get_ocrd_tool_json(exec_path)
                    for resdict in ocrd_tool.get('resources', ()):
                        if exec_path.name not in database:
                            database[exec_path.name] = []
                        database[exec_path.name].insert(0, resdict)
            database = self._dedup_database(database)
        found = False
        ret = []
        for k in database:
            if apply_glob([k], executable):
                found = True
                restuple = (k, [])
                ret.append(restuple)
                for resdict in database[k]:
                    if name and resdict['name'] != name:
                        continue
                    if url and resdict['url'] != url:
                        continue
                    restuple[1].append(resdict)
        if not found:
            ret = [(executable, [])]
        return ret

    def list_installed(self, executable=None):
        """
        List installed resources, matching with registry by ``name``

        Without ``executable``, all executables in the database are
        considered, plus every ``ocrd-*`` entry found in the
        ``ocrd-resources`` directory of the data and system locations.

        Returns:
            list of ``(executable, [resource dicts])`` tuples; each resource
            dict additionally carries the ``path`` where it was found
        """
        if executable:
            all_executables = [executable]
        else:
            # resources we know about
            all_executables = list(self.database.keys())
            # resources in the file system
            for base_dir in (self.xdg_data_home, '/usr/local/share'):
                parent_dir = join(base_dir, 'ocrd-resources')
                if Path(parent_dir).exists():
                    all_executables.extend(x for x in listdir(parent_dir) if x.startswith('ocrd-'))
        ret = []
        for this_executable in set(all_executables):
            mimetypes = get_processor_resource_types(this_executable)
            moduledir = get_moduledir(this_executable)
            reslist = []
            for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
                res_filename = Path(res_filename)
                if '*/*' not in mimetypes:
                    # skip directories unless the processor accepts them ...
                    if res_filename.is_dir() and 'text/directory' not in mimetypes:
                        continue
                    # ... and skip files if the processor accepts nothing but directories
                    if res_filename.is_file() and mimetypes == ['text/directory']:
                        continue
                res_name = res_filename.name
                if res_filename.is_file():
                    res_type = 'file'
                    res_size = res_filename.stat().st_size
                else:
                    res_type = 'directory'
                    res_size = directory_size(res_filename)
                # looked up afresh each time: add_to_user_database() below
                # reloads the user list and thereby replaces the registry list
                registered = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
                if registered:
                    resdict = registered[0]
                elif str(res_filename.parent) == moduledir:
                    resdict = {
                        'name': res_name,
                        'url': str(res_filename),
                        'description': 'Found at module',
                        'type': res_type,
                        'size': res_size
                    }
                else:
                    resdict = self.add_to_user_database(this_executable, res_filename, resource_type=res_type)
                resdict['path'] = str(res_filename)
                reslist.append(resdict)
            ret.append((this_executable, reslist))
        return ret

    def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
        """
        Add a stub entry to the user resource.yml

        The user list is read from disk, extended by a stub for
        ``res_filename`` unless an entry of the same name already exists for
        ``executable``, written back, and then reloaded into ``self.database``.

        Returns:
            the resource dict (newly created stub or pre-existing entry)
        """
        res_path = Path(res_filename)
        res_name = res_path.name
        self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
                      f"creating stub in {self.user_list}'")
        res_size = directory_size(res_filename) if res_path.is_dir() else res_path.stat().st_size
        with open(self.user_list, 'r', encoding='utf-8') as f:
            user_database = safe_load(f) or {}
        user_database.setdefault(executable, [])
        resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
        if resources_found:
            resdict = resources_found[0]
        else:
            resdict = {
                'name': res_name,
                'url': url if url else '???',
                'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
                'version_range': '???',
                'type': resource_type,
                'size': res_size
            }
            user_database[executable].append(resdict)
        self.save_user_list(user_database)
        self.load_resource_list(self.user_list)
        return resdict

    @property
    def default_resource_dir(self):
        return self.location_to_resource_dir('data')

    def location_to_resource_dir(self, location):
        if location == 'data':
            return join(self.xdg_data_home, 'ocrd-resources')
        if location == 'system':
            return '/usr/local/share/ocrd-resources'
        return getcwd()

    def resource_dir_to_location(self, resource_path):
        resource_path = str(resource_path)
        if resource_path.startswith('/usr/local/share/ocrd-resources'):
            return 'system'
        if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')):
            return 'data'
        if resource_path.startswith(getcwd()):
            return 'cwd'
        return resource_path

    @staticmethod
    def parameter_usage(name, usage='as-is'):
        if usage == 'as-is':
            return name
        elif usage == 'without-extension':
            return Path(name).stem
        raise ValueError(f"No such usage '{usage}'")

    @staticmethod
    def _download_impl(log: Logger, url, filename, progress_cb=None, size=None):
        """
        Stream ``url`` into ``filename`` in chunks of 4096 bytes.

        Google Drive URLs are first rewritten to their download form; if the
        response then carries no ``Content-Disposition`` header, the real URL
        is taken from the confirmation page.

        Args:
            log: logger to report to
            url: HTTP(S) URL to fetch
            filename: path of the file to write
            progress_cb: if set, called with the length of every chunk
                before that chunk is written
            size: accepted but not used here

        Raises:
            Exception: whatever the download raised, after the target has
                been removed (both as a directory tree and as a file)
        """
        log.info(f"Downloading {url} to {filename}")
        try:
            gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
            if gdrive_file_id:
                if not is_gdrive_download_link:
                    url = f"https://drive.google.com/uc?id={gdrive_file_id}"
                try:
                    with requests.get(url, stream=True) as probe:
                        if "Content-Disposition" not in probe.headers:
                            url = get_url_from_gdrive_confirmation(probe.text)
                except RuntimeError as e:
                    log.warning(f"Cannot unwrap Google Drive URL: {e}")
            with open(filename, 'wb') as target, requests.get(url, stream=True) as response:
                response.raise_for_status()
                for block in response.iter_content(chunk_size=4096):
                    if progress_cb:
                        progress_cb(len(block))
                    target.write(block)
        except Exception as e:
            # leave nothing half-written behind, whatever kind of entry it is
            rmtree(filename, ignore_errors=True)
            Path(filename).unlink(missing_ok=True)
            raise e

    @staticmethod
    def _copy_file(log: Logger, src, dst, progress_cb=None):
        log.info(f"Copying file {src} to {dst}")
        with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
            while True:
                chunk = f_in.read(4096)
                if chunk:
                    f_out.write(chunk)
                    if progress_cb:
                        progress_cb(len(chunk))
                else:
                    break

    @staticmethod
    def _copy_dir(log: Logger, src, dst, progress_cb=None):
        log.info(f"Copying dir recursively from {src} to {dst}")
        if not Path(src).is_dir():
            raise ValueError(f"The source is not a directory: {src}")
        Path(dst).mkdir(parents=True, exist_ok=True)
        for child in Path(src).rglob('*'):
            child_dst = Path(dst) / child.relative_to(src)
            if Path(child).is_dir():
                OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
            else:
                OcrdResourceManager._copy_file(child, child_dst, progress_cb)

    @staticmethod
    def _copy_impl(log: Logger, src_filename, filename, progress_cb=None):
        log.info(f"Copying {src_filename} to {filename}")
        if Path(src_filename).is_dir():
            OcrdResourceManager._copy_dir(log, src_filename, filename, progress_cb)
        else:
            OcrdResourceManager._copy_file(log, src_filename, filename, progress_cb)

    def _download_archive(self, log: Logger, url: str, path_in_archive: str, fpath: Path, progress_cb=None):
        """
        Fetch an archive, unpack it and copy one entry of it to ``fpath``.

        Args:
            log: logger to report to
            url: where to get the archive from; downloaded if it starts with
                ``http://`` or ``https://``, otherwise copied as a local path
            path_in_archive: file or directory inside the unpacked archive
                that is to be copied
            fpath: destination of the copy
            progress_cb: passed on to the download / copy step

        Raises:
            RuntimeError: if the detected media type is none of
                ``application/zip``, ``application/gzip``, ``application/x-xz``
        """
        # fixed working name for the fetched archive, whatever it really is
        archive_fname = 'download.tar.xx'
        # presumably changes into a fresh temporary directory for the duration
        # of the block (the relative paths below rely on that) — confirm
        # against ocrd_utils.os.pushd_popd
        # NOTE(review): if so, a relative local `url` or a relative `fpath`
        # would be resolved against the temporary directory — verify callers
        # always pass absolute paths
        with pushd_popd(tempdir=True) as tempdir:
            if url.startswith('https://') or url.startswith('http://'):
                self._download_impl(log, url, archive_fname, progress_cb)
            else:
                self._copy_impl(log, url, archive_fname, progress_cb)
            Path('out').mkdir()
            with pushd_popd('out'):
                # the archive sits one level above the extraction directory
                mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
                log.info(f"Extracting {mimetype} archive to {tempdir}/out")
                # NOTE(review): both extractall() calls unpack every member
                # of a fetched archive without any member filtering — consider
                # validating member paths / using tarfile extraction filters
                if mimetype == 'application/zip':
                    with ZipFile(f'../{archive_fname}', 'r') as zipf:
                        zipf.extractall()
                elif mimetype in ('application/gzip', 'application/x-xz'):
                    # 'r:*' lets tarfile pick the decompression itself
                    with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
                        tar.extractall()
                else:
                    raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
                log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
                # path_in_archive is looked up relative to the extraction directory
                if Path(path_in_archive).is_dir():
                    copytree(path_in_archive, str(fpath))
                else:
                    copy(path_in_archive, str(fpath))

    # TODO Proper caching (make head request for size, If-Modified etc)
    def download_resource(
        self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
        path_in_archive='.', progress_cb=None,
    ):
        """
        Download a resource by URL

        Args:
            executable: processor the resource belongs to; used as
                subdirectory of ``basedir`` unless ``no_subdir`` is set
            url: HTTP(S) URL or local path of the resource
            basedir: directory to store the resource under
            overwrite: replace an existing target instead of keeping it
            no_subdir: store directly in ``basedir``
            name: target name; defaults to the last path segment of ``url``
            resource_type: ``file`` / ``directory`` are fetched as they are,
                ``archive`` is unpacked and ``path_in_archive`` copied from
                it; any other value fetches nothing
            path_in_archive: entry to take from an ``archive`` resource
            progress_cb: passed on to the download / copy step

        Returns:
            Path of the target. An existing target is kept (with a warning,
            no exception) when ``overwrite`` is not set.
        """
        log = getLogger('ocrd.resource_manager.download')
        destdir = Path(basedir) if no_subdir else Path(basedir, executable)
        if not name:
            name = Path(unquote(urlparse(url).path)).name
        fpath = Path(destdir, name)
        if fpath.exists():
            target_is_dir = fpath.is_dir()
            if not overwrite:
                fpath_type = 'Directory' if target_is_dir else 'File'
                log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
                return fpath
            if target_is_dir:
                log.info(f"Removing existing target directory {fpath}")
                rmtree(str(fpath))
            else:
                log.info(f"Removing existing target file {fpath}")
                unlink(str(fpath))
        destdir.mkdir(parents=True, exist_ok=True)
        if resource_type == 'archive':
            self._download_archive(log, url, path_in_archive, fpath, progress_cb)
        elif resource_type in ('file', 'directory'):
            if url.startswith(('https://', 'http://')):
                self._download_impl(log, url, fpath, progress_cb)
            else:
                self._copy_impl(log, url, fpath, progress_cb)
        return fpath

    def _dedup_database(self, database=None, dedup_key='name'):
        """
        Deduplicate resources by name
        """
        if not database:
            database = self.database
        for executable, reslist in database.items():
            reslist_dedup = []
            for resdict in reslist:
                if not any(r[dedup_key] == resdict[dedup_key] for r in reslist_dedup):
                    reslist_dedup.append(resdict)
            database[executable] = reslist_dedup
        return database


1			from logging import Logger
2			from pathlib import Path
3			from os.path import join
4			from os import environ, listdir, getcwd, unlink
5			from shutil import copytree, rmtree, copy
6			from fnmatch import filter as apply_glob
7			from datetime import datetime
8			from tarfile import open as open_tarfile
9			from urllib.parse import urlparse, unquote
10			from zipfile import ZipFile
11
12			import requests
13			from gdown.parse_url import parse_url as gparse_url
14			from gdown.download import get_url_from_gdrive_confirmation
15			from yaml import safe_load, safe_dump
16
17			# pylint: disable=wrong-import-position
18
19			# https://github.com/OCR-D/core/issues/867
20			# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
21			import yaml.constructor
22			yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:timestamp'] = \
23			yaml.constructor.SafeConstructor.yaml_constructors['tag:yaml.org,2002:str']
24
25			# pylint: enable=wrong-import-position
26
27			# pylint: enable=wrong-import-position
28
29			# pylint: enable=wrong-import-position
30
31			from ocrd_validators import OcrdResourceListValidator
32			from ocrd_utils import getLogger, directory_size, get_moduledir, guess_media_type, config
33			from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
34			from .constants import RESOURCE_LIST_FILENAME, RESOURCE_USER_LIST_COMMENT
35
36
37			class OcrdResourceManager:
38
39			"""
40			Managing processor resources
41			"""
42			def __init__(self, userdir=None, xdg_config_home=None, xdg_data_home=None, skip_init=False):
43			self.log = getLogger('ocrd.resource_manager')
44			self.database = {}
45
46			self._xdg_data_home = xdg_data_home
47			self._xdg_config_home = xdg_config_home
48			self._userdir = userdir
49			self.user_list = Path(self.xdg_config_home, 'ocrd', 'resources.yml')
50
51			if not skip_init:
52			self.load_resource_list(Path(RESOURCE_LIST_FILENAME))
53			if not self.user_list.exists():
54			if not self.user_list.parent.exists():
55			self.user_list.parent.mkdir(parents=True)
56			self.save_user_list()
57			self.load_resource_list(self.user_list)
58
59			@property
60			def userdir(self):
61			if not self._userdir:
62			self._userdir = config.HOME
63			return self._userdir
64
65			@property
66			def xdg_data_home(self):
67			if not self._xdg_data_home:
68			self._xdg_data_home = config.XDG_DATA_HOME
69			return self._xdg_data_home
70
71			@property
72			def xdg_config_home(self):
73			if self._xdg_config_home:
74			return self._xdg_config_home
75			return config.XDG_CONFIG_HOME
76
77			def save_user_list(self, database=None):
78			if not database:
79			database = self.database
80			with open(self.user_list, 'w', encoding='utf-8') as f:
81			f.write(RESOURCE_USER_LIST_COMMENT)
82			f.write('\n')
83			f.write(safe_dump(database))
84
85			def load_resource_list(self, list_filename, database=None):
86			if not database:
87			database = self.database
88			if list_filename.is_file():
89			with open(list_filename, 'r', encoding='utf-8') as f:
90			list_loaded = safe_load(f) or {}
91			report = OcrdResourceListValidator.validate(list_loaded)
92			if not report.is_valid:
93			self.log.error('\n'.join(report.errors))
94			raise ValueError(f"Resource list {list_filename} is invalid!")
95			for executable, resource_list in list_loaded.items():
96			if executable not in database:
97			database[executable] = []
98			# Prepend, so user provided is sorted before builtin
99			database[executable] = list_loaded[executable] + database[executable]
100			return database
101
102			def list_available(self, executable=None, dynamic=True, name=None, database=None, url=None):
103			"""
104			List models available for download by processor
105			"""
106			if not database:
107			database = self.database
108			if not executable:
109			return database.items()
110			if dynamic:
111			skip_executables = ["ocrd-cis-data", "ocrd-import", "ocrd-make"]
112			for exec_dir in environ['PATH'].split(':'):
113			for exec_path in Path(exec_dir).glob(f'{executable}'):
114			if not exec_path.name.startswith('ocrd-'):
115			self.log.warning(f"OCR-D processor executable '{exec_path}' has no 'ocrd-' prefix")
116			if exec_path.name in skip_executables:
117			self.log.debug(f"Not an OCR-D processor CLI, skipping '{exec_path}'")
118			continue
119			self.log.debug(f"Inspecting '{exec_path} --dump-json' for resources")
120			ocrd_tool = get_ocrd_tool_json(exec_path)
121			for resdict in ocrd_tool.get('resources', ()):
122			if exec_path.name not in database:
123			database[exec_path.name] = []
124			database[exec_path.name].insert(0, resdict)
125			database = self._dedup_database(database)
126			found = False
127			ret = []
128			for k in database:
129			if apply_glob([k], executable):
130			found = True
131			restuple = (k, [])
132			ret.append(restuple)
133			for resdict in database[k]:
134			if name and resdict['name'] != name:
135			continue
136			if url and resdict['url'] != url:
137			continue
138			restuple[1].append(resdict)
139			if not found:
140			ret = [(executable, [])]
141			return ret
142
143			def list_installed(self, executable=None):
144			"""
145			List installed resources, matching with registry by ``name``
146			"""
147			ret = []
148			if executable:
149			all_executables = [executable]
150			else:
151			# resources we know about
152			all_executables = list(self.database.keys())
153			# resources in the file system
154			parent_dirs = [join(x, 'ocrd-resources') for x in [self.xdg_data_home, '/usr/local/share']]
155			for parent_dir in parent_dirs:
156			if Path(parent_dir).exists():
157			all_executables += [x for x in listdir(parent_dir) if x.startswith('ocrd-')]
158			for this_executable in set(all_executables):
159			reslist = []
160			mimetypes = get_processor_resource_types(this_executable)
161			moduledir = get_moduledir(this_executable)
162			for res_filename in list_all_resources(this_executable, moduled=moduledir, xdg_data_home=self.xdg_data_home):
163			res_filename = Path(res_filename)
164			if not '*/*' in mimetypes:
165			if res_filename.is_dir() and not 'text/directory' in mimetypes:
166			continue
167			if res_filename.is_file() and ['text/directory'] == mimetypes:
168			continue
169			res_name = res_filename.name
170			res_type = 'file' if res_filename.is_file() else 'directory'
171			res_size = res_filename.stat().st_size if res_filename.is_file() else directory_size(res_filename)
172			resdict_list = [x for x in self.database.get(this_executable, []) if x['name'] == res_name]
173			if resdict_list:
174			resdict = resdict_list[0]
175			elif str(res_filename.parent) == moduledir:
176			resdict = {
177			'name': res_name,
178			'url': str(res_filename),
179			'description': 'Found at module',
180			'type': res_type,
181			'size': res_size
182			}
183			else:
184			resdict = self.add_to_user_database(this_executable, res_filename, resource_type=res_type)
185			resdict['path'] = str(res_filename)
186			reslist.append(resdict)
187			ret.append((this_executable, reslist))
188			return ret
189
190			def add_to_user_database(self, executable, res_filename, url=None, resource_type='file'):
191			"""
192			Add a stub entry to the user resource.yml
193			"""
194			res_name = Path(res_filename).name
195			self.log.info(f"{executable} resource '{res_name}' ({str(res_filename)}) not a known resource, "
196			f"creating stub in {self.user_list}'")
197			if Path(res_filename).is_dir():
198			res_size = directory_size(res_filename)
199			else:
200			res_size = Path(res_filename).stat().st_size
201			with open(self.user_list, 'r', encoding='utf-8') as f:
202			user_database = safe_load(f) or {}
203			if executable not in user_database:
204			user_database[executable] = []
205			resources_found = self.list_available(executable=executable, name=res_name, database=user_database)[0][1]
206			if not resources_found:
207			resdict = {
208			'name': res_name,
209			'url': url if url else '???',
210			'description': f'Found at {self.resource_dir_to_location(res_filename)} on {datetime.now()}',
211			'version_range': '???',
212			'type': resource_type,
213			'size': res_size
214			}
215			user_database[executable].append(resdict)
216			else:
217			resdict = resources_found[0]
218			self.save_user_list(user_database)
219			self.load_resource_list(self.user_list)
220			return resdict
221
222			@property
223			def default_resource_dir(self):
224			return self.location_to_resource_dir('data')
225
226			def location_to_resource_dir(self, location):
227			if location == 'data':
228			return join(self.xdg_data_home, 'ocrd-resources')
229			if location == 'system':
230			return '/usr/local/share/ocrd-resources'
231			return getcwd()
232
233			def resource_dir_to_location(self, resource_path):
234			resource_path = str(resource_path)
235			if resource_path.startswith('/usr/local/share/ocrd-resources'):
236			return 'system'
237			if resource_path.startswith(join(self.xdg_data_home, 'ocrd-resources')):
238			return 'data'
239			if resource_path.startswith(getcwd()):
240			return 'cwd'
241			return resource_path
242
243			@staticmethod
244			def parameter_usage(name, usage='as-is'):
245			if usage == 'as-is':
246			return name
247			elif usage == 'without-extension':
248			return Path(name).stem
249			raise ValueError(f"No such usage '{usage}'")
250
251			@staticmethod
252			def _download_impl(log: Logger, url, filename, progress_cb=None, size=None):
253			log.info(f"Downloading {url} to {filename}")
254			try:
255			gdrive_file_id, is_gdrive_download_link = gparse_url(url, warning=False)
256			if gdrive_file_id:
257			if not is_gdrive_download_link:
258			url = f"https://drive.google.com/uc?id={gdrive_file_id}"
259			try:
260			with requests.get(url, stream=True) as r:
261			if "Content-Disposition" not in r.headers:
262			url = get_url_from_gdrive_confirmation(r.text)
263			except RuntimeError as e:
264			log.warning(f"Cannot unwrap Google Drive URL: {e}")
265			with open(filename, 'wb') as f:
266			with requests.get(url, stream=True) as r:
267			r.raise_for_status()
268			for data in r.iter_content(chunk_size=4096):
269			if progress_cb:
270			progress_cb(len(data))
271			f.write(data)
272			except Exception as e:
273			rmtree(filename, ignore_errors=True)
274			Path(filename).unlink(missing_ok=True)
275			raise e
276
277			@staticmethod
278			def _copy_file(log: Logger, src, dst, progress_cb=None):
279			log.info(f"Copying file {src} to {dst}")
280			with open(dst, 'wb') as f_out, open(src, 'rb') as f_in:
281			while True:
282			chunk = f_in.read(4096)
283			if chunk:
284			f_out.write(chunk)
285			if progress_cb:
286			progress_cb(len(chunk))
287			else:
288			break
289
290			@staticmethod
291			def _copy_dir(log: Logger, src, dst, progress_cb=None):
292			log.info(f"Copying dir recursively from {src} to {dst}")
293			if not Path(src).is_dir():
294			raise ValueError(f"The source is not a directory: {src}")
295			Path(dst).mkdir(parents=True, exist_ok=True)
296			for child in Path(src).rglob('*'):
297			child_dst = Path(dst) / child.relative_to(src)
298			if Path(child).is_dir():
299			OcrdResourceManager._copy_dir(child, child_dst, progress_cb)
300			else:
301			OcrdResourceManager._copy_file(child, child_dst, progress_cb)
302
303			@staticmethod
304			def _copy_impl(log: Logger, src_filename, filename, progress_cb=None):
305			log.info(f"Copying {src_filename} to {filename}")
306			if Path(src_filename).is_dir():
307			OcrdResourceManager._copy_dir(log, src_filename, filename, progress_cb)
308			else:
309			OcrdResourceManager._copy_file(log, src_filename, filename, progress_cb)
310
311			def _download_archive(self, log: Logger, url: str, path_in_archive: str, fpath: Path, progress_cb=None):
312			archive_fname = 'download.tar.xx'
313			with pushd_popd(tempdir=True) as tempdir:
314			if url.startswith('https://') or url.startswith('http://'):
315			self._download_impl(log, url, archive_fname, progress_cb)
316			else:
317			self._copy_impl(log, url, archive_fname, progress_cb)
318			Path('out').mkdir()
319			with pushd_popd('out'):
320			mimetype = guess_media_type(f'../{archive_fname}', fallback='application/octet-stream')
321			log.info(f"Extracting {mimetype} archive to {tempdir}/out")
322			if mimetype == 'application/zip':
323			with ZipFile(f'../{archive_fname}', 'r') as zipf:
324			zipf.extractall()
325			elif mimetype in ('application/gzip', 'application/x-xz'):
326			with open_tarfile(f'../{archive_fname}', 'r:*') as tar:
327			tar.extractall()
328			else:
329			raise RuntimeError(f"Unable to handle extraction of {mimetype} archive {url}")
330			log.info(f"Copying '{path_in_archive}' from archive to {fpath}")
331			if Path(path_in_archive).is_dir():
332			copytree(path_in_archive, str(fpath))
333			else:
334			copy(path_in_archive, str(fpath))
335
336			# TODO Proper caching (make head request for size, If-Modified etc)
337			def download_resource(
338			self, executable, url, basedir, overwrite=False, no_subdir=False, name=None, resource_type='file',
339			path_in_archive='.', progress_cb=None,
340			):
341			"""
342			Download a resource by URL
343			"""
344			log = getLogger('ocrd.resource_manager.download')
345			destdir = Path(basedir) if no_subdir else Path(basedir, executable)
346			if not name:
347			url_parsed = urlparse(url)
348			name = Path(unquote(url_parsed.path)).name
349			fpath = Path(destdir, name)
350			if fpath.exists():
351			if not overwrite:
352			fpath_type = 'Directory' if fpath.is_dir() else 'File'
353			log.warning(f"{fpath_type} {fpath} already exists but --overwrite is not set, skipping the download")
354			# raise FileExistsError(f"{fpath_type} {fpath} already exists but --overwrite is not set")
355			return fpath
356			if fpath.is_dir():
357			log.info(f"Removing existing target directory {fpath}")
358			rmtree(str(fpath))
359			else:
360			log.info(f"Removing existing target file {fpath}")
361			unlink(str(fpath))
362			destdir.mkdir(parents=True, exist_ok=True)
363			if resource_type in ('file', 'directory'):
364			if url.startswith('https://') or url.startswith('http://'):
365			self._download_impl(log, url, fpath, progress_cb)
366			else:
367			self._copy_impl(log, url, fpath, progress_cb)
368			elif resource_type == 'archive':
369			self._download_archive(log, url, path_in_archive, fpath, progress_cb)
370			return fpath
371
372			def _dedup_database(self, database=None, dedup_key='name'):
373			"""
374			Deduplicate resources by name
375			"""
376			if not database:
377			database = self.database
378			for executable, reslist in database.items():
379			reslist_dedup = []
380			for resdict in reslist:
381			if not any(r[dedup_key] == resdict[dedup_key] for r in reslist_dedup):
382			reslist_dedup.append(resdict)
383			database[executable] = reslist_dedup
384			return database
385

OCR-D / core

Pull Request — master (#1309)

OcrdResourceManager._copy_file() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like