| 1 |  |  | from tempfile import mkdtemp | 
            
                                                        
            
                                    
            
            
                | 2 |  |  | from pathlib import Path | 
            
                                                        
            
                                    
            
            
                | 3 |  |  | from warnings import warn | 
            
                                                        
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 5 |  |  | import requests | 
            
                                                        
            
                                    
            
            
                | 6 |  |  | from requests.adapters import HTTPAdapter, Retry | 
            
                                                        
            
                                    
            
            
                | 7 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 8 |  |  | from ocrd.constants import TMP_PREFIX | 
            
                                                        
            
                                    
            
            
                | 9 |  |  | from ocrd_utils import ( | 
            
                                                        
            
                                    
            
            
                | 10 |  |  |     config, | 
            
                                                        
            
                                    
            
            
                | 11 |  |  |     DEFAULT_METS_BASENAME, | 
            
                                                        
            
                                    
            
            
                | 12 |  |  |     getLogger, | 
            
                                                        
            
                                    
            
            
                | 13 |  |  |     is_local_filename, | 
            
                                                        
            
                                    
            
            
                | 14 |  |  |     get_local_filename, | 
            
                                                        
            
                                    
            
            
                | 15 |  |  |     remove_non_path_from_url, | 
            
                                                        
            
                                    
            
            
                | 16 |  |  |     is_file_in_directory, | 
            
                                                        
            
                                    
            
            
                | 17 |  |  |     nth_url_segment | 
            
                                                        
            
                                    
            
            
                | 18 |  |  | ) | 
            
                                                        
            
                                    
            
            
                | 19 |  |  | from ocrd.workspace import Workspace | 
            
                                                        
            
                                    
            
            
                | 20 |  |  | from ocrd_models import OcrdMets | 
            
                                                        
            
                                    
            
            
                | 21 |  |  | from ocrd_models.utils import handle_oai_response | 
            
                                                        
            
                                    
            
            
                | 22 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 23 |  |  | class Resolver(): | 
            
                                                        
            
                                    
            
            
                | 24 |  |  |     """ | 
            
                                                        
            
                                    
            
            
                | 25 |  |  |     Handle uploads, downloads, repository access, and manage temporary directories | 
            
                                                        
            
                                    
            
            
                | 26 |  |  |     """ | 
            
                                                        
            
                                    
            
            
                | 27 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 28 |  |  |     def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None): | 
            
                                                        
            
                                    
            
            
                | 29 |  |  |         """ | 
            
                                                        
            
                                    
            
            
                | 30 |  |  |         Download a URL ``url`` to a local file in ``directory``. | 
            
                                                        
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 32 |  |  |         If ``url`` looks like a file path, check whether that exists. | 
            
                                                        
            
                                    
            
            
                | 33 |  |  |         If it does exist and is within ``directory`` already, return early. | 
            
                                                        
            
                                    
            
            
                | 34 |  |  |         If it does exist but is outside of ``directory``, copy it. | 
            
                                                        
            
                                    
            
            
                | 35 |  |  |         If ``url`` does not appear to be a file path, try downloading via HTTP, retrying ``retries`` times with timeout ``timeout`` between calls. | 
            
                                                        
            
                                    
            
            
                | 36 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 37 |  |  |         If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``. | 
            
                                                        
            
                                    
            
            
                | 38 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 39 |  |  |         If the target file already exists within ``directory``, behavior depends on ``if_exists``: | 
            
                                                        
            
                                    
            
            
                | 40 |  |  |             - ``skip`` (default): do nothing and return early. Note that this does not check whether the existing file's content matches ``url``. | 
            
                                                        
            
                                    
            
            
                | 41 |  |  |             - ``overwrite``: overwrite the existing file | 
            
                                                        
            
                                    
            
            
                | 42 |  |  |             - ``raise``: raise a ``FileExistsError`` | 
            
                                                        
            
                                    
            
            
                | 43 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 44 |  |  |         Args: | 
            
                                                        
            
                                    
            
            
                | 45 |  |  |             directory (string): Directory to download files to | 
            
                                                        
            
                                    
            
            
                | 46 |  |  |             url (string): URL to download from | 
            
                                                        
            
                                    
            
            
                | 47 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 48 |  |  |         Keyword Args: | 
            
                                                        
            
                                    
            
            
                | 49 |  |  |             basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset. | 
            
                                                        
            
                                    
            
            
                | 50 |  |  |             if_exists (string, "skip"): What to do if target file already exists. | 
            
                                                        
            
                                    
            
            
                | 51 |  |  |                 One of ``skip`` (default), ``overwrite`` or ``raise`` | 
            
                                                        
            
                                    
            
            
                | 52 |  |  |             subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``. | 
            
                                                        
            
                                    
            
            
                | 53 |  |  |             retries (int, None): Number of retries to attempt on network failure. | 
            
                                                        
            
                                    
            
            
                | 54 |  |  |             timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data. | 
            
                                                        
            
                                    
            
            
                | 55 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 56 |  |  |         Returns: | 
            
                                                        
            
                                    
            
            
                | 57 |  |  |             Local filename string, *relative* to directory | 
            
                                                        
            
                                    
            
            
                | 58 |  |  |         """ | 
            
                                                        
            
                                    
            
            
                | 59 |  |  |         log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name | 
            
                                                        
            
                                    
            
            
                | 60 |  |  |         log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir) | 
            
                                                        
            
                                    
            
            
                | 61 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 62 |  |  |         if not url: | 
            
                                                        
            
                                    
            
            
                | 63 |  |  |             raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok | 
            
                                                        
            
                                    
            
            
                | 64 |  |  |         if not directory: | 
            
                                                        
            
                                    
            
            
                | 65 |  |  |             raise ValueError(f"'directory' must be a non-empty string, not '{url}'")  # actually Path would also work | 
            
                                                        
            
                                    
            
            
                | 66 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 67 |  |  |         url = str(url) | 
            
                                                        
            
                                    
            
            
                | 68 |  |  |         directory = Path(directory) | 
            
                                                        
            
                                    
            
            
                | 69 |  |  |         directory.mkdir(parents=True, exist_ok=True) | 
            
                                                        
            
                                    
            
            
                | 70 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 71 |  |  |         subdir_path = Path(subdir if subdir else '') | 
            
                                                        
            
                                    
            
            
                | 72 |  |  |         basename_path = Path(basename if basename else nth_url_segment(url)) | 
            
                                                        
            
                                    
            
            
                | 73 |  |  |         ret = Path(subdir_path, basename_path) | 
            
                                                        
            
                                    
            
            
                | 74 |  |  |         dst_path = Path(directory, ret) | 
            
                                                        
            
                                    
            
            
                | 75 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 76 |  |  |         # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url) | 
            
                                                        
            
                                    
            
            
                | 77 |  |  |         # print(f'>>> url={url}') | 
            
                                                        
            
                                    
            
            
                | 78 |  |  |         # print(f'>>> directory={directory}') | 
            
                                                        
            
                                    
            
            
                | 79 |  |  |         # print(f'>>> subdir_path={subdir_path}') | 
            
                                                        
            
                                    
            
            
                | 80 |  |  |         # print(f'>>> basename_path={basename_path}') | 
            
                                                        
            
                                    
            
            
                | 81 |  |  |         # print(f'>>> dst_path={dst_path}') | 
            
                                                        
            
                                    
            
            
                | 82 |  |  |         # print(f'>>> ret={ret}') | 
            
                                                        
            
                                    
            
            
                | 83 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 84 |  |  |         src_path = None | 
            
                                                        
            
                                    
            
            
                | 85 |  |  |         if is_local_filename(url): | 
            
                                                        
            
                                    
            
            
                | 86 |  |  |             try: | 
            
                                                        
            
                                    
            
            
                | 87 |  |  |                 src_path = Path(get_local_filename(url)).resolve() | 
            
                                                        
            
                                    
            
            
                | 88 |  |  |             except FileNotFoundError as e: | 
            
                                                        
            
                                    
            
            
                | 89 |  |  |                 log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, src_path)) | 
            
                                                        
            
                                    
            
            
                | 90 |  |  |                 raise e | 
            
                                                        
            
                                    
            
            
                | 91 |  |  |             if not src_path.exists(): | 
            
                                                        
            
                                    
            
            
                | 92 |  |  |                 raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}") | 
            
                                                        
            
                                    
            
            
                | 93 |  |  |             if src_path == dst_path: | 
            
                                                        
            
                                    
            
            
                | 94 |  |  |                 log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) | 
            
                                                        
            
                                    
            
            
                | 95 |  |  |                 return str(ret) | 
            
                                                        
            
                                    
            
            
                | 96 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 97 |  |  |         # Respect 'if_exists' kwarg | 
            
                                                        
            
                                    
            
            
                | 98 |  |  |         if dst_path.exists(): | 
            
                                                        
            
                                    
            
            
                | 99 |  |  |             if if_exists == 'skip': | 
            
                                                        
            
                                    
            
            
                | 100 |  |  |                 log.debug(f"File already exists but if_exists == {if_exists}, skipping.") | 
            
                                                        
            
                                    
            
            
                | 101 |  |  |                 return str(ret) | 
            
                                                        
            
                                    
            
            
                | 102 |  |  |             elif if_exists == 'raise': | 
            
                                                        
            
                                    
            
            
                | 103 |  |  |                 raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}") | 
            
                                                        
            
                                    
            
            
                | 104 |  |  |             else: | 
            
                                                        
            
                                    
            
            
                | 105 |  |  |                 log.debug(f"File already exists but if_exists == {if_exists}, overwriting.") | 
            
                                                        
            
                                    
            
            
                | 106 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 107 |  |  |         # Create dst_path parent dir | 
            
                                                        
            
                                    
            
            
                | 108 |  |  |         dst_path.parent.mkdir(parents=True, exist_ok=True) | 
            
                                                        
            
                                    
            
            
                | 109 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 110 |  |  |         # Copy files or download remote assets | 
            
                                                        
            
                                    
            
            
                | 111 |  |  |         if src_path: | 
            
                                                        
            
                                    
            
            
                | 112 |  |  |             # src_path set, so it is a file source, we can copy directly | 
            
                                                        
            
                                    
            
            
                | 113 |  |  |             log.debug("Copying file '%s' to '%s'", src_path, dst_path) | 
            
                                                        
            
                                    
            
            
                | 114 |  |  |             dst_path.write_bytes(src_path.read_bytes()) | 
            
                                                        
            
                                    
            
            
                | 115 |  |  |         else: | 
            
                                                        
            
                                    
            
            
                | 116 |  |  |             # src_path not set, it's an http URL, try to download | 
            
                                                        
            
                                    
            
            
                | 117 |  |  |             log.debug("Downloading URL '%s' to '%s'", url, dst_path) | 
            
                                                        
            
                                    
            
            
                | 118 |  |  |             if not retries and config.is_set('OCRD_DOWNLOAD_RETRIES'): | 
            
                                                        
            
                                    
            
            
                | 119 |  |  |                 retries = config.OCRD_DOWNLOAD_RETRIES | 
            
                                                        
            
                                    
            
            
                | 120 |  |  |             if timeout is None and config.is_set('OCRD_DOWNLOAD_TIMEOUT'): | 
            
                                                        
            
                                    
            
            
                | 121 |  |  |                 timeout = config.OCRD_DOWNLOAD_TIMEOUT | 
            
                                                        
            
                                    
            
            
                | 122 |  |  |             session = requests.Session() | 
            
                                                        
            
                                    
            
            
                | 123 |  |  |             retries = Retry(total=retries or 0, | 
            
                                                        
            
                                    
            
            
                | 124 |  |  |                             status_forcelist=[ | 
            
                                                        
            
                                    
            
            
                | 125 |  |  |                                 # probably too wide (only transient failures): | 
            
                                                        
            
                                    
            
            
                | 126 |  |  |                                 408, # Request Timeout | 
            
                                                        
            
                                    
            
            
                | 127 |  |  |                                 409, # Conflict | 
            
                                                        
            
                                    
            
            
                | 128 |  |  |                                 412, # Precondition Failed | 
            
                                                        
            
                                    
            
            
                | 129 |  |  |                                 417, # Expectation Failed | 
            
                                                        
            
                                    
            
            
                | 130 |  |  |                                 423, # Locked | 
            
                                                        
            
                                    
            
            
                | 131 |  |  |                                 424, # Fail | 
            
                                                        
            
                                    
            
            
                | 132 |  |  |                                 425, # Too Early | 
            
                                                        
            
                                    
            
            
                | 133 |  |  |                                 426, # Upgrade Required | 
            
                                                        
            
                                    
            
            
                | 134 |  |  |                                 428, # Precondition Required | 
            
                                                        
            
                                    
            
            
                | 135 |  |  |                                 429, # Too Many Requests | 
            
                                                        
            
                                    
            
            
                | 136 |  |  |                                 440, # Login Timeout | 
            
                                                        
            
                                    
            
            
                | 137 |  |  |                                 500, # Internal Server Error | 
            
                                                        
            
                                    
            
            
                | 138 |  |  |                                 503, # Service Unavailable | 
            
                                                        
            
                                    
            
            
                | 139 |  |  |                                 504, # Gateway Timeout | 
            
                                                        
            
                                    
            
            
                | 140 |  |  |                                 509, # Bandwidth Limit Exceeded | 
            
                                                        
            
                                    
            
            
                | 141 |  |  |                                 529, # Site Overloaded | 
            
                                                        
            
                                    
            
            
                | 142 |  |  |                                 598, # Proxy Read Timeout | 
            
                                                        
            
                                    
            
            
                | 143 |  |  |                                 599, # Proxy Connect Timeout | 
            
                                                        
            
                                    
            
            
                | 144 |  |  |                     ]) | 
            
                                                        
            
                                    
            
            
                | 145 |  |  |             adapter = HTTPAdapter(max_retries=retries) | 
            
                                                        
            
                                    
            
            
                | 146 |  |  |             session.mount('http://', adapter) | 
            
                                                        
            
                                    
            
            
                | 147 |  |  |             session.mount('https://', adapter) | 
            
                                                        
            
                                    
            
            
                | 148 |  |  |             response = session.get(url, timeout=timeout) | 
            
                                                        
            
                                    
            
            
                | 149 |  |  |             response.raise_for_status() | 
            
                                                        
            
                                    
            
            
                | 150 |  |  |             contents = handle_oai_response(response) | 
            
                                                        
            
                                    
            
            
                | 151 |  |  |             dst_path.write_bytes(contents) | 
            
                                                        
            
                                    
            
            
                | 152 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 153 |  |  |         return str(ret) | 
            
                                                        
            
                                    
            
            
                | 154 |  |  |  | 
            
                                                        
            
                                    
            
            
                def workspace_from_url(
                    self,
                    mets_url,
                    dst_dir=None,
                    clobber_mets=False,
                    mets_basename=None,
                    download=False,
                    src_baseurl=None,
                    mets_server_url=None,
                    **kwargs
                ):
                    """
                    Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given).

                    Arguments:
                        mets_url (string): Source METS URL or filesystem path
                    Keyword Arguments:
                        dst_dir (string, None): Target directory for the workspace.
                            By default, the parent directory of :py:attr:`mets_url` if that is a local
                            filename, otherwise a fresh temporary directory created with the prefix
                            :py:data:`ocrd.constants.TMP_PREFIX`.
                            (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
                        clobber_mets (boolean, False): Whether to overwrite existing ``mets.xml``.
                            By default existing ``mets.xml`` will raise an exception.
                        mets_basename (string, None): File name of the METS inside the workspace.
                            By default the last URL segment of :py:attr:`mets_url`.
                        download (boolean, False): Whether to also download all the files referenced by the METS
                        src_baseurl (string, None): Base URL for resolving relative file locations.
                            By default derived from :py:attr:`mets_url` by removing its last URL segment.
                        mets_server_url (string, None): URI of TCP or local path of UDS for METS server handling
                            the `OcrdMets` of the workspace. By default the METS will be read from and written to
                            the filesystem directly.
                        **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True

                    Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless
                    the former is already local and the latter is ``None`` or already identical to its directory name.

                    Returns:
                        a new :py:class:`~ocrd.workspace.Workspace`

                    Raises:
                        ValueError: if :py:attr:`mets_url` is ``None``
                    """
                    # Reject the missing argument before doing any other work.
                    if mets_url is None:
                        raise ValueError("Must pass 'mets_url' workspace_from_url")

                    log = getLogger('ocrd.resolver.workspace_from_url')

                    # if mets_url is a relative filename, make it absolute
                    if is_local_filename(mets_url) and not Path(mets_url).is_absolute():
                        mets_url = str(Path.cwd() / mets_url)

                    # if mets_basename is not given, use the last URL segment of the mets_url
                    if mets_basename is None:
                        mets_basename = nth_url_segment(mets_url, -1)

                    # If src_baseurl wasn't given, determine from mets_url by removing last url segment
                    if not src_baseurl:
                        last_segment = nth_url_segment(mets_url)
                        url_path = remove_non_path_from_url(mets_url)
                        # Guard the slice: ``s[:-0]`` is ``''``, so an empty segment must not be "removed".
                        if last_segment:
                            url_path = url_path[:-len(last_segment)]
                        src_baseurl = remove_non_path_from_url(url_path)

                    # resolve dst_dir
                    if not dst_dir:
                        if is_local_filename(mets_url):
                            dst_dir = Path(mets_url).parent
                            log.debug("Deriving dst_dir %s from %s", dst_dir, mets_url)
                        else:
                            # Create the directory first so the log message shows the real path.
                            dst_dir = mkdtemp(prefix=TMP_PREFIX)
                            log.debug("Creating ephemeral workspace '%s' for METS @ <%s>", dst_dir, mets_url)
                    # XXX Path.resolve is always strict in Python <= 3.5, so create dst_dir unless it exists consistently
                    dst_path = Path(dst_dir)
                    if not dst_path.exists():
                        dst_path.mkdir(parents=True, exist_ok=False)
                    dst_dir = str(dst_path.resolve())

                    log.debug("workspace_from_url\nmets_basename='%s'\nmets_url='%s'\nsrc_baseurl='%s'\ndst_dir='%s'",
                        mets_basename, mets_url, src_baseurl, dst_dir)
                    self.download_to_directory(dst_dir, mets_url, basename=mets_basename, if_exists='overwrite' if clobber_mets else 'raise')

                    workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url)

                    if download:
                        for f in workspace.mets.find_files(**kwargs):
                            workspace.download_file(f)

                    return workspace
            
                                                        
            
                                    
            
            
                | 232 |  |  |  | 
            
                                                        
            
                                    
            
            
                def workspace_from_nothing(self, directory, mets_basename=DEFAULT_METS_BASENAME, clobber_mets=False):
                    """
                    Set up a new workspace that holds nothing but an empty METS file.

                    Arguments:
                        directory (string): Target directory for the workspace.
                            If ``None``, a temporary directory is created with the prefix
                            :py:data:`ocrd.constants.TMP_PREFIX`.
                            (The resulting path can be retrieved via :py:attr:`ocrd.Workspace.directory`.)
                    Keyword Arguments:
                        mets_basename (string): File name of the METS inside ``directory``.
                        clobber_mets (boolean, False): Whether to overwrite an existing METS file.
                            By default an existing METS file raises an exception.

                    Returns:
                        a new :py:class:`~ocrd.workspace.Workspace`

                    Raises:
                        FileExistsError: if the METS file already exists and ``clobber_mets`` is not set
                    """
                    logger = getLogger('ocrd.resolver.workspace_from_nothing')

                    # Fall back to a throw-away directory when the caller did not name one.
                    workspace_dir = mkdtemp(prefix=TMP_PREFIX) if directory is None else directory
                    Path(workspace_dir).mkdir(parents=True, exist_ok=True)

                    mets_file = Path(workspace_dir) / mets_basename
                    if not clobber_mets and mets_file.exists():
                        raise FileExistsError("METS '%s' already exists in '%s' and clobber_mets not set." % (mets_basename, workspace_dir))

                    # Serialize a blank METS to disk and hand the same object to the workspace.
                    empty_mets = OcrdMets.empty_mets()
                    logger.info("Writing METS to %s", mets_file)
                    mets_file.write_bytes(empty_mets.to_xml(xmllint=True))

                    return Workspace(self, workspace_dir, empty_mets, mets_basename=mets_basename)
            
                                                        
            
                                    
            
            
                | 260 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 261 |  |  |     def resolve_mets_arguments(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, mets_server_url=None): | 
            
                                                        
            
                                    
            
            
                | 262 |  |  |         """ | 
            
                                                        
            
                                    
            
            
                | 263 |  |  |         Resolve the ``--mets``, ``--mets-basename``, `--directory``, | 
            
                                                        
            
                                    
            
            
                | 264 |  |  |         ``--mets-server-url``, arguments into a coherent set of arguments | 
            
                                                        
            
                                    
            
            
                | 265 |  |  |         according to https://github.com/OCR-D/core/issues/517 | 
            
                                                        
            
                                    
            
            
                | 266 |  |  |         """ | 
            
                                                        
            
                                    
            
            
                | 267 |  |  |         log = getLogger('ocrd.resolver.resolve_mets_arguments') | 
            
                                                        
            
                                    
            
            
                | 268 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 269 |  |  |         mets_is_remote = mets_url and (mets_url.startswith('http://') or mets_url.startswith('https://')) | 
            
                                                        
            
                                    
            
            
                | 270 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 271 |  |  |         # XXX we might want to be more strict like this but it might break # legacy code | 
            
                                                        
            
                                    
            
            
                | 272 |  |  |         # Allow --mets and --directory together iff --mets is a remote URL | 
            
                                                        
            
                                    
            
            
                | 273 |  |  |         # if directory and mets_url and not mets_is_remote: | 
            
                                                        
            
                                    
            
            
                | 274 |  |  |         #     raise ValueError("Use either --mets or --directory, not both") | 
            
                                                        
            
                                    
            
            
                | 275 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 276 |  |  |         # If --mets is a URL, a directory must be explicitly provided (not strictly necessary, but retained for legacy behavior) | 
            
                                                        
            
                                    
            
            
                | 277 |  |  |         if not directory and mets_is_remote: | 
            
                                                        
            
                                    
            
            
                | 278 |  |  |             raise ValueError("--mets is an http(s) URL but no --directory was given") | 
            
                                                        
            
                                    
            
            
                | 279 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 280 |  |  |         # Determine --mets-basename | 
            
                                                        
            
                                    
            
            
                | 281 |  |  |         if not mets_basename and mets_url: | 
            
                                                        
            
                                    
            
            
                | 282 |  |  |             mets_basename = Path(mets_url).name | 
            
                                                        
            
                                    
            
            
                | 283 |  |  |         elif not mets_basename and not mets_url: | 
            
                                                        
            
                                    
            
            
                | 284 |  |  |             mets_basename = DEFAULT_METS_BASENAME | 
            
                                                        
            
                                    
            
            
                | 285 |  |  |         elif mets_basename and mets_url: | 
            
                                                        
            
                                    
            
            
                | 286 |  |  |             raise ValueError("Use either --mets or --mets-basename, not both") | 
            
                                                        
            
                                    
            
            
                | 287 |  |  |         else: | 
            
                                                        
            
                                    
            
            
                | 288 |  |  |             warn("--mets-basename is deprecated. Use --mets/--directory instead", DeprecationWarning) | 
            
                                                        
            
                                    
            
            
                | 289 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 290 |  |  |         # Determine --directory and --mets-url | 
            
                                                        
            
                                    
            
            
                | 291 |  |  |         if not directory and not mets_url: | 
            
                                                        
            
                                    
            
            
                | 292 |  |  |             directory = Path.cwd() | 
            
                                                        
            
                                    
            
            
                | 293 |  |  |             mets_url = Path(directory, mets_basename) | 
            
                                                        
            
                                    
            
            
                | 294 |  |  |         elif directory and not mets_url: | 
            
                                                        
            
                                    
            
            
                | 295 |  |  |             directory = Path(directory).resolve() | 
            
                                                        
            
                                    
            
            
                | 296 |  |  |             mets_url = directory / mets_basename | 
            
                                                        
            
                                    
            
            
                | 297 |  |  |         elif not directory and mets_url: | 
            
                                                        
            
                                    
            
            
                | 298 |  |  |             mets_url = Path(mets_url).resolve() | 
            
                                                        
            
                                    
            
            
                | 299 |  |  |             directory = mets_url.parent | 
            
                                                        
            
                                    
            
            
                | 300 |  |  |         else: # == directory and mets_url: | 
            
                                                        
            
                                    
            
            
                | 301 |  |  |             directory = Path(directory).resolve() | 
            
                                                        
            
                                    
            
            
                | 302 |  |  |             if not mets_is_remote: | 
            
                                                        
            
                                    
            
            
                | 303 |  |  |                 # --mets is just a basename and --directory is set, so treat --mets as --mets-basename | 
            
                                                        
            
                                    
            
            
                | 304 |  |  |                 if Path(mets_url).parent == Path('.'): | 
            
                                                        
            
                                    
            
            
                | 305 |  |  |                     mets_url = directory / mets_url | 
            
                                                        
            
                                    
            
            
                | 306 |  |  |                 else: | 
            
                                                        
            
                                    
            
            
                | 307 |  |  |                     mets_url = Path(mets_url).resolve() | 
            
                                                        
            
                                    
            
            
                | 308 |  |  |                     if not is_file_in_directory(directory, mets_url): | 
            
                                                        
            
                                    
            
            
                | 309 |  |  |                         raise ValueError("--mets '%s' has a directory part inconsistent with --directory '%s'" % (mets_url, directory)) | 
            
                                                        
            
                                    
            
            
                | 310 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 311 |  |  |         if mets_server_url and not mets_server_url.startswith('http://'): | 
            
                                                        
            
                                    
            
            
                | 312 |  |  |             # UDS socket | 
            
                                                        
            
                                    
            
            
                | 313 |  |  |             mets_server_url = str(Path(mets_server_url).resolve()) | 
            
                                                        
            
                                    
            
            
                | 314 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 315 |  |  |         return str(Path(directory).resolve()), str(mets_url), str(mets_basename), mets_server_url | 
            
                                                        
            
                                    
            
            
                | 316 |  |  |  |