| Total Complexity | 203 | 
| Total Lines | 1246 | 
| Duplicated Lines | 4.98 % | 
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like ocrd.workspace often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | import io  | 
            ||
| 2 | from os import makedirs, unlink, listdir, path  | 
            ||
| 3 | from pathlib import Path  | 
            ||
| 4 | from shutil import copyfileobj  | 
            ||
| 5 | from re import sub  | 
            ||
| 6 | from tempfile import NamedTemporaryFile  | 
            ||
| 7 | from contextlib import contextmanager  | 
            ||
| 8 | from typing import Optional, Union, Callable  | 
            ||
| 9 | |||
| 10 | from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor  | 
            ||
| 11 | from PIL import Image  | 
            ||
| 12 | import numpy as np  | 
            ||
| 13 | from deprecated.sphinx import deprecated  | 
            ||
| 14 | import requests  | 
            ||
| 15 | |||
| 16 | from ocrd_models import OcrdMets, OcrdFile  | 
            ||
| 17 | from ocrd_models.ocrd_file import ClientSideOcrdFile  | 
            ||
| 18 | from ocrd_models.ocrd_page import parse, BorderType, to_xml  | 
            ||
| 19 | from ocrd_modelfactory import exif_from_filename, page_from_file  | 
            ||
| 20 | from ocrd_utils import (  | 
            ||
| 21 | atomic_write,  | 
            ||
| 22 | config,  | 
            ||
| 23 | getLogger,  | 
            ||
| 24 | image_from_polygon,  | 
            ||
| 25 | coordinates_of_segment,  | 
            ||
| 26 | adjust_canvas_to_rotation,  | 
            ||
| 27 | adjust_canvas_to_transposition,  | 
            ||
| 28 | scale_coordinates,  | 
            ||
| 29 | shift_coordinates,  | 
            ||
| 30 | rotate_coordinates,  | 
            ||
| 31 | transform_coordinates,  | 
            ||
| 32 | transpose_coordinates,  | 
            ||
| 33 | crop_image,  | 
            ||
| 34 | rotate_image,  | 
            ||
| 35 | transpose_image,  | 
            ||
| 36 | bbox_from_polygon,  | 
            ||
| 37 | polygon_from_points,  | 
            ||
| 38 | xywh_from_bbox,  | 
            ||
| 39 | pushd_popd,  | 
            ||
| 40 | is_local_filename,  | 
            ||
| 41 | deprecated_alias,  | 
            ||
| 42 | DEFAULT_METS_BASENAME,  | 
            ||
| 43 | MIME_TO_EXT,  | 
            ||
| 44 | MIME_TO_PIL,  | 
            ||
| 45 | MIMETYPE_PAGE,  | 
            ||
| 46 | REGEX_PREFIX,  | 
            ||
| 47 | )  | 
            ||
| 48 | |||
| 49 | from .workspace_backup import WorkspaceBackupManager  | 
            ||
| 50 | from .mets_server import ClientSideOcrdMets  | 
            ||
| 51 | |||
| 52 | __all__ = ['Workspace']  | 
            ||
| 53 | |||
@contextmanager
def download_temporary_file(url):
    """
    Download ``url`` into a named temporary file and yield the open file object.

    The temporary file is created with the prefix ``ocrd-download-`` and is
    closed (and thereby cleaned up by ``NamedTemporaryFile``) when the context
    is left.

    Args:
        url (string): URL to retrieve via ``requests.get``

    Yields:
        The open temporary file object; its ``name`` attribute is the path
        consumers can open to read the downloaded bytes.
    """
    with NamedTemporaryFile(prefix='ocrd-download-') as f:
        with requests.get(url) as r:
            f.write(r.content)
        # The file object is buffered: flush before handing it out, so that
        # consumers which re-open the file by name see the complete payload.
        f.flush()
        yield f
            ||
| 60 | |||
| 61 | |||
| 62 | class Workspace():  | 
            ||
| 63 | """  | 
            ||
| 64 | A workspace is a temporary directory set up for a processor. It's the  | 
            ||
| 65 | interface to the METS/PAGE XML and delegates download and upload to the  | 
            ||
| 66 | :py:class:`ocrd.resolver.Resolver`.  | 
            ||
| 67 | |||
| 68 | Args:  | 
            ||
| 69 | resolver (:py:class:`ocrd.Resolver`) : `Resolver` instance  | 
            ||
| 70 | directory (string) : Filesystem path to work in  | 
            ||
| 71 | mets (:py:class:`ocrd_models.ocrd_mets.OcrdMets`) : `OcrdMets` representing this workspace.  | 
            ||
| 72 | If `None`, then loaded from ``directory``/``mets_basename``  | 
            ||
| 73 | or delegated to ``mets_server_url``.  | 
            ||
| 74 | mets_basename (string, mets.xml) : Basename of the METS XML file in the workspace directory.  | 
            ||
| 75 | mets_server_url (string, None) : URI of TCP or local path of UDS for METS server handling the  | 
            ||
| 76 | `OcrdMets` of this workspace. If `None`, then the METS will be read from and written to  | 
            ||
| 77 | the filesystem directly.  | 
            ||
| 78 | baseurl (string, None) : Base URL to prefix to relative URL.  | 
            ||
| 79 | """  | 
            ||
| 80 | |||
def __init__(
    self,
    resolver,
    directory,
    mets: Optional[Union[OcrdMets, ClientSideOcrdMets]] = None,
    mets_basename=DEFAULT_METS_BASENAME,
    automatic_backup=False,
    baseurl=None,
    mets_server_url=None
):
    """
    Initialize the workspace state.

    When no ``mets`` is passed, it is either obtained from the METS server at
    ``mets_server_url`` (if given) or loaded from ``directory``/``mets_basename``.

    Raises:
        ValueError: if the METS server reports a workspace path that differs
            from ``directory``.
    """
    self.resolver = resolver
    self.directory = directory
    self.mets_target = str(Path(directory, mets_basename))
    self.is_remote = bool(mets_server_url)

    if mets is None and self.is_remote:
        # Delegate METS handling to the server, but make sure both sides
        # talk about the same workspace directory.
        mets = ClientSideOcrdMets(mets_server_url, self.directory)
        if mets.workspace_path != self.directory:
            raise ValueError(f"METS server {mets_server_url} workspace directory '{mets.workspace_path}' differs "
                             f"from local workspace directory '{self.directory}'. These are not the same workspaces.")
    elif mets is None:
        mets = OcrdMets(filename=self.mets_target)
    self.mets = mets

    if not automatic_backup:
        self.automatic_backup = None
    else:
        self.automatic_backup = WorkspaceBackupManager(self)
        self.automatic_backup.add()

    self.baseurl = baseurl
            ||
| 111 | |||
def __repr__(self):
    """
    Summarize the workspace: remote flag, directory, base URL, file groups
    and the string form of every file found in the METS.
    """
    template = 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]'
    values = (
        self.is_remote,
        self.directory,
        self.baseurl,
        self.mets.file_groups,
        list(map(str, self.mets.find_all_files())),
    )
    return template % values
            ||
| 120 | |||
def reload_mets(self):
    """
    Refresh the METS of this workspace.

    For a remote workspace the existing METS object is asked to reload;
    otherwise a new ``OcrdMets`` is read from ``self.mets_target``.
    """
    if not self.is_remote:
        self.mets = OcrdMets(filename=self.mets_target)
        return
    self.mets.reload()
            ||
| 129 | |||
@deprecated_alias(pageId="page_id")
@deprecated_alias(ID="file_id")
@deprecated_alias(fileGrp="file_grp")
@deprecated_alias(fileGrp_mapping="filegrp_mapping")
def merge(self, other_workspace, copy_files=True, overwrite=False, **kwargs):
    """
    Merge ``other_workspace`` into this one

    See :py:meth:`ocrd_models.ocrd_mets.OcrdMets.merge` for the `kwargs`

    Keyword Args:
        copy_files (boolean): Whether to copy files from `other_workspace` to this one.
            If false, the ``local_filename`` of merged files is re-prefixed with the
            path of `other_workspace` relative to this workspace instead.
        overwrite (boolean): Whether copying may replace files already existing
            in this workspace (otherwise a `FileExistsError` is raised)
    """
    def after_add_cb(f):
        """callback to run on merged OcrdFile instances in the destination"""
        if not f.local_filename:
            # nothing on disk to copy or re-reference
            return
        if not copy_files:
            other_root = Path(other_workspace.directory).resolve()
            own_root = Path(self.directory).resolve()
            # raises ValueError if the other workspace is not below this one
            f.local_filename = other_root.relative_to(own_root) / f.local_filename
            return
        source = Path(other_workspace.directory, f.local_filename)
        target = Path(self.directory, f.local_filename)
        if not source.exists():
            return
        if target.exists() and not overwrite:
            raise FileExistsError("Copying %s to %s would overwrite the latter" % (source, target))
        if not target.parent.is_dir():
            makedirs(str(target.parent))
        with open(str(source), 'rb') as fstream_in, open(str(target), 'wb') as fstream_out:
            copyfileobj(fstream_in, fstream_out)

    # translate the snake_case keywords back to the names passed on to OcrdMets.merge
    for snake_name, mets_name in (
            ('page_id', 'pageId'),
            ('file_id', 'ID'),
            ('file_grp', 'fileGrp'),
            ('filegrp_mapping', 'fileGrp_mapping'),
    ):
        if snake_name in kwargs:
            kwargs[mets_name] = kwargs.pop(snake_name)

    self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs)
            ||
| 173 | |||
| 174 | |||
@deprecated(version='1.0.0', reason="Use workspace.download_file")
def download_url(self, url, **kwargs):
    """
    Download a URL to the workspace.

    A placeholder file entry is registered in a throw-away empty METS
    (group ``DEPRECATED``, ID taken from the last path component of ``url``)
    and then passed to :py:meth:`download_file`.

    Args:
        url (string): URL to download to directory
        **kwargs : accepted, but not used by this implementation

    Returns:
        The local filename of the downloaded file
    """
    scratch_mets = OcrdMets.empty_mets()
    placeholder = scratch_mets.add_file('DEPRECATED', ID=Path(url).name, url=url)
    return self.download_file(placeholder).local_filename
            ||
| 191 | |||
def download_file(self, f, _recursion_count=0):
    """
    Download a :py:class:`ocrd_models.ocrd_file.OcrdFile` to the workspace.

    Resolution order (all relative to the workspace directory):

    1. no ``local_filename`` but a ``url``: download the URL into the
       file group subdirectory and set ``local_filename``
    2. ``local_filename`` exists: keep it if inside the workspace directory,
       otherwise copy it into the workspace
    3. ``local_filename`` does not exist: download from ``url`` or, failing
       that, from ``baseurl``/``local_filename`` to that local path

    Returns:
        the (possibly updated) file ``f``

    Raises:
        FileNotFoundError: ``local_filename`` does not exist and there is
            neither a ``url`` nor a workspace ``baseurl``
        ValueError: the file has neither ``url`` nor ``local_filename``
    """
    log = getLogger('ocrd.workspace.download_file')
    with pushd_popd(self.directory):
        if not f.local_filename:
            if not f.url:
                raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded")
            # name the download after the file ID (plus extension for the MIME type) if there is one
            if f.ID:
                basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, ''))
            else:
                basename = f.basename
            f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
            return f

        local_path = Path(f.local_filename).absolute()
        if local_path.exists():
            try:
                # raises ValueError if not below the workspace directory
                local_path.relative_to(Path(self.directory).resolve())
            except ValueError:
                log.debug("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory))
                f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp)
            else:
                log.debug(f"'local_filename' {f.local_filename} already within {self.directory} - nothing to do")
            return f

        # local_filename is set but missing on disk: find something to fetch it from
        if f.url:
            log.debug("OcrdFile has 'local_filename' but it doesn't resolve - trying to download from 'url' %s", f.url)
            source = f.url
        elif self.baseurl:
            log.debug("OcrdFile has 'local_filename' but it doesn't resolve, and no 'url' - trying 'baseurl' %s with 'local_filename' %s",
                      self.baseurl, f.local_filename)
            source = '%s/%s' % (self.baseurl, f.local_filename)
        else:
            raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file, "
                                    "and no 'url' to download and no 'baseurl' set on workspace - nothing we can do.")
        target = Path(f.local_filename)
        self.resolver.download_to_directory(self.directory, source, subdir=target.parent, basename=target.name)
        return f
            ||
| 230 | |||
def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False):
    """
    Remove a METS `file` from the workspace.

    Arguments:
        file_id (string|:py:class:`ocrd_models.ocrd_file.OcrdFile`): `@ID` of the METS `file`
            to delete or the file itself
    Keyword Args:
        force (boolean): Continue removing even if file not found in METS
        keep_file (boolean): Whether to keep files on disk
        page_recursive (boolean): Whether to remove all images referenced in the file
            if the file is a PAGE-XML document.
        page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
            Has no effect unless ``page_recursive`` is `True`.
    Returns:
        the removed file, or `None` if a regex ``file_id`` matched nothing
        or a `FileNotFoundError` was suppressed by ``force``
    """
    log = getLogger('ocrd.workspace.remove_file')
    log.debug('Deleting mets:file %s', file_id)
    if isinstance(file_id, OcrdFile):
        file_id = file_id.ID
    try:
        try:
            target = next(self.mets.find_files(ID=file_id))
        except StopIteration:
            if file_id.startswith(REGEX_PREFIX):
                # a regex filter is allowed to match nothing
                return None
            raise FileNotFoundError("File %s not found in METS" % file_id)

        if page_recursive and target.mimetype == MIMETYPE_PAGE:
            with pushd_popd(self.directory):
                page_doc = parse(self.download_file(target).local_filename, silence=True)
                for image_path in page_doc.get_AllAlternativeImagePaths():
                    criteria = {'local_filename': image_path}
                    if page_same_group:
                        criteria['fileGrp'] = target.fileGrp
                    for image_file in self.mets.find_files(**criteria):
                        self.remove_file(image_file, keep_file=keep_file, force=force)

        if not keep_file:
            with pushd_popd(self.directory):
                if target.local_filename:
                    log.debug("rm %s [cwd=%s]", target.local_filename, self.directory)
                    unlink(target.local_filename)
                elif force:
                    log.debug("File not locally available but --force is set: %s", target)
                else:
                    raise Exception("File not locally available %s" % target)

        # Drop the METS entry last, i.e. only after the AlternativeImage recursion
        self.mets.remove_file(file_id)
        return target
    except FileNotFoundError as e:
        if force:
            return None
        raise e
            ||
| 283 | |||
def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False):
    """
    Remove a METS `fileGrp`.

    Arguments:
        USE (string): `@USE` of the METS `fileGrp` to delete
    Keyword Args:
        recursive (boolean): Whether to recursively delete all files in the group
        force (boolean): Continue removing even if group or containing files not found in METS
        keep_files (boolean): When deleting recursively whether to keep files on disk
        page_recursive (boolean): Whether to remove all images referenced in the file
            if the file is a PAGE-XML document.
        page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
            Has no effect unless ``page_recursive`` is `True`.
    """
    is_regex = USE.startswith(REGEX_PREFIX)
    if not (is_regex or USE in self.mets.file_groups or force):
        raise Exception("No such fileGrp: %s" % USE)

    # directories of the removed files, candidates for cleanup below
    touched_dirs = []
    if recursive:
        for member in self.mets.find_files(fileGrp=USE):
            self.remove_file(member, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group)
            if not member.local_filename:
                continue
            member_dir = path.dirname(member.local_filename)
            if member_dir:
                touched_dirs.append(member_dir)

    self.mets.remove_file_group(USE, force=force, recursive=recursive)

    # NOTE: directories are only removed if they are empty: the one named
    # after the fileGrp (an OCR-D convention) and those that held removed files.
    with pushd_popd(self.directory):
        group_dir = Path(USE)
        if group_dir.is_dir() and not listdir(USE):
            group_dir.rmdir()
        for dirname in set(touched_dirs):
            if Path(dirname).is_dir() and not listdir(dirname):
                Path(dirname).rmdir()
            ||
| 322 | |||
| 323 | |||
def rename_file_group(self, old, new):
    """
    Rename a METS `fileGrp`.

    Moves the local files of the group on disk, updates their
    ``local_filename`` and (where the new ID is still free) their ID,
    rewrites references to the moved files in all local PAGE-XML files,
    renames the `fileGrp` in the METS and finally removes the old directory
    if it is empty.

    ``old`` is matched literally (regex metacharacters are escaped) and
    ``new`` is inserted literally when rewriting paths and IDs.

    Arguments:
        old (string): `@USE` of the METS `fileGrp` to rename
        new (string): `@USE` of the METS `fileGrp` to rename as
    Raises:
        ValueError: if ``old`` does not exist or ``new`` already exists
    """
    # local import: only ``sub`` is imported from ``re`` at file level
    from re import escape

    log = getLogger('ocrd.workspace.rename_file_group')

    if old not in self.mets.file_groups:
        raise ValueError(f"No such fileGrp: {old}")
    if new in self.mets.file_groups:
        raise ValueError(f"fileGrp already exists {new}")

    # Escape the group name so characters like '.' or '+' only match themselves
    old_pattern = escape(old)

    with pushd_popd(self.directory):
        # create workspace dir ``new``
        log.debug("mkdir %s" % new)
        if not Path(new).is_dir():
            Path(new).mkdir()
        local_filename_replacements = {}
        log.debug("Moving files")
        for mets_file in self.mets.find_files(fileGrp=old, local_only=True):
            old_local_filename = mets_file.local_filename
            assert old_local_filename
            # Directory part (callable replacement: ``new`` is taken verbatim,
            # without interpreting backslashes as template escapes)
            new_local_filename = sub('^%s/' % old_pattern, lambda _: '%s/' % new, old_local_filename)
            # File part
            new_local_filename = sub('/%s' % old_pattern, lambda _: '/%s' % new, new_local_filename)
            local_filename_replacements[str(old_local_filename)] = new_local_filename
            # move file from ``old`` to ``new``
            Path(old_local_filename).rename(new_local_filename)
            # change the path of ``mets:file``
            mets_file.local_filename = new_local_filename
            # change the file ID, unless the new ID is already taken
            new_id = sub('^%s' % old_pattern, lambda _: new, mets_file.ID)
            try:
                next(self.mets.find_files(ID=new_id))
                log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_local_filename, new_local_filename))
            except StopIteration:
                mets_file.ID = new_id
        # change file paths in PAGE-XML imageFilename and filename attributes
        for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE, local_only=True):
            log.debug("Renaming file references in PAGE-XML %s" % page_file)
            pcgts = page_from_file(page_file)
            changed = False
            for old_local_filename, new_local_filename in local_filename_replacements.items():
                if pcgts.get_Page().imageFilename == old_local_filename:
                    changed = True
                    log.debug("Rename pc:Page/@imageFilename: %s -> %s" % (old_local_filename, new_local_filename))
                    pcgts.get_Page().imageFilename = new_local_filename
            for ai in pcgts.get_Page().get_AllAlternativeImages():
                for old_local_filename, new_local_filename in local_filename_replacements.items():
                    if ai.filename == old_local_filename:
                        changed = True
                        log.debug("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_local_filename, new_local_filename))
                        ai.filename = new_local_filename
            if changed:
                # only rewrite PAGE-XML files that actually referenced a moved file
                log.debug("PAGE-XML changed, writing %s" % (page_file.local_filename))
                with open(page_file.local_filename, 'w', encoding='utf-8') as f:
                    f.write(to_xml(pcgts))
        # change the ``USE`` attribute of the fileGrp
        self.mets.rename_file_group(old, new)
        # Remove the old dir (only if nothing is left in it)
        log.debug("rmdir %s" % old)
        if Path(old).is_dir() and not listdir(old):
            Path(old).rmdir()
            ||
| 393 | |||
@deprecated_alias(pageId="page_id")
@deprecated_alias(ID="file_id")
def add_file(self, file_grp, content=None, **kwargs) -> Union[OcrdFile, ClientSideOcrdFile]:
    """
    Add a file to the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace.

    Arguments:
        file_grp (string): `@USE` of the METS `fileGrp` to add to
    Keyword Args:
        content (string|bytes): optional content to write to the file
            in the filesystem (strings are encoded as UTF-8)
        **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.add_file`;
            ``page_id`` is mandatory (but may be `None`)
    Returns:
        a new :py:class:`ocrd_models.ocrd_file.OcrdFile`
    Raises:
        ValueError: if no ``page_id`` keyword was passed
        Exception: if ``content`` is given without a ``local_filename``
    """
    log = getLogger('ocrd.workspace.add_file')
    local_filename = kwargs.get('local_filename')
    has_content = content is not None
    log.debug('outputfile file_grp=%s local_filename=%s content=%s', file_grp, local_filename, has_content)
    if 'page_id' not in kwargs:
        raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.")
    if has_content and not local_filename:
        raise Exception("'content' was set but no 'local_filename'")

    with pushd_popd(self.directory):
        if local_filename:
            # Create the folder part of the local filename, if it has one
            filename_text = str(local_filename)
            parent_dir = filename_text.rsplit('/', 1)[0]
            if parent_dir != filename_text and not Path(parent_dir).is_dir():
                makedirs(parent_dir, exist_ok=True)

        # OcrdMets.add_file is called with the METS-style keyword names
        kwargs["pageId"] = kwargs.pop("page_id")
        if "file_id" in kwargs:
            kwargs["ID"] = kwargs.pop("file_id")
        if config.OCRD_EXISTING_OUTPUT == 'OVERWRITE':
            kwargs["force"] = True

        ret = self.mets.add_file(file_grp, **kwargs)

        # content being set implies is_remote==False because METS server
        # does not pass file contents
        if has_content:
            with open(local_filename, 'wb') as f:
                if isinstance(content, str):
                    content = bytes(content, 'utf-8')
                f.write(content)

    return ret
            ||
| 445 | |||
def save_mets(self):
    """
    Persist the current state of the METS.

    A remote workspace delegates to the ``save`` method of its METS object.
    Otherwise a backup is added first (if automatic backup is enabled) and
    the serialized METS is written to ``self.mets_target`` via ``atomic_write``.
    """
    log = getLogger('ocrd.workspace.save_mets')
    if self.is_remote:
        self.mets.save()
        return
    log.debug("Saving mets '%s'", self.mets_target)
    if self.automatic_backup:
        WorkspaceBackupManager(self).add()
    with atomic_write(self.mets_target) as f:
        xml_text = self.mets.to_xml(xmllint=True).decode('utf-8')
        f.write(xml_text)
            ||
| 459 | |||
def _apply_mets_file(self, filename_or_url: str, fun: Callable):
    """
    Call ``fun`` on a local path for ``filename_or_url`` and return its result.

    Tried in this order, relative to the workspace directory:

    1. ``filename_or_url`` itself, if it exists as a path
    2. the first METS file with that ``local_filename``
    3. the first METS file with that ``url`` (downloaded to the workspace first)
    4. a temporary download of ``filename_or_url``

    Raises:
        ValueError: if ``filename_or_url`` is empty
    """
    if not filename_or_url:
        # an empty filter would "find" just any file
        raise ValueError("requires non-empty filename or URL")
    with pushd_popd(self.directory):
        if Path(filename_or_url).exists():
            return fun(filename_or_url)
        wanted = str(filename_or_url)
        by_local_filename = next(self.mets.find_files(local_filename=wanted), None)
        if by_local_filename:
            return fun(by_local_filename.local_filename)
        by_url = next(self.mets.find_files(url=wanted), None)
        if by_url:
            return fun(self.download_file(by_url).local_filename)
        with download_temporary_file(filename_or_url) as f:
            return fun(f.name)
            ||
| 473 | |||
def resolve_image_exif(self, image_url):
    """
    Get the EXIF metadata about an image URL as :py:class:`ocrd_models.ocrd_exif.OcrdExif`

    The image is located via :py:meth:`_apply_mets_file` and passed to
    ``exif_from_filename``.

    Args:
        image_url (string) : `@href` (path or URL) of the METS `file` to inspect

    Returns:
        :py:class:`ocrd_models.ocrd_exif.OcrdExif`
    """
    exif = self._apply_mets_file(image_url, exif_from_filename)
    return exif
            ||
| 485 | |||
@deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment")
def resolve_image_as_pil(self, image_url, coords=None):
    """
    Resolve an image URL to a `PIL.Image`.

    Deprecated public wrapper that forwards both arguments to
    :py:meth:`_resolve_image_as_pil`.

    Arguments:
        image_url (string): `@href` (path or URL) of the METS `file` to retrieve
    Keyword Args:
        coords (list) : Coordinates of the bounding box to cut from the image

    Returns:
        Full or cropped `PIL.Image`
    """
    pil_image = self._resolve_image_as_pil(image_url, coords)
    return pil_image
            ||
| 501 | |||
def _resolve_image_as_pil(self, image_url, coords=None):
    """
    Resolve an image URL to a fully loaded `PIL.Image` of at most 8 bit depth.

    Arguments:
        image_url (string): `@href` (path or URL) of the METS `file` to retrieve
    Keyword Args:
        coords (list) : Coordinates of the bounding box to cut from the image

    Images in an integer (``I*``) or floating point (``F``) mode are
    re-quantized to 8 bit via Numpy.

    Returns:
        The full `PIL.Image` if ``coords`` is None, otherwise a new
        `PIL.Image` cut to the bounding box of ``coords``.
    """
    log = getLogger('ocrd.workspace._resolve_image_as_pil')

    def _open_and_load(image_path):
        # Read the pixel data while the path handed out by _apply_mets_file
        # is still inside that method's context (it may be a temporary
        # download); loading also allocates and gives up the FD.
        image = Image.open(image_path)
        image.load()
        return image

    pil_image = self._apply_mets_file(image_url, _open_and_load)

    # Pillow does not properly support higher color depths
    # (e.g. 16-bit or 32-bit or floating point grayscale),
    # clipping its dynamic range to the lower 8-bit in
    # many operations (including paste, putalpha, ImageStat...),
    # even including conversion.
    # Cf. Pillow#3011 Pillow#3159 Pillow#3838 (still open in 8.0)
    # So to be on the safe side, we must re-quantize these
    # to 8-bit via numpy (conversion to/from which fortunately
    # seems to work reliably):
    if pil_image.mode.startswith(('I', 'F')):
        arr_image = np.array(pil_image)
        if arr_image.dtype.kind == 'i':
            # signed integer is *not* trustworthy in this context
            # (usually a mistake in the array interface)
            log.debug('Casting image "%s" from signed to unsigned', image_url)
            # reinterpret the same memory as unsigned (no value conversion);
            # a view replaces the discouraged in-place assignment to .dtype
            arr_image = arr_image.view(np.dtype('u' + arr_image.dtype.name))
        if arr_image.dtype.kind == 'u':
            # integer needs to be scaled linearly to 8 bit
            # of course, an image might actually have some lower range
            # (e.g. 10-bit in I;16 or 20-bit in I or 4-bit in L),
            # but that would be guessing anyway, so here don't
            # make assumptions on _scale_, just reduce _precision_
            log.debug('Reducing image "%s" from depth %d bit to 8 bit',
                      image_url, arr_image.dtype.itemsize * 8)
            arr_image = arr_image >> 8 * (arr_image.dtype.itemsize - 1)
            arr_image = arr_image.astype(np.uint8)
        elif arr_image.dtype.kind == 'f':
            # float needs to be scaled from [0,1.0] to [0,255]
            log.debug('Reducing image "%s" from floating point to 8 bit',
                      image_url)
            # saturate instead of casting out-of-range values to uint8
            arr_image = np.clip(arr_image * 255, 0, 255).astype(np.uint8)
        pil_image = Image.fromarray(arr_image)

    if coords is None:
        return pil_image

    # FIXME: remove or replace this by (image_from_polygon+) crop_image ...
    # NOTE(review): the array is converted with a *2BGR constant and then
    # passed to Image.fromarray without converting back - confirm whether
    # callers rely on that channel order.
    log.debug("Converting PIL to OpenCV: %s", image_url)
    color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else COLOR_RGB2BGR
    pil_as_np_array = np.array(pil_image)
    if pil_image.mode == '1':
        pil_as_np_array = pil_as_np_array.astype('uint8')
    cv2_image = cvtColor(pil_as_np_array, color_conversion)

    poly = np.array(coords, np.int32)
    log.debug("Cutting region %s from %s", coords, image_url)
    # rows are indexed by y (column 1 of poly), columns by x (column 0)
    region_cut = cv2_image[
        np.min(poly[:, 1]):np.max(poly[:, 1]),
        np.min(poly[:, 0]):np.max(poly[:, 0])
    ]
    return Image.fromarray(region_cut)
            ||
| 558 | |||
def image_from_page(self, page, page_id,
                    fill='background', transparency=False,
                    feature_selector='', feature_filter='', filename=''):
    """Extract an image for a PAGE-XML page from the workspace.

    Args:
        page (:py:class:`ocrd_models.ocrd_page.PageType`): a PAGE `PageType` object
        page_id (string): its `@ID` in the METS physical `structMap`
    Keyword Args:
        fill (string): a `PIL` color specifier, or `background` or `none`
        transparency (boolean): whether to add an alpha channel for masking
        feature_selector (string): a comma-separated list of `@comments` classes
        feature_filter (string): a comma-separated list of `@comments` classes
        filename (string): which file path to use

    Extract a `PIL.Image` from ``page``, either from its `AlternativeImage`
    (if it exists), or from its `@imageFilename` (otherwise). Also crop it,
    if a `Border` exists, and rotate it, if any `@orientation` angle is
    annotated.

    If ``filename`` is given, then among `@imageFilename` and the available
    `AlternativeImage/@filename` images, pick that one, or raise an error.

    If ``feature_selector`` and/or ``feature_filter`` is given, then
    among the `@imageFilename` image and the available AlternativeImages,
    select/filter the richest one which contains all of the selected,
    but none of the filtered features (i.e. `@comments` classes), or
    raise an error. Features are compared as whole comma-separated
    entries, not as substrings.

    (Required and produced features need not be in the same order, so
    ``feature_selector`` is merely a mask specifying Boolean AND, and
    ``feature_filter`` is merely a mask specifying Boolean OR.)

    If the chosen image does not have the feature `"cropped"` yet, but
    a `Border` exists, and unless `"cropped"` is being filtered, then crop it.
    Likewise, if the chosen image does not have the feature `"deskewed"` yet,
    but an `@orientation` angle is annotated, and unless `"deskewed"` is being
    filtered, then rotate it. (However, if `@orientation` is above the
    [-45°,45°] interval, then apply as much transposition as possible first,
    unless `"rotated-90"` / `"rotated-180"` / `"rotated-270"` is being filtered.)

    Cropping uses a polygon mask (not just the bounding box rectangle).
    Areas outside the polygon will be filled according to ``fill``:

    - if `"background"` (the default),
      then fill with the median color of the image;
    - else if `"none"`, then avoid masking polygons where possible
      (i.e. when cropping) or revert to the default (i.e. when rotating)
    - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.

    Moreover, if ``transparency`` is true, and unless the image already
    has an alpha channel, then add an alpha channel which is fully opaque
    before cropping and rotating. (Thus, unexposed/masked areas will be
    transparent afterwards for consumers that can interpret alpha channels).

    Returns:
        a tuple of
         * the extracted `PIL.Image`,
         * a `dict` with information about the extracted image:

           - `"transform"`: a `Numpy` array with an affine transform which
             converts from absolute coordinates to those relative to the image,
             i.e. after cropping to the page's border / bounding box (if any)
             and deskewing with the page's orientation angle (if any)
           - `"angle"`: the rotation/reflection angle applied to the image so far,
           - `"DPI"`: the pixel density of the original image,
           - `"features"`: the `AlternativeImage` `@comments` for the image, i.e.
             names of all applied operations that lead up to this result,
         * an :py:class:`ocrd_models.ocrd_exif.OcrdExif` instance associated with
           the original image.

    (The first two can be used to annotate a new `AlternativeImage`,
    or be passed down with :py:meth:`image_from_segment`.)

    Raises:
        Exception: if the resulting image does not satisfy ``filename``,
            ``feature_selector`` or ``feature_filter``.

    Examples:

     * get a raw (colored) but already deskewed and cropped image::

            page_image, page_coords, page_image_info = workspace.image_from_page(
                page, page_id,
                feature_selector='deskewed,cropped',
                feature_filter='binarized,grayscale_normalized')
    """
    log = getLogger('ocrd.workspace.image_from_page')
    page_image_info = self.resolve_image_exif(page.imageFilename)
    page_image = self._resolve_image_as_pil(page.imageFilename)
    page_coords = {}
    # use identity as initial affine coordinate transform:
    page_coords['transform'] = np.eye(3)
    # interim geometry (updated with each change to the transform):
    page_xywh = {'x': 0, 'y': 0,
                 'w': page_image.width, 'h': page_image.height}

    # requested feature masks, parsed once (empty entries carry no meaning)
    selected_features = [feature for feature in feature_selector.split(',') if feature]
    filtered_features = [feature for feature in feature_filter.split(',') if feature]

    border = page.get_Border()
    # page angle: PAGE @orientation is defined clockwise,
    # whereas PIL/ndimage rotation is in mathematical direction:
    page_coords['angle'] = -(page.get_orientation() or 0)
    # map angle from (-180,180] to [0,360], and partition into multiples of 90;
    # but avoid unnecessary large remainders, i.e. split symmetrically:
    orientation = (page_coords['angle'] + 45) % 360
    orientation = orientation - (orientation % 90)
    skew = (page_coords['angle'] % 360) - orientation
    skew = 180 - (180 - skew) % 360  # map to [-45,45]
    page_coords['angle'] = 0  # nothing applied yet (depends on filters)
    rotation_feature = 'rotated-%d' % orientation
    log.debug("page '%s' has %s orientation=%d skew=%.2f",
              page_id, "border," if border else "", orientation, skew)
    if page_image_info.resolution != 1:
        dpi = page_image_info.resolution
        if page_image_info.resolutionUnit == 'cm':
            dpi = round(dpi * 2.54)
        dpi = int(dpi)
        log.debug("page '%s' images will use %d DPI from image meta-data", page_id, dpi)
        page_coords['DPI'] = dpi

    # initialize AlternativeImage@comments classes as empty:
    page_coords['features'] = ''
    best_image = None
    best_index = None
    alternative_images = page.get_AlternativeImage()
    if alternative_images:
        # (e.g. from page-level cropping, binarization, deskewing or despeckling)
        best_features = set()
        auto_features = {'cropped', 'deskewed', 'rotated-90', 'rotated-180', 'rotated-270'}
        # search to the end, because by convention we always append,
        # and among multiple satisfactory images we want the most recent,
        # but also ensure that we get the richest feature set, i.e. most
        # of those features that we cannot reproduce automatically below
        for index, alternative_image in enumerate(alternative_images, start=1):
            if filename and filename != alternative_image.filename:
                continue
            features = alternative_image.get_comments()
            if not features:
                log.warning("AlternativeImage %d for page '%s' does not have any feature attributes",
                            index, page_id)
                features = ''
            # drop empty entries, so that a missing @comments does not
            # count as a (non-automatic) feature when ranking richness
            featureset = {feature for feature in features.split(',') if feature}
            if (all(feature in featureset for feature in selected_features) and
                not any(feature in featureset for feature in filtered_features) and
                len(featureset.difference(auto_features)) >=
                len(best_features.difference(auto_features))):
                best_features = featureset
                best_image = alternative_image
                best_index = index
        if best_image:
            log.debug("Using AlternativeImage %d %s for page '%s'",
                      best_index, best_features, page_id)
            page_image = self._resolve_image_as_pil(best_image.get_filename())
            # including duplicates; fall back to the empty string for an
            # image without @comments, so that splitting below cannot fail
            page_coords['features'] = best_image.get_comments() or ''

    # adjust the coord transformation to the steps applied on the image,
    # and apply steps on the existing image in case it is missing there,
    # but traverse all steps (crop/reflect/rotate) in a particular order:
    # - existing image features take priority (in the order annotated),
    # - next is cropping (if necessary but not already applied),
    # - next is reflection (if necessary but not already applied),
    # - next is rotation (if necessary but not already applied).
    # This helps deal with arbitrary workflows (e.g. crop then deskew,
    # or deskew then crop), regardless of where images are generated.
    alternative_image_features = page_coords['features'].split(',')
    # features relevant in reconstructing coordinates:
    geometry_features = ('cropped', 'deskewed', 'rotated-90',
                         'rotated-180', 'rotated-270')
    duplicate_features = {feature for feature in alternative_image_features
                          if (feature in geometry_features and
                              alternative_image_features.count(feature) > 1)}
    for duplicate_feature in duplicate_features:
        log.error("Duplicate feature %s in AlternativeImage for page '%s'",
                  duplicate_feature, page_id)

    steps = list(alternative_image_features)
    if (border and
        'cropped' not in alternative_image_features and
        'cropped' not in filtered_features):
        steps.append('cropped')
    if (orientation and
        rotation_feature not in alternative_image_features and
        rotation_feature not in filtered_features):
        steps.append(rotation_feature)
    if (skew and
        'deskewed' not in alternative_image_features and
        'deskewed' not in filtered_features):
        steps.append('deskewed')
    # not a feature to be added, but merely as a fallback position
    # to always enter loop at i == len(alternative_image_features)
    steps.append('_check')

    name = "%s for page '%s'" % ("AlternativeImage" if best_image
                                 else "original image", page_id)
    for i, feature in enumerate(steps):
        # image geometry vs feature consistency can only be checked
        # after all features on the existing AlternativeImage have
        # been adjusted for in the transform, and when there is a mismatch,
        # additional steps applied here would only repeat the respective
        # error message; so we only check once at the boundary between
        # existing and new features
        # FIXME we should check/enforce consistency when _adding_ AlternativeImage
        if (i == len(alternative_image_features) and
            not (page_xywh['w'] - 2 < page_image.width < page_xywh['w'] + 2 and
                 page_xywh['h'] - 2 < page_image.height < page_xywh['h'] + 2)):
            log.error('page "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                      page_id, page_coords['features'],
                      page_image.width, page_image.height,
                      page_xywh['w'], page_xywh['h'])
        # adjust transform to feature, and ensure feature is applied to image
        if feature == 'cropped':
            page_image, page_coords, page_xywh = _crop(
                log, name, border, page_image, page_coords,
                fill=fill, transparency=transparency)
        elif feature == rotation_feature:
            page_image, page_coords, page_xywh = _reflect(
                log, name, orientation, page_image, page_coords, page_xywh)
        elif feature == 'deskewed':
            page_image, page_coords, page_xywh = _rotate(
                log, name, skew, border, page_image, page_coords, page_xywh,
                fill=fill, transparency=transparency)

    # verify constraints again (on whole feature names, like the
    # selection above - not on substrings of the features string):
    final_features = set(page_coords['features'].split(','))
    if filename and not getattr(page_image, 'filename', '').endswith(filename):
        raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                        'filename="%s" in page "%s"' % (
                            filename, page_id))
    if not all(feature in final_features for feature in selected_features):
        raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                        'selector="%s" in page "%s"' % (
                            feature_selector, page_id))
    if any(feature in final_features for feature in filtered_features):
        raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                        'filter="%s" in page "%s"' % (
                            feature_filter, page_id))
    # ensure DPI will be set in image meta-data again
    if 'DPI' in page_coords:
        dpi = page_coords['DPI']
        if 'dpi' not in page_image.info:
            page_image.info['dpi'] = (dpi, dpi)
    return page_image, page_coords, page_image_info
            ||
| 796 | |||
| 797 | def image_from_segment(self, segment, parent_image, parent_coords,  | 
            ||
| 798 | fill='background', transparency=False,  | 
            ||
| 799 | feature_selector='', feature_filter='', filename=''):  | 
            ||
| 800 | """Extract an image for a PAGE-XML hierarchy segment from its parent's image.  | 
            ||
| 801 | |||
| 802 | Args:  | 
            ||
| 803 | segment (object): a PAGE segment object \  | 
            ||
| 804 | (i.e. :py:class:`~ocrd_models.ocrd_page.TextRegionType` \  | 
            ||
| 805 | or :py:class:`~ocrd_models.ocrd_page.TextLineType` \  | 
            ||
| 806 | or :py:class:`~ocrd_models.ocrd_page.WordType` \  | 
            ||
| 807 | or :py:class:`~ocrd_models.ocrd_page.GlyphType`)  | 
            ||
| 808 | parent_image (`PIL.Image`): image of the `segment`'s parent  | 
            ||
| 809 | parent_coords (dict): a `dict` with information about `parent_image`:  | 
            ||
| 810 | |||
| 811 | - `"transform"`: a `Numpy` array with an affine transform which  | 
            ||
| 812 | converts from absolute coordinates to those relative to the image,  | 
            ||
| 813 | i.e. after applying all operations (starting with the original image)  | 
            ||
| 814 | - `"angle"`: the rotation/reflection angle applied to the image so far,  | 
            ||
| 815 | - `"DPI"`: the pixel density of the parent image,  | 
            ||
| 816 | - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e.  | 
            ||
| 817 | names of all operations that lead up to this result, and  | 
            ||
| 818 | Keyword Args:  | 
            ||
| 819 | fill (string): a `PIL` color specifier, or `background` or `none`  | 
            ||
| 820 | transparency (boolean): whether to add an alpha channel for masking  | 
            ||
| 821 | feature_selector (string): a comma-separated list of ``@comments`` classes  | 
            ||
| 822 | feature_filter (string): a comma-separated list of ``@comments`` classes  | 
            ||
| 823 | |||
| 824 | Extract a `PIL.Image` from `segment`, either from ``AlternativeImage``  | 
            ||
| 825 | (if it exists), or producing a new image via cropping from `parent_image`  | 
            ||
| 826 | (otherwise). Pass in `parent_image` and `parent_coords` from the result  | 
            ||
| 827 | of the next higher-level of this function or from :py:meth:`image_from_page`.  | 
            ||
| 828 | |||
| 829 | If ``filename`` is given, then among the available `AlternativeImage/@filename`  | 
            ||
| 830 | images, pick that one, or raise an error.  | 
            ||
| 831 | |||
| 832 | If ``feature_selector`` and/or ``feature_filter`` is given, then  | 
            ||
| 833 | among the cropped `parent_image` and the available AlternativeImages,  | 
            ||
| 834 | select/filter the richest one which contains all of the selected,  | 
            ||
| 835 | but none of the filtered features (i.e. ``@comments`` classes), or  | 
            ||
| 836 | raise an error.  | 
            ||
| 837 | |||
| 838 | (Required and produced features need not be in the same order, so  | 
            ||
| 839 | `feature_selector` is merely a mask specifying Boolean AND, and  | 
            ||
| 840 | `feature_filter` is merely a mask specifying Boolean OR.)  | 
            ||
| 841 | |||
| 842 | Cropping uses a polygon mask (not just the bounding box rectangle).  | 
            ||
| 843 | Areas outside the polygon will be filled according to `fill`:  | 
            ||
| 844 | |||
| 845 | - if `"background"` (the default),  | 
            ||
| 846 | then fill with the median color of the image;  | 
            ||
| 847 | - else if `"none"`, then avoid masking polygons where possible  | 
            ||
| 848 | (i.e. when cropping) or revert to the default (i.e. when rotating)  | 
            ||
| 849 | - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.  | 
            ||
| 850 | |||
| 851 | Moreover, if `transparency` is true, and unless the image already  | 
            ||
| 852 | has an alpha channel, then add an alpha channel which is fully opaque  | 
            ||
| 853 | before cropping and rotating. (Thus, unexposed/masked areas will be  | 
            ||
| 854 | transparent afterwards for consumers that can interpret alpha channels).  | 
            ||
| 855 | |||
| 856 | When cropping, compensate any ``@orientation`` angle annotated for the  | 
            ||
| 857 | parent (from parent-level deskewing) by rotating the segment coordinates  | 
            ||
| 858 | in an inverse transformation (i.e. translation to center, then passive  | 
            ||
| 859 | rotation, and translation back).  | 
            ||
| 860 | |||
| 861 | Regardless, if any ``@orientation`` angle is annotated for the segment  | 
            ||
| 862 | (from segment-level deskewing), and the chosen image does not have  | 
            ||
| 863 | the feature `"deskewed"` yet, and unless `"deskewed"` is being filtered,  | 
            ||
| 864 | then rotate it - compensating for any previous `"angle"`. (However,  | 
            ||
| 865 | if ``@orientation`` is above the [-45°,45°] interval, then apply as much  | 
            ||
| 866 | transposition as possible first, unless `"rotated-90"` / `"rotated-180"` /  | 
            ||
| 867 | `"rotated-270"` is being filtered.)  | 
            ||
| 868 | |||
| 869 | Returns:  | 
            ||
| 870 | a tuple of  | 
            ||
| 871 | * the extracted `PIL.Image`,  | 
            ||
| 872 | * a `dict` with information about the extracted image:  | 
            ||
| 873 | |||
| 874 | - `"transform"`: a `Numpy` array with an affine transform which  | 
            ||
| 875 | converts from absolute coordinates to those relative to the image,  | 
            ||
| 876 | i.e. after applying all parent operations, and then cropping to  | 
            ||
| 877 | the segment's bounding box, and deskewing with the segment's  | 
            ||
| 878 | orientation angle (if any)  | 
            ||
| 879 | - `"angle"`: the rotation/reflection angle applied to the image so far,  | 
            ||
| 880 | - `"DPI"`: the pixel density of this image,  | 
            ||
| 881 | - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e.  | 
            ||
| 882 | names of all applied operations that lead up to this result.  | 
            ||
| 883 | |||
| 884 | (These can be used to create a new ``AlternativeImage``, or passed down  | 
            ||
| 885 | for :py:meth:`image_from_segment` calls on lower hierarchy levels.)  | 
            ||
| 886 | |||
| 887 | Examples:  | 
            ||
| 888 | |||
| 889 | * get a raw (colored) but already deskewed and cropped image::  | 
            ||
| 890 | |||
| 891 | image, xywh = workspace.image_from_segment(region,  | 
            ||
| 892 | page_image, page_xywh,  | 
            ||
| 893 | feature_selector='deskewed,cropped',  | 
            ||
| 894 | feature_filter='binarized,grayscale_normalized')  | 
            ||
| 895 | """  | 
            ||
| 896 |         log = getLogger('ocrd.workspace.image_from_segment') | 
            ||
| 897 | # note: We should mask overlapping neighbouring segments here,  | 
            ||
| 898 | # but finding the right clipping rules can be difficult if operating  | 
            ||
| 899 | # on the raw (non-binary) image data alone: for each intersection, it  | 
            ||
| 900 | # must be decided which one of either segment or neighbour to assign,  | 
            ||
| 901 | # e.g. an ImageRegion which properly contains our TextRegion should be  | 
            ||
| 902 | # completely ignored, but an ImageRegion which is properly contained  | 
            ||
| 903 | # in our TextRegion should be completely masked, while partial overlap  | 
            ||
| 904 | # may be more difficult to decide. On the other hand, on the binary image,  | 
            ||
| 905 | # we can use connected component analysis to mask foreground areas which  | 
            ||
| 906 | # originate in the neighbouring regions. But that would introduce either  | 
            ||
| 907 | # the assumption that the input has already been binarized, or a dependency  | 
            ||
| 908 | # on some ad-hoc binarization method. Thus, it is preferable to use  | 
            ||
| 909 | # a dedicated processor for this (which produces clipped AlternativeImage  | 
            ||
| 910 | # or reduced polygon coordinates).  | 
            ||
| 911 | segment_image, segment_coords, segment_xywh = _crop(  | 
            ||
| 912 | log, "parent image for segment '%s'" % segment.id,  | 
            ||
| 913 | segment, parent_image, parent_coords,  | 
            ||
| 914 | fill=fill, transparency=transparency)  | 
            ||
| 915 | |||
| 916 | # Semantics of missing @orientation at region level could be either  | 
            ||
| 917 | # - inherited from page level: same as line or word level (no @orientation),  | 
            ||
| 918 | # - zero (unrotate page angle): different from line or word level (because  | 
            ||
| 919 | # otherwise deskewing would never have an effect on lines and words)  | 
            ||
| 920 | # The PAGE specification is silent here (but does generally not concern itself  | 
            ||
| 921 | # much with AlternativeImage coordinate consistency).  | 
            ||
| 922 | # Since our (generateDS-backed) ocrd_page supports the zero/none distinction,  | 
            ||
| 923 | # we choose the former (i.e. None is inheritance).  | 
            ||
| 924 | if 'orientation' in segment.__dict__ and segment.get_orientation() is not None:  | 
            ||
| 925 | # region angle: PAGE @orientation is defined clockwise,  | 
            ||
| 926 | # whereas PIL/ndimage rotation is in mathematical direction:  | 
            ||
| 927 | angle = -segment.get_orientation()  | 
            ||
| 928 | # @orientation is always absolute; if higher levels  | 
            ||
| 929 | # have already rotated, then we must compensate:  | 
            ||
| 930 | angle -= parent_coords['angle']  | 
            ||
| 931 | # map angle from (-180,180] to [0,360], and partition into multiples of 90;  | 
            ||
| 932 | # but avoid unnecessary large remainders, i.e. split symmetrically:  | 
            ||
| 933 | orientation = (angle + 45) % 360  | 
            ||
| 934 | orientation = orientation - (orientation % 90)  | 
            ||
| 935 | skew = (angle % 360) - orientation  | 
            ||
| 936 | skew = 180 - (180 - skew) % 360 # map to [-45,45]  | 
            ||
| 937 |             log.debug("segment '%s' has orientation=%d skew=%.2f", | 
            ||
| 938 | segment.id, orientation, skew)  | 
            ||
| 939 | else:  | 
            ||
| 940 | orientation = 0  | 
            ||
| 941 | skew = 0  | 
            ||
| 942 | segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)  | 
            ||
| 943 | if 'DPI' in parent_coords:  | 
            ||
| 944 | segment_coords['DPI'] = parent_coords['DPI'] # not rescaled yet  | 
            ||
| 945 | |||
| 946 | # initialize AlternativeImage@comments classes from parent, except  | 
            ||
| 947 | # for those operations that can apply on multiple hierarchy levels:  | 
            ||
| 948 | segment_coords['features'] = ','.join(  | 
            ||
| 949 |             [feature for feature in parent_coords['features'].split(',') | 
            ||
| 950 | if feature in ['binarized', 'grayscale_normalized',  | 
            ||
| 951 | 'despeckled', 'dewarped']])  | 
            ||
| 952 | |||
| 953 | best_image = None  | 
            ||
| 954 | alternative_images = segment.get_AlternativeImage()  | 
            ||
| 955 | if alternative_images:  | 
            ||
| 956 | # (e.g. from segment-level cropping, binarization, deskewing or despeckling)  | 
            ||
| 957 | best_features = set()  | 
            ||
| 958 |             auto_features = {'cropped', 'deskewed', 'rotated-90', 'rotated-180', 'rotated-270'} | 
            ||
| 959 | # search to the end, because by convention we always append,  | 
            ||
| 960 | # and among multiple satisfactory images we want the most recent,  | 
            ||
| 961 | # but also ensure that we get the richest feature set, i.e. most  | 
            ||
| 962 | # of those features that we cannot reproduce automatically below  | 
            ||
| 963 | for alternative_image in alternative_images:  | 
            ||
| 964 | if filename and filename != alternative_image.filename:  | 
            ||
| 965 | continue  | 
            ||
| 966 | features = alternative_image.get_comments()  | 
            ||
| 967 | if not features:  | 
            ||
| 968 |                     log.warning("AlternativeImage %d for segment '%s' does not have any feature attributes", | 
            ||
| 969 | alternative_images.index(alternative_image) + 1, segment.id)  | 
            ||
| 970 | features = ''  | 
            ||
| 971 |                 featureset = set(features.split(',')) | 
            ||
| 972 | if (all(feature in featureset  | 
            ||
| 973 |                         for feature in feature_selector.split(',') if feature) and | 
            ||
| 974 | not any(feature in featureset  | 
            ||
| 975 |                             for feature in feature_filter.split(',') if feature) and | 
            ||
| 976 | len(featureset.difference(auto_features)) >= \  | 
            ||
| 977 | len(best_features.difference(auto_features))):  | 
            ||
| 978 | best_features = featureset  | 
            ||
| 979 | best_image = alternative_image  | 
            ||
| 980 | if best_image:  | 
            ||
| 981 |                 log.debug("Using AlternativeImage %d %s for segment '%s'", | 
            ||
| 982 | alternative_images.index(best_image) + 1,  | 
            ||
| 983 | best_features, segment.id)  | 
            ||
| 984 | segment_image = self._resolve_image_as_pil(alternative_image.get_filename())  | 
            ||
| 985 | segment_coords['features'] = best_image.get_comments() # including duplicates  | 
            ||
| 986 | |||
| 987 |         alternative_image_features = segment_coords['features'].split(',') | 
            ||
| 988 | for duplicate_feature in set([feature for feature in alternative_image_features  | 
            ||
| 989 | # features relevant in reconstructing coordinates:  | 
            ||
| 990 | if (feature in ['deskewed', 'rotated-90',  | 
            ||
| 991 | 'rotated-180', 'rotated-270'] and  | 
            ||
| 992 | alternative_image_features.count(feature) > 1)]):  | 
            ||
| 993 |             log.error("Duplicate feature %s in AlternativeImage for segment '%s'", | 
            ||
| 994 | duplicate_feature, segment.id)  | 
            ||
| 995 | for i, feature in enumerate(alternative_image_features +  | 
            ||
| 996 | (['rotated-%d' % orientation]  | 
            ||
| 997 | if (orientation and  | 
            ||
| 998 | not 'rotated-%d' % orientation in alternative_image_features and  | 
            ||
| 999 |                                          not 'rotated-%d' % orientation in feature_filter.split(',')) | 
            ||
| 1000 | else []) +  | 
            ||
| 1001 | (['deskewed']  | 
            ||
| 1002 | if (skew and  | 
            ||
| 1003 | not 'deskewed' in alternative_image_features and  | 
            ||
| 1004 |                                          not 'deskewed' in feature_filter.split(',')) | 
            ||
| 1005 | else []) +  | 
            ||
| 1006 | # not a feature to be added, but merely as a fallback position  | 
            ||
| 1007 | # to always enter loop at i == len(alternative_image_features)  | 
            ||
| 1008 | ['_check']):  | 
            ||
| 1009 | # image geometry vs feature consistency can only be checked  | 
            ||
| 1010 | # after all features on the existing AlternativeImage have  | 
            ||
| 1011 | # been adjusted for in the transform, and when there is a mismatch,  | 
            ||
| 1012 | # additional steps applied here would only repeat the respective  | 
            ||
| 1013 | # error message; so we only check once at the boundary between  | 
            ||
| 1014 | # existing and new features  | 
            ||
| 1015 | # FIXME we should enforce consistency here (i.e. split into transposition  | 
            ||
| 1016 | # and minimal rotation, rotation always reshapes, rescaling never happens)  | 
            ||
| 1017 | # FIXME: inconsistency currently unavoidable with line-level dewarping (which increases height)  | 
            ||
| 1018 | if (i == len(alternative_image_features) and  | 
            ||
| 1019 | not (segment_xywh['w'] - 2 < segment_image.width < segment_xywh['w'] + 2 and  | 
            ||
| 1020 | segment_xywh['h'] - 2 < segment_image.height < segment_xywh['h'] + 2)):  | 
            ||
| 1021 |                 log.error('segment "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)', | 
            ||
| 1022 | segment.id, segment_coords['features'],  | 
            ||
| 1023 | segment_image.width, segment_image.height,  | 
            ||
| 1024 | segment_xywh['w'], segment_xywh['h'])  | 
            ||
| 1025 |             name = "%s for segment '%s'" % ("AlternativeImage" if best_image | 
            ||
| 1026 | else "parent image", segment.id)  | 
            ||
| 1027 | # adjust transform to feature, and ensure feature is applied to image  | 
            ||
| 1028 | if feature == 'rotated-%d' % orientation:  | 
            ||
| 1029 | segment_image, segment_coords, segment_xywh = _reflect(  | 
            ||
| 1030 | log, name, orientation, segment_image, segment_coords, segment_xywh)  | 
            ||
| 1031 | elif feature == 'deskewed':  | 
            ||
| 1032 | segment_image, segment_coords, segment_xywh = _rotate(  | 
            ||
| 1033 | log, name, skew, segment, segment_image, segment_coords, segment_xywh,  | 
            ||
| 1034 | fill=fill, transparency=transparency)  | 
            ||
| 1035 | |||
| 1036 | # verify constraints again:  | 
            ||
| 1037 | if filename and not getattr(segment_image, 'filename', '').endswith(filename):  | 
            ||
| 1038 |             raise Exception('Found no AlternativeImage that satisfies all requirements ' + | 
            ||
| 1039 | 'filename="%s" in segment "%s"' % (  | 
            ||
| 1040 | filename, segment.id))  | 
            ||
| 1041 | if not all(feature in segment_coords['features']  | 
            ||
| 1042 |                    for feature in feature_selector.split(',') if feature): | 
            ||
| 1043 |             raise Exception('Found no AlternativeImage that satisfies all requirements' + | 
            ||
| 1044 | 'selector="%s" in segment "%s"' % (  | 
            ||
| 1045 | feature_selector, segment.id))  | 
            ||
| 1046 | if any(feature in segment_coords['features']  | 
            ||
| 1047 |                for feature in feature_filter.split(',') if feature): | 
            ||
| 1048 |             raise Exception('Found no AlternativeImage that satisfies all requirements ' + | 
            ||
| 1049 | 'filter="%s" in segment "%s"' % (  | 
            ||
| 1050 | feature_filter, segment.id))  | 
            ||
| 1051 | # ensure DPI will be set in image meta-data again  | 
            ||
| 1052 | if 'DPI' in segment_coords:  | 
            ||
| 1053 | dpi = segment_coords['DPI']  | 
            ||
| 1054 | if 'dpi' not in segment_image.info:  | 
            ||
| 1055 | segment_image.info['dpi'] = (dpi, dpi)  | 
            ||
| 1056 | return segment_image, segment_coords  | 
            ||
| 1057 | |||
| 1058 | # pylint: disable=redefined-builtin  | 
            ||
def save_image_file(self, image: Image.Image,
                    file_id: str,
                    file_grp: str,
                    file_path: Optional[str] = None,
                    page_id: Optional[str] = None,
                    mimetype: str = 'image/png',
                    force: bool = False) -> str:
    """Serialize an image and register it as a new ``file`` in the METS.

    Args:
        image (PIL.Image): derived image to save
        file_id (string): `@ID` of the METS `file` to use
        file_grp (string): `@USE` of the METS `fileGrp` to use
    Keyword Args:
        file_path (string): `@href` of the METS `file/FLocat` to use.
        page_id (string): `@ID` in the METS physical `structMap` to use
        mimetype (string): MIME type of the image format to serialize as;
            looked up in ``MIME_TO_PIL`` and (if ``file_path`` is not given)
            in ``MIME_TO_EXT``
        force (boolean): whether to replace any existing `file` with that `@ID`

    The image is encoded into an in-memory buffer and handed over to
    :py:meth:`add_file` as ``content``. Unless ``file_path`` is passed
    directly, the path is built from ``file_grp`` as directory and
    ``file_id`` plus the extension belonging to ``mimetype`` as file name.

    Returns:
        The path the file was registered under, i.e. ``file_path`` exactly
        as given or as derived above (this method does not convert it into
        an absolute path).
    """
    log = getLogger('ocrd.workspace.save_image_file')
    # keep the resolution meta-data of the image, if it carries any
    save_options = {'dpi': image.info['dpi']} if 'dpi' in image.info else {}
    buffer = io.BytesIO()
    image.save(buffer, format=MIME_TO_PIL[mimetype], **save_options)
    if file_path is None:
        extension = MIME_TO_EXT[mimetype]
        file_path = str(Path(file_grp, '%s%s' % (file_id, extension)))
    mets_file = self.add_file(
        file_grp,
        file_id=file_id,
        page_id=page_id,
        local_filename=file_path,
        mimetype=mimetype,
        content=buffer.getvalue(),
        force=force)
    log.info('created file ID: %s, file_grp: %s, path: %s',
             file_id, file_grp, mets_file.local_filename)
    return file_path
            ||
| 1104 | |||
def find_files(self, *args, **kwargs):
    """
    Search ``mets:file`` entries in wrapped METS document and yield results.

    Delegator to :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`

    For convenience, the keyword aliases ``page_id``, ``file_id`` and
    ``file_grp`` are accepted and renamed to ``pageId``, ``ID`` and
    ``fileGrp``, respectively, before delegating. If both an alias and
    its target are passed, the value of the alias wins.

    Keyword Args:
        **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`
    Returns:
        Generator which yields :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations
    """
    log = getLogger('ocrd.workspace.find_files')
    # pass kwargs as logging argument (like the other log calls in this
    # module), so the message only gets formatted if it is actually emitted
    log.debug('find files in mets. kwargs=%s', kwargs)
    # translate aliases in a fixed order (page, file ID, file group)
    for alias, mets_name in (('page_id', 'pageId'),
                             ('file_id', 'ID'),
                             ('file_grp', 'fileGrp')):
        if alias in kwargs:
            kwargs[mets_name] = kwargs.pop(alias)
    # NOTE(review): the result is documented as a generator; if the delegate
    # evaluates lazily, the directory change below is already undone by the
    # time the caller iterates - confirm against OcrdMets.find_files
    with pushd_popd(self.directory):
        return self.mets.find_files(*args, **kwargs)
            ||
| 1126 | |||
def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs):
    """Derive image, coordinate description and size of a segment from its parent.

    Args:
        log: logger used for the debug messages
        name (string): description of the image, only used in log messages
        segment: segment object whose outline gets cropped; instances of
            ``BorderType`` are treated specially (see below)
        parent_image: image to crop from
        parent_coords (dict): coordinate description belonging to
            ``parent_image``; its ``features`` and ``transform`` entries
            are read here, the dict itself is not modified
    Keyword Args:
        op (string): feature name to look for in the parent features
            (``cropped`` by default, ``recropped`` when re-cropping)
        **kwargs: passed through to ``image_from_polygon``

    The image is masked by the segment polygon and cropped to its bounding
    box, unless ``segment`` is a ``BorderType`` and ``op`` is already
    contained in the parent features - then the parent image is reused.
    The transform is shifted by the bounding box offset in either case.

    Returns:
        A tuple of the segment image, a (shallow) copy of ``parent_coords``
        with adjusted ``transform`` (and possibly ``features``), and the
        bounding box of the segment as returned by ``xywh_from_bbox``.
    """
    # shallow copy: entries get re-assigned below, the parent dict stays as is
    segment_coords = parent_coords.copy()
    # get polygon outline of segment relative to parent image:
    segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords)
    # get relative bounding box:
    segment_bbox = bbox_from_polygon(segment_polygon)
    # get size of the segment in the parent image after cropping
    # (i.e. possibly different from size before rotation at the parent, but
    #  also possibly different from size after rotation below/AlternativeImage):
    segment_xywh = xywh_from_bbox(*segment_bbox)
    # crop, if (still) necessary:
    # (the feature check is a substring test on the comma-separated string)
    if (not isinstance(segment, BorderType) or # always crop below page level
        not op in parent_coords['features']):
        # the feature only gets recorded for a BorderType segment, and never
        # when re-cropping (the first branch wins for op == 'recropped')
        if op == 'recropped':
            log.debug("Recropping %s", name)
        elif isinstance(segment, BorderType):
            log.debug("Cropping %s", name)
            segment_coords['features'] += ',' + op
        # create a mask from the segment polygon:
        segment_image = image_from_polygon(parent_image, segment_polygon, **kwargs)
        # crop to bbox:
        segment_image = crop_image(segment_image, box=segment_bbox)
    else:
        # BorderType segment whose parent image already has this feature
        segment_image = parent_image
    # subtract offset from parent in affine coordinate transform:
    # (consistent with image cropping)
    segment_coords['transform'] = shift_coordinates(
        parent_coords['transform'],
        np.array([-segment_bbox[0],
                  -segment_bbox[1]]))
    return segment_image, segment_coords, segment_xywh
            ||
| 1158 | |||
def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh):
    """Account for a transposition by a multiple of 90° in coordinates and image.

    Args:
        log: logger used for the debug message
        name (string): description of the image, only used in the log message
        orientation (number): angle to transpose by; 90, 180 and 270 map
            to the respective ``Image.Transpose`` method, any other value
            yields ``None`` as method
        segment_image: image to transpose, if that has not happened yet
        segment_coords (dict): coordinate description; its ``transform``,
            ``angle`` and ``features`` entries are updated in place
        segment_xywh (dict): size description; its ``w`` and ``h`` entries
            are updated in place

    The transform, canvas size and angle are always adjusted. The image
    itself is only transposed (and the feature ``rotated-<orientation>``
    appended) if that feature is not contained in the features yet.

    Returns:
        A tuple of image, coordinate description and size description.
    """
    # Transpose in affine coordinate transform:
    # (consistent with image transposition or AlternativeImage below)
    methods = {
        90: Image.Transpose.ROTATE_90,
        180: Image.Transpose.ROTATE_180,
        270: Image.Transpose.ROTATE_270,
    }
    method = methods.get(orientation)  # deliberately no default
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = transpose_coordinates(
        segment_coords['transform'], method, center)
    old_size = [segment_xywh['w'], segment_xywh['h']]
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_transposition(
        old_size, method)
    segment_coords['angle'] += orientation
    feature = 'rotated-%d' % orientation
    if feature in segment_coords['features']:
        # image has been transposed before, only the transform needed adjusting
        return segment_image, segment_coords, segment_xywh
    # transpose, because it is still necessary:
    log.debug("Transposing %s by %d°", name, orientation)
    segment_image = transpose_image(segment_image, method)
    segment_coords['features'] += ',' + feature
    return segment_image, segment_coords, segment_xywh
            ||
| 1180 | |||
def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs):
    """Account for a rotation by ``skew`` in coordinates and image.

    Args:
        log: logger used for the debug message
        name (string): description of the image, only used in log messages
        skew (number): angle to rotate by
        segment: segment object the image belongs to; if falsy, nothing
            gets re-cropped
        segment_image: image to rotate, if that has not happened yet
        segment_coords (dict): coordinate description; its ``transform``,
            ``angle`` and ``features`` entries are updated in place
        segment_xywh (dict): size description; its ``w`` and ``h`` entries
            are updated in place
    Keyword Args:
        **kwargs: passed through to ``rotate_image`` and ``_crop``

    The transform, canvas size and angle are always adjusted. The image
    itself is only rotated (and the feature ``deskewed`` appended) if
    that feature is not contained in the features yet.
    Afterwards, for any segment which is not a ``BorderType``, or whose
    features contain ``cropped``, the result is re-cropped to the new
    bounding box: on the image if it was rotated here, otherwise merely
    in the transform and size.

    Returns:
        A tuple of image, coordinate description and size description.
    """
    # Rotate around center in affine coordinate transform:
    # (consistent with image rotation or AlternativeImage below)
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = rotate_coordinates(
        segment_coords['transform'], skew, center)
    old_size = [segment_xywh['w'], segment_xywh['h']]
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_rotation(
        old_size, skew)
    segment_coords['angle'] += skew
    was_deskewed = 'deskewed' in segment_coords['features']
    if not was_deskewed:
        # deskew, because it is still necessary:
        log.debug("Rotating %s by %.2f°", name, skew)
        segment_image = rotate_image(segment_image, skew, **kwargs)
        segment_coords['features'] += ',deskewed'
    # always crop below page level, but on page level only if cropped already:
    if not segment:
        return segment_image, segment_coords, segment_xywh
    if isinstance(segment, BorderType) and 'cropped' not in segment_coords['features']:
        return segment_image, segment_coords, segment_xywh
    if was_deskewed:
        # only shift coordinates as if re-cropping
        outline = coordinates_of_segment(segment, segment_image, segment_coords)
        bbox = bbox_from_polygon(outline)
        segment_xywh = xywh_from_bbox(*bbox)
        offset = np.array([-bbox[0],
                           -bbox[1]])
        segment_coords['transform'] = shift_coordinates(
            segment_coords['transform'], offset)
    else:
        # re-crop to new bbox (which may deviate
        # if segment polygon was not a rectangle)
        segment_image, segment_coords, segment_xywh = _crop(
            log, name, segment, segment_image, segment_coords,
            op='recropped', **kwargs)
    return segment_image, segment_coords, segment_xywh
            ||
| 1216 | |||
def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwargs):
    """Account for a linear resizing by ``factor`` in coordinates and image.

    Args:
        log: logger used for the debug message
        name (string): description of the image, only used in the log message
        factor (number): zoom factor, applied to both dimensions alike
        segment_image: image to resize, if that has not happened yet
        segment_coords (dict): coordinate description; its ``transform``,
            ``scale`` and ``features`` entries are updated in place
            (a missing ``scale`` counts as 1.0)
        segment_xywh (dict): size description; its ``w`` and ``h`` entries
            are multiplied in place
    Keyword Args:
        **kwargs: accepted, but not used here

    The transform, scale and size are always adjusted. The image itself
    is only resized (and the feature ``scaled`` appended) if that feature
    is not contained in the features yet.

    Returns:
        A tuple of image, coordinate description and size description.
    """
    # Resize linearly
    segment_coords['transform'] = scale_coordinates(
        segment_coords['transform'], [factor, factor])
    segment_coords['scale'] = segment_coords.get('scale', 1.0) * factor
    for dimension in ('w', 'h'):
        segment_xywh[dimension] *= factor
    if 'scaled' in segment_coords['features']:
        # image has been resized before, only the transform needed adjusting
        return segment_image, segment_coords, segment_xywh
    # resize, because it is still necessary
    log.debug("Scaling %s by %.2f", name, factor)
    segment_coords['features'] += ',scaled'
    # FIXME: validate factor against PAGE-XML attributes
    # FIXME: factor should become less precise due to rounding
    # (the target size gets truncated to whole pixels)
    new_size = (int(segment_image.width * factor),
                int(segment_image.height * factor))
    segment_image = segment_image.resize(new_size, Image.Resampling.BICUBIC)
    return segment_image, segment_coords, segment_xywh
            ||
| 1235 |