| Total Complexity | 194 |
| Total Lines | 1192 |
| Duplicated Lines | 5.2 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like ocrd.workspace often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefix or suffix.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | import io |
||
| 2 | from os import makedirs, unlink, listdir, path |
||
| 3 | from pathlib import Path |
||
| 4 | from shutil import move, copyfileobj |
||
| 5 | from re import sub |
||
| 6 | from tempfile import NamedTemporaryFile |
||
| 7 | from contextlib import contextmanager |
||
| 8 | |||
| 9 | from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor |
||
| 10 | from PIL import Image |
||
| 11 | import numpy as np |
||
| 12 | from deprecated.sphinx import deprecated |
||
| 13 | import requests |
||
| 14 | |||
| 15 | from ocrd_models import OcrdMets, OcrdFile |
||
| 16 | from ocrd_models.ocrd_page import parse, BorderType, to_xml |
||
| 17 | from ocrd_modelfactory import exif_from_filename, page_from_file |
||
| 18 | from ocrd_utils import ( |
||
| 19 | atomic_write, |
||
| 20 | getLogger, |
||
| 21 | image_from_polygon, |
||
| 22 | coordinates_of_segment, |
||
| 23 | adjust_canvas_to_rotation, |
||
| 24 | adjust_canvas_to_transposition, |
||
| 25 | shift_coordinates, |
||
| 26 | rotate_coordinates, |
||
| 27 | transform_coordinates, |
||
| 28 | transpose_coordinates, |
||
| 29 | crop_image, |
||
| 30 | rotate_image, |
||
| 31 | transpose_image, |
||
| 32 | bbox_from_polygon, |
||
| 33 | polygon_from_points, |
||
| 34 | xywh_from_bbox, |
||
| 35 | pushd_popd, |
||
| 36 | is_local_filename, |
||
| 37 | deprecated_alias, |
||
| 38 | MIME_TO_EXT, |
||
| 39 | MIME_TO_PIL, |
||
| 40 | MIMETYPE_PAGE, |
||
| 41 | REGEX_PREFIX |
||
| 42 | ) |
||
| 43 | |||
| 44 | from .workspace_backup import WorkspaceBackupManager |
||
| 45 | |||
| 46 | __all__ = ['Workspace'] |
||
| 47 | |||
@contextmanager
def download_temporary_file(url):
    """
    Download ``url`` into a named temporary file and yield the open file object.

    The file is created with the prefix ``ocrd-download-`` and only exists
    for the duration of the caller's ``with`` block.

    Args:
        url (string): URL to fetch with ``requests.get``

    Yields:
        the open temporary file; its content is flushed, so it can be
        re-opened via its ``name`` attribute
    """
    with NamedTemporaryFile(prefix='ocrd-download-') as f:
        with requests.get(url) as r:
            f.write(r.content)
        # Consumers re-open the file by name, so the buffered bytes
        # must actually be on disk before the file is handed out.
        f.flush()
        yield f
||
| 54 | |||
| 55 | |||
| 56 | class Workspace(): |
||
| 57 | """ |
||
| 58 | A workspace is a temporary directory set up for a processor. It's the |
||
| 59 | interface to the METS/PAGE XML and delegates download and upload to the |
||
| 60 | :py:class:`ocrd.resolver.Resolver`. |
||
| 61 | |||
| 62 | Args: |
||
| 63 | |||
| 64 | directory (string) : Filesystem folder to work in |
||
| 65 | mets (:py:class:`ocrd_models.ocrd_mets.OcrdMets`) : `OcrdMets` representing this workspace. |
||
| 66 | Loaded from `'mets.xml'` if `None`. |
||
| 67 | mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url. |
||
| 68 | overwrite_mode (boolean) : Whether to force add operations on this workspace globally |
||
| 69 | baseurl (string) : Base URL to prefix to relative URL. |
||
| 70 | """ |
||
| 71 | |||
def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, baseurl=None):
    """
    Set up the workspace state.

    Args:
        resolver: stored as ``self.resolver`` and used by the download methods
        directory (string): filesystem folder to work in
        mets: METS object to use; if `None`, it is loaded from
            ``directory/mets_basename``
        mets_basename (string): basename of the METS file inside ``directory``
        automatic_backup (boolean): if true, create a
            `WorkspaceBackupManager` and take a first backup right away
        baseurl (string): base URL stored as ``self.baseurl``
    """
    self.resolver = resolver
    self.directory = directory
    self.mets_target = str(Path(directory, mets_basename))
    self.overwrite_mode = False
    # Only parse the METS from disk when the caller did not hand one in
    self.mets = OcrdMets(filename=self.mets_target) if mets is None else mets
    if not automatic_backup:
        self.automatic_backup = None
    else:
        self.automatic_backup = WorkspaceBackupManager(self)
        self.automatic_backup.add()
    self.baseurl = baseurl
||
| 87 | |||
def __str__(self):
    """Render directory, base URL, file groups and all files of the METS as one line."""
    template = 'Workspace[directory=%s, baseurl=%s, file_groups=%s, files=%s]'
    fields = (
        self.directory,
        self.baseurl,
        self.mets.file_groups,
        list(map(str, self.mets.find_all_files())),
    )
    return template % fields
||
| 95 | |||
def reload_mets(self):
    """
    Re-read the METS file at ``self.mets_target`` and replace ``self.mets`` with it.
    """
    fresh_mets = OcrdMets(filename=self.mets_target)
    self.mets = fresh_mets
||
| 101 | |||
@deprecated_alias(pageId="page_id")
@deprecated_alias(ID="file_id")
@deprecated_alias(fileGrp="file_grp")
@deprecated_alias(fileGrp_mapping="filegrp_mapping")
def merge(self, other_workspace, copy_files=True, overwrite=False, **kwargs):
    """
    Merge ``other_workspace`` into this one

    See :py:meth:`ocrd_models.ocrd_mets.OcrdMets.merge` for the `kwargs`

    Keyword Args:
        copy_files (boolean): Whether to copy files from `other_workspace` to this one.
            If false, the merged files' ``local_filename`` is instead prefixed with the
            path of ``other_workspace`` relative to this workspace.
        overwrite (boolean): Whether copying may replace files that already exist
            in this workspace

    Raises:
        FileExistsError: if a file to copy already exists here and ``overwrite`` is false
        ValueError: if ``copy_files`` is false and ``other_workspace`` is not
            located below this workspace's directory
    """
    def after_add_cb(f):
        """callback to run on merged OcrdFile instances in the destination"""
        if not f.local_filename:
            # OcrdFile has no local_filename, so nothing to be copied
            return
        if not copy_files:
            fpath_src = Path(other_workspace.directory).resolve()
            fpath_dst = Path(self.directory).resolve()
            dstprefix = fpath_src.relative_to(fpath_dst)  # raises ValueError if not a subpath
            f.local_filename = dstprefix / f.local_filename
            return
        fpath_src = Path(other_workspace.directory, f.local_filename)
        fpath_dest = Path(self.directory, f.local_filename)
        if fpath_src.exists():
            if fpath_dest.exists() and not overwrite:
                raise FileExistsError("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest))
            # exist_ok avoids failing when the directory appears between check and creation
            fpath_dest.parent.mkdir(parents=True, exist_ok=True)
            with open(str(fpath_src), 'rb') as fstream_in, open(str(fpath_dest), 'wb') as fstream_out:
                copyfileobj(fstream_in, fstream_out)

    # OcrdMets.merge is called with the legacy camelCase keyword names
    legacy_names = (
        ('page_id', 'pageId'),
        ('file_id', 'ID'),
        ('file_grp', 'fileGrp'),
        ('filegrp_mapping', 'fileGrp_mapping'),
    )
    for new_name, legacy_name in legacy_names:
        if new_name in kwargs:
            kwargs[legacy_name] = kwargs.pop(new_name)

    self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs)
||
| 146 | |||
| 147 | |||
@deprecated(version='1.0.0', reason="Use workspace.download_file")
def download_url(self, url, **kwargs):
    """
    Download a URL to the workspace.

    The URL is wrapped in a file entry of a throw-away empty METS (file group
    ``DEPRECATED``, ID taken from the last path segment of ``url``) and then
    passed to :py:meth:`download_file`.

    Args:
        url (string): URL to download to directory
        **kwargs : accepted, but not used by this implementation

    Returns:
        The local filename of the downloaded file
    """
    scratch_mets = OcrdMets.empty_mets()
    file_id = Path(url).name
    ocrd_file = scratch_mets.add_file('DEPRECATED', ID=file_id, url=url)
    return self.download_file(ocrd_file).local_filename
||
| 164 | |||
def download_file(self, f, _recursion_count=0):
    """
    Download a :py:class:`ocrd_models.ocrd_file.OcrdFile` to the workspace.

    Resolution order:

    1. ``f.local_filename`` exists inside the workspace directory: nothing to do.
    2. ``f.local_filename`` exists elsewhere: copy it into the workspace.
    3. ``f.local_filename`` does not exist: fall back to ``f.url``, or to
       ``baseurl + '/' + local_filename`` if the workspace has a ``baseurl``.
    4. ``f.url`` is set: download it into the workspace.

    Args:
        f: the file to make locally available; its ``local_filename``
            (and possibly ``url``) is updated in place
        _recursion_count: unused, kept for backwards compatibility

    Returns:
        the same ``f``

    Raises:
        FileNotFoundError: if ``local_filename`` is set but missing and there
            is neither a ``url`` nor a ``baseurl`` to fall back to
        ValueError: if ``f`` has neither ``url`` nor ``local_filename``
    """
    log = getLogger('ocrd.workspace.download_file')
    with pushd_popd(self.directory):
        if f.local_filename:
            file_path = Path(f.local_filename).absolute()
            if file_path.exists():
                try:
                    file_path.relative_to(Path(self.directory).resolve())  # raises ValueError if not relative
                except ValueError:
                    # f.local_filename exists, but not within self.directory, copy it
                    log.info("Copying 'local_filename' %s to workspace directory %s", f.local_filename, self.directory)
                    f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp)
                else:
                    # f.local_filename exists and is within self.directory, nothing to do
                    log.info("'local_filename' %s already within %s, nothing to do", f.local_filename, self.directory)
                return f
            if f.url:
                log.info("OcrdFile has 'local_filename' but it doesn't resolve, try to download from set 'url' %s", f.url)
            elif self.baseurl:
                log.info("OcrdFile has 'local_filename' but it doesn't resolve and no 'url', concatenate 'baseurl' %s and 'local_filename' %s",
                         self.baseurl, f.local_filename)
                f.url = '%s/%s' % (self.baseurl, f.local_filename)
            else:
                raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file, "
                                        "and no 'url' to download and no 'baseurl' set on workspace, nothing we can do.")
        if not f.url:
            # Neither f.local_filename nor f.url is set, fail
            raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded")
        # f.url is set, download the file to the workspace;
        # name it after the file ID (plus extension for the MIME type) if there is an ID
        basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename
        f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
        return f
||
| 201 | |||
def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False):
    """
    Remove a METS `file` from the workspace.

    Arguments:
        file_id (string|:py:class:`ocrd_models.ocrd_file.OcrdFile`): `@ID` of the METS `file`
            to delete or the file itself
    Keyword Args:
        force (boolean): Continue removing even if file not found in METS
            (always true when the workspace is in ``overwrite_mode``)
        keep_file (boolean): Whether to keep files on disk
        page_recursive (boolean): Whether to remove all images referenced in the file
            if the file is a PAGE-XML document.
        page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
            Has no effect unless ``page_recursive`` is `True`.

    Returns:
        the removed file, or `None` if nothing was removed (regex ID without
        a match, or a `FileNotFoundError` suppressed by ``force``)
    """
    log = getLogger('ocrd.workspace.remove_file')
    log.debug('Deleting mets:file %s', file_id)
    if self.overwrite_mode:
        force = True
    if isinstance(file_id, OcrdFile):
        file_id = file_id.ID
    try:
        try:
            target = next(self.mets.find_files(ID=file_id))
        except StopIteration:
            if file_id.startswith(REGEX_PREFIX):
                # allow empty results if filter criteria involve a regex
                return None
            raise FileNotFoundError("File %s not found in METS" % file_id)
        if page_recursive and target.mimetype == MIMETYPE_PAGE:
            with pushd_popd(self.directory):
                page_doc = parse(self.download_file(target).local_filename, silence=True)
                for image_path in page_doc.get_AllAlternativeImagePaths():
                    search = {'local_filename': image_path}
                    if page_same_group:
                        search['fileGrp'] = target.fileGrp
                    for image_file in self.mets.find_files(**search):
                        self.remove_file(image_file, keep_file=keep_file, force=force)
        if not keep_file:
            with pushd_popd(self.directory):
                if target.local_filename:
                    log.info("rm %s [cwd=%s]", target.local_filename, self.directory)
                    unlink(target.local_filename)
                elif force:
                    log.debug("File not locally available but --force is set: %s", target)
                else:
                    raise Exception("File not locally available %s" % target)
        # Remove from METS only after the recursion of AlternativeImages
        self.mets.remove_file(file_id)
        return target
    except FileNotFoundError:
        # with force, a missing file is silently tolerated
        if not force:
            raise
||
| 256 | |||
def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False):
    """
    Remove a METS `fileGrp`.

    Arguments:
        USE (string): `@USE` of the METS `fileGrp` to delete
    Keyword Args:
        recursive (boolean): Whether to recursively delete all files in the group
        force (boolean): Continue removing even if group or containing files not found in METS
            (always true when the workspace is in ``overwrite_mode``)
        keep_files (boolean): When deleting recursively whether to keep files on disk
        page_recursive (boolean): Whether to remove all images referenced in the file
            if the file is a PAGE-XML document.
        page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
            Has no effect unless ``page_recursive`` is `True`.
    """
    if self.overwrite_mode and not force:
        force = True

    # Regex selectors and forced removal are exempt from the existence check
    if not (USE.startswith(REGEX_PREFIX) or USE in self.mets.file_groups or force):
        raise Exception("No such fileGrp: %s" % USE)

    file_dirs = []
    if recursive:
        for member in self.mets.find_files(fileGrp=USE):
            self.remove_file(member, force=force, keep_file=keep_files,
                             page_recursive=page_recursive, page_same_group=page_same_group)
            if not member.local_filename:
                continue
            parent_dir = path.dirname(member.local_filename)
            if parent_dir:
                file_dirs.append(parent_dir)

    self.mets.remove_file_group(USE, force=force, recursive=recursive)

    # PLEASE NOTE: this only removes directories in the workspace if they are empty
    # and named after the fileGrp which is a convention in OCR-D.
    with pushd_popd(self.directory):
        group_dir = Path(USE)
        if group_dir.is_dir() and not listdir(USE):
            group_dir.rmdir()
        for candidate in set(file_dirs):
            candidate_dir = Path(candidate)
            if candidate_dir.is_dir() and not listdir(candidate):
                candidate_dir.rmdir()
||
| 298 | |||
| 299 | |||
def rename_file_group(self, old, new):
    """
    Rename a METS `fileGrp`.

    Moves the local files of the group on disk, rewrites their
    ``local_filename`` and (where no ID clash arises) their ID, updates
    references to the moved files in all local PAGE-XML files, renames the
    group in the METS and finally removes the old directory if it is empty.

    Arguments:
        old (string): `@USE` of the METS `fileGrp` to rename
        new (string): `@USE` of the METS `fileGrp` to rename as

    Raises:
        ValueError: if ``old`` does not exist or ``new`` already exists
    """
    # local import: only `sub` is imported from `re` at file level
    from re import escape

    log = getLogger('ocrd.workspace.rename_file_group')

    if old not in self.mets.file_groups:
        raise ValueError(f"No such fileGrp: {old}")
    if new in self.mets.file_groups:
        raise ValueError(f"fileGrp already exists {new}")

    # Group names are literal strings, so they must neither be interpreted
    # as a regex pattern (old) nor as a replacement template (new).
    old_pattern = escape(old)

    with pushd_popd(self.directory):
        # create workspace dir ``new``
        log.info("mkdir %s", new)
        if not Path(new).is_dir():
            Path(new).mkdir()
        local_filename_replacements = {}
        log.info("Moving files")
        for mets_file in self.mets.find_files(fileGrp=old, local_only=True):
            old_local_filename = str(mets_file.local_filename)
            # Directory part
            new_local_filename = sub(r'^%s/' % old_pattern, lambda _m: '%s/' % new, old_local_filename)
            # File part
            new_local_filename = sub(r'/%s' % old_pattern, lambda _m: '/%s' % new, new_local_filename)
            local_filename_replacements[old_local_filename] = new_local_filename
            # move file from ``old`` to ``new`` (creating nested target directories if needed)
            Path(new_local_filename).parent.mkdir(parents=True, exist_ok=True)
            Path(old_local_filename).rename(new_local_filename)
            # change the local filename of ``mets:file``
            mets_file.local_filename = new_local_filename
            # change the file ID and update structMap, unless the new ID is already taken
            new_id = sub(r'^%s' % old_pattern, lambda _m: new, mets_file.ID)
            try:
                next(self.mets.find_files(ID=new_id))
                log.warning("ID %s already exists, not changing ID while renaming %s -> %s",
                            new_id, old_local_filename, new_local_filename)
            except StopIteration:
                mets_file.ID = new_id
        # change file paths in PAGE-XML imageFilename and filename attributes
        for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE, local_only=True):
            log.info("Renaming file references in PAGE-XML %s", page_file)
            pcgts = page_from_file(page_file)
            page = pcgts.get_Page()
            changed = False
            replacement = local_filename_replacements.get(page.imageFilename)
            if replacement is not None:
                changed = True
                log.info("Rename pc:Page/@imageFilename: %s -> %s", page.imageFilename, replacement)
                page.imageFilename = replacement
            for ai in page.get_AllAlternativeImages():
                replacement = local_filename_replacements.get(ai.filename)
                if replacement is not None:
                    changed = True
                    log.info("Rename pc:Page/../AlternativeImage: %s -> %s", ai.filename, replacement)
                    ai.filename = replacement
            if changed:
                log.info("PAGE-XML changed, writing %s", page_file.local_filename)
                with open(page_file.local_filename, 'w', encoding='utf-8') as f:
                    f.write(to_xml(pcgts))
        # change the ``USE`` attribute of the fileGrp
        self.mets.rename_file_group(old, new)
        # Remove the old dir
        log.info("rmdir %s", old)
        if Path(old).is_dir() and not listdir(old):
            Path(old).rmdir()
||
| 367 | |||
@deprecated_alias(pageId="page_id")
@deprecated_alias(ID="file_id")
def add_file(self, file_grp, content=None, **kwargs):
    """
    Add a file to the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace.

    Arguments:
        file_grp (string): `@USE` of the METS `fileGrp` to add to
    Keyword Args:
        content (string|bytes): optional content to write to the file
            in the filesystem (strings are encoded as UTF-8)
        **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.add_file`;
            ``page_id`` is mandatory (but may be `None`)
    Returns:
        a new :py:class:`ocrd_models.ocrd_file.OcrdFile`
    Raises:
        ValueError: if no ``page_id`` keyword was passed
        Exception: if ``content`` is given without a ``local_filename``
    """
    log = getLogger('ocrd.workspace.add_file')
    local_filename = kwargs.get('local_filename')
    has_content = content is not None
    log.debug(
        'outputfile file_grp=%s local_filename=%s content=%s',
        file_grp,
        local_filename,
        has_content)
    if 'page_id' not in kwargs:
        raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.")
    if has_content and not local_filename:
        raise Exception("'content' was set but no 'local_filename'")
    if self.overwrite_mode:
        kwargs['force'] = True

    with pushd_popd(self.directory):
        if local_filename:
            # If the local filename has folder components, create those folders
            folder, slash, _ = str(local_filename).rpartition('/')
            if slash and not Path(folder).is_dir():
                makedirs(folder)

        # OcrdMets.add_file is called with the legacy camelCase keyword names
        kwargs["pageId"] = kwargs.pop("page_id")
        if "file_id" in kwargs:
            kwargs["ID"] = kwargs.pop("file_id")

        ret = self.mets.add_file(file_grp, **kwargs)

        if has_content:
            with open(kwargs['local_filename'], 'wb') as f:
                payload = content.encode('utf-8') if isinstance(content, str) else content
                f.write(payload)

    return ret
||
| 417 | |||
def save_mets(self):
    """
    Serialize ``self.mets`` and write it to ``self.mets_target``.

    If automatic backups are enabled, a backup is added before writing.
    """
    logger = getLogger('ocrd.workspace.save_mets')
    logger.debug("Saving mets '%s'", self.mets_target)
    backup_manager = self.automatic_backup
    if backup_manager:
        backup_manager.add()
    with atomic_write(self.mets_target) as mets_out:
        xml_bytes = self.mets.to_xml(xmllint=True)
        mets_out.write(xml_bytes.decode('utf-8'))
||
| 428 | |||
def resolve_image_exif(self, image_url):
    """
    Get the EXIF metadata about an image URL as :py:class:`ocrd_models.ocrd_exif.OcrdExif`

    The image is looked up in the METS by ``local_filename`` first, then by
    ``url`` (downloading it into the workspace); if the METS does not know
    it at all, it is downloaded into a temporary file.

    Args:
        image_url (string) : `@href` (path or URL) of the METS `file` to inspect

    Returns:
        :py:class:`ocrd_models.ocrd_exif.OcrdExif`

    Raises:
        ValueError: if ``image_url`` is empty
    """
    if not image_url:
        # avoid "finding" just any file
        raise ValueError(f"'image_url' must be a non-empty string, not '{image_url}' ({type(image_url)})")
    image_ref = str(image_url)
    # Relative local filenames must be resolved against the workspace
    # directory, just like the image loading in this class does.
    with pushd_popd(self.directory):
        local_file = next(self.mets.find_files(local_filename=image_ref), None)
        if local_file is not None:
            return exif_from_filename(local_file.local_filename)
        remote_file = next(self.mets.find_files(url=image_ref), None)
        if remote_file is not None:
            return exif_from_filename(self.download_file(remote_file).local_filename)
        # unknown to the METS: fetch into a temporary file
        with download_temporary_file(image_url) as f:
            return exif_from_filename(f.name)
||
| 452 | |||
@deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment")
def resolve_image_as_pil(self, image_url, coords=None):
    """
    Resolve an image URL to a `PIL.Image`.

    Deprecated public wrapper that forwards to ``_resolve_image_as_pil``.

    Arguments:
        image_url (string): `@href` (path or URL) of the METS `file` to retrieve
    Keyword Args:
        coords (list) : Coordinates of the bounding box to cut from the image

    Returns:
        Full or cropped `PIL.Image`

    """
    resolved_image = self._resolve_image_as_pil(image_url, coords)
    return resolved_image
||
| 468 | |||
def _resolve_image_as_pil(self, image_url, coords=None):
    """
    Resolve an image URL to a `PIL.Image`, reduced to 8 bit depth if necessary.

    The image is looked up in the METS by ``local_filename`` first, then by
    ``url`` (downloading it into the workspace); if the METS does not know
    it at all, it is downloaded into a temporary file.

    Arguments:
        image_url (string): `@href` (path or URL) of the METS `file` to retrieve
    Keyword Args:
        coords (list): polygon whose bounding box is cut from the image

    Returns:
        Full `PIL.Image` if ``coords`` is `None`, otherwise the cropped image

    Raises:
        Exception: if ``image_url`` is empty
    """
    if not image_url:
        # avoid "finding" just any file
        raise Exception("Cannot resolve empty image path")
    log = getLogger('ocrd.workspace._resolve_image_as_pil')
    with pushd_popd(self.directory):
        try:
            f = next(self.mets.find_files(local_filename=str(image_url)))
            pil_image = Image.open(f.local_filename)
        except StopIteration:
            try:
                f = next(self.mets.find_files(url=str(image_url)))
                pil_image = Image.open(self.download_file(f).local_filename)
            except StopIteration:
                with download_temporary_file(image_url) as f:
                    pil_image = Image.open(f.name)
                    # read the pixel data while the temporary file still exists
                    pil_image.load()
        pil_image.load()  # alloc and give up the FD

    # Pillow does not properly support higher color depths
    # (e.g. 16-bit or 32-bit or floating point grayscale),
    # clipping its dynamic range to the lower 8-bit in
    # many operations (including paste, putalpha, ImageStat...),
    # even including conversion.
    # Cf. Pillow#3011 Pillow#3159 Pillow#3838 (still open in 8.0)
    # So to be on the safe side, we must re-quantize these
    # to 8-bit via numpy (conversion to/from which fortunately
    # seems to work reliably):
    if pil_image.mode.startswith(('I', 'F')):
        arr_image = np.array(pil_image)
        if arr_image.dtype.kind == 'i':
            # signed integer is *not* trustworthy in this context
            # (usually a mistake in the array interface)
            log.debug('Casting image "%s" from signed to unsigned', image_url)
            # reinterpret the same memory as unsigned; a view is used
            # because assigning to ``.dtype`` in place is discouraged by NumPy
            arr_image = arr_image.view(np.dtype('u' + arr_image.dtype.name))
        if arr_image.dtype.kind == 'u':
            # integer needs to be scaled linearly to 8 bit
            # of course, an image might actually have some lower range
            # (e.g. 10-bit in I;16 or 20-bit in I or 4-bit in L),
            # but that would be guessing anyway, so here don't
            # make assumptions on _scale_, just reduce _precision_
            log.debug('Reducing image "%s" from depth %d bit to 8 bit',
                      image_url, arr_image.dtype.itemsize * 8)
            arr_image = arr_image >> 8 * (arr_image.dtype.itemsize - 1)
            arr_image = arr_image.astype(np.uint8)
        elif arr_image.dtype.kind == 'f':
            # float needs to be scaled from [0,1.0] to [0,255]
            log.debug('Reducing image "%s" from floating point to 8 bit',
                      image_url)
            arr_image *= 255
            arr_image = arr_image.astype(np.uint8)
        pil_image = Image.fromarray(arr_image)

    if coords is None:
        return pil_image

    # FIXME: remove or replace this by (image_from_polygon+) crop_image ...
    # NOTE(review): the array is converted to BGR channel order here and then
    # wrapped by Image.fromarray without converting back - for color images
    # this looks like swapped red/blue channels; confirm before relying on it.
    log.debug("Converting PIL to OpenCV: %s", image_url)
    color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else COLOR_RGB2BGR
    pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
    cv2_image = cvtColor(pil_as_np_array, color_conversion)

    poly = np.array(coords, np.int32)
    log.debug("Cutting region %s from %s", coords, image_url)
    # crop to the bounding box of the polygon (rows are y, columns are x)
    region_cut = cv2_image[
        np.min(poly[:, 1]):np.max(poly[:, 1]),
        np.min(poly[:, 0]):np.max(poly[:, 0])
    ]
    return Image.fromarray(region_cut)
||
| 538 | |||
| 539 | def image_from_page(self, page, page_id, |
||
| 540 | fill='background', transparency=False, |
||
| 541 | feature_selector='', feature_filter='', filename=''): |
||
| 542 | """Extract an image for a PAGE-XML page from the workspace. |
||
| 543 | |||
| 544 | Args: |
||
| 545 | page (:py:class:`ocrd_models.ocrd_page.PageType`): a PAGE `PageType` object |
||
| 546 | page_id (string): its `@ID` in the METS physical `structMap` |
||
| 547 | Keyword Args: |
||
| 548 | fill (string): a `PIL` color specifier, or `background` or `none` |
||
| 549 | transparency (boolean): whether to add an alpha channel for masking |
||
| 550 | feature_selector (string): a comma-separated list of `@comments` classes |
||
| 551 | feature_filter (string): a comma-separated list of `@comments` classes |
||
| 552 | filename (string): which file path to use |
||
| 553 | |||
| 554 | Extract a `PIL.Image` from ``page``, either from its `AlternativeImage` |
||
| 555 | (if it exists), or from its `@imageFilename` (otherwise). Also crop it, |
||
| 556 | if a `Border` exists, and rotate it, if any `@orientation` angle is |
||
| 557 | annotated. |
||
| 558 | |||
| 559 | If ``filename`` is given, then among `@imageFilename` and the available |
||
| 560 | `AlternativeImage/@filename` images, pick that one, or raise an error. |
||
| 561 | |||
| 562 | If ``feature_selector`` and/or ``feature_filter`` is given, then |
||
| 563 | among the `@imageFilename` image and the available AlternativeImages, |
||
| 564 | select/filter the richest one which contains all of the selected, |
||
| 565 | but none of the filtered features (i.e. `@comments` classes), or |
||
| 566 | raise an error. |
||
| 567 | |||
| 568 | (Required and produced features need not be in the same order, so |
||
| 569 | ``feature_selector`` is merely a mask specifying Boolean AND, and |
||
| 570 | ``feature_filter`` is merely a mask specifying Boolean OR.) |
||
| 571 | |||
| 572 | If the chosen image does not have the feature `"cropped"` yet, but |
||
| 573 | a `Border` exists, and unless `"cropped"` is being filtered, then crop it. |
||
| 574 | Likewise, if the chosen image does not have the feature `"deskewed"` yet, |
||
| 575 | but an `@orientation` angle is annotated, and unless `"deskewed"` is being |
||
| 576 | filtered, then rotate it. (However, if `@orientation` is above the |
||
| 577 | [-45°,45°] interval, then apply as much transposition as possible first, |
||
| 578 | unless `"rotated-90"` / `"rotated-180"` / `"rotated-270"` is being filtered.) |
||
| 579 | |||
| 580 | Cropping uses a polygon mask (not just the bounding box rectangle). |
||
| 581 | Areas outside the polygon will be filled according to ``fill``: |
||
| 582 | |||
| 583 | \b |
||
| 584 | - if `"background"` (the default), |
||
| 585 | then fill with the median color of the image; |
||
| 586 | - else if `"none"`, then avoid masking polygons where possible |
||
| 587 | (i.e. when cropping) or revert to the default (i.e. when rotating) |
||
| 588 | - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`. |
||
| 589 | |||
| 590 | Moreover, if ``transparency`` is true, and unless the image already |
||
| 591 | has an alpha channel, then add an alpha channel which is fully opaque |
||
| 592 | before cropping and rotating. (Thus, unexposed/masked areas will be |
||
| 593 | transparent afterwards for consumers that can interpret alpha channels). |
||
| 594 | |||
| 595 | Returns: |
||
| 596 | a tuple of |
||
| 597 | * the extracted `PIL.Image`, |
||
| 598 | * a `dict` with information about the extracted image: |
||
| 599 | |||
| 600 | - `"transform"`: a `Numpy` array with an affine transform which |
||
| 601 | converts from absolute coordinates to those relative to the image, |
||
| 602 | i.e. after cropping to the page's border / bounding box (if any) |
||
| 603 | and deskewing with the page's orientation angle (if any) |
||
| 604 | - `"angle"`: the rotation/reflection angle applied to the image so far, |
||
| 605 | - `"features"`: the `AlternativeImage` `@comments` for the image, i.e. |
||
| 606 | names of all applied operations that lead up to this result, |
||
| 607 | * an :py:class:`ocrd_models.ocrd_exif.OcrdExif` instance associated with |
||
| 608 | the original image. |
||
| 609 | |||
| 610 | (The first two can be used to annotate a new `AlternativeImage`, |
||
| 611 | or be passed down with :py:meth:`image_from_segment`.) |
||
| 612 | |||
| 613 | Examples: |
||
| 614 | |||
| 615 | * get a raw (colored) but already deskewed and cropped image:: |
||
| 616 | |||
| 617 | page_image, page_coords, page_image_info = workspace.image_from_page( |
||
| 618 | page, page_id, |
||
| 619 | feature_selector='deskewed,cropped', |
||
| 620 | feature_filter='binarized,grayscale_normalized') |
||
| 621 | """ |
||
| 622 | log = getLogger('ocrd.workspace.image_from_page') |
||
| 623 | page_image_info = self.resolve_image_exif(page.imageFilename) |
||
| 624 | page_image = self._resolve_image_as_pil(page.imageFilename) |
||
| 625 | page_coords = dict() |
||
| 626 | # use identity as initial affine coordinate transform: |
||
| 627 | page_coords['transform'] = np.eye(3) |
||
| 628 | # interim bbox (updated with each change to the transform): |
||
| 629 | page_bbox = [0, 0, page_image.width, page_image.height] |
||
| 630 | page_xywh = {'x': 0, 'y': 0, |
||
| 631 | 'w': page_image.width, 'h': page_image.height} |
||
| 632 | |||
| 633 | border = page.get_Border() |
||
| 634 | # page angle: PAGE @orientation is defined clockwise, |
||
| 635 | # whereas PIL/ndimage rotation is in mathematical direction: |
||
| 636 | page_coords['angle'] = -(page.get_orientation() or 0) |
||
| 637 | # map angle from (-180,180] to [0,360], and partition into multiples of 90; |
||
| 638 | # but avoid unnecessary large remainders, i.e. split symmetrically: |
||
| 639 | orientation = (page_coords['angle'] + 45) % 360 |
||
| 640 | orientation = orientation - (orientation % 90) |
||
| 641 | skew = (page_coords['angle'] % 360) - orientation |
||
| 642 | skew = 180 - (180 - skew) % 360 # map to [-45,45] |
||
| 643 | page_coords['angle'] = 0 # nothing applied yet (depends on filters) |
||
| 644 | log.debug("page '%s' has %s orientation=%d skew=%.2f", |
||
| 645 | page_id, "border," if border else "", orientation, skew) |
||
| 646 | |||
| 647 | # initialize AlternativeImage@comments classes as empty: |
||
| 648 | page_coords['features'] = '' |
||
| 649 | best_image = None |
||
| 650 | alternative_images = page.get_AlternativeImage() |
||
| 651 | View Code Duplication | if alternative_images: |
|
| 652 | # (e.g. from page-level cropping, binarization, deskewing or despeckling) |
||
| 653 | best_features = set() |
||
| 654 | auto_features = {'cropped', 'deskewed', 'rotated-90', 'rotated-180', 'rotated-270'} |
||
| 655 | # search to the end, because by convention we always append, |
||
| 656 | # and among multiple satisfactory images we want the most recent, |
||
| 657 | # but also ensure that we get the richest feature set, i.e. most |
||
| 658 | # of those features that we cannot reproduce automatically below |
||
| 659 | for alternative_image in alternative_images: |
||
| 660 | if filename and filename != alternative_image.filename: |
||
| 661 | continue |
||
| 662 | features = alternative_image.get_comments() |
||
| 663 | if not features: |
||
| 664 | log.warning("AlternativeImage %d for page '%s' does not have any feature attributes", |
||
| 665 | alternative_images.index(alternative_image) + 1, page_id) |
||
| 666 | features = '' |
||
| 667 | featureset = set(features.split(',')) |
||
| 668 | if (all(feature in featureset |
||
| 669 | for feature in feature_selector.split(',') if feature) and |
||
| 670 | not any(feature in featureset |
||
| 671 | for feature in feature_filter.split(',') if feature) and |
||
| 672 | len(featureset.difference(auto_features)) >= \ |
||
| 673 | len(best_features.difference(auto_features))): |
||
| 674 | best_features = featureset |
||
| 675 | best_image = alternative_image |
||
| 676 | if best_image: |
||
| 677 | log.debug("Using AlternativeImage %d %s for page '%s'", |
||
| 678 | alternative_images.index(best_image) + 1, |
||
| 679 | best_features, page_id) |
||
| 680 | page_image = self._resolve_image_as_pil(best_image.get_filename()) |
||
| 681 | page_coords['features'] = best_image.get_comments() # including duplicates |
||
| 682 | |||
| 683 | # adjust the coord transformation to the steps applied on the image, |
||
| 684 | # and apply steps on the existing image in case it is missing there, |
||
| 685 | # but traverse all steps (crop/reflect/rotate) in a particular order: |
||
| 686 | # - existing image features take priority (in the order annotated), |
||
| 687 | # - next is cropping (if necessary but not already applied), |
||
| 688 | # - next is reflection (if necessary but not already applied), |
||
| 689 | # - next is rotation (if necessary but not already applied). |
||
| 690 | # This helps deal with arbitrary workflows (e.g. crop then deskew, |
||
| 691 | # or deskew then crop), regardless of where images are generated. |
||
| 692 | alternative_image_features = page_coords['features'].split(',') |
||
| 693 | for duplicate_feature in set([feature for feature in alternative_image_features |
||
| 694 | # features relevant in reconstructing coordinates: |
||
| 695 | if (feature in ['cropped', 'deskewed', 'rotated-90', |
||
| 696 | 'rotated-180', 'rotated-270'] and |
||
| 697 | alternative_image_features.count(feature) > 1)]): |
||
| 698 | log.error("Duplicate feature %s in AlternativeImage for page '%s'", |
||
| 699 | duplicate_feature, page_id) |
||
| 700 | for i, feature in enumerate(alternative_image_features + |
||
| 701 | (['cropped'] |
||
| 702 | if (border and |
||
| 703 | not 'cropped' in alternative_image_features and |
||
| 704 | not 'cropped' in feature_filter.split(',')) |
||
| 705 | else []) + |
||
| 706 | (['rotated-%d' % orientation] |
||
| 707 | if (orientation and |
||
| 708 | not 'rotated-%d' % orientation in alternative_image_features and |
||
| 709 | not 'rotated-%d' % orientation in feature_filter.split(',')) |
||
| 710 | else []) + |
||
| 711 | (['deskewed'] |
||
| 712 | if (skew and |
||
| 713 | not 'deskewed' in alternative_image_features and |
||
| 714 | not 'deskewed' in feature_filter.split(',')) |
||
| 715 | else []) + |
||
| 716 | # not a feature to be added, but merely as a fallback position |
||
| 717 | # to always enter loop at i == len(alternative_image_features) |
||
| 718 | ['_check']): |
||
| 719 | # image geometry vs feature consistency can only be checked |
||
| 720 | # after all features on the existing AlternativeImage have |
||
| 721 | # been adjusted for in the transform, and when there is a mismatch, |
||
| 722 | # additional steps applied here would only repeat the respective |
||
| 723 | # error message; so we only check once at the boundary between |
||
| 724 | # existing and new features |
||
| 725 | # FIXME we should check/enforce consistency when _adding_ AlternativeImage |
||
| 726 | if (i == len(alternative_image_features) and |
||
| 727 | not (page_xywh['w'] - 2 < page_image.width < page_xywh['w'] + 2 and |
||
| 728 | page_xywh['h'] - 2 < page_image.height < page_xywh['h'] + 2)): |
||
| 729 | log.error('page "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)', |
||
| 730 | page_id, page_coords['features'], |
||
| 731 | page_image.width, page_image.height, |
||
| 732 | page_xywh['w'], page_xywh['h']) |
||
| 733 | name = "%s for page '%s'" % ("AlternativeImage" if best_image |
||
| 734 | else "original image", page_id) |
||
| 735 | # adjust transform to feature, and ensure feature is applied to image |
||
| 736 | if feature == 'cropped': |
||
| 737 | page_image, page_coords, page_xywh = _crop( |
||
| 738 | log, name, border, page_image, page_coords, |
||
| 739 | fill=fill, transparency=transparency) |
||
| 740 | elif feature == 'rotated-%d' % orientation: |
||
| 741 | page_image, page_coords, page_xywh = _reflect( |
||
| 742 | log, name, orientation, page_image, page_coords, page_xywh) |
||
| 743 | elif feature == 'deskewed': |
||
| 744 | page_image, page_coords, page_xywh = _rotate( |
||
| 745 | log, name, skew, border, page_image, page_coords, page_xywh, |
||
| 746 | fill=fill, transparency=transparency) |
||
| 747 | |||
| 748 | # verify constraints again: |
||
| 749 | if filename and not getattr(page_image, 'filename', '').endswith(filename): |
||
| 750 | raise Exception('Found no AlternativeImage that satisfies all requirements ' + |
||
| 751 | 'filename="%s" in page "%s"' % ( |
||
| 752 | filename, page_id)) |
||
| 753 | if not all(feature in page_coords['features'] |
||
| 754 | for feature in feature_selector.split(',') if feature): |
||
| 755 | raise Exception('Found no AlternativeImage that satisfies all requirements ' + |
||
| 756 | 'selector="%s" in page "%s"' % ( |
||
| 757 | feature_selector, page_id)) |
||
| 758 | if any(feature in page_coords['features'] |
||
| 759 | for feature in feature_filter.split(',') if feature): |
||
| 760 | raise Exception('Found no AlternativeImage that satisfies all requirements ' + |
||
| 761 | 'filter="%s" in page "%s"' % ( |
||
| 762 | feature_filter, page_id)) |
||
| 763 | page_image.format = 'PNG' # workaround for tesserocr#194 |
||
| 764 | return page_image, page_coords, page_image_info |
||
| 765 | |||
def image_from_segment(self, segment, parent_image, parent_coords,
                       fill='background', transparency=False,
                       feature_selector='', feature_filter='', filename=''):
    """Extract an image for a PAGE-XML hierarchy segment from its parent's image.

    Args:
        segment (object): a PAGE segment object \
            (i.e. :py:class:`~ocrd_models.ocrd_page.TextRegionType` \
            or :py:class:`~ocrd_models.ocrd_page.TextLineType` \
            or :py:class:`~ocrd_models.ocrd_page.WordType` \
            or :py:class:`~ocrd_models.ocrd_page.GlyphType`)
        parent_image (`PIL.Image`): image of the `segment`'s parent
        parent_coords (dict): a `dict` with information about `parent_image`:

            - `"transform"`: a `Numpy` array with an affine transform which
              converts from absolute coordinates to those relative to the image,
              i.e. after applying all operations (starting with the original image)
            - `"angle"`: the rotation/reflection angle applied to the image so far,
            - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e.
              names of all operations that lead up to this result.
    Keyword Args:
        fill (string): a `PIL` color specifier, or `background` or `none`
        transparency (boolean): whether to add an alpha channel for masking
        feature_selector (string): a comma-separated list of ``@comments`` classes
        feature_filter (string): a comma-separated list of ``@comments`` classes
        filename (string): which ``AlternativeImage/@filename`` to pick

    Extract a `PIL.Image` from `segment`, either from ``AlternativeImage``
    (if it exists), or producing a new image via cropping from `parent_image`
    (otherwise). Pass in `parent_image` and `parent_coords` from the result
    of the next higher-level of this function or from :py:meth:`image_from_page`.

    If ``filename`` is given, then among the available `AlternativeImage/@filename`
    images, pick that one, or raise an error.

    If ``feature_selector`` and/or ``feature_filter`` is given, then
    among the cropped `parent_image` and the available AlternativeImages,
    select/filter the richest one which contains all of the selected,
    but none of the filtered features (i.e. ``@comments`` classes), or
    raise an error.

    (Required and produced features need not be in the same order, so
    `feature_selector` is merely a mask specifying Boolean AND, and
    `feature_filter` is merely a mask specifying Boolean OR.)

    Cropping uses a polygon mask (not just the bounding box rectangle).
    Areas outside the polygon will be filled according to `fill`:

    \b
    - if `"background"` (the default),
      then fill with the median color of the image;
    - else if `"none"`, then avoid masking polygons where possible
      (i.e. when cropping) or revert to the default (i.e. when rotating)
    - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.

    Moreover, if `transparency` is true, and unless the image already
    has an alpha channel, then add an alpha channel which is fully opaque
    before cropping and rotating. (Thus, unexposed/masked areas will be
    transparent afterwards for consumers that can interpret alpha channels).

    When cropping, compensate any ``@orientation`` angle annotated for the
    parent (from parent-level deskewing) by rotating the segment coordinates
    in an inverse transformation (i.e. translation to center, then passive
    rotation, and translation back).

    Regardless, if any ``@orientation`` angle is annotated for the segment
    (from segment-level deskewing), and the chosen image does not have
    the feature `"deskewed"` yet, and unless `"deskewed"` is being filtered,
    then rotate it - compensating for any previous `"angle"`. (However,
    if ``@orientation`` is above the [-45°,45°] interval, then apply as much
    transposition as possible first, unless `"rotated-90"` / `"rotated-180"` /
    `"rotated-270"` is being filtered.)

    Returns:
        a tuple of
         * the extracted `PIL.Image`,
         * a `dict` with information about the extracted image:

           - `"transform"`: a `Numpy` array with an affine transform which
             converts from absolute coordinates to those relative to the image,
             i.e. after applying all parent operations, and then cropping to
             the segment's bounding box, and deskewing with the segment's
             orientation angle (if any)
           - `"angle"`: the rotation/reflection angle applied to the image so far,
           - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e.
             names of all applied operations that lead up to this result.

    (These can be used to create a new ``AlternativeImage``, or passed down
    for :py:meth:`image_from_segment` calls on lower hierarchy levels.)

    Raises:
        Exception: if the resulting image does not satisfy ``filename``,
            ``feature_selector`` or ``feature_filter``.

    Examples:

     * get a raw (colored) but already deskewed and cropped image::

            region_image, region_coords = workspace.image_from_segment(region,
                  page_image, page_coords,
                  feature_selector='deskewed,cropped',
                  feature_filter='binarized,grayscale_normalized')
    """
    log = getLogger('ocrd.workspace.image_from_segment')
    # note: We should mask overlapping neighbouring segments here,
    # but finding the right clipping rules can be difficult if operating
    # on the raw (non-binary) image data alone: for each intersection, it
    # must be decided which one of either segment or neighbour to assign,
    # e.g. an ImageRegion which properly contains our TextRegion should be
    # completely ignored, but an ImageRegion which is properly contained
    # in our TextRegion should be completely masked, while partial overlap
    # may be more difficult to decide. On the other hand, on the binary image,
    # we can use connected component analysis to mask foreground areas which
    # originate in the neighbouring regions. But that would introduce either
    # the assumption that the input has already been binarized, or a dependency
    # on some ad-hoc binarization method. Thus, it is preferable to use
    # a dedicated processor for this (which produces clipped AlternativeImage
    # or reduced polygon coordinates).
    segment_image, segment_coords, segment_xywh = _crop(
        log, "parent image for segment '%s'" % segment.id,
        segment, parent_image, parent_coords,
        fill=fill, transparency=transparency)

    # Semantics of missing @orientation at region level could be either
    # - inherited from page level: same as line or word level (no @orientation),
    # - zero (unrotate page angle): different from line or word level (because
    #   otherwise deskewing would never have an effect on lines and words)
    # The PAGE specification is silent here (but does generally not concern itself
    # much with AlternativeImage coordinate consistency).
    # Since our (generateDS-backed) ocrd_page supports the zero/none distinction,
    # we choose the former (i.e. None is inheritance).
    if 'orientation' in segment.__dict__ and segment.get_orientation() is not None:
        # region angle: PAGE @orientation is defined clockwise,
        # whereas PIL/ndimage rotation is in mathematical direction:
        angle = -segment.get_orientation()
        # @orientation is always absolute; if higher levels
        # have already rotated, then we must compensate:
        angle -= parent_coords['angle']
        # map angle from (-180,180] to [0,360], and partition into multiples of 90;
        # but avoid unnecessary large remainders, i.e. split symmetrically:
        orientation = (angle + 45) % 360
        orientation = orientation - (orientation % 90)
        skew = (angle % 360) - orientation
        skew = 180 - (180 - skew) % 360 # map to [-45,45]
        log.debug("segment '%s' has orientation=%d skew=%.2f",
                  segment.id, orientation, skew)
    else:
        orientation = 0
        skew = 0
    segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)

    # initialize AlternativeImage@comments classes from parent, except
    # for those operations that can apply on multiple hierarchy levels:
    segment_coords['features'] = ','.join(
        [feature for feature in parent_coords['features'].split(',')
         if feature in ['binarized', 'grayscale_normalized',
                        'despeckled', 'dewarped']])

    # parse the (loop-invariant) constraints only once:
    selected_features = [feature for feature in feature_selector.split(',') if feature]
    filtered_features = [feature for feature in feature_filter.split(',') if feature]

    best_image = None
    alternative_images = segment.get_AlternativeImage()
    if alternative_images:
        # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
        best_features = set()
        auto_features = {'cropped', 'deskewed', 'rotated-90', 'rotated-180', 'rotated-270'}
        # search to the end, because by convention we always append,
        # and among multiple satisfactory images we want the most recent,
        # but also ensure that we get the richest feature set, i.e. most
        # of those features that we cannot reproduce automatically below
        for alternative_image in alternative_images:
            if filename and filename != alternative_image.filename:
                continue
            features = alternative_image.get_comments()
            if not features:
                log.warning("AlternativeImage %d for segment '%s' does not have any feature attributes",
                            alternative_images.index(alternative_image) + 1, segment.id)
                features = ''
            featureset = set(features.split(','))
            if (all(feature in featureset for feature in selected_features) and
                not any(feature in featureset for feature in filtered_features) and
                len(featureset.difference(auto_features)) >= \
                len(best_features.difference(auto_features))):
                best_features = featureset
                best_image = alternative_image
        if best_image:
            log.debug("Using AlternativeImage %d %s for segment '%s'",
                      alternative_images.index(best_image) + 1,
                      best_features, segment.id)
            # load the image that was actually selected (not merely
            # the last candidate the search loop happened to visit):
            segment_image = self._resolve_image_as_pil(best_image.get_filename())
            # including duplicates; an image without @comments (already
            # warned about above) is treated as having no features:
            segment_coords['features'] = best_image.get_comments() or ''

    alternative_image_features = segment_coords['features'].split(',')
    for duplicate_feature in set([feature for feature in alternative_image_features
                                  # features relevant in reconstructing coordinates:
                                  if (feature in ['deskewed', 'rotated-90',
                                                  'rotated-180', 'rotated-270'] and
                                      alternative_image_features.count(feature) > 1)]):
        log.error("Duplicate feature %s in AlternativeImage for segment '%s'",
                  duplicate_feature, segment.id)

    # steps which are annotated but neither applied on the image yet
    # nor filtered by the caller, in the order they must be applied:
    rotation_feature = 'rotated-%d' % orientation
    excluded_features = feature_filter.split(',')
    pending_features = []
    if (orientation and
        rotation_feature not in alternative_image_features and
        rotation_feature not in excluded_features):
        pending_features.append(rotation_feature)
    if (skew and
        'deskewed' not in alternative_image_features and
        'deskewed' not in excluded_features):
        pending_features.append('deskewed')
    name = "%s for segment '%s'" % ("AlternativeImage" if best_image
                                    else "parent image", segment.id)
    for i, feature in enumerate(alternative_image_features +
                                pending_features +
                                # not a feature to be added, but merely as a fallback position
                                # to always enter loop at i == len(alternative_image_features)
                                ['_check']):
        # image geometry vs feature consistency can only be checked
        # after all features on the existing AlternativeImage have
        # been adjusted for in the transform, and when there is a mismatch,
        # additional steps applied here would only repeat the respective
        # error message; so we only check once at the boundary between
        # existing and new features
        # FIXME we should enforce consistency here (i.e. split into transposition
        #       and minimal rotation, rotation always reshapes, rescaling never happens)
        # FIXME: inconsistency currently unavoidable with line-level dewarping (which increases height)
        if (i == len(alternative_image_features) and
            not (segment_xywh['w'] - 2 < segment_image.width < segment_xywh['w'] + 2 and
                 segment_xywh['h'] - 2 < segment_image.height < segment_xywh['h'] + 2)):
            log.error('segment "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                      segment.id, segment_coords['features'],
                      segment_image.width, segment_image.height,
                      segment_xywh['w'], segment_xywh['h'])
        # adjust transform to feature, and ensure feature is applied to image
        if feature == rotation_feature:
            segment_image, segment_coords, segment_xywh = _reflect(
                log, name, orientation, segment_image, segment_coords, segment_xywh)
        elif feature == 'deskewed':
            segment_image, segment_coords, segment_xywh = _rotate(
                log, name, skew, segment, segment_image, segment_coords, segment_xywh,
                fill=fill, transparency=transparency)

    # verify constraints again:
    if filename and not getattr(segment_image, 'filename', '').endswith(filename):
        raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                        'filename="%s" in segment "%s"' % (
                            filename, segment.id))
    if not all(feature in segment_coords['features']
               for feature in selected_features):
        raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                        'selector="%s" in segment "%s"' % (
                            feature_selector, segment.id))
    if any(feature in segment_coords['features']
           for feature in filtered_features):
        raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                        'filter="%s" in segment "%s"' % (
                            feature_filter, segment.id))
    segment_image.format = 'PNG' # workaround for tesserocr#194
    return segment_image, segment_coords
||
| 1019 | |||
| 1020 | # pylint: disable=redefined-builtin |
||
def save_image_file(self, image,
                    file_id,
                    file_grp,
                    page_id=None,
                    mimetype='image/png',
                    force=False):
    """Serialize an image and register it as a new `file` in the METS.

    Args:
        image (PIL.Image): derived image to save
        file_id (string): `@ID` of the METS `file` to use
        file_grp (string): `@USE` of the METS `fileGrp` to use
    Keyword Args:
        page_id (string): `@ID` in the METS physical `structMap` to use
        mimetype (string): MIME type of the image format to serialize as
        force (boolean): whether to replace any existing `file` with that `@ID`
            (always enabled when the workspace is in overwrite mode)

    The image is encoded in memory in the format mapped from ``mimetype``,
    then handed over to :py:meth:`add_file` as content, under a local
    filename composed of ``file_grp`` as directory and ``file_id`` plus
    the filename extension mapped from ``mimetype``.

    Returns:
        The path passed to :py:meth:`add_file` as local filename,
        i.e. ``file_grp``/``file_id`` plus extension.
    """
    logger = getLogger('ocrd.workspace.save_image_file')
    # overwrite mode takes precedence over the caller's choice
    replace_existing = True if self.overwrite_mode else force
    # encode first, so nothing gets registered if serialization fails
    buffer = io.BytesIO()
    image.save(buffer, format=MIME_TO_PIL[mimetype])
    basename = '%s%s' % (file_id, MIME_TO_EXT[mimetype])
    file_path = str(Path(file_grp) / basename)
    ocrd_file = self.add_file(
        file_grp,
        file_id=file_id,
        page_id=page_id,
        local_filename=file_path,
        mimetype=mimetype,
        content=buffer.getvalue(),
        force=replace_existing)
    logger.info('created file ID: %s, file_grp: %s, path: %s',
                file_id, file_grp, ocrd_file.local_filename)
    return file_path
||
| 1061 | |||
def find_files(self, *args, **kwargs):
    """
    Search ``mets:file`` entries in wrapped METS document and yield results.

    Delegator to :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`,
    called from within the workspace directory.

    The snake-case keywords ``page_id``, ``file_id`` and ``file_grp``
    are accepted as aliases and renamed to ``pageId``, ``ID`` and
    ``fileGrp``, respectively, before delegating.

    Keyword Args:
        **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`
    Returns:
        Generator which yields :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations
    """
    logger = getLogger('ocrd.workspace.find_files')
    logger.debug('find files in mets. kwargs=%s' % kwargs)
    # translate workspace-style keyword names into the METS-style ones
    keyword_aliases = (
        ("page_id", "pageId"),
        ("file_id", "ID"),
        ("file_grp", "fileGrp"),
    )
    for alias, mets_keyword in keyword_aliases:
        if alias in kwargs:
            kwargs[mets_keyword] = kwargs.pop(alias)
    # NOTE(review): if the delegate is a lazy generator, the directory
    # change only covers its creation, not its iteration - confirm
    with pushd_popd(self.directory):
        return self.mets.find_files(*args, **kwargs)
||
| 1083 | |||
def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs):
    """Crop `parent_image` to `segment` and shift the coordinate transform accordingly.

    Args:
        log: logger to report on
        name (string): description of the image for log messages
        segment (object): PAGE object providing the outline polygon
        parent_image (`PIL.Image`): image to crop from
        parent_coords (dict): coordinate information of `parent_image`
            (copied shallowly; the result carries the modifications)
    Keyword Args:
        op (string): name of the operation (`cropped` or `recropped`)
        **kwargs: passed through to ``image_from_polygon``

    Below the page level, the image is always masked and cropped. For a
    `Border`, this is skipped if `op` is already among the parent's
    features; otherwise `op` gets appended to the features (unless it
    is `recropped`). The transform is shifted by the bounding box
    offset in either case.

    Returns:
        a tuple of the resulting image, its coordinate `dict`, and
        the bounding box of the segment as a `dict` of x/y/w/h
    """
    coords = parent_coords.copy()
    # outline of the segment relative to the parent image, and its
    # bounding box (i.e. possibly different from the size before
    # rotation at the parent, but also possibly different from the
    # size after rotation below/AlternativeImage):
    polygon = coordinates_of_segment(segment, parent_image, parent_coords)
    bbox = bbox_from_polygon(polygon)
    xywh = xywh_from_bbox(*bbox)
    is_border = isinstance(segment, BorderType)
    if is_border and op in parent_coords['features']:
        # page level, and already applied on the image
        image = parent_image
    else:
        # always crop below page level
        if op == 'recropped':
            log.info("Recropping %s", name)
        elif is_border:
            log.info("Cropping %s", name)
            coords['features'] += ',' + op
        # mask everything outside of the polygon, then cut out the bbox
        masked = image_from_polygon(parent_image, polygon, **kwargs)
        image = crop_image(masked, box=bbox)
    # subtract the offset from the parent in the affine coordinate
    # transform (consistent with image cropping)
    offset = np.array([-bbox[0], -bbox[1]])
    coords['transform'] = shift_coordinates(parent_coords['transform'], offset)
    return image, coords, xywh
||
| 1115 | |||
def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh):
    """Account for a transposition by `orientation`, applying it to the image if missing.

    Args:
        log: logger to report on
        name (string): description of the image for log messages
        orientation (number): angle of the transposition (90, 180 or 270)
        segment_image (`PIL.Image`): image to transpose
        segment_coords (dict): coordinate information (modified in place)
        segment_xywh (dict): expected image size (modified in place)

    The transform, the expected size and the angle are always updated.
    The image itself is only transposed if the respective `rotated-*`
    feature is not present yet, in which case it also gets appended.

    Returns:
        a tuple of the resulting image, `segment_coords` and `segment_xywh`
    """
    # look up the PIL method for that angle (None for anything else)
    methods = {
        90: Image.ROTATE_90,
        180: Image.ROTATE_180,
        270: Image.ROTATE_270
    }
    method = methods.get(orientation)
    # transpose around the center in the affine coordinate transform
    # (consistent with image transposition or AlternativeImage below)
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = transpose_coordinates(
        segment_coords['transform'], method, center)
    size = [segment_xywh['w'], segment_xywh['h']]
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_transposition(size, method)
    segment_coords['angle'] += orientation
    feature = 'rotated-%d' % orientation
    if feature in segment_coords['features']:
        # image has been transposed already
        return segment_image, segment_coords, segment_xywh
    log.info("Transposing %s by %d°", name, orientation)
    segment_image = transpose_image(segment_image, method)
    segment_coords['features'] += ',' + feature
    return segment_image, segment_coords, segment_xywh
||
| 1137 | |||
def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs):
    """Account for a rotation by `skew`, applying it to the image if missing.

    Args:
        log: logger to report on
        name (string): description of the image for log messages
        skew (number): angle of the rotation
        segment (object): PAGE object providing the outline polygon (if any)
        segment_image (`PIL.Image`): image to rotate
        segment_coords (dict): coordinate information (modified in place)
        segment_xywh (dict): expected image size (modified in place)
    Keyword Args:
        **kwargs: passed through to ``rotate_image`` and :py:func:`_crop`

    The transform, the expected size and the angle are always updated.
    The image itself is only rotated if the feature `deskewed` is not
    present yet, in which case it also gets appended.

    Afterwards, if there is a `segment` which is either below the page
    level or already `cropped`, then the result is re-cropped to the
    segment's new bounding box - on the image if it has just been
    rotated, or merely in the coordinates otherwise.

    Returns:
        a tuple of the resulting image, its coordinate `dict` and
        its expected size as a `dict` of x/y/w/h
    """
    # rotate around the center in the affine coordinate transform
    # (consistent with image rotation or AlternativeImage below)
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = rotate_coordinates(
        segment_coords['transform'], skew, center)
    size = [segment_xywh['w'], segment_xywh['h']]
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_rotation(size, skew)
    segment_coords['angle'] += skew
    must_rotate = 'deskewed' not in segment_coords['features']
    if must_rotate:
        log.info("Rotating %s by %.2f°", name, skew)
        segment_image = rotate_image(segment_image, skew, **kwargs)
        segment_coords['features'] += ',deskewed'
    # always crop below page level, but on page level only if cropped
    must_recrop = (segment and
                   (not isinstance(segment, BorderType) or
                    'cropped' in segment_coords['features']))
    if not must_recrop:
        return segment_image, segment_coords, segment_xywh
    if must_rotate:
        # re-crop to new bbox (which may deviate
        # if segment polygon was not a rectangle)
        return _crop(log, name, segment, segment_image, segment_coords,
                     op='recropped', **kwargs)
    # image is deskewed already: only shift coordinates as if re-cropping
    polygon = coordinates_of_segment(segment, segment_image, segment_coords)
    bbox = bbox_from_polygon(polygon)
    segment_xywh = xywh_from_bbox(*bbox)
    offset = np.array([-bbox[0], -bbox[1]])
    segment_coords['transform'] = shift_coordinates(
        segment_coords['transform'], offset)
    return segment_image, segment_coords, segment_xywh
||
| 1173 | |||
def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwargs):
    """Account for a linear resize by `factor`, applying it to the image if missing.

    Args:
        log: logger to report on
        name (string): description of the image for log messages
        factor (number): scaling factor (same for both dimensions)
        segment_image (`PIL.Image`): image to resize
        segment_coords (dict): coordinate information (modified in place)
        segment_xywh (dict): expected image size (modified in place)
    Keyword Args:
        **kwargs: accepted, but not used

    The transform, the accumulated `"scale"` (starting at 1.0) and the
    expected size are always updated. The image itself is only resized
    if the feature `scaled` is not present yet, in which case it also
    gets appended.

    Returns:
        a tuple of the resulting image, `segment_coords` and `segment_xywh`
    """
    # resize linearly in the affine coordinate transform
    segment_coords['transform'] = scale_coordinates(
        segment_coords['transform'], [factor, factor])
    segment_coords['scale'] = segment_coords.setdefault('scale', 1.0) * factor
    for dimension in ('w', 'h'):
        segment_xywh[dimension] *= factor
    if 'scaled' in segment_coords['features']:
        # image has been resized already
        return segment_image, segment_coords, segment_xywh
    log.info("Scaling %s by %.2f", name, factor)
    segment_coords['features'] += ',scaled'
    # FIXME: validate factor against PAGE-XML attributes
    # FIXME: factor should become less precise due to rounding
    new_size = (int(segment_image.width * factor),
                int(segment_image.height * factor))
    # bicubic resampling is the slowest, but highest quality
    segment_image = segment_image.resize(new_size, Image.BICUBIC)
    return segment_image, segment_coords, segment_xywh
||
| 1192 |