Passed
Push — master ( 488518...bfe6a0 )
by Konstantin
03:11
created

ocrd.workspace.Workspace.add_file()   D

Complexity

Conditions 13

Size

Total Lines 51
Code Lines 30

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 30
dl 0
loc 51
rs 4.2
c 0
b 0
f 0
cc 13
nop 4

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex methods like ocrd.workspace.Workspace.add_file() often do a lot of different things. To break such a method (or the class that contains it) down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
import io
2
from os import makedirs, unlink, listdir, path
3
from pathlib import Path
4
from shutil import move, copyfileobj
5
from re import sub
6
from tempfile import NamedTemporaryFile
7
from contextlib import contextmanager
8
9
from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor
10
from PIL import Image
11
import numpy as np
12
from deprecated.sphinx import deprecated
13
import requests
14
15
from ocrd_models import OcrdMets, OcrdFile
16
from ocrd_models.ocrd_page import parse, BorderType, to_xml
17
from ocrd_modelfactory import exif_from_filename, page_from_file
18
from ocrd_utils import (
19
    atomic_write,
20
    getLogger,
21
    image_from_polygon,
22
    coordinates_of_segment,
23
    adjust_canvas_to_rotation,
24
    adjust_canvas_to_transposition,
25
    shift_coordinates,
26
    rotate_coordinates,
27
    transform_coordinates,
28
    transpose_coordinates,
29
    crop_image,
30
    rotate_image,
31
    transpose_image,
32
    bbox_from_polygon,
33
    polygon_from_points,
34
    xywh_from_bbox,
35
    pushd_popd,
36
    is_local_filename,
37
    deprecated_alias,
38
    MIME_TO_EXT,
39
    MIME_TO_PIL,
40
    MIMETYPE_PAGE,
41
    REGEX_PREFIX
42
)
43
44
from .workspace_backup import WorkspaceBackupManager
45
from .mets_server import ClientSideOcrdMets
46
47
__all__ = ['Workspace']
48
49
@contextmanager
def download_temporary_file(url):
    """
    Download ``url`` into a named temporary file and yield the open file object.

    The file name starts with ``ocrd-download-``. The temporary file only
    lives as long as the context, so callers must finish working with
    ``f.name`` inside the ``with`` block.

    Args:
        url (string): URL to fetch with ``requests.get``

    Yields:
        the open temporary file object holding the response body
    """
    with NamedTemporaryFile(prefix='ocrd-download-') as f:
        with requests.get(url) as r:
            f.write(r.content)
        # Consumers re-open the file by name, so the buffered bytes
        # must reach the filesystem before control is handed over.
        f.flush()
        yield f
55
56
57
class Workspace():
58
    """
59
    A workspace is a temporary directory set up for a processor. It's the
60
    interface to the METS/PAGE XML and delegates download and upload to the
61
    :py:class:`ocrd.resolver.Resolver`.
62
63
    Args:
64
65
        directory (string) : Filesystem folder to work in
66
        mets (:py:class:`ocrd_models.ocrd_mets.OcrdMets`) : `OcrdMets` representing this workspace.
67
            Loaded from `'mets.xml'` if `None`.
68
        mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url.
69
        overwrite_mode (boolean) : Whether to force add operations on this workspace globally
70
        baseurl (string) : Base URL to prefix to relative URL.
71
    """
72
73
    def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, baseurl=None, mets_server_url=None):
74
        self.resolver = resolver
75
        self.directory = directory
76
        self.mets_target = str(Path(directory, mets_basename))
77
        self.overwrite_mode = False
78
        self.is_remote = bool(mets_server_url)
79
        if mets is None:
80
            if self.is_remote:
81
                mets = ClientSideOcrdMets(mets_server_url)
82
                if mets.workspace_path != self.directory:
83
                    raise ValueError(f"METS server {mets_server_url} workspace directory {mets.workspace_path} differs "
84
                            "from local workspace directory {self.directory}. These are not the same workspaces.")
85
            else:
86
                mets = OcrdMets(filename=self.mets_target)
87
        self.mets = mets
88
        if automatic_backup:
89
            self.automatic_backup = WorkspaceBackupManager(self)
90
            self.automatic_backup.add()
91
        else:
92
            self.automatic_backup = None
93
        self.baseurl = baseurl
94
        #  print(mets.to_xml(xmllint=True).decode('utf-8'))
95
96
    def __str__(self):
97
        return 'Workspace[remote=%s, directory=%s, baseurl=%s, file_groups=%s, files=%s]' % (
98
            not not self.is_remote,
99
            self.directory,
100
            self.baseurl,
101
            self.mets.file_groups,
102
            [str(f) for f in self.mets.find_all_files()],
103
        )
104
105
    def reload_mets(self):
        """
        Replace ``self.mets`` with a fresh ``OcrdMets`` read from ``self.mets_target``.
        """
        fresh_mets = OcrdMets(filename=self.mets_target)
        self.mets = fresh_mets
110
111
    @deprecated_alias(pageId="page_id")
    @deprecated_alias(ID="file_id")
    @deprecated_alias(fileGrp="file_grp")
    @deprecated_alias(fileGrp_mapping="filegrp_mapping")
    def merge(self, other_workspace, copy_files=True, overwrite=False, **kwargs):
        """
        Merge ``other_workspace`` into this one

        See :py:meth:`ocrd_models.ocrd_mets.OcrdMets.merge` for the `kwargs`

        Keyword Args:
            copy_files (boolean): Whether to copy files from `other_workspace` to this one.
                If false, the merged files' ``local_filename`` is rewritten relative
                to this workspace instead, which requires `other_workspace` to be
                located below this workspace's directory.
            overwrite (boolean): Whether copying may replace files that already exist
                in this workspace

        Raises:
            FileExistsError: if a file to copy already exists and ``overwrite`` is false
            ValueError: if ``copy_files`` is false and `other_workspace` is not
                below this workspace's directory
        """
        def after_add_cb(f):
            """callback to run on merged OcrdFile instances in the destination"""
            if not f.local_filename:
                # OcrdFile has no local_filename, so nothing to be copied
                return
            if not copy_files:
                fpath_src = Path(other_workspace.directory).resolve()
                fpath_dst = Path(self.directory).resolve()
                dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath
                f.local_filename = dstprefix / f.local_filename
                return
            fpath_src = Path(other_workspace.directory, f.local_filename)
            fpath_dest = Path(self.directory, f.local_filename)
            if fpath_src.exists():
                if fpath_dest.exists() and not overwrite:
                    raise FileExistsError("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest))
                # exist_ok avoids a race between checking for and creating the folder
                fpath_dest.parent.mkdir(parents=True, exist_ok=True)
                with open(str(fpath_src), 'rb') as fstream_in, open(str(fpath_dest), 'wb') as fstream_out:
                    copyfileobj(fstream_in, fstream_out)

        # OcrdMets.merge is called with the METS-style keyword names
        for new_name, mets_name in (
                ('page_id', 'pageId'),
                ('file_id', 'ID'),
                ('file_grp', 'fileGrp'),
                ('filegrp_mapping', 'fileGrp_mapping'),
        ):
            if new_name in kwargs:
                kwargs[mets_name] = kwargs.pop(new_name)

        self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs)
155
156
157
    @deprecated(version='1.0.0', reason="Use workspace.download_file")
    def download_url(self, url, **kwargs):
        """
        Download a URL to the workspace.

        A throw-away METS is used to wrap ``url`` in an ``OcrdFile`` (file group
        ``DEPRECATED``, ID taken from the last path segment of ``url``), which
        is then passed to :py:meth:`download_file`.

        Args:
            url (string): URL to download to directory
            **kwargs : accepted, but not used by this implementation

        Returns:
            The local filename of the downloaded file
        """
        scratch_mets = OcrdMets.empty_mets()
        wrapped = scratch_mets.add_file('DEPRECATED', ID=Path(url).name, url=url)
        return self.download_file(wrapped).local_filename
173
174
    def download_file(self, f, _recursion_count=0):
        """
        Download a :py:class:`ocrd_models.ocrd_file.OcrdFile` to the workspace.

        Resolution order:

        - ``f.local_filename`` exists: keep it if it lies within the workspace
          directory, otherwise copy it into the workspace.
        - ``f.local_filename`` is set but does not exist: fall back to ``f.url``,
          or to ``baseurl`` joined with ``local_filename``.
        - ``f.url`` is set: download it into the workspace, below a folder named
          after the file group.

        Args:
            f (:py:class:`ocrd_models.ocrd_file.OcrdFile`): the file to make available locally
            _recursion_count: not used by this implementation

        Returns:
            ``f``, with ``local_filename`` (and possibly ``url``) updated

        Raises:
            FileNotFoundError: if ``local_filename`` does not exist and neither
                ``url`` nor the workspace ``baseurl`` is available
            ValueError: if ``f`` has neither ``url`` nor ``local_filename``
        """
        log = getLogger('ocrd.workspace.download_file')
        with pushd_popd(self.directory):
            if f.local_filename:
                file_path = Path(f.local_filename).absolute()
                if file_path.exists():
                    try:
                        file_path.relative_to(Path(self.directory).resolve()) # raises ValueError if not relative
                        # If the f.local_filename exists and is within self.directory, nothing to do
                        log.info(f"'local_filename' {f.local_filename} already within {self.directory}, nothing to do")
                    except ValueError:
                        # f.local_filename exists, but not within self.directory, copy it
                        log.info("Copying 'local_filename' %s to workspace directory %s" % (f.local_filename, self.directory))
                        f.local_filename = self.resolver.download_to_directory(self.directory, f.local_filename, subdir=f.fileGrp)
                    return f
                if f.url:
                    log.info("OcrdFile has 'local_filename' but it doesn't resolve, try to download from set 'url' %s", f.url)
                elif self.baseurl:
                    log.info("OcrdFile has 'local_filename' but it doesn't resolve and no 'url', concatenate 'baseurl' %s and 'local_filename' %s",
                            self.baseurl, f.local_filename)
                    f.url = '%s/%s' % (self.baseurl, f.local_filename)
                else:
                    raise FileNotFoundError(f"'local_filename' {f.local_filename} points to non-existing file, "
                                            "and no 'url' to download and no 'baseurl' set on workspace, nothing we can do.")
            if f.url:
                # If f.url is set, download the file to the workspace
                basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename
                f.local_filename = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
            else:
                # If neither f.local_filename nor f.url is set, fail
                raise ValueError(f"OcrdFile {f} has neither 'url' nor 'local_filename', so cannot be downloaded")
            return f
210
211
    def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False):
        """
        Remove a METS `file` from the workspace.

        Arguments:
            file_id (string|:py:class:`ocrd_models.ocrd_file.OcrdFile`): `@ID` of the METS `file`
                to delete or the file itself
        Keyword Args:
            force (boolean): Continue removing even if file not found in METS.
                Implied when the workspace is in ``overwrite_mode``.
            keep_file (boolean): Whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file
                if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
                Has no effect unless ``page_recursive`` is `True`.

        Returns:
            the removed :py:class:`ocrd_models.ocrd_file.OcrdFile`, or `None` if a
            regex ``file_id`` matched nothing or a missing file was tolerated
            because of ``force``
        """
        log = getLogger('ocrd.workspace.remove_file')
        log.debug('Deleting mets:file %s', file_id)
        force = True if self.overwrite_mode else force
        if isinstance(file_id, OcrdFile):
            file_id = file_id.ID
        try:
            try:
                target = next(self.mets.find_files(ID=file_id))
            except StopIteration:
                if file_id.startswith(REGEX_PREFIX):
                    # allow empty results if filter criteria involve a regex
                    return None
                raise FileNotFoundError("File %s not found in METS" % file_id)

            wants_images = page_recursive and target.mimetype == MIMETYPE_PAGE
            if wants_images:
                with pushd_popd(self.directory):
                    page_doc = parse(self.download_file(target).local_filename, silence=True)
                    for image_path in page_doc.get_AllAlternativeImagePaths():
                        criteria = {'local_filename': image_path}
                        if page_same_group:
                            criteria['fileGrp'] = target.fileGrp
                        for image_file in self.mets.find_files(**criteria):
                            self.remove_file(image_file, keep_file=keep_file, force=force)

            if not keep_file:
                with pushd_popd(self.directory):
                    if target.local_filename:
                        log.info("rm %s [cwd=%s]", target.local_filename, self.directory)
                        unlink(target.local_filename)
                    elif force:
                        log.debug("File not locally available but --force is set: %s", target)
                    else:
                        raise Exception("File not locally available %s" % target)

            # Remove from METS only after the recursion of AlternativeImages
            self.mets.remove_file(file_id)
            return target
        except FileNotFoundError:
            if not force:
                raise
265
266
    def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False):
        """
        Remove a METS `fileGrp`.

        Arguments:
            USE (string): `@USE` of the METS `fileGrp` to delete
        Keyword Args:
            recursive (boolean): Whether to recursively delete all files in the group
            force (boolean): Continue removing even if group or containing files not found in METS.
                Implied when the workspace is in ``overwrite_mode``.
            keep_files (boolean): When deleting recursively whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file
                if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
                Has no effect unless ``page_recursive`` is `True`.
        """
        if self.overwrite_mode and not force:
            force = True

        if not (USE.startswith(REGEX_PREFIX) or USE in self.mets.file_groups or force):
            raise Exception("No such fileGrp: %s" % USE)

        parent_dirs = []
        if recursive:
            for ocrd_file in self.mets.find_files(fileGrp=USE):
                self.remove_file(ocrd_file, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group)
                if not ocrd_file.local_filename:
                    continue
                parent_dir = path.dirname(ocrd_file.local_filename)
                if parent_dir:
                    parent_dirs.append(parent_dir)

        self.mets.remove_file_group(USE, force=force, recursive=recursive)

        # PLEASE NOTE: directories are only removed if they are empty. The folder
        # named after the fileGrp (an OCR-D convention) is tried first, then the
        # folders that contained the removed files.
        with pushd_popd(self.directory):
            for candidate in [USE, *set(parent_dirs)]:
                candidate_path = Path(candidate)
                if candidate_path.is_dir() and not listdir(candidate):
                    candidate_path.rmdir()
307
308
309
    def rename_file_group(self, old, new):
        """
        Rename a METS `fileGrp`.

        Local files of the group are moved on disk, their ``local_filename``
        and (where the new ID is still free) their ``ID`` are updated, references
        to the moved files in local PAGE-XML documents are rewritten, and
        finally the `fileGrp` itself is renamed in the METS.

        Arguments:
            old (string): `@USE` of the METS `fileGrp` to rename
            new (string): `@USE` of the METS `fileGrp` to rename as

        Raises:
            ValueError: if ``old`` does not exist or ``new`` already exists
        """
        # Function-scope import so this method does not depend on module-level changes
        from re import escape

        log = getLogger('ocrd.workspace.rename_file_group')

        if old not in self.mets.file_groups:
            raise ValueError(f"No such fileGrp: {old}")
        if new in self.mets.file_groups:
            raise ValueError(f"fileGrp already exists {new}")

        # The fileGrp names are data, not regex syntax: escape ``old`` for use
        # in patterns and protect backslashes of ``new`` in replacement templates.
        old_pattern = escape(old)
        new_template = new.replace('\\', '\\\\')

        with pushd_popd(self.directory):
            # create workspace dir ``new``
            log.info("mkdir %s", new)
            if not Path(new).is_dir():
                Path(new).mkdir()
            local_filename_replacements = {}
            log.info("Moving files")
            for mets_file in self.mets.find_files(fileGrp=old, local_only=True):
                old_local_filename = str(mets_file.local_filename)
                # Directory part
                new_local_filename = sub(r'^%s/' % old_pattern, r'%s/' % new_template, old_local_filename)
                # File part
                new_local_filename = sub(r'/%s' % old_pattern, r'/%s' % new_template, new_local_filename)
                local_filename_replacements[old_local_filename] = new_local_filename
                # move file from ``old`` to ``new``
                mets_file.local_filename.rename(new_local_filename)
                # change the url of ``mets:file``
                mets_file.local_filename = new_local_filename
                # change the file ID and update structMap
                new_id = sub(r'^%s' % old_pattern, r'%s' % new_template, mets_file.ID)
                try:
                    next(self.mets.find_files(ID=new_id))
                    log.warning("ID %s already exists, not changing ID while renaming %s -> %s" % (new_id, old_local_filename, new_local_filename))
                except StopIteration:
                    mets_file.ID = new_id
            # change file paths in PAGE-XML imageFilename and filename attributes
            for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE, local_only=True):
                log.info("Renaming file references in PAGE-XML %s" % page_file)
                pcgts = page_from_file(page_file)
                changed = False
                for old_local_filename, new_local_filename in local_filename_replacements.items():
                    if pcgts.get_Page().imageFilename == old_local_filename:
                        changed = True
                        log.info("Rename pc:Page/@imageFilename: %s -> %s" % (old_local_filename, new_local_filename))
                        pcgts.get_Page().imageFilename = new_local_filename
                for ai in pcgts.get_Page().get_AllAlternativeImages():
                    for old_local_filename, new_local_filename in local_filename_replacements.items():
                        if ai.filename == old_local_filename:
                            changed = True
                            log.info("Rename pc:Page/../AlternativeImage: %s -> %s" % (old_local_filename, new_local_filename))
                            ai.filename = new_local_filename
                if changed:
                    log.info("PAGE-XML changed, writing %s" % (page_file.local_filename))
                    with open(page_file.local_filename, 'w', encoding='utf-8') as f:
                        f.write(to_xml(pcgts))
            # change the ``USE`` attribute of the fileGrp
            self.mets.rename_file_group(old, new)
            # Remove the old dir (only if it is empty)
            log.info("rmdir %s", old)
            if Path(old).is_dir() and not listdir(old):
                Path(old).rmdir()
376
377
    @deprecated_alias(pageId="page_id")
    @deprecated_alias(ID="file_id")
    def add_file(self, file_grp, content=None, **kwargs):
        """
        Add a file to the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace.

        Arguments:
            file_grp (string): `@USE` of the METS `fileGrp` to add to
        Keyword Args:
            content (string|bytes): optional content to write to the file
                in the filesystem. Strings are encoded as UTF-8.
            **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.add_file`.
                ``page_id`` is mandatory (but may be `None`); ``local_filename``
                is mandatory if ``content`` is given.
        Returns:
            a new :py:class:`ocrd_models.ocrd_file.OcrdFile`
        Raises:
            ValueError: if no ``page_id`` keyword argument was passed
            Exception: if ``content`` was passed without ``local_filename``
        """
        log = getLogger('ocrd.workspace.add_file')
        local_filename = kwargs.get('local_filename')
        log.debug(
            'outputfile file_grp=%s local_filename=%s content=%s',
            file_grp,
            local_filename,
            content is not None)
        if 'page_id' not in kwargs:
            raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.")
        if content is not None and not local_filename:
            raise Exception("'content' was set but no 'local_filename'")
        if self.overwrite_mode:
            kwargs['force'] = True

        with pushd_popd(self.directory):
            if local_filename:
                # If the local filename has folder components, create those folders.
                # exist_ok avoids a race between checking for and creating the folder.
                Path(str(local_filename)).parent.mkdir(parents=True, exist_ok=True)

            # OcrdMets.add_file is called with the METS-style keyword names
            kwargs["pageId"] = kwargs.pop("page_id")
            if "file_id" in kwargs:
                kwargs["ID"] = kwargs.pop("file_id")

            ret = self.mets.add_file(file_grp, **kwargs)

            # content being set implies is_remote==False because METS server
            # does not pass file contents
            if content is not None:
                if isinstance(content, str):
                    content = bytes(content, 'utf-8')
                with open(local_filename, 'wb') as f:
                    f.write(content)

        return ret
428
429
    def save_mets(self):
        """
        Write out the current state of the METS file to the filesystem.

        For a remote workspace, saving is delegated to ``self.mets.save()``.
        Otherwise a backup is added first if automatic backups are enabled,
        and the serialized METS is written to ``self.mets_target``.
        """
        log = getLogger('ocrd.workspace.save_mets')
        if self.is_remote:
            self.mets.save()
            return
        log.debug("Saving mets '%s'", self.mets_target)
        if self.automatic_backup:
            backup_manager = WorkspaceBackupManager(self)
            backup_manager.add()
        with atomic_write(self.mets_target) as mets_out:
            serialized = self.mets.to_xml(xmllint=True)
            mets_out.write(serialized.decode('utf-8'))
442
443
    def resolve_image_exif(self, image_url):
        """
        Get the EXIF metadata about an image URL as :py:class:`ocrd_models.ocrd_exif.OcrdExif`

        The METS is searched for a file whose ``local_filename`` matches, then
        for one whose ``url`` matches (which gets downloaded); if neither is
        found, ``image_url`` is downloaded to a temporary file.

        Args:
            image_url (string) : `@href` (path or URL) of the METS `file` to inspect

        Returns:
            :py:class:`ocrd_models.ocrd_exif.OcrdExif`

        Raises:
            ValueError: if ``image_url`` is empty
        """
        if not image_url:
            # avoid "finding" just any file
            raise ValueError(f"'image_url' must be a non-empty string, not '{image_url}' ({type(image_url)})")
        href = str(image_url)

        try:
            local_match = next(self.mets.find_files(local_filename=href))
        except StopIteration:
            pass
        else:
            return exif_from_filename(local_match.local_filename)

        try:
            remote_match = next(self.mets.find_files(url=href))
        except StopIteration:
            pass
        else:
            return exif_from_filename(self.download_file(remote_match).local_filename)

        with download_temporary_file(image_url) as tmp_file:
            return exif_from_filename(tmp_file.name)
466
467
    @deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment")
    def resolve_image_as_pil(self, image_url, coords=None):
        """
        Resolve an image URL to a `PIL.Image`.

        Deprecated public entry point which forwards to
        :py:meth:`_resolve_image_as_pil`.

        Arguments:
            image_url (string): `@href` (path or URL) of the METS `file` to retrieve
        Keyword Args:
            coords (list) : Coordinates of the bounding box to cut from the image

        Returns:
            Full or cropped `PIL.Image`
        """
        pil_image = self._resolve_image_as_pil(image_url, coords=coords)
        return pil_image
482
483
    def _resolve_image_as_pil(self, image_url, coords=None):
484
        if not image_url:
485
            # avoid "finding" just any file
486
            raise Exception("Cannot resolve empty image path")
487
        log = getLogger('ocrd.workspace._resolve_image_as_pil')
488
        with pushd_popd(self.directory):
489
            try:
490
                f = next(self.mets.find_files(local_filename=str(image_url)))
491
                pil_image = Image.open(f.local_filename)
492
            except StopIteration:
493
                try:
494
                    f = next(self.mets.find_files(url=str(image_url)))
495
                    pil_image = Image.open(self.download_file(f).local_filename)
496
                except StopIteration:
497
                    with download_temporary_file(image_url) as f:
498
                        pil_image = Image.open(f.name)
499
            pil_image.load() # alloc and give up the FD
500
501
        # Pillow does not properly support higher color depths
502
        # (e.g. 16-bit or 32-bit or floating point grayscale),
503
        # clipping its dynamic range to the lower 8-bit in
504
        # many operations (including paste, putalpha, ImageStat...),
505
        # even including conversion.
506
        # Cf. Pillow#3011 Pillow#3159 Pillow#3838 (still open in 8.0)
507
        # So to be on the safe side, we must re-quantize these
508
        # to 8-bit via numpy (conversion to/from which fortunately
509
        # seems to work reliably):
510
        if (pil_image.mode.startswith('I') or
511
            pil_image.mode.startswith('F')):
512
            arr_image = np.array(pil_image)
513
            if arr_image.dtype.kind == 'i':
514
                # signed integer is *not* trustworthy in this context
515
                # (usually a mistake in the array interface)
516
                log.debug('Casting image "%s" from signed to unsigned', image_url)
517
                arr_image.dtype = np.dtype('u' + arr_image.dtype.name)
518
            if arr_image.dtype.kind == 'u':
519
                # integer needs to be scaled linearly to 8 bit
520
                # of course, an image might actually have some lower range
521
                # (e.g. 10-bit in I;16 or 20-bit in I or 4-bit in L),
522
                # but that would be guessing anyway, so here don't
523
                # make assumptions on _scale_, just reduce _precision_
524
                log.debug('Reducing image "%s" from depth %d bit to 8 bit',
525
                          image_url, arr_image.dtype.itemsize * 8)
526
                arr_image = arr_image >> 8 * (arr_image.dtype.itemsize-1)
527
                arr_image = arr_image.astype(np.uint8)
528
            elif arr_image.dtype.kind == 'f':
529
                # float needs to be scaled from [0,1.0] to [0,255]
530
                log.debug('Reducing image "%s" from floating point to 8 bit',
531
                          image_url)
532
                arr_image *= 255
533
                arr_image = arr_image.astype(np.uint8)
534
            pil_image = Image.fromarray(arr_image)
535
536
        if coords is None:
537
            return pil_image
538
539
        # FIXME: remove or replace this by (image_from_polygon+) crop_image ...
540
        log.debug("Converting PIL to OpenCV: %s", image_url)
541
        color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else  COLOR_RGB2BGR
542
        pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
543
        cv2_image = cvtColor(pil_as_np_array, color_conversion)
544
545
        poly = np.array(coords, np.int32)
546
        log.debug("Cutting region %s from %s", coords, image_url)
547
        region_cut = cv2_image[
548
            np.min(poly[:, 1]):np.max(poly[:, 1]),
549
            np.min(poly[:, 0]):np.max(poly[:, 0])
550
        ]
551
        return Image.fromarray(region_cut)
552
553
    def image_from_page(self, page, page_id,
554
                        fill='background', transparency=False,
555
                        feature_selector='', feature_filter='', filename=''):
556
        """Extract an image for a PAGE-XML page from the workspace.
557
558
        Args:
559
            page (:py:class:`ocrd_models.ocrd_page.PageType`): a PAGE `PageType` object
560
            page_id (string): its `@ID` in the METS physical `structMap`
561
        Keyword Args:
562
            fill (string): a `PIL` color specifier, or `background` or `none`
563
            transparency (boolean): whether to add an alpha channel for masking
564
            feature_selector (string): a comma-separated list of `@comments` classes
565
            feature_filter (string): a comma-separated list of `@comments` classes
566
            filename (string): which file path to use
567
568
        Extract a `PIL.Image` from ``page``, either from its `AlternativeImage`
569
        (if it exists), or from its `@imageFilename` (otherwise). Also crop it,
570
        if a `Border` exists, and rotate it, if any `@orientation` angle is
571
        annotated.
572
573
        If ``filename`` is given, then among `@imageFilename` and the available
574
        `AlternativeImage/@filename` images, pick that one, or raise an error.
575
576
        If ``feature_selector`` and/or ``feature_filter`` is given, then
577
        among the `@imageFilename` image and the available AlternativeImages,
578
        select/filter the richest one which contains all of the selected,
579
        but none of the filtered features (i.e. `@comments` classes), or
580
        raise an error.
581
582
        (Required and produced features need not be in the same order, so
583
        ``feature_selector`` is merely a mask specifying Boolean AND, and
584
        ``feature_filter`` is merely a mask specifying Boolean OR.)
585
586
        If the chosen image does not have the feature `"cropped"` yet, but
587
        a `Border` exists, and unless `"cropped"` is being filtered, then crop it.
588
        Likewise, if the chosen image does not have the feature `"deskewed"` yet,
589
        but an `@orientation` angle is annotated, and unless `"deskewed"` is being
590
        filtered, then rotate it. (However, if `@orientation` is above the
591
        [-45°,45°] interval, then apply as much transposition as possible first,
592
        unless `"rotated-90"` / `"rotated-180"` / `"rotated-270"` is being filtered.)
593
594
        Cropping uses a polygon mask (not just the bounding box rectangle).
595
        Areas outside the polygon will be filled according to ``fill``:
596
597
        \b
598
        - if `"background"` (the default),
599
          then fill with the median color of the image;
600
        - else if `"none"`, then avoid masking polygons where possible
601
          (i.e. when cropping) or revert to the default (i.e. when rotating)
602
        - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.
603
604
        Moreover, if ``transparency`` is true, and unless the image already
605
        has an alpha channel, then add an alpha channel which is fully opaque
606
        before cropping and rotating. (Thus, unexposed/masked areas will be
607
        transparent afterwards for consumers that can interpret alpha channels).
608
609
        Returns:
610
            a tuple of
611
             * the extracted `PIL.Image`,
612
             * a `dict` with information about the extracted image:
613
614
               - `"transform"`: a `Numpy` array with an affine transform which
615
                   converts from absolute coordinates to those relative to the image,
616
                   i.e. after cropping to the page's border / bounding box (if any)
617
                   and deskewing with the page's orientation angle (if any)
618
               - `"angle"`: the rotation/reflection angle applied to the image so far,
619
               - `"features"`: the `AlternativeImage` `@comments` for the image, i.e.
620
                 names of all applied operations that lead up to this result,
621
             * an :py:class:`ocrd_models.ocrd_exif.OcrdExif` instance associated with
622
               the original image.
623
624
        (The first two can be used to annotate a new `AlternativeImage`,
625
         or be passed down with :py:meth:`image_from_segment`.)
626
627
        Examples:
628
629
         * get a raw (colored) but already deskewed and cropped image::
630
631
                page_image, page_coords, page_image_info = workspace.image_from_page(
632
                    page, page_id,
633
                    feature_selector='deskewed,cropped',
634
                    feature_filter='binarized,grayscale_normalized')
635
        """
636
        log = getLogger('ocrd.workspace.image_from_page')
637
        page_image_info = self.resolve_image_exif(page.imageFilename)
638
        page_image = self._resolve_image_as_pil(page.imageFilename)
639
        page_coords = dict()
640
        # use identity as initial affine coordinate transform:
641
        page_coords['transform'] = np.eye(3)
642
        # interim bbox (updated with each change to the transform):
643
        page_bbox = [0, 0, page_image.width, page_image.height]
644
        page_xywh = {'x': 0, 'y': 0,
645
                     'w': page_image.width, 'h': page_image.height}
646
647
        border = page.get_Border()
648
        # page angle: PAGE @orientation is defined clockwise,
649
        # whereas PIL/ndimage rotation is in mathematical direction:
650
        page_coords['angle'] = -(page.get_orientation() or 0)
651
        # map angle from (-180,180] to [0,360], and partition into multiples of 90;
652
        # but avoid unnecessary large remainders, i.e. split symmetrically:
653
        orientation = (page_coords['angle'] + 45) % 360
654
        orientation = orientation - (orientation % 90)
655
        skew = (page_coords['angle'] % 360) - orientation
656
        skew = 180 - (180 - skew) % 360 # map to [-45,45]
657
        page_coords['angle'] = 0 # nothing applied yet (depends on filters)
658
        log.debug("page '%s' has %s orientation=%d skew=%.2f",
659
                  page_id, "border," if border else "", orientation, skew)
660
661
        # initialize AlternativeImage@comments classes as empty:
662
        page_coords['features'] = ''
663
        best_image = None
664
        alternative_images = page.get_AlternativeImage()
665 View Code Duplication
        if alternative_images:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
666
            # (e.g. from page-level cropping, binarization, deskewing or despeckling)
667
            best_features = set()
668
            auto_features = {'cropped', 'deskewed', 'rotated-90', 'rotated-180', 'rotated-270'}
669
            # search to the end, because by convention we always append,
670
            # and among multiple satisfactory images we want the most recent,
671
            # but also ensure that we get the richest feature set, i.e. most
672
            # of those features that we cannot reproduce automatically below
673
            for alternative_image in alternative_images:
674
                if filename and filename != alternative_image.filename:
675
                    continue
676
                features = alternative_image.get_comments()
677
                if not features:
678
                    log.warning("AlternativeImage %d for page '%s' does not have any feature attributes",
679
                                alternative_images.index(alternative_image) + 1, page_id)
680
                    features = ''
681
                featureset = set(features.split(','))
682
                if (all(feature in featureset
683
                        for feature in feature_selector.split(',') if feature) and
684
                    not any(feature in featureset
685
                            for feature in feature_filter.split(',') if feature) and
686
                    len(featureset.difference(auto_features)) >= \
687
                    len(best_features.difference(auto_features))):
688
                    best_features = featureset
689
                    best_image = alternative_image
690
            if best_image:
691
                log.debug("Using AlternativeImage %d %s for page '%s'",
692
                          alternative_images.index(best_image) + 1,
693
                          best_features, page_id)
694
                page_image = self._resolve_image_as_pil(best_image.get_filename())
695
                page_coords['features'] = best_image.get_comments() # including duplicates
696
697
        # adjust the coord transformation to the steps applied on the image,
698
        # and apply steps on the existing image in case it is missing there,
699
        # but traverse all steps (crop/reflect/rotate) in a particular order:
700
        # - existing image features take priority (in the order annotated),
701
        # - next is cropping (if necessary but not already applied),
702
        # - next is reflection (if necessary but not already applied),
703
        # - next is rotation (if necessary but not already applied).
704
        # This helps deal with arbitrary workflows (e.g. crop then deskew,
705
        # or deskew then crop), regardless of where images are generated.
706
        alternative_image_features = page_coords['features'].split(',')
707
        for duplicate_feature in set([feature for feature in alternative_image_features
708
                                      # features relevant in reconstructing coordinates:
709
                                      if (feature in ['cropped', 'deskewed', 'rotated-90',
710
                                                      'rotated-180', 'rotated-270'] and
711
                                          alternative_image_features.count(feature) > 1)]):
712
            log.error("Duplicate feature %s in AlternativeImage for page '%s'",
713
                      duplicate_feature, page_id)
714
        for i, feature in enumerate(alternative_image_features +
715
                                    (['cropped']
716
                                     if (border and
717
                                         not 'cropped' in alternative_image_features and
718
                                         not 'cropped' in feature_filter.split(','))
719
                                     else []) +
720
                                    (['rotated-%d' % orientation]
721
                                     if (orientation and
722
                                         not 'rotated-%d' % orientation in alternative_image_features and
723
                                         not 'rotated-%d' % orientation in feature_filter.split(','))
724
                                     else []) +
725
                                    (['deskewed']
726
                                     if (skew and
727
                                         not 'deskewed' in alternative_image_features and
728
                                         not 'deskewed' in feature_filter.split(','))
729
                                     else []) +
730
                                    # not a feature to be added, but merely as a fallback position
731
                                    # to always enter loop at i == len(alternative_image_features)
732
                                    ['_check']):
733
            # image geometry vs feature consistency can only be checked
734
            # after all features on the existing AlternativeImage have
735
            # been adjusted for in the transform, and when there is a mismatch,
736
            # additional steps applied here would only repeat the respective
737
            # error message; so we only check once at the boundary between
738
            # existing and new features
739
            # FIXME we should check/enforce consistency when _adding_ AlternativeImage
740
            if (i == len(alternative_image_features) and
741
                not (page_xywh['w'] - 2 < page_image.width < page_xywh['w'] + 2 and
742
                     page_xywh['h'] - 2 < page_image.height < page_xywh['h'] + 2)):
743
                log.error('page "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
744
                          page_id, page_coords['features'],
745
                          page_image.width, page_image.height,
746
                          page_xywh['w'], page_xywh['h'])
747
            name = "%s for page '%s'" % ("AlternativeImage" if best_image
748
                                         else "original image", page_id)
749
            # adjust transform to feature, and ensure feature is applied to image
750
            if feature == 'cropped':
751
                page_image, page_coords, page_xywh = _crop(
752
                    log, name, border, page_image, page_coords,
753
                    fill=fill, transparency=transparency)
754
            elif feature == 'rotated-%d' % orientation:
755
                page_image, page_coords, page_xywh = _reflect(
756
                    log, name, orientation, page_image, page_coords, page_xywh)
757
            elif feature == 'deskewed':
758
                page_image, page_coords, page_xywh = _rotate(
759
                    log, name, skew, border, page_image, page_coords, page_xywh,
760
                    fill=fill, transparency=transparency)
761
762
        # verify constraints again:
763
        if filename and not getattr(page_image, 'filename', '').endswith(filename):
764
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
765
                            'filename="%s" in page "%s"' % (
766
                                filename, page_id))
767
        if not all(feature in page_coords['features']
768
                   for feature in feature_selector.split(',') if feature):
769
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
770
                            'selector="%s" in page "%s"' % (
771
                                feature_selector, page_id))
772
        if any(feature in page_coords['features']
773
               for feature in feature_filter.split(',') if feature):
774
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
775
                            'filter="%s" in page "%s"' % (
776
                                feature_filter, page_id))
777
        page_image.format = 'PNG' # workaround for tesserocr#194
778
        return page_image, page_coords, page_image_info
779
780
    def image_from_segment(self, segment, parent_image, parent_coords,
                           fill='background', transparency=False,
                           feature_selector='', feature_filter='', filename=''):
        """Extract an image for a PAGE-XML hierarchy segment from its parent's image.

        Args:
            segment (object): a PAGE segment object \
                (i.e. :py:class:`~ocrd_models.ocrd_page.TextRegionType` \
                or :py:class:`~ocrd_models.ocrd_page.TextLineType` \
                or :py:class:`~ocrd_models.ocrd_page.WordType` \
                or :py:class:`~ocrd_models.ocrd_page.GlyphType`)
            parent_image (`PIL.Image`): image of the `segment`'s parent
            parent_coords (dict): a `dict` with information about `parent_image`:

               - `"transform"`: a `Numpy` array with an affine transform which
                 converts from absolute coordinates to those relative to the image,
                 i.e. after applying all operations (starting with the original image)
               - `"angle"`: the rotation/reflection angle applied to the image so far,
               - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e.
                 names of all operations that lead up to this result.
        Keyword Args:
            fill (string): a `PIL` color specifier, or `background` or `none`
            transparency (boolean): whether to add an alpha channel for masking
            feature_selector (string): a comma-separated list of ``@comments`` classes
            feature_filter (string): a comma-separated list of ``@comments`` classes
            filename (string): which file path to use

        Extract a `PIL.Image` from `segment`, either from ``AlternativeImage``
        (if it exists), or producing a new image via cropping from `parent_image`
        (otherwise). Pass in `parent_image` and `parent_coords` from the result
        of the next higher-level of this function or from :py:meth:`image_from_page`.

        If ``filename`` is given, then among the available `AlternativeImage/@filename`
        images, pick that one, or raise an error.

        If ``feature_selector`` and/or ``feature_filter`` is given, then
        among the cropped `parent_image` and the available AlternativeImages,
        select/filter the richest one which contains all of the selected,
        but none of the filtered features (i.e. ``@comments`` classes), or
        raise an error.

        (Required and produced features need not be in the same order, so
        `feature_selector` is merely a mask specifying Boolean AND, and
        `feature_filter` is merely a mask specifying Boolean OR.)

        Cropping uses a polygon mask (not just the bounding box rectangle).
        Areas outside the polygon will be filled according to `fill`:

        \b
        - if `"background"` (the default),
          then fill with the median color of the image;
        - else if `"none"`, then avoid masking polygons where possible
          (i.e. when cropping) or revert to the default (i.e. when rotating)
        - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.

        Moreover, if `transparency` is true, and unless the image already
        has an alpha channel, then add an alpha channel which is fully opaque
        before cropping and rotating. (Thus, unexposed/masked areas will be
        transparent afterwards for consumers that can interpret alpha channels).

        When cropping, compensate any ``@orientation`` angle annotated for the
        parent (from parent-level deskewing) by rotating the segment coordinates
        in an inverse transformation (i.e. translation to center, then passive
        rotation, and translation back).

        Regardless, if any ``@orientation`` angle is annotated for the segment
        (from segment-level deskewing), and the chosen image does not have
        the feature `"deskewed"` yet, and unless `"deskewed"` is being filtered,
        then rotate it - compensating for any previous `"angle"`. (However,
        if ``@orientation`` is above the [-45°,45°] interval, then apply as much
        transposition as possible first, unless `"rotated-90"` / `"rotated-180"` /
        `"rotated-270"` is being filtered.)

        Returns:
            a tuple of
             * the extracted `PIL.Image`,
             * a `dict` with information about the extracted image:

               - `"transform"`: a `Numpy` array with an affine transform which
                   converts from absolute coordinates to those relative to the image,
                   i.e. after applying all parent operations, and then cropping to
                   the segment's bounding box, and deskewing with the segment's
                   orientation angle (if any)
               - `"angle"`: the rotation/reflection angle applied to the image so far,
               - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e.
                 names of all applied operations that lead up to this result.

        (These can be used to create a new ``AlternativeImage``, or passed down
         for :py:meth:`image_from_segment` calls on lower hierarchy levels.)

        Raises:
            Exception: if the resulting image does not satisfy ``filename``,
                ``feature_selector`` or ``feature_filter``.

        Examples:

         * get a raw (colored) but already deskewed and cropped image::

                region_image, region_coords = workspace.image_from_segment(
                    region, page_image, page_coords,
                    feature_selector='deskewed,cropped',
                    feature_filter='binarized,grayscale_normalized')
        """
        log = getLogger('ocrd.workspace.image_from_segment')
        # split the comma-separated masks once (ignoring empty entries):
        selected_features = [feature for feature in feature_selector.split(',') if feature]
        filtered_features = [feature for feature in feature_filter.split(',') if feature]
        # note: We should mask overlapping neighbouring segments here,
        # but finding the right clipping rules can be difficult if operating
        # on the raw (non-binary) image data alone: for each intersection, it
        # must be decided which one of either segment or neighbour to assign,
        # e.g. an ImageRegion which properly contains our TextRegion should be
        # completely ignored, but an ImageRegion which is properly contained
        # in our TextRegion should be completely masked, while partial overlap
        # may be more difficult to decide. On the other hand, on the binary image,
        # we can use connected component analysis to mask foreground areas which
        # originate in the neighbouring regions. But that would introduce either
        # the assumption that the input has already been binarized, or a dependency
        # on some ad-hoc binarization method. Thus, it is preferable to use
        # a dedicated processor for this (which produces clipped AlternativeImage
        # or reduced polygon coordinates).
        segment_image, segment_coords, segment_xywh = _crop(
            log, "parent image for segment '%s'" % segment.id,
            segment, parent_image, parent_coords,
            fill=fill, transparency=transparency)

        # Semantics of missing @orientation at region level could be either
        # - inherited from page level: same as line or word level (no @orientation),
        # - zero (unrotate page angle): different from line or word level (because
        #   otherwise deskewing would never have an effect on lines and words)
        # The PAGE specification is silent here (but does generally not concern itself
        # much with AlternativeImage coordinate consistency).
        # Since our (generateDS-backed) ocrd_page supports the zero/none distinction,
        # we choose the former (i.e. None is inheritance).
        if 'orientation' in segment.__dict__ and segment.get_orientation() is not None:
            # region angle: PAGE @orientation is defined clockwise,
            # whereas PIL/ndimage rotation is in mathematical direction:
            angle = -segment.get_orientation()
            # @orientation is always absolute; if higher levels
            # have already rotated, then we must compensate:
            angle -= parent_coords['angle']
            # map angle from (-180,180] to [0,360], and partition into multiples of 90;
            # but avoid unnecessary large remainders, i.e. split symmetrically:
            orientation = (angle + 45) % 360
            orientation = orientation - (orientation % 90)
            skew = (angle % 360) - orientation
            skew = 180 - (180 - skew) % 360 # map to [-45,45]
            log.debug("segment '%s' has orientation=%d skew=%.2f",
                      segment.id, orientation, skew)
        else:
            orientation = 0
            skew = 0
        rotation_feature = 'rotated-%d' % orientation
        segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)

        # initialize AlternativeImage@comments classes from parent, except
        # for those operations that can apply on multiple hierarchy levels:
        segment_coords['features'] = ','.join(
            [feature for feature in parent_coords['features'].split(',')
             if feature in ['binarized', 'grayscale_normalized',
                            'despeckled', 'dewarped']])

        best_image = None
        alternative_images = segment.get_AlternativeImage()
        if alternative_images:
            # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
            best_features = set()
            best_index = 0
            auto_features = {'cropped', 'deskewed', 'rotated-90', 'rotated-180', 'rotated-270'}
            # search to the end, because by convention we always append,
            # and among multiple satisfactory images we want the most recent,
            # but also ensure that we get the richest feature set, i.e. most
            # of those features that we cannot reproduce automatically below
            for index, alternative_image in enumerate(alternative_images, 1):
                if filename and filename != alternative_image.filename:
                    continue
                features = alternative_image.get_comments()
                if not features:
                    log.warning("AlternativeImage %d for segment '%s' does not have any feature attributes",
                                index, segment.id)
                    features = ''
                featureset = set(features.split(','))
                if (all(feature in featureset for feature in selected_features) and
                    not any(feature in featureset for feature in filtered_features) and
                    len(featureset.difference(auto_features)) >= \
                    len(best_features.difference(auto_features))):
                    best_features = featureset
                    best_image = alternative_image
                    best_index = index
            if best_image:
                log.debug("Using AlternativeImage %d %s for segment '%s'",
                          best_index, best_features, segment.id)
                # load the image that was actually selected (not merely the
                # last candidate visited by the loop above):
                segment_image = self._resolve_image_as_pil(best_image.get_filename())
                segment_coords['features'] = best_image.get_comments() # including duplicates

        alternative_image_features = segment_coords['features'].split(',')
        for duplicate_feature in set([feature for feature in alternative_image_features
                                      # features relevant in reconstructing coordinates:
                                      if (feature in ['deskewed', 'rotated-90',
                                                      'rotated-180', 'rotated-270'] and
                                          alternative_image_features.count(feature) > 1)]):
            log.error("Duplicate feature %s in AlternativeImage for segment '%s'",
                      duplicate_feature, segment.id)
        # steps to traverse: first those already annotated on the image (in order),
        # then reflection and rotation where still necessary and not filtered:
        pending_features = list(alternative_image_features)
        if (orientation and
            rotation_feature not in alternative_image_features and
            rotation_feature not in filtered_features):
            pending_features.append(rotation_feature)
        if (skew and
            'deskewed' not in alternative_image_features and
            'deskewed' not in filtered_features):
            pending_features.append('deskewed')
        # not a feature to be added, but merely as a fallback position
        # to always enter loop at i == len(alternative_image_features)
        pending_features.append('_check')
        for i, feature in enumerate(pending_features):
            # image geometry vs feature consistency can only be checked
            # after all features on the existing AlternativeImage have
            # been adjusted for in the transform, and when there is a mismatch,
            # additional steps applied here would only repeat the respective
            # error message; so we only check once at the boundary between
            # existing and new features
            # FIXME we should enforce consistency here (i.e. split into transposition
            #       and minimal rotation, rotation always reshapes, rescaling never happens)
            # FIXME: inconsistency currently unavoidable with line-level dewarping (which increases height)
            if (i == len(alternative_image_features) and
                not (segment_xywh['w'] - 2 < segment_image.width < segment_xywh['w'] + 2 and
                     segment_xywh['h'] - 2 < segment_image.height < segment_xywh['h'] + 2)):
                log.error('segment "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                          segment.id, segment_coords['features'],
                          segment_image.width, segment_image.height,
                          segment_xywh['w'], segment_xywh['h'])
            name = "%s for segment '%s'" % ("AlternativeImage" if best_image
                                            else "parent image", segment.id)
            # adjust transform to feature, and ensure feature is applied to image
            if feature == rotation_feature:
                segment_image, segment_coords, segment_xywh = _reflect(
                    log, name, orientation, segment_image, segment_coords, segment_xywh)
            elif feature == 'deskewed':
                segment_image, segment_coords, segment_xywh = _rotate(
                    log, name, skew, segment, segment_image, segment_coords, segment_xywh,
                    fill=fill, transparency=transparency)

        # verify constraints again:
        if filename and not getattr(segment_image, 'filename', '').endswith(filename):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'filename="%s" in segment "%s"' % (
                                filename, segment.id))
        if not all(feature in segment_coords['features']
                   for feature in selected_features):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'selector="%s" in segment "%s"' % (
                                feature_selector, segment.id))
        if any(feature in segment_coords['features']
               for feature in filtered_features):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'filter="%s" in segment "%s"' % (
                                feature_filter, segment.id))
        segment_image.format = 'PNG' # workaround for tesserocr#194
        return segment_image, segment_coords
1033
1034
    # pylint: disable=redefined-builtin
1035
    def save_image_file(self, image,
                        file_id,
                        file_grp,
                        page_id=None,
                        mimetype='image/png',
                        force=False):
        """Serialize an image and register it as a new ``file`` in the METS.

        Args:
            image (PIL.Image): derived image to save
            file_id (string): `@ID` of the METS `file` to use
            file_grp (string): `@USE` of the METS `fileGrp` to use
        Keyword Args:
            page_id (string): `@ID` in the METS physical `structMap` to use
            mimetype (string): MIME type of the image format to serialize as
            force (boolean): whether to replace any existing `file` with that `@ID`
                (always enabled when the workspace is in overwrite mode)

        The image is encoded in memory in the format belonging to ``mimetype``,
        and the resulting bytes are handed to :py:meth:`add_file` together with
        a path made of ``file_grp`` and ``file_id`` plus the filename extension
        belonging to ``mimetype``.

        Returns:
            The path that was passed to :py:meth:`add_file` as local filename,
            i.e. ``file_grp`` joined with ``file_id`` and the extension.
        """
        log = getLogger('ocrd.workspace.save_image_file')
        # overwrite mode of the workspace implies replacing existing files
        force = True if self.overwrite_mode else force
        # encode the image into an in-memory buffer first
        buffer = io.BytesIO()
        image.save(buffer, format=MIME_TO_PIL[mimetype])
        basename = '%s%s' % (file_id, MIME_TO_EXT[mimetype])
        file_path = str(Path(file_grp, basename))
        ocrd_file = self.add_file(file_grp,
                                  file_id=file_id,
                                  page_id=page_id,
                                  local_filename=file_path,
                                  mimetype=mimetype,
                                  content=buffer.getvalue(),
                                  force=force)
        log.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, file_grp, ocrd_file.local_filename)
        return file_path
1075
1076
    def find_files(self, *args, **kwargs):
        """
        Look up ``mets:file`` entries in the wrapped METS document.

        Delegator to :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`,
        called while the workspace directory is the current directory.
        The keywords ``page_id``, ``file_id`` and ``file_grp`` are accepted
        as aliases and passed on as ``pageId``, ``ID`` and ``fileGrp``.

        Keyword Args:
            **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`
        Returns:
            The result of the delegate call, unchanged
            (see :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`).
        """
        log = getLogger('ocrd.workspace.find_files')
        log.debug('find files in mets. kwargs=%s' % kwargs)
        # translate snake_case aliases into the keyword names of the delegate
        for alias, mets_keyword in (("page_id", "pageId"),
                                    ("file_id", "ID"),
                                    ("file_grp", "fileGrp")):
            if alias in kwargs:
                kwargs[mets_keyword] = kwargs.pop(alias)
        # NOTE(review): if the delegate evaluates lazily, its work may happen
        # only after the directory has been restored - confirm against OcrdMets
        with pushd_popd(self.directory):
            found = self.mets.find_files(*args, **kwargs)
            return found
1097
1098
def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs):
    """
    Cut the image of ``segment`` out of ``parent_image``.

    The segment polygon is resolved relative to the parent image, the parent
    is masked with that polygon and then cropped to the polygon's bounding box.
    The only case where the parent image is passed through untouched is a
    page-level segment (``BorderType``) whose parent already carries ``op``
    in its ``features``.

    Args:
        log: logger used for progress messages
        name (str): human-readable segment name for log messages
        segment: PAGE segment whose outline determines the crop
        parent_image: image to crop from
        parent_coords (dict): coordinate record of ``parent_image``
            (reads ``features`` and ``transform``); not modified
        op (str): feature name to record; ``'recropped'`` only logs
    Keyword Args:
        **kwargs: passed on to ``image_from_polygon``
    Returns:
        tuple of the segment image, its coordinate record (a shallow copy of
        ``parent_coords`` with updated ``transform`` and possibly ``features``)
        and the bounding box of the segment as an xywh dict.
    """
    segment_coords = parent_coords.copy()
    # polygon outline of the segment relative to the parent image,
    # and its relative bounding box:
    segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords)
    segment_bbox = bbox_from_polygon(segment_polygon)
    # size of the segment in the parent image after cropping
    # (i.e. possibly different from size before rotation at the parent, but
    #  also possibly different from size after rotation below/AlternativeImage):
    segment_xywh = xywh_from_bbox(*segment_bbox)
    at_page_level = isinstance(segment, BorderType)
    if at_page_level and op in parent_coords['features']:
        # page image already has this operation applied: nothing left to crop
        segment_image = parent_image
    else:
        # below page level we always crop
        if op == 'recropped':
            log.info("Recropping %s", name)
        elif at_page_level:
            log.info("Cropping %s", name)
            segment_coords['features'] += ',' + op
        # mask with the segment polygon, then reduce to its bounding box:
        masked_image = image_from_polygon(parent_image, segment_polygon, **kwargs)
        segment_image = crop_image(masked_image, box=segment_bbox)
    # subtract the offset from the parent in the affine coordinate transform
    # (consistent with image cropping):
    offset = np.array([-segment_bbox[0],
                       -segment_bbox[1]])
    segment_coords['transform'] = shift_coordinates(
        parent_coords['transform'], offset)
    return segment_image, segment_coords, segment_xywh
1129
1130
def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh):
    """
    Apply a multiple-of-90° re-orientation to a segment image and its coordinates.

    The affine transform, the ``angle`` and the canvas size are always updated;
    the image itself is only transposed if ``segment_coords['features']`` does
    not yet contain ``rotated-<orientation>`` (which is then appended).

    Args:
        log: logger used for progress messages
        name (str): human-readable segment name for log messages
        orientation (int): 90, 180 or 270; any other value maps to
            no transposition method (``None``)
        segment_image: image to transpose
        segment_coords (dict): coordinate record, updated in place
            (``transform``, ``angle``, ``features``)
        segment_xywh (dict): bounding box, ``w`` and ``h`` updated in place
    Returns:
        tuple of segment image, coordinate record and xywh dict.
    """
    transposition_methods = {
        90: Image.ROTATE_90,
        180: Image.ROTATE_180,
        270: Image.ROTATE_270,
    }
    # deliberately no default: unknown orientations yield None
    transposition = transposition_methods.get(orientation)
    width, height = segment_xywh['w'], segment_xywh['h']
    # transpose around the center in the affine coordinate transform
    # (consistent with image transposition or AlternativeImage below):
    center = np.array([0.5 * width,
                       0.5 * height])
    segment_coords['transform'] = transpose_coordinates(
        segment_coords['transform'], transposition, center)
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_transposition(
        [width, height], transposition)
    segment_coords['angle'] += orientation
    # transpose the image, if (still) necessary:
    feature = 'rotated-%d' % orientation
    if feature not in segment_coords['features']:
        log.info("Transposing %s by %d°", name, orientation)
        segment_image = transpose_image(segment_image, transposition)
        segment_coords['features'] += ',' + feature
    return segment_image, segment_coords, segment_xywh
1151
1152
def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs):
    """
    Apply a deskewing rotation to a segment image and its coordinates.

    The affine transform, the ``angle`` and the canvas size are always updated.
    If ``segment_coords['features']`` does not yet contain ``deskewed``, the
    image is rotated, the feature is appended, and (for cropped segments) the
    result is re-cropped via ``_crop``. Otherwise the image is left alone and,
    for cropped segments, only the coordinates are shifted as if re-cropping.

    Args:
        log: logger used for progress messages
        name (str): human-readable segment name for log messages
        skew (float): rotation angle
        segment: PAGE segment, or a falsy value to skip any re-cropping
        segment_image: image to rotate
        segment_coords (dict): coordinate record, updated in place
            (``transform``, ``angle``, ``features``)
        segment_xywh (dict): bounding box, ``w`` and ``h`` updated in place
    Keyword Args:
        **kwargs: passed on to ``rotate_image`` and ``_crop``
    Returns:
        tuple of segment image, coordinate record and xywh dict.
    """
    def _is_cropped(coords):
        # below page level we always crop; the page itself only if marked so
        return (segment and
                (not isinstance(segment, BorderType) or
                 'cropped' in coords['features']))

    width, height = segment_xywh['w'], segment_xywh['h']
    # rotate around the center in the affine coordinate transform
    # (consistent with image rotation or AlternativeImage below):
    center = np.array([0.5 * width,
                       0.5 * height])
    segment_coords['transform'] = rotate_coordinates(
        segment_coords['transform'], skew, center)
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_rotation(
        [width, height], skew)
    segment_coords['angle'] += skew

    if 'deskewed' in segment_coords['features']:
        # image is already deskewed
        if _is_cropped(segment_coords):
            # only shift coordinates as if re-cropping
            recrop_polygon = coordinates_of_segment(segment, segment_image, segment_coords)
            recrop_bbox = bbox_from_polygon(recrop_polygon)
            segment_xywh = xywh_from_bbox(*recrop_bbox)
            segment_coords['transform'] = shift_coordinates(
                segment_coords['transform'],
                np.array([-recrop_bbox[0],
                          -recrop_bbox[1]]))
        return segment_image, segment_coords, segment_xywh

    # deskewing is (still) necessary:
    log.info("Rotating %s by %.2f°", name, skew)
    segment_image = rotate_image(segment_image, skew, **kwargs)
    segment_coords['features'] += ',deskewed'
    if _is_cropped(segment_coords):
        # re-crop to new bbox (which may deviate
        # if segment polygon was not a rectangle)
        segment_image, segment_coords, segment_xywh = _crop(
            log, name, segment, segment_image, segment_coords,
            op='recropped', **kwargs)
    return segment_image, segment_coords, segment_xywh
1187
1188
def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwargs):
    """
    Resize a segment image linearly and update its coordinates accordingly.

    The affine transform, the ``scale`` entry and the box size are always
    updated; the image itself is only resized if
    ``segment_coords['features']`` does not yet contain ``scaled`` (which is
    then appended).

    Args:
        log: logger used for progress messages
        name (str): human-readable segment name for log messages
        factor (float): zoom factor, applied equally to both axes
        segment_image: image to resize
        segment_coords (dict): coordinate record, updated in place
            (``transform``, ``scale``, ``features``); a missing ``scale``
            is treated as 1.0
        segment_xywh (dict): bounding box, ``w`` and ``h`` are multiplied
            by ``factor`` in place
    Keyword Args:
        **kwargs: accepted for signature parity with the sibling helpers; unused
    Returns:
        tuple of segment image, coordinate record and xywh dict.
    """
    # NOTE(review): static analysis reported `scale_coordinates` as undefined
    # in this module, i.e. calling this function would raise NameError.
    # Importing it here keeps the block self-sufficient (and is harmless if
    # the module-level import exists after all). Assumes `ocrd_utils`
    # exports `scale_coordinates` alongside the other *_coordinates helpers
    # -- confirm.
    from ocrd_utils import scale_coordinates

    # Resize linearly
    segment_coords['transform'] = scale_coordinates(
        segment_coords['transform'], [factor, factor])
    segment_coords['scale'] = segment_coords.setdefault('scale', 1.0) * factor
    segment_xywh['w'] *= factor
    segment_xywh['h'] *= factor
    # resize, if (still) necessary
    if 'scaled' not in segment_coords['features']:
        log.info("Scaling %s by %.2f", name, factor)
        segment_coords['features'] += ',scaled'
        # FIXME: validate factor against PAGE-XML attributes
        # FIXME: factor should become less precise due to rounding
        segment_image = segment_image.resize((int(segment_image.width * factor),
                                              int(segment_image.height * factor)),
                                             # slowest, but highest quality:
                                             Image.BICUBIC)
    return segment_image, segment_coords, segment_xywh
1206