Passed
Pull Request — master (#889)
by Konstantin
02:28
created

ocrd.workspace.Workspace.find_files()   A

Complexity

Conditions 5

Size

Total Lines 21
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 21
rs 9.3333
c 0
b 0
f 0
cc 5
nop 3
1
import io
2
from os import makedirs, unlink, listdir, path
3
from pathlib import Path
4
from shutil import move, copyfileobj
5
from re import sub
6
from tempfile import NamedTemporaryFile
7
from contextlib import contextmanager
8
9
from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor
10
from PIL import Image
11
import numpy as np
12
from deprecated.sphinx import deprecated
13
import requests
14
15
from ocrd_models import OcrdMets, OcrdFile
16
from ocrd_models.ocrd_page import parse, BorderType, to_xml
17
from ocrd_modelfactory import exif_from_filename, page_from_file
18
from ocrd_utils import (
19
    atomic_write,
20
    getLogger,
21
    image_from_polygon,
22
    coordinates_of_segment,
23
    adjust_canvas_to_rotation,
24
    adjust_canvas_to_transposition,
25
    shift_coordinates,
26
    rotate_coordinates,
27
    transform_coordinates,
28
    transpose_coordinates,
29
    crop_image,
30
    rotate_image,
31
    transpose_image,
32
    bbox_from_polygon,
33
    polygon_from_points,
34
    xywh_from_bbox,
35
    pushd_popd,
36
    is_local_filename,
37
    deprecated_alias,
38
    MIME_TO_EXT,
39
    MIME_TO_PIL,
40
    MIMETYPE_PAGE,
41
    REGEX_PREFIX
42
)
43
44
from .workspace_backup import WorkspaceBackupManager
45
46
__all__ = ['Workspace']
47
48
@contextmanager
def download_temporary_file(url):
    """
    Download ``url`` into a named temporary file and yield the open file object.

    The temporary file is created with the prefix ``ocrd-download-`` and is
    removed again when the ``with`` block using this context manager exits.

    Args:
        url (string): URL to retrieve with an HTTP GET request

    Yields:
        the open temporary file object; its ``name`` attribute is the path
        of the downloaded data on disk
    """
    with NamedTemporaryFile(prefix='ocrd-download-') as f:
        with requests.get(url) as r:
            f.write(r.content)
        # Callers re-open the download via ``f.name``, so the buffered bytes
        # must be pushed to disk before the file is handed out.
        f.flush()
        yield f
54
55
56
class Workspace():
57
    """
58
    A workspace is a temporary directory set up for a processor. It's the
59
    interface to the METS/PAGE XML and delegates download and upload to the
60
    :py:class:`ocrd.resolver.Resolver`.
61
62
    Args:
63
64
        directory (string) : Filesystem folder to work in
65
        mets (:py:class:`ocrd_models.ocrd_mets.OcrdMets`) : `OcrdMets` representing this workspace.
66
            Loaded from `'mets.xml'` if `None`.
67
        mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url.
68
        overwrite_mode (boolean) : Whether to force add operations on this workspace globally
69
        baseurl (string) : Base URL to prefix to relative URL.
70
    """
71
72
    def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, baseurl=None):
73
        self.resolver = resolver
74
        self.directory = directory
75
        self.mets_target = str(Path(directory, mets_basename))
76
        self.overwrite_mode = False
77
        if mets is None:
78
            mets = OcrdMets(filename=self.mets_target)
79
        self.mets = mets
80
        self.automatic_backup = automatic_backup
81
        self.baseurl = baseurl
82
        #  print(mets.to_xml(xmllint=True).decode('utf-8'))
83
84
    def __str__(self):
85
        return 'Workspace[directory=%s, baseurl=%s, file_groups=%s, files=%s]' % (
86
            self.directory,
87
            self.baseurl,
88
            self.mets.file_groups,
89
            [str(f) for f in self.mets.find_all_files()],
90
        )
91
92
    def reload_mets(self):
        """
        Replace the in-memory METS with a fresh one read from ``self.mets_target``.
        """
        fresh_mets = OcrdMets(filename=self.mets_target)
        self.mets = fresh_mets
97
98
    @deprecated_alias(pageId="page_id")
    @deprecated_alias(ID="file_id")
    @deprecated_alias(fileGrp="file_grp")
    @deprecated_alias(fileGrp_mapping="filegrp_mapping")
    def merge(self, other_workspace, copy_files=True, **kwargs):
        """
        Merge the METS of ``other_workspace`` into the METS of this workspace.

        The remaining `kwargs` are forwarded, see
        :py:meth:`ocrd_models.ocrd_mets.OcrdMets.merge`.

        Keyword Args:
            copy_files (boolean): Whether to copy files from `other_workspace` to this one.
                If false, local file URLs are re-pointed relative to this
                workspace instead, which requires the other workspace
                directory to be located below this one.
        """
        def after_add_cb(f):
            """Post-process one merged OcrdFile: copy its file or rewrite its URL."""
            if copy_files:
                source = Path(other_workspace.directory, f.url)
                target = Path(self.directory, f.url)
                if not source.exists():
                    # nothing on disk to copy
                    return
                if target.exists():
                    raise Exception("Copying %s to %s would overwrite the latter" % (source, target))
                if not target.parent.is_dir():
                    makedirs(str(target.parent))
                with open(str(source), 'rb') as fstream_in, open(str(target), 'wb') as fstream_out:
                    copyfileobj(fstream_in, fstream_out)
                return
            other_root = Path(other_workspace.directory).resolve()
            own_root = Path(self.directory).resolve()
            # raises ValueError if the other workspace is not below this one
            dstprefix = other_root.relative_to(own_root)
            if is_local_filename(f.url):
                f.url = str(Path(dstprefix, f.url))

        # OcrdMets.merge is called with the METS-style keyword names
        renamed_keywords = (
            ('page_id', 'pageId'),
            ('file_id', 'ID'),
            ('file_grp', 'fileGrp'),
            ('filegrp_mapping', 'fileGrp_mapping'),
        )
        for snake_name, mets_name in renamed_keywords:
            if snake_name in kwargs:
                kwargs[mets_name] = kwargs.pop(snake_name)

        self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs)
139
140
141
    @deprecated(version='1.0.0', reason="Use workspace.download_file")
    def download_url(self, url, **kwargs):
        """
        Fetch a URL into the workspace.

        Args:
            url (string): URL to download to directory
            **kwargs : accepted but not used by this method

        Returns:
            The local filename of the downloaded file
        """
        # wrap the URL in a throw-away METS file entry so download_file can handle it
        scratch_mets = OcrdMets.empty_mets()
        placeholder = scratch_mets.add_file('DEPRECATED', ID=Path(url).name, url=url)
        return self.download_file(placeholder).local_filename
157
158
    def download_file(self, f, _recursion_count=0):
        """
        Make a :py:class:`ocrd_models.ocrd_file.OcrdFile` available in the workspace directory.

        If ``f.url`` already points to an existing path below the workspace
        directory, nothing is downloaded. Otherwise the resolver is asked to
        download it; if that raises `FileNotFoundError`, the download is
        retried once with the workspace ``baseurl`` prepended to the URL.

        Args:
            f (:py:class:`ocrd_models.ocrd_file.OcrdFile`): the file to retrieve
            _recursion_count (int): internal counter limiting the ``baseurl`` retry

        Returns:
            ``f``, with ``url`` and ``local_filename`` updated
        """
        log = getLogger('ocrd.workspace.download_file')
        log.debug('download_file %s [_recursion_count=%s]' % (f, _recursion_count))
        with pushd_popd(self.directory):
            try:
                # any failure in this check simply means "needs downloading"
                candidate = Path(f.url).resolve()
                workspace_root = str(Path(self.directory).resolve())
                already_local = candidate.exists() and candidate.relative_to(workspace_root)
                if not already_local:
                    raise Exception("Not already downloaded, moving on")
            except Exception:
                if f.ID:
                    basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, ''))
                else:
                    basename = f.basename
                try:
                    f.url = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
                except FileNotFoundError as not_found:
                    if not self.baseurl:
                        raise Exception("No baseurl defined by workspace. Cannot retrieve '%s'" % f.url)
                    if _recursion_count >= 1:
                        raise FileNotFoundError("Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url))
                    log.debug("First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s", f.url, self.baseurl, not_found)
                    f.url = '%s/%s' % (self.baseurl, f.url)
                    retried = self.download_file(f, _recursion_count + 1)
                    f.url = retried.local_filename
            f.local_filename = f.url
            return f
184
185
    def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False):
        """
        Delete a METS `file` entry and, unless ``keep_file`` is set, its file on disk.

        Arguments:
            file_id (string|:py:class:`ocrd_models.ocrd_file.OcrdFile`): `@ID` of the METS `file`
                to delete or the file itself
        Keyword Args:
            force (boolean): Continue removing even if file not found in METS
            keep_file (boolean): Whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file
                if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
                Has no effect unless ``page_recursive`` is `True`.

        Returns:
            the removed file, or `None` if a regex ``file_id`` matched nothing
            or a `FileNotFoundError` was suppressed because of ``force``
        """
        log = getLogger('ocrd.workspace.remove_file')
        log.debug('Deleting mets:file %s', file_id)
        if self.overwrite_mode and not force:
            force = True
        if isinstance(file_id, OcrdFile):
            file_id = file_id.ID
        try:
            try:
                ocrd_file = next(self.mets.find_files(ID=file_id))
            except StopIteration:
                if not file_id.startswith(REGEX_PREFIX):
                    raise FileNotFoundError("File %s not found in METS" % file_id)
                # a regex filter is allowed to match nothing
                return None
            if page_recursive and ocrd_file.mimetype == MIMETYPE_PAGE:
                # optionally restrict the image lookup to the PAGE file's own group
                group_filter = {'fileGrp': ocrd_file.fileGrp} if page_same_group else {}
                with pushd_popd(self.directory):
                    ocrd_page = parse(self.download_file(ocrd_file).local_filename, silence=True)
                    for img_url in ocrd_page.get_AllAlternativeImagePaths():
                        for img_file in self.mets.find_files(url=img_url, **group_filter):
                            self.remove_file(img_file, keep_file=keep_file, force=force)
            if not keep_file:
                with pushd_popd(self.directory):
                    local_path = ocrd_file.local_filename
                    if local_path:
                        log.info("rm %s [cwd=%s]", local_path, self.directory)
                        unlink(local_path)
                    else:
                        log.warning("File not locally available %s", ocrd_file)
                        if not force:
                            raise Exception("File not locally available %s" % ocrd_file)
            # the METS entry goes last, after the AlternativeImage recursion
            self.mets.remove_file(file_id)
            return ocrd_file
        except FileNotFoundError as e:
            if not force:
                raise e
238
239
    def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False):
        """
        Delete a METS `fileGrp`, optionally together with its files.

        Arguments:
            USE (string): `@USE` of the METS `fileGrp` to delete
        Keyword Args:
            recursive (boolean): Whether to recursively delete all files in the group
            force (boolean): Continue removing even if group or containing files not found in METS
            keep_files (boolean): When deleting recursively whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file
                if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
                Has no effect unless ``page_recursive`` is `True`.
        """
        if self.overwrite_mode and not force:
            force = True

        is_regex = USE.startswith(REGEX_PREFIX)
        if not (is_regex or USE in self.mets.file_groups or force):
            raise Exception("No such fileGrp: %s" % USE)

        # directories that held removed files and may now be empty
        touched_dirs = set()
        if recursive:
            for f in self.mets.find_files(fileGrp=USE):
                self.remove_file(f, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group)
                if not f.local_filename:
                    continue
                parent = path.dirname(f.local_filename)
                if parent:
                    touched_dirs.add(parent)

        self.mets.remove_file_group(USE, force=force, recursive=recursive)

        def rmdir_if_empty(dirname):
            """Remove ``dirname`` if it is a directory without any entries."""
            if Path(dirname).is_dir() and not listdir(dirname):
                Path(dirname).rmdir()

        # PLEASE NOTE: directories are only removed if they are empty; the one
        # named after the fileGrp is tried first (naming convention in OCR-D).
        with pushd_popd(self.directory):
            rmdir_if_empty(USE)
            for touched_dir in touched_dirs:
                rmdir_if_empty(touched_dir)
280
281
282
    def rename_file_group(self, old, new):
        """
        Rename a METS `fileGrp`.

        Moves the local files of the group on disk, rewrites their URLs and
        (where the new ID is still free) their IDs in the METS, updates
        references to the moved files in local PAGE-XML documents, renames
        the `fileGrp` itself and finally removes the old directory if empty.

        Arguments:
            old (string): `@USE` of the METS `fileGrp` to rename
            new (string): `@USE` of the METS `fileGrp` to rename as

        Raises:
            ValueError: if ``old`` does not exist or ``new`` already exists
        """
        # local import: the module itself only imports ``sub`` from ``re``
        from re import escape

        log = getLogger('ocrd.workspace.rename_file_group')

        if old not in self.mets.file_groups:
            raise ValueError("No such fileGrp: %s" % old)
        if new in self.mets.file_groups:
            raise ValueError("fileGrp already exists %s" % new)

        # The group names are plain strings, not regexes: escape ``old`` for
        # use in patterns, and insert ``new`` via a function so that
        # backslashes in it are not interpreted as replacement escapes.
        old_pattern = escape(old)

        with pushd_popd(self.directory):
            # create workspace dir ``new``
            log.info("mkdir %s", new)
            if not Path(new).is_dir():
                Path(new).mkdir()
            url_replacements = {}
            log.info("Moving files")
            for mets_file in self.mets.find_files(fileGrp=old, local_only=True):
                new_url = old_url = mets_file.url
                # Directory part
                new_url = sub(r'^%s/' % old_pattern, lambda _: '%s/' % new, new_url)
                # File part
                new_url = sub(r'/%s' % old_pattern, lambda _: '/%s' % new, new_url)
                url_replacements[mets_file.url] = new_url
                # move file from ``old`` to ``new``
                move(mets_file.url, new_url)
                # change the url of ``mets:file``
                mets_file.url = new_url
                # change the file ID and update structMap
                new_id = sub(r'^%s' % old_pattern, lambda _: new, mets_file.ID)
                try:
                    next(self.mets.find_files(ID=new_id))
                    log.warning("ID %s already exists, not changing ID while renaming %s -> %s", new_id, old_url, new_url)
                except StopIteration:
                    mets_file.ID = new_id
            # change file paths in PAGE-XML imageFilename and filename attributes
            for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE, local_only=True):
                log.info("Renaming file references in PAGE-XML %s", page_file)
                pcgts = page_from_file(page_file)
                changed = False
                for old_url, new_url in url_replacements.items():
                    if pcgts.get_Page().imageFilename == old_url:
                        changed = True
                        log.info("Rename pc:Page/@imageFilename: %s -> %s", old_url, new_url)
                        pcgts.get_Page().imageFilename = new_url
                for ai in pcgts.get_Page().get_AllAlternativeImages():
                    for old_url, new_url in url_replacements.items():
                        if ai.filename == old_url:
                            changed = True
                            log.info("Rename pc:Page/../AlternativeImage: %s -> %s", old_url, new_url)
                            ai.filename = new_url
                if changed:
                    log.info("PAGE-XML changed, writing %s", page_file.local_filename)
                    with open(page_file.local_filename, 'w', encoding='utf-8') as f:
                        f.write(to_xml(pcgts))
            # change the ``USE`` attribute of the fileGrp
            self.mets.rename_file_group(old, new)
            # Remove the old dir (only if nothing is left in it)
            log.info("rmdir %s", old)
            if Path(old).is_dir() and not listdir(old):
                Path(old).rmdir()
349
350
    @deprecated_alias(pageId="page_id")
    @deprecated_alias(ID="file_id")
    def add_file(self, file_grp, content=None, **kwargs):
        """
        Register a file in the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace,
        optionally writing its content to disk.

        Arguments:
            file_grp (string): `@USE` of the METS `fileGrp` to add to
        Keyword Args:
            content (string|bytes): optional content to write to the file
                in the filesystem; requires ``local_filename`` to be set
            **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.add_file`;
                ``page_id`` must always be passed, even if it is `None`
        Returns:
            a new :py:class:`ocrd_models.ocrd_file.OcrdFile`
        Raises:
            ValueError: if no ``page_id`` keyword was passed
            Exception: if ``content`` was passed without ``local_filename``
        """
        log = getLogger('ocrd.workspace.add_file')
        has_content = content is not None
        log.debug(
            'outputfile file_grp=%s local_filename=%s content=%s',
            file_grp,
            kwargs.get('local_filename'),
            has_content)
        if 'page_id' not in kwargs:
            raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.")
        if has_content and not kwargs.get('local_filename'):
            raise Exception("'content' was set but no 'local_filename'")
        if self.overwrite_mode:
            kwargs['force'] = True

        with pushd_popd(self.directory):
            local_filename = kwargs.get('local_filename')
            if local_filename:
                # create the folder part of the local filename, if it has one
                parent_dir = local_filename.rsplit('/', 1)[0]
                if parent_dir != local_filename and not Path(parent_dir).is_dir():
                    makedirs(parent_dir)
                # the URL defaults to the local filename
                kwargs.setdefault('url', local_filename)

            # OcrdMets.add_file is called with the METS-style keyword names
            kwargs["pageId"] = kwargs.pop("page_id")
            if "file_id" in kwargs:
                kwargs["ID"] = kwargs.pop("file_id")

            ret = self.mets.add_file(file_grp, **kwargs)

            if has_content:
                with open(local_filename, 'wb') as f:
                    if isinstance(content, str):
                        content = bytes(content, 'utf-8')
                    f.write(content)

        return ret
401
402
    def save_mets(self):
        """
        Serialize the current METS and write it to ``self.mets_target``.

        If ``automatic_backup`` is enabled, a backup is added through the
        :py:class:`WorkspaceBackupManager` before writing.
        """
        log = getLogger('ocrd.workspace.save_mets')
        log.info("Saving mets '%s'", self.mets_target)
        if self.automatic_backup:
            backup_manager = WorkspaceBackupManager(self)
            backup_manager.add()
        with atomic_write(self.mets_target) as f:
            serialized = self.mets.to_xml(xmllint=True)
            f.write(serialized.decode('utf-8'))
412
413
    def resolve_image_exif(self, image_url):
        """
        Look up the EXIF metadata of an image URL as :py:class:`ocrd_models.ocrd_exif.OcrdExif`

        If the METS has a file with that URL, it is downloaded into the
        workspace and inspected; otherwise the URL is fetched into a
        temporary file for inspection.

        Args:
            image_url (string) : `@href` (path or URL) of the METS `file` to inspect

        Returns:
            :py:class:`ocrd_models.ocrd_exif.OcrdExif`
        """
        if not image_url:
            # an empty URL filter would match arbitrary files
            raise Exception("Cannot resolve empty image path")
        try:
            mets_file = next(self.mets.find_files(url=image_url))
            local_path = self.download_file(mets_file).local_filename
            return exif_from_filename(local_path)
        except StopIteration:
            with download_temporary_file(image_url) as tmp:
                return exif_from_filename(tmp.name)
434
435
    @deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment")
    def resolve_image_as_pil(self, image_url, coords=None):
        """
        Retrieve the image behind a URL as a `PIL.Image`.

        Deprecated public wrapper which only delegates to
        :py:meth:`_resolve_image_as_pil`.

        Arguments:
            image_url (string): `@href` (path or URL) of the METS `file` to retrieve
        Keyword Args:
            coords (list) : Coordinates of the bounding box to cut from the image

        Returns:
            Full or cropped `PIL.Image`

        """
        pil_image = self._resolve_image_as_pil(image_url, coords)
        return pil_image
450
451
    def _resolve_image_as_pil(self, image_url, coords=None):
        """
        Resolve an image URL to a `PIL.Image`.

        If the METS has a file with that URL, it is downloaded into the
        workspace and opened; otherwise the URL is fetched into a temporary
        file. Images in one of the `I` or `F` modes are reduced to 8 bit.

        Arguments:
            image_url (string): `@href` (path or URL) of the METS `file` to retrieve
        Keyword Args:
            coords (list) : Coordinates of the polygon whose bounding box
                is to be cut from the image

        Returns:
            Full `PIL.Image` if ``coords`` is `None`, otherwise the cropped image

        Raises:
            Exception: if ``image_url`` is empty
        """
        if not image_url:
            # avoid "finding" just any file
            raise Exception("Cannot resolve empty image path")
        log = getLogger('ocrd.workspace._resolve_image_as_pil')
        with pushd_popd(self.directory):
            try:
                f = next(self.mets.find_files(url=image_url))
                pil_image = Image.open(self.download_file(f).local_filename)
            except StopIteration:
                with download_temporary_file(image_url) as f:
                    pil_image = Image.open(f.name)
                    # read the pixel data while the temporary file still
                    # exists, instead of relying on the deleted file's
                    # descriptor staying readable afterwards
                    pil_image.load()
            pil_image.load() # alloc and give up the FD

        # Pillow does not properly support higher color depths
        # (e.g. 16-bit or 32-bit or floating point grayscale),
        # clipping its dynamic range to the lower 8-bit in
        # many operations (including paste, putalpha, ImageStat...),
        # even including conversion.
        # Cf. Pillow#3011 Pillow#3159 Pillow#3838 (still open in 8.0)
        # So to be on the safe side, we must re-quantize these
        # to 8-bit via numpy (conversion to/from which fortunately
        # seems to work reliably):
        if pil_image.mode.startswith(('I', 'F')):
            arr_image = np.array(pil_image)
            if arr_image.dtype.kind == 'i':
                # signed integer is *not* trustworthy in this context
                # (usually a mistake in the array interface)
                log.debug('Casting image "%s" from signed to unsigned', image_url)
                # reinterpret the same memory as unsigned (no value conversion);
                # a view is used instead of assigning to ``.dtype`` in place
                arr_image = arr_image.view(np.dtype('u' + arr_image.dtype.name))
            if arr_image.dtype.kind == 'u':
                # integer needs to be scaled linearly to 8 bit
                # of course, an image might actually have some lower range
                # (e.g. 10-bit in I;16 or 20-bit in I or 4-bit in L),
                # but that would be guessing anyway, so here don't
                # make assumptions on _scale_, just reduce _precision_
                log.debug('Reducing image "%s" from depth %d bit to 8 bit',
                          image_url, arr_image.dtype.itemsize * 8)
                # keep only the most significant byte of every sample
                arr_image = arr_image >> 8 * (arr_image.dtype.itemsize-1)
                arr_image = arr_image.astype(np.uint8)
            elif arr_image.dtype.kind == 'f':
                # float needs to be scaled from [0,1.0] to [0,255]
                log.debug('Reducing image "%s" from floating point to 8 bit',
                          image_url)
                arr_image *= 255
                arr_image = arr_image.astype(np.uint8)
            pil_image = Image.fromarray(arr_image)

        if coords is None:
            return pil_image

        # FIXME: remove or replace this by (image_from_polygon+) crop_image ...
        log.debug("Converting PIL to OpenCV: %s", image_url)
        color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else  COLOR_RGB2BGR
        pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
        cv2_image = cvtColor(pil_as_np_array, color_conversion)

        poly = np.array(coords, np.int32)
        log.debug("Cutting region %s from %s", coords, image_url)
        # crop to the bounding box of the polygon (rows are y, columns are x)
        region_cut = cv2_image[
            np.min(poly[:, 1]):np.max(poly[:, 1]),
            np.min(poly[:, 0]):np.max(poly[:, 0])
        ]
        # NOTE(review): the array was converted to BGR channel order above, but
        # Image.fromarray presumably interprets it as RGB - confirm whether intended
        return Image.fromarray(region_cut)
516
517
    def image_from_page(self, page, page_id,
518
                        fill='background', transparency=False,
519
                        feature_selector='', feature_filter='', filename=''):
520
        """Extract an image for a PAGE-XML page from the workspace.
521
522
        Args:
523
            page (:py:class:`ocrd_models.ocrd_page.PageType`): a PAGE `PageType` object
524
            page_id (string): its `@ID` in the METS physical `structMap`
525
        Keyword Args:
526
            fill (string): a `PIL` color specifier, or `background` or `none`
527
            transparency (boolean): whether to add an alpha channel for masking
528
            feature_selector (string): a comma-separated list of `@comments` classes
529
            feature_filter (string): a comma-separated list of `@comments` classes
530
            filename (string): which file path to use
531
532
        Extract a `PIL.Image` from ``page``, either from its `AlternativeImage`
533
        (if it exists), or from its `@imageFilename` (otherwise). Also crop it,
534
        if a `Border` exists, and rotate it, if any `@orientation` angle is
535
        annotated.
536
537
        If ``filename`` is given, then among `@imageFilename` and the available
538
        `AlternativeImage/@filename` images, pick that one, or raise an error.
539
540
        If ``feature_selector`` and/or ``feature_filter`` is given, then
541
        among the `@imageFilename` image and the available AlternativeImages,
542
        select/filter the richest one which contains all of the selected,
543
        but none of the filtered features (i.e. `@comments` classes), or
544
        raise an error.
545
546
        (Required and produced features need not be in the same order, so
547
        ``feature_selector`` is merely a mask specifying Boolean AND, and
548
        ``feature_filter`` is merely a mask specifying Boolean OR.)
549
550
        If the chosen image does not have the feature `"cropped"` yet, but
551
        a `Border` exists, and unless `"cropped"` is being filtered, then crop it.
552
        Likewise, if the chosen image does not have the feature `"deskewed"` yet,
553
        but an `@orientation` angle is annotated, and unless `"deskewed"` is being
554
        filtered, then rotate it. (However, if `@orientation` is above the
555
        [-45°,45°] interval, then apply as much transposition as possible first,
556
        unless `"rotated-90"` / `"rotated-180"` / `"rotated-270"` is being filtered.)
557
558
        Cropping uses a polygon mask (not just the bounding box rectangle).
559
        Areas outside the polygon will be filled according to ``fill``:
560
561
        \b
562
        - if `"background"` (the default),
563
          then fill with the median color of the image;
564
        - else if `"none"`, then avoid masking polygons where possible
565
          (i.e. when cropping) or revert to the default (i.e. when rotating)
566
        - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.
567
568
        Moreover, if ``transparency`` is true, and unless the image already
569
        has an alpha channel, then add an alpha channel which is fully opaque
570
        before cropping and rotating. (Thus, unexposed/masked areas will be
571
        transparent afterwards for consumers that can interpret alpha channels).
572
573
        Returns:
574
            a tuple of
575
             * the extracted `PIL.Image`,
576
             * a `dict` with information about the extracted image:
577
578
               - `"transform"`: a `Numpy` array with an affine transform which
579
                   converts from absolute coordinates to those relative to the image,
580
                   i.e. after cropping to the page's border / bounding box (if any)
581
                   and deskewing with the page's orientation angle (if any)
582
               - `"angle"`: the rotation/reflection angle applied to the image so far,
583
               - `"features"`: the `AlternativeImage` `@comments` for the image, i.e.
584
                 names of all applied operations that lead up to this result,
585
             * an :py:class:`ocrd_models.ocrd_exif.OcrdExif` instance associated with
586
               the original image.
587
588
        (The first two can be used to annotate a new `AlternativeImage`,
589
         or be passed down with :py:meth:`image_from_segment`.)
590
591
        Examples:
592
593
         * get a raw (colored) but already deskewed and cropped image::
594
595
                page_image, page_coords, page_image_info = workspace.image_from_page(
596
                    page, page_id,
597
                    feature_selector='deskewed,cropped',
598
                    feature_filter='binarized,grayscale_normalized')
599
        """
600
        log = getLogger('ocrd.workspace.image_from_page')
601
        page_image_info = self.resolve_image_exif(page.imageFilename)
602
        page_image = self._resolve_image_as_pil(page.imageFilename)
603
        page_coords = dict()
604
        # use identity as initial affine coordinate transform:
605
        page_coords['transform'] = np.eye(3)
606
        # interim bbox (updated with each change to the transform):
607
        page_bbox = [0, 0, page_image.width, page_image.height]
608
        page_xywh = {'x': 0, 'y': 0,
609
                     'w': page_image.width, 'h': page_image.height}
610
611
        border = page.get_Border()
612
        # page angle: PAGE @orientation is defined clockwise,
613
        # whereas PIL/ndimage rotation is in mathematical direction:
614
        page_coords['angle'] = -(page.get_orientation() or 0)
615
        # map angle from (-180,180] to [0,360], and partition into multiples of 90;
616
        # but avoid unnecessary large remainders, i.e. split symmetrically:
617
        orientation = (page_coords['angle'] + 45) % 360
618
        orientation = orientation - (orientation % 90)
619
        skew = (page_coords['angle'] % 360) - orientation
620
        skew = 180 - (180 - skew) % 360 # map to [-45,45]
621
        page_coords['angle'] = 0 # nothing applied yet (depends on filters)
622
        log.debug("page '%s' has %s orientation=%d skew=%.2f",
623
                  page_id, "border," if border else "", orientation, skew)
624
625
        # initialize AlternativeImage@comments classes as empty:
626
        page_coords['features'] = ''
627
        best_image = None
628
        alternative_images = page.get_AlternativeImage()
629 View Code Duplication
        if alternative_images:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
630
            # (e.g. from page-level cropping, binarization, deskewing or despeckling)
631
            best_features = set()
632
            auto_features = {'cropped', 'deskewed', 'rotated-90', 'rotated-180', 'rotated-270'}
633
            # search to the end, because by convention we always append,
634
            # and among multiple satisfactory images we want the most recent,
635
            # but also ensure that we get the richest feature set, i.e. most
636
            # of those features that we cannot reproduce automatically below
637
            for alternative_image in alternative_images:
638
                if filename and filename != alternative_image.filename:
639
                    continue
640
                features = alternative_image.get_comments()
641
                if not features:
642
                    log.warning("AlternativeImage %d for page '%s' does not have any feature attributes",
643
                                alternative_images.index(alternative_image) + 1, page_id)
644
                    features = ''
645
                featureset = set(features.split(','))
646
                if (all(feature in featureset
647
                        for feature in feature_selector.split(',') if feature) and
648
                    not any(feature in featureset
649
                            for feature in feature_filter.split(',') if feature) and
650
                    len(featureset.difference(auto_features)) >= \
651
                    len(best_features.difference(auto_features))):
652
                    best_features = featureset
653
                    best_image = alternative_image
654
            if best_image:
655
                log.debug("Using AlternativeImage %d %s for page '%s'",
656
                          alternative_images.index(best_image) + 1,
657
                          best_features, page_id)
658
                page_image = self._resolve_image_as_pil(best_image.get_filename())
659
                page_coords['features'] = best_image.get_comments() # including duplicates
660
661
        # adjust the coord transformation to the steps applied on the image,
662
        # and apply steps on the existing image in case it is missing there,
663
        # but traverse all steps (crop/reflect/rotate) in a particular order:
664
        # - existing image features take priority (in the order annotated),
665
        # - next is cropping (if necessary but not already applied),
666
        # - next is reflection (if necessary but not already applied),
667
        # - next is rotation (if necessary but not already applied).
668
        # This helps deal with arbitrary workflows (e.g. crop then deskew,
669
        # or deskew then crop), regardless of where images are generated.
670
        alternative_image_features = page_coords['features'].split(',')
671
        for duplicate_feature in set([feature for feature in alternative_image_features
672
                                      # features relevant in reconstructing coordinates:
673
                                      if (feature in ['cropped', 'deskewed', 'rotated-90',
674
                                                      'rotated-180', 'rotated-270'] and
675
                                          alternative_image_features.count(feature) > 1)]):
676
            log.error("Duplicate feature %s in AlternativeImage for page '%s'",
677
                      duplicate_feature, page_id)
678
        for i, feature in enumerate(alternative_image_features +
679
                                    (['cropped']
680
                                     if (border and
681
                                         not 'cropped' in alternative_image_features and
682
                                         not 'cropped' in feature_filter.split(','))
683
                                     else []) +
684
                                    (['rotated-%d' % orientation]
685
                                     if (orientation and
686
                                         not 'rotated-%d' % orientation in alternative_image_features and
687
                                         not 'rotated-%d' % orientation in feature_filter.split(','))
688
                                     else []) +
689
                                    (['deskewed']
690
                                     if (skew and
691
                                         not 'deskewed' in alternative_image_features and
692
                                         not 'deskewed' in feature_filter.split(','))
693
                                     else []) +
694
                                    # not a feature to be added, but merely as a fallback position
695
                                    # to always enter loop at i == len(alternative_image_features)
696
                                    ['_check']):
697
            # image geometry vs feature consistency can only be checked
698
            # after all features on the existing AlternativeImage have
699
            # been adjusted for in the transform, and when there is a mismatch,
700
            # additional steps applied here would only repeat the respective
701
            # error message; so we only check once at the boundary between
702
            # existing and new features
703
            # FIXME we should check/enforce consistency when _adding_ AlternativeImage
704
            if (i == len(alternative_image_features) and
705
                not (page_xywh['w'] - 2 < page_image.width < page_xywh['w'] + 2 and
706
                     page_xywh['h'] - 2 < page_image.height < page_xywh['h'] + 2)):
707
                log.error('page "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
708
                          page_id, page_coords['features'],
709
                          page_image.width, page_image.height,
710
                          page_xywh['w'], page_xywh['h'])
711
            name = "%s for page '%s'" % ("AlternativeImage" if best_image
712
                                         else "original image", page_id)
713
            # adjust transform to feature, and ensure feature is applied to image
714
            if feature == 'cropped':
715
                page_image, page_coords, page_xywh = _crop(
716
                    log, name, border, page_image, page_coords,
717
                    fill=fill, transparency=transparency)
718
            elif feature == 'rotated-%d' % orientation:
719
                page_image, page_coords, page_xywh = _reflect(
720
                    log, name, orientation, page_image, page_coords, page_xywh)
721
            elif feature == 'deskewed':
722
                page_image, page_coords, page_xywh = _rotate(
723
                    log, name, skew, border, page_image, page_coords, page_xywh,
724
                    fill=fill, transparency=transparency)
725
726
        # verify constraints again:
727
        if filename and not getattr(page_image, 'filename', '').endswith(filename):
728
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
729
                            'filename="%s" in page "%s"' % (
730
                                filename, page_id))
731
        if not all(feature in page_coords['features']
732
                   for feature in feature_selector.split(',') if feature):
733
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
734
                            'selector="%s" in page "%s"' % (
735
                                feature_selector, page_id))
736
        if any(feature in page_coords['features']
737
               for feature in feature_filter.split(',') if feature):
738
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
739
                            'filter="%s" in page "%s"' % (
740
                                feature_filter, page_id))
741
        page_image.format = 'PNG' # workaround for tesserocr#194
742
        return page_image, page_coords, page_image_info
743
744
    def image_from_segment(self, segment, parent_image, parent_coords,
                           fill='background', transparency=False,
                           feature_selector='', feature_filter='', filename=''):
        """Extract an image for a PAGE-XML hierarchy segment from its parent's image.

        Args:
            segment (object): a PAGE segment object \
                (i.e. :py:class:`~ocrd_models.ocrd_page.TextRegionType` \
                or :py:class:`~ocrd_models.ocrd_page.TextLineType` \
                or :py:class:`~ocrd_models.ocrd_page.WordType` \
                or :py:class:`~ocrd_models.ocrd_page.GlyphType`)
            parent_image (`PIL.Image`): image of the `segment`'s parent
            parent_coords (dict): a `dict` with information about `parent_image`:

               - `"transform"`: a `Numpy` array with an affine transform which
                 converts from absolute coordinates to those relative to the image,
                 i.e. after applying all operations (starting with the original image)
               - `"angle"`: the rotation/reflection angle applied to the image so far,
               - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e.
                 names of all operations that lead up to this result.
        Keyword Args:
            fill (string): a `PIL` color specifier, or `background` or `none`
            transparency (boolean): whether to add an alpha channel for masking
            feature_selector (string): a comma-separated list of ``@comments`` classes
            feature_filter (string): a comma-separated list of ``@comments`` classes
            filename (string): the ``AlternativeImage/@filename`` to pick, if any

        Extract a `PIL.Image` from `segment`, either from ``AlternativeImage``
        (if it exists), or producing a new image via cropping from `parent_image`
        (otherwise). Pass in `parent_image` and `parent_coords` from the result
        of the next higher-level of this function or from :py:meth:`image_from_page`.

        If ``filename`` is given, then among the available `AlternativeImage/@filename`
        images, pick that one, or raise an error.

        If ``feature_selector`` and/or ``feature_filter`` is given, then
        among the cropped `parent_image` and the available AlternativeImages,
        select/filter the richest one which contains all of the selected,
        but none of the filtered features (i.e. ``@comments`` classes), or
        raise an error.

        (Required and produced features need not be in the same order, so
        `feature_selector` is merely a mask specifying Boolean AND, and
        `feature_filter` is merely a mask specifying Boolean OR.)

        Cropping uses a polygon mask (not just the bounding box rectangle).
        Areas outside the polygon will be filled according to `fill`:

        \b
        - if `"background"` (the default),
          then fill with the median color of the image;
        - else if `"none"`, then avoid masking polygons where possible
          (i.e. when cropping) or revert to the default (i.e. when rotating)
        - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.

        Moreover, if `transparency` is true, and unless the image already
        has an alpha channel, then add an alpha channel which is fully opaque
        before cropping and rotating. (Thus, unexposed/masked areas will be
        transparent afterwards for consumers that can interpret alpha channels).

        When cropping, compensate any ``@orientation`` angle annotated for the
        parent (from parent-level deskewing) by rotating the segment coordinates
        in an inverse transformation (i.e. translation to center, then passive
        rotation, and translation back).

        Regardless, if any ``@orientation`` angle is annotated for the segment
        (from segment-level deskewing), and the chosen image does not have
        the feature `"deskewed"` yet, and unless `"deskewed"` is being filtered,
        then rotate it - compensating for any previous `"angle"`. (However,
        if ``@orientation`` is above the [-45°,45°] interval, then apply as much
        transposition as possible first, unless `"rotated-90"` / `"rotated-180"` /
        `"rotated-270"` is being filtered.)

        Returns:
            a tuple of
             * the extracted `PIL.Image`,
             * a `dict` with information about the extracted image:

               - `"transform"`: a `Numpy` array with an affine transform which
                   converts from absolute coordinates to those relative to the image,
                   i.e. after applying all parent operations, and then cropping to
                   the segment's bounding box, and deskewing with the segment's
                   orientation angle (if any)
               - `"angle"`: the rotation/reflection angle applied to the image so far,
               - `"features"`: the ``AlternativeImage/@comments`` for the image, i.e.
                 names of all applied operations that lead up to this result.

        (These can be used to create a new ``AlternativeImage``, or passed down
         for :py:meth:`image_from_segment` calls on lower hierarchy levels.)

        Raises:
            Exception: if the resulting image does not satisfy ``filename``,
                ``feature_selector`` or ``feature_filter``.

        Examples:

         * get a raw (colored) but already deskewed and cropped image::

                region_image, region_coords = workspace.image_from_segment(region,
                    page_image, page_coords,
                    feature_selector='deskewed,cropped',
                    feature_filter='binarized,grayscale_normalized')
        """
        log = getLogger('ocrd.workspace.image_from_segment')
        # split the feature masks once (empty entries carry no constraint):
        selected_features = [feature for feature in feature_selector.split(',') if feature]
        filtered_features = [feature for feature in feature_filter.split(',') if feature]
        # note: We should mask overlapping neighbouring segments here,
        # but finding the right clipping rules can be difficult if operating
        # on the raw (non-binary) image data alone: for each intersection, it
        # must be decided which one of either segment or neighbour to assign,
        # e.g. an ImageRegion which properly contains our TextRegion should be
        # completely ignored, but an ImageRegion which is properly contained
        # in our TextRegion should be completely masked, while partial overlap
        # may be more difficult to decide. On the other hand, on the binary image,
        # we can use connected component analysis to mask foreground areas which
        # originate in the neighbouring regions. But that would introduce either
        # the assumption that the input has already been binarized, or a dependency
        # on some ad-hoc binarization method. Thus, it is preferable to use
        # a dedicated processor for this (which produces clipped AlternativeImage
        # or reduced polygon coordinates).
        segment_image, segment_coords, segment_xywh = _crop(
            log, "parent image for segment '%s'" % segment.id,
            segment, parent_image, parent_coords,
            fill=fill, transparency=transparency)

        # Semantics of missing @orientation at region level could be either
        # - inherited from page level: same as line or word level (no @orientation),
        # - zero (unrotate page angle): different from line or word level (because
        #   otherwise deskewing would never have an effect on lines and words)
        # The PAGE specification is silent here (but does generally not concern itself
        # much with AlternativeImage coordinate consistency).
        # Since our (generateDS-backed) ocrd_page supports the zero/none distinction,
        # we choose the former (i.e. None is inheritance).
        if 'orientation' in segment.__dict__ and segment.get_orientation() is not None:
            # region angle: PAGE @orientation is defined clockwise,
            # whereas PIL/ndimage rotation is in mathematical direction:
            angle = -segment.get_orientation()
            # @orientation is always absolute; if higher levels
            # have already rotated, then we must compensate:
            angle -= parent_coords['angle']
            # map angle from (-180,180] to [0,360], and partition into multiples of 90;
            # but avoid unnecessary large remainders, i.e. split symmetrically:
            orientation = (angle + 45) % 360
            orientation = orientation - (orientation % 90)
            skew = (angle % 360) - orientation
            skew = 180 - (180 - skew) % 360 # map to [-45,45]
            log.debug("segment '%s' has orientation=%d skew=%.2f",
                      segment.id, orientation, skew)
        else:
            orientation = 0
            skew = 0
        segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)

        # initialize AlternativeImage@comments classes from parent, except
        # for those operations that can apply on multiple hierarchy levels:
        segment_coords['features'] = ','.join(
            [feature for feature in parent_coords['features'].split(',')
             if feature in ['binarized', 'grayscale_normalized',
                            'despeckled', 'dewarped']])

        best_image = None
        alternative_images = segment.get_AlternativeImage()
        if alternative_images:
            # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
            best_features = set()
            auto_features = {'cropped', 'deskewed', 'rotated-90', 'rotated-180', 'rotated-270'}
            # search to the end, because by convention we always append,
            # and among multiple satisfactory images we want the most recent,
            # but also ensure that we get the richest feature set, i.e. most
            # of those features that we cannot reproduce automatically below
            for alternative_image in alternative_images:
                if filename and filename != alternative_image.filename:
                    continue
                features = alternative_image.get_comments()
                if not features:
                    log.warning("AlternativeImage %d for segment '%s' does not have any feature attributes",
                                alternative_images.index(alternative_image) + 1, segment.id)
                    features = ''
                featureset = set(features.split(','))
                if (all(feature in featureset for feature in selected_features) and
                    not any(feature in featureset for feature in filtered_features) and
                    len(featureset.difference(auto_features)) >=
                    len(best_features.difference(auto_features))):
                    best_features = featureset
                    best_image = alternative_image
            if best_image:
                log.debug("Using AlternativeImage %d %s for segment '%s'",
                          alternative_images.index(best_image) + 1,
                          best_features, segment.id)
                # load the image that was actually selected (not the leftover
                # loop variable, which is merely the last candidate inspected):
                segment_image = self._resolve_image_as_pil(best_image.get_filename())
                # including duplicates; an image without @comments was already
                # warned about above and is treated as having no features:
                segment_coords['features'] = best_image.get_comments() or ''

        alternative_image_features = segment_coords['features'].split(',')
        for duplicate_feature in set([feature for feature in alternative_image_features
                                      # features relevant in reconstructing coordinates:
                                      if (feature in ['deskewed', 'rotated-90',
                                                      'rotated-180', 'rotated-270'] and
                                          alternative_image_features.count(feature) > 1)]):
            log.error("Duplicate feature %s in AlternativeImage for segment '%s'",
                      duplicate_feature, segment.id)

        # traverse all steps (reflect/rotate) in a particular order:
        # - existing image features take priority (in the order annotated),
        # - next is reflection (if necessary but not already applied),
        # - next is rotation (if necessary but not already applied).
        rotation_feature = 'rotated-%d' % orientation
        steps = list(alternative_image_features)
        if (orientation and
            rotation_feature not in alternative_image_features and
            rotation_feature not in filtered_features):
            steps.append(rotation_feature)
        if (skew and
            'deskewed' not in alternative_image_features and
            'deskewed' not in filtered_features):
            steps.append('deskewed')
        # not a feature to be added, but merely as a fallback position
        # to always enter loop at i == len(alternative_image_features)
        steps.append('_check')
        name = "%s for segment '%s'" % ("AlternativeImage" if best_image
                                        else "parent image", segment.id)
        for i, feature in enumerate(steps):
            # image geometry vs feature consistency can only be checked
            # after all features on the existing AlternativeImage have
            # been adjusted for in the transform, and when there is a mismatch,
            # additional steps applied here would only repeat the respective
            # error message; so we only check once at the boundary between
            # existing and new features
            # FIXME we should enforce consistency here (i.e. split into transposition
            #       and minimal rotation, rotation always reshapes, rescaling never happens)
            # FIXME: inconsistency currently unavoidable with line-level dewarping (which increases height)
            if (i == len(alternative_image_features) and
                not (segment_xywh['w'] - 2 < segment_image.width < segment_xywh['w'] + 2 and
                     segment_xywh['h'] - 2 < segment_image.height < segment_xywh['h'] + 2)):
                log.error('segment "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                          segment.id, segment_coords['features'],
                          segment_image.width, segment_image.height,
                          segment_xywh['w'], segment_xywh['h'])
            # adjust transform to feature, and ensure feature is applied to image
            if feature == rotation_feature:
                segment_image, segment_coords, segment_xywh = _reflect(
                    log, name, orientation, segment_image, segment_coords, segment_xywh)
            elif feature == 'deskewed':
                segment_image, segment_coords, segment_xywh = _rotate(
                    log, name, skew, segment, segment_image, segment_coords, segment_xywh,
                    fill=fill, transparency=transparency)

        # verify constraints again:
        if filename and not getattr(segment_image, 'filename', '').endswith(filename):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'filename="%s" in segment "%s"' % (
                                filename, segment.id))
        if not all(feature in segment_coords['features']
                   for feature in selected_features):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'selector="%s" in segment "%s"' % (
                                feature_selector, segment.id))
        if any(feature in segment_coords['features']
               for feature in filtered_features):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'filter="%s" in segment "%s"' % (
                                feature_filter, segment.id))
        segment_image.format = 'PNG' # workaround for tesserocr#194
        return segment_image, segment_coords
997
998
    # pylint: disable=redefined-builtin
999
    def save_image_file(self, image,
                        file_id,
                        file_grp,
                        page_id=None,
                        mimetype='image/png',
                        force=False):
        """Serialize an image and register it as a new ``mets:file``.

        Args:
            image (PIL.Image): derived image to save
            file_id (string): `@ID` of the METS `file` to use
            file_grp (string): `@USE` of the METS `fileGrp` to use
        Keyword Args:
            page_id (string): `@ID` in the METS physical `structMap` to use
            mimetype (string): MIME type of the image format to serialize as
            force (boolean): whether to replace any existing `file` with that `@ID`
                (implied when the workspace's ``overwrite_mode`` is set)

        The image is encoded in memory in the format matching ``mimetype``
        and handed to :py:meth:`add_file` as content, under a file name
        made of ``file_id`` plus the extension matching ``mimetype``,
        inside the directory ``file_grp``.

        Returns:
            The path ``<file_grp>/<file_id><extension>`` that was passed
            to :py:meth:`add_file` as local file name.
        """
        log = getLogger('ocrd.workspace.save_image_file')
        # the workspace-wide overwrite mode implies forcing
        if not force:
            if self.overwrite_mode:
                force = True
        # encode into memory first, so add_file receives the raw bytes
        encoded = io.BytesIO()
        image.save(encoded, format=MIME_TO_PIL[mimetype])
        target_name = '%s%s' % (file_id, MIME_TO_EXT[mimetype])
        file_path = str(Path(file_grp) / target_name)
        added = self.add_file(file_grp,
                              file_id=file_id,
                              page_id=page_id,
                              local_filename=file_path,
                              mimetype=mimetype,
                              content=encoded.getvalue(),
                              force=force)
        log.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, file_grp, added.local_filename)
        return file_path
1039
1040
    def find_files(self, *args, **kwargs):
        """
        Search ``mets:file`` entries in wrapped METS document and yield results.

        Delegator to :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`

        The snake_case aliases ``page_id``, ``file_id`` and ``file_grp`` are
        translated to the keywords ``pageId``, ``ID`` and ``fileGrp``,
        respectively, before delegating.

        Keyword Args:
            **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files`
        Returns:
            Generator which yields :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations
        """
        log = getLogger('ocrd.workspace.find_files')
        # let the logging framework format the message (only if DEBUG is enabled)
        log.debug('find files in mets. kwargs=%s', kwargs)
        # translate on a copy, so the mapping handed to the logger stays as passed in
        query = dict(kwargs)
        for alias, mets_keyword in (('page_id', 'pageId'),
                                    ('file_id', 'ID'),
                                    ('file_grp', 'fileGrp')):
            if alias in query:
                query[mets_keyword] = query.pop(alias)
        # NOTE(review): if the delegate returns a lazy generator, the working
        # directory is restored before the caller starts iterating - confirm
        # that the search does not depend on the current directory.
        with pushd_popd(self.directory):
            return self.mets.find_files(*args, **query)
1061
1062
def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs):
    """Crop ``parent_image`` to ``segment`` and shift the coordinate transform accordingly.

    Args:
        log: logger used for the progress messages
        name (string): description of the image, used in log messages only
        segment: PAGE segment (or ``BorderType``) delineating the area to crop to
        parent_image: the image to crop from
        parent_coords (dict): coordinate information about ``parent_image``
            (keys ``transform`` and ``features`` are read here); not modified
        op (string): feature name of the operation (``recropped`` only
            changes the log message and adds no feature)
        **kwargs: passed through to ``image_from_polygon``

    Returns:
        a tuple of the resulting image, a (shallow) copy of ``parent_coords``
        with updated ``transform`` (and possibly ``features``), and the
        bounding box of the segment within the parent image as xywh dict.
    """
    is_border = isinstance(segment, BorderType)
    coords = parent_coords.copy()
    # polygon outline of the segment relative to the parent image,
    # and its bounding box (size after cropping; may differ from size
    # before rotation at the parent, or after rotation/AlternativeImage):
    polygon = coordinates_of_segment(segment, parent_image, parent_coords)
    bbox = bbox_from_polygon(polygon)
    xywh = xywh_from_bbox(*bbox)
    if is_border and op in parent_coords['features']:
        # page-level image already has this operation applied
        image = parent_image
    else:
        # always crop below page level
        if op == 'recropped':
            log.info("Recropping %s", name)
        elif is_border:
            log.info("Cropping %s", name)
            coords['features'] += ',' + op
        # mask by the segment polygon, then cut down to its bounding box:
        masked = image_from_polygon(parent_image, polygon, **kwargs)
        image = crop_image(masked, box=bbox)
    # subtract offset from parent in affine coordinate transform
    # (consistent with image cropping):
    offset = np.array([-bbox[0],
                       -bbox[1]])
    coords['transform'] = shift_coordinates(parent_coords['transform'], offset)
    return image, coords, xywh
1093
1094
def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh):
    """Account for a transposition by ``orientation`` degrees, applying it to the image if missing.

    Args:
        log: logger used for the progress message
        name (string): description of the image, used in the log message only
        orientation: rotation angle; 90, 180 and 270 map to the respective
            ``PIL.Image`` transposition (other values yield ``None``)
        segment_image: the image to transpose (if still necessary)
        segment_coords (dict): coordinate information; its ``transform``,
            ``angle`` and possibly ``features`` are updated in place
        segment_xywh (dict): expected image geometry; ``w`` and ``h``
            are updated in place

    Returns:
        a tuple of the (possibly transposed) image, ``segment_coords``
        and ``segment_xywh``.
    """
    methods = {
        90: Image.ROTATE_90,
        180: Image.ROTATE_180,
        270: Image.ROTATE_270,
    }
    transposition = methods.get(orientation) # no default
    width, height = segment_xywh['w'], segment_xywh['h']
    # transpose around the center in the affine coordinate transform
    # (consistent with image transposition or AlternativeImage below):
    center = np.array([0.5 * width,
                       0.5 * height])
    segment_coords['transform'] = transpose_coordinates(
        segment_coords['transform'], transposition, center)
    new_size = adjust_canvas_to_transposition([width, height], transposition)
    segment_xywh['w'], segment_xywh['h'] = new_size
    segment_coords['angle'] += orientation
    # transpose the image itself, if (still) necessary:
    rotation_feature = 'rotated-%d' % orientation
    if rotation_feature not in segment_coords['features']:
        log.info("Transposing %s by %d°", name, orientation)
        segment_image = transpose_image(segment_image, transposition)
        segment_coords['features'] += ',' + rotation_feature
    return segment_image, segment_coords, segment_xywh
1115
1116
def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs):
    """Deskew a segment by rotating it around its center.

    The affine transform, the canvas size and the accumulated angle are
    always updated. The image itself is only rotated when
    ``segment_coords['features']`` does not yet contain ``deskewed``.
    For segments that get cropped (everything below page level, or a
    ``BorderType`` whose features contain ``cropped``), the result is
    re-cropped after rotating; if the image was already deskewed, only the
    coordinates are shifted as if it had been re-cropped.

    Args:
        log: logger receiving the progress message.
        name: label of the segment, used in the log message only.
        skew: rotation angle, passed to ``rotate_coordinates``,
            ``adjust_canvas_to_rotation`` and ``rotate_image``.
        segment: the segment object, or a falsy value to skip any
            (re-)cropping.
        segment_image: image passed on to ``rotate_image``.
        segment_coords: dict with the keys ``transform``, ``angle`` and
            ``features``; modified in place.
        segment_xywh: dict with the keys ``w`` and ``h``; modified in place
            (or replaced when only the coordinates are shifted).
        **kwargs: forwarded to ``rotate_image`` and ``_crop``.

    Returns:
        The tuple ``(segment_image, segment_coords, segment_xywh)``.
    """
    # Rotate around the canvas center in the affine coordinate transform
    # (consistent with the image rotation or AlternativeImage below).
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = rotate_coordinates(
        segment_coords['transform'], skew, center)

    # The canvas has to grow to accommodate the rotated content.
    old_size = [segment_xywh['w'], segment_xywh['h']]
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_rotation(
        old_size, skew)
    segment_coords['angle'] += skew

    # Below page level we always crop; on page level only if already cropped.
    crops = segment and (
        not isinstance(segment, BorderType) or
        'cropped' in segment_coords['features'])

    if 'deskewed' not in segment_coords['features']:
        # Deskewing is (still) necessary.
        log.info("Rotating %s by %.2f°", name, skew)
        segment_image = rotate_image(segment_image, skew, **kwargs)
        segment_coords['features'] += ',deskewed'
        if crops:
            # Re-crop to the new bbox (which may deviate
            # if the segment polygon was not a rectangle).
            segment_image, segment_coords, segment_xywh = _crop(
                log, name, segment, segment_image, segment_coords,
                op='recropped', **kwargs)
    elif crops:
        # Image is already deskewed: only shift coordinates as if re-cropping.
        polygon = coordinates_of_segment(segment, segment_image, segment_coords)
        bbox = bbox_from_polygon(polygon)
        segment_xywh = xywh_from_bbox(*bbox)
        offset = np.array([-bbox[0],
                           -bbox[1]])
        segment_coords['transform'] = shift_coordinates(
            segment_coords['transform'], offset)
    return segment_image, segment_coords, segment_xywh
1151
1152
def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwargs):
    """Resize a segment linearly by ``factor``.

    The affine transform, the accumulated scale and the canvas size are
    always updated. The image itself is only resized when
    ``segment_coords['features']`` does not yet contain ``scaled``.

    Args:
        log: logger receiving the progress message.
        name: label of the segment, used in the log message only.
        factor: scaling factor applied to both axes.
        segment_image: image to resize; its ``width``, ``height`` and
            ``resize`` members are used.
        segment_coords: dict with the keys ``transform`` and ``features``
            (and optionally ``scale``, which defaults to 1.0); modified
            in place.
        segment_xywh: dict with the keys ``w`` and ``h``; modified in place.
        **kwargs: accepted for signature parity with the sibling helpers;
            not used here.

    Returns:
        The tuple ``(segment_image, segment_coords, segment_xywh)``.
    """
    # scale_coordinates is not among the names this module imports from
    # ocrd_utils at the top, so calling it used to raise a NameError.
    # NOTE(review): assumes the installed ocrd_utils exports
    # scale_coordinates; alternatively add it to the module-level import.
    from ocrd_utils import scale_coordinates

    # Resize linearly in the affine coordinate transform
    # (consistent with the image resizing below).
    segment_coords['transform'] = scale_coordinates(
        segment_coords['transform'], [factor, factor])
    segment_coords['scale'] = segment_coords.setdefault('scale', 1.0) * factor
    segment_xywh['w'] *= factor
    segment_xywh['h'] *= factor
    # resize, if (still) necessary
    if 'scaled' not in segment_coords['features']:
        log.info("Scaling %s by %.2f", name, factor)
        segment_coords['features'] += ',scaled'
        # FIXME: validate factor against PAGE-XML attributes
        # FIXME: factor should become less precise due to rounding
        segment_image = segment_image.resize((int(segment_image.width * factor),
                                              int(segment_image.height * factor)),
                                             # slowest, but highest quality:
                                             Image.BICUBIC)
    return segment_image, segment_coords, segment_xywh
1170