Passed
Pull Request — master (#673)
by Konstantin
02:29
created

ocrd.workspace.Workspace.merge()   B

Complexity

Conditions 6

Size

Total Lines 22
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 14
dl 0
loc 22
rs 8.6666
c 0
b 0
f 0
cc 6
nop 4
1
import io
2
from os import makedirs, unlink, listdir, path
3
from pathlib import Path
4
from shutil import move, copyfileobj
5
from re import sub
6
7
from cv2 import COLOR_GRAY2BGR, COLOR_RGB2BGR, cvtColor
8
from PIL import Image
9
import numpy as np
10
from deprecated.sphinx import deprecated
11
12
from ocrd_models import OcrdMets, OcrdFile
13
from ocrd_models.ocrd_page import parse, BorderType, to_xml
14
from ocrd_modelfactory import exif_from_filename, page_from_file
15
from ocrd_utils import (
16
    atomic_write,
17
    getLogger,
18
    image_from_polygon,
19
    coordinates_of_segment,
20
    adjust_canvas_to_rotation,
21
    adjust_canvas_to_transposition,
22
    shift_coordinates,
23
    rotate_coordinates,
24
    transform_coordinates,
25
    transpose_coordinates,
26
    crop_image,
27
    rotate_image,
28
    transpose_image,
29
    bbox_from_polygon,
30
    polygon_from_points,
31
    xywh_from_bbox,
32
    pushd_popd,
33
    MIME_TO_EXT,
34
    MIME_TO_PIL,
35
    MIMETYPE_PAGE,
36
    REGEX_PREFIX
37
)
38
39
from .workspace_backup import WorkspaceBackupManager
40
41
class Workspace():
42
    """
43
    A workspace is a temporary directory set up for a processor. It's the
44
    interface to the METS/PAGE XML and delegates download and upload to the
45
    :py:class:`ocrd.Resolver`.
46
47
    Args:
48
49
        directory (string) : Filesystem folder to work in
50
        mets (:py:class:`ocrd_models.ocrd_mets.OcrdMets`) : `OcrdMets` representing this workspace. 
51
            Loaded from `'mets.xml'` if `None`.
52
        mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url.
53
        overwrite_mode (boolean) : Whether to force add operations on this workspace globally
54
        baseurl (string) : Base URL to prefix to relative URL.
55
    """
56
57
    def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, baseurl=None):
58
        self.resolver = resolver
59
        self.directory = directory
60
        self.mets_target = str(Path(directory, mets_basename))
61
        self.overwrite_mode = False
62
        if mets is None:
63
            mets = OcrdMets(filename=self.mets_target)
64
        self.mets = mets
65
        self.automatic_backup = automatic_backup
66
        self.baseurl = baseurl
67
        #  print(mets.to_xml(xmllint=True).decode('utf-8'))
68
69
    def __str__(self):
70
        return 'Workspace[directory=%s, baseurl=%s, file_groups=%s, files=%s]' % (
71
            self.directory,
72
            self.baseurl,
73
            self.mets.file_groups,
74
            [str(f) for f in self.mets.find_all_files()],
75
        )
76
77
    def reload_mets(self):
        """
        Replace the in-memory METS with a fresh one built from ``self.mets_target``.
        """
        fresh_mets = OcrdMets(filename=self.mets_target)
        self.mets = fresh_mets
82
83
    def merge(self, other_workspace, copy_files=True, **kwargs):
84
        """
85
        Merge ``other_workspace`` into this one
86
87
        See :py:func:OcrdMets.merge: for the ``kwargs``
88
89
        Keyword Args:
90
            copy_files (boolean): Whether to copy files from ``other_workspace`` to this one
91
        """
92
        def after_add_cb(f):
93
            if not copy_files:
94
                return
95
            fpath_src = Path(other_workspace.directory, f.url)
96
            fpath_dest = Path(self.directory, f.url)
97
            if fpath_src.exists():
98
                if fpath_dest.exists():
99
                    raise Exception("Copying %s to %s would overwrite the latter" % (fpath_src, fpath_dest))
100
                if not fpath_dest.parent.is_dir():
101
                    makedirs(str(fpath_dest.parent))
102
                with open(str(fpath_src), 'rb') as fstream_in, open(str(fpath_dest), 'wb') as fstream_out:
103
                    copyfileobj(fstream_in, fstream_out)
104
        self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs)
105
106
107
    @deprecated(version='1.0.0', reason="Use workspace.download_file")
    def download_url(self, url, **kwargs):
        """
        Wrap ``url`` in an :py:class:`ocrd_models.ocrd_file.OcrdFile` and
        download it via :py:meth:`download_file`.

        Args:
            url (string): URL to download to directory
            **kwargs : See :py:class:`ocrd_models.ocrd_file.OcrdFile`

        Returns:
            The local filename of the downloaded file
        """
        ocrd_file = OcrdFile(None, url=url, **kwargs)
        return self.download_file(ocrd_file).local_filename
122
123
124
    def download_file(self, f, _recursion_count=0):
        """
        Make an :py:class:`ocrd_models.ocrd_file.OcrdFile` available in the workspace.

        If ``f.url`` is an existing path inside the workspace directory, it is
        kept as it is. Otherwise the resolver is asked to download it into the
        workspace; if that raises ``FileNotFoundError``, the download is
        retried exactly once with ``self.baseurl`` prepended to the URL.

        Arguments:
            f (:py:class:`ocrd_models.ocrd_file.OcrdFile`): the file to download
        Keyword Args:
            _recursion_count (int): internal counter for the ``baseurl`` retry

        Returns:
            ``f`` with ``url`` and ``local_filename`` set to the local path
        """
        log = getLogger('ocrd.workspace.download_file')
        log.debug('download_file %s [_recursion_count=%s]', f, _recursion_count)
        with pushd_popd(self.directory):
            try:
                # If the f.url is already a file path, and is within self.directory, do nothing
                candidate = Path(f.url).resolve()
                if not candidate.exists():
                    raise Exception("Not already downloaded, moving on")
                # relative_to() raises if candidate is outside the workspace
                if not candidate.relative_to(str(Path(self.directory).resolve())):
                    raise Exception("Not already downloaded, moving on")
            except Exception:
                if f.ID:
                    basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, ''))
                else:
                    basename = f.basename
                try:
                    f.url = self.resolver.download_to_directory(
                        self.directory, f.url, subdir=f.fileGrp, basename=basename)
                except FileNotFoundError as err:
                    if not self.baseurl:
                        raise Exception("No baseurl defined by workspace. Cannot retrieve '%s'" % f.url)
                    if _recursion_count >= 1:
                        raise Exception("Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url))
                    log.debug("First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s", f.url, self.baseurl, err)
                    f.url = '%s/%s' % (self.baseurl, f.url)
                    retried = self.download_file(f, _recursion_count + 1)
                    f.url = retried.local_filename
            f.local_filename = f.url
            return f
150
151
    def remove_file(self, ID, force=False, keep_file=False, page_recursive=False, page_same_group=False):
        """
        Remove a METS `file` from the workspace.

        Arguments:
            ID (string|:py:class:`ocrd_models.ocrd_file.OcrdFile`): `@ID` of the METS `file`
                to delete or the file itself
        Keyword Args:
            force (boolean): Continue removing even if file not found in METS
            keep_file (boolean): Whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file
                if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
                Has no effect unless ``page_recursive`` is `True`.

        Returns:
            whatever ``self.mets.remove_file`` returned, or `None` if a
            ``FileNotFoundError`` was suppressed because of ``force``
        """
        log = getLogger('ocrd.workspace.remove_file')
        log.debug('Deleting mets:file %s', ID)
        # the workspace-wide overwrite mode implies force
        if self.overwrite_mode and not force:
            force = True
        file_id = ID.ID if isinstance(ID, OcrdFile) else ID
        try:
            removed = self.mets.remove_file(file_id)
            # normalize to a list, a single OcrdFile may be returned
            removed_files = removed if not isinstance(removed, OcrdFile) else [removed]
            if page_recursive:
                with pushd_popd(self.directory):
                    for page_file in removed_files:
                        if page_file.mimetype == MIMETYPE_PAGE:
                            pcgts = parse(self.download_file(page_file).local_filename, silence=True)
                            for image_path in pcgts.get_AllAlternativeImagePaths():
                                search = {'url': image_path}
                                if page_same_group:
                                    search['fileGrp'] = page_file.fileGrp
                                for image_file in self.mets.find_files(**search):
                                    self.remove_file(image_file, keep_file=keep_file, force=force)
            if not keep_file:
                with pushd_popd(self.directory):
                    for removed_file in removed_files:
                        if removed_file.local_filename:
                            log.info("rm %s [cwd=%s]", removed_file.local_filename, self.directory)
                            unlink(removed_file.local_filename)
                            continue
                        log.warning("File not locally available %s", removed_file)
                        if not force:
                            raise Exception("File not locally available %s" % removed_file)
            return removed
        except FileNotFoundError:
            # swallowed only when forced
            if not force:
                raise
201
202
    def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False):
        """
        Remove a METS `fileGrp`.

        Arguments:
            USE (string): `@USE` of the METS `fileGrp` to delete
        Keyword Args:
            recursive (boolean): Whether to recursively delete all files in the group
            force (boolean): Continue removing even if group or containing files not found in METS
            keep_files (boolean): When deleting recursively whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file
                if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
                Has no effect unless ``page_recursive`` is `True`.

        Raises:
            Exception: if ``USE`` is neither prefixed with ``REGEX_PREFIX`` nor an
                existing fileGrp, and ``force`` is not in effect
        """
        def rmdir_if_empty(dirname):
            # only directories that exist and have no entries are removed
            if Path(dirname).is_dir() and not listdir(dirname):
                Path(dirname).rmdir()

        # the workspace-wide overwrite mode implies force
        if self.overwrite_mode and not force:
            force = True

        if not (USE.startswith(REGEX_PREFIX) or USE in self.mets.file_groups or force):
            raise Exception("No such fileGrp: %s" % USE)

        file_dirs = []
        if recursive:
            for group_file in self.mets.find_files(fileGrp=USE):
                self.remove_file(
                    group_file,
                    force=force,
                    keep_file=keep_files,
                    page_recursive=page_recursive,
                    page_same_group=page_same_group)
                if group_file.local_filename:
                    file_dirs.append(path.dirname(group_file.local_filename))

        self.mets.remove_file_group(USE, force=force)

        # PLEASE NOTE: this only removes directories in the workspace if they are empty
        # and named after the fileGrp which is a convention in OCR-D.
        with pushd_popd(self.directory):
            rmdir_if_empty(USE)
            for file_dir in set(file_dirs):
                rmdir_if_empty(file_dir)
241
242
243
    def rename_file_group(self, old, new):
        """
        Rename a METS `fileGrp`.

        Moves the local files of the group from folder ``old`` to folder
        ``new``, updates their URLs in the METS, rewrites matching
        `@imageFilename` / `AlternativeImage/@filename` references in the local
        PAGE-XML files, renames the `fileGrp` itself and finally removes the
        folder ``old`` if it is empty.

        Arguments:
            old (string): `@USE` of the METS `fileGrp` to rename
            new (string): `@USE` of the METS `fileGrp` to rename as

        Raises:
            ValueError: if ``old`` does not exist or ``new`` already exists
        """
        log = getLogger('ocrd.workspace.rename_file_group')

        if old not in self.mets.file_groups:
            raise ValueError("No such fileGrp: %s" % old)
        if new in self.mets.file_groups:
            raise ValueError("fileGrp already exists %s" % new)

        old_prefix = '%s/' % old
        new_prefix = '%s/' % new
        with pushd_popd(self.directory):
            # create workspace dir ``new``
            log.info("mkdir %s", new)
            if not Path(new).is_dir():
                Path(new).mkdir()
            url_replacements = {}
            log.info("Moving files")
            for mets_file in self.mets.find_files(fileGrp=old, local_only=True):
                old_url = mets_file.url
                # Replace the leading folder literally: fileGrp names are plain
                # text, so neither regex metacharacters in ``old`` nor
                # backslash sequences in ``new`` may be interpreted.
                if old_url.startswith(old_prefix):
                    new_url = new_prefix + old_url[len(old_prefix):]
                else:
                    new_url = old_url
                url_replacements[old_url] = new_url
                # move file from ``old`` to ``new``
                move(old_url, new_url)
                # change the url of ``mets:file``
                mets_file.url = new_url
            # change file paths in PAGE-XML imageFilename and filename attributes
            for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE, local_only=True):
                log.info("Renaming file references in PAGE-XML %s", page_file)
                pcgts = page_from_file(page_file)
                changed = False
                for old_url, new_url in url_replacements.items():
                    if pcgts.get_Page().imageFilename == old_url:
                        changed = True
                        log.info("Rename pc:Page/@imageFilename: %s -> %s", old_url, new_url)
                        pcgts.get_Page().imageFilename = new_url
                for ai in pcgts.get_Page().get_AllAlternativeImages():
                    for old_url, new_url in url_replacements.items():
                        if ai.filename == old_url:
                            changed = True
                            log.info("Rename pc:Page/../AlternativeImage: %s -> %s", old_url, new_url)
                            ai.filename = new_url
                if changed:
                    log.info("PAGE-XML changed, writing %s", page_file.local_filename)
                    with open(page_file.local_filename, 'w', encoding='utf-8') as f:
                        f.write(to_xml(pcgts))
            # change the ``USE`` attribute of the fileGrp
            self.mets.rename_file_group(old, new)
            # Remove the old dir
            log.info("rmdir %s", old)
            if Path(old).is_dir() and not listdir(old):
                Path(old).rmdir()
298
299
    def add_file(self, file_grp, content=None, **kwargs):
        """
        Add a file to the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace.

        Arguments:
            file_grp (string): `@USE` of the METS `fileGrp` to add to
        Keyword Args:
            content (string|bytes): optional content to write to the file
                in the filesystem
            **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.add_file`
        Returns:
            a new :py:class:`ocrd_models.ocrd_file.OcrdFile`
        Raises:
            ValueError: if no ``pageId`` keyword argument is passed
            Exception: if ``content`` is given without ``local_filename``
        """
        log = getLogger('ocrd.workspace.add_file')
        has_content = content is not None
        log.debug(
            'outputfile file_grp=%s local_filename=%s content=%s',
            file_grp, kwargs.get('local_filename'), has_content)
        if 'pageId' not in kwargs:
            raise ValueError("workspace.add_file must be passed a 'pageId' kwarg, even if it is None.")
        if has_content and 'local_filename' not in kwargs:
            raise Exception("'content' was set but no 'local_filename'")
        # the workspace-wide overwrite mode forces the METS-level add
        if self.overwrite_mode:
            kwargs['force'] = True

        with pushd_popd(self.directory):
            if 'local_filename' in kwargs:
                local_filename = kwargs['local_filename']
                # If the local filename has folder components, create those folders
                parent_dir, slash, _ = local_filename.rpartition('/')
                if slash and not Path(parent_dir).is_dir():
                    makedirs(parent_dir)
                # the URL defaults to the local filename
                kwargs.setdefault('url', local_filename)

            ret = self.mets.add_file(file_grp, **kwargs)

            if has_content:
                with open(kwargs['local_filename'], 'wb') as f:
                    # text content is stored UTF-8 encoded
                    payload = bytes(content, 'utf-8') if isinstance(content, str) else content
                    f.write(payload)

        return ret
344
345
    def save_mets(self):
        """
        Serialize the current METS and write it to ``self.mets_target``.

        If ``automatic_backup`` is set, a backup is added through
        :py:class:`WorkspaceBackupManager` before writing.
        """
        log = getLogger('ocrd.workspace.save_mets')
        log.info("Saving mets '%s'", self.mets_target)
        if self.automatic_backup:
            backup_manager = WorkspaceBackupManager(self)
            backup_manager.add()
        with atomic_write(self.mets_target) as f:
            xml_bytes = self.mets.to_xml(xmllint=True)
            f.write(xml_bytes.decode('utf-8'))
355
356
    def resolve_image_exif(self, image_url):
        """
        Get the EXIF metadata about an image URL as :py:class:`ocrd_models.ocrd_exif.OcrdExif`

        The first METS `file` found for ``image_url`` is used; if there is
        none, an ad-hoc :py:class:`ocrd_models.ocrd_file.OcrdFile` for the URL
        is downloaded instead.

        Args:
            image_url (string) : `@href` (path or URL) of the METS `file` to inspect

        Returns:
            :py:class:`ocrd_models.ocrd_exif.OcrdExif`

        Raises:
            Exception: if ``image_url`` is empty
        """
        if not image_url:
            # avoid "finding" just any file
            raise Exception("Cannot resolve empty image path")
        matches = self.mets.find_files(url=image_url)
        ocrd_file = next(matches, OcrdFile(None, url=image_url))
        local_path = self.download_file(ocrd_file).local_filename
        return exif_from_filename(local_path)
373
374
    @deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment")
    def resolve_image_as_pil(self, image_url, coords=None):
        """
        Deprecated public entry point that forwards to
        :py:meth:`_resolve_image_as_pil` to turn an image URL into a `PIL.Image`.

        Arguments:
            image_url (string): `@href` (path or URL) of the METS `file` to retrieve
        Keyword Args:
            coords (list) : Coordinates of the bounding box to cut from the image

        Returns:
            Full or cropped `PIL.Image`

        """
        pil_image = self._resolve_image_as_pil(image_url, coords)
        return pil_image
389
390
    def _resolve_image_as_pil(self, image_url, coords=None):
        """
        Resolve an image URL to a `PIL.Image`, reduced to 8 bit if necessary.

        The first METS `file` found for ``image_url`` is used; if there is
        none, an ad-hoc :py:class:`ocrd_models.ocrd_file.OcrdFile` for the URL
        is downloaded instead. Images whose mode starts with ``I`` or ``F``
        are re-quantized to 8 bit via numpy.

        Arguments:
            image_url (string): `@href` (path or URL) of the METS `file` to retrieve
        Keyword Args:
            coords (list): points given as (x, y) pairs; if set, the image is
                cut to the rectangle between their minimum and maximum values
                (the maximum being exclusive, as in array slicing)

        Returns:
            Full or cropped `PIL.Image`

        Raises:
            Exception: if ``image_url`` is empty
        """
        if not image_url:
            # avoid "finding" just any file
            raise Exception("Cannot resolve empty image path")
        log = getLogger('ocrd.workspace._resolve_image_as_pil')
        f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
        image_filename = self.download_file(f).local_filename

        with pushd_popd(self.directory):
            pil_image = Image.open(image_filename)
            pil_image.load() # alloc and give up the FD

        # Pillow does not properly support higher color depths
        # (e.g. 16-bit or 32-bit or floating point grayscale),
        # clipping its dynamic range to the lower 8-bit in
        # many operations (including paste, putalpha, ImageStat...),
        # even including conversion.
        # Cf. Pillow#3011 Pillow#3159 Pillow#3838 (still open in 8.0)
        # So to be on the safe side, we must re-quantize these
        # to 8-bit via numpy (conversion to/from which fortunately
        # seems to work reliably):
        if (pil_image.mode.startswith('I') or
            pil_image.mode.startswith('F')):
            arr_image = np.array(pil_image)
            if arr_image.dtype.kind == 'i':
                # signed integer is *not* trustworthy in this context
                # (usually a mistake in the array interface)
                log.debug('Casting image "%s" from signed to unsigned', image_url)
                # reinterpret the same bytes as unsigned; view() is the
                # supported way to do this (assigning to .dtype is discouraged)
                arr_image = arr_image.view(np.dtype('u' + arr_image.dtype.name))
            if arr_image.dtype.kind == 'u':
                # integer needs to be scaled linearly to 8 bit
                # of course, an image might actually have some lower range
                # (e.g. 10-bit in I;16 or 20-bit in I or 4-bit in L),
                # but that would be guessing anyway, so here don't
                # make assumptions on _scale_, just reduce _precision_
                log.debug('Reducing image "%s" from depth %d bit to 8 bit',
                          image_url, arr_image.dtype.itemsize * 8)
                # keep only the most significant byte of each sample
                arr_image = arr_image >> 8 * (arr_image.dtype.itemsize-1)
                arr_image = arr_image.astype(np.uint8)
            elif arr_image.dtype.kind == 'f':
                # float needs to be scaled from [0,1.0] to [0,255]
                log.debug('Reducing image "%s" from floating point to 8 bit',
                          image_url)
                arr_image *= 255
                arr_image = arr_image.astype(np.uint8)
            pil_image = Image.fromarray(arr_image)

        if coords is None:
            return pil_image

        # FIXME: remove or replace this by (image_from_polygon+) crop_image ...
        # NOTE(review): the crop below is taken from the array after cvtColor,
        # so the returned image appears to carry BGR channel order -- confirm
        # whether callers rely on that before changing it.
        log.debug("Converting PIL to OpenCV: %s", image_url)
        color_conversion = COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else  COLOR_RGB2BGR
        pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
        cv2_image = cvtColor(pil_as_np_array, color_conversion)

        poly = np.array(coords, np.int32)
        log.debug("Cutting region %s from %s", coords, image_url)
        # rows are indexed by the second coordinate, columns by the first
        region_cut = cv2_image[
            np.min(poly[:, 1]):np.max(poly[:, 1]),
            np.min(poly[:, 0]):np.max(poly[:, 0])
        ]
        return Image.fromarray(region_cut)
453
454
    def image_from_page(self, page, page_id,
455
                        fill='background', transparency=False,
456
                        feature_selector='', feature_filter=''):
457
        """Extract an image for a PAGE-XML page from the workspace.
458
459
        Args:
460
            page (:py:class:`ocrd_models.ocrd_page.PageType`): a PAGE `PageType` object
461
            page_id (string): its `@ID` in the METS physical `structMap`
462
        Keyword Args:
463
            fill (string): a `PIL` color specifier
464
            transparency (boolean): whether to add an alpha channel for masking
465
            feature_selector (string): a comma-separated list of `@comments` classes
466
            feature_filter (string): a comma-separated list of `@comments` classes            
467
468
        Extract a `PIL.Image` from ``page``, either from its `AlternativeImage`
469
        (if it exists), or from its `@imageFilename` (otherwise). Also crop it,
470
        if a `Border` exists, and rotate it, if any `@orientation` angle is
471
        annotated.
472
473
        If ``feature_selector`` and/or ``feature_filter`` is given, then
474
        select/filter among the `@imageFilename` image and the available
475
        AlternativeImages the last one which contains all of the selected,
476
        but none of the filtered features (i.e. `@comments` classes), or
477
        raise an error.
478
479
        (Required and produced features need not be in the same order, so
480
        ``feature_selector`` is merely a mask specifying Boolean AND, and
481
        ``feature_filter`` is merely a mask specifying Boolean OR.)
482
483
        If the chosen image does not have the feature `"cropped"` yet, but
484
        a `Border` exists, and unless `"cropped"` is being filtered, then crop it.
485
        Likewise, if the chosen image does not have the feature `"deskewed"` yet,
486
        but an `@orientation` angle is annotated, and unless `"deskewed"` is being
487
        filtered, then rotate it. (However, if `@orientation` is above the
488
        [-45°,45°] interval, then apply as much transposition as possible first,
489
        unless `"rotated-90"` / `"rotated-180"` / `"rotated-270"` is being filtered.)
490
491
        Cropping uses a polygon mask (not just the bounding box rectangle).
492
        Areas outside the polygon will be filled according to ``fill``:
493
494
        - if `"background"` (the default),
495
          then fill with the median color of the image;
496
        - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.
497
498
        Moreover, if ``transparency`` is true, and unless the image already
499
        has an alpha channel, then add an alpha channel which is fully opaque
500
        before cropping and rotating. (Thus, unexposed/masked areas will be
501
        transparent afterwards for consumers that can interpret alpha channels).
502
503
        Returns:
504
            a tuple of
505
             * the extracted `PIL.Image`,
506
             * a `dict` with information about the extracted image:
507
               - `"transform"`: a `Numpy` array with an affine transform which
508
                   converts from absolute coordinates to those relative to the image,
509
                   i.e. after cropping to the page's border / bounding box (if any)
510
                   and deskewing with the page's orientation angle (if any)
511
               - `"angle"`: the rotation/reflection angle applied to the image so far,
512
               - `"features"`: the `AlternativeImage` `@comments` for the image, i.e.
513
                 names of all applied operations that lead up to this result,
514
             * an :py:class:`ocrd_models.ocrd_exif.OcrdExif` instance associated with 
515
               the original image.
516
517
        (The first two can be used to annotate a new `AlternativeImage`,
518
         or be passed down with :py:meth:`image_from_segment`.)
519
520
        Examples:
521
522
         * get a raw (colored) but already deskewed and cropped image::
523
524
           page_image, page_coords, page_image_info = workspace.image_from_page(
525
                 page, page_id,
526
                 feature_selector='deskewed,cropped',
527
                 feature_filter='binarized,grayscale_normalized')
528
        """
529
        log = getLogger('ocrd.workspace.image_from_page')
530
        page_image_info = self.resolve_image_exif(page.imageFilename)
531
        page_image = self._resolve_image_as_pil(page.imageFilename)
532
        page_coords = dict()
533
        # use identity as initial affine coordinate transform:
534
        page_coords['transform'] = np.eye(3)
535
        # interim bbox (updated with each change to the transform):
536
        page_bbox = [0, 0, page_image.width, page_image.height]
537
        page_xywh = {'x': 0, 'y': 0,
538
                     'w': page_image.width, 'h': page_image.height}
539
540
        border = page.get_Border()
541
        # page angle: PAGE @orientation is defined clockwise,
542
        # whereas PIL/ndimage rotation is in mathematical direction:
543
        page_coords['angle'] = -(page.get_orientation() or 0)
544
        # map angle from (-180,180] to [0,360], and partition into multiples of 90;
545
        # but avoid unnecessary large remainders, i.e. split symmetrically:
546
        orientation = (page_coords['angle'] + 45) % 360
547
        orientation = orientation - (orientation % 90)
548
        skew = (page_coords['angle'] % 360) - orientation
549
        skew = 180 - (180 - skew) % 360 # map to [-45,45]
550
        page_coords['angle'] = 0 # nothing applied yet (depends on filters)
551
        log.debug("page '%s' has %s orientation=%d skew=%.2f",
552
                  page_id, "border," if border else "", orientation, skew)
553
554
        # initialize AlternativeImage@comments classes as empty:
555
        page_coords['features'] = ''
556
        alternative_image = None
557
        alternative_images = page.get_AlternativeImage()
558 View Code Duplication
        if alternative_images:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
559
            # (e.g. from page-level cropping, binarization, deskewing or despeckling)
560
            if feature_selector or feature_filter:
561
                alternative_image = None
562
                # search from the end, because by convention we always append,
563
                # and among multiple satisfactory images we want the most recent:
564
                for alternative_image in reversed(alternative_images):
565
                    features = alternative_image.get_comments()
566
                    if not features:
567
                        log.warning("AlternativeImage %d for page '%s' does not have any feature attributes",
568
                                    alternative_images.index(alternative_image) + 1, page_id)
569
                        features = ''
570
                    if (all(feature in features
571
                            for feature in feature_selector.split(',') if feature) and
572
                        not any(feature in features
573
                                for feature in feature_filter.split(',') if feature)):
574
                        break
575
                    else:
576
                        alternative_image = None
577
            else:
578
                alternative_image = alternative_images[-1]
579
                features = alternative_image.get_comments()
580
                if not features:
581
                    log.warning("AlternativeImage %d for page '%s' does not have any feature attributes",
582
                                alternative_images.index(alternative_image) + 1, page_id)
583
                    features = ''
584
            if alternative_image:
585
                log.debug("Using AlternativeImage %d (%s) for page '%s'",
586
                          alternative_images.index(alternative_image) + 1,
587
                          features, page_id)
0 ignored issues
show
introduced by
The variable features does not seem to be defined in case the for loop on line 564 is not entered. Are you sure this can never be the case?
Loading history...
588
                page_image = self._resolve_image_as_pil(alternative_image.get_filename())
589
                page_coords['features'] = features
590
591
        # adjust the coord transformation to the steps applied on the image,
592
        # and apply steps on the existing image in case it is missing there,
593
        # but traverse all steps (crop/reflect/rotate) in a particular order:
594
        # - existing image features take priority (in the order annotated),
595
        # - next is cropping (if necessary but not already applied),
596
        # - next is reflection (if necessary but not already applied),
597
        # - next is rotation (if necessary but not already applied).
598
        # This helps deal with arbitrary workflows (e.g. crop then deskew,
599
        # or deskew then crop), regardless of where images are generated.
600
        alternative_image_features = page_coords['features'].split(',')
601
        for duplicate_feature in set([feature for feature in alternative_image_features
602
                                      # features relevant in reconstructing coordinates:
603
                                      if (feature in ['cropped', 'deskewed', 'rotated-90',
604
                                                      'rotated-180', 'rotated-270'] and
605
                                          alternative_image_features.count(feature) > 1)]):
606
            log.error("Duplicate feature %s in AlternativeImage for page '%s'",
607
                      duplicate_feature, page_id)
608
        for i, feature in enumerate(alternative_image_features +
609
                                    (['cropped']
610
                                     if (border and
611
                                         not 'cropped' in alternative_image_features and
612
                                         not 'cropped' in feature_filter.split(','))
613
                                     else []) +
614
                                    (['rotated-%d' % orientation]
615
                                     if (orientation and
616
                                         not 'rotated-%d' % orientation in alternative_image_features and
617
                                         not 'rotated-%d' % orientation in feature_filter.split(','))
618
                                     else []) +
619
                                    (['deskewed']
620
                                     if (skew and
621
                                         not 'deskewed' in alternative_image_features and
622
                                         not 'deskewed' in feature_filter.split(','))
623
                                     else []) +
624
                                    # not a feature to be added, but merely as a fallback position
625
                                    # to always enter loop at i == len(alternative_image_features)
626
                                    ['_check']):
627
            # image geometry vs feature consistency can only be checked
628
            # after all features on the existing AlternativeImage have
629
            # been adjusted for in the transform, and when there is a mismatch,
630
            # additional steps applied here would only repeat the respective
631
            # error message; so we only check once at the boundary between
632
            # existing and new features
633
            # FIXME we should check/enforce consistency when _adding_ AlternativeImage
634
            if (i == len(alternative_image_features) and
635
                not (page_xywh['w'] - 2 < page_image.width < page_xywh['w'] + 2 and
636
                     page_xywh['h'] - 2 < page_image.height < page_xywh['h'] + 2)):
637
                log.error('page "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
638
                          page_id, page_coords['features'],
639
                          page_image.width, page_image.height,
640
                          page_xywh['w'], page_xywh['h'])
641
            name = "%s for page '%s'" % ("AlternativeImage" if alternative_image
642
                                         else "original image", page_id)
643
            # adjust transform to feature, and ensure feature is applied to image
644
            if feature == 'cropped':
645
                page_image, page_coords, page_xywh = _crop(
646
                    log, name, border, page_image, page_coords,
647
                    fill=fill, transparency=transparency)
648
            elif feature == 'rotated-%d' % orientation:
649
                page_image, page_coords, page_xywh = _reflect(
650
                    log, name, orientation, page_image, page_coords, page_xywh)
651
            elif feature == 'deskewed':
652
                page_image, page_coords, page_xywh = _rotate(
653
                    log, name, skew, border, page_image, page_coords, page_xywh,
654
                    fill=fill, transparency=transparency)
655
656
        # verify constraints again:
657
        if not all(feature in page_coords['features']
658
                   for feature in feature_selector.split(',') if feature):
659
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
660
                            'selector="%s" in page "%s"' % (
661
                                feature_selector, page_id))
662
        if any(feature in page_coords['features']
663
               for feature in feature_filter.split(',') if feature):
664
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
665
                            'filter="%s" in page "%s"' % (
666
                                feature_filter, page_id))
667
        page_image.format = 'PNG' # workaround for tesserocr#194
668
        return page_image, page_coords, page_image_info
669
670
    def image_from_segment(self, segment, parent_image, parent_coords,
                           fill='background', transparency=False,
                           feature_selector='', feature_filter=''):
        """Extract an image for a PAGE-XML hierarchy segment from its parent's image.

        Args:
            segment (object): a PAGE segment object
                (i.e. :py:class:`ocrd_models.ocrd_page.TextRegionType`
                or :py:class:`ocrd_models.ocrd_page.TextLineType`
                or :py:class:`ocrd_models.ocrd_page.WordType`
                or :py:class:`ocrd_models.ocrd_page.GlyphType`)
            parent_image (PIL.Image): image of the segment's parent
            parent_coords (dict): a `dict` with information about ``parent_image``:

                - `"transform"`: a `Numpy` array with an affine transform which
                  converts from absolute coordinates to those relative to the image,
                  i.e. after applying all operations (starting with the original image)
                - `"angle"`: the rotation/reflection angle applied to the image so far,
                - `"features"`: the `AlternativeImage` `@comments` for the image, i.e.
                  names of all operations that lead up to this result.
        Keyword Args:
            fill (string): a `PIL` color specifier
            transparency (boolean): whether to add an alpha channel for masking
            feature_selector (string): a comma-separated list of `@comments` classes
            feature_filter (string): a comma-separated list of `@comments` classes

        Extract a `PIL.Image` from ``segment``, either from `AlternativeImage`
        (if it exists), or producing a new image via cropping from ``parent_image``
        (otherwise). Pass in ``parent_image`` and ``parent_coords`` from the result
        of the next higher-level of this function or from :py:meth:`image_from_page`.

        If ``feature_selector`` and/or ``feature_filter`` is given, then
        select/filter among the cropped ``parent_image`` and the available
        `AlternativeImage`s the last one which contains all of the selected,
        but none of the filtered features (i.e. `@comments` classes), or
        raise an error.

        (Required and produced features need not be in the same order, so
        ``feature_selector`` is merely a mask specifying Boolean AND, and
        ``feature_filter`` is merely a mask specifying Boolean OR.)

        Cropping uses a polygon mask (not just the bounding box rectangle).
        Areas outside the polygon will be filled according to ``fill``:

        - if `"background"` (the default),
          then fill with the median color of the image;
        - otherwise, use the given color, e.g. `"white"` or `(255,255,255)`.

        Moreover, if ``transparency`` is true, and unless the image already
        has an alpha channel, then add an alpha channel which is fully opaque
        before cropping and rotating. (Thus, unexposed/masked areas will be
        transparent afterwards for consumers that can interpret alpha channels).

        When cropping, compensate any `@orientation` angle annotated for the
        parent (from parent-level deskewing) by rotating the segment coordinates
        in an inverse transformation (i.e. translation to center, then passive
        rotation, and translation back).

        Regardless, if any `@orientation` angle is annotated for the segment
        (from segment-level deskewing), and the chosen image does not have
        the feature `"deskewed"` yet, and unless `"deskewed"` is being filtered,
        then rotate it - compensating for any previous `"angle"`. (However,
        if `@orientation` is above the [-45°,45°] interval, then apply as much
        transposition as possible first, unless `"rotated-90"` / `"rotated-180"` /
        `"rotated-270"` is being filtered.)

        Returns:
            a tuple of

             * the extracted `PIL.Image`,
             * a `dict` with information about the extracted image:

               - `"transform"`: a `Numpy` array with an affine transform which
                 converts from absolute coordinates to those relative to the image,
                 i.e. after applying all parent operations, and then cropping to
                 the segment's bounding box, and deskewing with the segment's
                 orientation angle (if any)
               - `"angle"`: the rotation/reflection angle applied to the image so far,
               - `"features"`: the `AlternativeImage` `@comments` for the image, i.e.
                 names of all applied operations that lead up to this result.

        (These can be used to create a new `AlternativeImage`, or passed down
        for :py:meth:`image_from_segment` calls on lower hierarchy levels.)

        Raises:
            Exception: if the resulting image lacks any feature requested via
                ``feature_selector``, or has any feature listed in ``feature_filter``.

        Examples:

         * get a raw (colored) but already deskewed and cropped image::

               region_image, region_coords = workspace.image_from_segment(
                   region, page_image, page_coords,
                   feature_selector='deskewed,cropped',
                   feature_filter='binarized,grayscale_normalized')
        """
        log = getLogger('ocrd.workspace.image_from_segment')
        # note: We should mask overlapping neighbouring segments here,
        # but finding the right clipping rules can be difficult if operating
        # on the raw (non-binary) image data alone: for each intersection, it
        # must be decided which one of either segment or neighbour to assign,
        # e.g. an ImageRegion which properly contains our TextRegion should be
        # completely ignored, but an ImageRegion which is properly contained
        # in our TextRegion should be completely masked, while partial overlap
        # may be more difficult to decide. On the other hand, on the binary image,
        # we can use connected component analysis to mask foreground areas which
        # originate in the neighbouring regions. But that would introduce either
        # the assumption that the input has already been binarized, or a dependency
        # on some ad-hoc binarization method. Thus, it is preferable to use
        # a dedicated processor for this (which produces clipped AlternativeImage
        # or reduced polygon coordinates).
        segment_image, segment_coords, segment_xywh = _crop(
            log, "parent image for segment '%s'" % segment.id,
            segment, parent_image, parent_coords,
            fill=fill, transparency=transparency)

        # Semantics of missing @orientation at region level could be either
        # - inherited from page level: same as line or word level (no @orientation),
        # - zero (unrotate page angle): different from line or word level (because
        #   otherwise deskewing would never have an effect on lines and words)
        # The PAGE specification is silent here (but does generally not concern itself
        # much with AlternativeImage coordinate consistency).
        # Since our (generateDS-backed) ocrd_page supports the zero/none distinction,
        # we choose the former (i.e. None is inheritance).
        if 'orientation' in segment.__dict__ and segment.get_orientation() is not None:
            # region angle: PAGE @orientation is defined clockwise,
            # whereas PIL/ndimage rotation is in mathematical direction:
            angle = -segment.get_orientation()
            # @orientation is always absolute; if higher levels
            # have already rotated, then we must compensate:
            angle -= parent_coords['angle']
            # map angle from (-180,180] to [0,360], and partition into multiples of 90;
            # but avoid unnecessary large remainders, i.e. split symmetrically:
            orientation = (angle + 45) % 360
            orientation = orientation - (orientation % 90)
            skew = (angle % 360) - orientation
            skew = 180 - (180 - skew) % 360 # map to [-45,45]
            log.debug("segment '%s' has orientation=%d skew=%.2f",
                      segment.id, orientation, skew)
        else:
            orientation = 0
            skew = 0
        segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)

        # initialize AlternativeImage@comments classes from parent, except
        # for those operations that can apply on multiple hierarchy levels:
        segment_coords['features'] = ','.join(
            [feature for feature in parent_coords['features'].split(',')
             if feature in ['binarized', 'grayscale_normalized',
                            'despeckled', 'dewarped']])

        # requested constraints, split once (empty entries carry no meaning):
        selected_features = [feature for feature in feature_selector.split(',') if feature]
        filtered_features = [feature for feature in feature_filter.split(',') if feature]

        alternative_image = None
        alternative_images = segment.get_AlternativeImage()
        if alternative_images:
            # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
            if feature_selector or feature_filter:
                # search from the end, because by convention we always append,
                # and among multiple satisfactory images we want the most recent:
                for candidate in reversed(alternative_images):
                    features = candidate.get_comments()
                    if not features:
                        log.warning("AlternativeImage %d for segment '%s' does not have any feature attributes",
                                    alternative_images.index(candidate) + 1, segment.id)
                        features = ''
                    # note: membership is tested on the @comments value as-is
                    # (i.e. a substring test if that value is a string)
                    if (all(feature in features for feature in selected_features) and
                        not any(feature in features for feature in filtered_features)):
                        alternative_image = candidate
                        break
            else:
                # no constraints: simply take the most recent one
                alternative_image = alternative_images[-1]
                features = alternative_image.get_comments()
                if not features:
                    log.warning("AlternativeImage %d for segment '%s' does not have any feature attributes",
                                alternative_images.index(alternative_image) + 1, segment.id)
                    features = ''
            if alternative_image:
                log.debug("Using AlternativeImage %d (%s) for segment '%s'",
                          alternative_images.index(alternative_image) + 1,
                          features, segment.id)
                segment_image = self._resolve_image_as_pil(alternative_image.get_filename())
                segment_coords['features'] = features

        alternative_image_features = segment_coords['features'].split(',')
        for duplicate_feature in {feature for feature in alternative_image_features
                                  # features relevant in reconstructing coordinates:
                                  if (feature in ['deskewed', 'rotated-90',
                                                  'rotated-180', 'rotated-270'] and
                                      alternative_image_features.count(feature) > 1)}:
            log.error("Duplicate feature %s in AlternativeImage for segment '%s'",
                      duplicate_feature, segment.id)

        # steps still to be applied here (transposition first, then deskewing),
        # unless already present on the chosen image or explicitly filtered:
        transposed = 'rotated-%d' % orientation
        additional_features = []
        if (orientation and
            transposed not in alternative_image_features and
            transposed not in filtered_features):
            additional_features.append(transposed)
        if (skew and
            'deskewed' not in alternative_image_features and
            'deskewed' not in filtered_features):
            additional_features.append('deskewed')

        name = "%s for segment '%s'" % ("AlternativeImage" if alternative_image
                                        else "parent image", segment.id)
        for i, feature in enumerate(alternative_image_features +
                                    additional_features +
                                    # not a feature to be added, but merely as a fallback position
                                    # to always enter loop at i == len(alternative_image_features)
                                    ['_check']):
            # image geometry vs feature consistency can only be checked
            # after all features on the existing AlternativeImage have
            # been adjusted for in the transform, and when there is a mismatch,
            # additional steps applied here would only repeat the respective
            # error message; so we only check once at the boundary between
            # existing and new features
            # FIXME we should enforce consistency here (i.e. split into transposition
            #       and minimal rotation, rotation always reshapes, rescaling never happens)
            # FIXME: inconsistency currently unavoidable with line-level dewarping (which increases height)
            if (i == len(alternative_image_features) and
                not (segment_xywh['w'] - 2 < segment_image.width < segment_xywh['w'] + 2 and
                     segment_xywh['h'] - 2 < segment_image.height < segment_xywh['h'] + 2)):
                log.error('segment "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                          segment.id, segment_coords['features'],
                          segment_image.width, segment_image.height,
                          segment_xywh['w'], segment_xywh['h'])
            # adjust transform to feature, and ensure feature is applied to image
            if feature == transposed:
                segment_image, segment_coords, segment_xywh = _reflect(
                    log, name, orientation, segment_image, segment_coords, segment_xywh)
            elif feature == 'deskewed':
                segment_image, segment_coords, segment_xywh = _rotate(
                    log, name, skew, segment, segment_image, segment_coords, segment_xywh,
                    fill=fill, transparency=transparency)

        # verify constraints again:
        if not all(feature in segment_coords['features']
                   for feature in selected_features):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'selector="%s" in segment "%s"' % (
                                feature_selector, segment.id))
        if any(feature in segment_coords['features']
               for feature in filtered_features):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'filter="%s" in segment "%s"' % (
                                feature_filter, segment.id))
        segment_image.format = 'PNG' # workaround for tesserocr#194
        return segment_image, segment_coords
912
913
    # pylint: disable=redefined-builtin
914
    def save_image_file(self, image,
                        file_id,
                        file_grp,
                        page_id=None,
                        mimetype='image/png',
                        force=False):
        """Serialize an image and register it as a new file in the METS.

        Args:
            image (PIL.Image): derived image to save
            file_id (string): `@ID` of the METS `file` to use
            file_grp (string): `@USE` of the METS `fileGrp` to use
        Keyword Args:
            page_id (string): `@ID` in the METS physical `structMap` to use
            mimetype (string): MIME type of the image format to serialize as
            force (boolean): whether to replace any existing `file` with that `@ID`
                (implied when the workspace's ``overwrite_mode`` is set)

        The image is encoded in memory in the format mapped from ``mimetype``,
        and the resulting bytes are handed to :py:meth:`add_file` together with
        a filename whose extension is likewise derived from ``mimetype``.

        Returns:
            The path of the created file as passed to :py:meth:`add_file`,
            i.e. ``file_grp`` joined with ``file_id`` plus extension.
        """
        log = getLogger('ocrd.workspace.save_image_file')
        # a workspace-wide overwrite mode implies forcing
        if not force and self.overwrite_mode:
            force = True
        # encode in memory first, so the content can be passed on as bytes
        buffer = io.BytesIO()
        image.save(buffer, format=MIME_TO_PIL[mimetype])
        basename = '%s%s' % (file_id, MIME_TO_EXT[mimetype])
        file_path = str(Path(file_grp) / basename)
        added = self.add_file(
            ID=file_id,
            file_grp=file_grp,
            pageId=page_id,
            local_filename=file_path,
            mimetype=mimetype,
            content=buffer.getvalue(),
            force=force)
        log.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, file_grp, added.local_filename)
        return file_path
954
955
def _crop(log, name, segment, parent_image, parent_coords, **kwargs):
    """Crop ``parent_image`` to ``segment`` and shift the transform accordingly.

    Args:
        log: logger to report the cropping on
        name (string): description of the image, used in the log message
        segment (object): PAGE segment (or `Border`) with the outline to crop to
        parent_image (PIL.Image): image to crop from
        parent_coords (dict): coordinate information on ``parent_image``
            (with keys `"transform"` and `"features"`); not modified
    Keyword Args:
        passed through to ``image_from_polygon``

    Below page level, the image is always masked and cropped. For a `Border`,
    this is only done if ``parent_coords`` does not list `"cropped"` yet (which
    is then appended to the features); otherwise the image is returned as is.
    The transform is shifted by the bounding box offset in either case.

    Returns:
        a tuple of the resulting image, a (shallow) copy of ``parent_coords``
        with the updated entries, and the bounding box of the segment
        as returned by ``xywh_from_bbox``.
    """
    coords = parent_coords.copy()
    # outline of the segment relative to the parent image, its bounding box,
    # and the size of the segment in the parent image after cropping
    # (i.e. possibly different from size before rotation at the parent, but
    #  also possibly different from size after rotation below/AlternativeImage):
    polygon = coordinates_of_segment(segment, parent_image, parent_coords)
    bbox = bbox_from_polygon(polygon)
    xywh = xywh_from_bbox(*bbox)
    is_border = isinstance(segment, BorderType)
    if is_border and 'cropped' in parent_coords['features']:
        # page level, and already cropped: nothing to do on the image
        image = parent_image
    else:
        # always crop below page level
        if is_border:
            log.info("Cropping %s", name)
            coords['features'] += ',cropped'
        # mask by the segment polygon, then cut out its bounding box:
        masked = image_from_polygon(parent_image, polygon, **kwargs)
        image = crop_image(masked, box=bbox)
    # subtract offset from parent in affine coordinate transform:
    # (consistent with image cropping)
    offset = np.array([-bbox[0], -bbox[1]])
    coords['transform'] = shift_coordinates(parent_coords['transform'], offset)
    return image, coords, xywh
984
985
def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh):
    """Account for a transposition by ``orientation`` degrees, applying it if needed.

    Args:
        log: logger to report the transposition on
        name (string): description of the image, used in the log message
        orientation (number): angle of the transposition (90, 180 or 270)
        segment_image (PIL.Image): image to transpose
        segment_coords (dict): coordinate information on the image
            (with keys `"transform"`, `"angle"` and `"features"`); updated in place
        segment_xywh (dict): size of the image (keys `"w"` and `"h"`); updated in place

    The transform, size and angle are always adjusted. The image itself is only
    transposed if the respective `rotated-N` feature is not listed in
    ``segment_coords`` yet (in which case it gets appended).

    Returns:
        a tuple of the image, ``segment_coords`` and ``segment_xywh``.
    """
    methods = {
        90: Image.ROTATE_90,
        180: Image.ROTATE_180,
        270: Image.ROTATE_270
    }
    method = methods.get(orientation) # no default
    # Transpose in affine coordinate transform, around the (old) center:
    # (consistent with image transposition or AlternativeImage below)
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = transpose_coordinates(
        segment_coords['transform'], method, center)
    new_size = adjust_canvas_to_transposition(
        [segment_xywh['w'], segment_xywh['h']], method)
    segment_xywh['w'], segment_xywh['h'] = new_size
    segment_coords['angle'] += orientation
    # transpose, if (still) necessary:
    feature = 'rotated-%d' % orientation
    if feature not in segment_coords['features']:
        log.info("Transposing %s by %d°", name, orientation)
        segment_image = transpose_image(segment_image, method)
        segment_coords['features'] += ',' + feature
    return segment_image, segment_coords, segment_xywh
1006
1007
def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs):
    """Account for a rotation by ``skew`` degrees, applying it if needed.

    Args:
        log: logger to report the rotation on
        name (string): description of the image, used in the log message
        skew (number): angle of the rotation
        segment (object): PAGE segment (or `Border`) the image belongs to (may be empty)
        segment_image (PIL.Image): image to rotate
        segment_coords (dict): coordinate information on the image
            (with keys `"transform"`, `"angle"` and `"features"`); updated in place
        segment_xywh (dict): size of the image (keys `"w"` and `"h"`); updated in place
    Keyword Args:
        passed through to ``rotate_image`` and :py:func:`_crop`

    The transform, size and angle are always adjusted. The image itself is only
    rotated if `"deskewed"` is not listed in ``segment_coords`` yet (in which
    case it gets appended). Afterwards, for segments below page level, or for
    a `Border` that is already `"cropped"`, :py:func:`_crop` is run again:
    its image result is used if the image has just been rotated, whereas only
    its coordinate results are used if the image was deskewed already.

    Returns:
        a tuple of the image, its coordinate information and its size.
    """
    # Rotate around center in affine coordinate transform:
    # (consistent with image rotation or AlternativeImage below)
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = rotate_coordinates(
        segment_coords['transform'], skew, center)
    new_size = adjust_canvas_to_rotation(
        [segment_xywh['w'], segment_xywh['h']], skew)
    segment_xywh['w'], segment_xywh['h'] = new_size
    segment_coords['angle'] += skew
    # deskew, if (still) necessary:
    was_deskewed = 'deskewed' in segment_coords['features']
    if not was_deskewed:
        log.info("Rotating %s by %.2f°", name, skew)
        segment_image = rotate_image(segment_image, skew, **kwargs)
        segment_coords['features'] += ',deskewed'
    if (segment and
        (not isinstance(segment, BorderType) or # always crop below page level
         'cropped' in segment_coords['features'])):
        recropped_image, segment_coords, segment_xywh = _crop(
            log, name, segment, segment_image, segment_coords,
            **kwargs)
        if not was_deskewed:
            # re-crop to new bbox (which may deviate
            # if segment polygon was not a rectangle)
            segment_image = recropped_image
        # otherwise only shift coordinates as if re-cropping
    return segment_image, segment_coords, segment_xywh
1038