Passed
Push — master ( ffe4e5...079aac )
by Konstantin
03:47 queued 29s
created

ocrd.workspace.Workspace.add_file()   D

Complexity

Conditions 13

Size

Total Lines 37
Code Lines 27

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 27
dl 0
loc 37
rs 4.2
c 0
b 0
f 0
cc 13
nop 4

How to fix   Complexity   

Complexity

Complex classes like ocrd.workspace.Workspace.add_file() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
import io
2
from os import makedirs, unlink, listdir, path
3
from pathlib import Path
4
5
import cv2
6
from PIL import Image
7
import numpy as np
8
from deprecated.sphinx import deprecated
9
10
from ocrd_models import OcrdMets, OcrdFile
11
from ocrd_models.ocrd_page import parse, BorderType
12
from ocrd_modelfactory import exif_from_filename
13
from ocrd_utils import (
14
    atomic_write,
15
    getLogger,
16
    image_from_polygon,
17
    coordinates_of_segment,
18
    adjust_canvas_to_rotation,
19
    adjust_canvas_to_transposition,
20
    shift_coordinates,
21
    rotate_coordinates,
22
    transform_coordinates,
23
    transpose_coordinates,
24
    crop_image,
25
    rotate_image,
26
    transpose_image,
27
    bbox_from_polygon,
28
    polygon_from_points,
29
    xywh_from_bbox,
30
    pushd_popd,
31
    MIME_TO_EXT,
32
    MIME_TO_PIL,
33
    MIMETYPE_PAGE,
34
    REGEX_PREFIX
35
)
36
37
from .workspace_backup import WorkspaceBackupManager
38
39
class Workspace():
40
    """
41
    A workspace is a temporary directory set up for a processor. It's the
42
    interface to the METS/PAGE XML and delegates download and upload to the
43
    Resolver.
44
45
    Args:
46
47
        directory (string) : Folder to work in
48
        mets (:class:`OcrdMets`) : OcrdMets representing this workspace. Loaded from 'mets.xml' if ``None``.
49
        mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url.
50
        overwrite_mode (boolean) : Whether to force add operations on this workspace globally
51
        baseurl (string) : Base URL to prefix to relative URL.
52
    """
53
54
    def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, baseurl=None):
55
        self.resolver = resolver
56
        self.directory = directory
57
        self.mets_target = str(Path(directory, mets_basename))
58
        self.overwrite_mode = False
59
        if mets is None:
60
            mets = OcrdMets(filename=self.mets_target)
61
        self.mets = mets
62
        self.automatic_backup = automatic_backup
63
        self.baseurl = baseurl
64
        #  print(mets.to_xml(xmllint=True).decode('utf-8'))
65
66
    def __str__(self):
67
        return 'Workspace[directory=%s, baseurl=%s, file_groups=%s, files=%s]' % (
68
            self.directory,
69
            self.baseurl,
70
            self.mets.file_groups,
71
            [str(f) for f in self.mets.find_all_files()],
72
        )
73
74
    def reload_mets(self):
        """
        Replace the in-memory METS with a fresh :class:`OcrdMets` read from
        ``self.mets_target``.
        """
        fresh_mets = OcrdMets(filename=self.mets_target)
        self.mets = fresh_mets
79
80
81
    @deprecated(version='1.0.0', reason="Use workspace.download_file")
    def download_url(self, url, **kwargs):
        """
        Download a URL to the workspace.

        Wraps ``url`` in a temporary :class:`OcrdFile` and hands it to
        ``download_file``.

        Args:
            url (string): URL to download to directory
            **kwargs : See :py:mod:`ocrd_models.ocrd_file.OcrdFile`

        Returns:
            The local filename of the downloaded file
        """
        downloaded = self.download_file(OcrdFile(None, url=url, **kwargs))
        return downloaded.local_filename
96
97
98
    def download_file(self, f, _recursion_count=0):
        """
        Download a :py:mod:`ocrd.model.ocrd_file.OcrdFile` to the workspace.

        If ``f.url`` already is an existing path inside the workspace directory,
        nothing is downloaded. Otherwise the resolver is asked to download it;
        if that fails with ``FileNotFoundError``, the download is retried once
        with ``self.baseurl`` prepended to the URL.

        Args:
            f (OcrdFile): File to download. Its ``url`` and ``local_filename``
                are updated in place.
            _recursion_count (int): Internal retry counter; callers should not set it.

        Returns:
            The same ``f``, with ``local_filename`` set to its (possibly updated) ``url``.

        Raises:
            Exception: If the download fails and no ``baseurl`` is set, or if
                the retry with ``baseurl`` prepended failed as well.
        """
        log = getLogger('ocrd.workspace.download_file')
        log.debug('download_file %s [_recursion_count=%s]', f, _recursion_count)
        with pushd_popd(self.directory):
            # If the f.url is already a file path, and is within self.directory, do nothing
            try:
                url_path = Path(f.url).resolve()
                already_local = url_path.exists() and bool(
                    url_path.relative_to(str(Path(self.directory).resolve())))
            except Exception: # pylint: disable=broad-except
                # Deliberately broad: anything that prevents interpreting f.url as
                # a path below the workspace (relative_to() raising for paths
                # outside, a URL that is no valid path, f.url being None, ...)
                # just means "not downloaded yet".
                already_local = False
            if not already_local:
                basename = '%s%s' % (f.ID, MIME_TO_EXT.get(f.mimetype, '')) if f.ID else f.basename
                try:
                    f.url = self.resolver.download_to_directory(self.directory, f.url, subdir=f.fileGrp, basename=basename)
                except FileNotFoundError as e:
                    if not self.baseurl:
                        raise Exception("No baseurl defined by workspace. Cannot retrieve '%s'" % f.url) from e
                    if _recursion_count >= 1:
                        raise Exception("Already tried prepending baseurl '%s'. Cannot retrieve '%s'" % (self.baseurl, f.url)) from e
                    log.debug("First run of resolver.download_to_directory(%s) failed, try prepending baseurl '%s': %s", f.url, self.baseurl, e)
                    f.url = '%s/%s' % (self.baseurl, f.url)
                    f.url = self.download_file(f, _recursion_count + 1).local_filename
            f.local_filename = f.url
            return f
124
125
    def remove_file(self, ID, force=False, keep_file=False, page_recursive=False, page_same_group=False):
        """
        Remove a file from the METS and, unless ``keep_file`` is set, from disk.

        Arguments:
            ID (string|OcrdFile): ID of the file to delete or the file itself
            force (boolean): Continue removing even if file not found in METS
            keep_file (boolean): Whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is ``True``.

        Returns:
            Whatever ``self.mets.remove_file`` returned, or ``None`` if a
            ``FileNotFoundError`` was suppressed because of ``force``.
        """
        log = getLogger('ocrd.workspace.remove_file')
        log.debug('Deleting mets:file %s', ID)
        # the workspace-wide overwrite mode implies force
        if self.overwrite_mode and not force:
            force = True
        if isinstance(ID, OcrdFile):
            ID = ID.ID
        try:
            removed = self.mets.remove_file(ID)
            # the METS may hand back a single file or a collection of files
            if isinstance(removed, OcrdFile):
                removed_files = [removed]
            else:
                removed_files = removed
            if page_recursive:
                with pushd_popd(self.directory):
                    for page_file in removed_files:
                        if page_file.mimetype == MIMETYPE_PAGE:
                            pcgts = parse(self.download_file(page_file).local_filename, silence=True)
                            for image_url in pcgts.get_AllAlternativeImagePaths():
                                query = {'url': image_url}
                                if page_same_group:
                                    query['fileGrp'] = page_file.fileGrp
                                for image_file in self.mets.find_files(**query):
                                    self.remove_file(image_file, keep_file=keep_file, force=force)
            if not keep_file:
                with pushd_popd(self.directory):
                    for removed_file in removed_files:
                        if removed_file.local_filename:
                            log.info("rm %s [cwd=%s]", removed_file.local_filename, self.directory)
                            unlink(removed_file.local_filename)
                        else:
                            log.warning("File not locally available %s", removed_file)
                            if not force:
                                raise Exception("File not locally available %s" % removed_file)
            return removed
        except FileNotFoundError as e:
            if force:
                return None
            raise e
171
172
    def remove_file_group(self, USE, recursive=False, force=False, keep_files=False, page_recursive=False, page_same_group=False):
        """
        Remove a fileGrp.

        Arguments:
            USE (string): USE attribute of the fileGrp to delete
            recursive (boolean): Whether to recursively delete all files in the group
            force (boolean): Continue removing even if group or containing files not found in METS
            keep_files (boolean): When deleting recursively whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is ``True``.

        Raises:
            Exception: If ``USE`` is neither a regex (``REGEX_PREFIX``) nor an
                existing fileGrp, and ``force`` is not in effect.
        """
        # the workspace-wide overwrite mode implies force
        if not force and self.overwrite_mode:
            force = True

        if (not USE.startswith(REGEX_PREFIX)) and (USE not in self.mets.file_groups) and (not force):
            raise Exception("No such fileGrp: %s" % USE)

        file_dirs = set()
        if recursive:
            for f in self.mets.find_files(fileGrp=USE):
                self.remove_file(f, force=force, keep_file=keep_files, page_recursive=page_recursive, page_same_group=page_same_group)
                if f.local_filename:
                    file_dir = path.dirname(f.local_filename)
                    # A file without directory component has an empty dirname.
                    # It must not become a cleanup candidate: Path('') is the
                    # current directory (so is_dir() is true), but listdir('')
                    # raises FileNotFoundError.
                    if file_dir:
                        file_dirs.add(file_dir)

        self.mets.remove_file_group(USE, force=force)

        # PLEASE NOTE: this only removes directories in the workspace if they are empty
        # and named after the fileGrp which is a convention in OCR-D.
        with pushd_popd(self.directory):
            if Path(USE).is_dir() and not listdir(USE):
                Path(USE).rmdir()
            # additionally clean up the directories the removed files lived in
            for file_dir in file_dirs:
                if Path(file_dir).is_dir() and not listdir(file_dir):
                    Path(file_dir).rmdir()
208
209
210
    def add_file(self, file_grp, content=None, **kwargs):
        """
        Add an output file. Creates an :class:`OcrdFile` to pass around and adds that to the
        OcrdMets OUTPUT section.

        Args:
            file_grp (string): fileGrp to add the file to
            content (string|bytes): If not ``None``, data to write to
                ``local_filename`` (a ``str`` is encoded as UTF-8 first).
            **kwargs: Passed on to ``self.mets.add_file``. Must contain
                ``pageId`` (which may be ``None``). If ``local_filename`` is
                given, its parent directories are created and it doubles as
                ``url`` unless ``url`` is given, too. ``force`` is set if the
                workspace is in overwrite mode.

        Returns:
            The return value of ``self.mets.add_file``.

        Raises:
            ValueError: If ``pageId`` is missing from ``kwargs``.
            Exception: If ``content`` is given without a ``local_filename``.
        """
        log = getLogger('ocrd.workspace.add_file')
        # an explicit local_filename=None (or '') is treated like an absent one
        local_filename = kwargs.get('local_filename')
        log.debug(
            'outputfile file_grp=%s local_filename=%s content=%s',
            file_grp,
            local_filename,
            content is not None)
        if 'pageId' not in kwargs:
            raise ValueError("workspace.add_file must be passed a 'pageId' kwarg, even if it is None.")
        if content is not None and not local_filename:
            raise Exception("'content' was set but no 'local_filename'")
        if self.overwrite_mode:
            kwargs['force'] = True

        with pushd_popd(self.directory):
            if local_filename:
                # If the local filename has folder components, create those folders
                # (exist_ok avoids failing when the folder appears concurrently)
                local_filename_dir = path.dirname(local_filename)
                if local_filename_dir:
                    makedirs(local_filename_dir, exist_ok=True)
                if 'url' not in kwargs:
                    kwargs['url'] = local_filename

            ret = self.mets.add_file(file_grp, **kwargs)

            if content is not None:
                if isinstance(content, str):
                    content = bytes(content, 'utf-8')
                with open(local_filename, 'wb') as f:
                    f.write(content)

        return ret
247
248
    def save_mets(self):
        """
        Serialize the in-memory METS and write it to ``self.mets_target``.

        If ``automatic_backup`` is enabled, a backup is added through
        :class:`WorkspaceBackupManager` before writing.
        """
        logger = getLogger('ocrd.workspace.save_mets')
        logger.info("Saving mets '%s'", self.mets_target)
        if self.automatic_backup:
            backup_manager = WorkspaceBackupManager(self)
            backup_manager.add()
        with atomic_write(self.mets_target) as mets_out:
            xml_bytes = self.mets.to_xml(xmllint=True)
            mets_out.write(xml_bytes.decode('utf-8'))
258
259
    def resolve_image_exif(self, image_url):
        """
        Look up the EXIF metadata of an image URL as :class:`OcrdExif`.

        The image is taken from the first METS file with that URL, or from an
        ad-hoc :class:`OcrdFile` if the METS has none, and downloaded to the
        workspace first.

        Args:
            image_url (string) : URL of image

        Return
            :class:`OcrdExif`
        """
        if not image_url:
            # avoid "finding" just any file
            raise Exception("Cannot resolve empty image path")
        matches = self.mets.find_files(url=image_url)
        fallback = OcrdFile(None, url=image_url)
        image_file = next(matches, fallback)
        return exif_from_filename(self.download_file(image_file).local_filename)
276
277
    @deprecated(version='1.0.0', reason="Use workspace.image_from_page and workspace.image_from_segment")
    def resolve_image_as_pil(self, image_url, coords=None):
        """
        Deprecated public entry point; forwards both arguments unchanged to
        ``_resolve_image_as_pil`` and returns its result.
        """
        pil_image = self._resolve_image_as_pil(image_url, coords)
        return pil_image
280
281
    def _resolve_image_as_pil(self, image_url, coords=None):
        """
        Resolve an image URL to a PIL image.

        The image is downloaded to the workspace if necessary. Images in an
        integer ('I...') or floating point ('F...') mode are reduced to 8 bit.

        Args:
            - image_url (string) : URL of the image; must not be empty
            - coords (list) : Points (x, y) spanning the region to cut from the
              image; the crop is the bounding box of these points. If ``None``,
              the full image is returned.

        Returns:
            Image or region in image as PIL.Image

        Raises:
            Exception: If ``image_url`` is empty.
        """
        if not image_url:
            # avoid "finding" just any file
            raise Exception("Cannot resolve empty image path")
        log = getLogger('ocrd.workspace._resolve_image_as_pil')
        f = next(self.mets.find_files(url=image_url), OcrdFile(None, url=image_url))
        image_filename = self.download_file(f).local_filename

        with pushd_popd(self.directory):
            pil_image = Image.open(image_filename)
            pil_image.load() # alloc and give up the FD

        # Pillow does not properly support higher color depths
        # (e.g. 16-bit or 32-bit or floating point grayscale),
        # clipping its dynamic range to the lower 8-bit in
        # many operations (including paste, putalpha, ImageStat...),
        # even including conversion.
        # Cf. Pillow#3011 Pillow#3159 Pillow#3838 (still open in 8.0)
        # So to be on the safe side, we must re-quantize these
        # to 8-bit via numpy (conversion to/from which fortunately
        # seems to work reliably):
        if pil_image.mode.startswith(('I', 'F')):
            arr_image = np.array(pil_image)
            if arr_image.dtype.kind == 'i':
                # signed integer is *not* trustworthy in this context
                # (usually a mistake in the array interface)
                log.debug('Casting image "%s" from signed to unsigned', image_url)
                # reinterpret the same bytes as unsigned through a view
                # (instead of assigning to ndarray.dtype in place)
                arr_image = arr_image.view(np.dtype('u' + arr_image.dtype.name))
            if arr_image.dtype.kind == 'u':
                # integer needs to be scaled linearly to 8 bit
                # of course, an image might actually have some lower range
                # (e.g. 10-bit in I;16 or 20-bit in I or 4-bit in L),
                # but that would be guessing anyway, so here don't
                # make assumptions on _scale_, just reduce _precision_
                log.debug('Reducing image "%s" from depth %d bit to 8 bit',
                          image_url, arr_image.dtype.itemsize * 8)
                # keep only the most significant byte of each sample
                arr_image = arr_image >> 8 * (arr_image.dtype.itemsize-1)
                arr_image = arr_image.astype(np.uint8)
            elif arr_image.dtype.kind == 'f':
                # float needs to be scaled from [0,1.0] to [0,255]
                log.debug('Reducing image "%s" from floating point to 8 bit',
                          image_url)
                arr_image *= 255
                arr_image = arr_image.astype(np.uint8)
            pil_image = Image.fromarray(arr_image)

        if coords is None:
            return pil_image

        # FIXME: remove or replace this by (image_from_polygon+) crop_image ...
        # NOTE(review): the array is converted to BGR for OpenCV, but never
        # converted back before Image.fromarray -- for color images the
        # returned crop presumably has red and blue swapped. Left as is,
        # because callers may depend on it; confirm before changing.
        log.debug("Converting PIL to OpenCV: %s", image_url)
        color_conversion = cv2.COLOR_GRAY2BGR if pil_image.mode in ('1', 'L') else  cv2.COLOR_RGB2BGR
        pil_as_np_array = np.array(pil_image).astype('uint8') if pil_image.mode == '1' else np.array(pil_image)
        cv2_image = cv2.cvtColor(pil_as_np_array, color_conversion)

        poly = np.array(coords, np.int32)
        log.debug("Cutting region %s from %s", coords, image_url)
        # rows are indexed by y (column 1 of poly), columns by x (column 0)
        region_cut = cv2_image[
            np.min(poly[:, 1]):np.max(poly[:, 1]),
            np.min(poly[:, 0]):np.max(poly[:, 0])
        ]
        return Image.fromarray(region_cut)
354
355
    def image_from_page(self, page, page_id,
                        fill='background', transparency=False,
                        feature_selector='', feature_filter=''):
        """Extract an image for a PAGE-XML page from the workspace.

        Given ``page``, a PAGE PageType object, extract its PIL.Image,
        either from its AlternativeImage (if it exists), or from its
        @imageFilename (otherwise). Also crop it, if a Border exists,
        and rotate it, if any @orientation angle is annotated.

        If ``feature_selector`` and/or ``feature_filter`` is given, then
        select/filter among the @imageFilename image and the available
        AlternativeImages the last one which contains all of the selected,
        but none of the filtered features (i.e. @comments classes), or
        raise an error.

        (Required and produced features need not be in the same order, so
        ``feature_selector`` is merely a mask specifying Boolean AND, and
        ``feature_filter`` is merely a mask specifying Boolean OR.)

        If the chosen image does not have the feature "cropped" yet, but
        a Border exists, and unless "cropped" is being filtered, then crop it.
        Likewise, if the chosen image does not have the feature "deskewed" yet,
        but an @orientation angle is annotated, and unless "deskewed" is being
        filtered, then rotate it. (However, if @orientation is above the
        [-45°,45°] interval, then apply as much transposition as possible first,
        unless "rotated-90" / "rotated-180" / "rotated-270" is being filtered.)

        Cropping uses a polygon mask (not just the bounding box rectangle).
        Areas outside the polygon will be filled according to ``fill``:

        - if ``background`` (the default),
          then fill with the median color of the image;
        - otherwise, use the given color, e.g. ``white`` or (255,255,255).

        Moreover, if ``transparency`` is true, and unless the image already
        has an alpha channel, then add an alpha channel which is fully opaque
        before cropping and rotating. (Thus, only the exposed areas will be
        transparent afterwards, for those that can interpret alpha channels).

        Return a tuple:

         * the extracted image,
         * a dictionary with information about the extracted image:

           - ``transform``: a Numpy array with an affine transform which
             converts from absolute coordinates to those relative to the image,
             i.e. after cropping to the page's border / bounding box (if any)
             and deskewing with the page's orientation angle (if any)
           - ``angle``: the rotation/reflection angle applied to the image so far,
           - ``features``: the AlternativeImage @comments for the image, i.e.
             names of all operations that lead up to this result,

         * an OcrdExif instance associated with the original image.

        (The first two can be used to annotate a new AlternativeImage,
         or be passed down with ``image_from_segment``.)

        Example:

         * get a raw (colored) but already deskewed and cropped image:

           ``
           page_image, page_coords, page_image_info = workspace.image_from_page(
                 page, page_id,
                 feature_selector='deskewed,cropped',
                 feature_filter='binarized,grayscale_normalized')
           ``
        """
        log = getLogger('ocrd.workspace.image_from_page')
        # EXIF info always refers to the original image (@imageFilename),
        # even if an AlternativeImage is chosen as page_image further below
        page_image_info = self.resolve_image_exif(page.imageFilename)
        page_image = self._resolve_image_as_pil(page.imageFilename)
        page_coords = dict()
        # use identity as initial affine coordinate transform:
        page_coords['transform'] = np.eye(3)
        # interim bbox (updated with each change to the transform):
        page_bbox = [0, 0, page_image.width, page_image.height]
        page_xywh = {'x': 0, 'y': 0,
                     'w': page_image.width, 'h': page_image.height}

        border = page.get_Border()
        # page angle: PAGE @orientation is defined clockwise,
        # whereas PIL/ndimage rotation is in mathematical direction:
        page_coords['angle'] = -(page.get_orientation() or 0)
        # map angle from (-180,180] to [0,360], and partition into multiples of 90;
        # but avoid unnecessary large remainders, i.e. split symmetrically:
        orientation = (page_coords['angle'] + 45) % 360
        orientation = orientation - (orientation % 90)
        skew = (page_coords['angle'] % 360) - orientation
        skew = 180 - (180 - skew) % 360 # map to [-45,45]
        page_coords['angle'] = 0 # nothing applied yet (depends on filters)
        log.debug("page '%s' has %s orientation=%d skew=%.2f",
                  page_id, "border," if border else "", orientation, skew)

        # initialize AlternativeImage@comments classes as empty:
        page_coords['features'] = ''
        alternative_image = None
        alternative_images = page.get_AlternativeImage()
        if alternative_images:
            # (e.g. from page-level cropping, binarization, deskewing or despeckling)
            if feature_selector or feature_filter:
                alternative_image = None
                # search from the end, because by convention we always append,
                # and among multiple satisfactory images we want the most recent:
                for alternative_image in reversed(alternative_images):
                    features = alternative_image.get_comments()
                    if not features:
                        log.warning("AlternativeImage %d for page '%s' does not have any feature attributes",
                                    alternative_images.index(alternative_image) + 1, page_id)
                        features = ''
                    # NOTE: features is the raw @comments string here, so these
                    # are substring tests (not tests on the split feature list)
                    if (all(feature in features
                            for feature in feature_selector.split(',') if feature) and
                        not any(feature in features
                                for feature in feature_filter.split(',') if feature)):
                        break
                    else:
                        # this candidate does not qualify; if the loop runs out
                        # without a break, no AlternativeImage is selected
                        alternative_image = None
            else:
                # no constraints given: simply use the most recent one
                alternative_image = alternative_images[-1]
                features = alternative_image.get_comments()
                if not features:
                    log.warning("AlternativeImage %d for page '%s' does not have any feature attributes",
                                alternative_images.index(alternative_image) + 1, page_id)
                    features = ''
            if alternative_image:
                # features is always bound at this point: alternative_images is
                # non-empty, so either the loop body above ran at least once,
                # or the else branch assigned it
                log.debug("Using AlternativeImage %d (%s) for page '%s'",
                          alternative_images.index(alternative_image) + 1,
                          features, page_id)
                page_image = self._resolve_image_as_pil(alternative_image.get_filename())
                page_coords['features'] = features

        # adjust the coord transformation to the steps applied on the image,
        # and apply steps on the existing image in case it is missing there,
        # but traverse all steps (crop/reflect/rotate) in a particular order:
        # - existing image features take priority (in the order annotated),
        # - next is cropping (if necessary but not already applied),
        # - next is reflection (if necessary but not already applied),
        # - next is rotation (if necessary but not already applied).
        # This helps deal with arbitrary workflows (e.g. crop then deskew,
        # or deskew then crop), regardless of where images are generated.
        alternative_image_features = page_coords['features'].split(',')
        # duplicates among the geometry-relevant features are only reported, not repaired:
        for duplicate_feature in set([feature for feature in alternative_image_features
                                      # features relevant in reconstructing coordinates:
                                      if (feature in ['cropped', 'deskewed', 'rotated-90',
                                                      'rotated-180', 'rotated-270'] and
                                          alternative_image_features.count(feature) > 1)]):
            log.error("Duplicate feature %s in AlternativeImage for page '%s'",
                      duplicate_feature, page_id)
        # iterate over the features already present on the image, followed by
        # those which still have to be applied here (unless filtered by the caller):
        for i, feature in enumerate(alternative_image_features +
                                    (['cropped']
                                     if (border and
                                         not 'cropped' in alternative_image_features and
                                         not 'cropped' in feature_filter.split(','))
                                     else []) +
                                    (['rotated-%d' % orientation]
                                     if (orientation and
                                         not 'rotated-%d' % orientation in alternative_image_features and
                                         not 'rotated-%d' % orientation in feature_filter.split(','))
                                     else []) +
                                    (['deskewed']
                                     if (skew and
                                         not 'deskewed' in alternative_image_features and
                                         not 'deskewed' in feature_filter.split(','))
                                     else []) +
                                    # not a feature to be added, but merely as a fallback position
                                    # to always enter loop at i == len(alternative_image_features)
                                    ['_check']):
            # image geometry vs feature consistency can only be checked
            # after all features on the existing AlternativeImage have
            # been adjusted for in the transform, and when there is a mismatch,
            # additional steps applied here would only repeat the respective
            # error message; so we only check once at the boundary between
            # existing and new features
            # FIXME we should check/enforce consistency when _adding_ AlternativeImage
            if (i == len(alternative_image_features) and
                not (page_xywh['w'] - 2 < page_image.width < page_xywh['w'] + 2 and
                     page_xywh['h'] - 2 < page_image.height < page_xywh['h'] + 2)):
                log.error('page "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                          page_id, page_coords['features'],
                          page_image.width, page_image.height,
                          page_xywh['w'], page_xywh['h'])
            name = "%s for page '%s'" % ("AlternativeImage" if alternative_image
                                         else "original image", page_id)
            # adjust transform to feature, and ensure feature is applied to image
            # (_crop, _reflect and _rotate are helpers defined elsewhere in this module)
            if feature == 'cropped':
                page_image, page_coords, page_xywh = _crop(
                    log, name, border, page_image, page_coords,
                    fill=fill, transparency=transparency)
            elif feature == 'rotated-%d' % orientation:
                page_image, page_coords, page_xywh = _reflect(
                    log, name, orientation, page_image, page_coords, page_xywh)
            elif feature == 'deskewed':
                page_image, page_coords, page_xywh = _rotate(
                    log, name, skew, border, page_image, page_coords, page_xywh,
                    fill=fill, transparency=transparency)

        # verify constraints again:
        if not all(feature in page_coords['features']
                   for feature in feature_selector.split(',') if feature):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'selector="%s" in page "%s"' % (
                                feature_selector, page_id))
        if any(feature in page_coords['features']
               for feature in feature_filter.split(',') if feature):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'filter="%s" in page "%s"' % (
                                feature_filter, page_id))
        page_image.format = 'PNG' # workaround for tesserocr#194
        return page_image, page_coords, page_image_info
564
565
    def image_from_segment(self, segment, parent_image, parent_coords,
                           fill='background', transparency=False,
                           feature_selector='', feature_filter=''):
        """Extract an image for a PAGE-XML hierarchy segment from its parent's image.

        Given...

         * ``parent_image``, a PIL.Image of the parent, with
         * ``parent_coords``, a dict with information about ``parent_image``:
           - ``transform``: a Numpy array with an affine transform which
             converts from absolute coordinates to those relative to the image,
             i.e. after applying all operations (starting with the original image)
           - ``angle``: the rotation/reflection angle applied to the image so far,
           - ``features``: the AlternativeImage @comments for the image, i.e.
             names of all operations that lead up to this result, and
         * ``segment``, a PAGE segment object logically contained in it
           (i.e. TextRegionType / TextLineType / WordType / GlyphType),

        ...extract the segment's corresponding PIL.Image, either from
        AlternativeImage (if it exists), or producing a new image via
        cropping from ``parent_image`` (otherwise).

        If ``feature_selector`` and/or ``feature_filter`` is given, then
        select/filter among the cropped ``parent_image`` and the available
        AlternativeImages the last one which contains all of the selected,
        but none of the filtered features (i.e. @comments classes), or
        raise an error.

        (Required and produced features need not be in the same order, so
        ``feature_selector`` is merely a mask specifying Boolean AND, and
        ``feature_filter`` is merely a mask specifying Boolean OR.)

        Cropping uses a polygon mask (not just the bounding box rectangle).
        Areas outside the polygon will be filled according to ``fill``:

        - if ``background`` (the default),
          then fill with the median color of the image;
        - otherwise, use the given color, e.g. ``white`` or (255,255,255).

        Moreover, if ``transparency`` is true, and unless the image already
        has an alpha channel, then add an alpha channel which is fully opaque
        before cropping and rotating. (Thus, only the exposed areas will be
        transparent afterwards, for those that can interpret alpha channels).

        When cropping, compensate any @orientation angle annotated for the
        parent (from parent-level deskewing) by rotating the segment coordinates
        in an inverse transformation (i.e. translation to center, then passive
        rotation, and translation back).

        Regardless, if any @orientation angle is annotated for the segment
        (from segment-level deskewing), and the chosen image does not have
        the feature "deskewed" yet, and unless "deskewed" is being filtered,
        then rotate it - compensating for any previous ``angle``. (However,
        if @orientation is above the [-45°,45°] interval, then apply as much
        transposition as possible first, unless "rotated-90" / "rotated-180" /
        "rotated-270" is being filtered.)

        Return a tuple:

         * the extracted image,
         * a dictionary with information about the extracted image:
           - ``transform``: a Numpy array with an affine transform which
             converts from absolute coordinates to those relative to the image,
             i.e. after applying all parent operations, and then cropping to
             the segment's bounding box, and deskewing with the segment's
             orientation angle (if any)
           - ``angle``: the rotation/reflection angle applied to the image so far,
           - ``features``: the AlternativeImage @comments for the image, i.e.
             names of all operations that lead up to this result.

        (These can be used to create a new AlternativeImage, or passed down
         for calls on lower hierarchy levels.)

        Raise an ``Exception`` if the resulting image does not carry all
        features of ``feature_selector``, or carries any feature of
        ``feature_filter``.

        Example:

         * get a raw (colored) but already deskewed and cropped image:

           ``image, xywh = workspace.image_from_segment(region,
                 page_image, page_xywh,
                 feature_selector='deskewed,cropped',
                 feature_filter='binarized,grayscale_normalized')``
        """
        log = getLogger('ocrd.workspace.image_from_segment')
        # split the comma-separated masks only once (ignoring empty entries):
        selected_features = [feature for feature in feature_selector.split(',') if feature]
        filtered_features = [feature for feature in feature_filter.split(',') if feature]
        # note: We should mask overlapping neighbouring segments here,
        # but finding the right clipping rules can be difficult if operating
        # on the raw (non-binary) image data alone: for each intersection, it
        # must be decided which one of either segment or neighbour to assign,
        # e.g. an ImageRegion which properly contains our TextRegion should be
        # completely ignored, but an ImageRegion which is properly contained
        # in our TextRegion should be completely masked, while partial overlap
        # may be more difficult to decide. On the other hand, on the binary image,
        # we can use connected component analysis to mask foreground areas which
        # originate in the neighbouring regions. But that would introduce either
        # the assumption that the input has already been binarized, or a dependency
        # on some ad-hoc binarization method. Thus, it is preferable to use
        # a dedicated processor for this (which produces clipped AlternativeImage
        # or reduced polygon coordinates).
        segment_image, segment_coords, segment_xywh = _crop(
            log, "parent image for segment '%s'" % segment.id,
            segment, parent_image, parent_coords,
            fill=fill, transparency=transparency)

        # Semantics of missing @orientation at region level could be either
        # - inherited from page level: same as line or word level (no @orientation),
        # - zero (unrotate page angle): different from line or word level (because
        #   otherwise deskewing would never have an effect on lines and words)
        # The PAGE specification is silent here (but does generally not concern itself
        # much with AlternativeImage coordinate consistency).
        # Since our (generateDS-backed) ocrd_page supports the zero/none distinction,
        # we choose the former (i.e. None is inheritance).
        if 'orientation' in segment.__dict__ and segment.get_orientation() is not None:
            # region angle: PAGE @orientation is defined clockwise,
            # whereas PIL/ndimage rotation is in mathematical direction:
            angle = -segment.get_orientation()
            # @orientation is always absolute; if higher levels
            # have already rotated, then we must compensate:
            angle -= parent_coords['angle']
            # map angle from (-180,180] to [0,360], and partition into multiples of 90;
            # but avoid unnecessary large remainders, i.e. split symmetrically:
            orientation = (angle + 45) % 360
            orientation = orientation - (orientation % 90)
            skew = (angle % 360) - orientation
            skew = 180 - (180 - skew) % 360 # map to [-45,45]
            log.debug("segment '%s' has orientation=%d skew=%.2f",
                      segment.id, orientation, skew)
        else:
            orientation = 0
            skew = 0
        segment_coords['angle'] = parent_coords['angle'] # nothing applied yet (depends on filters)

        # initialize AlternativeImage@comments classes from parent, except
        # for those operations that can apply on multiple hierarchy levels:
        segment_coords['features'] = ','.join(
            [feature for feature in parent_coords['features'].split(',')
             if feature in ['binarized', 'grayscale_normalized',
                            'despeckled', 'dewarped']])

        alternative_image = None
        features = '' # always bound, even if no AlternativeImage gets inspected
        alternative_images = segment.get_AlternativeImage()
        if alternative_images:
            # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
            if feature_selector or feature_filter:
                # search from the end, because by convention we always append,
                # and among multiple satisfactory images we want the most recent:
                for alternative_image in reversed(alternative_images):
                    features = alternative_image.get_comments()
                    if not features:
                        log.warning("AlternativeImage %d for segment '%s' does not have any feature attributes",
                                    alternative_images.index(alternative_image) + 1, segment.id)
                        features = ''
                    # NOTE: this tests for substrings of the comma-separated
                    # @comments string, not for membership among its items
                    if (all(feature in features
                            for feature in selected_features) and
                        not any(feature in features
                                for feature in filtered_features)):
                        break
                else:
                    # no candidate satisfied the constraints
                    alternative_image = None
            else:
                alternative_image = alternative_images[-1]
                features = alternative_image.get_comments()
                if not features:
                    log.warning("AlternativeImage %d for segment '%s' does not have any feature attributes",
                                alternative_images.index(alternative_image) + 1, segment.id)
                    features = ''
            if alternative_image:
                log.debug("Using AlternativeImage %d (%s) for segment '%s'",
                          alternative_images.index(alternative_image) + 1,
                          features, segment.id)
                segment_image = self._resolve_image_as_pil(alternative_image.get_filename())
                segment_coords['features'] = features

        alternative_image_features = segment_coords['features'].split(',')
        for duplicate_feature in {feature for feature in alternative_image_features
                                  # features relevant in reconstructing coordinates:
                                  if (feature in ['deskewed', 'rotated-90',
                                                  'rotated-180', 'rotated-270'] and
                                      alternative_image_features.count(feature) > 1)}:
            log.error("Duplicate feature %s in AlternativeImage for segment '%s'",
                      duplicate_feature, segment.id)
        # operations which are not on the chosen image yet, but should
        # be applied here (unless the caller filters them out):
        rotation_feature = 'rotated-%d' % orientation
        pending_features = []
        if (orientation and
            rotation_feature not in alternative_image_features and
            rotation_feature not in filtered_features):
            pending_features.append(rotation_feature)
        if (skew and
            'deskewed' not in alternative_image_features and
            'deskewed' not in filtered_features):
            pending_features.append('deskewed')
        # not a feature to be added, but merely as a fallback position
        # to always enter loop at i == len(alternative_image_features)
        pending_features.append('_check')
        name = "%s for segment '%s'" % ("AlternativeImage" if alternative_image
                                        else "parent image", segment.id)
        for i, feature in enumerate(alternative_image_features + pending_features):
            # image geometry vs feature consistency can only be checked
            # after all features on the existing AlternativeImage have
            # been adjusted for in the transform, and when there is a mismatch,
            # additional steps applied here would only repeat the respective
            # error message; so we only check once at the boundary between
            # existing and new features
            # FIXME we should enforce consistency here (i.e. split into transposition
            #       and minimal rotation, rotation always reshapes, rescaling never happens)
            # FIXME: inconsistency currently unavoidable with line-level dewarping (which increases height)
            if (i == len(alternative_image_features) and
                not (segment_xywh['w'] - 2 < segment_image.width < segment_xywh['w'] + 2 and
                     segment_xywh['h'] - 2 < segment_image.height < segment_xywh['h'] + 2)):
                log.error('segment "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                          segment.id, segment_coords['features'],
                          segment_image.width, segment_image.height,
                          segment_xywh['w'], segment_xywh['h'])
            # adjust transform to feature, and ensure feature is applied to image
            if feature == rotation_feature:
                segment_image, segment_coords, segment_xywh = _reflect(
                    log, name, orientation, segment_image, segment_coords, segment_xywh)
            elif feature == 'deskewed':
                segment_image, segment_coords, segment_xywh = _rotate(
                    log, name, skew, segment, segment_image, segment_coords, segment_xywh,
                    fill=fill, transparency=transparency)

        # verify constraints again:
        if not all(feature in segment_coords['features']
                   for feature in selected_features):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'selector="%s" in segment "%s"' % (
                                feature_selector, segment.id))
        if any(feature in segment_coords['features']
               for feature in filtered_features):
            raise Exception('Found no AlternativeImage that satisfies all requirements ' +
                            'filter="%s" in segment "%s"' % (
                                feature_filter, segment.id))
        segment_image.format = 'PNG' # workaround for tesserocr#194
        return segment_image, segment_coords
799
800
    # pylint: disable=redefined-builtin
801
    def save_image_file(self, image,
                        file_id,
                        file_grp,
                        page_id=None,
                        mimetype='image/png',
                        force=False):
        """Serialize an image and register it as a file of the workspace.

        Encode the PIL.Image `image` in the format belonging to `mimetype`,
        then add the result via :py:meth:`add_file` under the METS file
        ID `file_id`, in the fileGrp `file_grp` and for the physical page
        `page_id`. The file name is composed of `file_grp` as directory and
        `file_id` plus the extension belonging to `mimetype`.

        If `force` is not set, but the workspace's ``overwrite_mode`` is,
        then `force` is enabled implicitly.

        Return the path of the created file, as passed to
        :py:meth:`add_file` as ``local_filename`` (i.e. composed of
        `file_grp` and the file name, not made absolute here).
        """
        log = getLogger('ocrd.workspace.save_image_file')
        if not force and self.overwrite_mode:
            force = True
        # encode in memory first, so add_file receives the raw bytes:
        with io.BytesIO() as image_bytes:
            image.save(image_bytes, format=MIME_TO_PIL[mimetype])
            content = image_bytes.getvalue()
        file_name = '%s%s' % (file_id, MIME_TO_EXT[mimetype])
        file_path = str(Path(file_grp, file_name))
        out = self.add_file(
            ID=file_id,
            file_grp=file_grp,
            pageId=page_id,
            local_filename=file_path,
            mimetype=mimetype,
            content=content,
            force=force)
        log.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, file_grp, out.local_filename)
        return file_path
833
834
def _crop(log, name, segment, parent_image, parent_coords, **kwargs):
    """Crop ``parent_image`` to ``segment`` and shift the coordinate transform accordingly.

    ``log`` and ``name`` are only used for the log message; ``kwargs`` are
    passed through to ``image_from_polygon``.

    The image itself is left untouched if ``segment`` is a ``BorderType``
    and the parent's features already contain "cropped"; the transform
    is shifted by the bounding box offset in any case.

    Return a tuple of the (possibly cropped) image, a copy of
    ``parent_coords`` with updated ``transform`` (and ``features``, when
    cropping a ``BorderType``), and the bounding box of the segment
    as an xywh dict.
    """
    segment_coords = parent_coords.copy()
    # polygon outline of the segment, relative to the parent image:
    polygon = coordinates_of_segment(segment, parent_image, parent_coords)
    # its bounding box (still relative):
    bbox = bbox_from_polygon(polygon)
    # size of the segment in the parent image after cropping
    # (i.e. possibly different from size before rotation at the parent, but
    #  also possibly different from size after rotation below/AlternativeImage):
    segment_xywh = xywh_from_bbox(*bbox)
    is_border = isinstance(segment, BorderType)
    if is_border and 'cropped' in parent_coords['features']:
        # page level, and already cropped: nothing left to do on the image
        segment_image = parent_image
    else:
        # below page level we always crop; at page level only once
        if is_border:
            log.info("Cropping %s", name)
            segment_coords['features'] += ',cropped'
        # mask everything outside of the segment polygon...
        masked_image = image_from_polygon(parent_image, polygon, **kwargs)
        # ...then reduce to the bounding box:
        segment_image = crop_image(masked_image, box=bbox)
    # subtract offset from parent in affine coordinate transform
    # (consistent with image cropping):
    offset = np.array([-bbox[0],
                       -bbox[1]])
    segment_coords['transform'] = shift_coordinates(
        parent_coords['transform'], offset)
    return segment_image, segment_coords, segment_xywh
863
864
def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh):
    """Account for a transposition by ``orientation`` degrees in transform and image.

    Update ``segment_coords`` (``transform``, ``angle`` and possibly
    ``features``) and ``segment_xywh`` (``w`` / ``h``) in place. The image
    itself is only transposed if the feature "rotated-<orientation>" is
    not contained in ``segment_coords['features']`` yet.

    ``log`` and ``name`` are only used for the log message.

    Return a tuple of image, ``segment_coords`` and ``segment_xywh``.
    """
    transpositions = {
        90: Image.ROTATE_90,
        180: Image.ROTATE_180,
        270: Image.ROTATE_270
    }
    # no default: any other value yields None
    transposition = transpositions.get(orientation)
    feature = 'rotated-%d' % orientation
    # Transpose in affine coordinate transform (around the center of the
    # canvas before the transposition), consistent with image
    # transposition or AlternativeImage below:
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = transpose_coordinates(
        segment_coords['transform'], transposition, center)
    old_size = [segment_xywh['w'], segment_xywh['h']]
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_transposition(
        old_size, transposition)
    segment_coords['angle'] += orientation
    # transpose, if (still) necessary:
    if feature not in segment_coords['features']:
        log.info("Transposing %s by %d°", name, orientation)
        segment_image = transpose_image(segment_image, transposition)
        segment_coords['features'] += ',' + feature
    return segment_image, segment_coords, segment_xywh
885
886
def _rotate(log, name, skew, segment, segment_image, segment_coords, segment_xywh, **kwargs):
    """Account for a rotation by ``skew`` degrees in transform and image.

    Update ``segment_coords`` (``transform``, ``angle`` and possibly
    ``features``) and ``segment_xywh`` (``w`` / ``h``). The image itself
    is only rotated if "deskewed" is not contained in
    ``segment_coords['features']`` yet.

    Afterwards, if ``segment`` is given and is either no ``BorderType``
    or already has the feature "cropped", pass the result through
    ``_crop`` again: a freshly rotated image gets re-cropped, whereas for
    an already deskewed image only the coordinates from that call are kept.

    ``log`` and ``name`` are only used for log messages; ``kwargs`` are
    passed through to ``rotate_image`` and ``_crop``.

    Return a tuple of image, coordinate dict and xywh dict.
    """
    # Rotate around center in affine coordinate transform
    # (consistent with image rotation or AlternativeImage below):
    center = np.array([0.5 * segment_xywh['w'],
                       0.5 * segment_xywh['h']])
    segment_coords['transform'] = rotate_coordinates(
        segment_coords['transform'], skew, center)
    old_size = [segment_xywh['w'], segment_xywh['h']]
    segment_xywh['w'], segment_xywh['h'] = adjust_canvas_to_rotation(
        old_size, skew)
    segment_coords['angle'] += skew
    # deskew, if (still) necessary:
    must_rotate = 'deskewed' not in segment_coords['features']
    if must_rotate:
        log.info("Rotating %s by %.2f°", name, skew)
        segment_image = rotate_image(segment_image, skew, **kwargs)
        segment_coords['features'] += ',deskewed'
    if (segment and
        (not isinstance(segment, BorderType) or # always crop below page level
         'cropped' in segment_coords['features'])):
        recropped_image, segment_coords, segment_xywh = _crop(
            log, name, segment, segment_image, segment_coords,
            **kwargs)
        if must_rotate:
            # re-crop to new bbox (which may deviate
            # if segment polygon was not a rectangle)
            segment_image = recropped_image
        # otherwise: only shift coordinates as if re-cropping
    return segment_image, segment_coords, segment_xywh
917