Passed
Pull Request — master (#1343)
by
unknown
02:10
created

ocrd.processor.builtin.merge_processor   A

Complexity

Total Complexity 14

Size/Duplication

Total Lines 132
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 14
eloc 96
dl 0
loc 132
rs 10
c 0
b 0
f 0

3 Functions

Rating   Name   Duplication   Size   Complexity  
A get_border_bbox() 0 4 2
A rename_segments() 0 26 4
A cli() 0 4 1

3 Methods

Rating   Name   Duplication   Size   Complexity  
A MergeProcessor.metadata_filename() 0 3 1
A MergeProcessor.executable() 0 3 1
B MergeProcessor.process_page_pcgts() 0 43 5
1
# pylint: disable=missing-module-docstring,invalid-name
2
from typing import Optional
3
from itertools import count
4
from collections import OrderedDict as odict
5
6
import click
7
8
from ocrd import Processor, OcrdPageResult
9
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
10
from ocrd_modelfactory import page_from_file
11
from ocrd_models import OcrdPage
12
from ocrd_models.ocrd_page import (
13
    BorderType,
14
    CoordsType,
15
    ReadingOrderType,
16
    UnorderedGroupType,
17
)
18
from ocrd_utils import bbox_from_points
19
20
# Names of the PAGE segment hierarchy element classes (without their ``Type``
# suffix) that are eligible for renaming: all region kinds plus the
# line/word/glyph levels.
_SEGTYPES = [
    "NoiseRegion",
    "LineDrawingRegion",
    "AdvertRegion",
    "ImageRegion",
    "ChartRegion",
    "MusicRegion",
    "GraphicRegion",
    "UnknownRegion",
    "CustomRegion",
    "SeparatorRegion",
    "MathsRegion",
    "TextRegion",
    "MapRegion",
    "ChemRegion",
    "TableRegion",
    "TextLine",
    "Word",
    "Glyph",
]
40
41
42
def get_border_bbox(pcgts):
    """
    Return the bounding box of the border of a PAGE document.

    Arguments:
        pcgts: PAGE document providing ``Page.Border``, ``Page.imageWidth``
            and ``Page.imageHeight``

    Returns:
        ``[0, 0, imageWidth, imageHeight]`` if the page has no ``Border``,
        otherwise the result of ``bbox_from_points`` applied to the
        coordinate points of the border.
    """
    border = pcgts.Page.Border
    if border is None:
        # no border annotated: fall back to the full image extent
        return [0, 0, pcgts.Page.imageWidth, pcgts.Page.imageHeight]
    return bbox_from_points(border.Coords.points)
46
47
def rename_segments(pcgts, start=1):
    """
    Rename all segment hierarchy elements of a PAGE document in place.

    Every element carrying an ``@id`` whose class name (minus the ``Type``
    suffix) is listed in ``_SEGTYPES`` gets a new identifier consisting of
    ``seg`` followed by a zero-padded 11-digit number. Numbers are assigned
    consecutively, beginning with `start`, in the order in which the XPath
    query returns the elements. For regions that are referenced from the
    reading order, the ``regionRef`` of the referring element is updated
    to the new identifier.

    Arguments:
        pcgts: PAGE document to modify
        start (int): number to assign to the first renamed segment

    Returns:
        The number assigned to the last renamed segment, or ``0`` if no
        segment was renamed. Note that this is the last *used* number,
        not the next free one.
    """
    rodict = pcgts.Page.get_ReadingOrderGroups()
    # get everything that has an identifier
    nodes = pcgts.xpath("//*[@id]")
    # get PAGE objects from matching etree nodes,
    # but allow only hierarchy segments
    segments = [segment for segment in map(pcgts.revmap.get, nodes)
                if segment.__class__.__name__.replace('Type', '') in _SEGTYPES]
    # count segments and rename them
    # fixme: or perhaps better to have each segment type named and counted differently?
    renamed = {}
    num = 0
    for num, segment in zip(count(start=start), segments):
        newname = "seg%011d" % num
        # each original identifier must occur only once among the segments
        assert segment.id not in renamed
        if segment.original_tagname_.endswith('Region') and segment.id in rodict:
            # update reading order
            rodict[segment.id].regionRef = newname
        renamed[segment.id] = newname
        segment.id = newname
    return num
73
74
class MergeProcessor(Processor):
    """
    Processor that merges the PAGE segment hierarchies of all input file
    groups into a single PAGE output per page.
    """

    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
        """
        Merge PAGE segment hierarchy elements from all input file groups.

        For each page, take the deserialised PAGE input files (skipping
        missing ones). Rename all elements of the segment hierarchy to new
        (clash-free) identifiers. Redefine the `Border` coordinates as the
        bounding box enclosing all input borders. Then add all top-level
        regions (along with their descendants) from all input files, and
        collect the top-level reading order group of each input file in a
        single new `UnorderedGroup`, in the order of input file groups.

        Produce a new PAGE output file by serialising the resulting hierarchy.

        Arguments:
            input_pcgts: one PAGE document per input file group; entries
                which are ``None`` (or otherwise falsy) are ignored
            page_id: identifier of the page being processed (unused here)

        Returns:
            an ``OcrdPageResult`` wrapping the merged PAGE document

        Raises:
            AssertionError: if no input document is available, or if the
                inputs do not all reference the same ``@imageFilename``
        """
        actual_pcgts = list(filter(None, input_pcgts))
        assert actual_pcgts, "at least one input file is required"
        assert len({pcgts.Page.imageFilename for pcgts in actual_pcgts}) == 1, \
            "input files must all reference the same @imageFilename"
        # create new PAGE for image
        result = OcrdPageResult(page_from_file(actual_pcgts[0].Page.imageFilename))
        merged_page = result.pcgts.Page
        # unify Border: smallest axis-parallel box containing every input border
        minxs, minys, maxxs, maxys = zip(*map(get_border_bbox, actual_pcgts))
        minx, miny = min(minxs), min(minys)
        maxx, maxy = max(maxxs), max(maxys)
        merged_page.set_Border(
            BorderType(CoordsType(
                points=f"{minx},{miny} {maxx},{miny} {maxx},{maxy} {minx},{maxy}")))
        # rename all segments
        num = 1
        for pcgts in actual_pcgts:
            # rename_segments returns the last number it assigned (or 0 if it
            # renamed nothing), so continue with the number after that and
            # never move backwards - otherwise identifiers would be reused
            last = rename_segments(pcgts, num)
            num = max(num, last + 1)
        # concatenate all regions
        ug = UnorderedGroupType(id="merged")
        merged_page.set_ReadingOrder(ReadingOrderType(UnorderedGroup=ug))
        for pcgts in actual_pcgts:
            # only top-level regions: nested regions stay attached to their
            # parents and must not be added to the page a second time
            for region in pcgts.Page.get_AllRegions(depth=1):
                adder = getattr(merged_page, 'add_' + region.original_tagname_)
                adder(region)
            reading_order = pcgts.Page.ReadingOrder
            if reading_order:
                group = reading_order.OrderedGroup or reading_order.UnorderedGroup
                # a ReadingOrder without any group has nothing to contribute
                if group is not None:
                    adder = getattr(ug, 'add_' + group.original_tagname_)
                    adder(group)
        return result

    @property
    def metadata_filename(self):
        """
        Path of the ``ocrd-tool.json`` describing this processor.
        """
        # NOTE(review): the path points into the ``dummy`` directory -
        # presumably the builtin processors share one tool description; confirm
        return 'processor/builtin/dummy/ocrd-tool.json'

    @property
    def executable(self):
        """
        Name of the executable of this processor.
        """
        return 'ocrd-merge'
126
127
128
# Command-line entry point: hands all parsed options over to the OCR-D
# CLI wrapper, which runs MergeProcessor.
# (Documented as a comment on purpose: click would pick up a docstring
# as the help text of the command.)
@click.command()
@ocrd_cli_options
def cli(*args, **kwargs):
    return ocrd_cli_wrap_processor(MergeProcessor, *args, **kwargs)
132