ocrd_validators.page_validator.concatenate()   A
last analyzed

Complexity

Conditions 5

Size

Total Lines 15
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 15
rs 9.3333
c 0
b 0
f 0
cc 5
nop 4
1
"""
2
API for validating `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`_.
3
"""
4
import re
5
from shapely.geometry import Polygon, LineString
6
from shapely.validation import explain_validity
7
8
from ocrd_utils import getLogger, polygon_from_points, deprecated_alias
9
from ocrd_models.ocrd_page import OcrdPage, parse
10
from ocrd_modelfactory import page_from_file
11
12
from ocrd_models.ocrd_page import (
13
    PcGtsType,
14
    PageType,
15
    TextRegionType,
16
    TextLineType,
17
    WordType,
18
    GlyphType,
19
    TextEquivType
20
)
21
from ocrd_models.ocrd_page_generateds import (
22
    RegionType,
23
    ReadingDirectionSimpleType,
24
    TextLineOrderSimpleType,
25
    RegionRefType,
26
    RegionRefIndexedType,
27
    OrderedGroupType,
28
    OrderedGroupIndexedType,
29
    UnorderedGroupType,
30
    UnorderedGroupIndexedType,
31
)
32
from ocrd_models import ValidationReport
33
34
35
# Traversal table used by validate_consistency. Each row is
# (element class, name of the getter returning its children,
#  separator used when concatenating the children's text or None
#  if no textual consistency is checked at that level).
_HIERARCHY = [
    # page can contain different types of regions
    (PageType,       'get_AdvertRegion', None),
    (PageType,       'get_ChartRegion', None),
    (PageType,       'get_ChemRegion', None),
    (PageType,       'get_CustomRegion', None),
    (PageType,       'get_GraphicRegion', None),
    (PageType,       'get_ImageRegion', None),
    (PageType,       'get_LineDrawingRegion', None),
    (PageType,       'get_MapRegion', None),
    (PageType,       'get_MathsRegion', None),
    (PageType,       'get_MusicRegion', None),
    (PageType,       'get_NoiseRegion', None),
    (PageType,       'get_SeparatorRegion', None),
    (PageType,       'get_TableRegion', None),
    (PageType,       'get_TextRegion', None),
    (PageType,       'get_UnknownRegion', None),
    # all regions can be recursive
    # (MapRegion is deliberately not listed as a child of regions)
    (RegionType,     'get_AdvertRegion', None),
    (RegionType,     'get_ChartRegion', None),
    (RegionType,     'get_ChemRegion', None),
    (RegionType,     'get_CustomRegion', None),
    (RegionType,     'get_GraphicRegion', None),
    (RegionType,     'get_ImageRegion', None),
    (RegionType,     'get_LineDrawingRegion', None),
    (RegionType,     'get_MathsRegion', None),
    (RegionType,     'get_MusicRegion', None),
    (RegionType,     'get_NoiseRegion', None),
    (RegionType,     'get_SeparatorRegion', None),
    (RegionType,     'get_TableRegion', None),
    (RegionType,     'get_TextRegion', None),
    (RegionType,     'get_UnknownRegion', None),
    # only TextRegion can contain TextLine
    (TextRegionType, 'get_TextLine',   '\n'),
    (TextLineType,   'get_Word',       ' '),
    (WordType,       'get_Glyph',      ''),
    # terminal level: glyphs have no children
    (GlyphType,      None,             None),
]
74
75
# Ordering table used by validate_consistency.
# The first row holds the attribute values for which children are visited
# in reverse: (unused, textLineOrder value, readingDirection value).
# The remaining rows are (element class, getter for textLineOrder or None,
# getter for readingDirection); they name where these attributes can be
# (re)defined while descending the hierarchy.
_ORDER = [
    (None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT),
    (PageType,       'get_textLineOrder', 'get_readingDirection'),
    (TextRegionType, 'get_textLineOrder', 'get_readingDirection'),
    (TextLineType,   None,                'get_readingDirection'),
    (WordType,       None,                'get_readingDirection'),
]
82
83
# The following parameters control how tolerant we are with respect to
# polygon path self-validity and parent-child containment. We have to
# offer this, because most implementations, including PRImA itself,
# do _not_ offer pixel-precise correctness.

# How much may polygon paths deviate when simplifying them
# to avoid self-intersections? (A falsy value disables simplification.)
POLY_TOLERANCE = 1.0

# By how large a margin are parent polygons enlarged before
# checking that their children are properly contained?
PARENT_SLACK = 1.5
93
94
95
class ConsistencyError(Exception):
    """
    Exception representing a consistency error in textual transcription across levels of a PAGE-XML.
    (Element text strings must be the concatenation of their children's text strings, joined by white space.)
    """

    def __init__(self, tag, ID, file_id, actual, expected):
        """
        Construct a new ConsistencyError.

        All arguments are kept as attributes of the same name and are
        rendered into the exception message.

        Arguments:
            tag (string): Level of the inconsistent element (parent)
            ID (string): ``ID`` of the inconsistent element (parent)
            file_id (string): ``mets:id`` of the PAGE file
            actual (string): Value of parent's TextEquiv[0]/Unicode
            expected (string): Concatenated values of children's
                               TextEquiv[0]/Unicode, joined by white-space
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.actual = actual
        self.expected = expected
        message = (
            f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': "
            f"text results '{actual}' != concatenated '{expected}'")
        super().__init__(message)
121
122
class CoordinateConsistencyError(Exception):
    """
    Exception representing a consistency error in coordinate confinement across levels of a PAGE-XML.
    (Element coordinate polygons must be properly contained in their parents' coordinate polygons.)
    """

    def __init__(self, tag, ID, file_id, outer, inner):
        """
        Construct a new CoordinateConsistencyError.

        All arguments are kept as attributes of the same name and are
        rendered into the exception message.

        Arguments:
            tag (string): Level of the offending element (child)
            ID (string): ``ID`` of the offending element (child)
            file_id (string): ``mets:id`` of the PAGE file
            outer (string): Coordinate points of the parent
            inner (string): Coordinate points of the child
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.outer = outer
        self.inner = inner
        message = (
            f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': "
            f"coords '{inner}' not within parent coords '{outer}'")
        super().__init__(message)
147
148
class CoordinateValidityError(Exception):
    """
    Exception representing a validity error of an element's coordinates in PAGE-XML.
    (Element coordinate polygons must have at least 3 points, and must not
     self-intersect or be non-contiguous or be negative.)
    """

    def __init__(self, tag, ID, file_id, points, reason='unknown'):
        """
        Construct a new CoordinateValidityError.

        All arguments are kept as attributes of the same name and are
        rendered into the exception message.

        Arguments:
            tag (string): Level of the offending element (child)
            ID (string): ``ID`` of the offending element (child)
            file_id (string): ``mets:id`` of the PAGE file
            points (string): Coordinate points
            reason (string): description of the problem
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.points = points
        # keep the reason accessible programmatically, not only via the message
        self.reason = reason
        super().__init__(
            f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}")
171
172
def compare_without_whitespace(a, b):
    """
    Compare two strings, ignoring all whitespace.

    Arguments:
        a (string): first string
        b (string): second string

    Returns:
        True if both strings are equal once every run of whitespace
        has been removed from each of them, False otherwise.
    """
    return re.sub(r'\s+', '', a) == re.sub(r'\s+', '', b)
177
178
def page_get_reading_order(ro, rogroup):
    """
    Add all elements from the given reading order group to the given dictionary.

    Given a dict ``ro`` from layout element IDs to ReadingOrder element objects,
    and an object ``rogroup`` with additional ReadingOrder element objects,
    add all references to the dict, traversing the group recursively.

    Arguments:
        ro (dict): mapping to be filled in place (``regionRef`` -> element)
        rogroup: an (un)ordered group object; any other value
                 (including ``None``) contributes nothing
    """
    members = []
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
        # ordered groups only contain indexed members
        members = (rogroup.get_RegionRefIndexed() +
                   rogroup.get_OrderedGroupIndexed() +
                   rogroup.get_UnorderedGroupIndexed())
    if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
        # unordered groups only contain non-indexed members
        members = (rogroup.get_RegionRef() +
                   rogroup.get_OrderedGroup() +
                   rogroup.get_UnorderedGroup())
    for member in members:
        ro[member.get_regionRef()] = member
        if isinstance(member, (RegionRefType, RegionRefIndexedType)):
            # plain references are leaves
            continue
        # nested group: descend
        page_get_reading_order(ro, member)
199
200
def make_poly(polygon_points):
    """
    Instantiate a Polygon from a list of point pairs, or return an error string.

    Arguments:
        polygon_points (list): sequence of coordinate pairs

    Returns:
        the (possibly simplified) ``Polygon`` if it is usable, otherwise a
        string describing why it is not (too few points, invalid, empty,
        or reaching into negative coordinates).
    """
    # NOTE(review): CoordinateValidityError documents "at least 3 points",
    # yet 3-point paths are rejected here as well -- confirm the threshold.
    if len(polygon_points) < 4:
        return 'has too few points'
    poly = Polygon(polygon_points)
    if POLY_TOLERANCE:
        # smooth out small deviations to avoid spurious self-intersections
        poly = poly.simplify(POLY_TOLERANCE)
    if not poly.is_valid:
        return explain_validity(poly)
    if poly.is_empty:
        return 'is empty'
    # bounds start with the minimum x and minimum y
    if poly.bounds[0] < 0 or poly.bounds[1] < 0:
        return 'is negative'
    return poly
214
215
def make_line(line_points):
    """
    Instantiate a LineString from a list of point pairs, or return an error string.

    Arguments:
        line_points (list): sequence of coordinate pairs

    Returns:
        the ``LineString`` if it is usable, otherwise a string describing
        why it is not (too few points, invalid, empty, or reaching into
        negative coordinates).
    """
    if len(line_points) < 2:
        return 'has too few points'
    line = LineString(line_points)
    if not line.is_valid:
        return explain_validity(line)
    if line.is_empty:
        return 'is empty'
    # bounds start with the minimum x and minimum y
    if line.bounds[0] < 0 or line.bounds[1] < 0:
        return 'is negative'
    return line
227
228
@deprecated_alias(strictness='page_textequiv_consistency')
@deprecated_alias(strategy='page_textequiv_strategy')
def validate_consistency(node, page_textequiv_consistency, page_textequiv_strategy,
                         check_baseline, check_coords, report, file_id,
                         joinRelations=None, readingOrder=None,
                         textLineOrder=None, readingDirection=None):
    """
    Check whether the text results on an element is consistent with its child element text results,
    and whether the coordinates of an element are fully within its parent element coordinates.

    Descends recursively from ``node`` down to (but excluding) glyph level.

    Arguments:
        node: PAGE root (PcGtsType/OcrdPage) or any element below it
        page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
        page_textequiv_strategy (string): how to select among multiple text results
        check_baseline (bool): whether to validate Baseline coordinates
        check_coords (bool): whether to validate parent-child containment
        report: object whose ``add_error`` receives every problem found
        file_id (string): identifier of the PAGE file, used in messages
        joinRelations (list): pairs of region IDs related by 'join'
        readingOrder (dict): region ID -> ReadingOrder element
        textLineOrder: inherited text line order
        readingDirection: inherited reading direction

    Returns:
        True if no inconsistency was found on this element or below it,
        False otherwise.
    """
    log = getLogger('ocrd.page_validator.validate_consistency')
    # make both containers usable even when entering below the top level
    if not readingOrder:
        readingOrder = {}
    if not joinRelations:
        joinRelations = []
    if isinstance(node, (PcGtsType, OcrdPage)):
        # top-level (start recursion)
        node_id = node.get_pcGtsId()
        node = node.get_Page() # has no .id
        ro = node.get_ReadingOrder()
        if ro:
            page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
        relations = node.get_Relations() # get RelationsType
        if relations:
            relations = relations.get_Relation() # get list of RelationType
        else:
            relations = []
        for relation in relations:
            if relation.get_type() == 'join': # ignore 'link' type here
                joinRelations.append((relation.get_SourceRegionRef().get_regionRef(),
                                      relation.get_TargetRegionRef().get_regionRef()))
    elif isinstance(node, GlyphType):
        # terminal level (end recursion)
        return True
    else:
        node_id = node.id
    tag = node.original_tagname_
    log.debug("Validating %s %s", tag, node_id)
    consistent = True
    # Always bind these, so that later uses are defined on every path;
    # node_poly stays None whenever there is nothing valid to compare against.
    node_poly = None
    parent_points = None
    if check_coords or check_baseline:
        # the page itself has no Coords: its outline is the Border (if any)
        parent = node.get_Border() if isinstance(node, PageType) else node
        if parent:
            parent_points = parent.get_Coords().points
            node_poly = make_poly(polygon_from_points(parent_points))
            if not isinstance(node_poly, Polygon):
                # make_poly returned a description of the problem
                report.add_error(CoordinateValidityError(tag, node_id, file_id,
                                                         parent_points, node_poly))
                log.debug("Invalid coords of %s %s", tag, node_id)
                consistent = False
                node_poly = None # don't use in further comparisons
    # pick up order attributes (re)defined on this level
    for class_, getterLO, getterRD in _ORDER[1:]:
        if isinstance(node, class_):
            if getterLO:
                textLineOrder = getattr(node, getterLO)()
            if getterRD:
                readingDirection = getattr(node, getterRD)()
    for class_, getter, concatenate_with in _HIERARCHY:
        if not isinstance(node, class_):
            continue
        children = getattr(node, getter)()
        if (getter == 'get_TextRegion' and children and
            all(child.id in readingOrder for child in children) and
            isinstance(readingOrder[children[0].id].parent_object_,
                       (OrderedGroupType, OrderedGroupIndexedType))):
            # text regions fully covered by an ordered group follow its index
            children = sorted(children, key=lambda child:
                              readingOrder[child.id].index)
        elif ((getter == 'get_TextLine' and textLineOrder == _ORDER[0][1]) or
              (getter in ['get_Word', 'get_Glyph'] and readingDirection == _ORDER[0][2])):
            # bottom-to-top lines / right-to-left words and glyphs
            children = list(reversed(children))
        for child in children:
            # recurse first, so that problems below get reported in any case
            consistent = (validate_consistency(child, page_textequiv_consistency, page_textequiv_strategy,
                                               check_baseline, check_coords,
                                               report, file_id,
                                               joinRelations, readingOrder,
                                               textLineOrder, readingDirection)
                          and consistent)
            if check_coords and node_poly:
                child_tag = child.original_tagname_
                child_points = child.get_Coords().points
                child_poly = make_poly(polygon_from_points(child_points))
                if not isinstance(child_poly, Polygon):
                    # invalid child coords: already reported in recursive call above
                    pass
                elif not child_poly.within(node_poly.buffer(PARENT_SLACK)):
                    # TODO: automatic repair?
                    report.add_error(CoordinateConsistencyError(child_tag, child.id, file_id,
                                                                parent_points, child_points))
                    log.debug("Inconsistent coords of %s %s", child_tag, child.id)
                    consistent = False
        if isinstance(node, TextLineType) and check_baseline and node.get_Baseline():
            baseline_points = node.get_Baseline().points
            baseline_line = make_line(polygon_from_points(baseline_points))
            if not isinstance(baseline_line, LineString):
                # make_line returned a description of the problem
                report.add_error(CoordinateValidityError("Baseline", node_id, file_id,
                                                         baseline_points, baseline_line))
                log.debug("Invalid coords of baseline in %s", node_id)
                consistent = False
            elif node_poly and not baseline_line.within(node_poly.buffer(PARENT_SLACK)):
                report.add_error(CoordinateConsistencyError("Baseline", node_id, file_id,
                                                            parent_points, baseline_points))
                log.debug("Inconsistent coords of baseline in %s %s", tag, node_id)
                consistent = False
        if concatenate_with is not None and page_textequiv_consistency != 'off':
            # validate textual consistency of node with children
            concatenated = concatenate(children, concatenate_with, page_textequiv_strategy,
                                       joinRelations)
            text_results = get_text(node, page_textequiv_strategy)
            # only compare when both levels actually carry text
            if concatenated and text_results and concatenated != text_results:
                consistent = False
                if page_textequiv_consistency == 'fix':
                    log.debug("Repaired text of %s %s", tag, node_id)
                    set_text(node, concatenated, page_textequiv_strategy)
                elif (page_textequiv_consistency == 'strict' # or 'lax' but...
                      or not compare_without_whitespace(concatenated, text_results)):
                    log.debug("Inconsistent text of %s %s", tag, node_id)
                    report.add_error(ConsistencyError(tag, node_id, file_id,
                                                      text_results, concatenated))
    return consistent
353
354
def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
    """
    Concatenate nodes textually according to https://ocr-d.github.io/page#consistency-of-text-results-on-different-levels

    Arguments:
        nodes (list): sibling elements, in the order to be concatenated
        concatenate_with (string): separator inserted between neighbours
        page_textequiv_strategy (string): how to select each node's text result
        joins (list): pairs of element IDs ``(left, right)``; neighbours listed
                      here are concatenated without the separator

    Returns:
        the concatenated text, stripped of leading and trailing whitespace
        (the empty string if there are no nodes).
    """
    if not nodes:
        return ''
    if not joins:
        joins = []
    parts = [get_text(nodes[0], page_textequiv_strategy)]
    for node, next_node in zip(nodes, nodes[1:]):
        if (node.id, next_node.id) not in joins:
            # TODO: also cover 2-level joins like word-word
            parts.append(concatenate_with)
        parts.append(get_text(next_node, page_textequiv_strategy))
    return ''.join(parts).strip()
369
370
def get_text(node, page_textequiv_strategy='first'):
    """
    Get the first or most confident among text results (depending on ``page_textequiv_strategy``).
    For the strategy ``best``, return the string of the highest scoring result.
    For the strategy ``first``, return the string of the lowest indexed result.
    If there are no scores/indexes, use the first result.
    If there are no results, return the empty string.

    Arguments:
        node: element offering ``get_TextEquiv()`` and ``id``
        page_textequiv_strategy (string): ``best``, or anything else for ``first``

    Returns:
        the selected text, stripped of leading and trailing whitespace
        (the empty string if the selected result carries no text).
    """
    log = getLogger('ocrd.page_validator.get_text')
    textEquivs = node.get_TextEquiv()
    if not textEquivs:
        log.debug("No text results on %s %s", node, node.id)
        return ''
    # fall back to first element unless the strategy finds a better one
    chosen = textEquivs[0]
    if len(textEquivs) > 1:
        if page_textequiv_strategy == 'best':
            # generateDS does not convert simpleType for attributes (yet?)
            ranked = sorted([x for x in textEquivs if x.conf],
                            key=lambda x: float(x.conf))
            if ranked:
                chosen = ranked[-1]
        else: # page_textequiv_strategy == 'first'
            ranked = sorted([x for x in textEquivs if isinstance(x.index, int)],
                            key=lambda x: x.index)
            if ranked:
                chosen = ranked[0]
    # a result without any Unicode content counts as empty text
    return (chosen.get_Unicode() or '').strip()
401
402
def set_text(node, text, page_textequiv_strategy):
    """
    Set the first or most confident among text results (depending on ``page_textequiv_strategy``).
    For the strategy ``best``, set the string of the highest scoring result.
    For the strategy ``first``, set the string of the lowest indexed result.
    If there are no scores/indexes, use the first result.
    If there are no results, add a new one.

    Arguments:
        node: element offering ``get_TextEquiv()`` and ``add_TextEquiv()``
        text (string): new text; stored without leading/trailing whitespace
        page_textequiv_strategy (string): ``best``, or anything else for ``first``
    """
    text = text.strip()
    textEquivs = node.get_TextEquiv()
    if not textEquivs:
        node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ?
        return
    # fall back to first element unless the strategy finds a better one
    target = textEquivs[0]
    if len(textEquivs) > 1:
        if page_textequiv_strategy == 'best':
            # generateDS does not convert simpleType for attributes (yet?)
            ranked = sorted([x for x in textEquivs if x.conf],
                            key=lambda x: float(x.conf))
            if ranked:
                target = ranked[-1]
        else: # page_textequiv_strategy == 'first'
            ranked = sorted([x for x in textEquivs if isinstance(x.index, int)],
                            key=lambda x: x.index)
            if ranked:
                target = ranked[0]
    target.set_Unicode(text)
434
435
class PageValidator():
    """
    Validator for `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`.
    """

    @staticmethod
    @deprecated_alias(strictness='page_textequiv_consistency')
    @deprecated_alias(strategy='page_textequiv_strategy')
    def validate(filename=None, ocrd_page=None, ocrd_file=None,
                 page_textequiv_consistency='strict', page_textequiv_strategy='first',
                 check_baseline=True, check_coords=True):
        """
        Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

        If more than one source is given, ``ocrd_page`` takes precedence
        over ``ocrd_file``, which takes precedence over ``filename``.

        Arguments:
            filename (string): Path to PAGE
            ocrd_page (OcrdPage): OcrdPage instance
            ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
            page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
            page_textequiv_strategy (string): Currently only 'first'
            check_baseline (bool): whether Baseline must be fully within TextLine/Coords
            check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
                                 contained within Border/*Region/TextLine/Word, resp.

        Returns:
            report (:class:`ValidationReport`) Report on the validity

        Raises:
            ValueError: if no input is given, or if the strategy or
                        consistency level is not supported
        """
        log = getLogger('ocrd.page_validator.validate')
        if ocrd_page:
            page = ocrd_page
            file_id = ocrd_page.get_pcGtsId()
        elif ocrd_file:
            page = page_from_file(ocrd_file)
            file_id = ocrd_file.ID
        elif filename:
            page = parse(filename, silence=True)
            file_id = filename
        else:
            raise ValueError("At least one of ocrd_page, ocrd_file or filename must be set")
        # must be a real tuple: ('first') is just the string 'first', on which
        # ``in`` is a substring test that also accepts '', 'f', 'irs', ...
        if page_textequiv_strategy not in ('first',):
            raise ValueError("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
        if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
            raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
        report = ValidationReport()
        log.info("Validating input file '%s'", file_id)
        # problems are collected in the report; the boolean result is not needed here
        validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy,
                             check_baseline, check_coords, report, file_id)
        return report
482