"""
API for validating `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`_.
"""
import re
from shapely.geometry import Polygon, LineString
from shapely.validation import explain_validity

from ocrd_utils import getLogger, polygon_from_points, deprecated_alias
from ocrd_models.ocrd_page import OcrdPage, parse
from ocrd_modelfactory import page_from_file

from ocrd_models.ocrd_page import (
    PcGtsType,
    PageType,
    TextRegionType,
    TextLineType,
    WordType,
    GlyphType,
    TextEquivType
)
from ocrd_models.ocrd_page_generateds import (
    RegionType,
    ReadingDirectionSimpleType,
    TextLineOrderSimpleType,
    RegionRefType,
    RegionRefIndexedType,
    OrderedGroupType,
    OrderedGroupIndexedType,
    UnorderedGroupType,
    UnorderedGroupIndexedType,
)
from ocrd_models import ValidationReport


_HIERARCHY = [
    # page can contain different types of regions
    (PageType, 'get_AdvertRegion', None),
    (PageType, 'get_ChartRegion', None),
    (PageType, 'get_ChemRegion', None),
    (PageType, 'get_CustomRegion', None),
    (PageType, 'get_GraphicRegion', None),
    (PageType, 'get_ImageRegion', None),
    (PageType, 'get_LineDrawingRegion', None),
    (PageType, 'get_MapRegion', None),
    (PageType, 'get_MathsRegion', None),
    (PageType, 'get_MusicRegion', None),
    (PageType, 'get_NoiseRegion', None),
    (PageType, 'get_SeparatorRegion', None),
    (PageType, 'get_TableRegion', None),
    (PageType, 'get_TextRegion', None),
    (PageType, 'get_UnknownRegion', None),
    # all regions can be recursive
    (RegionType, 'get_AdvertRegion', None),
    (RegionType, 'get_ChartRegion', None),
    (RegionType, 'get_ChemRegion', None),
    (RegionType, 'get_CustomRegion', None),
    (RegionType, 'get_GraphicRegion', None),
    (RegionType, 'get_ImageRegion', None),
    (RegionType, 'get_LineDrawingRegion', None),
    #(RegionType, 'get_MapRegion', None),
    (RegionType, 'get_MathsRegion', None),
    (RegionType, 'get_MusicRegion', None),
    (RegionType, 'get_NoiseRegion', None),
    (RegionType, 'get_SeparatorRegion', None),
    (RegionType, 'get_TableRegion', None),
    (RegionType, 'get_TextRegion', None),
    (RegionType, 'get_UnknownRegion', None),
    # only TextRegion can contain TextLine
    (TextRegionType, 'get_TextLine', '\n'),
    (TextLineType, 'get_Word', ' '),
    (WordType, 'get_Glyph', ''),
    (GlyphType, None, None),
]

_ORDER = [
    (None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT),
    (PageType, 'get_textLineOrder', 'get_readingDirection'),
    (TextRegionType, 'get_textLineOrder', 'get_readingDirection'),
    (TextLineType, None, 'get_readingDirection'),
    (WordType, None, 'get_readingDirection'),
]

# The following parameters control how tolerant we are with respect to
# polygon path self-validity and parent-child containment. We have to
# offer this, because most implementations, including PRImA itself,
# do _not_ offer pixel-precise correctness.
# How much may polygon paths deviate when simplifying them
# to avoid self-intersections?
POLY_TOLERANCE = 1.0
# How large a margin to increase parent polygons before
# checking their children are properly contained?
PARENT_SLACK = 1.5
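
# Illustrative sketch (not part of this module's API; coordinates made up):
# how PARENT_SLACK enters the containment checks further below, using shapely directly.
#
#     parent = Polygon([(0, 0), (100, 0), (100, 50), (0, 50)])
#     child = Polygon([(10, 10), (101, 10), (101, 40), (10, 40)])  # sticks out by 1px
#     child.within(parent)                       # False - strictly outside
#     child.within(parent.buffer(PARENT_SLACK))  # True - within the 1.5px slack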


class ConsistencyError(Exception):
    """
    Exception representing a consistency error in textual transcription across levels of a PAGE-XML.
    (Element text strings must be the concatenation of their children's text strings, joined by white space.)
    """

    def __init__(self, tag, ID, file_id, actual, expected):
        """
        Construct a new ConsistencyError.

        Arguments:
            tag (string): Level of the inconsistent element (parent)
            ID (string): ``ID`` of the inconsistent element (parent)
            file_id (string): ``mets:id`` of the PAGE file
            actual (string): Value of parent's TextEquiv[0]/Unicode
            expected (string): Concatenated values of children's
                               TextEquiv[0]/Unicode, joined by white-space
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.actual = actual
        self.expected = expected
        super().__init__(
            f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': "
            f"text results '{actual}' != concatenated '{expected}'")


class CoordinateConsistencyError(Exception):
    """
    Exception representing a consistency error in coordinate confinement across levels of a PAGE-XML.
    (Element coordinate polygons must be properly contained in their parents' coordinate polygons.)
    """

    def __init__(self, tag, ID, file_id, outer, inner):
        """
        Construct a new CoordinateConsistencyError.

        Arguments:
            tag (string): Level of the offending element (child)
            ID (string): ``ID`` of the offending element (child)
            file_id (string): ``mets:id`` of the PAGE file
            outer (string): Coordinate points of the parent
            inner (string): Coordinate points of the child
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.outer = outer
        self.inner = inner
        super().__init__(
            f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': "
            f"coords '{inner}' not within parent coords '{outer}'")

class CoordinateValidityError(Exception):
    """
    Exception representing a validity error of an element's coordinates in PAGE-XML.
    (Element coordinate polygons must have at least 3 points, and must not
    self-intersect, be non-contiguous, or have negative coordinates.)
    """

    def __init__(self, tag, ID, file_id, points, reason='unknown'):
        """
        Construct a new CoordinateValidityError.

        Arguments:
            tag (string): Level of the offending element (child)
            ID (string): ``ID`` of the offending element (child)
            file_id (string): ``mets:id`` of the PAGE file
            points (string): Coordinate points
            reason (string): Description of the problem
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.points = points
        super().__init__(
            f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}")

def compare_without_whitespace(a, b):
    """
    Compare two strings, ignoring all whitespace.
    """
    return re.sub('\\s+', '', a) == re.sub('\\s+', '', b)


def page_get_reading_order(ro, rogroup):
    """
    Add all elements from the given reading order group to the given dictionary.

    Given a dict ``ro`` from layout element IDs to ReadingOrder element objects,
    and an object ``rogroup`` with additional ReadingOrder element objects,
    add all references to the dict, traversing the group recursively.
    """
    regionrefs = []
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed())
    if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRef() +
                      rogroup.get_OrderedGroup() +
                      rogroup.get_UnorderedGroup())
    for elem in regionrefs:
        ro[elem.get_regionRef()] = elem
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
            page_get_reading_order(ro, elem)
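
# Minimal usage sketch (variable names illustrative, mirroring validate_consistency below):
# collect the reading order of a parsed page into a dict from region IDs to their
# ReadingOrder elements.
#
#     page = pcgts.get_Page()
#     reading_order = {}
#     ro = page.get_ReadingOrder()
#     if ro:
#         page_get_reading_order(reading_order, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
#     # reading_order now maps every referenced region ID to its RegionRef*/OrderedGroup*/UnorderedGroup* element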


def make_poly(polygon_points):
    """Instantiate a Polygon from a list of point pairs, or return an error string"""
    if len(polygon_points) < 4:
        return 'has too few points'
    poly = Polygon(polygon_points)
    if POLY_TOLERANCE:
        poly = poly.simplify(POLY_TOLERANCE)
    if not poly.is_valid:
        return explain_validity(poly)
    elif poly.is_empty:
        return 'is empty'
    elif poly.bounds[0] < 0 or poly.bounds[1] < 0:
        return 'is negative'
    return poly


def make_line(line_points):
    """Instantiate a LineString from a list of point pairs, or return an error string"""
    if len(line_points) < 2:
        return 'has too few points'
    line = LineString(line_points)
    if not line.is_valid:
        return explain_validity(line)
    elif line.is_empty:
        return 'is empty'
    elif line.bounds[0] < 0 or line.bounds[1] < 0:
        return 'is negative'
    return line
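
# Usage sketch: both helpers take a list of (x, y) pairs, e.g. as produced by
# ocrd_utils.polygon_from_points, and return either a shapely geometry or a
# human-readable error string (hence the isinstance checks in validate_consistency).
# The points string is made up for illustration.
#
#     geom = make_poly(polygon_from_points("0,0 100,0 100,50 0,50"))
#     if not isinstance(geom, Polygon):
#         print("invalid polygon:", geom)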


@deprecated_alias(strictness='page_textequiv_consistency')
@deprecated_alias(strategy='page_textequiv_strategy')
def validate_consistency(node, page_textequiv_consistency, page_textequiv_strategy,
                         check_baseline, check_coords, report, file_id,
                         joinRelations=None, readingOrder=None,
                         textLineOrder=None, readingDirection=None):
    """
    Check whether the text results on an element are consistent with its child elements' text results,
    and whether the coordinates of an element are fully within its parent element's coordinates.
    """
    log = getLogger('ocrd.page_validator.validate_consistency')
    if isinstance(node, (PcGtsType, OcrdPage)):
        # top-level (start recursion)
        node_id = node.get_pcGtsId()
        node = node.get_Page() # has no .id
        if not readingOrder:
            readingOrder = {}
        ro = node.get_ReadingOrder()
        if ro:
            page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
        if not joinRelations:
            joinRelations = []
        relations = node.get_Relations() # get RelationsType
        if relations:
            relations = relations.get_Relation() # get list of RelationType
        else:
            relations = []
        for relation in relations:
            if relation.get_type() == 'join': # ignore 'link' type here
                joinRelations.append((relation.get_SourceRegionRef().get_regionRef(),
                                      relation.get_TargetRegionRef().get_regionRef()))
    elif isinstance(node, GlyphType):
        # terminal level (end recursion)
        return True
    else:
        node_id = node.id
    tag = node.original_tagname_
    log.debug("Validating %s %s", tag, node_id)
    consistent = True
    if check_coords or check_baseline:
        if isinstance(node, PageType):
            parent = node.get_Border()
        else:
            parent = node
        if parent:
            parent_points = parent.get_Coords().points
            node_poly = make_poly(polygon_from_points(parent_points))
            if not isinstance(node_poly, Polygon):
                report.add_error(CoordinateValidityError(tag, node_id, file_id,
                                                         parent_points, node_poly))
                log.debug("Invalid coords of %s %s", tag, node_id)
                consistent = False
                node_poly = None # don't use in further comparisons
        else:
            node_poly = None
    for class_, getterLO, getterRD in _ORDER[1:]:
        if isinstance(node, class_):
            if getterLO:
                textLineOrder = getattr(node, getterLO)()
            if getterRD:
                readingDirection = getattr(node, getterRD)()
    for class_, getter, concatenate_with in _HIERARCHY:
        if not isinstance(node, class_):
            continue
        children = getattr(node, getter)()
        if (getter == 'get_TextRegion' and children and
                all(child.id in readingOrder for child in children) and
                isinstance(readingOrder[children[0].id].parent_object_,
                           (OrderedGroupType, OrderedGroupIndexedType))):
            children = sorted(children, key=lambda child:
                              readingOrder[child.id].index)
        elif ((getter == 'get_TextLine' and textLineOrder == _ORDER[0][1]) or
              (getter in ['get_Word', 'get_Glyph'] and readingDirection == _ORDER[0][2])):
            children = list(reversed(children))
        for child in children:
            consistent = (validate_consistency(child, page_textequiv_consistency, page_textequiv_strategy,
                                               check_baseline, check_coords,
                                               report, file_id,
                                               joinRelations, readingOrder,
                                               textLineOrder, readingDirection)
                          and consistent)
            if check_coords and node_poly:
                child_tag = child.original_tagname_
                child_points = child.get_Coords().points
                child_poly = make_poly(polygon_from_points(child_points))
                if not isinstance(child_poly, Polygon):
                    # report.add_error(CoordinateValidityError(child_tag, child.id, file_id, child_points))
                    # log.debug("Invalid coords of %s %s", child_tag, child.id)
                    # consistent = False
                    pass # already reported in recursive call above
                elif not child_poly.within(node_poly.buffer(PARENT_SLACK)):
                    # TODO: automatic repair?
                    report.add_error(CoordinateConsistencyError(child_tag, child.id, file_id,
                                                                parent_points, child_points))
                    log.debug("Inconsistent coords of %s %s", child_tag, child.id)
                    consistent = False
        if isinstance(node, TextLineType) and check_baseline and node.get_Baseline():
            baseline_points = node.get_Baseline().points
            baseline_line = make_line(polygon_from_points(baseline_points))
            if not isinstance(baseline_line, LineString):
                report.add_error(CoordinateValidityError("Baseline", node_id, file_id,
                                                         baseline_points, baseline_line))
                log.debug("Invalid coords of baseline in %s", node_id)
                consistent = False
            elif node_poly and not baseline_line.within(node_poly.buffer(PARENT_SLACK)):
                report.add_error(CoordinateConsistencyError("Baseline", node_id, file_id,
                                                            parent_points, baseline_points))
                log.debug("Inconsistent coords of baseline in %s %s", tag, node_id)
                consistent = False
        if concatenate_with is not None and page_textequiv_consistency != 'off':
            # validate textual consistency of node with children
            concatenated = concatenate(children, concatenate_with, page_textequiv_strategy,
                                       joinRelations)
            text_results = get_text(node, page_textequiv_strategy)
            if concatenated and text_results and concatenated != text_results:
                consistent = False
                if page_textequiv_consistency == 'fix':
                    log.debug("Repaired text of %s %s", tag, node_id)
                    set_text(node, concatenated, page_textequiv_strategy)
                elif (page_textequiv_consistency == 'strict' # or 'lax' but...
                      or not compare_without_whitespace(concatenated, text_results)):
                    log.debug("Inconsistent text of %s %s", tag, node_id)
                    report.add_error(ConsistencyError(tag, node_id, file_id,
                                                      text_results, concatenated))
    return consistent

def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
    """
    Concatenate nodes textually according to https://ocr-d.github.io/page#consistency-of-text-results-on-different-levels
    """
    if not nodes:
        return ''
    if not joins:
        joins = []
    result = get_text(nodes[0], page_textequiv_strategy)
    for node, next_node in zip(nodes, nodes[1:]):
        if (node.id, next_node.id) not in joins:
            # TODO: also cover 2-level joins like word-word
            result += concatenate_with
        result += get_text(next_node, page_textequiv_strategy)
    return result.strip()


def get_text(node, page_textequiv_strategy='first'):
    """
    Get the first or most confident among text results (depending on ``page_textequiv_strategy``).
    For the strategy ``best``, return the string of the highest scoring result.
    For the strategy ``first``, return the string of the lowest indexed result.
    If there are no scores/indexes, use the first result.
    If there are no results, return the empty string.
    """
    log = getLogger('ocrd.page_validator.get_text')
    textEquivs = node.get_TextEquiv()
    if not textEquivs:
        log.debug("No text results on %s %s", node, node.id)
        return ''
    elif page_textequiv_strategy == 'best':
        if len(textEquivs) > 1:
            textEquivsSorted = sorted([x for x in textEquivs if x.conf],
                                      # generateDS does not convert simpleType for attributes (yet?)
                                      key=lambda x: float(x.conf))
            if textEquivsSorted:
                return textEquivsSorted[-1].get_Unicode().strip()
        # fall back to first element
        return textEquivs[0].get_Unicode().strip()
    #elif page_textequiv_strategy == 'first':
    else:
        if len(textEquivs) > 1:
            textEquivsSorted = sorted([x for x in textEquivs if isinstance(x.index, int)],
                                      key=lambda x: x.index)
            if textEquivsSorted:
                return textEquivsSorted[0].get_Unicode().strip()
        # fall back to first element
        return textEquivs[0].get_Unicode().strip()
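
# Behaviour sketch for the two strategies (built with the generateDS keyword
# arguments of the ocrd_models PAGE types; IDs, text and scores made up):
#
#     word = WordType(id='w1', TextEquiv=[
#         TextEquivType(Unicode='foo', index=1, conf=0.9),
#         TextEquivType(Unicode='bar', index=0, conf=0.4)])
#     get_text(word, 'first')  # 'bar' - lowest index wins
#     get_text(word, 'best')   # 'foo' - highest confidence wins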


def set_text(node, text, page_textequiv_strategy):
    """
    Set the first or most confident among text results (depending on ``page_textequiv_strategy``).
    For the strategy ``best``, set the string of the highest scoring result.
    For the strategy ``first``, set the string of the lowest indexed result.
    If there are no scores/indexes, use the first result.
    If there are no results, add a new one.
    """
    text = text.strip()
    textEquivs = node.get_TextEquiv()
    if not textEquivs:
        node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ?
    elif page_textequiv_strategy == 'best':
        if len(textEquivs) > 1:
            textEquivsSorted = sorted([x for x in textEquivs if x.conf],
                                      # generateDS does not convert simpleType for attributes (yet?)
                                      key=lambda x: float(x.conf))
            if textEquivsSorted:
                textEquivsSorted[-1].set_Unicode(text)
                return
        # fall back to first element
        textEquivs[0].set_Unicode(text)
    #elif page_textequiv_strategy == 'first':
    else:
        if len(textEquivs) > 1:
            textEquivsSorted = sorted([x for x in textEquivs if isinstance(x.index, int)],
                                      key=lambda x: x.index)
            if textEquivsSorted:
                textEquivsSorted[0].set_Unicode(text)
                return
        # fall back to first element
        textEquivs[0].set_Unicode(text)

class PageValidator():
    """
    Validator for `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`_.
    """

    @staticmethod
    @deprecated_alias(strictness='page_textequiv_consistency')
    @deprecated_alias(strategy='page_textequiv_strategy')
    def validate(filename=None, ocrd_page=None, ocrd_file=None,
                 page_textequiv_consistency='strict', page_textequiv_strategy='first',
                 check_baseline=True, check_coords=True):
        """
        Validate a PAGE file for consistency, given a filename, an OcrdFile, or an OcrdPage directly.

        Arguments:
            filename (string): Path to PAGE
            ocrd_page (OcrdPage): OcrdPage instance
            ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
            page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
            page_textequiv_strategy (string): Currently only 'first'
            check_baseline (bool): whether Baseline must be fully within TextLine/Coords
            check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
                                 contained within Border/*Region/TextLine/Word, resp.

        Returns:
            report (:class:`ValidationReport`): Report on the validity
        """
        log = getLogger('ocrd.page_validator.validate')
        if ocrd_page:
            page = ocrd_page
            file_id = ocrd_page.get_pcGtsId()
        elif ocrd_file:
            page = page_from_file(ocrd_file)
            file_id = ocrd_file.ID
        elif filename:
            page = parse(filename, silence=True)
            file_id = filename
        else:
            raise ValueError("At least one of ocrd_page, ocrd_file or filename must be set")
        if page_textequiv_strategy not in ('first', ):
            raise ValueError("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
        if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
            raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
        report = ValidationReport()
        log.info("Validating input file '%s'", file_id)
        validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords,
                             report, file_id)
        return report
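
# Typical use (the path is illustrative; see ocrd_models.ValidationReport for the
# full report API):
#
#     report = PageValidator.validate(filename='OCR-D-GT-PAGE/PAGE_0001.xml',
#                                     page_textequiv_consistency='lax')
#     if not report.is_valid:
#         print(report.errors)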