"""
API for validating `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`_.
"""
import re
from shapely.geometry import Polygon, LineString
from shapely.validation import explain_validity

from ocrd_utils import getLogger, polygon_from_points, deprecated_alias
from ocrd_models.ocrd_page import OcrdPage, parse
from ocrd_modelfactory import page_from_file

from ocrd_models.ocrd_page import (
    PcGtsType,
    PageType,
    TextRegionType,
    TextLineType,
    WordType,
    GlyphType,
    TextEquivType
)
from ocrd_models.ocrd_page_generateds import (
    RegionType,
    ReadingDirectionSimpleType,
    TextLineOrderSimpleType,
    RegionRefType,
    RegionRefIndexedType,
    OrderedGroupType,
    OrderedGroupIndexedType,
    UnorderedGroupType,
    UnorderedGroupIndexedType,
)
from ocrd_models import ValidationReport


_HIERARCHY = [
    # page can contain different types of regions
    (PageType, 'get_AdvertRegion', None),
    (PageType, 'get_ChartRegion', None),
    (PageType, 'get_ChemRegion', None),
    (PageType, 'get_CustomRegion', None),
    (PageType, 'get_GraphicRegion', None),
    (PageType, 'get_ImageRegion', None),
    (PageType, 'get_LineDrawingRegion', None),
    (PageType, 'get_MapRegion', None),
    (PageType, 'get_MathsRegion', None),
    (PageType, 'get_MusicRegion', None),
    (PageType, 'get_NoiseRegion', None),
    (PageType, 'get_SeparatorRegion', None),
    (PageType, 'get_TableRegion', None),
    (PageType, 'get_TextRegion', None),
    (PageType, 'get_UnknownRegion', None),
    # all regions can be recursive
    (RegionType, 'get_AdvertRegion', None),
    (RegionType, 'get_ChartRegion', None),
    (RegionType, 'get_ChemRegion', None),
    (RegionType, 'get_CustomRegion', None),
    (RegionType, 'get_GraphicRegion', None),
    (RegionType, 'get_ImageRegion', None),
    (RegionType, 'get_LineDrawingRegion', None),
    #(RegionType, 'get_MapRegion', None),
    (RegionType, 'get_MathsRegion', None),
    (RegionType, 'get_MusicRegion', None),
    (RegionType, 'get_NoiseRegion', None),
    (RegionType, 'get_SeparatorRegion', None),
    (RegionType, 'get_TableRegion', None),
    (RegionType, 'get_TextRegion', None),
    (RegionType, 'get_UnknownRegion', None),
    # only TextRegion can contain TextLine
    (TextRegionType, 'get_TextLine', '\n'),
    (TextLineType, 'get_Word', ' '),
    (WordType, 'get_Glyph', ''),
    (GlyphType, None, None),
]

_ORDER = [
    (None, TextLineOrderSimpleType.BOTTOMTOTOP, ReadingDirectionSimpleType.RIGHTTOLEFT),
    (PageType, 'get_textLineOrder', 'get_readingDirection'),
    (TextRegionType, 'get_textLineOrder', 'get_readingDirection'),
    (TextLineType, None, 'get_readingDirection'),
    (WordType, None, 'get_readingDirection'),
]

# The following parameters control how tolerant we are with respect to
# polygon path self-validity and parent-child containment. We have to
# offer this, because most implementations, including PRImA itself,
# do _not_ offer pixel-precise correctness.
# How much may polygon paths deviate when simplifying them
# to avoid self-intersections?
POLY_TOLERANCE = 1.0
# How large a margin to increase parent polygons before
# checking their children are properly contained?
PARENT_SLACK = 1.5
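
# Illustrative sketch (not executed) of how these tolerances are applied below:
# polygon paths are simplified by up to POLY_TOLERANCE, and a child polygon is
# accepted if it lies within its parent grown by PARENT_SLACK.
#
#     from shapely.geometry import Polygon
#     parent = Polygon([(0, 0), (100, 0), (100, 50), (0, 50)])
#     child = Polygon([(10, 10), (101, 10), (101, 40), (10, 40)])  # overshoots by 1px
#     child.within(parent)                       # False (strictly outside)
#     child.within(parent.buffer(PARENT_SLACK))  # True (within the slack margin)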


class ConsistencyError(Exception):
    """
    Exception representing a consistency error in textual transcription across levels of a PAGE-XML.
    (Element text strings must be the concatenation of their children's text strings, joined by white space.)
    """

    def __init__(self, tag, ID, file_id, actual, expected):
        """
        Construct a new ConsistencyError.

        Arguments:
            tag (string): Level of the inconsistent element (parent)
            ID (string): ``ID`` of the inconsistent element (parent)
            file_id (string): ``mets:id`` of the PAGE file
            actual (string): Value of parent's TextEquiv[0]/Unicode
            expected (string): Concatenated values of children's
                TextEquiv[0]/Unicode, joined by white-space
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.actual = actual
        self.expected = expected
        super().__init__(
            f"INCONSISTENCY in {tag} ID '{ID}' of file '{file_id}': "
            f"text results '{actual}' != concatenated '{expected}'")

class CoordinateConsistencyError(Exception):
    """
    Exception representing a consistency error in coordinate confinement across levels of a PAGE-XML.
    (Element coordinate polygons must be properly contained in their parents' coordinate polygons.)
    """

    def __init__(self, tag, ID, file_id, outer, inner):
        """
        Construct a new CoordinateConsistencyError.

        Arguments:
            tag (string): Level of the offending element (child)
            ID (string): ``ID`` of the offending element (child)
            file_id (string): ``mets:id`` of the PAGE file
            outer (string): Coordinate points of the parent
            inner (string): Coordinate points of the child
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.outer = outer
        self.inner = inner
        super().__init__(
            f"INCONSISTENCY in {tag} ID '{ID}' of '{file_id}': "
            f"coords '{inner}' not within parent coords '{outer}'")

class CoordinateValidityError(Exception):
    """
    Exception representing a validity error of an element's coordinates in PAGE-XML.
    (Element coordinate polygons must have at least 3 points, and must not
    self-intersect or be non-contiguous or be negative.)
    """

    def __init__(self, tag, ID, file_id, points, reason='unknown'):
        """
        Construct a new CoordinateValidityError.

        Arguments:
            tag (string): Level of the offending element (child)
            ID (string): ``ID`` of the offending element (child)
            file_id (string): ``mets:id`` of the PAGE file
            points (string): Coordinate points
            reason (string): description of the problem
        """
        self.tag = tag
        self.ID = ID
        self.file_id = file_id
        self.points = points
        super().__init__(
            f"INVALIDITY in {tag} ID '{ID}' of '{file_id}': coords '{points}' - {reason}")

def compare_without_whitespace(a, b):
    """
    Compare two strings, ignoring all whitespace.
    """
    return re.sub('\\s+', '', a) == re.sub('\\s+', '', b)

def page_get_reading_order(ro, rogroup):
    """
    Add all elements from the given reading order group to the given dictionary.

    Given a dict ``ro`` from layout element IDs to ReadingOrder element objects,
    and an object ``rogroup`` with additional ReadingOrder element objects,
    add all references to the dict, traversing the group recursively.
    """
    regionrefs = []
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed())
    if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRef() +
                      rogroup.get_OrderedGroup() +
                      rogroup.get_UnorderedGroup())
    for elem in regionrefs:
        ro[elem.get_regionRef()] = elem
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
            page_get_reading_order(ro, elem)
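
# Example (illustrative, assuming a parsed ``PcGtsType`` instance ``pcgts``):
# build a lookup from region IDs to their ReadingOrder references, exactly as
# validate_consistency() does below.
#
#     reading_order = {}
#     ro = pcgts.get_Page().get_ReadingOrder()
#     if ro:
#         page_get_reading_order(reading_order, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
#     # reading_order now maps e.g. 'region0001' -> its RegionRefIndexed element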

def make_poly(polygon_points):
    """Instantiate a Polygon from a list of point pairs, or return an error string"""
    if len(polygon_points) < 4:
        return 'has too few points'
    poly = Polygon(polygon_points)
    if POLY_TOLERANCE:
        poly = poly.simplify(POLY_TOLERANCE)
    if not poly.is_valid:
        return explain_validity(poly)
    elif poly.is_empty:
        return 'is empty'
    elif poly.bounds[0] < 0 or poly.bounds[1] < 0:
        return 'is negative'
    return poly

def make_line(line_points):
    """Instantiate a LineString from a list of point pairs, or return an error string"""
    if len(line_points) < 2:
        return 'has too few points'
    line = LineString(line_points)
    if not line.is_valid:
        return explain_validity(line)
    elif line.is_empty:
        return 'is empty'
    elif line.bounds[0] < 0 or line.bounds[1] < 0:
        return 'is negative'
    return line
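
# Example (illustrative): both helpers return a Shapely geometry on success and
# a plain error string otherwise, so callers check the returned type.
#
#     make_poly([(0, 0), (10, 0), (10, 10)])             # 'has too few points'
#     poly = make_poly([(0, 0), (10, 0), (10, 10), (0, 10)])
#     isinstance(poly, Polygon)                           # True
#     make_line([(5, 5)])                                 # 'has too few points'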

@deprecated_alias(strictness='page_textequiv_consistency')
@deprecated_alias(strategy='page_textequiv_strategy')
def validate_consistency(node, page_textequiv_consistency, page_textequiv_strategy,
                         check_baseline, check_coords, report, file_id,
                         joinRelations=None, readingOrder=None,
                         textLineOrder=None, readingDirection=None):
    """
    Check whether the text results on an element are consistent with its child elements' text results,
    and whether the coordinates of an element are fully within its parent element's coordinates.
    """
    log = getLogger('ocrd.page_validator.validate_consistency')
    if isinstance(node, (PcGtsType, OcrdPage)):
        # top-level (start recursion)
        node_id = node.get_pcGtsId()
        node = node.get_Page() # has no .id
        if not readingOrder:
            readingOrder = {}
        ro = node.get_ReadingOrder()
        if ro:
            page_get_reading_order(readingOrder, ro.get_OrderedGroup() or ro.get_UnorderedGroup())
        if not joinRelations:
            joinRelations = []
        relations = node.get_Relations() # get RelationsType
        if relations:
            relations = relations.get_Relation() # get list of RelationType
        else:
            relations = []
        for relation in relations:
            if relation.get_type() == 'join': # ignore 'link' type here
                joinRelations.append((relation.get_SourceRegionRef().get_regionRef(),
                                      relation.get_TargetRegionRef().get_regionRef()))
    elif isinstance(node, GlyphType):
        # terminal level (end recursion)
        return True
    else:
        node_id = node.id
    tag = node.original_tagname_
    log.debug("Validating %s %s", tag, node_id)
    consistent = True
    if check_coords or check_baseline:
        if isinstance(node, PageType):
            parent = node.get_Border()
        else:
            parent = node
        if parent:
            parent_points = parent.get_Coords().points
            node_poly = make_poly(polygon_from_points(parent_points))
            if not isinstance(node_poly, Polygon):
                report.add_error(CoordinateValidityError(tag, node_id, file_id,
                                                         parent_points, node_poly))
                log.debug("Invalid coords of %s %s", tag, node_id)
                consistent = False
                node_poly = None # don't use in further comparisons
        else:
            node_poly = None
    for class_, getterLO, getterRD in _ORDER[1:]:
        if isinstance(node, class_):
            if getterLO:
                textLineOrder = getattr(node, getterLO)()
            if getterRD:
                readingDirection = getattr(node, getterRD)()
    for class_, getter, concatenate_with in _HIERARCHY:
        if not isinstance(node, class_):
            continue
        children = getattr(node, getter)()
        if (getter == 'get_TextRegion' and children and
            all(child.id in readingOrder for child in children) and
            isinstance(readingOrder[children[0].id].parent_object_,
                       (OrderedGroupType, OrderedGroupIndexedType))):
            children = sorted(children, key=lambda child:
                              readingOrder[child.id].index)
        elif ((getter == 'get_TextLine' and textLineOrder == _ORDER[0][1]) or
              (getter in ['get_Word', 'get_Glyph'] and readingDirection == _ORDER[0][2])):
            children = list(reversed(children))
        for child in children:
            consistent = (validate_consistency(child, page_textequiv_consistency, page_textequiv_strategy,
                                               check_baseline, check_coords,
                                               report, file_id,
                                               joinRelations, readingOrder,
                                               textLineOrder, readingDirection)
                          and consistent)
            if check_coords and node_poly:
                child_tag = child.original_tagname_
                child_points = child.get_Coords().points
                child_poly = make_poly(polygon_from_points(child_points))
                if not isinstance(child_poly, Polygon):
                    # report.add_error(CoordinateValidityError(child_tag, child.id, file_id, child_points))
                    # log.debug("Invalid coords of %s %s", child_tag, child.id)
                    # consistent = False
                    pass # already reported in recursive call above
                elif not child_poly.within(node_poly.buffer(PARENT_SLACK)):
                    # TODO: automatic repair?
                    report.add_error(CoordinateConsistencyError(child_tag, child.id, file_id,
                                                                parent_points, child_points))
                    log.debug("Inconsistent coords of %s %s", child_tag, child.id)
                    consistent = False
        if isinstance(node, TextLineType) and check_baseline and node.get_Baseline():
            baseline_points = node.get_Baseline().points
            baseline_line = make_line(polygon_from_points(baseline_points))
            if not isinstance(baseline_line, LineString):
                report.add_error(CoordinateValidityError("Baseline", node_id, file_id,
                                                         baseline_points, baseline_line))
                log.debug("Invalid coords of baseline in %s", node_id)
                consistent = False
            elif node_poly and not baseline_line.within(node_poly.buffer(PARENT_SLACK)):
                report.add_error(CoordinateConsistencyError("Baseline", node_id, file_id,
                                                            parent_points, baseline_points))
                log.debug("Inconsistent coords of baseline in %s %s", tag, node_id)
                consistent = False
        if concatenate_with is not None and page_textequiv_consistency != 'off':
            # validate textual consistency of node with children
            concatenated = concatenate(children, concatenate_with, page_textequiv_strategy,
                                       joinRelations)
            text_results = get_text(node, page_textequiv_strategy)
            if concatenated and text_results and concatenated != text_results:
                consistent = False
                if page_textequiv_consistency == 'fix':
                    log.debug("Repaired text of %s %s", tag, node_id)
                    set_text(node, concatenated, page_textequiv_strategy)
                elif (page_textequiv_consistency == 'strict' # or 'lax' but...
                      or not compare_without_whitespace(concatenated, text_results)):
                    log.debug("Inconsistent text of %s %s", tag, node_id)
                    report.add_error(ConsistencyError(tag, node_id, file_id,
                                                      text_results, concatenated))
    return consistent

def concatenate(nodes, concatenate_with, page_textequiv_strategy, joins=None):
    """
    Concatenate nodes textually according to https://ocr-d.github.io/page#consistency-of-text-results-on-different-levels
    """
    if not nodes:
        return ''
    if not joins:
        joins = []
    result = get_text(nodes[0], page_textequiv_strategy)
    for node, next_node in zip(nodes, nodes[1:]):
        if (node.id, next_node.id) not in joins:
            # TODO: also cover 2-level joins like word-word
            result += concatenate_with
        result += get_text(next_node, page_textequiv_strategy)
    return result.strip()
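
# Example (illustrative, with hypothetical Word elements ``w1``/``w2`` whose
# TextEquiv strings are 'foot' and 'note'): a 'join' Relation between the two
# suppresses the separator that would otherwise be inserted between siblings.
#
#     concatenate([w1, w2], ' ', 'first')                    # 'foot note'
#     concatenate([w1, w2], ' ', 'first', [(w1.id, w2.id)])  # 'footnote'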

def get_text(node, page_textequiv_strategy='first'):
    """
    Get the first or most confident among text results (depending on ``page_textequiv_strategy``).
    For the strategy ``best``, return the string of the highest scoring result.
    For the strategy ``first``, return the string of the lowest indexed result.
    If there are no scores/indexes, use the first result.
    If there are no results, return the empty string.
    """
    log = getLogger('ocrd.page_validator.get_text')
    textEquivs = node.get_TextEquiv()
    if not textEquivs:
        log.debug("No text results on %s %s", node, node.id)
        return ''
    elif page_textequiv_strategy == 'best':
        if len(textEquivs) > 1:
            textEquivsSorted = sorted([x for x in textEquivs if x.conf],
                                      # generateDS does not convert simpleType for attributes (yet?)
                                      key=lambda x: float(x.conf))
            if textEquivsSorted:
                return textEquivsSorted[-1].get_Unicode().strip()
        # fall back to first element
        return textEquivs[0].get_Unicode().strip()
    #elif page_textequiv_strategy == 'first':
    else:
        if len(textEquivs) > 1:
            textEquivsSorted = sorted([x for x in textEquivs if isinstance(x.index, int)],
                                      key=lambda x: x.index)
            if textEquivsSorted:
                return textEquivsSorted[0].get_Unicode().strip()
        # fall back to first element
        return textEquivs[0].get_Unicode().strip()

def set_text(node, text, page_textequiv_strategy):
    """
    Set the first or most confident among text results (depending on ``page_textequiv_strategy``).
    For the strategy ``best``, set the string of the highest scoring result.
    For the strategy ``first``, set the string of the lowest indexed result.
    If there are no scores/indexes, use the first result.
    If there are no results, add a new one.
    """
    text = text.strip()
    textEquivs = node.get_TextEquiv()
    if not textEquivs:
        node.add_TextEquiv(TextEquivType(Unicode=text)) # or index=0 ?
    elif page_textequiv_strategy == 'best':
        if len(textEquivs) > 1:
            textEquivsSorted = sorted([x for x in textEquivs if x.conf],
                                      # generateDS does not convert simpleType for attributes (yet?)
                                      key=lambda x: float(x.conf))
            if textEquivsSorted:
                textEquivsSorted[-1].set_Unicode(text)
                return
        # fall back to first element
        textEquivs[0].set_Unicode(text)
    #elif page_textequiv_strategy == 'first':
    else:
        if len(textEquivs) > 1:
            textEquivsSorted = sorted([x for x in textEquivs if isinstance(x.index, int)],
                                      key=lambda x: x.index)
            if textEquivsSorted:
                textEquivsSorted[0].set_Unicode(text)
                return
        # fall back to first element
        textEquivs[0].set_Unicode(text)
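
# Example (illustrative, with a hypothetical Word element ``word`` carrying two
# TextEquiv entries, index=0/conf=0.6/Unicode='Hello' and index=1/conf=0.9/Unicode='Hallo'):
#
#     get_text(word, 'first')            # 'Hello' (lowest index)
#     get_text(word, 'best')             # 'Hallo' (highest confidence)
#     set_text(word, 'Hullo', 'first')   # overwrites the index=0 entry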

class PageValidator():
    """
    Validator for `OcrdPage <../ocrd_models/ocrd_models.ocrd_page.html>`_.
    """

    @staticmethod
    @deprecated_alias(strictness='page_textequiv_consistency')
    @deprecated_alias(strategy='page_textequiv_strategy')
    def validate(filename=None, ocrd_page=None, ocrd_file=None,
                 page_textequiv_consistency='strict', page_textequiv_strategy='first',
                 check_baseline=True, check_coords=True):
        """
        Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

        Arguments:
            filename (string): Path to PAGE
            ocrd_page (OcrdPage): OcrdPage instance
            ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
            page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
            page_textequiv_strategy (string): Currently only 'first'
            check_baseline (bool): whether Baseline must be fully within TextLine/Coords
            check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
                                 contained within Border/*Region/TextLine/Word, resp.

        Returns:
            report (:class:`ValidationReport`): Report on the validity
        """
        log = getLogger('ocrd.page_validator.validate')
        if ocrd_page:
            page = ocrd_page
            file_id = ocrd_page.get_pcGtsId()
        elif ocrd_file:
            page = page_from_file(ocrd_file)
            file_id = ocrd_file.ID
        elif filename:
            page = parse(filename, silence=True)
            file_id = filename
        else:
            raise ValueError("At least one of ocrd_page, ocrd_file or filename must be set")
        if page_textequiv_strategy not in ('first',):
            raise ValueError("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
        if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
            raise ValueError("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
        report = ValidationReport()
        log.info("Validating input file '%s'", file_id)
        validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy, check_baseline, check_coords, report, file_id)
        return report
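
# Usage sketch (illustrative; the file path is hypothetical):
#
#     from ocrd_validators import PageValidator
#     report = PageValidator.validate(filename='OCR-D-GT-SEG-PAGE/PAGE_0001.xml',
#                                     page_textequiv_consistency='lax')
#     if not report.is_valid:
#         print(report.errors)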