1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
import pytest |
4
|
|
|
|
5
|
|
|
from tests.base import main, assets, create_ocrd_file_with_defaults |
6
|
|
|
|
7
|
|
|
from ocrd_modelfactory import page_from_image |
8
|
|
|
from ocrd_models.ocrd_page_generateds import TextTypeSimpleType |
9
|
|
|
from ocrd_models.ocrd_page import ( |
10
|
|
|
AlternativeImageType, |
11
|
|
|
PcGtsType, |
12
|
|
|
PageType, |
13
|
|
|
TextRegionType, |
14
|
|
|
TextLineType, |
15
|
|
|
OrderedGroupIndexedType, |
16
|
|
|
UnorderedGroupIndexedType, |
17
|
|
|
ReadingOrderType, |
18
|
|
|
RegionRefIndexedType, |
19
|
|
|
WordType, |
20
|
|
|
GlyphType, |
21
|
|
|
|
22
|
|
|
parseString, |
23
|
|
|
parse, |
24
|
|
|
to_xml |
25
|
|
|
) |
26
|
|
|
|
27
|
|
|
# Minimal PAGE-XML document (2019-07-15 namespace) shared by the parsing tests
# in this module: one heading TextRegion containing a single TextLine with a
# single Word whose TextEquiv carries a @conf attribute.
simple_page = """\
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<Metadata>
<Creator>OCR-D</Creator>
<Created>2016-09-20T11:09:27.041+02:00</Created>
<LastChange>2018-04-25T17:44:49.605+01:00</LastChange>
</Metadata>
<Page
imageFilename="https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif"
imageWidth="1457"
imageHeight="2083"
type="content">
<TextRegion type="heading" id="r_1_1" custom="readingOrder {index:0;} structure {type:heading;}">
<Coords points="113,365 919,365 919,439 113,439"/>
<TextLine id="tl_1" primaryLanguage="German" custom="readingOrder {index:0;} textStyle {offset:0; length:26;fontFamily:Arial; fontSize:17.0; bold:true;}">
<Coords points="114,366 918,366 918,438 114,438"/>
<Baseline points="114,429 918,429"/>
<Word id="w_w1aab1b1b2b1b1ab1" language="German" custom="readingOrder {index:0;} textStyle {offset:0; length:11;fontFamily:Arial; fontSize:17.0; bold:true;}">
<Coords points="114,368 442,368 442,437 114,437"/>
<TextEquiv conf="1">
<Unicode>Berliniſche</Unicode>
</TextEquiv>
</Word>
</TextLine>
</TextRegion>
</Page>
</PcGts>
"""
55
|
|
|
|
56
|
|
|
|
57
|
|
|
@pytest.fixture(name='faulty_glyphs')
def _fixture_faulty_glyphs():
    """Yield the parsed FAULTY_GLYPHS.xml asset; exposed to tests as ``faulty_glyphs``."""
    asset_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')
    with open(asset_path, 'rb') as handle:
        raw_xml = handle.read()
    # Each requesting test receives a freshly parsed document.
    yield parseString(raw_xml, silence=True)
63
|
|
|
|
64
|
|
|
|
65
|
|
|
def test_pcgts_id_matches(faulty_glyphs):
    """The parsed document exposes its identifier through the ``pcGtsId`` attribute."""
    expected_id = 'FAULTY_GLYPHS_FILE'
    assert faulty_glyphs.pcGtsId == expected_id
67
|
|
|
|
68
|
|
|
|
69
|
|
|
def test_faulty_glyphs_to_xml(faulty_glyphs):
    """``to_xml`` output carries the ``pc:`` prefix plus the namespace and schema declarations."""
    as_xml = to_xml(faulty_glyphs)
    head = as_xml[:1000]

    assert ' xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"' in head
    # This used to read ``assert '<literal>', as_xml[:1000]``: the comma made
    # the slice the assertion *message* and the non-empty literal the (always
    # true) condition, so nothing was checked. It must be a containment test.
    assert ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"' in head
    assert '<pc:PcGts' in as_xml[0:100]
    assert '<pc:TextRegion' in as_xml[1000:3000]
75
|
|
|
|
76
|
|
|
|
77
|
|
|
def test_to_xml_unicode_nsprefix():
    """see https://github.com/OCR-D/core/pull/474#issuecomment-621477590"""

    # arrange
    xml_path = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml')
    with open(xml_path, 'rb') as f:
        from_xml = f.read()
    source_text = from_xml.decode('utf-8')

    # assert: the input file uses un-prefixed element names ...
    for plain_tag in ('<Unicode>', '<Created'):
        assert plain_tag in source_text, 'without NS prefix'

    # ... while the re-serialized document uses the ``pc:`` prefix.
    as_xml = to_xml(parseString(from_xml, silence=True))
    for prefixed_tag in ('<pc:Unicode>', '<pc:Created>'):
        assert prefixed_tag in as_xml, 'with NS prefix'
91
|
|
|
|
92
|
|
|
|
93
|
|
|
def test_issue_269(faulty_glyphs):
    """
    @conf is parsed as str but should be float
    https://github.com/OCR-D/core/issues/269
    """
    def first_text_equiv():
        # Re-resolve the element on every access, as a caller would.
        return faulty_glyphs.get_Page().get_TextRegion()[0].get_TextEquiv()[0]

    # GIGO: the getter is expected to hand back the very type that was set.
    for value, expected_type in ((1.0, float), ('1.0', str)):
        first_text_equiv().set_conf(value)
        assert type(first_text_equiv().get_conf()) == expected_type
103
|
|
|
|
104
|
|
|
|
105
|
|
|
def test_parse_string_succeeds():
    """parseString with @conf in TextEquiv won't throw an error"""
    parsed = parseString(simple_page, silence=True)
    assert parsed is not None
108
|
|
|
|
109
|
|
|
|
110
|
|
|
def test_delete_region():
    """Deleting an item from the list returned by ``get_TextRegion()`` removes it from the page."""
    page = parseString(simple_page, silence=True).get_Page()
    assert len(page.get_TextRegion()) == 1

    # act
    regions = page.get_TextRegion()
    del regions[0]

    # assert
    assert len(page.get_TextRegion()) == 0
119
|
|
|
|
120
|
|
|
|
121
|
|
|
def test_set_image_filename(faulty_glyphs):
    """``imageFilename`` of the Page can be read and overwritten via plain attribute access."""
    page = faulty_glyphs.get_Page()
    assert page.imageFilename == '00000259.sw.tif'

    # act
    page.imageFilename = 'foo'

    # assert (looked up again through the document)
    assert faulty_glyphs.get_Page().imageFilename == 'foo'
129
|
|
|
|
130
|
|
|
|
131
|
|
|
def test_alternative_image_additions():
    """An AlternativeImage can be attached on every level from Page down to Glyph."""
    pcgts = PcGtsType(pcGtsId="foo")
    assert pcgts.pcGtsId == 'foo'

    # act: build the hierarchy Page > TextRegion > TextLine > Word > Glyph
    page = PageType()
    pcgts.set_Page(page)
    region = TextRegionType()
    page.add_TextRegion(region)
    line = TextLineType()
    region.add_TextLine(line)
    word = WordType()
    line.add_Word(word)
    glyph = GlyphType()
    word.add_Glyph(glyph)

    # ... and give each level its own AlternativeImage
    levels = (page, region, line, word, glyph)
    for element in levels:
        element.add_AlternativeImage(AlternativeImageType())

    # assert: the hierarchy is wired up as built
    assert pcgts.get_Page() is page
    assert page.get_TextRegion()[0] is region
    assert region.get_TextLine()[0] is line
    assert line.get_Word()[0] is word
    assert word.get_Glyph()[0] is glyph

    # assert: every level holds exactly the one image that was added
    for element in levels:
        images = element.get_AlternativeImage()
        assert len(images) == 1
        assert isinstance(images[0], AlternativeImageType)
158
|
|
|
|
159
|
|
|
|
160
|
|
|
def test_simple_types(faulty_glyphs):
    """Region ``type`` values and ``TextTypeSimpleType`` members both behave as plain strings."""
    first_region = faulty_glyphs.get_Page().get_TextRegion()[0]

    # assert: initial value, compared against the constant and the literal
    assert isinstance(first_region.get_type(), str)
    assert first_region.get_type() == TextTypeSimpleType.CREDIT
    assert isinstance(TextTypeSimpleType.CREDIT, str)
    assert first_region.get_type() == 'credit'
    assert isinstance(TextTypeSimpleType.CREDIT, str)

    # setting via the constant yields the literal value on read-back
    first_region.set_type(TextTypeSimpleType.PAGENUMBER)
    assert first_region.get_type() == 'page-number'
    assert isinstance(first_region.get_type(), str)
173
|
|
|
|
174
|
|
|
|
175
|
|
|
def test_orderedgroup_export_order():
    """
    See https://github.com/OCR-D/core/issues/475

    Children of an OrderedGroup are reported (and serialized) in @index order
    even after the indexes have been shuffled in memory.
    """
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    xml_before = to_xml(og)
    children = og.get_AllIndexed()

    # assert
    assert len(children) == 22
    assert [c.index for c in children] == list(range(0, 22))
    # mix up the indexes
    children[0].index = 11
    children[11].index = 3
    children[3].index = 0
    # the list obtained earlier keeps its order and shows the shuffled indexes ...
    assert [c.index for c in children] == [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
    # ... whereas a fresh get_AllIndexed() call is ordered by index again
    assert [c.index for c in og.get_AllIndexed()] == list(range(0, 22))
    assert og.get_AllIndexed()[1].__class__ == OrderedGroupIndexedType
    # serialize and make sure the correct order was serialized
    new_pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
    new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    assert [c.index for c in new_og.get_AllIndexed()] == list(range(0, 22))

    xml_after = to_xml(new_og)
    # TODO why not working?
    #assert xml_after == xml_before
206
|
|
|
|
207
|
|
|
|
208
|
|
|
def test_empty_groups_to_regionrefindexed():
    """
    Corollary of https://github.com/OCR-D/core/issues/475

    After a serialize/parse round trip, the group emptied below (index 1) and
    the UnorderedGroupIndexed at index 21 are both RegionRefIndexed elements.
    """
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    children = og.get_AllIndexed()

    # assert
    assert isinstance(children[1], OrderedGroupIndexedType)
    assert isinstance(children[21], UnorderedGroupIndexedType)
    # empty all the elements in the first OrderedGroupIndexed
    children[1].set_RegionRefIndexed([])
    # serialize and parse to see the empty groups converted
    pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    children = og.get_AllIndexed()
    assert isinstance(children[1], RegionRefIndexedType)
    assert isinstance(children[21], RegionRefIndexedType)
230
|
|
|
|
231
|
|
|
|
232
|
|
|
def test_all_regions_without_reading_order():
    """
    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797

    Region counts returned by ``get_AllRegions`` for various ``depth`` and
    ``classes`` arguments when no ``order`` is passed.
    """
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # act
    pg = pcgts.get_Page()

    # assert
    assert len(pg.get_AllRegions()) == 65
    # depth=0 yields the same count as the default call
    assert len(pg.get_AllRegions(depth=0)) == 65
    assert len(pg.get_AllRegions(depth=1)) == 45
    assert len(pg.get_AllRegions(depth=2)) == 65
    assert len(pg.get_AllRegions(depth=3)) == 65
    assert len(pg.get_AllRegions(classes=['Separator'])) == 25
    assert len(pg.get_AllRegions(classes=['Table'])) == 3
    assert len(pg.get_AllRegions(classes=['Text'])) == 37
    assert len(pg.get_AllRegions(classes=['Text'], depth=1)) == 17
    assert len(pg.get_AllRegions(classes=['Text'], depth=2)) == 37
254
|
|
|
|
255
|
|
|
|
256
|
|
|
def test_get_all_regions_invalid_order_raises_exception():
    """``get_AllRegions`` rejects an unknown ``order`` value with a descriptive message."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pg = parseString(f.read(), silence=True).get_Page()

    # act
    with pytest.raises(Exception) as exc:
        pg.get_AllRegions(order='random')

    # assert
    assert "Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not 'random'" in str(exc.value)
267
|
|
|
|
268
|
|
|
|
269
|
|
|
def test_get_all_regions_invalid_depth_raises_exception():
    """``get_AllRegions`` rejects a negative ``depth`` with a descriptive message."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pg = parseString(f.read(), silence=True).get_Page()

    # act
    with pytest.raises(Exception) as exc:
        pg.get_AllRegions(depth=-1)

    # assert
    assert "Argument 'depth' must be an integer greater-or-equal 0, not '-1'" in str(exc.value)
280
|
|
|
|
281
|
|
|
|
282
|
|
|
def test_all_regions_with_reading_order():
    """
    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797

    Region counts returned by ``get_AllRegions`` for the 'reading-order' and
    'reading-order-only' values of ``order``.
    """

    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pg = parseString(f.read(), silence=True).get_Page()

    # assert
    assert len(pg.get_AllRegions(order='reading-order-only')) == 40
    assert len(pg.get_AllRegions(order='reading-order-only', depth=1)) == 20
    assert len(pg.get_AllRegions(order='reading-order-only', depth=2)) == 40
    assert len(pg.get_AllRegions(order='reading-order', depth=0)) == 65
    assert len(pg.get_AllRegions(order='reading-order', depth=1)) == 45
    assert len(pg.get_AllRegions(order='reading-order', depth=2)) == 65
    assert len(pg.get_AllRegions(classes=['Table'], order='reading-order')) == 3
    assert len(pg.get_AllRegions(classes=['Text'], order='reading-order')) == 37
    assert len(pg.get_AllRegions(classes=['Text'], order='reading-order', depth=1)) == 17
302
|
|
|
|
303
|
|
|
|
304
|
|
|
def test_get_unordered_group_children():
    """The first UnorderedGroupIndexed of the asset's top-level OrderedGroup reports one child."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # act
    ug = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup().get_UnorderedGroupIndexed()[0]

    # assert
    assert len(ug.get_UnorderedGroupChildren()) == 1
314
|
|
|
|
315
|
|
|
|
316
|
|
|
def test_get_all_indexed_classes():
    """``get_AllIndexed(classes=...)`` restricts the result to the named element kinds."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # assert
    assert len(og.get_AllIndexed(classes=['RegionRef'])) == 17
    assert len(og.get_AllIndexed(classes=['OrderedGroup'])) == 3
    assert len(og.get_AllIndexed(classes=['UnorderedGroup'])) == 2
328
|
|
|
|
329
|
|
|
|
330
|
|
|
def test_get_all_indexed_index_sort():
    """``index_sort`` controls whether ``get_AllIndexed`` orders by @index; ``sort_AllIndexed`` sorts in place."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        og = parseString(f.read(), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    unogs = og.get_UnorderedGroupIndexed()

    # assert
    assert [x.index for x in unogs] == [20, 21]
    # swap the two indexes without touching the stored element order
    unogs[0].index = 21
    unogs[1].index = 20
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=True)] == [20, 21]
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)] == [21, 20]
    # after sorting in place, the unsorted view is in index order as well
    og.sort_AllIndexed()
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)] == [20, 21]
346
|
|
|
|
347
|
|
|
|
348
|
|
|
def test_extend_all_indexed_no_validation():
    """Without validation, ``extend_AllIndexed`` appends the elements with indexes 22, 23, 24."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        og = parseString(f.read(), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act: the indexes given here (3, 2, 1) are already used in the group
    og.extend_AllIndexed([
        RegionRefIndexedType(index=3, id='r3'),
        RegionRefIndexedType(index=2, id='r2'),
        RegionRefIndexedType(index=1, id='r1'),
    ])
    rrs = og.get_RegionRefIndexed()

    # assert
    assert [x.index for x in rrs][-3:] == [22, 23, 24]
363
|
|
|
|
364
|
|
|
|
365
|
|
|
def test_get_all_text_lines():
    """``get_AllTextLines`` on the asset page returns 55 lines."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        page = parseString(f.read(), silence=True).get_Page()

    # assert
    assert len(page.get_AllTextLines()) == 55
371
|
|
|
|
372
|
|
|
|
373
|
|
|
def test_extend_all_indexed_validate_continuity():
    """With ``validate_continuity=True``, ``extend_AllIndexed`` refuses an @index already in use."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        og = parseString(f.read(), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    with pytest.raises(Exception) as index_exc:
        og.extend_AllIndexed([
            RegionRefIndexedType(index=3, id='r3'),
            RegionRefIndexedType(index=2, id='r2'),
            RegionRefIndexedType(index=1, id='r1'),
        ], validate_continuity=True)

    # assert
    assert "@index already used: 1" in str(index_exc.value)
387
|
|
|
|
388
|
|
|
|
389
|
|
|
def test_get_all_alternative_image_paths():
    """``get_AllAlternativeImagePaths`` collects file names per selected level (page/region/line)."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # assert
    assert pcgts.get_AllAlternativeImagePaths(page=False, region=False, line=False) == []
    assert pcgts.get_AllAlternativeImagePaths(page=True, region=False, line=False) == [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png']
    # (this check was previously repeated verbatim on two consecutive lines)
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=False)) == 12
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=True)) == 36

    # TODO: Test with word/glyph-level AlternativeImages
    # would work with len == 36
    # assert len(pcgts.get_AllAlternativeImagePaths(word=False)) == 37
410
|
|
|
|
411
|
|
|
|
412
|
|
|
def test_get_AllAlternativeImages():
    """``PageType.get_AllAlternativeImages`` returns AlternativeImageType objects for the selected levels."""
    # arrange
    # Read raw bytes and pass them straight to parseString instead of decoding
    # with the platform's default text encoding and re-encoding afterwards.
    with open(assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)
    page = pcgts.get_Page()

    # assert
    assert page.get_AllAlternativeImages(page=False, region=False, line=False) == []
    assert [x.filename for x in page.get_AllAlternativeImages(page=True, region=False, line=False)] == [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png']
    assert isinstance(page.get_AllAlternativeImages()[0], AlternativeImageType)
425
|
|
|
|
426
|
|
|
|
427
|
|
|
def test_serialize_no_empty_readingorder():
    """
    https://github.com/OCR-D/core/issues/602
    """
    image_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')
    pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=image_path))

    # an empty ReadingOrder is present on the in-memory document ...
    pcgts.get_Page().set_ReadingOrder(ReadingOrderType())
    assert pcgts.get_Page().get_ReadingOrder()

    # ... but is gone after a serialize/parse round trip
    serialized = to_xml(pcgts, skip_declaration=True)
    pcgts = parseString(serialized)
    assert not pcgts.get_Page().get_ReadingOrder()
436
|
|
|
|
437
|
|
|
|
438
|
|
|
def test_hashable():
    """
    https://github.com/OCR-D/ocrd_segment/issues/45

    PAGE model objects must be usable as members of a ``set``.
    """
    pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
    page = pcgts.get_Page()

    # act: adding to a set fails with TypeError for unhashable objects
    testset = set()
    testset.add(pcgts)
    testset.add(page)

    # assert: both objects are kept as distinct members and can be found again
    assert len(testset) == 2
    assert pcgts in testset
    assert page in testset
    # hashing the same object twice gives the same value
    assert hash(pcgts) == hash(pcgts)
    assert hash(page) == hash(page)
449
|
|
|
|
450
|
|
|
|
451
|
|
|
def test_id():
    """
    https://github.com/OCR-D/core/issues/682
    """
    pcgts = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'))

    # assert
    assert pcgts.id == 'PAGE_0017_PAGE'

    # TODO: is this *really* desired?
    # One would expect the ID of the single Page element to resemble the ID of
    # the top-level PcGts container rather than a file name.
    page = pcgts.get_Page()
    assert page.id == 'OCR-D-IMG/INPUT_0017.tif'
464
|
|
|
|
465
|
|
|
|
466
|
|
|
# When executed directly as a script, pass this file to the ``main`` helper
# imported from ``tests.base``.
if __name__ == '__main__':
    main(__file__)
468
|
|
|
|