1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
from datetime import datetime |
3
|
|
|
|
4
|
|
|
from os.path import join |
5
|
|
|
from os import environ |
6
|
|
|
from contextlib import contextmanager |
7
|
|
|
import re |
8
|
|
|
import shutil |
9
|
|
|
from lxml import etree as ET |
10
|
|
|
|
11
|
|
|
from tests.base import ( |
12
|
|
|
main, |
13
|
|
|
capture_log, |
14
|
|
|
assets, |
15
|
|
|
) |
16
|
|
|
|
17
|
|
|
from ocrd_utils import ( |
18
|
|
|
VERSION, |
19
|
|
|
MIMETYPE_PAGE |
20
|
|
|
) |
21
|
|
|
from ocrd_models import ( |
22
|
|
|
OcrdMets |
23
|
|
|
) |
24
|
|
|
|
25
|
|
|
import pytest |
26
|
|
|
|
27
|
|
|
# Values used as `params` of the fixtures below, so every test that uses
# those fixtures runs once with cache_flag=False and once with cache_flag=True.
CACHING_ENABLED = [False, True]
28
|
|
|
|
29
|
|
|
|
30
|
|
|
@pytest.fixture(name='sbb_sample_01', params=CACHING_ENABLED)
def _fixture(request):
    """Yield an OcrdMets for the SBB sample METS, once per entry of CACHING_ENABLED."""
    sample_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    yield OcrdMets(filename=sample_url, cache_flag=request.param)
35
|
|
|
|
36
|
|
|
|
37
|
|
|
@pytest.fixture(name='sbb_directory_ocrd_mets', params=CACHING_ENABLED)
def _fixture_sbb(tmp_path, request):
    """Copy the SBB sample data directory into tmp_path and yield an OcrdMets for the copy.

    Working on a copy keeps the shared assets untouched by tests that modify
    the METS. Parametrized over CACHING_ENABLED.
    """
    workspace_copy = tmp_path / 'SBB_directory'
    shutil.copytree(assets.path_to('SBB0000F29300010000/data'), workspace_copy)
    copied_mets = str(join(workspace_copy, 'mets.xml'))
    yield OcrdMets(filename=copied_mets, cache_flag=request.param)
44
|
|
|
|
45
|
|
|
|
46
|
|
|
def test_unique_identifier():
    """The unique identifier can be read from the sample METS and overwritten."""
    sample_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    expected_id = 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000'
    mets = OcrdMets(filename=sample_url)
    assert mets.unique_identifier == expected_id, 'Right identifier'
    mets.unique_identifier = 'foo'
    assert mets.unique_identifier == 'foo', 'Right identifier after change'
51
|
|
|
|
52
|
|
|
|
53
|
|
|
def test_unique_identifier_from_nothing():
    """An empty METS has no unique identifier until one is assigned.

    Also checks that the serialized document mentions the ocrd/core version
    and a CREATEDATE starting with the date handed to ``empty_mets``.
    """
    # Sample the clock exactly once: the same instant is passed to
    # empty_mets() and used for the CREATEDATE expectation, so the check
    # cannot flake when the test happens to run across midnight.
    now = datetime.now()
    mets = OcrdMets.empty_mets(now.isoformat())
    assert mets.unique_identifier is None, 'no identifier'
    mets.unique_identifier = 'foo'
    assert mets.unique_identifier == 'foo', 'Right identifier after change is "foo"'
    as_string = mets.to_xml().decode('utf-8')
    assert 'ocrd/core v%s' % VERSION in as_string
    assert 'CREATEDATE="%sT' % now.date().isoformat() in as_string
61
|
|
|
|
62
|
|
|
|
63
|
|
|
def test_str():
    """str() of an empty METS reports the caching mode and empty fileGrps/files."""
    expectations = (
        (False, 'OcrdMets[cached=False,fileGrps=[],files=[]]'),
        (True, 'OcrdMets[cached=True,fileGrps=[],files=[]]'),
    )
    for cache_flag, expected in expectations:
        assert str(OcrdMets(content='<mets/>', cache_flag=cache_flag)) == expected
68
|
|
|
|
69
|
|
|
|
70
|
|
|
def test_file_groups(sbb_sample_01):
    """The SBB sample METS exposes 17 file groups."""
    group_count = len(sbb_sample_01.file_groups)
    assert group_count == 17, '17 file groups shall be found'
72
|
|
|
|
73
|
|
|
|
74
|
|
|
def test_find_all_files(sbb_sample_01):
    """Exercise the find_all_files() filters on the SBB sample METS.

    Covers fileGrp / include_fileGrp / exclude_fileGrp, ID, pageId, mimetype,
    local_filename and url. The cases use plain values, ``//``-prefixed
    patterns (the "two regexes" case), ``..`` ranges and comma-separated
    lists, and finish with pageId expressions that must raise ValueError.
    """
    mets = sbb_sample_01

    def count(**criteria):
        # Number of files matching the given search criteria.
        return len(mets.find_all_files(**criteria))

    assert count() == 35, '35 files total'
    assert count(fileGrp='OCR-D-IMG') == 3, '3 files in "OCR-D-IMG"'
    assert count(include_fileGrp='OCR-D-IMG') == 3, '3 files in "OCR-D-IMG"'
    assert count(fileGrp='//OCR-D-I.*') == 13, '13 files in "//OCR-D-I.*"'
    assert count(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG']) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG'
    assert count(ID="FILE_0001_IMAGE") == 1, '1 files with ID "FILE_0001_IMAGE"'
    assert count(ID="//FILE_0005_.*") == 1, '1 files with ID "//FILE_0005_.*"'
    assert count(pageId='PHYS_0001') == 17, '17 files for page "PHYS_0001"'
    assert count(mimetype='image/tiff') == 13, '13 image/tiff'
    assert count(mimetype='//application/.*') == 22, '22 application/.*'
    assert count(mimetype=MIMETYPE_PAGE) == 20, '20 ' + MIMETYPE_PAGE
    assert count(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif') == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
    assert count(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif') == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'
    assert count(pageId='PHYS_0001..PHYS_0005') == 35, '35 files for page "PHYS_0001..PHYS_0005"'
    assert count(pageId='//PHYS_000(1|2)') == 34, '34 files in PHYS_0001 and PHYS_0002'
    assert count(pageId='//PHYS_0001,//PHYS_0005') == 18, '18 files in PHYS_0001 and PHYS_0005 (two regexes)'
    # This assertion was duplicated verbatim in the original; once is enough.
    assert count(pageId='//PHYS_0005,PHYS_0001..PHYS_0002') == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
    assert count(pageId='1..10') == 35, '35 files in @ORDER range 1..10'
    assert count(pageId='1..5') == 35, '35 files in @ORDER range 1..5'
    assert count(pageId='PHYS_0001,PHYS_0002,PHYS_0005') == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
    assert count(pageId='PHYS_0001..PHYS_0002,PHYS_0005') == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
    assert count(pageId='page 1..page 2,5') == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
    assert count(pageId='PHYS_0005,1..2') == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'

    # Invalid pageId expressions. `match` is applied with re.search, so plain
    # raw-string patterns suffice (no re.compile, no placeholder-less f-strings).
    with pytest.raises(ValueError, match='differ in their non-numeric part'):
        len(mets.find_all_files(pageId='1..PHYS_0002'))
    with pytest.raises(ValueError, match=r'match(es)? none'):
        mets.find_all_files(pageId='PHYS_0006..PHYS_0029')
    with pytest.raises(ValueError, match=r'match(es)? none'):
        mets.find_all_files(pageId='PHYS_0001-NOTEXIST')
    with pytest.raises(ValueError, match=r'match(es)? none'):
        mets.find_all_files(pageId='1..5,PHYS_0006..PHYS_0029')
    with pytest.raises(ValueError, match=r'match(es)? none'):
        mets.find_all_files(pageId='//PHYS000.*')
    with pytest.raises(ValueError, match=r'Start of range pattern'):
        mets.find_all_files(pageId='PHYS_0000..PHYS_0004')
112
|
|
|
|
113
|
|
|
def test_find_all_files_local_only(sbb_sample_01):
    """With local_only=True, page PHYS_0001 yields 14 files."""
    local_files = sbb_sample_01.find_all_files(pageId='PHYS_0001', local_only=True)
    assert len(local_files) == 14, '14 local files for page "PHYS_0001"'
116
|
|
|
|
117
|
|
|
|
118
|
|
|
def test_physical_pages(sbb_sample_01):
    """physical_pages is a list of three plain str page IDs (not lxml result strings)."""
    pages = sbb_sample_01.physical_pages
    assert len(pages) == 3, '3 physical pages'
    assert isinstance(pages, list)
    first_page = sbb_sample_01.physical_pages[0]
    assert isinstance(first_page, str)
    assert not isinstance(first_page, ET._ElementUnicodeResult)
123
|
|
|
|
124
|
|
|
def test_physical_pages_from_empty_mets():
    """Adding a file with a pageId to a bare <mets> yields one physical page."""
    bare_mets = OcrdMets(content="<mets></mets>")
    pages_before = len(bare_mets.physical_pages)
    assert pages_before == 0, 'no physical page'
    bare_mets.add_file('OUTPUT', ID="foo123", pageId="foobar")
    pages_after = len(bare_mets.physical_pages)
    assert pages_after == 1, '1 physical page'
129
|
|
|
|
130
|
|
|
|
131
|
|
|
def test_physical_pages_for_fileids(sbb_directory_ocrd_mets):
    """get_physical_pages() maps file ID FILE_0002_IMAGE to page PHYS_0002."""
    pages = sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE'])
    assert pages == ['PHYS_0002']
134
|
|
|
|
135
|
|
|
def test_physical_pages_for_emtpy_fileids(sbb_directory_ocrd_mets):
    """get_physical_pages() with an empty list of file IDs returns an empty list."""
    pages = sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=[])
    assert pages == []
138
|
|
|
|
139
|
|
|
|
140
|
|
|
def test_add_group():
    """Adding the same file group twice leaves exactly one file group."""
    mets = OcrdMets.empty_mets()
    assert len(mets.file_groups) == 0, '0 file groups'
    # The second round re-adds the same name and must not add another group.
    for _ in range(2):
        mets.add_file_group('TEST')
        assert len(mets.file_groups) == 1, '1 file groups'
147
|
|
|
|
148
|
|
|
|
149
|
|
|
def test_add_file0():
    """Add two files to one group of an empty METS and reassign their physical pages."""
    mets = OcrdMets.empty_mets()

    def files_in_output():
        # Number of files currently found in fileGrp "OUTPUT".
        return len(list(mets.find_all_files(fileGrp='OUTPUT')))

    assert len(mets.file_groups) == 0, '0 file groups'
    assert files_in_output() == 0, '0 files in "OUTPUT"'
    first = mets.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar")
    # TODO unless pageId/mimetype/fileGrp match raises exception this won't work:
    # adding a second file with the same pageId/mimetype/fileGrp does not raise,
    # so the second add below is expected to succeed.
    second = mets.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar")
    assert first.pageId == 'foobar', 'pageId set'
    assert len(mets.file_groups) == 1, '1 file groups'
    assert files_in_output() == 2, '2 files in "OUTPUT"'

    mets.set_physical_page_for_file('barfoo', first, order='300', orderlabel="page 300")
    assert first.pageId == 'barfoo', 'pageId changed'
    mets.set_physical_page_for_file('quux', second, order='302', orderlabel="page 302")
    assert second.pageId == 'quux', 'pageId changed'
    mets.set_physical_page_for_file('barfoo', second, order='301', orderlabel="page 301")
    assert second.pageId == 'barfoo', 'pageId changed'
    assert len(mets.file_groups) == 1, '1 file group'
169
|
|
|
|
170
|
|
|
|
171
|
|
|
def test_add_file_id_already_exists(sbb_sample_01):
    """Re-adding a file under an existing ID fails unless `ignore` or a matching `force` is used."""
    mets = sbb_sample_01

    def count_with_id():
        # Number of files found under the clashing ID.
        return len(list(mets.find_files(ID='best-id-ever')))

    f = mets.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop")
    assert f.ID == 'best-id-ever', "ID kept"
    with pytest.raises(FileExistsError):
        mets.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep")

    # Still fails because differing mimetypes
    with pytest.raises(FileExistsError):
        mets.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True)

    # Works but is unwise, there are now two files with clashing ID in METS
    mets.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True)
    # Parentheses are required: `assert a == 1 if c else 2` parses as
    # `assert (a == 1) if c else 2`, which in the uncached case asserts the
    # always-truthy constant 2 and therefore checks nothing.
    assert count_with_id() == (1 if mets._cache_flag else 2)

    if mets._cache_flag:
        # Does not work with caching
        with pytest.raises(FileExistsError):
            mets.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True)
    else:
        # Works because fileGrp, mimetype and pageId(== None) match and force is set
        mets.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True)

    # Previous step removed duplicate mets:file
    assert count_with_id() == 1
195
|
|
|
|
196
|
|
|
def test_add_file_nopageid_overwrite(sbb_sample_01: OcrdMets):
    """
    Test that adding a file without pageId a second time under the same ID
    raises FileExistsError when neither ignore nor force is set
    """
    same_file = dict(ID='best-id-ever', mimetype="application/tei+xml")
    with capture_log('ocrd_models.ocrd_mets.add_file'):
        sbb_sample_01.add_file('OUTPUT', **same_file)
        with pytest.raises(FileExistsError):
            sbb_sample_01.add_file('OUTPUT', ignore=False, force=False, **same_file)
204
|
|
|
|
205
|
|
|
def test_add_file_ignore(sbb_sample_01: OcrdMets):
    """Behavior if ignore-Flag set to true:
    delegate responsibility to overwrite existing files to user"""

    the_file = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop")
    assert the_file.ID == 'best-id-ever'
    the_same = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True)
    assert the_same.ID == 'best-id-ever'

    # how many files inserted
    the_files = list(sbb_sample_01.find_files(ID='best-id-ever'))
    # Parenthesized on purpose: without the parentheses the conditional
    # expression swallows the comparison and the uncached case would assert
    # the constant 2, which is always true.
    expected_count = 1 if sbb_sample_01._cache_flag else 2
    assert len(the_files) == expected_count
217
|
|
|
|
218
|
|
|
|
219
|
|
|
def test_add_file_id_invalid(sbb_sample_01):
    """add_file() rejects an ID with invalid syntax."""
    with pytest.raises(Exception) as exc:
        sbb_sample_01.add_file('OUTPUT', ID='1234:::', mimetype="beep/boop")
    # Check the raised exception itself (exc.value); str() of the
    # ExceptionInfo wrapper is a pytest-version-dependent representation.
    assert "Invalid syntax for mets:file/@ID 1234:::" in str(exc.value)
223
|
|
|
|
224
|
|
|
|
225
|
|
|
def test_filegrp_from_file(sbb_sample_01):
    """A file found via fileGrp='OCR-D-IMG' reports that group as its fileGrp."""
    image_files = sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')
    first_image = image_files[0]
    assert first_image.fileGrp == 'OCR-D-IMG'
228
|
|
|
|
229
|
|
|
|
230
|
|
|
def test_add_file_no_id(sbb_sample_01):
    """add_file() without an ID raises with an explanatory message."""
    with pytest.raises(Exception) as exc:
        sbb_sample_01.add_file('FOO')
    # Inspect the exception's own message rather than the ExceptionInfo wrapper.
    assert "Must set ID of the mets:file" in str(exc.value)
234
|
|
|
|
235
|
|
|
|
236
|
|
|
def test_add_file_no_pageid(sbb_sample_01):
    """A file added without pageId has a falsy pageId."""
    added = sbb_sample_01.add_file('OUTPUT', mimetype="bla/quux", ID="foo3")
    assert not added.pageId, 'No pageId available, dude!'
239
|
|
|
|
240
|
|
|
|
241
|
|
|
def test_file_pageid(sbb_sample_01):
    """The first file's pageId is PHYS_0001 and can be reassigned."""
    first_file = sbb_sample_01.find_all_files()[0]
    assert first_file.pageId == 'PHYS_0001'
    new_page_id = 'foo'
    first_file.pageId = new_page_id
    assert first_file.pageId == new_page_id
246
|
|
|
|
247
|
|
|
|
248
|
|
|
def test_agent(sbb_sample_01):
    """add_agent() grows the list of agents by exactly one."""
    agents_before = len(sbb_sample_01.agents)
    sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL')
    agents_after = len(sbb_sample_01.agents)
    assert agents_after == agents_before + 1
252
|
|
|
|
253
|
|
|
def test_metshdr():
    """
    Test whether metsHdr is created on-demand
    """
    bare_mets = OcrdMets(content="<mets></mets>")
    # The root element starts out without children ...
    assert list(bare_mets._tree.getroot()) == []
    bare_mets.add_agent()
    # ... and has exactly one child once an agent was added.
    assert len(bare_mets._tree.getroot()) == 1
261
|
|
|
|
262
|
|
|
|
263
|
|
|
def test_nocontent_nofilename_exception():
    """Constructing OcrdMets without filename or content raises."""
    with pytest.raises(Exception) as exc:
        OcrdMets()
    # Inspect the exception's own message rather than the ExceptionInfo wrapper.
    assert "Must pass 'filename' or 'content' to" in str(exc.value)
267
|
|
|
|
268
|
|
|
|
269
|
|
|
def test_encoding_entities():
    """Non-ASCII characters in an agent name survive serialization via to_xml()."""
    agent_name = 'Őh śéé Áŕ'
    mets = OcrdMets(content="""
    <mets>
      <metsHdr>
        <agent>
          <name>Őh śéé Áŕ</name>
          <note>OCR-D</note>
        </agent>
      </metsHdr>
    </mets>
    """)
    serialized = mets.to_xml().decode('utf-8')
    assert agent_name in serialized
281
|
|
|
|
282
|
|
|
|
283
|
|
|
def test_remove_page(sbb_directory_ocrd_mets):
    """Removing a physical page drops it from physical_pages."""
    # `assert value, expected` only tests truthiness (the second operand is
    # the failure message); compare with `==` so the page lists are checked.
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
    sbb_directory_ocrd_mets.remove_physical_page('PHYS_0001')
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0002', 'PHYS_0005']
287
|
|
|
|
288
|
|
|
|
289
|
|
|
def test_remove_physical_page_fptr(sbb_directory_ocrd_mets):
    """Removing a file's fptr (twice, to show repeats are tolerated) unlinks it from its page."""
    mets = sbb_directory_ocrd_mets
    # Use `==`: the original `assert value, expected` form only tested
    # truthiness and used the expected list as the failure message.
    assert mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']) == ['PHYS_0002']
    mets.remove_physical_page_fptr('FILE_0002_IMAGE')
    mets.remove_physical_page_fptr('FILE_0002_IMAGE')
    assert mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']) == [None]
294
|
|
|
|
295
|
|
|
|
296
|
|
|
def test_remove_page_after_remove_file(sbb_directory_ocrd_mets):
    """Removing the file FILE_0005_IMAGE (by ID) also removes page PHYS_0005."""
    # Compare with `==`; `assert value, expected` would only check truthiness.
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
    sbb_directory_ocrd_mets.remove_one_file('FILE_0005_IMAGE')
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002']
300
|
|
|
|
301
|
|
|
|
302
|
|
|
def test_remove_file_ocrdfile(sbb_directory_ocrd_mets):
    """Removing FILE_0005_IMAGE via its file object also removes page PHYS_0005."""
    # Compare with `==`; `assert value, expected` would only check truthiness.
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
    ocrd_file = sbb_directory_ocrd_mets.find_all_files(ID='FILE_0005_IMAGE')[0]
    sbb_directory_ocrd_mets.remove_one_file(ocrd_file)
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002']
307
|
|
|
|
308
|
|
|
|
309
|
|
|
def test_remove_file_regex(sbb_directory_ocrd_mets):
    """Removing files by the ID pattern //FILE_0005.* also removes page PHYS_0005."""
    # Compare with `==`; `assert value, expected` would only check truthiness.
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
    sbb_directory_ocrd_mets.remove_file('//FILE_0005.*')
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002']
313
|
|
|
|
314
|
|
|
|
315
|
|
|
def test_rename_non_existent_filegroup_exception(sbb_directory_ocrd_mets):
    """Renaming a file group that does not exist raises FileNotFoundError."""
    with pytest.raises(FileNotFoundError) as fnf_exc:
        sbb_directory_ocrd_mets.rename_file_group('FOOBAR', 'FOOBAR')
    # assert on the exception's own message, not on the ExceptionInfo wrapper
    assert "No such fileGrp 'FOOBAR'" in str(fnf_exc.value)
320
|
|
|
|
321
|
|
|
|
322
|
|
|
def test_rename_file_group0(sbb_directory_ocrd_mets):
    """Renaming OCR-D-GT-PAGE to FOOBAR replaces the old name in file_groups."""
    mets = sbb_directory_ocrd_mets
    old_name, new_name = 'OCR-D-GT-PAGE', 'FOOBAR'
    assert new_name not in mets.file_groups

    mets.rename_file_group(old_name, new_name)

    assert old_name not in mets.file_groups
    assert new_name in mets.file_groups
331
|
|
|
|
332
|
|
|
|
333
|
|
|
def test_remove_non_empty_filegroup_exception(sbb_directory_ocrd_mets):
    """Removing a non-empty file group without recursive=True raises."""
    with pytest.raises(Exception) as exc:
        sbb_directory_ocrd_mets.remove_file_group('OCR-D-GT-ALTO')
    # Inspect the exception's own message rather than the ExceptionInfo wrapper.
    assert "not empty" in str(exc.value)
337
|
|
|
|
338
|
|
|
|
339
|
|
|
def test_remove_file_group0(sbb_directory_ocrd_mets):
    """
    Recursively removing OCR-D-GT-PAGE takes away one file group and two files
    """
    mets = sbb_directory_ocrd_mets

    def totals():
        # (number of file groups, number of files) currently in the METS
        return len(mets.file_groups), len(mets.find_all_files())

    assert totals() == (17, 35)
    mets.remove_file_group('OCR-D-GT-PAGE', recursive=True)
    assert totals() == (16, 33)
350
|
|
|
|
351
|
|
|
|
352
|
|
|
def test_remove_file_group_regex(sbb_directory_ocrd_mets):
    """
    Recursively removing the groups matching //OCR-D-GT-.* takes away two
    file groups and four files
    """
    mets = sbb_directory_ocrd_mets

    def totals():
        # (number of file groups, number of files) currently in the METS
        return len(mets.file_groups), len(mets.find_all_files())

    assert totals() == (17, 35)
    mets.remove_file_group('//OCR-D-GT-.*', recursive=True)
    assert totals() == (15, 31)
366
|
|
|
|
367
|
|
|
|
368
|
|
|
def test_merge(sbb_sample_01):
    """Merging the kant_aufklaerung_1784 METS with a fileGrp mapping adds one file group."""
    groups_before = len(sbb_sample_01.file_groups)
    assert groups_before == 17
    kant_mets = OcrdMets(filename=assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
    sbb_sample_01.merge(kant_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'})
    groups_after = len(sbb_sample_01.file_groups)
    assert groups_after == 18
373
|
|
|
|
374
|
|
|
def test_invalid_filegrp():
    """addresses https://github.com/OCR-D/core/issues/746"""
    bare_mets = OcrdMets(content="<mets></mets>")
    bad_group = '1:! bad filegrp'
    with pytest.raises(ValueError) as val_err:
        bare_mets.add_file(bad_group, ID="foo123", pageId="foobar")

    message = str(val_err.value)
    assert "Invalid syntax for mets:fileGrp/@USE" in message
382
|
|
|
|
383
|
|
|
@contextmanager
def temp_env_var(k, v):
    """Temporarily set environment variable `k` to `v` for the duration of a `with` block.

    On exit the previous value is restored, or the variable is removed if it
    was not set before. Restoration also happens when the body raises.

    Args:
        k: name of the environment variable.
        v: value to set while the block is active.
    """
    v_before = environ.get(k, None)
    environ[k] = v
    try:
        yield
    finally:
        # Runs on normal exit and on exceptions, so a failing body cannot
        # leak the temporary value into later tests.
        if v_before is not None:
            environ[k] = v_before
        else:
            # pop() rather than del: the body may already have removed the key.
            environ.pop(k, None)
392
|
|
|
|
393
|
|
|
def test_envvar():
    """OCRD_METS_CACHING in the environment takes precedence over the cache_flag argument."""
    def is_cached(requested):
        # Effective caching state of a METS created with cache_flag=requested.
        sample_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        return OcrdMets(filename=sample_url, cache_flag=requested)._cache_flag

    # Outside the temp_env_var blocks the argument decides.
    assert is_cached(True)
    assert not is_cached(False)
    with temp_env_var('OCRD_METS_CACHING', 'true'):
        assert is_cached(True)
        assert is_cached(False)
    with temp_env_var('OCRD_METS_CACHING', 'false'):
        assert not is_cached(True)
        assert not is_cached(False)
402
|
|
|
|
403
|
|
|
def test_update_physical_page_attributes(sbb_directory_ocrd_mets):
    """ORDER/ORDERLABEL only show up in the XML after update_physical_page_attributes()."""
    mets = sbb_directory_ocrd_mets
    page_attributes = (b'ORDER', b'ORDERLABEL')

    # Start from a METS with no files and thus no physical pages.
    mets.remove_file()
    assert len(mets.physical_pages) == 0

    # Four files on the same page create a single physical page.
    for file_id in ('foo1', 'foo2', 'foo3', 'foo4'):
        mets.add_file('FOO', pageId='new page', ID=file_id, mimetype='foo/bar')
    assert len(mets.physical_pages) == 1

    for attribute in page_attributes:
        assert attribute not in mets.to_xml()
    mets.update_physical_page_attributes('new page', ORDER='foo', ORDERLABEL='bar')
    for attribute in page_attributes:
        assert attribute in mets.to_xml()
417
|
|
|
|
418
|
|
|
|
419
|
|
|
# When executed directly as a script, pass this file to the `main` helper
# imported from tests.base.
if __name__ == '__main__':
    main(__file__)
421
|
|
|
|