|
1
|
|
|
# -*- coding: utf-8 -*- |
|
2
|
|
|
from datetime import datetime |
|
3
|
|
|
|
|
4
|
|
|
from os.path import join |
|
5
|
|
|
from os import environ |
|
6
|
|
|
from contextlib import contextmanager |
|
7
|
|
|
import re |
|
8
|
|
|
import shutil |
|
9
|
|
|
from lxml import etree as ET |
|
10
|
|
|
|
|
11
|
|
|
from tests.base import ( |
|
12
|
|
|
main, |
|
13
|
|
|
capture_log, |
|
14
|
|
|
assets, |
|
15
|
|
|
) |
|
16
|
|
|
|
|
17
|
|
|
from ocrd_utils import ( |
|
18
|
|
|
VERSION, |
|
19
|
|
|
MIMETYPE_PAGE |
|
20
|
|
|
) |
|
21
|
|
|
from ocrd_models import ( |
|
22
|
|
|
OcrdMets |
|
23
|
|
|
) |
|
24
|
|
|
|
|
25
|
|
|
import pytest |
|
26
|
|
|
|
|
27
|
|
|
# Values fed to the ``params=`` argument of the fixtures in this module, so
# that each fixture-based test runs once with ``cache_flag=False`` and once
# with ``cache_flag=True``.
CACHING_ENABLED = [False, True]
|
28
|
|
|
|
|
29
|
|
|
|
|
30
|
|
|
@pytest.fixture(name='sbb_sample_01', params=CACHING_ENABLED)
def _fixture(request):
    """Yield an ``OcrdMets`` opened from the SBB0000F29300010000 sample asset.

    The fixture is parametrized over ``CACHING_ENABLED``; ``request.param``
    is forwarded as ``cache_flag``.
    """
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    use_cache = request.param
    yield OcrdMets(filename=mets_url, cache_flag=use_cache)
|
35
|
|
|
|
|
36
|
|
|
|
|
37
|
|
|
@pytest.fixture(name='sbb_directory_ocrd_mets', params=CACHING_ENABLED)
def _fixture_sbb(tmp_path, request):
    """Yield an ``OcrdMets`` backed by a private copy of the SBB sample data.

    The asset directory is copied below ``tmp_path`` first, so tests that
    modify the document work on the copy. ``request.param`` (one of
    ``CACHING_ENABLED``) is forwarded as ``cache_flag``.
    """
    source_dir = assets.path_to('SBB0000F29300010000/data')
    target_dir = tmp_path / 'SBB_directory'
    shutil.copytree(source_dir, target_dir)
    yield OcrdMets(filename=str(join(target_dir, 'mets.xml')), cache_flag=request.param)
|
44
|
|
|
|
|
45
|
|
|
|
|
46
|
|
|
def test_unique_identifier():
    """Read the unique identifier of the SBB sample, then overwrite it."""
    expected_initial = 'http://resolver.staatsbibliothek-berlin.de/SBB0000F29300010000'
    sample = OcrdMets(filename=assets.url_of('SBB0000F29300010000/data/mets.xml'))
    assert sample.unique_identifier == expected_initial, 'Right identifier'

    sample.unique_identifier = 'foo'
    assert sample.unique_identifier == 'foo', 'Right identifier after change'
|
51
|
|
|
|
|
52
|
|
|
|
|
53
|
|
|
def test_unique_identifier_from_nothing():
    """Set a unique identifier on an empty METS and inspect the serialization.

    An empty METS starts without an identifier. After setting one, the
    serialized XML is expected to mention the ocrd/core version and a
    CREATEDATE attribute starting with the date the document was created.
    """
    # Sample the clock exactly once: the same instant is handed to
    # ``empty_mets`` and used for the expected CREATEDATE prefix, so the
    # check cannot straddle a date change between separate ``now()`` calls.
    now = datetime.now()
    mets = OcrdMets.empty_mets(now.isoformat())
    assert mets.unique_identifier is None, 'no identifier'
    mets.unique_identifier = 'foo'
    assert mets.unique_identifier == 'foo', 'Right identifier after change is "foo"'
    as_string = mets.to_xml().decode('utf-8')
    assert 'ocrd/core v%s' % VERSION in as_string
    assert 'CREATEDATE="%04u-%02u-%02uT' % (now.year, now.month, now.day) in as_string
|
61
|
|
|
|
|
62
|
|
|
|
|
63
|
|
|
def test_str():
    """``str()`` of a minimal METS reports the cache flag and empty listings."""
    expectations = (
        (False, 'OcrdMets[cached=False,fileGrps=[],files=[]]'),
        (True, 'OcrdMets[cached=True,fileGrps=[],files=[]]'),
    )
    for cache_flag, expected in expectations:
        document = OcrdMets(content='<mets/>', cache_flag=cache_flag)
        assert str(document) == expected
|
68
|
|
|
|
|
69
|
|
|
|
|
70
|
|
|
def test_file_groups(sbb_sample_01):
    """The SBB sample is expected to expose 17 file groups."""
    group_count = len(sbb_sample_01.file_groups)
    assert group_count == 17, '17 file groups shall be found'
|
72
|
|
|
|
|
73
|
|
|
|
|
74
|
|
|
def test_find_all_files(sbb_sample_01):
    """Exercise the ``find_all_files`` filters against the SBB sample.

    Covers plain and ``//``-prefixed (regex) values for fileGrp, ID,
    mimetype and pageId, fileGrp inclusion/exclusion, lookup by local
    filename and by URL, ``..`` page ranges and comma-separated page lists,
    and finally the ``ValueError`` raised for page selectors that cannot be
    resolved.
    """
    mets = sbb_sample_01
    assert len(mets.find_all_files()) == 35, '35 files total'
    assert len(mets.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
    assert len(mets.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"'
    assert len(mets.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"'
    assert len(mets.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG'
    assert len(mets.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"'
    assert len(mets.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"'
    assert len(mets.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"'
    assert len(mets.find_all_files(mimetype='image/tiff')) == 13, '13 image/tiff'
    assert len(mets.find_all_files(mimetype='//application/.*')) == 22, '22 application/.*'
    assert len(mets.find_all_files(mimetype=MIMETYPE_PAGE)) == 20, '20 ' + MIMETYPE_PAGE
    assert len(mets.find_all_files(local_filename='OCR-D-IMG/FILE_0005_IMAGE.tif')) == 1, '1 FILE xlink:href="OCR-D-IMG/FILE_0005_IMAGE.tif"'
    assert len(mets.find_all_files(url='https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif')) == 1, '1 URL xlink:href="https://github.com/OCR-D/assets/raw/master/data/SBB0000F29300010000/00000001_DESKEW.tif"'

    # Page selection: ranges, regexes, lists and mixtures of them.
    assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0005')) == 35, '35 files for page "PHYS_0001..PHYS_0005"'
    assert len(mets.find_all_files(pageId='//PHYS_000(1|2)')) == 34, '34 files in PHYS_0001 and PHYS_0002'
    assert len(mets.find_all_files(pageId='//PHYS_0001,//PHYS_0005')) == 18, '18 files in PHYS_0001 and PHYS_0005 (two regexes)'
    assert len(mets.find_all_files(pageId='//PHYS_0005,PHYS_0001..PHYS_0002')) == 35, '35 files in //PHYS_0005,PHYS_0001..PHYS_0002'
    assert len(mets.find_all_files(pageId='1..10')) == 35, '35 files in @ORDER range 1..10'
    assert len(mets.find_all_files(pageId='1..5')) == 35, '35 files in @ORDER range 1..5'
    assert len(mets.find_all_files(pageId='PHYS_0001,PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
    assert len(mets.find_all_files(pageId='PHYS_0001..PHYS_0002,PHYS_0005')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
    assert len(mets.find_all_files(pageId='page 1..page 2,5')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'
    assert len(mets.find_all_files(pageId='PHYS_0005,1..2')) == 35, '35 in PHYS_0001,PHYS_0002,PHYS_0005'

    # Selectors that cannot be resolved must raise ValueError.
    with pytest.raises(ValueError, match='differ in their non-numeric part'):
        mets.find_all_files(pageId='1..PHYS_0002')
    no_match = re.compile(r'match(es)? none')
    unresolvable_selectors = (
        'PHYS_0006..PHYS_0029',
        'PHYS_0001-NOTEXIST',
        '1..5,PHYS_0006..PHYS_0029',
        '//PHYS000.*',
    )
    for selector in unresolvable_selectors:
        with pytest.raises(ValueError, match=no_match):
            mets.find_all_files(pageId=selector)
    with pytest.raises(ValueError, match=re.compile(r'Start of range pattern')):
        mets.find_all_files(pageId='PHYS_0000..PHYS_0004')
|
112
|
|
|
|
|
113
|
|
|
def test_find_all_files_local_only(sbb_sample_01):
    """With ``local_only=True``, page PHYS_0001 is expected to yield 14 files."""
    local_files = sbb_sample_01.find_all_files(pageId='PHYS_0001', local_only=True)
    assert len(local_files) == 14, '14 local files for page "PHYS_0001"'
|
116
|
|
|
|
|
117
|
|
|
|
|
118
|
|
|
def test_physical_pages(sbb_sample_01):
    """``physical_pages`` is a list of three plain ``str`` page IDs."""
    pages = sbb_sample_01.physical_pages
    assert len(pages) == 3, '3 physical pages'
    assert isinstance(pages, list)
    first_page = pages[0]
    assert isinstance(first_page, str)
    # Must be a plain str, not lxml's XPath string-result subclass.
    assert not isinstance(first_page, ET._ElementUnicodeResult)
|
123
|
|
|
|
|
124
|
|
|
def test_physical_pages_from_empty_mets():
    """Adding a file with a pageId to a bare METS creates one physical page."""
    bare = OcrdMets(content="<mets></mets>")
    assert len(bare.physical_pages) == 0, 'no physical page'

    bare.add_file('OUTPUT', ID="foo123", pageId="foobar")
    assert len(bare.physical_pages) == 1, '1 physical page'
|
129
|
|
|
|
|
130
|
|
|
|
|
131
|
|
|
def test_physical_pages_for_fileids(sbb_directory_ocrd_mets):
    """Looking up pages by file ID maps FILE_0002_IMAGE to PHYS_0002."""
    pages = sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE'])
    assert pages == ['PHYS_0002']
|
134
|
|
|
|
|
135
|
|
|
def test_physical_pages_for_empty_fileids(sbb_directory_ocrd_mets):
    """An empty list of file IDs yields an empty list of pages."""
    pages = sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=[])
    assert pages == []
|
138
|
|
|
|
|
139
|
|
|
|
|
140
|
|
|
def test_add_group():
    """Adding the same file group twice leaves exactly one group."""
    document = OcrdMets.empty_mets()
    assert len(document.file_groups) == 0, '0 file groups'
    # Second iteration re-adds the existing group; the count must stay at 1.
    for _ in range(2):
        document.add_file_group('TEST')
        assert len(document.file_groups) == 1, '1 file groups'
|
147
|
|
|
|
|
148
|
|
|
|
|
149
|
|
|
def test_add_file0():
    """Add two files to an empty METS and move them between physical pages."""
    document = OcrdMets.empty_mets()
    assert len(document.file_groups) == 0, '0 file groups'
    assert len(list(document.find_all_files(fileGrp='OUTPUT'))) == 0, '0 files in "OUTPUT"'

    first = document.add_file('OUTPUT', ID="foo123", mimetype="bla/quux", pageId="foobar")
    # TODO: a second file with the same pageId/mimetype/fileGrp is currently
    # accepted; a check that this raises cannot work until add_file rejects it.
    second = document.add_file('OUTPUT', ID="foo1232", mimetype="bla/quux", pageId="foobar")
    assert first.pageId == 'foobar', 'pageId set'
    assert len(document.file_groups) == 1, '1 file groups'
    assert len(list(document.find_all_files(fileGrp='OUTPUT'))) == 2, '2 files in "OUTPUT"'

    # (target page, file, order, orderlabel), applied in this sequence.
    reassignments = (
        ('barfoo', first, '300', "page 300"),
        ('quux', second, '302', "page 302"),
        ('barfoo', second, '301', "page 301"),
    )
    for page_id, ocrd_file, order, label in reassignments:
        document.set_physical_page_for_file(page_id, ocrd_file, order=order, orderlabel=label)
        assert ocrd_file.pageId == page_id, 'pageId changed'
    assert len(document.file_groups) == 1, '1 file group'
|
169
|
|
|
|
|
170
|
|
|
|
|
171
|
|
|
def test_add_file_id_already_exists(sbb_sample_01):
    """Check how ``add_file`` reacts to an ID that is already in use.

    Re-adding an existing ID raises ``FileExistsError`` by default and also
    with ``force=True`` when the mimetype differs. With ``ignore=True`` the
    call goes through. The test expects two entries with that ID without
    caching and one with caching. ``force=True`` with matching attributes is
    expected to work only without caching.
    """
    f = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop")
    assert f.ID == 'best-id-ever', "ID kept"
    with pytest.raises(FileExistsError):
        sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep")

    # Still fails because differing mimetypes
    with pytest.raises(FileExistsError):
        sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", force=True)

    # Works but is unwise, there are now two files with clashing ID in METS
    sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True)
    # Compute the expectation first: written inline, ``a == 1 if c else 2``
    # parses as ``(a == 1) if c else 2`` and asserts the constant 2 whenever
    # caching is off.
    expected_count = 1 if sbb_sample_01._cache_flag else 2
    assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == expected_count

    if sbb_sample_01._cache_flag:
        # Does not work with caching
        with pytest.raises(FileExistsError):
            sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True)
    else:
        # Works because fileGrp, mimetype and pageId(== None) match and force is set
        sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop", force=True)

    # Previous step removed duplicate mets:file
    assert len(list(sbb_sample_01.find_files(ID='best-id-ever'))) == 1
|
195
|
|
|
|
|
196
|
|
|
def test_add_file_nopageid_overwrite(sbb_sample_01: OcrdMets):
    """
    Test that adding a file without pageId under an already existing ID
    raises ``FileExistsError`` when neither ``ignore`` nor ``force`` is set.
    """
    with capture_log('ocrd_models.ocrd_mets.add_file'):
        sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="application/tei+xml")
        with pytest.raises(FileExistsError):
            sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="application/tei+xml", ignore=False, force=False)
|
204
|
|
|
|
|
205
|
|
|
def test_add_file_ignore(sbb_sample_01: OcrdMets):
    """Behavior if ignore-Flag set to true:
    delegate responsibility to overwrite existing files to user"""

    the_file = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="beep/boop")
    assert the_file.ID == 'best-id-ever'
    the_same = sbb_sample_01.add_file('OUTPUT', ID='best-id-ever', mimetype="boop/beep", ignore=True)
    assert the_same.ID == 'best-id-ever'

    # how many files inserted
    the_files = list(sbb_sample_01.find_files(ID='best-id-ever'))
    # Compute the expectation first: ``len(x) == 1 if c else 2`` parses as
    # ``(len(x) == 1) if c else 2`` and never fails when caching is off.
    expected_count = 1 if sbb_sample_01._cache_flag else 2
    assert len(the_files) == expected_count
|
217
|
|
|
|
|
218
|
|
|
|
|
219
|
|
|
def test_add_file_id_invalid(sbb_sample_01):
    """``add_file`` rejects an ID with invalid syntax."""
    with pytest.raises(Exception) as exc:
        sbb_sample_01.add_file('OUTPUT', ID='1234:::', mimetype="beep/boop")
    # Inspect the raised exception itself, not the ExceptionInfo wrapper.
    assert "Invalid syntax for mets:file/@ID 1234:::" in str(exc.value)
|
223
|
|
|
|
|
224
|
|
|
|
|
225
|
|
|
def test_filegrp_from_file(sbb_sample_01):
    """A file found via its fileGrp reports that same fileGrp."""
    matches = sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')
    first_match = matches[0]
    assert first_match.fileGrp == 'OCR-D-IMG'
|
228
|
|
|
|
|
229
|
|
|
|
|
230
|
|
|
def test_add_file_no_id(sbb_sample_01):
    """``add_file`` without an ID must fail with an explanatory message."""
    with pytest.raises(Exception) as exc:
        sbb_sample_01.add_file('FOO')
    # Inspect the raised exception itself, not the ExceptionInfo wrapper.
    assert "Must set ID of the mets:file" in str(exc.value)
|
234
|
|
|
|
|
235
|
|
|
|
|
236
|
|
|
def test_add_file_no_pageid(sbb_sample_01):
    """A file added without ``pageId`` has a falsy ``pageId``."""
    added = sbb_sample_01.add_file('OUTPUT', mimetype="bla/quux", ID="foo3")
    assert not added.pageId, 'No pageId available, dude!'
|
239
|
|
|
|
|
240
|
|
|
|
|
241
|
|
|
def test_file_pageid(sbb_sample_01):
    """``pageId`` of the first file can be read and reassigned."""
    first_file = sbb_sample_01.find_all_files()[0]
    assert first_file.pageId == 'PHYS_0001'

    first_file.pageId = 'foo'
    assert first_file.pageId == 'foo'
|
246
|
|
|
|
|
247
|
|
|
|
|
248
|
|
|
def test_agent(sbb_sample_01):
    """``add_agent`` grows the agent list by exactly one entry."""
    count_before = len(sbb_sample_01.agents)
    sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL')
    count_after = len(sbb_sample_01.agents)
    assert count_after == count_before + 1
|
252
|
|
|
|
|
253
|
|
|
def test_metshdr():
    """
    A bare ``<mets>`` root has no children; after ``add_agent`` it has one,
    i.e. the header element is created on demand.
    """
    document = OcrdMets(content="<mets></mets>")
    children_before = list(document._tree.getroot())
    assert not children_before

    document.add_agent()
    assert len(document._tree.getroot()) == 1
|
261
|
|
|
|
|
262
|
|
|
|
|
263
|
|
|
def test_nocontent_nofilename_exception():
    """Constructing ``OcrdMets`` with neither filename nor content must fail."""
    with pytest.raises(Exception) as exc:
        OcrdMets()
    # Inspect the raised exception itself, not the ExceptionInfo wrapper.
    assert "Must pass 'filename' or 'content' to" in str(exc.value)
|
267
|
|
|
|
|
268
|
|
|
|
|
269
|
|
|
def test_encoding_entities():
    """Non-ASCII text in an agent name must survive a ``to_xml`` round trip.

    The document is parsed from an inline string and serialized again; the
    accented name has to appear unchanged in the UTF-8 decoded output.
    """
    mets = OcrdMets(content="""
    <mets>
      <metsHdr>
        <agent>
          <name>Őh śéé Áŕ</name>
          <note>OCR-D</note>
        </agent>
      </metsHdr>
    </mets>
    """)
    assert 'Őh śéé Áŕ' in mets.to_xml().decode('utf-8')
|
281
|
|
|
|
|
282
|
|
|
|
|
283
|
|
|
def test_remove_page(sbb_directory_ocrd_mets):
    """Removing PHYS_0001 leaves the other two physical pages."""
    # ``==`` rather than ``assert value, expected``: with a comma the expected
    # list is only the failure message and nothing is compared.
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
    sbb_directory_ocrd_mets.remove_physical_page('PHYS_0001')
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0002', 'PHYS_0005']
|
287
|
|
|
|
|
288
|
|
|
|
|
289
|
|
|
def test_remove_physical_page_fptr(sbb_directory_ocrd_mets):
    """After removing its fptr, FILE_0002_IMAGE no longer maps to a page."""
    # ``==`` rather than ``assert value, expected``: with a comma the expected
    # list is only the failure message and nothing is compared.
    assert sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']) == ['PHYS_0002']
    sbb_directory_ocrd_mets.remove_physical_page_fptr('FILE_0002_IMAGE')
    # The second removal of the same fptr must not raise.
    sbb_directory_ocrd_mets.remove_physical_page_fptr('FILE_0002_IMAGE')
    assert sbb_directory_ocrd_mets.get_physical_pages(for_fileIds=['FILE_0002_IMAGE']) == [None]
|
294
|
|
|
|
|
295
|
|
|
|
|
296
|
|
|
def test_remove_page_after_remove_file(sbb_directory_ocrd_mets):
    """Removing FILE_0005_IMAGE by ID is expected to drop page PHYS_0005."""
    # ``==`` rather than ``assert value, expected``: with a comma the expected
    # list is only the failure message and nothing is compared.
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
    sbb_directory_ocrd_mets.remove_one_file('FILE_0005_IMAGE')
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002']
|
300
|
|
|
|
|
301
|
|
|
|
|
302
|
|
|
def test_remove_file_ocrdfile(sbb_directory_ocrd_mets):
    """Removing FILE_0005_IMAGE via its file object is expected to drop PHYS_0005."""
    # ``==`` rather than ``assert value, expected``: with a comma the expected
    # list is only the failure message and nothing is compared.
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
    ocrd_file = sbb_directory_ocrd_mets.find_all_files(ID='FILE_0005_IMAGE')[0]
    sbb_directory_ocrd_mets.remove_one_file(ocrd_file)
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002']
|
307
|
|
|
|
|
308
|
|
|
|
|
309
|
|
|
def test_remove_file_regex(sbb_directory_ocrd_mets):
    """Removing files by the regex ``//FILE_0005.*`` is expected to drop PHYS_0005."""
    # ``==`` rather than ``assert value, expected``: with a comma the expected
    # list is only the failure message and nothing is compared.
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002', 'PHYS_0005']
    sbb_directory_ocrd_mets.remove_file('//FILE_0005.*')
    assert sbb_directory_ocrd_mets.physical_pages == ['PHYS_0001', 'PHYS_0002']
|
313
|
|
|
|
|
314
|
|
|
|
|
315
|
|
|
def test_rename_non_existent_filegroup_exception(sbb_directory_ocrd_mets):
    """Renaming a file group that does not exist raises ``FileNotFoundError``."""
    with pytest.raises(FileNotFoundError) as fnf_exc:
        sbb_directory_ocrd_mets.rename_file_group('FOOBAR', 'FOOBAR')
    # assert on the raised exception itself, not the ExceptionInfo wrapper
    assert "No such fileGrp 'FOOBAR'" in str(fnf_exc.value)
|
320
|
|
|
|
|
321
|
|
|
|
|
322
|
|
|
def test_rename_file_group0(sbb_directory_ocrd_mets):
    """Renaming OCR-D-GT-PAGE to FOOBAR swaps the names in ``file_groups``."""
    document = sbb_directory_ocrd_mets
    # arrange: the new name must not be taken yet
    assert 'FOOBAR' not in document.file_groups

    # act
    document.rename_file_group('OCR-D-GT-PAGE', 'FOOBAR')

    # assert
    assert 'OCR-D-GT-PAGE' not in document.file_groups
    assert 'FOOBAR' in document.file_groups
|
331
|
|
|
|
|
332
|
|
|
|
|
333
|
|
|
def test_remove_non_empty_filegroup_exception(sbb_directory_ocrd_mets):
    """Removing a non-empty file group without ``recursive`` must fail."""
    with pytest.raises(Exception) as exc:
        sbb_directory_ocrd_mets.remove_file_group('OCR-D-GT-ALTO')
    # Inspect the raised exception itself, not the ExceptionInfo wrapper.
    assert "not empty" in str(exc.value)
|
337
|
|
|
|
|
338
|
|
|
|
|
339
|
|
|
def test_remove_file_group0(sbb_directory_ocrd_mets):
    """
    Recursively removing OCR-D-GT-PAGE takes away one group and two files.
    """
    document = sbb_directory_ocrd_mets
    assert len(document.file_groups) == 17
    assert len(document.find_all_files()) == 35

    document.remove_file_group('OCR-D-GT-PAGE', recursive=True)

    assert len(document.file_groups) == 16
    assert len(document.find_all_files()) == 33
|
350
|
|
|
|
|
351
|
|
|
|
|
352
|
|
|
def test_remove_file_group_regex(sbb_directory_ocrd_mets):
    """
    Recursively removing groups matching ``//OCR-D-GT-.*`` takes away two
    groups and four files.
    """
    document = sbb_directory_ocrd_mets
    # arrange: baseline counts
    assert len(document.file_groups) == 17
    assert len(document.find_all_files()) == 35

    # act
    document.remove_file_group('//OCR-D-GT-.*', recursive=True)

    # assert
    assert len(document.file_groups) == 15
    assert len(document.find_all_files()) == 31
|
366
|
|
|
|
|
367
|
|
|
|
|
368
|
|
|
def test_merge(sbb_sample_01):
    """Merging another METS with OCR-D-IMG mapped to FOO adds one file group."""
    assert len(sbb_sample_01.file_groups) == 17

    kant_path = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
    kant_mets = OcrdMets(filename=kant_path)
    sbb_sample_01.merge(kant_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'})

    assert len(sbb_sample_01.file_groups) == 18
|
373
|
|
|
|
|
374
|
|
|
def test_invalid_filegrp():
    """addresses https://github.com/OCR-D/core/issues/746"""
    bare = OcrdMets(content="<mets></mets>")

    with pytest.raises(ValueError) as raised:
        bare.add_file('1:! bad filegrp', ID="foo123", pageId="foobar")

    message = str(raised.value)
    assert "Invalid syntax for mets:fileGrp/@USE" in message
|
382
|
|
|
|
|
383
|
|
|
@contextmanager
def temp_env_var(k, v):
    """Temporarily set environment variable ``k`` to ``v``.

    On leaving the ``with`` block the variable is restored to its previous
    value, or removed again if it was not set before. Restoration happens in
    a ``finally`` clause, so it also runs when the managed body raises.

    Args:
        k: name of the environment variable.
        v: value to set for the duration of the block.
    """
    v_before = environ.get(k, None)
    environ[k] = v
    try:
        yield
    finally:
        if v_before is not None:
            environ[k] = v_before
        else:
            # pop() instead of del: the body may already have removed the key.
            environ.pop(k, None)
|
392
|
|
|
|
|
393
|
|
|
def test_envvar():
    """``OCRD_METS_CACHING`` overrides the ``cache_flag`` constructor argument.

    Outside the overrides the argument decides; with the variable set to
    'true' caching is always on, with 'false' always off.
    """
    def cache_flag_of(requested):
        # Open a fresh document each time, as the flag is fixed at construction.
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        return OcrdMets(filename=mets_url, cache_flag=requested)._cache_flag

    assert cache_flag_of(True)
    assert not cache_flag_of(False)
    with temp_env_var('OCRD_METS_CACHING', 'true'):
        assert cache_flag_of(True)
        assert cache_flag_of(False)
    with temp_env_var('OCRD_METS_CACHING', 'false'):
        assert not cache_flag_of(True)
        assert not cache_flag_of(False)
|
402
|
|
|
|
|
403
|
|
|
def test_update_physical_page_attributes(sbb_directory_ocrd_mets):
    """ORDER/ORDERLABEL appear in the XML only after they were set explicitly."""
    document = sbb_directory_ocrd_mets
    document.remove_file()
    assert len(document.physical_pages) == 0

    # Four files, all on the same new page.
    for file_id in ('foo1', 'foo2', 'foo3', 'foo4'):
        document.add_file('FOO', pageId='new page', ID=file_id, mimetype='foo/bar')
    assert len(document.physical_pages) == 1

    for attribute in (b'ORDER', b'ORDERLABEL'):
        assert attribute not in document.to_xml()

    document.update_physical_page_attributes('new page', ORDER='foo', ORDERLABEL='bar')

    for attribute in (b'ORDER', b'ORDERLABEL'):
        assert attribute in document.to_xml()
|
417
|
|
|
|
|
418
|
|
|
|
|
419
|
|
|
# When executed directly as a script, hand this file to the ``main`` helper
# imported from ``tests.base``.
if __name__ == '__main__':
    main(__file__)
|
421
|
|
|
|