1
|
|
|
import json |
2
|
|
|
|
3
|
|
|
from tempfile import TemporaryDirectory |
4
|
|
|
from os.path import join |
5
|
|
|
from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module |
6
|
|
|
from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor |
7
|
|
|
|
8
|
|
|
from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging |
9
|
|
|
from ocrd.resolver import Resolver |
10
|
|
|
from ocrd.processor.base import Processor, run_processor, run_cli |
11
|
|
|
|
12
|
|
|
import pytest |
13
|
|
|
|
14
|
|
|
class TestProcessor(TestCase):
    """Exercise ``Processor``, ``run_processor`` and ``run_cli`` from ``ocrd.processor.base``."""

    def setUp(self):
        """Create a resolver and load the ``SBB0000F29300010000`` asset workspace."""
        super().setUp()
        self.resolver = Resolver()
        self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/data/mets.xml'))

    def test_incomplete_processor(self):
        """``process()`` on ``IncompleteProcessor`` raises 'Must be implemented'."""
        proc = IncompleteProcessor(None)
        with self.assertRaisesRegex(Exception, 'Must be implemented'):
            proc.process()

    def test_no_resolver(self):
        """``run_processor`` without any resolver/workspace asks for a resolver."""
        with self.assertRaisesRegex(Exception, 'pass a resolver to create a workspace'):
            run_processor(DummyProcessor)

    def test_no_mets_url(self):
        """``run_processor`` with a resolver but no METS URL asks for ``mets_url``."""
        with self.assertRaisesRegex(Exception, 'pass mets_url to create a workspace'):
            run_processor(DummyProcessor, resolver=self.resolver)

    def test_no_input_file_grp(self):
        """Accessing ``input_files`` without an input fileGrp raises."""
        processor = run_processor(DummyProcessor,
                                  resolver=self.resolver,
                                  workspace=self.workspace)
        with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
            # Only the property access matters here; the value is discarded.
            _ = processor.input_files

    def test_with_mets_url_input_files(self):
        """``input_files`` yields the two PAGE files of ``OCR-D-SEG-PAGE``."""
        assert len(list(self.workspace.mets.find_files(fileGrp='OCR-D-SEG-PAGE'))) == 2
        processor = run_processor(DummyProcessor,
                                  input_file_grp='OCR-D-SEG-PAGE',
                                  resolver=self.resolver,
                                  workspace=self.workspace)
        assert len(processor.input_files) == 2
        assert [f.mimetype for f in processor.input_files] == [MIMETYPE_PAGE, MIMETYPE_PAGE]

    def test_parameter(self):
        """Parameters loaded from a JSON file are accepted by ``run_processor``."""
        with TemporaryDirectory() as tempdir:
            jsonpath = join(tempdir, 'params.json')
            with open(jsonpath, 'w', encoding='utf-8') as f:
                f.write('{"baz": "quux"}')
            with open(jsonpath, 'r', encoding='utf-8') as f:
                processor = run_processor(
                    DummyProcessor,
                    parameter=json.load(f),
                    input_file_grp="OCR-D-IMG",
                    resolver=self.resolver,
                    workspace=self.workspace
                )
            self.assertEqual(len(processor.input_files), 3)

    def test_verify(self):
        """``verify()`` returns ``True`` for ``DummyProcessor``."""
        proc = DummyProcessor(self.workspace)
        self.assertEqual(proc.verify(), True)

    def test_json(self):
        """Constructing a processor with ``dump_json=True`` does not raise."""
        DummyProcessor(self.workspace, dump_json=True)

    def test_params_missing_required(self):
        """Omitting a required parameter is reported at construction time."""
        with self.assertRaisesRegex(Exception, 'is a required property'):
            DummyProcessorWithRequiredParameters(workspace=self.workspace)

    def test_params(self):
        """A plain ``Processor`` has an empty ``parameter`` dict."""
        proc = Processor(workspace=self.workspace)
        self.assertEqual(proc.parameter, {})

    def test_run_agent(self):
        """``run_processor`` adds exactly one agent to the METS."""
        no_agents_before = len(self.workspace.mets.agents)
        run_processor(DummyProcessor, workspace=self.workspace)
        self.assertEqual(len(self.workspace.mets.agents), no_agents_before + 1, 'one more agent')

    def test_run_input(self):
        """The newest agent carries a note recording the input fileGrp."""
        run_processor(DummyProcessor, workspace=self.workspace, input_file_grp="OCR-D-IMG")
        assert len(self.workspace.mets.agents) > 0
        assert len(self.workspace.mets.agents[-1].notes) > 0
        assert ({'{https://ocr-d.de}option': 'input-file-grp'}, 'OCR-D-IMG') in self.workspace.mets.agents[-1].notes

    def test_run_output0(self):
        """``DummyProcessorWithOutput`` produces one output file per input page."""
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002')
            run_processor(DummyProcessorWithOutput, workspace=ws,
                          input_file_grp="GRP1",
                          output_file_grp="OCR-D-OUT")
            assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2

    def test_run_output_overwrite(self):
        """A pre-existing output file is rejected unless ``overwrite_mode`` is set."""
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002')
            # Seed the output group with a file whose ID the processor is
            # expected to collide with.
            ws.overwrite_mode = True
            ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001')
            ws.overwrite_mode = False
            with pytest.raises(Exception) as exc:
                run_processor(DummyProcessorWithOutput, workspace=ws,
                              input_file_grp="GRP1",
                              output_file_grp="OCR-D-OUT")
            # Inspect the exception only after the block has exited: nothing
            # placed after the raising call inside the block would ever run.
            assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists"
            ws.overwrite_mode = True
            run_processor(DummyProcessorWithOutput, workspace=ws,
                          input_file_grp="GRP1",
                          output_file_grp="OCR-D-OUT")
            assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2

    def test_run_cli(self):
        """``run_cli`` accepts both the full and the minimal set of arguments."""
        with TemporaryDirectory() as tempdir:
            run_processor(DummyProcessor, workspace=self.workspace)
            run_cli(
                'echo',
                mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
                resolver=Resolver(),
                workspace=None,
                page_id='page1',
                log_level='DEBUG',
                input_file_grp='INPUT',
                output_file_grp='OUTPUT',
                parameter='/path/to/param.json',
                working_dir=tempdir
            )
            run_cli(
                'echo',
                mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
                resolver=Resolver(),
            )

    def test_zip_input_files(self):
        """``zip_input_files`` pairs files of two fileGrps page by page."""
        class ZipTestProcessor(Processor):
            """Bare ``Processor`` subclass used only to call ``zip_input_files``."""

        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
            ws.add_file('GRP2', mimetype='application/alto+xml', file_id='foobar2', page_id='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar3', page_id='phys_0002')
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar4', page_id='phys_0002')
            for page_id in [None, 'phys_0001,phys_0002']:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    # No mimetype filter: both pages are paired.
                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in tuples
                    assert ('foobar3', 'foobar4') in tuples
                    # PAGE-only filter: the ALTO file of phys_0001 drops out.
                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
                    assert ('foobar1', None) in tuples
                    # Regex filter covering both PAGE and ALTO.
                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(mimetype=r'//application/(vnd.prima.page|alto)\+xml')]
                    assert ('foobar1', 'foobar2') in tuples
                    assert ('foobar3', 'foobar4') in tuples

    def test_zip_input_files_multi_mixed(self):
        """``zip_input_files`` with several files per page and mixed mimetypes."""
        class ZipTestProcessor(Processor):
            """Bare ``Processor`` subclass used only to call ``zip_input_files``."""

        page_ids = [None, 'phys_0001,phys_0002']
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
            ws.add_file('GRP1', mimetype='image/png', file_id='foobar1img1', page_id='phys_0001')
            ws.add_file('GRP1', mimetype='image/png', file_id='foobar1img2', page_id='phys_0001')
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar3', page_id='phys_0002')
            ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4', page_id='phys_0002')
            for page_id in page_ids:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    # Unfiltered
                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in tuples
                    assert ('foobar3', 'foobar4') in tuples
                    # PAGE-filtered
                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)]
                    assert ('foobar3', None) in tuples
            # GRP2 now holds two non-PAGE files for phys_0002; the outcome
            # depends on the on_error policy.
            ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4dup', page_id='phys_0002')
            for page_id in page_ids:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    tuples = [(one.ID, two.ID) for one, two in proc.zip_input_files(on_error='first')]
                    assert ('foobar1', 'foobar2') in tuples
                    assert ('foobar3', 'foobar4') in tuples
                    tuples = [(one.ID, two) for one, two in proc.zip_input_files(on_error='skip')]
                    assert ('foobar3', None) in tuples
                    with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
                        proc.zip_input_files(on_error='abort')
            # GRP2 now holds two PAGE files for phys_0001; the default call
            # is expected to raise.
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001')
            for page_id in page_ids:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
                        proc.zip_input_files()

    def test_zip_input_files_require_first(self):
        """``require_first=False`` yields ``None`` for the first fileGrp and logs an error."""
        class ZipTestProcessor(Processor):
            """Bare ``Processor`` subclass used only to call ``zip_input_files``."""

        # Read the capture once up front; the result is discarded.
        # NOTE(review): presumably this empties the buffer so the later read
        # only sees this test's output - confirm against tests.base.
        self.capture_out_err()
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id=None)
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001')
            for page_id in [None, 'phys_0001']:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
        r = self.capture_out_err()
        assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err
213
|
|
|
|
214
|
|
|
if __name__ == "__main__":
    # When executed directly as a script, pass this file's path to the
    # `main` helper imported from tests.base.
    main(__file__)
216
|
|
|
|