|
1
|
|
|
import json |
|
2
|
|
|
|
|
3
|
|
|
from tempfile import TemporaryDirectory |
|
4
|
|
|
from os.path import join |
|
5
|
|
|
from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module |
|
6
|
|
|
from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor |
|
7
|
|
|
|
|
8
|
|
|
from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging |
|
9
|
|
|
from ocrd.resolver import Resolver |
|
10
|
|
|
from ocrd.processor.base import Processor, run_processor, run_cli |
|
11
|
|
|
|
|
12
|
|
|
import pytest |
|
13
|
|
|
|
|
14
|
|
|
class TestProcessor(TestCase):
    """Tests for ``Processor``, ``run_processor`` and ``run_cli``.

    Each test starts from the resolver and workspace prepared in ``setUp``;
    several tests additionally build throw-away workspaces in a temporary
    directory.
    """

    def setUp(self):
        """Create a resolver and load the SBB0000F29300010000 asset workspace."""
        super().setUp()
        self.resolver = Resolver()
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        self.workspace = self.resolver.workspace_from_url(mets_url)

    def test_incomplete_processor(self):
        """``IncompleteProcessor.process()`` must raise 'Must be implemented'."""
        incomplete = IncompleteProcessor(None)
        with self.assertRaisesRegex(Exception, 'Must be implemented'):
            incomplete.process()

    def test_no_resolver(self):
        """``run_processor`` without resolver or workspace must complain about the resolver."""
        expected = 'pass a resolver to create a workspace'
        with self.assertRaisesRegex(Exception, expected):
            run_processor(DummyProcessor)

    def test_no_mets_url(self):
        """``run_processor`` with only a resolver must complain about the missing mets_url."""
        expected = 'pass mets_url to create a workspace'
        with self.assertRaisesRegex(Exception, expected):
            run_processor(DummyProcessor, resolver=self.resolver)

    def test_no_input_file_grp(self):
        """Reading ``input_files`` without an input fileGrp must raise."""
        dummy = run_processor(
            DummyProcessor,
            resolver=self.resolver,
            workspace=self.workspace,
        )
        with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
            _ = dummy.input_files

    def test_with_mets_url_input_files(self):
        """The two OCR-D-SEG-PAGE files are exposed as PAGE input files."""
        seg_page_files = list(self.workspace.mets.find_files(fileGrp='OCR-D-SEG-PAGE'))
        assert len(seg_page_files) == 2
        dummy = run_processor(
            DummyProcessor,
            input_file_grp='OCR-D-SEG-PAGE',
            resolver=self.resolver,
            workspace=self.workspace,
        )
        assert len(dummy.input_files) == 2
        mimetypes = [input_file.mimetype for input_file in dummy.input_files]
        assert mimetypes == [MIMETYPE_PAGE, MIMETYPE_PAGE]

    def test_parameter(self):
        """Parameters loaded from a JSON file are accepted by ``run_processor``."""
        with TemporaryDirectory() as tempdir:
            params_path = join(tempdir, 'params.json')
            # Round-trip through a real file so the parameter dict comes from json.load.
            with open(params_path, 'w') as params_out:
                params_out.write('{"baz": "quux"}')
            with open(params_path, 'r') as params_in:
                dummy = run_processor(
                    DummyProcessor,
                    parameter=json.load(params_in),
                    input_file_grp="OCR-D-IMG",
                    resolver=self.resolver,
                    workspace=self.workspace,
                )
            self.assertEqual(len(dummy.input_files), 3)

    def test_verify(self):
        """``DummyProcessor.verify()`` returns ``True`` for the asset workspace."""
        dummy = DummyProcessor(self.workspace)
        self.assertEqual(dummy.verify(), True)

    def test_json(self):
        """Constructing ``DummyProcessor`` with ``dump_json=True`` must not raise."""
        DummyProcessor(self.workspace, dump_json=True)

    def test_params_missing_required(self):
        """Omitting a required parameter must raise 'is a required property'."""
        with self.assertRaisesRegex(Exception, 'is a required property'):
            DummyProcessorWithRequiredParameters(workspace=self.workspace)

    def test_params(self):
        """A bare ``Processor`` has an empty parameter dict."""
        bare = Processor(workspace=self.workspace)
        self.assertEqual(bare.parameter, {})

    def test_run_agent(self):
        """Running a processor adds exactly one agent to the METS."""
        agents_before = len(self.workspace.mets.agents)
        run_processor(DummyProcessor, workspace=self.workspace)
        agents_after = len(self.workspace.mets.agents)
        self.assertEqual(agents_after, agents_before + 1, 'one more agent')

    def test_run_input(self):
        """The newest agent carries a note recording the input fileGrp."""
        run_processor(DummyProcessor, workspace=self.workspace, input_file_grp="OCR-D-IMG")
        agents = self.workspace.mets.agents
        assert len(agents) > 0
        assert len(agents[-1].notes) > 0
        expected_note = ({'{https://ocr-d.de}option': 'input-file-grp'}, 'OCR-D-IMG')
        assert expected_note in agents[-1].notes

    def test_run_output0(self):
        """``DummyProcessorWithOutput`` leaves two files in OCR-D-OUT for two input pages."""
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002')
            run_processor(
                DummyProcessorWithOutput,
                workspace=ws,
                input_file_grp="GRP1",
                output_file_grp="OCR-D-OUT",
            )
            out_files = ws.mets.find_all_files(fileGrp="OCR-D-OUT")
            assert len(out_files) == 2

    def test_run_output_overwrite(self):
        """A pre-existing output file makes the run fail unless overwrite mode is on."""
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002')
            # Seed the output group with a file that will collide with the processor's output.
            ws.overwrite_mode = True
            ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', pageId='phys_0001')
            ws.overwrite_mode = False
            with pytest.raises(Exception) as exc:
                run_processor(
                    DummyProcessorWithOutput,
                    workspace=ws,
                    input_file_grp="GRP1",
                    output_file_grp="OCR-D-OUT",
                )
                # NOTE(review): the original nesting of this assertion could not be
                # recovered. Placed here it is skipped whenever run_processor raises;
                # confirm whether it is meant to run after the ``with`` block instead.
                assert str(exc.value) == "File with ID='OCR-D-OUT_phys_0001' already exists"
            ws.overwrite_mode = True
            run_processor(
                DummyProcessorWithOutput,
                workspace=ws,
                input_file_grp="GRP1",
                output_file_grp="OCR-D-OUT",
            )
            out_files = ws.mets.find_all_files(fileGrp="OCR-D-OUT")
            assert len(out_files) == 2

    def test_run_cli(self):
        """``run_cli`` can be invoked with a full and with a minimal set of arguments."""
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        with TemporaryDirectory() as tempdir:
            run_processor(DummyProcessor, workspace=self.workspace)
            # Fully specified invocation.
            run_cli(
                'echo',
                mets_url=mets_url,
                resolver=Resolver(),
                workspace=None,
                page_id='page1',
                log_level='DEBUG',
                input_file_grp='INPUT',
                output_file_grp='OUTPUT',
                parameter='/path/to/param.json',
                working_dir=tempdir,
            )
            # Minimal invocation: everything else is left at its default.
            run_cli(
                'echo',
                mets_url=mets_url,
                resolver=Resolver(),
            )

    def test_zip_input_files(self):
        """``zip_input_files`` pairs the files of two fileGrps page by page."""
        class ZipTestProcessor(Processor):
            pass

        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
            ws.add_file('GRP2', mimetype='application/alto+xml', file_id='foobar2', page_id='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar3', page_id='phys_0002')
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar4', page_id='phys_0002')
            for page_id in (None, 'phys_0001,phys_0002'):
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    # No mimetype filter: both pages yield a complete pair.
                    pairs = [(first.ID, second.ID) for first, second in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in pairs
                    assert ('foobar3', 'foobar4') in pairs
                    # PAGE filter: the ALTO file on phys_0001 is not matched.
                    pairs = [
                        (first.ID, second)
                        for first, second in proc.zip_input_files(mimetype=MIMETYPE_PAGE)
                    ]
                    assert ('foobar1', None) in pairs
                    # Regex filter covering both PAGE and ALTO.
                    pairs = [
                        (first.ID, second.ID)
                        for first, second in proc.zip_input_files(
                            mimetype=r'//application/(vnd.prima.page|alto)\+xml')
                    ]
                    assert ('foobar1', 'foobar2') in pairs
                    assert ('foobar3', 'foobar4') in pairs

    def test_zip_input_files_multi_mixed(self):
        """``zip_input_files`` with several files per page and the ``on_error`` modes."""
        class ZipTestProcessor(Processor):
            pass

        page_selections = (None, 'phys_0001,phys_0002')
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
            ws.add_file('GRP1', mimetype='image/png', file_id='foobar1img1', page_id='phys_0001')
            ws.add_file('GRP1', mimetype='image/png', file_id='foobar1img2', page_id='phys_0001')
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar3', page_id='phys_0002')
            ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4', page_id='phys_0002')
            for page_id in page_selections:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    print("unfiltered")
                    pairs = [(first.ID, second.ID) for first, second in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in pairs
                    assert ('foobar3', 'foobar4') in pairs
                    print("PAGE-filtered")
                    pairs = [
                        (first.ID, second)
                        for first, second in proc.zip_input_files(mimetype=MIMETYPE_PAGE)
                    ]
                    assert ('foobar3', None) in pairs

            # Second non-PAGE file for phys_0002 in GRP2.
            ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4dup', page_id='phys_0002')
            for page_id in page_selections:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    pairs = [
                        (first.ID, second.ID)
                        for first, second in proc.zip_input_files(on_error='first')
                    ]
                    assert ('foobar1', 'foobar2') in pairs
                    assert ('foobar3', 'foobar4') in pairs
                    pairs = [
                        (first.ID, second)
                        for first, second in proc.zip_input_files(on_error='skip')
                    ]
                    assert ('foobar3', None) in pairs
                    with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
                        proc.zip_input_files(on_error='abort')

            # Second PAGE file for phys_0001 in GRP2.
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001')
            for page_id in page_selections:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
                        proc.zip_input_files()

    def test_zip_input_files_require_first(self):
        """With ``require_first=False`` a page missing from the first fileGrp yields ``None``."""
        class ZipTestProcessor(Processor):
            pass

        self.capture_out_err()
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id=None)
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001')
            for page_id in (None, 'phys_0001'):
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    pairs = [
                        (first, second.ID)
                        for first, second in proc.zip_input_files(require_first=False)
                    ]
                    assert pairs == [(None, 'foobar2')]
        # NOTE(review): nesting of the two lines below could not be recovered from the
        # source; they are placed at method level -- confirm against the original file.
        captured = self.capture_out_err()
        assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in captured.err
|
213
|
|
|
|
|
214
|
|
|
# Run this module's tests through the project's test runner when executed as a script.
if __name__ == '__main__':
    main(__file__)
|
216
|
|
|
|