Passed
Pull Request — master (#1184)
by Konstantin
03:15
created

tests.processor.test_processor   B

Complexity

Total Complexity 46

Size/Duplication

Total Lines 216
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 46
eloc 189
dl 0
loc 216
rs 8.72
c 0
b 0
f 0

19 Methods

Rating   Name   Duplication   Size   Complexity  
A TestProcessor.test_verify() 0 3 1
A TestProcessor.test_zip_input_files() 0 19 4
A TestProcessor.test_no_input_file_grp() 0 6 2
A TestProcessor.test_no_resolver() 0 3 2
A TestProcessor.test_incomplete_processor() 0 4 2
A TestProcessor.test_run_output0() 0 9 2
A TestProcessor.test_json() 0 2 1
A TestProcessor.test_no_mets_url() 0 3 2
A TestProcessor.test_run_input() 0 5 1
A TestProcessor.test_params_missing_required() 0 3 2
A TestProcessor.test_with_mets_url_input_files() 0 8 1
A TestProcessor.test_run_agent() 0 4 1
A TestProcessor.setUp() 0 4 1
A TestProcessor.test_run_cli() 0 19 2
A TestProcessor.test_parameter() 0 14 4
C TestProcessor.test_zip_input_files_multi_mixed() 0 37 10
A TestProcessor.test_zip_input_files_require_first() 0 13 4
A TestProcessor.test_run_output_overwrite() 0 18 3
A TestProcessor.test_params() 0 3 1

How to fix   Complexity   

Complexity

Complex classes like tests.processor.test_processor often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
import json
2
3
from tempfile import TemporaryDirectory
4
from os.path import join
5
from tests.base import CapturingTestCase as TestCase, assets, main # pylint: disable=import-error, no-name-in-module
6
from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor
7
8
from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging
9
from ocrd.resolver import Resolver
10
from ocrd.processor.base import Processor, run_processor, run_cli
11
12
import pytest
13
14
class TestProcessor(TestCase):
15
16
    def setUp(self):
        """Create the resolver and workspace shared by the tests below.

        The workspace is resolved from the METS file of the
        ``SBB0000F29300010000`` asset.
        """
        super().setUp()
        resolver = Resolver()
        self.resolver = resolver
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        self.workspace = resolver.workspace_from_url(mets_url)
20
21
    def test_incomplete_processor(self):
        """Calling ``process()`` on ``IncompleteProcessor`` must raise.

        The exception message has to match ``'Must be implemented'``.
        """
        incomplete = IncompleteProcessor(None)
        self.assertRaisesRegex(Exception, 'Must be implemented', incomplete.process)
25
26
    def test_no_resolver(self):
        """``run_processor`` with only the processor class must raise.

        The exception message has to match
        ``'pass a resolver to create a workspace'``.
        """
        expected_message = 'pass a resolver to create a workspace'
        self.assertRaisesRegex(Exception, expected_message, run_processor, DummyProcessor)
29
30
    def test_no_mets_url(self):
        """``run_processor`` with a resolver but nothing else must raise.

        The exception message has to match
        ``'pass mets_url to create a workspace'``.
        """
        expected_message = 'pass mets_url to create a workspace'
        self.assertRaisesRegex(
            Exception,
            expected_message,
            run_processor,
            DummyProcessor,
            resolver=self.resolver,
        )
33
34
    def test_no_input_file_grp(self):
        """Reading ``input_files`` without an input fileGrp must raise.

        The processor is run with resolver and workspace only; accessing
        ``input_files`` afterwards has to fail with a message matching
        ``'Processor is missing input fileGrp'``.
        """
        run_kwargs = {'resolver': self.resolver, 'workspace': self.workspace}
        ran = run_processor(DummyProcessor, **run_kwargs)
        with self.assertRaisesRegex(Exception, 'Processor is missing input fileGrp'):
            # Only the attribute access matters; the value is discarded.
            _ = ran.input_files
40
41
    def test_with_mets_url_input_files(self):
        """``input_files`` exposes both PAGE files of ``OCR-D-SEG-PAGE``.

        First checks that the workspace METS lists two files in that
        fileGrp, then that the processor reports two input files, both with
        mimetype ``MIMETYPE_PAGE``.
        """
        file_grp = 'OCR-D-SEG-PAGE'
        files_in_mets = list(self.workspace.mets.find_files(fileGrp=file_grp))
        assert len(files_in_mets) == 2
        ran = run_processor(
            DummyProcessor,
            input_file_grp=file_grp,
            resolver=self.resolver,
            workspace=self.workspace,
        )
        assert len(ran.input_files) == 2
        seen_mimetypes = [input_file.mimetype for input_file in ran.input_files]
        assert seen_mimetypes == [MIMETYPE_PAGE, MIMETYPE_PAGE]
49
50
    def test_parameter(self):
        """Run the processor with parameters loaded from a JSON file.

        A ``params.json`` holding ``{"baz": "quux"}`` is written to a
        temporary directory, read back with ``json.load`` and passed as
        ``parameter``; the processor must then report three input files for
        ``OCR-D-IMG``.
        """
        with TemporaryDirectory() as scratch_dir:
            param_path = join(scratch_dir, 'params.json')
            with open(param_path, 'w') as writer:
                writer.write('{"baz": "quux"}')
            with open(param_path, 'r') as reader:
                loaded_params = json.load(reader)
            ran = run_processor(
                DummyProcessor,
                parameter=loaded_params,
                input_file_grp="OCR-D-IMG",
                resolver=self.resolver,
                workspace=self.workspace,
            )
            self.assertEqual(len(ran.input_files), 3)
64
65
    def test_verify(self):
        """``DummyProcessor.verify()`` must return a value equal to ``True``."""
        dummy = DummyProcessor(self.workspace)
        outcome = dummy.verify()
        self.assertEqual(outcome, True)
68
69
    def test_json(self):
        """Constructing ``DummyProcessor`` with ``dump_json=True`` must not raise."""
        options = {'dump_json': True}
        DummyProcessor(self.workspace, **options)
71
72
    def test_params_missing_required(self):
        """Constructing the processor without its parameters must raise.

        The exception message has to match ``'is a required property'``.
        """
        self.assertRaisesRegex(
            Exception,
            'is a required property',
            DummyProcessorWithRequiredParameters,
            workspace=self.workspace,
        )
75
76
    def test_params(self):
        """A plain ``Processor`` built with only a workspace has ``parameter == {}``."""
        bare_processor = Processor(workspace=self.workspace)
        expected_parameter = {}
        self.assertEqual(bare_processor.parameter, expected_parameter)
79
80
    def test_run_agent(self):
        """Running a processor adds exactly one agent to the workspace METS."""
        count_before = len(self.workspace.mets.agents)
        run_processor(DummyProcessor, workspace=self.workspace)
        count_after = len(self.workspace.mets.agents)
        self.assertEqual(count_after, count_before + 1, 'one more agent')
85
86
    def test_run_input(self):
        """The input fileGrp is recorded as a note on the newest METS agent.

        After running with ``input_file_grp="OCR-D-IMG"`` the last agent must
        carry at least one note, among them the ``input-file-grp`` option
        note with value ``'OCR-D-IMG'``.
        """
        run_processor(DummyProcessor, workspace=self.workspace, input_file_grp="OCR-D-IMG")
        agents = self.workspace.mets.agents
        assert len(agents) > 0
        newest_agent = agents[-1]
        assert len(newest_agent.notes) > 0
        expected_note = ({'{https://ocr-d.de}option': 'input-file-grp'}, 'OCR-D-IMG')
        assert expected_note in newest_agent.notes
91
92
    def test_run_output0(self):
        """Two input pages yield two files in the output fileGrp.

        Builds an empty workspace in a temporary directory, adds two PAGE
        files to ``GRP1``, runs ``DummyProcessorWithOutput`` and expects two
        files in ``OCR-D-OUT``.
        """
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            inputs = (('foobar1', 'phys_0001'), ('foobar2', 'phys_0002'))
            for file_id, page in inputs:
                ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID=file_id, pageId=page)
            run_processor(
                DummyProcessorWithOutput,
                workspace=ws,
                input_file_grp="GRP1",
                output_file_grp="OCR-D-OUT",
            )
            produced = ws.mets.find_all_files(fileGrp="OCR-D-OUT")
            assert len(produced) == 2
101
102
    def test_run_output_overwrite(self):
        """Existing output aborts the run unless overwrite mode is enabled.

        A file with ID ``OCR-D-OUT_phys_0001`` is placed in the output
        fileGrp up front. With ``overwrite_mode`` off, running the processor
        must raise; with ``overwrite_mode`` on, the run must succeed and
        leave two files in ``OCR-D-OUT``.
        """
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002')
            # Pre-seed the output group with a file the processor run is
            # expected to collide with.
            ws.overwrite_mode = True
            ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', pageId='phys_0001')
            ws.overwrite_mode = False
            with pytest.raises(Exception) as exc:
                run_processor(DummyProcessorWithOutput, workspace=ws,
                              input_file_grp="GRP1",
                              output_file_grp="OCR-D-OUT")
            # The message check has to live outside the ``raises`` block:
            # statements after the raising call inside the block never run,
            # so the previous in-block assertion was dead code.
            # NOTE(review): only the stable fragment of the message is
            # checked, since the exact wording could not be confirmed here;
            # tighten to the full text once it is verified.
            assert "already exists" in str(exc.value)
            ws.overwrite_mode = True
            run_processor(DummyProcessorWithOutput, workspace=ws,
                          input_file_grp="GRP1",
                          output_file_grp="OCR-D-OUT")
            assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == 2
120
121
    def test_run_cli(self):
        """``run_cli`` accepts both a full and a minimal set of arguments.

        Invokes ``run_cli`` with the ``echo`` executable twice: once with
        every option spelled out (using a temporary working directory) and
        once with only ``mets_url`` and ``resolver``. Neither call may raise.
        """
        with TemporaryDirectory() as tempdir:
            run_processor(DummyProcessor, workspace=self.workspace)
            mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
            full_options = {
                'mets_url': mets_url,
                'resolver': Resolver(),
                'workspace': None,
                'page_id': 'page1',
                'log_level': 'DEBUG',
                'input_file_grp': 'INPUT',
                'output_file_grp': 'OUTPUT',
                'parameter': '/path/to/param.json',
                'working_dir': tempdir,
            }
            run_cli('echo', **full_options)
            run_cli('echo', mets_url=mets_url, resolver=Resolver())
141
142
    def test_zip_input_files(self):
        """``zip_input_files`` pairs files of two fileGrps page by page.

        Checked for both no page selection and an explicit
        ``phys_0001,phys_0002`` selection:

        * unfiltered, both pages yield a ``(GRP1, GRP2)`` ID pair;
        * filtered to ``MIMETYPE_PAGE``, page ``phys_0001`` has no GRP2
          partner (``None``), since its GRP2 file is ALTO;
        * with a regex mimetype covering PAGE and ALTO, both pairs return.
        """
        class ZipTestProcessor(Processor):
            pass

        fixture = (
            ('GRP1', MIMETYPE_PAGE, 'foobar1', 'phys_0001'),
            ('GRP2', 'application/alto+xml', 'foobar2', 'phys_0001'),
            ('GRP1', MIMETYPE_PAGE, 'foobar3', 'phys_0002'),
            ('GRP2', MIMETYPE_PAGE, 'foobar4', 'phys_0002'),
        )
        page_selections = (None, 'phys_0001,phys_0002')
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            for file_grp, mimetype, file_id, page in fixture:
                ws.add_file(file_grp, mimetype=mimetype, file_id=file_id, page_id=page)
            for page_id in page_selections:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)

                    id_pairs = [(first.ID, second.ID) for first, second in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in id_pairs
                    assert ('foobar3', 'foobar4') in id_pairs

                    page_only = [
                        (first.ID, second)
                        for first, second in proc.zip_input_files(mimetype=MIMETYPE_PAGE)
                    ]
                    assert ('foobar1', None) in page_only

                    xml_pattern = r'//application/(vnd.prima.page|alto)\+xml'
                    id_pairs = [
                        (first.ID, second.ID)
                        for first, second in proc.zip_input_files(mimetype=xml_pattern)
                    ]
                    assert ('foobar1', 'foobar2') in id_pairs
                    assert ('foobar3', 'foobar4') in id_pairs
161
162
    def test_zip_input_files_multi_mixed(self):
        """``zip_input_files`` with several files per page and mixed mimetypes.

        Runs in three stages, each for no page selection and for an explicit
        ``phys_0001,phys_0002`` selection:

        1. PAGE plus extra images in GRP1, a single TIFF for ``phys_0002`` in
           GRP2: unfiltered pairing works, PAGE-filtering leaves ``foobar3``
           without a partner.
        2. A second TIFF is added for ``phys_0002`` in GRP2:
           ``on_error='first'`` still pairs ``foobar3`` with ``foobar4``,
           ``on_error='skip'`` yields ``None`` as partner and
           ``on_error='abort'`` raises.
        3. A second PAGE file is added for ``phys_0001`` in GRP2: the default
           call raises.
        """
        class ZipTestProcessor(Processor):
            pass

        initial_files = (
            ('GRP1', MIMETYPE_PAGE, 'foobar1', 'phys_0001'),
            ('GRP1', 'image/png', 'foobar1img1', 'phys_0001'),
            ('GRP1', 'image/png', 'foobar1img2', 'phys_0001'),
            ('GRP2', MIMETYPE_PAGE, 'foobar2', 'phys_0001'),
            ('GRP1', MIMETYPE_PAGE, 'foobar3', 'phys_0002'),
            ('GRP2', 'image/tiff', 'foobar4', 'phys_0002'),
        )
        page_selections = (None, 'phys_0001,phys_0002')
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            for file_grp, mimetype, file_id, page in initial_files:
                ws.add_file(file_grp, mimetype=mimetype, file_id=file_id, page_id=page)

            # Stage 1: one candidate per page and fileGrp in GRP2.
            for page_id in page_selections:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    print("unfiltered")
                    id_pairs = [(first.ID, second.ID) for first, second in proc.zip_input_files()]
                    assert ('foobar1', 'foobar2') in id_pairs
                    assert ('foobar3', 'foobar4') in id_pairs
                    print("PAGE-filtered")
                    page_only = [
                        (first.ID, second)
                        for first, second in proc.zip_input_files(mimetype=MIMETYPE_PAGE)
                    ]
                    assert ('foobar3', None) in page_only

            # Stage 2: a duplicate non-PAGE file for phys_0002 in GRP2.
            ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4dup', page_id='phys_0002')
            for page_id in page_selections:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    take_first = [
                        (first.ID, second.ID)
                        for first, second in proc.zip_input_files(on_error='first')
                    ]
                    assert ('foobar1', 'foobar2') in take_first
                    assert ('foobar3', 'foobar4') in take_first
                    skipped = [
                        (first.ID, second)
                        for first, second in proc.zip_input_files(on_error='skip')
                    ]
                    assert ('foobar3', None) in skipped
                    with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."):
                        proc.zip_input_files(on_error='abort')

            # Stage 3: a duplicate PAGE file for phys_0001 in GRP2.
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001')
            for page_id in page_selections:
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    with self.assertRaisesRegex(Exception, "Multiple PAGE-XML matches for page"):
                        proc.zip_input_files()
199
200
    def test_zip_input_files_require_first(self):
        """``require_first=False`` keeps pages missing from the first fileGrp.

        GRP1 only holds a file without a page ID, GRP2 holds one for
        ``phys_0001``. With ``require_first=False`` the result must be the
        single tuple ``(None, 'foobar2')``, and the captured stderr must
        contain the error about page ``phys_0001`` missing in ``GRP1``.
        """
        class ZipTestProcessor(Processor):
            pass

        # First call precedes the code under test; the second call below
        # returns the captured streams.
        self.capture_out_err()
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id=None)
            ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001')
            for page_id in (None, 'phys_0001'):
                with self.subTest(page_id=page_id):
                    proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
                    zipped = [
                        (first, second.ID)
                        for first, second in proc.zip_input_files(require_first=False)
                    ]
                    assert zipped == [(None, 'foobar2')]
        captured = self.capture_out_err()
        expected_log = 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1'
        assert expected_log in captured.err
213
214
if __name__ == "__main__":
    # Executed as a script: pass this file's path to the project's ``main``
    # helper imported from ``tests.base``.
    main(__file__)
216