Passed
Pull Request — master (#1184)
by Konstantin
03:15
created

tests.test_resolver   A

Complexity

Total Complexity 34

Size/Duplication

Total Lines 307
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 34
eloc 191
dl 0
loc 307
rs 9.68
c 0
b 0
f 0

21 Functions

Rating   Name   Duplication   Size   Complexity  
A test_workspace_from_url_kant_with_resources_existing_local() 0 18 1
A test_resolve_image0() 0 9 1
A test_workspace_from_url_kant() 0 18 1
A _fixture_copy_kant() 0 5 1
A test_resolve_mets_arguments() 0 20 5
A test_download_to_directory_with_badargs() 0 11 2
A test_download_to_directory_default() 0 6 1
A test_workspace_from_url_404() 0 15 2
A test_workspace_from_url_kant_with_resources() 0 23 1
A test_download_to_directory_basename() 0 6 1
A test_workspace_from_nothing() 0 3 1
A test_workspace_from_nothing_makedirs() 0 4 1
A test_workspace_from_nothing_noclobber() 0 13 2
A test_workspace_from_url0() 0 11 1
A request_behavior() 0 10 2
A test_workspace_from_url_with_rel_dir() 0 10 2
A _get_kant_data() 0 5 3
A test_download_to_directory_subdir() 0 6 1
A test_resolve_image_as_pil() 0 10 1
A test_workspace_from_url_bad() 0 6 2
A test_resolve_image_as_pil_deprecated() 0 9 2
1
# -*- coding: utf-8 -*-
2
3
import os
4
import shutil
5
6
from pathlib import (
7
    Path
8
)
9
from requests import Session
10
from unittest.mock import (
11
    patch,
12
    Mock
13
)
14
from PIL import (
15
    Image
16
)
17
18
from ocrd_models.ocrd_page import OcrdPage
19
20
import pytest
21
22
from tests.base import (
23
    assets,
24
    main
25
)
26
27
from ocrd.resolver import Resolver
28
from ocrd_utils import pushd_popd
29
30
31
# set pylint once on module level
32
# pylint: disable=protected-access
33
34
METS_HEROLD = assets.url_of('SBB0000F29300010000/data/mets.xml')
FOLDER_KANT = assets.path_to('kant_aufklaerung_1784')

# Maps a file basename to a tuple (local path below FOLDER_KANT/data, MIME type).
# Each table row is (basename, sub-directories below 'data', MIME type).
DATA_KANT = {
    basename: (os.path.join(FOLDER_KANT, 'data', *subdirs, basename), mimetype)
    for basename, subdirs, mimetype in (
        ('mets.xml', (), 'text/xml'),
        ('INPUT_0017.tif', ('OCR-D-IMG',), 'image/tiff'),
        ('INPUT_0020.tif', ('OCR-D-IMG',), 'image/tiff'),
        ('PAGE_0017_ALTO.xml', ('OCR-D-GT-ALTO',), 'text/xml'),
        ('PAGE_0020_ALTO.xml', ('OCR-D-GT-ALTO',), 'text/xml'),
        ('PAGE_0017_PAGE.xml', ('OCR-D-GT-PAGE',), 'text/xml'),
        ('PAGE_0020_PAGE.xml', ('OCR-D-GT-PAGE',), 'text/xml'),
    )
}
44
45
46
def _get_kant_data(key):
    """Read the local file registered under ``key`` in DATA_KANT.

    Returns a tuple ``(file content as bytes, MIME type)``, or ``None``
    when ``key`` is not a DATA_KANT entry.
    """
    if key not in DATA_KANT:
        return None
    local_path, mimetype = DATA_KANT[key]
    with open(local_path, mode='rb') as handle:
        payload = handle.read()
    return (payload, mimetype)
51
52
53
def request_behavior(*args, **kwargs):
    """Build a mocked HTTP response for the URL given as first positional argument.

    The response always carries status code 200. When the last path segment
    of the URL is a DATA_KANT key, the matching local file becomes the
    response content and its MIME type the 'Content-Type' header; otherwise
    the headers stay empty.
    """
    requested_name = args[0].rsplit('/', 1)[-1]

    response = Mock()
    response.status_code = 200
    response.headers = {}
    if requested_name in DATA_KANT:
        payload, mimetype = _get_kant_data(requested_name)
        response.content = payload
        response.headers = {'Content-Type': mimetype}
    return response
63
64
65
def test_workspace_from_url_bad():
    """Creating a workspace without a METS URL must raise with a telling message."""
    with pytest.raises(Exception) as exc:
        Resolver().workspace_from_url(None)

    # check the message of the raised exception itself; str() of the
    # ExceptionInfo wrapper is only its repr, not the exception message
    assert "Must pass 'mets_url'" in str(exc.value)
71
72
73
@patch.object(Session, "get")
def test_workspace_from_url_kant(mock_request, tmp_path):
    """Create a workspace from the Kant METS URL with a custom METS basename.

    Session.get is patched, so responses come from request_behavior.
    """
    # arrange
    mock_request.side_effect = request_behavior
    mets_url = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
    workspace_dir = tmp_path / 'workspace_kant'
    workspace_dir.mkdir()

    # act
    Resolver().workspace_from_url(mets_url, mets_basename='foo.xml', dst_dir=workspace_dir)

    # assert: the METS landed under the requested basename
    assert os.path.isfile(str(workspace_dir / 'foo.xml'))
    # 1 time data was requested
    assert mock_request.call_count == 1
91
92
93
@patch.object(Session, "get")
def test_workspace_from_url_kant_with_resources(mock_request, tmp_path):
    """Create a workspace from the Kant METS URL with ``download=True``.

    Session.get is patched, so responses come from request_behavior.
    """
    # arrange
    mock_request.side_effect = request_behavior
    mets_url = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
    workspace_dir = tmp_path / 'workspace_kant'
    workspace_dir.mkdir()

    # act
    Resolver().workspace_from_url(
        mets_url,
        mets_basename='kant_aufklaerung_1784.xml',
        dst_dir=workspace_dir,
        download=True)

    # assert files present under local tmp_path
    expected_files = (
        workspace_dir / 'kant_aufklaerung_1784.xml',
        workspace_dir / 'OCR-D-IMG' / 'INPUT_0017.tif',
        workspace_dir / 'OCR-D-GT-PAGE' / 'PAGE_0017_PAGE.xml',
    )
    for expected in expected_files:
        assert os.path.isfile(str(expected))

    # 1 METS/MODS + 2 images + 4 OCR files = 7 requests
    assert mock_request.call_count == 7
116
117
118
@patch.object(Session, "get")
def test_workspace_from_url_kant_with_resources_existing_local(mock_request, tmp_path):
    """With a mets.xml already in the target directory and ``clobber_mets=False``,
    no request must go through the patched Session.get.
    """
    # arrange
    mock_request.side_effect = request_behavior
    mets_url = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
    workspace_dir = tmp_path / 'workspace_kant'
    workspace_dir.mkdir()
    # pre-populate the target directory with a local METS file
    shutil.copyfile(
        Path(assets.path_to('kant_aufklaerung_1784-binarized/data/mets.xml')),
        workspace_dir / 'mets.xml')

    # act
    Resolver().workspace_from_url(mets_url, clobber_mets=False, dst_dir=workspace_dir)

    # assert
    # no real request was made, since mets already present
    assert mock_request.call_count == 0
136
137
138
@patch.object(Session, "get")
def test_workspace_from_url_404(mock_request):
    """Expected behavior when trying to create a workspace from an invalid online target.

    The patched Session.get raises, and that failure must surface to the caller.
    """
    # arrange
    url_404 = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xmlX'
    mock_request.side_effect = Exception('HTTP request failed')

    with pytest.raises(Exception) as exc:
        Resolver().workspace_from_url(mets_url=url_404)

    # assert on the raised exception itself; str() of the ExceptionInfo
    # wrapper is only its repr, not the exception message
    assert "HTTP request failed" in str(exc.value)
    assert mock_request.call_count == 1
153
154
155
def test_workspace_from_url_with_rel_dir(tmp_path):
    """A relative ``dst_dir`` must end up as ``tmp_path`` in the workspace."""
    # tmp_path without its first character, appended to a long chain of '..'
    # (presumably enough segments to climb to the filesystem root -- confirm)
    tmp_without_first_char = str(tmp_path)[1:]
    bogus_dst_dir = '../../../../../../../../../../../../../../../../%s' % tmp_without_first_char

    # act
    with pushd_popd(FOLDER_KANT):
        workspace = Resolver().workspace_from_url('data/mets.xml', dst_dir=bogus_dst_dir)

    # assert
    expected_mets = os.path.join(tmp_path, 'mets.xml')
    assert expected_mets == workspace.mets_target
    assert str(tmp_path) == workspace.directory
165
166
167
def test_workspace_from_url0():
    """Download the first OCR-D-IMG file of the METS_HEROLD workspace and check its ID and local filename."""
    # act
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_image = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]
    downloaded = workspace.download_file(first_image)

    # assert
    assert '%s.tif' % downloaded.ID == 'FILE_0001_IMAGE.tif'
    assert downloaded.local_filename == 'OCR-D-IMG/FILE_0001_IMAGE.tif'
178
179
180
def test_resolve_image0():
    """Resolve the first OCR-D-IMG file of the METS_HEROLD workspace as a PIL image,
    once without and once with the coordinate pair [[0, 0], [1, 1]].
    """
    workspace = Resolver().workspace_from_url(METS_HEROLD)
    first_image = workspace.mets.find_all_files(fileGrp='OCR-D-IMG')[0]
    print(first_image)

    whole_image = workspace._resolve_image_as_pil(first_image.local_filename)
    assert whole_image.size == (2875, 3749)

    tiny_image = workspace._resolve_image_as_pil(first_image.local_filename, [[0, 0], [1, 1]])
    assert tiny_image.size == (1, 1)
189
190
191
@pytest.mark.parametrize(
    "image_url,size_pil",
    [
        ('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png', (1, 1)),
        ('OCR-D-IMG-1BIT/OCR-D-IMG-1BIT_0017.png', (1, 1)),
    ],
)
def test_resolve_image_as_pil(image_url, size_pil):
    """Resolve images of the binarized Kant workspace with the coordinate pair
    [[0, 0], [1, 1]] and compare the resulting PIL size.
    """
    mets_url = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
    workspace = Resolver().workspace_from_url(mets_url)
    resolved = workspace._resolve_image_as_pil(image_url, [[0, 0], [1, 1]])
    assert resolved.size == size_pil
201
202
203
def test_resolve_image_as_pil_deprecated():
    """The public ``resolve_image_as_pil`` must emit exactly one DeprecationWarning."""
    # Build the URL the same way as the other tests in this module do:
    # os.path.join is meant for filesystem paths and would insert the
    # platform's path separator into a URL.
    url_path = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml')
    workspace = Resolver().workspace_from_url(url_path)
    with pytest.warns(DeprecationWarning) as record:
        workspace.resolve_image_as_pil('OCR-D-IMG-NRM/OCR-D-IMG-NRM_0017.png')

    # assert
    assert len(record) == 1
    assert 'Call to deprecated method resolve_image_as_pil.' in str(record[0].message)
212
213
214
def test_workspace_from_nothing():
    """A workspace created without a directory (``None``) must carry a truthy METS."""
    workspace = Resolver().workspace_from_nothing(None)
    assert workspace.mets
217
218
219
def test_workspace_from_nothing_makedirs(tmp_path):
    """A workspace requested in a not-yet-existing directory reports that directory."""
    target_dir = tmp_path / 'target'
    workspace = Resolver().workspace_from_nothing(target_dir)
    assert workspace.directory == target_dir
223
224
225
def test_workspace_from_nothing_noclobber(tmp_path):
    """An attempt to re-create a workspace shall fail because it was already created."""
    ws2 = Resolver().workspace_from_nothing(tmp_path)
    assert ws2.directory == tmp_path

    with pytest.raises(Exception) as exc:
        Resolver().workspace_from_nothing(tmp_path)

    # assert on the raised exception itself: str() of the ExceptionInfo
    # wrapper is only its repr, which escapes characters such as backslashes
    # in the embedded path and so need not contain the message verbatim
    the_msg = "METS 'mets.xml' already exists in '%s' and clobber_mets not set" % tmp_path
    assert the_msg in str(exc.value)
238
239
240
@pytest.mark.parametrize("url,basename,exc_msg",
                         [(None, None, "'url' must be a non-empty string"),
                          (None, 'foo', "'directory' must be a non-empty string")]
                         )
def test_download_to_directory_with_badargs(url, basename, exc_msg):
    """Invalid arguments to ``download_to_directory`` must raise ValueError.

    NOTE(review): both values are passed positionally. The other download
    tests in this module call ``download_to_directory(directory, url)``, so
    ``url`` here presumably fills the directory slot and ``basename`` the
    url slot -- the parameter names of this test are misleading.
    """
    with pytest.raises(ValueError) as exc:
        Resolver().download_to_directory(url, basename)

    # assert exception message contained; look at the raised exception
    # itself rather than the repr of the ExceptionInfo wrapper
    assert exc_msg in str(exc.value)
251
252
253
@pytest.fixture(name='fixture_copy_kant')
def _fixture_copy_kant(tmp_path):
    """Copy the FOLDER_KANT tree to ``tmp_path / 'kant_aufklaerung_1784'`` and yield the copy's path."""
    kant_copy = tmp_path / 'kant_aufklaerung_1784'
    shutil.copytree(FOLDER_KANT, kant_copy)
    yield kant_copy
258
259
260
def test_download_to_directory_default(fixture_copy_kant):
    """Without further options the returned filename is the source's basename."""
    target_dir = fixture_copy_kant.parent
    mets_source = fixture_copy_kant / 'data' / 'mets.xml'

    returned_name = Resolver().download_to_directory(str(target_dir), str(mets_source))

    assert Path(target_dir, returned_name).exists()
    assert returned_name == 'mets.xml'
266
267
268
def test_download_to_directory_basename(fixture_copy_kant):
    """With ``basename='foo'`` the returned filename is 'foo'."""
    target_dir = fixture_copy_kant.parent
    mets_source = fixture_copy_kant / 'data' / 'mets.xml'

    returned_name = Resolver().download_to_directory(
        str(target_dir), str(mets_source), basename='foo')

    assert Path(target_dir, returned_name).exists()
    assert returned_name == 'foo'
274
275
276
def test_download_to_directory_subdir(fixture_copy_kant):
    """With ``subdir='baz'`` the returned filename is prefixed with that sub-directory."""
    target_dir = fixture_copy_kant.parent
    mets_source = fixture_copy_kant / 'data' / 'mets.xml'

    returned_name = Resolver().download_to_directory(
        str(target_dir), str(mets_source), subdir='baz')

    assert Path(target_dir, returned_name).exists()
    assert returned_name == 'baz/mets.xml'
282
283
284
def test_resolve_mets_arguments():
    """
    Check the 4-tuples returned by ``resolve_mets_arguments`` for valid
    argument combinations, and the errors/warnings for invalid ones.

    https://github.com/OCR-D/core/issues/693
    https://github.com/OCR-D/core/issues/517
    """
    resolver = Resolver()
    cwd = Path.cwd()

    # (arguments passed to resolve_mets_arguments, expected return value)
    valid_cases = [
        ((None, None, None, None),
         (str(cwd), str(cwd / 'mets.xml'), 'mets.xml', None)),
        (('/', None, 'mets.xml', None),
         ('/', '/mets.xml', 'mets.xml', None)),
        (('/foo', '/foo/foo.xml', None, None),
         ('/foo', '/foo/foo.xml', 'foo.xml', None)),
        ((None, '/foo/foo.xml', None, None),
         ('/foo', '/foo/foo.xml', 'foo.xml', None)),
        (('/foo', 'foo.xml', None, None),
         ('/foo', '/foo/foo.xml', 'foo.xml', None)),
        (('/foo', 'http://bar/foo.xml', None, None),
         ('/foo', 'http://bar/foo.xml', 'foo.xml', None)),
    ]
    for arguments, expected in valid_cases:
        assert resolver.resolve_mets_arguments(*arguments) == expected

    with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"):
        resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None)

    with pytest.raises(ValueError, match="inconsistent with --directory"):
        resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None)

    with pytest.warns(DeprecationWarning):
        resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None)

    with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"):
        resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None)
304
305
# Script entry point: hand this file's path to the `main` helper imported from tests.base.
if __name__ == '__main__':
    main(__file__)
307