1
|
|
|
from os import getcwd |
2
|
|
|
from tempfile import TemporaryDirectory, gettempdir |
3
|
|
|
from pathlib import Path |
4
|
|
|
|
5
|
|
|
from PIL import Image |
6
|
|
|
|
7
|
|
|
from tests.base import TestCase, main, assets, create_ocrd_file |
8
|
|
|
from pytest import raises, warns |
9
|
|
|
from ocrd_utils import ( |
10
|
|
|
abspath, |
11
|
|
|
|
12
|
|
|
assert_file_grp_cardinality, |
13
|
|
|
make_file_id, |
14
|
|
|
|
15
|
|
|
bbox_from_points, |
16
|
|
|
bbox_from_xywh, |
17
|
|
|
|
18
|
|
|
concat_padded, |
19
|
|
|
is_local_filename, |
20
|
|
|
get_local_filename, |
21
|
|
|
is_string, |
22
|
|
|
membername, |
23
|
|
|
generate_range, |
24
|
|
|
sparkline, |
25
|
|
|
|
26
|
|
|
nth_url_segment, |
27
|
|
|
remove_non_path_from_url, |
28
|
|
|
safe_filename, |
29
|
|
|
|
30
|
|
|
parse_json_string_or_file, |
31
|
|
|
set_json_key_value_overrides, |
32
|
|
|
|
33
|
|
|
partition_list, |
34
|
|
|
|
35
|
|
|
points_from_bbox, |
36
|
|
|
points_from_x0y0x1y1, |
37
|
|
|
points_from_xywh, |
38
|
|
|
points_from_polygon, |
39
|
|
|
|
40
|
|
|
polygon_from_points, |
41
|
|
|
polygon_from_x0y0x1y1, |
42
|
|
|
|
43
|
|
|
xywh_from_points, |
44
|
|
|
xywh_from_polygon, |
45
|
|
|
pushd_popd, |
46
|
|
|
|
47
|
|
|
MIME_TO_EXT, EXT_TO_MIME, |
48
|
|
|
MIME_TO_PIL, PIL_TO_MIME, |
49
|
|
|
) |
50
|
|
|
from ocrd_models.utils import xmllint_format |
51
|
|
|
from ocrd_models import OcrdMets |
52
|
|
|
|
53
|
|
|
|
54
|
|
|
def test_abspath():
    """``abspath`` turns the bare ``file:///`` URL into the root path ``/``."""
    resolved = abspath('file:///')
    assert resolved == '/'
56
|
|
|
|
57
|
|
|
def test_points_from_xywh():
    """An x/y/w/h dict is rendered as a four-corner points string."""
    box = {'x': 100, 'y': 100, 'w': 100, 'h': 100}
    expected = '100,100 200,100 200,200 100,200'
    assert points_from_xywh(box) == expected
59
|
|
|
|
60
|
|
|
def test_points_from_bbox():
    """Four bbox coordinates are rendered as a four-corner points string."""
    corners = (100, 100, 200, 200)
    expected = '100,100 200,100 200,200 100,200'
    assert points_from_bbox(*corners) == expected
62
|
|
|
|
63
|
|
|
def test_points_from_polygon():
    """A list of [x, y] pairs is serialised as a points string."""
    polygon = [[100, 100], [200, 100], [200, 200], [100, 200]]
    expected = '100,100 200,100 200,200 100,200'
    assert points_from_polygon(polygon) == expected
65
|
|
|
|
66
|
|
|
def test_polygon_from_x0y0x1y1():
    """An [x0, y0, x1, y1] list expands to the four corner pairs."""
    expected = [[100, 100], [200, 100], [200, 200], [100, 200]]
    actual = polygon_from_x0y0x1y1([100, 100, 200, 200])
    assert actual == expected
68
|
|
|
|
69
|
|
|
def test_points_from_x0y0x1y1():
    """An [x0, y0, x1, y1] list is rendered as a four-corner points string."""
    expected = '100,100 200,100 200,200 100,200'
    actual = points_from_x0y0x1y1([100, 100, 200, 200])
    assert actual == expected
71
|
|
|
|
72
|
|
|
def test_bbox_from_points():
    """A points string collapses to an (x0, y0, x1, y1) tuple."""
    points = '100,100 200,100 200,200 100,200'
    assert bbox_from_points(points) == (100, 100, 200, 200)
74
|
|
|
|
75
|
|
|
def test_bbox_from_xywh():
    """An x/y/w/h dict converts to an (x0, y0, x1, y1) tuple."""
    box = {'x': 100, 'y': 100, 'w': 100, 'h': 100}
    assert bbox_from_xywh(box) == (100, 100, 200, 200)
77
|
|
|
|
78
|
|
|
def test_xywh_from_polygon():
    """A list of [x, y] pairs converts to an x/y/w/h dict."""
    polygon = [[100, 100], [200, 100], [200, 200], [100, 200]]
    expected = {'x': 100, 'y': 100, 'w': 100, 'h': 100}
    assert xywh_from_polygon(polygon) == expected
80
|
|
|
|
81
|
|
|
def test_xywh_from_points():
    """A points string converts to an x/y/w/h dict."""
    points = '100,100 200,100 200,200 100,200'
    expected = {'x': 100, 'y': 100, 'w': 100, 'h': 100}
    assert xywh_from_points(points) == expected
83
|
|
|
|
84
|
|
|
def test_xywh_from_points_unordered():
    """The x/y/w/h result does not depend on the largest point coming last."""
    points = '500,500 100,100 200,100 200,200 100,200'
    expected = {'x': 100, 'y': 100, 'w': 400, 'h': 400}
    assert xywh_from_points(points) == expected
86
|
|
|
|
87
|
|
|
def test_polygon_from_points():
    """A points string is parsed into a list of [x, y] pairs."""
    points = '100,100 200,100 200,200 100,200'
    expected = [[100, 100], [200, 100], [200, 200], [100, 200]]
    assert polygon_from_points(points) == expected
89
|
|
|
|
90
|
|
|
def test_concat_padded():
    """Check the joined output of ``concat_padded`` for int and str parts."""
    cases = [
        (('x', 1), 'x_0001'),
        (('x', 1, 2, 3), 'x_0001_0002_0003'),
        (('x', 1, '2', 3), 'x_0001_2_0003'),
    ]
    for args, expected in cases:
        assert concat_padded(*args) == expected
94
|
|
|
|
95
|
|
|
def test_is_string():
    """Both plain and ``u``-prefixed literals count as strings."""
    for candidate in ('x', u'x'):
        assert is_string(candidate)
98
|
|
|
|
99
|
|
|
def test_xmllint():
    """``xmllint_format`` output equals the input prefixed by an XML declaration."""
    source = '<beep>\n <boop>42</boop>\n</beep>\n'
    declaration = '<?xml version="1.0" encoding="UTF-8"?>\n'
    formatted = xmllint_format(source).decode('utf-8')
    assert formatted == declaration + source
103
|
|
|
|
104
|
|
|
def test_membername():
    """``membername`` finds the attribute name that holds a given value."""
    class Holder:
        def __init__(self):
            self.prop = 42

    assert membername(Holder(), 42) == 'prop'
110
|
|
|
|
111
|
|
|
def test_pil_version():
    """
    Test segfault issue in PIL TiffImagePlugin

    Run the same code multiple times to make segfaults more probable

    Test is failing due to segfaults in Pillow versions:
        6.0.0
        6.1.0

    Test succeeds in Pillow versions:
        5.3.1
        5.4.1
        6.2.0
    """
    for _ in range(10):
        # Use the image as a context manager so every iteration releases its
        # file handle instead of leaving ten open files behind.
        with Image.open(assets.path_to('grenzboten-test/data/OCR-D-IMG-BIN/p179470.tif')) as pil_image:
            pil_image.crop(box=[1539, 202, 1626, 271])
129
|
|
|
|
130
|
|
|
def test_pushd_popd_newcwd():
    """``pushd_popd(path)`` enters *path* and restores the previous cwd on exit."""
    original_cwd = getcwd()
    target = Path(gettempdir()).resolve()
    with pushd_popd(target):
        assert getcwd() == str(target)
    # Checked once: the original repeated this identical assertion twice.
    assert getcwd() == original_cwd
137
|
|
|
|
138
|
|
|
def test_pushd_popd_tempdir():
    """``pushd_popd(tempdir=True)`` enters a directory under the temp root and restores cwd."""
    original_cwd = getcwd()
    temp_root = str(Path(gettempdir()).resolve())
    with pushd_popd(tempdir=True) as newcwd:
        entered = str(newcwd)
        assert getcwd() == entered
        assert entered.startswith(temp_root)
    # Checked once: the original repeated this identical assertion twice.
    assert getcwd() == original_cwd
147
|
|
|
|
148
|
|
|
def test_pushd_popd_bad_call():
    """Passing both a directory and ``tempdir`` to ``pushd_popd`` is rejected."""
    expected_message = 'pushd_popd can accept either newcwd or tempdir, not both'
    with raises(Exception, match=expected_message):
        with pushd_popd('/foo/bar', True):
            pass
152
|
|
|
|
153
|
|
|
def test_is_local_filename():
    """Plain paths and ``file:`` URLs are local; an unknown scheme is not."""
    local_candidates = (
        '/foo/bar',
        'file:///foo/bar',
        'file:/foo/bar',
        'foo/bar',
    )
    for candidate in local_candidates:
        assert is_local_filename(candidate)
    assert not is_local_filename('bad-scheme://foo/bar')
159
|
|
|
|
160
|
|
|
def test_local_filename():
    """Check ``get_local_filename`` with and without a second (prefix) argument."""
    cases = [
        (('/foo/bar',), '/foo/bar'),
        (('file:///foo/bar',), '/foo/bar'),
        (('file:/foo/bar',), '/foo/bar'),
        (('/foo/bar', '/foo/'), 'bar'),
        (('/foo/bar', '/foo'), 'bar'),
        (('foo/bar', 'foo'), 'bar'),
    ]
    for args, expected in cases:
        assert get_local_filename(*args) == expected
167
|
|
|
|
168
|
|
|
def test_remove_non_path_from_url():
    """Query, fragment and trailing slashes are dropped; plain URLs are unchanged."""
    cases = [
        ('https://foo/bar', 'https://foo/bar'),
        ('https://foo//?bar#frag', 'https://foo'),
        ('/path/to/foo#frag', '/path/to/foo'),
    ]
    for url, expected in cases:
        assert remove_non_path_from_url(url) == expected
172
|
|
|
|
173
|
|
|
def test_nth_url_segment():
    """Check ``nth_url_segment`` with the default ``n`` and with explicit ``n``."""
    cases = [
        ('', {}, ''),
        ('foo', {}, 'foo'),
        ('foo', {'n': -1}, 'foo'),
        ('foo', {'n': -2}, ''),
        ('foo/bar', {'n': -2}, 'foo'),
        ('/baz/bar', {'n': -2}, 'baz'),
        ('foo/', {}, 'foo'),
        ('foo//?bar#frag', {}, 'foo'),
        ('/path/to/foo#frag', {}, 'foo'),
        ('/path/to/foo#frag', {'n': -2}, 'to'),
        ('https://server/foo?xyz=zyx', {}, 'foo'),
    ]
    for url, kwargs, expected in cases:
        assert nth_url_segment(url, **kwargs) == expected
185
|
|
|
|
186
|
|
|
def test_parse_json_string_or_file():
    """No argument, blank strings and ``{}`` give ``{}``; JSON objects are decoded."""
    assert parse_json_string_or_file() == {}
    for empty_like in ('', ' ', '{}'):
        assert parse_json_string_or_file(empty_like) == {}
    assert parse_json_string_or_file('{"foo": 32}') == {'foo': 32}
    raw = '{"dpi": -1, "textequiv_level": "word", "overwrite_words": false, "raw_lines": false, "char_whitelist": "", "char_blacklist": "", "char_unblacklist": ""}'
    decoded = {
        "dpi": -1,
        "textequiv_level": "word",
        "overwrite_words": False,
        "raw_lines": False,
        "char_whitelist": "",
        "char_blacklist": "",
        "char_unblacklist": "",
    }
    assert parse_json_string_or_file(raw) == decoded
195
|
|
|
|
196
|
|
|
def test_parameter_file():
    """
    Verify that existing filenames get priority over valid JSON string interpretation
    """
    with TemporaryDirectory() as workdir:
        # XXX yes, the file is called '{"foo":23}'
        json_named_file = Path(workdir) / '{"foo":23}'
        json_named_file.write_text('{"bar": 42}')
        # /tmp/<var>/{"foo":23} -- exists, read file and parse as JSON
        from_path = parse_json_string_or_file(str(json_named_file))
        assert from_path == {'bar': 42}
        # $PWD/{"foo":23} -- does not exist, parse as json
        from_name = parse_json_string_or_file(json_named_file.name)
        assert from_name == {'foo': 23}
207
|
|
|
|
208
|
|
|
def test_parameter_file_comments():
    """A JSON file containing ``#`` comment lines parses to the plain object."""
    commented_json = """\
{
# Metasyntactical variables are rarely imaginative
"foo": 42,
# case in point:
"bar": 23
}"""
    with TemporaryDirectory() as workdir:
        target = Path(workdir) / 'test.json'
        target.write_text(commented_json)
        parsed = parse_json_string_or_file(str(target))
        assert parsed == {'foo': 42, 'bar': 23}
219
|
|
|
|
220
|
|
|
def test_parameters_invalid():
    """A JSON array and malformed JSON each raise ``ValueError`` with a distinct message."""
    bad_inputs = (
        ('[]', 'Not a valid JSON object'),
        ('[}', 'Error parsing'),
    )
    for payload, message in bad_inputs:
        with raises(ValueError, match=message):
            parse_json_string_or_file(payload)
225
|
|
|
|
226
|
|
|
def test_mime_ext():
    """The JPEG 2000 entries agree across the four MIME/extension/PIL tables."""
    mime, ext, pil_format = 'image/jp2', '.jp2', 'JP2'
    assert MIME_TO_EXT[mime] == ext
    assert EXT_TO_MIME[ext] == mime
    assert MIME_TO_PIL[mime] == pil_format
    assert PIL_TO_MIME[pil_format] == mime
231
|
|
|
|
232
|
|
|
|
233
|
|
|
def test_set_json_key_value_overrides():
    """Override values are decoded as JSON where possible, else kept as strings."""
    cases = [
        ('true', True),
        ('false', False),
        ('42', 42),
        ('42.3', 42.3),
        ('["one", 2, 3.33]', ['one', 2, 3.33]),
        ('{"one": 2}', {'one': 2}),
        ('"a string"', 'a string'),
        ('a string', 'a string'),
    ]
    for raw, decoded in cases:
        assert set_json_key_value_overrides({}, ('foo', raw)) == {'foo': decoded}
242
|
|
|
|
243
|
|
|
def test_assert_file_grp_cardinality():
    """A wrong group count raises ``AssertionError``; a matching count passes."""
    too_few = "Expected exactly 5 output file groups, but '.'FOO', 'BAR'.' has 2"
    too_many = "Expected exactly 1 output file group, but '.'FOO', 'BAR'.' has 2"
    too_many_labelled = "Expected exactly 1 output file group .foo bar., but '.'FOO', 'BAR'.' has 2"

    with raises(AssertionError, match=too_few):
        assert_file_grp_cardinality('FOO,BAR', 5)
    with raises(AssertionError, match=too_many):
        assert_file_grp_cardinality('FOO,BAR', 1)
    # Matching cardinality: must not raise.
    assert_file_grp_cardinality('FOO,BAR', 2)
    with raises(AssertionError, match=too_many_labelled):
        assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar')
251
|
|
|
|
252
|
|
|
def test_make_file_id_simple():
    """A file with ID ``MAX_0012`` gets the ID ``FOO_0012`` for group ``FOO``."""
    ocrd_file = create_ocrd_file('MAX', ID="MAX_0012")
    new_id = make_file_id(ocrd_file, 'FOO')
    assert new_id == 'FOO_0012'
255
|
|
|
|
256
|
|
|
def test_make_file_id_mets():
    """Check ``make_file_id`` against a METS that holds parallel FOO/BAR groups."""
    mets = OcrdMets.empty_mets()
    for number in range(1, 10):
        foo_id = 'FOO_%04d' % number
        bar_id = 'BAR_%04d' % number
        mets.add_file('FOO', ID=foo_id, mimetype="image/tiff", pageId=foo_id)
        mets.add_file('BAR', ID=bar_id, mimetype="image/tiff", pageId=bar_id)
    bar_seven = mets.find_all_files(ID='BAR_0007')[0]
    assert make_file_id(bar_seven, 'FOO') == 'FOO_0007'
    extra = mets.add_file('ABC', ID="BAR_42", mimetype="image/tiff")
    mets.remove_file(fileGrp='FOO')
    assert make_file_id(extra, 'FOO') == 'FOO_BAR_42'
    mets.add_file('FOO', ID="FOO_0001", mimetype="image/tiff")
266
|
|
|
|
267
|
|
|
def test_make_file_id_570():
    """https://github.com/OCR-D/core/pull/570"""
    mets = OcrdMets.empty_mets()
    first = mets.add_file('GRP', ID='FOO_0001', pageId='phys0001')
    mets.add_file('GRP', ID='GRP2_0001', pageId='phys0002')
    generated = make_file_id(first, 'GRP2')
    assert generated == 'GRP2_phys0001'
273
|
|
|
|
274
|
|
|
def test_make_file_id_605():
    """
    https://github.com/OCR-D/core/pull/605
    Also: Same fileGrp!
    """
    mets = OcrdMets.empty_mets()
    # The first file only populates the METS; its return value was previously
    # bound to ``f`` and immediately overwritten, so it is no longer kept.
    mets.add_file('GRP1', ID='FOO_0001', pageId='phys0001')
    f = mets.add_file('GRP2', ID='FOO_0002', pageId='phys0002')
    # NB: same fileGrp
    assert make_file_id(f, 'GRP2') == 'FOO_0002'
    assert make_file_id(f, 'GRP3') == 'GRP3_phys0002'
285
|
|
|
|
286
|
|
|
def test_make_file_id_744():
    """
    https://github.com/OCR-D/core/pull/744
    > Often file IDs have two numbers, one of which will clash. In that case only the numerical fallback works.
    """
    mets = OcrdMets.empty_mets()
    # The first file only populates the METS; its return value was previously
    # bound to ``f`` and immediately overwritten, so it is no longer kept.
    mets.add_file('GRP2', ID='img1796-97_00000024_img', pageId='phys0024')
    f = mets.add_file('GRP2', ID='img1796-97_00000025_img', pageId='phys0025')
    assert make_file_id(f, 'GRP3') == 'GRP3_phys0025'
295
|
|
|
|
296
|
|
|
def test_generate_range():
    """Check ``generate_range`` for a valid range, its error cases and its warning."""
    assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005']
    # Inside ``raises`` the call itself must raise, so wrapping it in
    # ``assert ... == 0`` was never evaluated and only suggested a bogus
    # expected value; the calls are made bare instead.
    with raises(ValueError, match='could not find numeric part'):
        generate_range('NONUMBER', 'ALSO_NONUMBER')
    with raises(ValueError, match='differ in their non-numeric part'):
        generate_range('PHYS_0001_123', 'PHYS_0010_123')
    with raises(ValueError, match='differ in their non-numeric part'):
        generate_range('1', 'PHYS_0005')
    with raises(ValueError, match='differ in their non-numeric part'):
        generate_range('1', 'page 5')
    # ``warns`` does not abort the call, so the return value is still checked.
    with warns(UserWarning, match='same number'):
        assert generate_range('PHYS_0001_123', 'PHYS_0001_123') == ['PHYS_0001_123']
308
|
|
|
|
309
|
|
|
def test_safe_filename():
    """Check ``safe_filename`` on ASCII, Greek and Japanese input."""
    cases = [
        ('Hello world,!', 'Hello_world_'),
        (' Καλημέρα κόσμε,', '_Καλημέρα_κόσμε_'),
        (':コンニチハ:', '_コンニチハ_'),
    ]
    for raw, sanitized in cases:
        assert safe_filename(raw) == sanitized
313
|
|
|
|
314
|
|
|
def test_partition_list():
    """Exercise ``partition_list`` chunking, single-chunk selection and error cases."""
    lst_10 = list(range(1, 11))
    assert partition_list(None, 1) == []
    assert partition_list([], 1) == []
    assert partition_list(lst_10, 1) == [lst_10]
    assert partition_list(lst_10, 3) == [[1, 2, 3, 4], [5, 6, 7], [8, 9, 10]]
    assert partition_list(lst_10, 3, 1) == [[5, 6, 7]]
    assert partition_list(lst_10, 3, 0) == [[1, 2, 3, 4]]
    # One ``raises`` block per call: when several calls share a block, the
    # first exception leaves the block and the remaining calls never run.
    for chunks, chunk_index in ((4, 5), (5, 5), (5, 6)):
        with raises(IndexError):
            partition_list(lst_10, chunks=chunks, chunk_index=chunk_index)
    with raises(ValueError):
        partition_list(lst_10, chunks=11)
    # odd prime number tests
    lst_13 = list(range(1, 14))
    assert partition_list(lst_13, chunks=2) == [[1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13]]
    assert partition_list(lst_13, chunks=3) == [[1, 2, 3, 4, 5], [6, 7, 8, 9], [10, 11, 12, 13]]
    assert partition_list(lst_13, chunks=4) == [[1, 2, 3, 4], [5, 6, 7], [8, 9, 10], [11, 12, 13]]
    assert partition_list(lst_13, chunks=4, chunk_index=1) == [[5, 6, 7]]
334
|
|
|
|
335
|
|
|
def test_sparkline():
    """Numeric lists give block-character bars; an invalid list gives an empty string."""
    cases = [
        ([5, 2, 3], '█▃▄'),
        ([1000, 1, 2222], '▃ █'),
        ([8, 7, 6, 5, 4, 3, 2, 1, 0], '█▇▆▅▄▃▂▁ '),
        ([-1, None, 'forty-two'], ''),
    ]
    for values, rendered in cases:
        assert sparkline(values) == rendered
340
|
|
|
|
341
|
|
|
|
342
|
|
|
# When this file is executed directly, hand its own path to ``main`` from
# ``tests.base``.
if __name__ == '__main__':
    main(__file__)
344
|
|
|
|