Passed
Pull Request — master (#1184)
by Konstantin
03:15
created

tests.test_workspace.test_deskewing()   B

Complexity

Conditions 1

Size

Total Lines 69
Code Lines 57

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 57
dl 0
loc 69
rs 8.4072
c 0
b 0
f 0
cc 1
nop 1

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
from os import chdir, curdir, walk, stat, chmod, umask
4
import shutil
5
import logging
6
from stat import filemode
7
from os.path import join, exists, abspath, basename, dirname
8
from shutil import copyfile, copytree as copytree_, rmtree
9
from pathlib import Path
10
from gzip import open as gzip_open
11
12
from PIL import Image
13
import numpy as np
14
15
import pytest
16
17
from tests.base import (
18
    assets,
19
    main,
20
    FIFOIO
21
)
22
23
from ocrd_models import (
24
    OcrdFile,
25
    OcrdMets
26
)
27
from ocrd_models.ocrd_page import parseString
28
from ocrd_models.ocrd_page import TextRegionType, CoordsType, AlternativeImageType
29
from ocrd_utils import polygon_mask, xywh_from_polygon, bbox_from_polygon, points_from_polygon
30
from ocrd_modelfactory import page_from_file
31
from ocrd.resolver import Resolver
32
from ocrd.workspace import Workspace
33
from ocrd.workspace_backup import WorkspaceBackupManager
34
from ocrd_validators import WorkspaceValidator
35
36
TMP_FOLDER = '/tmp/test-core-workspace'
37
SRC_METS = assets.path_to('kant_aufklaerung_1784/data/mets.xml')
38
39
SAMPLE_FILE_FILEGRP = 'OCR-D-IMG'
40
SAMPLE_FILE_ID = 'INPUT_0017'
41
SAMPLE_FILE_URL = join(SAMPLE_FILE_FILEGRP, '%s.tif' % SAMPLE_FILE_ID)
42
43
44
def copytree(src, dst, *args, **kwargs):
    """Replace the directory ``dst`` with a fresh recursive copy of ``src``.

    An existing ``dst`` tree is deleted first, because ``shutil.copytree``
    (imported here as ``copytree_``) by default refuses to copy into an
    existing directory. A ``dst`` that does not exist yet is tolerated, so
    the helper also works for targets that have not been created.

    Extra positional and keyword arguments are forwarded to ``shutil.copytree``.
    """
    if exists(dst):
        rmtree(dst)
    copytree_(src, dst, *args, **kwargs)
47
48
49
def count_files(d):
    """Return how many file entries ``os.walk`` reports anywhere below directory ``d``."""
    total = 0
    for _dirpath, _dirnames, filenames in walk(d):
        total += len(filenames)
    return total
50
51
52
@pytest.fixture(name='plain_workspace')
def _fixture_plain_workspace(tmp_path):
    """Yield a workspace created from nothing in ``tmp_path``.

    The current working directory is switched to ``tmp_path`` for the
    duration of the test and switched back afterwards.
    """
    workspace = Resolver().workspace_from_nothing(directory=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(original_cwd)
60
61
def test_workspace_add_file(plain_workspace):
    """A file added with content is registered in the METS and exists on disk."""
    target = plain_workspace.directory / 'ID1.tif'

    # act
    plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT',
                             page_id=None, local_filename=target)
    ocrd_file = plain_workspace.mets.find_all_files()[0]

    # assert
    assert ocrd_file.ID == 'ID1'
    assert ocrd_file.mimetype == 'image/tiff'
    assert not ocrd_file.url
    assert ocrd_file.local_filename == str(target)
    assert Path(ocrd_file.local_filename).exists()
81
82
83
def test_workspace_add_file_overwrite(plain_workspace):
    """Re-adding a file under an existing ID fails unless ``force`` is set and the page matches."""
    fpath = plain_workspace.directory / 'ID1.tif'

    # act
    plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys1', local_filename=fpath)
    with pytest.raises(FileExistsError) as fn_exc:
        plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, local_filename=fpath)
        # NOTE(review): this assert is skipped whenever the call above raises, because the
        # exception leaves the `with` block first. Dedent it once the expected message is confirmed.
        assert str(fn_exc.value) == "File with file_id='ID1' already exists"
    with pytest.raises(FileExistsError) as fn_exc:
        plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id='phys2', local_filename=fpath, force=True)
        # NOTE(review): unreachable for the same reason as the assert above.
        assert 'cannot mitigate' in str(fn_exc.value)
    # same ID and same page with force=True is expected to succeed
    plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', content='CONTENT2', page_id='phys1', local_filename=fpath, force=True)

    f = plain_workspace.mets.find_all_files()[0]
    assert f.ID == 'ID1'
    assert f.mimetype == 'image/tiff'
    assert not f.url
    assert f.local_filename == str(fpath)
    assert f.pageId == 'phys1'
    assert fpath.exists()
103
104
105
def test_workspace_add_file_basename_no_content(plain_workspace):
    """A file added without content, URL or local filename has an empty ``url``."""
    plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', page_id=None)
    first_file = next(plain_workspace.mets.find_files())
    assert first_file.url == ''
109
110
def test_workspace_add_file_binary_content(plain_workspace):
    """Binary content ends up at ``local_filename``, below a subdirectory the test never creates itself."""
    target = join(plain_workspace.directory, 'subdir', 'ID1.tif')
    plain_workspace.add_file('GRP', file_id='ID1', content=b'CONTENT', local_filename=target,
                             url='http://foo/bar', page_id=None)

    # assert
    assert exists(target)
116
117
118
def test_workspacec_add_file_content_wo_local_filename(plain_workspace):
    """Passing ``content`` without ``local_filename`` raises with an explanatory message."""
    # act
    with pytest.raises(Exception) as raised:
        plain_workspace.add_file('GRP', file_id='ID1', content=b'CONTENT', page_id='foo1234')

    message = str(raised.value)
    assert "'content' was set but no 'local_filename'" in message
124
125
126
def test_workspacec_add_file_content_wo_pageid(plain_workspace):
    """Omitting the ``page_id`` keyword altogether raises a ``ValueError``."""
    # act
    with pytest.raises(ValueError) as raised:
        plain_workspace.add_file('GRP', file_id='ID1', content=b'CONTENT', local_filename='foo')

    message = str(raised.value)
    assert "workspace.add_file must be passed a 'page_id' kwarg, even if it is None." in message
132
133
134
def test_workspace_str(plain_workspace):
    """``str()`` of a saved and reloaded empty workspace shows no file groups and no files."""
    # act
    plain_workspace.save_mets()
    plain_workspace.reload_mets()

    # assert
    actual = str(plain_workspace)
    expected = 'Workspace[remote=False, directory=%s, baseurl=None, file_groups=[], files=[]]' % plain_workspace.directory
    assert actual == expected
143
144
145
def test_workspace_backup(plain_workspace):
    """With a backup manager attached, a ``.backup`` entry exists in the workspace after saving."""
    # act
    backup_manager = WorkspaceBackupManager(plain_workspace)
    plain_workspace.automatic_backup = backup_manager
    plain_workspace.save_mets()
    plain_workspace.reload_mets()

    # TODO
    # changed test semantics
    backup_path = join(plain_workspace.directory, '.backup')
    assert exists(backup_path)
155
156
157
def _url_to_file(the_path):
    """Register ``the_path`` in a throwaway METS and return the result of ``add_file``.

    The path is made absolute and used as the file's ``url``; its final
    path component becomes the file ID, and the file group is ``TESTGRP``.
    """
    absolute = abspath(the_path)
    scratch_mets = OcrdMets.empty_mets()
    return scratch_mets.add_file('TESTGRP', ID=Path(absolute).name, url=absolute)
161
162
163
def test_download_very_self_file(plain_workspace):
    """Downloading this very test module returns the file object that was passed in."""
    this_module = _url_to_file(abspath(__file__))
    result = plain_workspace.download_file(this_module)
    # NOTE(review): the expression after the comma is only the assertion *message*, so this
    # checks truthiness of the result - confirm whether a comparison was intended here.
    assert result, join('TESTGRP', basename(__file__))
    assert result == this_module
168
169
170
def test_download_url_without_baseurl_raises_exception(tmp_path):
    """Downloading a file whose URL is a local path that does not exist raises with a clear message."""
    # arrange
    mets_copy = join(tmp_path, 'mets.xml')
    copyfile(SRC_METS, mets_copy)
    workspace = Resolver().workspace_from_url(mets_copy)
    sample_file = _url_to_file(SAMPLE_FILE_URL)

    # act
    with pytest.raises(Exception) as raised:
        workspace.download_file(sample_file)

    # assert exception message contents
    message = str(raised.value)
    assert "File path passed as 'url' to download_to_directory does not exist:" in message
183
184
185
def test_download_url_with_baseurl(tmp_path):
    """A file referenced by an existing local path is downloaded below its file group directory."""
    # arrange
    source_dir = dirname(SRC_METS)
    mets_copy = join(tmp_path, 'mets.xml')
    copyfile(SRC_METS, mets_copy)
    (tmp_path / 'OCR-D-IMG').mkdir()
    tif_copy = join(tmp_path, SAMPLE_FILE_URL)
    copyfile(join(source_dir, SAMPLE_FILE_URL), tif_copy)
    workspace = Resolver().workspace_from_url(mets_copy, src_baseurl=source_dir)
    sample_file = _url_to_file(tif_copy)

    # act
    # TODO
    # semantics changed from .download_url to .download_file
    # and from context path 'DEPRECATED' to 'OCR-D-IMG'
    downloaded = Path(workspace.download_file(sample_file).local_filename)

    # assert
    assert downloaded.name == f'{SAMPLE_FILE_ID}.tif'
    assert downloaded.parent.name == 'TESTGRP'
    assert Path(workspace.directory, downloaded).exists()
206
207
208
def test_from_url_dst_dir_download(plain_workspace):
    """
    https://github.com/OCR-D/core/issues/319
    """
    target_dir = join(plain_workspace.directory, 'non-existing-for-good-measure')
    # Create a relative path to trigger #319
    mets_source = str(Path(assets.path_to('kant_aufklaerung_1784/data/mets.xml')))
    plain_workspace.resolver.workspace_from_url(mets_source, dst_dir=target_dir, download=True)

    # assert: the METS itself (sanity check) and one of the downloaded files must exist
    for expected in ('mets.xml', 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'):
        assert Path(target_dir, expected).exists()
220
221
222
def test_superfluous_copies_in_ws_dir(tmp_path):
    """
    https://github.com/OCR-D/core/issues/227
    """
    # arrange
    mets_source = assets.path_to('SBB0000F29300010000/data/mets_one_file.xml')
    copyfile(mets_source, join(tmp_path, 'mets.xml'))
    workspace = Workspace(Resolver(), tmp_path)

    # assert directory files: only the copied METS so far
    assert count_files(tmp_path) == 1

    # act
    for ocrd_file in workspace.mets.find_all_files():
        workspace.download_file(ocrd_file)

    # assert: exactly one more file, at the expected location
    assert count_files(tmp_path) == 2
    assert exists(join(tmp_path, 'OCR-D-IMG/FILE_0005_IMAGE.tif'))
242
243
244
@pytest.fixture(name='sbb_data_tmp')
def _fixture_sbb_data_tmp(tmp_path):
    """Copy the ``SBB0000F29300010000/data`` asset into ``tmp_path`` and yield that path as ``str``."""
    target = str(tmp_path)
    copytree(assets.path_to('SBB0000F29300010000/data'), target)
    yield target
248
249
250
@pytest.fixture(name='sbb_data_workspace')
def _fixture_sbb_data(sbb_data_tmp):
    """Yield a workspace opened on the directory provided by ``sbb_data_tmp``."""
    yield Workspace(Resolver(), directory=sbb_data_tmp)
255
256
257
def test_remove_file_not_existing_raises_error(sbb_data_workspace):
    """Removing an unknown file ID without ``force`` raises ``FileNotFoundError``."""
    # act
    with pytest.raises(FileNotFoundError) as raised:
        sbb_data_workspace.remove_file('non-existing-id')

    # assert
    message = str(raised.value)
    assert "not found" in message
265
266
267
def test_remove_file_force(sbb_data_workspace):
    """Enforce removal of non-existing-id doesn't yield any error
    but also returns no ocrd-file identifier"""
    missing_id = 'non-existing-id'

    # TODO check semantics - can a non-existent thing be removed?
    forced_result = sbb_data_workspace.remove_file(missing_id, force=True)
    assert not forced_result
    # should also succeed
    sbb_data_workspace.overwrite_mode = True
    overwrite_result = sbb_data_workspace.remove_file(missing_id, force=False)
    assert not overwrite_result
276
277
278
def test_remove_file_remote_not_available_raises_exception(plain_workspace):
    """Removing a file that was only registered by remote URL raises without ``force``."""
    plain_workspace.add_file('IMG', file_id='page1_img', mimetype='image/tiff',
                             url='http://remote', page_id=None)
    with pytest.raises(Exception) as raised:
        plain_workspace.remove_file('page1_img')

    message = str(raised.value)
    assert "not locally available" in message
284
285
286
def test_remove_file_remote(plain_workspace):
    """A file registered only by remote URL can be removed when removal is enforced."""
    remote_id = 'page1_img'

    # act
    plain_workspace.add_file('IMG', file_id=remote_id, mimetype='image/tiff',
                             url='http://remote', page_id=None)

    # must succeed because removal is enforced
    assert plain_workspace.remove_file(remote_id, force=True)

    # TODO check returned value
    # should also "succeed", because overwrite_mode is set which also sets 'force' to 'True'
    plain_workspace.overwrite_mode = True
    assert not plain_workspace.remove_file(remote_id)
298
299
300
def test_rename_file_group(tmp_path):
    """Renaming a file group updates the PAGE image reference, the file on disk and the METS pages."""
    word_file_id = 'OCR-D-GT-SEG-WORD_0001'

    # arrange
    copytree(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data'), tmp_path)
    workspace = Workspace(Resolver(), directory=tmp_path)

    # before act
    # TODO clear semantics
    # requires rather odd additional path-setting because root path from
    # workspace is not propagated - works only if called inside workspace
    # which can be achieved with pushd_popd functionalities
    file_before = next(workspace.mets.find_files(ID=word_file_id))
    relative_name = file_before.local_filename
    file_before.local_filename = tmp_path / relative_name
    page_before = page_from_file(file_before).get_Page()
    # before assert
    assert page_before.imageFilename == 'OCR-D-IMG/INPUT_0017.tif'

    # act
    workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')
    file_after = next(workspace.mets.find_files(ID=word_file_id))
    file_after.local_filename = str(tmp_path / relative_name)
    page_after = page_from_file(file_after).get_Page()

    # assert
    assert page_after.imageFilename == 'FOOBAR/INPUT_0017.tif'
    assert Path(tmp_path / 'FOOBAR/INPUT_0017.tif').exists()
    # NOTE(review): this path is relative to the current working directory, not to tmp_path -
    # confirm that is intended, otherwise the check cannot observe the workspace.
    assert not Path('OCR-D-IMG/OCR-D-IMG_0001.tif').exists()
    assert workspace.mets.get_physical_pages(for_fileIds=['OCR-D-IMG_0001']) == [None]
    assert workspace.mets.get_physical_pages(for_fileIds=['FOOBAR_0001']) == ['phys_0001']
329
330
331
def test_remove_file_group_invalid_raises_exception(sbb_data_workspace):
    """Removing an unknown file group without ``force`` raises."""
    with pytest.raises(Exception) as raised:
        # should fail
        sbb_data_workspace.remove_file_group('I DO NOT EXIST')
    message = str(raised.value)
    assert "No such fileGrp" in message
336
337
338
def test_remove_file_group_force(sbb_data_workspace):
    """Removing an unknown file group is tolerated with ``force`` or with ``overwrite_mode``."""
    missing_group = 'I DO NOT EXIST'

    # TODO
    # check function and tests semantics
    # should succeed
    forced_result = sbb_data_workspace.remove_file_group(missing_group, force=True)
    assert not forced_result
    # should also succeed
    sbb_data_workspace.overwrite_mode = True
    overwrite_result = sbb_data_workspace.remove_file_group(missing_group, force=False)
    assert not overwrite_result
347
348
349
def test_remove_file_group_rmdir(sbb_data_tmp, sbb_data_workspace):
    """Recursively removing a file group also removes its directory from disk."""
    group_dir = join(sbb_data_tmp, 'OCR-D-IMG')
    assert exists(group_dir)
    sbb_data_workspace.remove_file_group('OCR-D-IMG', recursive=True)
    assert not exists(group_dir)
353
354
355
def test_remove_file_group_flat(plain_workspace):
    """
    https://github.com/OCR-D/core/issues/728
    """

    # act
    added_file = plain_workspace.add_file('FOO', file_id='foo', mimetype='foo/bar',
                                          local_filename='file.ext', content='foo', page_id=None)
    # requires additional prepending of current path because not pushd_popd-magic at work
    added_path = Path(join(plain_workspace.directory, added_file.local_filename))

    # assert
    assert added_path.exists()
    plain_workspace.remove_file_group('FOO', recursive=True)
368
369
370
@pytest.fixture(name='kant_complex_workspace')
def _fixture_kant_complex(tmp_path):
    """Yield a workspace on a temporary copy of the ``kant_aufklaerung_1784-complex`` asset."""
    copytree(assets.path_to('kant_aufklaerung_1784-complex/data'), str(tmp_path))
    # Pass a Resolver *instance*, like every other Workspace construction in this module;
    # the bare class would break as soon as the workspace calls a resolver method.
    yield Workspace(Resolver(), directory=tmp_path)
374
375
376
def test_remove_file_page_recursive0(kant_complex_workspace):
    """Page-recursive removal across groups drops the file count in the METS from 119 to 83."""
    mets = kant_complex_workspace.mets
    assert len(mets.find_all_files()) == 119
    kant_complex_workspace.remove_file(
        'OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001',
        page_recursive=True, page_same_group=False, keep_file=True)
    assert len(kant_complex_workspace.mets.find_all_files()) == 83
    # only checked for not raising
    kant_complex_workspace.remove_file('PAGE_0017_ALTO', page_recursive=True)
381
382
383
def test_remove_file_page_recursive_keep_file(kant_complex_workspace):
    """Forced page-recursive removal across groups deletes two files from disk."""
    workspace_dir = kant_complex_workspace.directory
    before = count_files(workspace_dir)
    kant_complex_workspace.remove_file('OCR-D-IMG-BINPAGE-sauvola_0001',
                                       page_recursive=True, page_same_group=False, force=True)
    after = count_files(workspace_dir)
    assert after == (before - 2), '2 files deleted'
388
389
390
def test_remove_file_page_recursive_same_group(kant_complex_workspace):
    """Page-recursive removal restricted to the same group deletes one file from disk."""
    workspace_dir = kant_complex_workspace.directory
    before = count_files(workspace_dir)
    kant_complex_workspace.remove_file('OCR-D-IMG-BINPAGE-sauvola_0001',
                                       page_recursive=True, page_same_group=True, force=False)
    after = count_files(workspace_dir)
    assert after == before - 1, '1 file deleted'
395
396
397
def test_download_to_directory_from_workspace_download_file(plain_workspace):
    """
    https://github.com/OCR-D/core/issues/342
    """
    image_file = plain_workspace.add_file('IMG', file_id='page1_img', mimetype='image/tiff',
                                          local_filename='test.tif', content='', page_id=None)
    page_file = plain_workspace.add_file('GT', file_id='page1_gt', mimetype='text/xml',
                                         local_filename='test.xml', content='', page_id=None)

    assert not image_file.url
    assert not page_file.url

    # these should be no-ops
    plain_workspace.download_file(image_file)
    plain_workspace.download_file(page_file)

    assert image_file.local_filename == 'test.tif'
    assert page_file.local_filename == 'test.xml'
413
414
415
def test_save_image_file_invalid_mimetype_raises_exception(plain_workspace):
    """Saving an image under an unknown MIME type raises ``KeyError`` naming that type."""
    bogus_mimetype = 'ceci/nest/pas/une/mimetype'
    blank = Image.new('RGB', (1000, 1000))

    # act raise
    with pytest.raises(KeyError) as raised:
        plain_workspace.save_image_file(blank, 'page1_img', 'IMG', 'page1', bogus_mimetype)

    assert "'ceci/nest/pas/une/mimetype'" == str(raised.value)
423
424
425
def test_save_image_file(plain_workspace):
    """An image can be saved as JPEG, and saved again with ``force`` or ``overwrite_mode``."""
    # arrange
    blank = Image.new('RGB', (1000, 1000))
    save_args = (blank, 'page1_img', 'IMG', 'page1', 'image/jpeg')

    # act
    assert plain_workspace.save_image_file(*save_args)
    assert exists(join(plain_workspace.directory, 'IMG', 'page1_img.jpg'))
    # should succeed
    assert plain_workspace.save_image_file(*save_args, force=True)
    # should also succeed
    plain_workspace.overwrite_mode = True
    assert plain_workspace.save_image_file(*save_args)
438
439
440
@pytest.fixture(name='workspace_kant_aufklaerung')
def _fixture_workspace_kant_aufklaerung(tmp_path):
    """Yield a workspace on a temporary copy of the ``kant_aufklaerung_1784`` asset.

    The current working directory is ``tmp_path`` while the test runs and is
    switched back afterwards.
    """
    copytree(assets.path_to('kant_aufklaerung_1784/data/'), str(tmp_path))
    mets_path = join(tmp_path, 'mets.xml')
    workspace = Resolver().workspace_from_url(mets_path, src_baseurl=tmp_path)
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(original_cwd)
449
450
451
def test_resolve_image_exif(workspace_kant_aufklaerung):
    """The resolved EXIF data of the sample TIFF reports its compression and width."""
    # act
    exif = workspace_kant_aufklaerung.resolve_image_exif('OCR-D-IMG/INPUT_0017.tif')

    # assert
    assert exif.compression == 'jpeg'
    assert exif.width == 1457
461
462
463
def test_resolve_image_as_pil(workspace_kant_aufklaerung):
    """The sample TIFF resolves at full width, and at reduced width when ``coords`` are given."""
    tif_path = 'OCR-D-IMG/INPUT_0017.tif'
    full_image = workspace_kant_aufklaerung._resolve_image_as_pil(tif_path)
    assert full_image.width == 1457
    cropped = workspace_kant_aufklaerung._resolve_image_as_pil(tif_path, coords=([100, 100], [50, 50]))
    assert cropped.width == 50
468
469
470
@pytest.fixture(name='workspace_gutachten_data')
def _fixture_workspace_gutachten_data(tmp_path):
    """Yield a workspace on a temporary copy of the ``gutachten`` asset.

    The current working directory is ``tmp_path`` while the test runs and is
    switched back afterwards.
    """
    workspace_dir = str(tmp_path)
    copytree(assets.path_to('gutachten/data'), workspace_dir)
    workspace = Resolver().workspace_from_url(join(workspace_dir, 'mets.xml'))
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(original_cwd)
479
480
481
def test_image_from_page_basic(workspace_gutachten_data):
    """The page image reports the same features with and without an explicit selector/filter."""
    # arrange
    # Read as UTF-8 explicitly: the text is re-encoded as UTF-8 right below, so decoding
    # with the locale's default encoding could corrupt it on non-UTF-8 systems.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r', encoding='utf-8') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)

    # act + assert
    _, info, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017', feature_selector='clipped', feature_filter='cropped')
    assert info['features'] == 'binarized,clipped'
    _, info, _ = workspace_gutachten_data.image_from_page(pcgts.get_Page(), page_id='PHYS_0017')
    assert info['features'] == 'binarized,clipped'
491
492
493
@pytest.fixture(name='workspace_sample_features')
def _fixture_workspace_sample_features(tmp_path):
    """Yield a workspace on a temporary copy of ``tests/data/sample-features``.

    The current working directory is ``tmp_path`` while the test runs and is
    switched back afterwards.
    """
    workspace_dir = str(tmp_path)
    copytree('tests/data/sample-features', workspace_dir)
    workspace = Resolver().workspace_from_url(join(workspace_dir, 'mets.xml'))
    original_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    chdir(original_cwd)
502
503
504
def test_image_feature_selectoro(workspace_sample_features):
    """Feature selection and filtering pick the expected derived image for ``page1``."""
    # arrange
    page_xml = Path(workspace_sample_features.directory) / 'image_features.page.xml'
    with open(page_xml, 'r', encoding='utf-8') as page_in:
        pcgts = parseString(page_in.read().encode('utf-8'))

    # richest feature set is not last:
    _, selected_info, _ = workspace_sample_features.image_from_page(
        pcgts.get_Page(), page_id='page1', feature_selector='dewarped')
    # recropped because foo4 contains cropped+deskewed but not recropped yet:
    assert selected_info['features'] == 'cropped,dewarped,binarized,despeckled,deskewed'
    # richest feature set is also last:
    _, filtered_info, _ = workspace_sample_features.image_from_page(
        pcgts.get_Page(), page_id='page1', feature_selector='dewarped', feature_filter='binarized')
    # no deskewing here, thus no recropping:
    assert filtered_info['features'] == 'cropped,dewarped,despeckled'
517
518
def test_deskewing(plain_workspace):
    """Check region extraction without deskewing, with deskewing, and via an AlternativeImage.

    A non-rectangular polygon is drawn onto a blank page image and declared as
    a text region with a non-zero ``orientation``. The region image is then
    extracted three times from the same page image, and the sizes, transforms
    and foreground pixel counts are compared.
    """
    size = (3000, 4000)
    poly = [[1403, 2573], [1560, 2573], [1560, 2598], [2311, 2598], [2311, 2757],
            [2220, 2757], [2220, 2798], [2311, 2798], [2311, 2908], [1403, 2908]]
    xywh = xywh_from_polygon(poly)
    bbox = bbox_from_polygon(poly)
    skew = 4.625
    image = Image.new('L', size)
    image = polygon_mask(image, poly)
    pixels = np.count_nonzero(np.array(image) > 0)
    name = 'foo0'
    assert plain_workspace.save_image_file(image, name, 'IMG')
    pcgts = page_from_file(next(plain_workspace.mets.find_files(ID=name)))
    page = pcgts.get_Page()
    region = TextRegionType(id='nonrect',
                            Coords=CoordsType(points=points_from_polygon(poly)),
                            orientation=-skew)
    page.add_TextRegion(region)
    page_image, page_coords, _ = plain_workspace.image_from_page(page, '')
    assert list(image.getdata()) == list(page_image.getdata())
    assert np.all(page_coords['transform'] == np.eye(3))

    # first without deskewing: plain crop to the polygon's bounding box
    reg_image, reg_coords = plain_workspace.image_from_segment(region, page_image, page_coords,
                                                               feature_filter='deskewed', fill=0)
    assert list(image.crop(bbox).getdata()) == list(reg_image.getdata())
    assert reg_image.width == xywh['w'] == 908
    assert reg_image.height == xywh['h'] == 335
    assert reg_coords['transform'][0, 2] == -xywh['x']
    assert reg_coords['transform'][1, 2] == -xywh['y']
    # same fg after cropping to minimal bbox
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert pixels == reg_pixels

    # now with deskewing (test for size after recropping)
    reg_image, reg_coords = plain_workspace.image_from_segment(region, page_image, page_coords, fill=0)
    assert reg_image.width == 932 > xywh['w']
    assert reg_image.height == 382 > xywh['h']
    assert reg_coords['transform'][0, 1] != 0
    assert reg_coords['transform'][1, 0] != 0
    assert 'deskewed' in reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert np.abs(pixels - reg_pixels) / pixels < 0.005
    reg_array = np.array(reg_image) > 0

    # now via AlternativeImage
    path = plain_workspace.save_image_file(reg_image, region.id + '_img', 'IMG')
    region.add_AlternativeImage(AlternativeImageType(filename=path, comments=reg_coords['features']))
    logger_capture = FIFOIO(256)
    logger_handler = logging.StreamHandler(logger_capture)
    logger = logging.getLogger('ocrd.utils.crop_image')
    logger.addHandler(logger_handler)
    try:
        reg_image2, reg_coords2 = plain_workspace.image_from_segment(region, page_image, page_coords, fill=0)
        logger_output = logger_capture.getvalue()
    finally:
        # Detach the handler before closing its stream: the logger outlives this test,
        # and a handler left attached would keep writing to the closed capture buffer.
        logger.removeHandler(logger_handler)
        logger_capture.close()
    # nothing must have been logged on this logger during the extraction
    assert logger_output == ''
    assert reg_image2.width == reg_image.width
    assert reg_image2.height == reg_image.height
    assert np.allclose(reg_coords2['transform'], reg_coords['transform'])
    assert reg_coords2['features'] == reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    # NOTE(review): this counts `reg_image` again rather than `reg_image2`, which makes the
    # comparison below trivially true - confirm whether `reg_image2` was intended.
    reg_pixels2 = np.count_nonzero(np.array(reg_image) > 0)
    assert reg_pixels2 == reg_pixels
    reg_array2 = np.array(reg_image2) > 0
    assert 0.98 < np.sum(reg_array == reg_array2) / reg_array.size <= 1.0
587
588
def test_downsample_16bit_image(plain_workspace):
    """A 16-bit grayscale TIFF (mode ``I;16``) is resolved by the workspace as mode ``L``."""
    # arrange image: unpack the gzipped sample into the workspace directory
    img_path = Path(plain_workspace.directory, '16bit.tif')
    with gzip_open(Path(__file__).parent / 'data/OCR-D-IMG_APBB_Mitteilungen_62.0002.tif.gz', 'rb') as gzip_in:
        with open(img_path, 'wb') as tif_out:
            tif_out.write(gzip_in.read())

    # act
    plain_workspace.add_file('IMG', file_id='foo', local_filename=img_path, mimetype='image/tiff', page_id=None)

    # assert
    # Open the source image in a context manager so its file handle is released again.
    with Image.open(img_path) as pil_before:
        assert pil_before.mode == 'I;16'
    pil_after = plain_workspace._resolve_image_as_pil(img_path)
    assert pil_after.mode == 'L'
603
604
605
def test_mets_permissions(plain_workspace):
    """A new METS gets mode 0664 minus the umask; re-saving keeps a mode changed afterwards."""
    plain_workspace.save_mets()
    mets_path = join(plain_workspace.directory, 'mets.xml')
    # os.umask returns the previous mask, so set a dummy value and restore it right away
    current_mask = umask(0)
    umask(current_mask)
    initial_mode = stat(mets_path).st_mode
    assert initial_mode == 0o100664 & ~current_mask
    chmod(mets_path, 0o777)
    plain_workspace.save_mets()
    mode_after_save = stat(mets_path).st_mode
    assert filemode(mode_after_save) == '-rwxrwxrwx'
614
615
616
def test_merge0(tmp_path):
    """Merging one sample workspace into another adds up the file counts (6 + 35 = 41)."""
    # arrange
    kant_dir = tmp_path / 'kant_aufklaerung'
    kant_dir.mkdir()
    sbb_dir = tmp_path / 'sbb'
    sbb_dir.mkdir()
    copytree(assets.path_to('kant_aufklaerung_1784/data'), kant_dir)
    copytree(assets.path_to('SBB0000F29300010000/data'), sbb_dir)

    target = Workspace(Resolver(), kant_dir)
    source = Workspace(Resolver(), sbb_dir)

    # assert number of files before
    assert len(target.mets.find_all_files()) == 6
    assert len(source.mets.find_all_files()) == 35

    # act
    target.merge(source, overwrite=True)

    # assert
    assert len(target.mets.find_all_files()) == 41
    assert exists(join(kant_dir, 'OCR-D-IMG/INPUT_0017.tif'))
639
640
def test_merge_no_copy_files(tmp_path):
    """Merging without copying keeps the source-relative path; copying needs ``force`` to redo it."""
    # arrange: the source workspace lives in a subdirectory of the target workspace
    target_dir = tmp_path / 'ws1'
    target_dir.mkdir()
    source_dir = target_dir / 'ws2'
    source_dir.mkdir()

    target = Resolver().workspace_from_nothing(directory=target_dir)
    source = Resolver().workspace_from_nothing(directory=source_dir)

    source.add_file('GRP2', page_id='p01', mimetype='text/plain', file_id='f1',
                    local_filename='GRP2/f1', content='ws2')

    id_mapping = {'f1': 'f1_copy_files'}
    target.merge(source, copy_files=False, fileId_mapping=id_mapping)

    merged = next(target.mets.find_files(ID='f1_copy_files'))
    assert merged.local_filename == 'ws2/GRP2/f1'

    with pytest.raises(FileExistsError):
        target.merge(source, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'})
    target.merge(source, copy_files=True, fileId_mapping={'f1': 'f1_copy_files'}, force=True)
    merged = next(target.mets.find_files(ID='f1_copy_files'))
    assert merged.local_filename == 'GRP2/f1'
661
662
def test_merge_overwrite(tmp_path):
    """Merging two workspaces that both hold a file at ``X/X`` is expected to raise."""
    # arrange
    dst_path1 = tmp_path / 'ws1'
    dst_path1.mkdir()
    dst_path2 = dst_path1 / 'ws2'
    dst_path2.mkdir()

    ws1 = Resolver().workspace_from_nothing(directory=dst_path1)
    ws2 = Resolver().workspace_from_nothing(directory=dst_path2)

    # NOTE(review): the `raises` block also covers both add_file calls, so an unexpected
    # error during setup would make this test pass as well - consider narrowing it to merge().
    with pytest.raises(Exception) as exc:
        ws1.add_file('X', page_id='X', mimetype='X', file_id='id123', local_filename='X/X', content='ws1')
        ws2.add_file('X', page_id='X', mimetype='X', file_id='id456', local_filename='X/X', content='ws2')
        ws1.merge(ws2)
        # NOTE(review): skipped whenever a call above raises, because the exception leaves
        # the `with` block first; dedent once the expected message is confirmed.
        assert "would overwrite" == str(exc.value)
677
678
def test_merge_with_filter(plain_workspace, tmp_path):
    """Merging with page/file/group filters brings in only the matching file, remapped to GRP1."""
    # arrange
    page_id1, file_id1, file_grp1 = 'page1', 'ID1', 'GRP1'
    plain_workspace.add_file(file_grp1, file_id=file_id1, mimetype='image/tiff', page_id=page_id1)

    source = Resolver().workspace_from_nothing(directory=tmp_path / 'foo')
    page_id2, file_id2, file_grp2 = 'page2', 'ID2', 'GRP2'
    source.add_file(file_grp2, file_id=file_id2, mimetype='image/tiff', page_id=page_id2, url='bar')
    # second source file, which the filters below do not select
    source.add_file(file_grp2, file_id='ID2-2', mimetype='image/tiff', page_id='page3', url='bar')

    # act
    plain_workspace.merge(source, copy_files=False, page_id=page_id2, file_id=file_id2,
                          file_grp=file_grp2, filegrp_mapping={file_grp2: file_grp1})

    # assert:
    merged_files = list(plain_workspace.find_files())
    assert len(merged_files) == 2

    for merged in merged_files:
        assert merged.fileGrp == file_grp1
        assert merged.pageId in (page_id1, page_id2)
        assert merged.ID in (file_id1, file_id2)
702
703
def test_merge_force(plain_workspace, tmp_path):
    """Merging a workspace with a clashing file ID raises unless ``force`` is set."""
    page_id, file_id, file_grp = 'page1', 'ID1', 'GRP1'

    # target ws
    plain_workspace.add_file(file_grp, file_id=file_id, mimetype='image/tiff', page_id=page_id)

    # source ws: same group, ID and page as the target's file
    source = Resolver().workspace_from_nothing(directory=tmp_path / 'foo')
    source.add_file(file_grp, file_id=file_id, mimetype='image/tiff', page_id=page_id, url='bar')

    # fails because force is false
    with pytest.raises(Exception):
        plain_workspace.merge(source, force=False)

    # works because force overrides ID clash
    plain_workspace.merge(source, force=True)

    merged_files = list(plain_workspace.find_files())
    assert len(merged_files) == 1
725
726
@pytest.fixture(name='workspace_metsDocumentID')
def _fixture_metsDocumentID(tmp_path):
    """Yield a workspace whose ``mets.xml`` is a copy of ``data/mets-with-metsDocumentID.xml``."""
    resolver = Resolver()
    # Read and write with the same explicit encoding so the copy does not depend on the locale.
    mets_content = (Path(__file__).parent / "data/mets-with-metsDocumentID.xml").read_text(encoding='utf-8')
    with open(tmp_path / 'mets.xml', 'w', encoding='utf-8') as f:
        f.write(mets_content)
    # Hand over the resolver instance created above, not the Resolver class itself.
    yield Workspace(resolver, directory=tmp_path)
733
734
def test_agent_before_metsDocumentID(workspace_metsDocumentID):
    """The workspace validates both before and after an agent is added and the METS is saved."""
    mets_target = workspace_metsDocumentID.mets_target
    report_before = WorkspaceValidator.validate(Resolver(), mets_url=mets_target)
    assert report_before.is_valid
    workspace_metsDocumentID.mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER')
    workspace_metsDocumentID.save_mets()
    report_after = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target)
    # printed so the validation errors are visible in the captured output
    print(report_after.errors)
    assert report_after.is_valid
742
743
# Allow running this test module directly as a script, via the `main` helper from tests.base.
if __name__ == '__main__':
    main(__file__)
745