tests.model.test_ocrd_page - Code Metrics - Inspection of "Test for ocrd-network" - OCR-D/core - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#1184)

by Konstantin

created 2024-04-16 14:19 UTC

tests.model.test_ocrd_page B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	468
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	46
eloc	250
dl	0
loc	468
rs	8.72
c	0
b	0
f	0

27 Functions

Rating	Name	Size	Complexity
A	test_extend_all_indexed_no_validation()	15	2
A	test_delete_region()	9	1
A	test_hashable()	9	1
A	test_faulty_glyphs_to_xml()	6	1
A	test_alternative_image_additions()	25	1
A	test_empty_groups_to_regionrefindexed()	22	2
A	test_extend_all_indexed_validate_continuity()	14	3
A	test_serialize_no_empty_readingorder()	9	1
A	test_get_all_text_lines()	6	2
A	test_parse_string_succeeds()	3	1
A	test_get_all_indexed_index_sort()	16	2
A	test_pcgts_id_matches()	2	1
A	test_simple_types()	13	1
A	_fixture_faulty_glyphs()	6	2
A	test_get_AllAlternativeImages()	13	2
A	test_to_xml_unicode_nsprefix()	14	2
A	test_orderedgroup_export_order()	29	2
A	test_id()	13	1
A	test_get_all_alternative_image_paths()	17	2
A	test_issue_269()	10	1
A	test_set_image_filename()	8	1
A	test_all_regions_with_reading_order()	20	2
A	test_get_all_indexed_classes()	12	2
A	test_all_regions_without_reading_order()	22	2
A	test_get_unordered_group_children()	10	2
A	test_get_all_regions_invalid_order_raises_exception()	11	3
A	test_get_all_regions_invalid_depth_raises_exception()	11	3

How to fix Complexity

# -*- coding: utf-8 -*-

import pytest

from tests.base import main, assets, create_ocrd_file_with_defaults

from ocrd_modelfactory import page_from_image
from ocrd_models.ocrd_page_generateds import TextTypeSimpleType
from ocrd_models.ocrd_page import (
    AlternativeImageType,
    PcGtsType,
    PageType,
    TextRegionType,
    TextLineType,
    OrderedGroupIndexedType,
    UnorderedGroupIndexedType,
    ReadingOrderType,
    RegionRefIndexedType,
    WordType,
    GlyphType,

    parseString,
    parse,
    to_xml
)

# Minimal PAGE-XML document (2019-07-15 namespace) kept inline so that tests can
# parse it without touching the assets directory. It holds exactly one
# TextRegion with one TextLine and one Word whose TextEquiv carries @conf.
simple_page = """\
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
    <Metadata>
        <Creator>OCR-D</Creator>
        <Created>2016-09-20T11:09:27.041+02:00</Created>
        <LastChange>2018-04-25T17:44:49.605+01:00</LastChange>
    </Metadata>
    <Page
        imageFilename="https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif"
        imageWidth="1457"
        imageHeight="2083"
        type="content">
        <TextRegion type="heading" id="r_1_1" custom="readingOrder {index:0;} structure {type:heading;}">
            <Coords points="113,365 919,365 919,439 113,439"/>
            <TextLine id="tl_1" primaryLanguage="German" custom="readingOrder {index:0;} textStyle {offset:0; length:26;fontFamily:Arial; fontSize:17.0; bold:true;}">
                <Coords points="114,366 918,366 918,438 114,438"/>
                <Baseline points="114,429 918,429"/>
                <Word id="w_w1aab1b1b2b1b1ab1" language="German" custom="readingOrder {index:0;} textStyle {offset:0; length:11;fontFamily:Arial; fontSize:17.0; bold:true;}">
                    <Coords points="114,368 442,368 442,437 114,437"/>
                    <TextEquiv conf="1">
                        <Unicode>Berliniſche</Unicode>
                    </TextEquiv>
                </Word>
            </TextLine>
        </TextRegion>
    </Page>
</PcGts>
"""


@pytest.fixture(name='faulty_glyphs')
def _fixture_faulty_glyphs():
    """Yield the parsed ``FAULTY_GLYPHS.xml`` asset; exposed to tests as ``faulty_glyphs``."""
    fixture_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')
    # The file is read as raw bytes and handed to the parser undecoded.
    with open(fixture_path, 'rb') as handle:
        raw_xml = handle.read()
    yield parseString(raw_xml, silence=True)


def test_pcgts_id_matches(faulty_glyphs):
    """The parsed fixture document exposes the expected ``pcGtsId``."""
    expected_id = 'FAULTY_GLYPHS_FILE'
    assert faulty_glyphs.pcGtsId == expected_id


def test_faulty_glyphs_to_xml(faulty_glyphs):
    """Serializing the fixture emits namespace declarations and ``pc:``-prefixed elements."""
    as_xml = to_xml(faulty_glyphs)
    head = as_xml[:1000]

    assert ' xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"' in head
    # This used to read ``assert <literal>, <message>``: a non-empty string is
    # always truthy, so the check could never fail. It is now a real
    # substring test against the start of the serialization.
    assert ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"' in head
    assert '<pc:PcGts' in as_xml[0:100]
    assert '<pc:TextRegion' in as_xml[1000:3000]


def test_to_xml_unicode_nsprefix():
    """see https://github.com/OCR-D/core/pull/474#issuecomment-621477590

    The input uses un-prefixed element names; after a parse/serialize round
    trip the same elements must carry the ``pc:`` prefix.
    """
    source_path = assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml')
    with open(source_path, 'rb') as handle:
        raw_bytes = handle.read()

    # precondition: the source file itself has no namespace prefix
    decoded = raw_bytes.decode('utf-8')
    for bare_tag in ('<Unicode>', '<Created'):
        assert bare_tag in decoded, 'without NS prefix'

    # round trip: parse the bytes, serialize again, look for prefixed tags
    serialized = to_xml(parseString(raw_bytes, silence=True))
    for prefixed_tag in ('<pc:Unicode>', '<pc:Created>'):
        assert prefixed_tag in serialized, 'with NS prefix'


def test_issue_269(faulty_glyphs):
    """
    @conf is parsed as str but should be float
    https://github.com/OCR-D/core/issues/269
    """
    # GIGO: the assertions below pin that get_conf() hands back the same type
    # that was passed to set_conf() (float in -> float out, str in -> str out).
    text_equiv = faulty_glyphs.get_Page().get_TextRegion()[0].get_TextEquiv()[0]

    text_equiv.set_conf(1.0)
    assert isinstance(text_equiv.get_conf(), float)

    text_equiv.set_conf('1.0')
    assert isinstance(text_equiv.get_conf(), str)


def test_parse_string_succeeds():
    """parseString with @conf in TextEquiv won't throw an error"""
    parsed = parseString(simple_page, silence=True)
    assert parsed is not None


def test_delete_region():
    """Deleting the only TextRegion of ``simple_page`` leaves the page without regions."""
    document = parseString(simple_page, silence=True)
    count_before = len(document.get_Page().get_TextRegion())
    assert count_before == 1

    # act: remove the entry from the list handed out by the getter
    del document.get_Page().get_TextRegion()[0]

    # assert: a fresh lookup no longer sees any region
    count_after = len(document.get_Page().get_TextRegion())
    assert count_after == 0


def test_set_image_filename(faulty_glyphs):
    """``imageFilename`` on the page can be read and overwritten by plain assignment."""
    original_name, replacement_name = '00000259.sw.tif', 'foo'
    assert faulty_glyphs.get_Page().imageFilename == original_name

    # act
    page = faulty_glyphs.get_Page()
    page.imageFilename = replacement_name

    # assert: a fresh lookup through the document sees the new value
    assert faulty_glyphs.get_Page().imageFilename == replacement_name


def test_alternative_image_additions():
    """An AlternativeImage can be attached at every level from Page down to Glyph.

    Builds a Page > TextRegion > TextLine > Word > Glyph hierarchy from
    scratch, adds one AlternativeImage per level and checks that each level
    reports exactly the image (and child) that was added to it.
    """
    pcgts = PcGtsType(pcGtsId="foo")
    assert pcgts.pcGtsId == 'foo'

    # act
    # Page/AlternativeImage
    page = PageType()
    pcgts.set_Page(page)
    page_image = AlternativeImageType()
    page.add_AlternativeImage(page_image)
    # TextRegion/AlternativeImage
    region = TextRegionType()
    page.add_TextRegion(region)
    region_image = AlternativeImageType()
    region.add_AlternativeImage(region_image)
    # TextLine/AlternativeImage
    line = TextLineType()
    region.add_TextLine(line)
    line_image = AlternativeImageType()
    line.add_AlternativeImage(line_image)
    # Word/AlternativeImage
    word = WordType()
    line.add_Word(word)
    word_image = AlternativeImageType()
    word.add_AlternativeImage(word_image)
    # Glyph/AlternativeImage
    glyph = GlyphType()
    word.add_Glyph(glyph)
    glyph_image = AlternativeImageType()
    glyph.add_AlternativeImage(glyph_image)

    # assert: the hierarchy is wired up as built
    assert pcgts.get_Page() is page
    assert page.get_TextRegion() == [region]
    assert region.get_TextLine() == [line]
    assert line.get_Word() == [word]
    assert word.get_Glyph() == [glyph]

    # assert: every level holds exactly its own AlternativeImage
    attachments = (
        (page, page_image),
        (region, region_image),
        (line, line_image),
        (word, word_image),
        (glyph, glyph_image),
    )
    for parent, image in attachments:
        images = parent.get_AlternativeImage()
        assert len(images) == 1
        assert images[0] is image


def test_simple_types(faulty_glyphs):
    """``TextTypeSimpleType`` members compare equal to their string values and are ``str``."""
    regions = faulty_glyphs.get_Page().get_TextRegion()
    reg = regions[0]

    # assert: the value parsed from the file matches the CREDIT member
    assert isinstance(reg.get_type(), str)
    assert reg.get_type() == TextTypeSimpleType.CREDIT
    assert isinstance(TextTypeSimpleType.CREDIT, str)
    assert reg.get_type() == 'credit'

    # The original repeated the CREDIT isinstance check here; check the member
    # that is actually assigned next instead.
    assert isinstance(TextTypeSimpleType.PAGENUMBER, str)
    reg.set_type(TextTypeSimpleType.PAGENUMBER)
    assert reg.get_type() == 'page-number'
    assert isinstance(reg.get_type(), str)


def test_orderedgroup_export_order():
    """
    See https://github.com/OCR-D/core/issues/475

    Scrambles the ``index`` of some reading-order children and checks that
    ``get_AllIndexed()`` as well as a serialize/parse round trip still yield
    the children with indexes 0..21 in ascending order.
    """
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    xml_before = to_xml(og)
    children = og.get_AllIndexed()

    # assert
    assert len(children) == 22
    assert [c.index for c in children] == list(range(0, 22))
    # mix up the indexes
    children[0].index = 11
    children[11].index = 3
    children[3].index = 0
    # the list fetched earlier keeps its positions, only the index values moved
    assert [c.index for c in children] == [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
    # a fresh lookup is ordered by index again
    assert [c.index for c in og.get_AllIndexed()] == list(range(0, 22))
    assert isinstance(og.get_AllIndexed()[1], OrderedGroupIndexedType)
    # serialize and make sure the correct order was serialized
    new_pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
    new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    assert [c.index for c in new_og.get_AllIndexed()] == list(range(0, 22))

    xml_after = to_xml(new_og)
    # TODO why not working?
    #assert xml_after == xml_before


def test_empty_groups_to_regionrefindexed():
    """
    Corollary to https://github.com/OCR-D/core/issues/475

    After a serialize/parse round trip, the groups at positions 1 and 21 are
    expected to come back as ``RegionRefIndexedType``.
    """
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    children = og.get_AllIndexed()

    # assert
    assert isinstance(children[1], OrderedGroupIndexedType)
    assert isinstance(children[21], UnorderedGroupIndexedType)
    # empty all the elements in the first OrderedGroupIndexed
    children[1].set_RegionRefIndexed([])
    # serialize and parse to see the empty group converted
    pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    children = og.get_AllIndexed()
    assert isinstance(children[1], RegionRefIndexedType)
    assert isinstance(children[21], RegionRefIndexedType)


def test_all_regions_without_reading_order():
    """
    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797

    Pins the region counts returned by ``get_AllRegions`` for various
    ``depth`` and ``classes`` arguments with the default ``order``.
    """
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # act
    pg = pcgts.get_Page()

    # assert
    assert len(pg.get_AllRegions()) == 65
    assert len(pg.get_AllRegions(depth=0)) == 65
    assert len(pg.get_AllRegions(depth=1)) == 45
    assert len(pg.get_AllRegions(depth=2)) == 65
    assert len(pg.get_AllRegions(depth=3)) == 65
    assert len(pg.get_AllRegions(classes=['Separator'])) == 25
    assert len(pg.get_AllRegions(classes=['Table'])) == 3
    assert len(pg.get_AllRegions(classes=['Text'])) == 37
    assert len(pg.get_AllRegions(classes=['Text'], depth=1)) == 17
    assert len(pg.get_AllRegions(classes=['Text'], depth=2)) == 37


def test_get_all_regions_invalid_order_raises_exception():
    """``get_AllRegions`` rejects an unknown ``order`` value with a descriptive message."""
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pg = parseString(f.read(), silence=True).get_Page()

    # act
    with pytest.raises(Exception) as exc:
        pg.get_AllRegions(order='random')

    # assert
    assert "Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not 'random'" in str(exc.value)


def test_get_all_regions_invalid_depth_raises_exception():
    """``get_AllRegions`` rejects a negative ``depth`` with a descriptive message."""
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pg = parseString(f.read(), silence=True).get_Page()

    # act
    with pytest.raises(Exception) as exc:
        pg.get_AllRegions(depth=-1)

    # assert
    assert "Argument 'depth' must be an integer greater-or-equal 0, not '-1'" in str(exc.value)


def test_all_regions_with_reading_order():
    """
    https://github.com/OCR-D/core/pull/479
    https://github.com/OCR-D/core/issues/240#issuecomment-493135797

    Pins the region counts returned by ``get_AllRegions`` for the
    ``reading-order`` and ``reading-order-only`` orderings.
    """

    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pg = parseString(f.read(), silence=True).get_Page()

    # assert
    assert len(pg.get_AllRegions(order='reading-order-only')) == 40
    assert len(pg.get_AllRegions(order='reading-order-only', depth=1)) == 20
    assert len(pg.get_AllRegions(order='reading-order-only', depth=2)) == 40
    assert len(pg.get_AllRegions(order='reading-order', depth=0)) == 65
    assert len(pg.get_AllRegions(order='reading-order', depth=1)) == 45
    assert len(pg.get_AllRegions(order='reading-order', depth=2)) == 65
    assert len(pg.get_AllRegions(classes=['Table'], order='reading-order')) == 3
    assert len(pg.get_AllRegions(classes=['Text'], order='reading-order')) == 37
    assert len(pg.get_AllRegions(classes=['Text'], order='reading-order', depth=1)) == 17


def test_get_unordered_group_children():
    """The first UnorderedGroupIndexed of the reading order reports exactly one child."""
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # act
    ug = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup().get_UnorderedGroupIndexed()[0]

    # assert
    assert len(ug.get_UnorderedGroupChildren()) == 1


def test_get_all_indexed_classes():
    """``get_AllIndexed(classes=...)`` filters the reading-order children by class name."""
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # assert
    assert len(og.get_AllIndexed(classes=['RegionRef'])) == 17
    assert len(og.get_AllIndexed(classes=['OrderedGroup'])) == 3
    assert len(og.get_AllIndexed(classes=['UnorderedGroup'])) == 2


def test_get_all_indexed_index_sort():
    """``index_sort`` controls whether ``get_AllIndexed`` orders by ``index``.

    After swapping the indexes of the two unordered groups, the sorted lookup
    yields [20, 21], the unsorted one [21, 20]; once ``sort_AllIndexed()`` has
    run, the unsorted lookup yields [20, 21] as well.
    """
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        og = parseString(f.read(), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    unogs = og.get_UnorderedGroupIndexed()

    # assert
    assert [x.index for x in unogs] == [20, 21]
    unogs[0].index = 21
    unogs[1].index = 20
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=True)] == [20, 21]
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)] == [21, 20]
    og.sort_AllIndexed()
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)] == [20, 21]


def test_extend_all_indexed_no_validation():
    """Without validation, ``extend_AllIndexed`` appends and renumbers the new entries.

    The three added refs carry indexes 3, 2, 1 but end up as the last three
    RegionRefIndexed entries with indexes 22, 23, 24.
    """
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        og = parseString(f.read(), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    og.extend_AllIndexed([
        RegionRefIndexedType(index=3, id='r3'),
        RegionRefIndexedType(index=2, id='r2'),
        RegionRefIndexedType(index=1, id='r1'),
    ])
    rrs = og.get_RegionRefIndexed()

    # assert
    assert [x.index for x in rrs][-3:] == [22, 23, 24]


def test_get_all_text_lines():
    """``get_AllTextLines`` on the gutachten page returns 55 lines."""
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        page = parseString(f.read(), silence=True).get_Page()

    # assert
    assert len(page.get_AllTextLines()) == 55


def test_extend_all_indexed_validate_continuity():
    """With ``validate_continuity=True``, reusing an existing ``index`` raises."""
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'rb') as f:
        og = parseString(f.read(), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    with pytest.raises(Exception) as index_exc:
        og.extend_AllIndexed([
            RegionRefIndexedType(index=3, id='r3'),
            RegionRefIndexedType(index=2, id='r2'),
            RegionRefIndexedType(index=1, id='r1'),
        ], validate_continuity=True)

    # assert
    assert "@index already used: 1" in str(index_exc.value)


def test_get_all_alternative_image_paths():
    """``get_AllAlternativeImagePaths`` collects image paths per requested level."""
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)

    # assert
    assert pcgts.get_AllAlternativeImagePaths(page=False, region=False, line=False) == []
    assert pcgts.get_AllAlternativeImagePaths(page=True, region=False, line=False) == [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png']
    # (this assertion was duplicated verbatim in the original; one copy suffices)
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=False)) == 12
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=True)) == 36

    # TODO: Test with word/glyph-level AlternativeImages
    # would work with len == 36
    # assert len(pcgts.get_AllAlternativeImagePaths(word=False)) == 37


def test_get_AllAlternativeImages():
    """``PageType.get_AllAlternativeImages`` returns ``AlternativeImageType`` objects per level."""
    # arrange
    # Read raw bytes: a text-mode read would decode with the locale's default
    # encoding before re-encoding, which is not guaranteed to be UTF-8.
    with open(assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml'), 'rb') as f:
        pcgts = parseString(f.read(), silence=True)
    # The file is closed here; the assertions only need the parsed document.
    page = pcgts.get_Page()

    # assert
    assert page.get_AllAlternativeImages(page=False, region=False, line=False) == []
    assert [x.filename for x in page.get_AllAlternativeImages(page=True, region=False, line=False)] == [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png']
    assert isinstance(page.get_AllAlternativeImages()[0], AlternativeImageType)


def test_serialize_no_empty_readingorder():
    """
    https://github.com/OCR-D/core/issues/602

    An empty ReadingOrder set on the page must be gone after a
    serialize/parse round trip.
    """
    image_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')
    ocrd_file = create_ocrd_file_with_defaults(local_filename=image_path)
    original = page_from_image(ocrd_file)

    # attach an empty ReadingOrder and confirm it is present before the round trip
    original.get_Page().set_ReadingOrder(ReadingOrderType())
    assert original.get_Page().get_ReadingOrder()

    serialized = to_xml(original, skip_declaration=True)
    reparsed = parseString(serialized)
    assert not reparsed.get_Page().get_ReadingOrder()


def test_hashable():
    """
    https://github.com/OCR-D/ocrd_segment/issues/45

    PcGts and Page objects must be usable as set members.
    """
    pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
    page = pcgts.get_Page()
    testset = set()
    # Adding raises TypeError if the objects are unhashable.
    testset.add(pcgts)
    testset.add(page)

    # assert: both objects were stored and can be found again
    assert pcgts in testset
    assert page in testset
    # NOTE(review): assumes a PcGts and its Page never compare equal - confirm
    assert len(testset) == 2


def test_id():
    """
    https://github.com/OCR-D/core/issues/682

    Pins the ``id`` values reported by a parsed PcGts document and its Page.
    """
    page_xml_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')
    document = parse(page_xml_path)

    # assert
    expected_document_id = 'PAGE_0017_PAGE'
    assert document.id == expected_document_id

    # TODO: is this *really* desired?
    # I would expect for a single Page-Element the ID is like from the top-level-Pgts-Container, not like a fileName
    expected_page_id = 'OCR-D-IMG/INPUT_0017.tif'
    assert document.get_Page().id == expected_page_id


# Entry point for running this module as a script: hands this file's path to
# ``main``, which is imported from ``tests.base`` (its behavior is defined there).
if __name__ == '__main__':
    main(__file__)


1			# -- coding: utf-8 --
2
3			import pytest
4
5			from tests.base import main, assets, create_ocrd_file_with_defaults
6
7			from ocrd_modelfactory import page_from_image
8			from ocrd_models.ocrd_page_generateds import TextTypeSimpleType
9			from ocrd_models.ocrd_page import (
10			AlternativeImageType,
11			PcGtsType,
12			PageType,
13			TextRegionType,
14			TextLineType,
15			OrderedGroupIndexedType,
16			UnorderedGroupIndexedType,
17			ReadingOrderType,
18			RegionRefIndexedType,
19			WordType,
20			GlyphType,
21
22			parseString,
23			parse,
24			to_xml
25			)
26
27			simple_page = """\
28			<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
29			<Metadata>
30			<Creator>OCR-D</Creator>
31			<Created>2016-09-20T11:09:27.041+02:00</Created>
32			<LastChange>2018-04-25T17:44:49.605+01:00</LastChange>
33			</Metadata>
34			<Page
35			imageFilename="https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif"
36			imageWidth="1457"
37			imageHeight="2083"
38			type="content">
39			<TextRegion type="heading" id="r_1_1" custom="readingOrder {index:0;} structure {type:heading;}">
40			<Coords points="113,365 919,365 919,439 113,439"/>
41			<TextLine id="tl_1" primaryLanguage="German" custom="readingOrder {index:0;} textStyle {offset:0; length:26;fontFamily:Arial; fontSize:17.0; bold:true;}">
42			<Coords points="114,366 918,366 918,438 114,438"/>
43			<Baseline points="114,429 918,429"/>
44			<Word id="w_w1aab1b1b2b1b1ab1" language="German" custom="readingOrder {index:0;} textStyle {offset:0; length:11;fontFamily:Arial; fontSize:17.0; bold:true;}">
45			<Coords points="114,368 442,368 442,437 114,437"/>
46			<TextEquiv conf="1">
47			<Unicode>Berliniſche</Unicode>
48			</TextEquiv>
49			</Word>
50			</TextLine>
51			</TextRegion>
52			</Page>
53			</PcGts>
54			"""
55
56
57			@pytest.fixture(name='faulty_glyphs')
58			def _fixture_faulty_glyphs():
59			with open(assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml'), 'rb') as f:
60			xml_as_str = f.read()
61			pcgts = parseString(xml_as_str, silence=True)
62			yield pcgts
63
64
65			def test_pcgts_id_matches(faulty_glyphs):
66			assert faulty_glyphs.pcGtsId == 'FAULTY_GLYPHS_FILE'
67
68
69			def test_faulty_glyphs_to_xml(faulty_glyphs):
70			as_xml = to_xml(faulty_glyphs)
71			assert ' xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"' in as_xml[:1000]
72			assert ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"', as_xml[:1000]
73			assert '<pc:PcGts' in as_xml[0:100]
74			assert '<pc:TextRegion' in as_xml[1000:3000]
75
76
77			def test_to_xml_unicode_nsprefix():
78			"""see https://github.com/OCR-D/core/pull/474#issuecomment-621477590"""
79
80			# arrange
81			with open(assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml'), 'rb') as f:
82			from_xml = f.read()
83
84			# assert
85			assert '<Unicode>' in from_xml.decode('utf-8'), 'without NS prefix'
86			assert '<Created' in from_xml.decode('utf-8'), 'without NS prefix'
87			pcgts = parseString(from_xml, silence=True)
88			as_xml = to_xml(pcgts)
89			assert '<pc:Unicode>' in as_xml, 'with NS prefix'
90			assert '<pc:Created>' in as_xml, 'with NS prefix'
91
92
93			def test_issue_269(faulty_glyphs):
94			"""
95			@conf is parsed as str but should be float
96			https://github.com/OCR-D/core/issues/269
97			"""
98			# GIGO
99			faulty_glyphs.get_Page().get_TextRegion()[0].get_TextEquiv()[0].set_conf(1.0)
100			assert type(faulty_glyphs.get_Page().get_TextRegion()[0].get_TextEquiv()[0].get_conf()) == float
101			faulty_glyphs.get_Page().get_TextRegion()[0].get_TextEquiv()[0].set_conf('1.0')
102			assert type(faulty_glyphs.get_Page().get_TextRegion()[0].get_TextEquiv()[0].get_conf()) == str
103
104
105			def test_parse_string_succeeds():
106			"""parseString with @conf in TextEquiv won't throw an error"""
107			assert parseString(simple_page, silence=True) is not None
108
109
110			def test_delete_region():
111			pcgts = parseString(simple_page, silence=True)
112			assert len(pcgts.get_Page().get_TextRegion()) == 1
113
114			# act
115			del pcgts.get_Page().get_TextRegion()[0]
116
117			# assert
118			assert len(pcgts.get_Page().get_TextRegion()) == 0
119
120
121			def test_set_image_filename(faulty_glyphs):
122			assert faulty_glyphs.get_Page().imageFilename == '00000259.sw.tif'
123
124			# act
125			faulty_glyphs.get_Page().imageFilename = 'foo'
126
127			# assert
128			assert faulty_glyphs.get_Page().imageFilename == 'foo'
129
130
131			def test_alternative_image_additions():
132			pcgts = PcGtsType(pcGtsId="foo")
133			assert pcgts.pcGtsId == 'foo'
134
135			# act
136			# Page/AlternativeImage
137			page = PageType()
138			pcgts.set_Page(page)
139			page.add_AlternativeImage(AlternativeImageType())
140			# TextRegion/AlternativeImage
141			region = TextRegionType()
142			page.add_TextRegion(region)
143			region.add_AlternativeImage(AlternativeImageType())
144			# TextLine/AlternativeImage
145			line = TextLineType()
146			region.add_TextLine(line)
147			line.add_AlternativeImage(AlternativeImageType())
148			# Word/AlternativeImage
149			word = WordType()
150			line.add_Word(word)
151			word.add_AlternativeImage(AlternativeImageType())
152			# Glyph/AlternativeImage
153			glyph = GlyphType()
154			word.add_Glyph(glyph)
155			glyph.add_AlternativeImage(AlternativeImageType())
156
157			# TODO assertions
158
159
160			def test_simple_types(faulty_glyphs):
161			regions = faulty_glyphs.get_Page().get_TextRegion()
162			reg = regions[0]
163
164			# assert
165			assert isinstance(reg.get_type(), str)
166			assert reg.get_type() == TextTypeSimpleType.CREDIT
167			assert isinstance(TextTypeSimpleType.CREDIT, str)
168			assert reg.get_type() == 'credit'
169			assert isinstance(TextTypeSimpleType.CREDIT, str)
170			reg.set_type(TextTypeSimpleType.PAGENUMBER)
171			assert reg.get_type() == 'page-number'
172			assert isinstance(reg.get_type(), str)
173
174
175			def test_orderedgroup_export_order():
176			"""
177			See https://github.com/OCR-D/core/issues/475
178			"""
179			# arrange
180			with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
181			pcgts = parseString(f.read().encode('utf8'), silence=True)
182
183			# act
184			og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
185			xml_before = to_xml(og)
186			children = og.get_AllIndexed()
187
188			# assert
189			assert len(children) == 22
190			assert [c.index for c in children] == list(range(0, 22))
191			# mix up the indexes
192			children[0].index = 11
193			children[11].index = 3
194			children[3].index = 0
195			assert [c.index for c in children] == [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
196			assert [c.index for c in og.get_AllIndexed()] == list(range(0, 22))
197			assert og.get_AllIndexed()[1].__class__ == OrderedGroupIndexedType
198			# serialize and make sure the correct order was serialized
199			new_pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
200			new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
201			assert [c.index for c in new_og.get_AllIndexed()] == list(range(0, 22))
202
203			xml_after = to_xml(new_og)
204			# TODO why not working?
205			#assert xml_after == xml_before
206
207
208			def test_empty_groups_to_regionrefindexed():
209			"""
210			Corrolary See https://github.com/OCR-D/core/issues/475
211			"""
212			# arrange
213			with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
214			pcgts = parseString(f.read().encode('utf8'), silence=True)
215
216			og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
217			children = og.get_AllIndexed()
218
219			# assert
220			assert isinstance(children[1], OrderedGroupIndexedType)
221			assert isinstance(children[21], UnorderedGroupIndexedType)
222			# empty all the elements in the first orederdGroupIndexed
223			children[1].set_RegionRefIndexed([])
224			# serialize apnd parse to see empty group converted
225			pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
226			og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
227			children = og.get_AllIndexed()
228			assert isinstance(children[1], RegionRefIndexedType)
229			assert isinstance(children[21], RegionRefIndexedType)
230
231
232			def test_all_regions_without_reading_order():
233			"""
234			https://github.com/OCR-D/core/pull/479
235			https://github.com/OCR-D/core/issues/240#issuecomment-493135797
236			"""
237			with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
238			pcgts = parseString(f.read().encode('utf8'), silence=True)
239
240			# act
241			pg = pcgts.get_Page()
242
243			# assert
244			assert len(pg.get_AllRegions()) == 65
245			assert len(pg.get_AllRegions(depth=0)) == 65
246			assert len(pg.get_AllRegions(depth=1)) == 45
247			assert len(pg.get_AllRegions(depth=2)) == 65
248			assert len(pg.get_AllRegions(depth=3)) == 65
249			assert len(pg.get_AllRegions(classes=['Separator'])) == 25
250			assert len(pg.get_AllRegions(classes=['Table'])) == 3
251			assert len(pg.get_AllRegions(classes=['Text'])) == 37
252			assert len(pg.get_AllRegions(classes=['Text'], depth=1)) == 17
253			assert len(pg.get_AllRegions(classes=['Text'], depth=2)) == 37
254
255
256			def test_get_all_regions_invalid_order_raises_exception():
257			# arrange
258			with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
259			pg = parseString(f.read().encode('utf8'), silence=True).get_Page()
260
261			# act
262			with pytest.raises(Exception) as exc:
263			pg.get_AllRegions(order='random')
264
265			# assert
266			assert "Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not 'random'" in str(exc.value)
267
268
269			def test_get_all_regions_invalid_depth_raises_exception():
270			# arrange
271			with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
272			pg = parseString(f.read().encode('utf8'), silence=True).get_Page()
273
274			# act
275			with pytest.raises(Exception) as exc:
276			pg.get_AllRegions(depth=-1)
277
278			# assert
279			assert "Argument 'depth' must be an integer greater-or-equal 0, not '-1'" in str(exc.value)
280
281
282			def test_all_regions_with_reading_order():
283			"""
284			https://github.com/OCR-D/core/pull/479
285			https://github.com/OCR-D/core/issues/240#issuecomment-493135797
286			"""
287
288			# arrange
289			with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
290			pg = parseString(f.read().encode('utf8'), silence=True).get_Page()
291
292			# assert
293			assert len(pg.get_AllRegions(order='reading-order-only')) == 40
294			assert len(pg.get_AllRegions(order='reading-order-only', depth=1)) == 20
295			assert len(pg.get_AllRegions(order='reading-order-only', depth=2)) == 40
296			assert len(pg.get_AllRegions(order='reading-order', depth=0)) == 65
297			assert len(pg.get_AllRegions(order='reading-order', depth=1)) == 45
298			assert len(pg.get_AllRegions(order='reading-order', depth=2)) == 65
299			assert len(pg.get_AllRegions(classes=['Table'], order='reading-order')) == 3
300			assert len(pg.get_AllRegions(classes=['Text'], order='reading-order')) == 37
301			assert len(pg.get_AllRegions(classes=['Text'], order='reading-order', depth=1)) == 17
302
303
304			def test_get_unordered_group_children():
305			# arrange
306			with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
307			pcgts = parseString(f.read().encode('utf8'), silence=True)
308
309			# act
310			ug = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup().get_UnorderedGroupIndexed()[0]
311
312			# assert
313			assert len(ug.get_UnorderedGroupChildren()) == 1
314
315
def test_get_all_indexed_classes():
    """
    Check how many children ``get_AllIndexed`` returns per requested class
    for the top-level ordered group.
    """
    # arrange
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right below,
    # so relying on the platform default encoding could corrupt or reject it.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r', encoding='utf-8') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()

    # assert
    assert len(og.get_AllIndexed(classes=['RegionRef'])) == 17
    assert len(og.get_AllIndexed(classes=['OrderedGroup'])) == 3
    assert len(og.get_AllIndexed(classes=['UnorderedGroup'])) == 2
328
329
def test_get_all_indexed_index_sort():
    """
    Check that ``get_AllIndexed`` honours ``index_sort`` and that
    ``sort_AllIndexed`` brings the children back into index order.
    """
    # arrange
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right below,
    # so relying on the platform default encoding could corrupt or reject it.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r', encoding='utf-8') as f:
        og = parseString(f.read().encode('utf8'), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    unogs = og.get_UnorderedGroupIndexed()

    # assert
    assert [x.index for x in unogs] == [20, 21]
    # Swap the two indexes so that list order and index order disagree.
    unogs[0].index = 21
    unogs[1].index = 20
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=True)] == [20, 21]
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)] == [21, 20]
    og.sort_AllIndexed()
    assert [x.index for x in og.get_AllIndexed(classes=['UnorderedGroup'], index_sort=False)] == [20, 21]
346
347
def test_extend_all_indexed_no_validation():
    """
    Check that ``extend_AllIndexed`` without continuity validation accepts
    region refs whose indexes are already taken; the last three region refs
    are expected to end up with the indexes 22, 23 and 24.
    """
    # arrange
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right below,
    # so relying on the platform default encoding could corrupt or reject it.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r', encoding='utf-8') as f:
        og = parseString(f.read().encode('utf8'), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    og.extend_AllIndexed([
        RegionRefIndexedType(index=3, id='r3'),
        RegionRefIndexedType(index=2, id='r2'),
        RegionRefIndexedType(index=1, id='r1'),
    ])
    rrs = og.get_RegionRefIndexed()

    # assert
    assert [x.index for x in rrs][-3:] == [22, 23, 24]
363
364
def test_get_all_text_lines():
    """Check the number of text lines ``get_AllTextLines`` finds on the page."""
    # arrange
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right below,
    # so relying on the platform default encoding could corrupt or reject it.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r', encoding='utf-8') as f:
        page = parseString(f.read().encode('utf8'), silence=True).get_Page()

    # assert
    assert len(page.get_AllTextLines()) == 55
371
372
def test_extend_all_indexed_validate_continuity():
    """
    Check that ``extend_AllIndexed`` with ``validate_continuity=True`` raises
    when an added region ref reuses an index that is already taken.
    """
    # arrange
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right below,
    # so relying on the platform default encoding could corrupt or reject it.
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r', encoding='utf-8') as f:
        og = parseString(f.read().encode('utf8'), silence=True).get_Page().get_ReadingOrder().get_OrderedGroup()

    # act
    with pytest.raises(Exception) as index_exc:
        og.extend_AllIndexed([
            RegionRefIndexedType(index=3, id='r3'),
            RegionRefIndexedType(index=2, id='r2'),
            RegionRefIndexedType(index=1, id='r1'),
        ], validate_continuity=True)

    # assert
    assert "@index already used: 1" in str(index_exc.value)
387
388
def test_get_all_alternative_image_paths():
    """
    Check the paths ``get_AllAlternativeImagePaths`` returns when page,
    region and line level images are switched on one after another.
    """
    # arrange
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right below,
    # so relying on the platform default encoding could corrupt or reject it.
    with open(assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml'), 'r', encoding='utf-8') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)

    # assert
    assert pcgts.get_AllAlternativeImagePaths(page=False, region=False, line=False) == []
    assert pcgts.get_AllAlternativeImagePaths(page=True, region=False, line=False) == [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png']
    # The page+region check used to be asserted twice verbatim; once is enough.
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=False)) == 12
    assert len(pcgts.get_AllAlternativeImagePaths(page=True, region=True, line=True)) == 36

    # TODO: Test with word/glyph-level AlternativeImages
    # would work with len == 36
    # assert len(pcgts.get_AllAlternativeImagePaths(word=False)) == 37
410
411
def test_get_AllAlternativeImages():
    """
    Check that ``get_AllAlternativeImages`` returns ``AlternativeImageType``
    objects and that the page-level selection yields the expected filenames.
    """
    # arrange
    # Decode explicitly as UTF-8: the text is re-encoded as UTF-8 right below,
    # so relying on the platform default encoding could corrupt or reject it.
    with open(assets.path_to('kant_aufklaerung_1784-complex/data/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP/OCR-D-OCR-OCRO-fraktur-SEG-LINE-tesseract-ocropy-DEWARP_0001.xml'), 'r', encoding='utf-8') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
        page = pcgts.get_Page()

    # assert
    assert page.get_AllAlternativeImages(page=False, region=False, line=False) == []
    assert [x.filename for x in page.get_AllAlternativeImages(page=True, region=False, line=False)] == [
        'OCR-D-IMG-BIN/OCR-D-IMG-BINPAGE-sauvola_0001-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-CROP/OCR-D-IMG-CROP_0001.png',
        'OCR-D-IMG-BIN/INPUT_0017-BIN_sauvola-ms-split.png',
        'OCR-D-IMG-DESPECK/OCR-D-IMG-DESPECK_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png',
        'OCR-D-IMG-DESKEW/OCR-D-IMG-DESKEW_0001.png']
    assert isinstance(page.get_AllAlternativeImages()[0], AlternativeImageType)
425
426
def test_serialize_no_empty_readingorder():
    """
    An empty ReadingOrder set on a page is expected to be gone after the
    document has been serialized with ``to_xml`` and parsed again.

    https://github.com/OCR-D/core/issues/602
    """
    # arrange
    image_path = assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')
    ocrd_file = create_ocrd_file_with_defaults(local_filename=image_path)
    pcgts = page_from_image(ocrd_file)

    # act + assert: the empty ReadingOrder is present on the in-memory page
    pcgts.get_Page().set_ReadingOrder(ReadingOrderType())
    assert pcgts.get_Page().get_ReadingOrder()

    # act + assert: it is absent after the XML round trip
    xml = to_xml(pcgts, skip_declaration=True)
    pcgts = parseString(xml)
    assert not pcgts.get_Page().get_ReadingOrder()
436
437
def test_hashable():
    """
    PcGts and Page objects must be usable as set members.

    https://github.com/OCR-D/ocrd_segment/issues/45
    """
    # arrange
    pcgts = page_from_image(create_ocrd_file_with_defaults(local_filename=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
    page = pcgts.get_Page()

    # act
    # Adding to a set raises TypeError if either object is unhashable.
    testset = set()
    testset.add(pcgts)
    testset.add(page)

    # assert
    # Both objects must be found again, i.e. their hash is stable.
    assert pcgts in testset
    assert page in testset
449
450
def test_id():
    """
    Check the ``id`` reported by a parsed PcGts document and by its Page.

    https://github.com/OCR-D/core/issues/682
    """
    # arrange
    expected_pcgts_id = 'PAGE_0017_PAGE'
    expected_page_id = 'OCR-D-IMG/INPUT_0017.tif'
    pcgts = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'))

    # assert
    assert pcgts.id == expected_pcgts_id

    # TODO: is this really desired?
    # For a single Page element one would expect an ID like the one of the
    # top-level PcGts container rather than something that looks like a file name.
    page = pcgts.get_Page()
    assert page.id == expected_page_id
464
465
# When executed as a script, hand this file to ``main`` from ``tests.base``.
if __name__ == "__main__":
    main(__file__)
468

OCR-D / core

Pull Request — master (#1184)

tests.model.test_ocrd_page B

Complexity

Size/Duplication

Importance

27 Functions

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like