Code Duplication    Length = 49-49 lines in 2 locations

src/ocrd_page_user_methods/get_AllAlternativeImagePaths.py 1 location

@@ 1-49 (lines=49) @@
1
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
    """
    List the ``pc:AlternativeImage/@filename`` values found in this PAGE-XML document.

    The document is exported to an in-memory string, parsed again and
    queried with XPath, one query per requested hierarchy level. When
    every level is requested, a single document-wide query is used instead.

    Arguments:
        page (boolean): Include images on ``pc:Page`` level
        region (boolean): Include images on ``pc:*Region`` level
        line (boolean): Include images on ``pc:TextLine`` level
        word (boolean): Include images on ``pc:Word`` level
        glyph (boolean): Include images on ``pc:Glyph`` level

    Returns:
        a list of image filename strings
    """
    from .constants import NAMESPACES, PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    from io import StringIO  # pylint: disable=import-outside-toplevel

    # XXX Only the image **paths** matter here, so serializing the document
    # and running XPath on the result is simpler than walking the generateDS
    # object tree, even though it is probably not the fastest approach.
    page_ns = NAMESPACES['page']
    xml_buffer = StringIO()
    self.export(
        outfile=xml_buffer,
        level=0,
        name_='PcGts',
        namespaceprefix_='pc:',
        namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
            page_ns, page_ns, page_ns))
    tree = parsexmlstring_(xml_buffer.getvalue())  # pylint: disable=undefined-variable

    def filenames(expression):
        # Evaluate one XPath expression against the re-parsed document.
        return tree.xpath(expression, namespaces=NAMESPACES)

    # shortcut: every level requested, so one query covers the whole document
    if all((page, region, line, word, glyph)):
        return list(filenames('//page:AlternativeImage/@filename'))

    # Otherwise gather the per-level expressions in hierarchy order ...
    expressions = []
    if page:
        expressions.append('/page:PcGts/page:Page/page:AlternativeImage/@filename')
    if region:
        expressions.extend(
            '//page:%sRegion/page:AlternativeImage/@filename' % class_
            for class_ in PAGE_REGION_TYPES)
    if line:
        expressions.append('//page:TextLine/page:AlternativeImage/@filename')
    if word:
        expressions.append('//page:Word/page:AlternativeImage/@filename')
    if glyph:
        expressions.append('//page:Glyph/page:AlternativeImage/@filename')

    # ... and concatenate their results in that same order.
    paths = []
    for expression in expressions:
        paths.extend(filenames(expression))
    return paths
50

src/ocrd_models/ocrd_page_generateds.py 1 location

@@ 1722-1770 (lines=49) @@
1719
        if hasattr(self, 'pcGtsId'):
1720
            return self.pcGtsId or ''
1721
        return make_xml_id(self.imageFilename)
1722
    def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
        """
        List the ``pc:AlternativeImage/@filename`` values found in this PAGE-XML document.

        The document is exported to an in-memory string, parsed again and
        queried with XPath, one query per requested hierarchy level. When
        every level is requested, a single document-wide query is used instead.

        Arguments:
            page (boolean): Include images on ``pc:Page`` level
            region (boolean): Include images on ``pc:*Region`` level
            line (boolean): Include images on ``pc:TextLine`` level
            word (boolean): Include images on ``pc:Word`` level
            glyph (boolean): Include images on ``pc:Glyph`` level

        Returns:
            a list of image filename strings
        """
        from .constants import NAMESPACES, PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
        from io import StringIO  # pylint: disable=import-outside-toplevel

        # XXX Only the image **paths** matter here, so serializing the document
        # and running XPath on the result is simpler than walking the generateDS
        # object tree, even though it is probably not the fastest approach.
        page_ns = NAMESPACES['page']
        xml_buffer = StringIO()
        self.export(
            outfile=xml_buffer,
            level=0,
            name_='PcGts',
            namespaceprefix_='pc:',
            namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
                page_ns, page_ns, page_ns))
        tree = parsexmlstring_(xml_buffer.getvalue())  # pylint: disable=undefined-variable

        def filenames(expression):
            # Evaluate one XPath expression against the re-parsed document.
            return tree.xpath(expression, namespaces=NAMESPACES)

        # shortcut: every level requested, so one query covers the whole document
        if all((page, region, line, word, glyph)):
            return list(filenames('//page:AlternativeImage/@filename'))

        # Otherwise gather the per-level expressions in hierarchy order ...
        expressions = []
        if page:
            expressions.append('/page:PcGts/page:Page/page:AlternativeImage/@filename')
        if region:
            expressions.extend(
                '//page:%sRegion/page:AlternativeImage/@filename' % class_
                for class_ in PAGE_REGION_TYPES)
        if line:
            expressions.append('//page:TextLine/page:AlternativeImage/@filename')
        if word:
            expressions.append('//page:Word/page:AlternativeImage/@filename')
        if glyph:
            expressions.append('//page:Glyph/page:AlternativeImage/@filename')

        # ... and concatenate their results in that same order.
        paths = []
        for expression in expressions:
            paths.extend(filenames(expression))
        return paths
1771
    def prune_ReadingOrder(self):
1772
        """
1773
        Remove any empty ReadingOrder elements