| @@ 1-49 (lines=49) @@ | ||
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
    """
    Collect the ``pc:AlternativeImage/@filename`` values found in this PAGE-XML document.

    The document is exported to a string, re-parsed and queried with XPath;
    each flag selects one hierarchy level whose images are included.

    Arguments:
        page (boolean): Include images directly below ``pc:Page``
        region (boolean): Include images below any ``pc:*Region``
            (one query per entry of ``PAGE_REGION_TYPES``)
        line (boolean): Include images below ``pc:TextLine``
        word (boolean): Include images below ``pc:Word``
        glyph (boolean): Include images below ``pc:Glyph``

    Returns:
        a list of image filename strings
    """
    from .constants import NAMESPACES, PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    from io import StringIO # pylint: disable=import-outside-toplevel

    # Only the image **paths** matter here, so serializing the document and
    # querying it with XPath is simpler than walking the generateDS objects
    # (though quite possibly not the most efficient approach).
    page_ns = NAMESPACES['page']
    buffer = StringIO()
    self.export(
        outfile=buffer,
        level=0,
        name_='PcGts',
        namespaceprefix_='pc:',
        namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
            page_ns,
            page_ns,
            page_ns
        ))
    tree = parsexmlstring_(buffer.getvalue()) # pylint: disable=undefined-variable

    if page and region and line and word and glyph:
        # Every level requested: a single document-wide query suffices.
        queries = ['//page:AlternativeImage/@filename']
    else:
        # Otherwise query level by level, in the order page, region, line, word, glyph.
        queries = []
        if page:
            queries.append('/page:PcGts/page:Page/page:AlternativeImage/@filename')
        if region:
            queries.extend(
                '//page:%sRegion/page:AlternativeImage/@filename' % class_
                for class_ in PAGE_REGION_TYPES)
        if line:
            queries.append('//page:TextLine/page:AlternativeImage/@filename')
        if word:
            queries.append('//page:Word/page:AlternativeImage/@filename')
        if glyph:
            queries.append('//page:Glyph/page:AlternativeImage/@filename')

    paths = []
    for query in queries:
        paths += tree.xpath(query, namespaces=NAMESPACES)
    return paths
|
| 50 | ||
| @@ 1722-1770 (lines=49) @@ | ||
| 1719 | if hasattr(self, 'pcGtsId'): |
|
| 1720 | return self.pcGtsId or '' |
|
| 1721 | return make_xml_id(self.imageFilename) |
|
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
    """
    Collect the ``pc:AlternativeImage/@filename`` values found in this PAGE-XML document.

    The document is exported to a string, re-parsed and queried with XPath;
    each flag selects one hierarchy level whose images are included.

    Arguments:
        page (boolean): Include images directly below ``pc:Page``
        region (boolean): Include images below any ``pc:*Region``
            (one query per entry of ``PAGE_REGION_TYPES``)
        line (boolean): Include images below ``pc:TextLine``
        word (boolean): Include images below ``pc:Word``
        glyph (boolean): Include images below ``pc:Glyph``

    Returns:
        a list of image filename strings
    """
    from .constants import NAMESPACES, PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    from io import StringIO # pylint: disable=import-outside-toplevel

    # Only the image **paths** matter here, so serializing the document and
    # querying it with XPath is simpler than walking the generateDS objects
    # (though quite possibly not the most efficient approach).
    page_ns = NAMESPACES['page']
    buffer = StringIO()
    self.export(
        outfile=buffer,
        level=0,
        name_='PcGts',
        namespaceprefix_='pc:',
        namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
            page_ns,
            page_ns,
            page_ns
        ))
    tree = parsexmlstring_(buffer.getvalue()) # pylint: disable=undefined-variable

    if page and region and line and word and glyph:
        # Every level requested: a single document-wide query suffices.
        queries = ['//page:AlternativeImage/@filename']
    else:
        # Otherwise query level by level, in the order page, region, line, word, glyph.
        queries = []
        if page:
            queries.append('/page:PcGts/page:Page/page:AlternativeImage/@filename')
        if region:
            queries.extend(
                '//page:%sRegion/page:AlternativeImage/@filename' % class_
                for class_ in PAGE_REGION_TYPES)
        if line:
            queries.append('//page:TextLine/page:AlternativeImage/@filename')
        if word:
            queries.append('//page:Word/page:AlternativeImage/@filename')
        if glyph:
            queries.append('//page:Glyph/page:AlternativeImage/@filename')

    paths = []
    for query in queries:
        paths += tree.xpath(query, namespaces=NAMESPACES)
    return paths
|
| 1771 | def prune_ReadingOrder(self): |
|
| 1772 | """ |
|
| 1773 | Remove any empty ReadingOrder elements |
|