@@ 1-49 (lines=49) @@ | ||
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
    """
    Collect the ``@filename`` of ``pc:AlternativeImage`` elements in this PAGE-XML document.

    The document is serialized with ``self.export``, parsed again and queried
    with XPath. When every level is requested, a single document-wide query is
    used; otherwise one query per requested level is run, in the order
    page, region, line, word, glyph.

    Arguments:
        page (boolean): Include images directly below ``pc:Page``
        region (boolean): Include images directly below any ``pc:*Region``
        line (boolean): Include images directly below ``pc:TextLine``
        word (boolean): Include images directly below ``pc:Word``
        glyph (boolean): Include images directly below ``pc:Glyph``

    Returns:
        a list of image filename strings
    """
    from .constants import NAMESPACES, PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    from io import StringIO  # pylint: disable=import-outside-toplevel

    # NOTE: only the image **paths** matter here, so a round trip through
    # export + parse + XPath is simpler than walking the generateDS object
    # tree, even though it is probably not the most efficient approach.
    page_ns = NAMESPACES['page']
    buffer = StringIO()
    self.export(
        outfile=buffer,
        level=0,
        name_='PcGts',
        namespaceprefix_='pc:',
        namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
            page_ns,
            page_ns,
            page_ns
        ))
    tree = parsexmlstring_(buffer.getvalue())  # pylint: disable=undefined-variable

    filenames = []

    # Shortcut: every level requested, so one query over the whole document suffices.
    if all((page, region, line, word, glyph)):
        filenames.extend(tree.xpath('//page:AlternativeImage/@filename', namespaces=NAMESPACES))
        return filenames

    # Otherwise assemble one XPath expression per requested level.
    queries = []
    if page:
        queries.append('/page:PcGts/page:Page/page:AlternativeImage/@filename')
    if region:
        queries.extend(
            '//page:%sRegion/page:AlternativeImage/@filename' % class_
            for class_ in PAGE_REGION_TYPES)
    if line:
        queries.append('//page:TextLine/page:AlternativeImage/@filename')
    if word:
        queries.append('//page:Word/page:AlternativeImage/@filename')
    if glyph:
        queries.append('//page:Glyph/page:AlternativeImage/@filename')

    for query in queries:
        filenames.extend(tree.xpath(query, namespaces=NAMESPACES))
    return filenames
|
50 |
@@ 1722-1770 (lines=49) @@ | ||
1719 | if hasattr(self, 'pcGtsId'): |
|
1720 | return self.pcGtsId or '' |
|
1721 | return make_xml_id(self.imageFilename) |
|
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
    """
    Collect the ``@filename`` of ``pc:AlternativeImage`` elements in this PAGE-XML document.

    The document is serialized with ``self.export``, parsed again and queried
    with XPath. When every level is requested, a single document-wide query is
    used; otherwise one query per requested level is run, in the order
    page, region, line, word, glyph.

    Arguments:
        page (boolean): Include images directly below ``pc:Page``
        region (boolean): Include images directly below any ``pc:*Region``
        line (boolean): Include images directly below ``pc:TextLine``
        word (boolean): Include images directly below ``pc:Word``
        glyph (boolean): Include images directly below ``pc:Glyph``

    Returns:
        a list of image filename strings
    """
    from .constants import NAMESPACES, PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    from io import StringIO  # pylint: disable=import-outside-toplevel

    # NOTE: only the image **paths** matter here, so a round trip through
    # export + parse + XPath is simpler than walking the generateDS object
    # tree, even though it is probably not the most efficient approach.
    page_ns = NAMESPACES['page']
    buffer = StringIO()
    self.export(
        outfile=buffer,
        level=0,
        name_='PcGts',
        namespaceprefix_='pc:',
        namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
            page_ns,
            page_ns,
            page_ns
        ))
    tree = parsexmlstring_(buffer.getvalue())  # pylint: disable=undefined-variable

    filenames = []

    # Shortcut: every level requested, so one query over the whole document suffices.
    if all((page, region, line, word, glyph)):
        filenames.extend(tree.xpath('//page:AlternativeImage/@filename', namespaces=NAMESPACES))
        return filenames

    # Otherwise assemble one XPath expression per requested level.
    queries = []
    if page:
        queries.append('/page:PcGts/page:Page/page:AlternativeImage/@filename')
    if region:
        queries.extend(
            '//page:%sRegion/page:AlternativeImage/@filename' % class_
            for class_ in PAGE_REGION_TYPES)
    if line:
        queries.append('//page:TextLine/page:AlternativeImage/@filename')
    if word:
        queries.append('//page:Word/page:AlternativeImage/@filename')
    if glyph:
        queries.append('//page:Glyph/page:AlternativeImage/@filename')

    for query in queries:
        filenames.extend(tree.xpath(query, namespaces=NAMESPACES))
    return filenames
|
1771 | def prune_ReadingOrder(self): |
|
1772 | """ |
|
1773 | Remove any empty ReadingOrder elements |