| @@ 1234-1282 (lines=49) @@ | ||
| 1231 | if hasattr(self, 'pcGtsId'): |
|
| 1232 | return self.pcGtsId or '' |
|
| 1233 | return self.imageFilename |
|
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
    """
    Collect the ``@filename`` of every ``pc:AlternativeImage`` in this PAGE-XML document.

    The document is serialized via ``self.export``, re-parsed, and queried
    with XPath. Each flag enables one hierarchy level; when all flags are
    truthy a single document-wide query is used instead.

    Arguments:
        page (boolean): include images directly below ``pc:Page``
        region (boolean): include images directly below any ``pc:*Region``
        line (boolean): include images directly below ``pc:TextLine``
        word (boolean): include images directly below ``pc:Word``
        glyph (boolean): include images directly below ``pc:Glyph``

    Returns:
        a list of image filename strings
    """
    from .constants import NAMESPACES, PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    from io import StringIO # pylint: disable=import-outside-toplevel

    # Only the image *paths* are needed, so a round trip through
    # export -> parse -> xpath is simpler than walking the generateDS
    # object tree (though likely not the most efficient approach).
    page_ns = NAMESPACES['page']
    buffer = StringIO()
    self.export(
        outfile=buffer,
        level=0,
        name_='PcGts',
        namespaceprefix_='pc:',
        namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
            page_ns,
            page_ns,
            page_ns
        ))
    root = parsexmlstring_(buffer.getvalue()) # pylint: disable=undefined-variable

    # Assemble the XPath expressions first, then evaluate them in order.
    if all((page, region, line, word, glyph)):
        # Every level requested: one query covers the whole document.
        queries = ['//page:AlternativeImage/@filename']
    else:
        queries = []
        if page:
            queries.append('/page:PcGts/page:Page/page:AlternativeImage/@filename')
        if region:
            for class_ in PAGE_REGION_TYPES:
                queries.append('//page:%sRegion/page:AlternativeImage/@filename' % class_)
        for enabled, query in (
                (line, '//page:TextLine/page:AlternativeImage/@filename'),
                (word, '//page:Word/page:AlternativeImage/@filename'),
                (glyph, '//page:Glyph/page:AlternativeImage/@filename'),
        ):
            if enabled:
                queries.append(query)

    paths = []
    for query in queries:
        paths.extend(root.xpath(query, namespaces=NAMESPACES))
    return paths
|
| 1283 | def prune_ReadingOrder(self): |
|
| 1284 | """ |
|
| 1285 | Remove any empty ReadingOrder elements |
|
| @@ 1-49 (lines=49) @@ | ||
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
    """
    Collect the ``@filename`` of every ``pc:AlternativeImage`` in this PAGE-XML document.

    The document is serialized via ``self.export``, re-parsed, and queried
    with XPath. Each flag enables one hierarchy level; when all flags are
    truthy a single document-wide query is used instead.

    Arguments:
        page (boolean): include images directly below ``pc:Page``
        region (boolean): include images directly below any ``pc:*Region``
        line (boolean): include images directly below ``pc:TextLine``
        word (boolean): include images directly below ``pc:Word``
        glyph (boolean): include images directly below ``pc:Glyph``

    Returns:
        a list of image filename strings
    """
    from .constants import NAMESPACES, PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel
    from io import StringIO # pylint: disable=import-outside-toplevel

    # Only the image *paths* are needed, so a round trip through
    # export -> parse -> xpath is simpler than walking the generateDS
    # object tree (though likely not the most efficient approach).
    page_ns = NAMESPACES['page']
    buffer = StringIO()
    self.export(
        outfile=buffer,
        level=0,
        name_='PcGts',
        namespaceprefix_='pc:',
        namespacedef_='xmlns:pc="%s" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="%s %s/pagecontent.xsd"' % (
            page_ns,
            page_ns,
            page_ns
        ))
    root = parsexmlstring_(buffer.getvalue()) # pylint: disable=undefined-variable

    # Assemble the XPath expressions first, then evaluate them in order.
    if all((page, region, line, word, glyph)):
        # Every level requested: one query covers the whole document.
        queries = ['//page:AlternativeImage/@filename']
    else:
        queries = []
        if page:
            queries.append('/page:PcGts/page:Page/page:AlternativeImage/@filename')
        if region:
            for class_ in PAGE_REGION_TYPES:
                queries.append('//page:%sRegion/page:AlternativeImage/@filename' % class_)
        for enabled, query in (
                (line, '//page:TextLine/page:AlternativeImage/@filename'),
                (word, '//page:Word/page:AlternativeImage/@filename'),
                (glyph, '//page:Glyph/page:AlternativeImage/@filename'),
        ):
            if enabled:
                queries.append(query)

    paths = []
    for query in queries:
        paths.extend(root.xpath(query, namespaces=NAMESPACES))
    return paths
|
| 50 | ||