| @@ 3162-3217 (lines=56) @@ | ||
| 3159 | regionrefs.extend(self._get_recursive_reading_order(elem)) |
|
| 3160 | return regionrefs |
|
| 3161 | ||
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.

    Regions are returned in document order unless `order` selects one of
    the reading-order modes. If no reading order can be determined (no
    ``ReadingOrder`` element, no ordered/unordered group in it, or an empty
    list of region references), the document-order list is returned
    unchanged for both reading-order modes.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default),
            or by reading order with regions not in the reading order at the
            end of the returned list (``reading-order``), or by reading order
            with regions not in the reading order omitted
            (``reading-order-only``)
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))

    # A depth of 0 is handed through unchanged; any positive depth is bumped
    # by one. NOTE(review): presumably because the traversal starts at `self`
    # rather than at its children -- confirm against _get_recursive_regions.
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document':
        return ret

    # Resolve the reading order step by step; each step may come up empty,
    # in which case the document-order result is returned as is.
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        return ret

    # Region references that do not match any collected region (for example
    # because they were filtered out by `classes` or `depth`) are skipped.
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order

    # Track already placed regions by object identity so the remainder can be
    # computed in linear time instead of scanning the list for every region.
    placed = {id(region) for region in in_reading_order}
    return in_reading_order + [region for region in ret if id(region) not in placed]
|
| 3218 | def get_AllAlternativeImages(self, page=True, region=True, line=True, word=True, glyph=True): |
|
| 3219 | """ |
|
| 3220 | Get all the ``pc:AlternativeImage`` in a document |
|
| @@ 43-98 (lines=56) @@ | ||
| 40 | regionrefs.extend(self._get_recursive_reading_order(elem)) |
|
| 41 | return regionrefs |
|
| 42 | ||
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.

    Regions are returned in document order unless `order` selects one of
    the reading-order modes. If no reading order can be determined (no
    ``ReadingOrder`` element, no ordered/unordered group in it, or an empty
    list of region references), the document-order list is returned
    unchanged for both reading-order modes.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default),
            or by reading order with regions not in the reading order at the
            end of the returned list (``reading-order``), or by reading order
            with regions not in the reading order omitted
            (``reading-order-only``)
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))

    # A depth of 0 is handed through unchanged; any positive depth is bumped
    # by one. NOTE(review): presumably because the traversal starts at `self`
    # rather than at its children -- confirm against _get_recursive_regions.
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document':
        return ret

    # Resolve the reading order step by step; each step may come up empty,
    # in which case the document-order result is returned as is.
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        return ret

    # Region references that do not match any collected region (for example
    # because they were filtered out by `classes` or `depth`) are skipped.
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order

    # Track already placed regions by object identity so the remainder can be
    # computed in linear time instead of scanning the list for every region.
    placed = {id(region) for region in in_reading_order}
    return in_reading_order + [region for region in ret if id(region) not in placed]
|
| 99 | ||