| @@ 43-98 (lines=56) @@ | ||
| 40 | regionrefs.extend(self._get_recursive_reading_order(elem)) |
|
| 41 | return regionrefs |
|
| 42 | ||
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless `order` is ``reading-order`` or
    ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned, \
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to \
            return regions sorted by document order (``document``, default) or by \
            reading order with regions not in the reading order at the end of the \
            returned list (``reading-order``) or regions not in the reading order \
            omitted (``reading-order-only``)
        depth (int): Recursive depth to look for regions at, set to `0` for \
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
            :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
            :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
            :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
            :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
            :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
            :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
            and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative.

    Note:
        If no reading order can be determined (no ``ReadingOrder`` element,
        no ordered/unordered group in it, or no region references in that
        group), the regions are returned in document order for *every*
        value of `order`, including ``reading-order-only``.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
            for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
            for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))

    # A non-zero depth is handed on incremented by one, 0 is passed through
    # unchanged. Presumably the starting element (`self`) occupies the first
    # level in the helper -- confirm against _get_recursive_regions.
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document':
        return ret

    # Resolve the flat list of region ids in reading order, bailing out to
    # document order as soon as one link of the chain is missing.
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        return ret

    # Referenced ids that do not match any collected region are skipped.
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order

    # Track membership by object identity in a set instead of scanning the
    # list for every region (which is quadratic in the number of regions).
    # NOTE(review): equivalent to the previous list membership test as long
    # as the region classes do not define a value-based __eq__ -- confirm.
    ordered = {id(region) for region in in_reading_order}
    return in_reading_order + [region for region in ret if id(region) not in ordered]
|
| 99 | ||
| @@ 3766-3821 (lines=56) @@ | ||
| 3763 | regionrefs.extend(self._get_recursive_reading_order(elem)) |
|
| 3764 | return regionrefs |
|
| 3765 | ||
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless `order` is ``reading-order`` or
    ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned, \
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to \
            return regions sorted by document order (``document``, default) or by \
            reading order with regions not in the reading order at the end of the \
            returned list (``reading-order``) or regions not in the reading order \
            omitted (``reading-order-only``)
        depth (int): Recursive depth to look for regions at, set to `0` for \
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
            :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
            :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
            :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
            :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
            :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
            :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
            and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the supported values, or if
            `depth` is negative.

    Note:
        If no reading order can be determined (no ``ReadingOrder`` element,
        no ordered/unordered group in it, or no region references in that
        group), the regions are returned in document order for *every*
        value of `order`, including ``reading-order-only``.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
            for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
            for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))

    # A non-zero depth is handed on incremented by one, 0 is passed through
    # unchanged. Presumably the starting element (`self`) occupies the first
    # level in the helper -- confirm against _get_recursive_regions.
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document':
        return ret

    # Resolve the flat list of region ids in reading order, bailing out to
    # document order as soon as one link of the chain is missing.
    reading_order = self.get_ReadingOrder()
    if reading_order:
        reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if reading_order:
        reading_order = self._get_recursive_reading_order(reading_order)
    if not reading_order:
        return ret

    # Referenced ids that do not match any collected region are skipped.
    id2region = {region.id: region for region in ret}
    in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order

    # Track membership by object identity in a set instead of scanning the
    # list for every region (which is quadratic in the number of regions).
    # NOTE(review): equivalent to the previous list membership test as long
    # as the region classes do not define a value-based __eq__ -- confirm.
    ordered = {id(region) for region in in_reading_order}
    return in_reading_order + [region for region in ret if id(region) not in ordered]
|
| 3822 | def get_AllAlternativeImages(self, page=True, region=True, line=True, word=True, glyph=True): |
|
| 3823 | """ |
|
| 3824 | Get all the ``pc:AlternativeImage`` in a document |
|