@@ 43-98 (lines=56) @@ | ||
40 | regionrefs.extend(self._get_recursive_reading_order(elem)) |
|
41 | return regionrefs |
|
42 | ||
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless `order` is ``reading-order``.

    If no reading order is available (no ``ReadingOrder`` element, no
    ordered/unordered group in it, or no region references in that group),
    the document-order list is returned unchanged for both ``reading-order``
    and ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``)
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three supported values,
            or if `depth` is negative.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                   for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                   for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))

    # depth == 0 is passed through as 0 ("any depth"); a positive depth is
    # passed as depth + 1 -- presumably because the helper starts from the
    # page itself ([self]); TODO confirm against _get_recursive_regions.
    regions = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document':
        return regions

    # From here on `order` is 'reading-order' or 'reading-order-only'.
    # Every missing piece below falls back to the document-order list.
    reading_order = self.get_ReadingOrder()
    if not reading_order:
        return regions
    group = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if not group:
        return regions
    ordered_ids = self._get_recursive_reading_order(group)
    if not ordered_ids:
        return regions

    # Region references that do not resolve to a selected region are skipped.
    id2region = {region.id: region for region in regions}
    in_reading_order = [id2region[region_id] for region_id in ordered_ids if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order

    # Append the leftover regions in document order. Membership is tested on
    # object identity through a set, which keeps this step linear and avoids
    # calling ``==`` on region objects for every pair.
    placed = {id(region) for region in in_reading_order}
    return in_reading_order + [region for region in regions if id(region) not in placed]
|
99 |
@@ 3766-3821 (lines=56) @@ | ||
3763 | regionrefs.extend(self._get_recursive_reading_order(elem)) |
|
3764 | return regionrefs |
|
3765 | ||
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the ``*Region`` elements, or only those provided by `classes`.
    Return in document order, unless `order` is ``reading-order``.

    If no reading order is available (no ``ReadingOrder`` element, no
    ordered/unordered group in it, or no region references in that group),
    the document-order list is returned unchanged for both ``reading-order``
    and ``reading-order-only``.

    Arguments:
        classes (list): Classes of regions that shall be returned,
            e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only"): Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``)
        depth (int): Recursive depth to look for regions at, set to `0` for
            all regions at any depth. Default: 0

    Returns:
        a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`,
        :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`,
        :py:class:`TableRegionType`, :py:class:`ChartRegionType`,
        :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`,
        :py:class:`MathsRegionType`, :py:class:`ChemRegionType`,
        :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`,
        :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`,
        and/or :py:class:`CustomRegionType`

    Raises:
        ValueError: if `order` is not one of the three supported values,
            or if `depth` is negative.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                   for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                   for line in region.get_TextLine())
    """
    if order not in ('document', 'reading-order', 'reading-order-only'):
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))

    # depth == 0 is passed through as 0 ("any depth"); a positive depth is
    # passed as depth + 1 -- presumably because the helper starts from the
    # page itself ([self]); TODO confirm against _get_recursive_regions.
    regions = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order == 'document':
        return regions

    # From here on `order` is 'reading-order' or 'reading-order-only'.
    # Every missing piece below falls back to the document-order list.
    reading_order = self.get_ReadingOrder()
    if not reading_order:
        return regions
    group = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
    if not group:
        return regions
    ordered_ids = self._get_recursive_reading_order(group)
    if not ordered_ids:
        return regions

    # Region references that do not resolve to a selected region are skipped.
    id2region = {region.id: region for region in regions}
    in_reading_order = [id2region[region_id] for region_id in ordered_ids if region_id in id2region]
    if order == 'reading-order-only':
        return in_reading_order

    # Append the leftover regions in document order. Membership is tested on
    # object identity through a set, which keeps this step linear and avoids
    # calling ``==`` on region objects for every pair.
    placed = {id(region) for region in in_reading_order}
    return in_reading_order + [region for region in regions if id(region) not in placed]
|
3822 | def get_AllAlternativeImages(self, page=True, region=True, line=True, word=True, glyph=True): |
|
3823 | """ |
|
3824 | Get all the ``pc:AlternativeImage`` in a document |