Passed
Push — master ( 7d6f78...eb825e )
by Konstantin
11:20
created

get_AllRegions.get_AllRegions()   C

Complexity

Conditions 9

Size

Total Lines 43
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 19
dl 0
loc 43
rs 6.6666
c 0
b 0
f 0
cc 9
nop 4
1
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
2
def _region_class(self, x): # pylint: disable=unused-argument
    """
    Return the short class label of ``x``.

    The label is the name of ``x``'s class with every occurrence of the
    substring ``RegionType`` removed, e.g. an instance of a class named
    ``TextRegionType`` yields ``'Text'``. Class names that do not contain
    ``RegionType`` are returned unchanged.

    Arguments:
        x (object) Any object; only the name of its class is inspected.
    """
    return x.__class__.__name__.replace('RegionType', '')
4
5
def _get_recursive_regions(self, regions, level, classes=None):
    """
    Collect ``regions`` together with the regions nested inside them.

    Arguments:
        regions (list) Elements to start from.
        level (int) Remaining recursion budget. ``1`` means "do not descend
            any further"; ``0`` is never decremented, so it descends until
            no element has child regions anymore.
        classes (list) If non-empty, only elements whose ``_region_class``
            label is contained in ``classes`` are kept in the result.

    Returns:
        A flat list in which every element is directly followed by the
        (recursively collected) regions nested inside it.
    """
    if level == 1:
        # stop recursion, filter classes
        if classes:
            return [r for r in regions if self._region_class(r) in classes]
        # without a class filter, only a leading page element is dropped
        if regions and regions[0].__class__.__name__ == 'PageType':
            regions = regions[1:]
        return regions
    # find more regions recursively
    more_regions = []
    for region in regions:
        children = []
        for class_ in ('Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image',
                       'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
                       'Separator', 'Table', 'Text', 'Unknown'):
            # NOTE(review): PageType is not defined in this file (hence the
            # pylint suppression); presumably it is in scope in the module
            # these functions end up in -- confirm.
            if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
                # 'Map' is not recursive in 2019 schema
                continue
            children += getattr(region, 'get_{}Region'.format(class_))()
        more_regions.append(children)
    if not any(more_regions):
        # nothing nested anywhere: treat the current elements as the last level
        return self._get_recursive_regions(regions, 1, classes)
    ret = []
    for r, more in zip(regions, more_regions):
        ret.append(r)
        # a level of 0 (unlimited) must stay 0 instead of going negative
        ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
    # apply class filter / page removal to the combined list
    return self._get_recursive_regions(ret, 1, classes)
31
32
def _get_recursive_reading_order(self, rogroup):
    """
    Flatten a reading-order group into a list of ``regionRef`` values.

    For an ordered group the members come from ``get_AllIndexed()``; for an
    unordered group they are the concatenation of ``get_RegionRef()``,
    ``get_OrderedGroup()`` and ``get_UnorderedGroup()``. The ``regionRef`` of
    every member is emitted, and members that are not plain region
    references (i.e. nested groups) are expanded recursively right after
    their own ``regionRef``.

    Arguments:
        rogroup (object) The group to flatten. Any object that is neither an
            ordered nor an unordered group yields an empty list.

    Returns:
        list of the values returned by ``get_regionRef()``, depth-first.
    """
    # Default to "no members" so that an unexpected ``rogroup`` type gives an
    # empty result instead of an UnboundLocalError in the loop below.
    elements = []
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = rogroup.get_AllIndexed()
    elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
        elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
    regionrefs = []
    for elem in elements:
        regionrefs.append(elem.get_regionRef())
        # anything that is not a plain reference is a nested group: descend
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
            regionrefs.extend(self._get_recursive_reading_order(elem))
    return regionrefs
43
44
def get_AllRegions(self, classes=None, order='document', depth=0):
    """
    Get all the *Region elements, or only those whose class is listed in ``classes``.
    Returned in document order unless ``order`` starts with ``reading-order``.

    Arguments:
        classes (list) Classes of regions that shall be returned, e.g. ``['Text', 'Image']``
        order ("document"|"reading-order"|"reading-order-only") Whether to
            return regions sorted by document order (``document``, default) or by
            reading order with regions not in the reading order at the end of the
            returned list (``reading-order``) or regions not in the reading order
            omitted (``reading-order-only``)
        depth (int) Recursive depth to look for regions at, set to `0` for all regions at any depth. Default: 0

    Returns:
        list of regions. If no (non-empty) reading order can be obtained,
        the document-order list is returned for every value of ``order``.

    Raises:
        ValueError: if ``order`` is not one of the supported values or
            ``depth`` is negative.

    For example, to get all text anywhere on the page in reading order, use:
    ::
        '\\n'.join(line.get_TextEquiv()[0].Unicode
                  for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                  for line in region.get_TextLine())
    """
    if order not in ['document', 'reading-order', 'reading-order-only']:
        raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
    if depth < 0:
        raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
    # the helper counts the starting element as one level, hence depth + 1;
    # 0 is passed through unchanged and means "no limit"
    ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
    if order.startswith('reading-order'):
        reading_order = self.get_ReadingOrder()
        if reading_order:
            reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
        if reading_order:
            reading_order = self._get_recursive_reading_order(reading_order)
        if reading_order:
            id2region = {region.id: region for region in ret}
            # references to IDs that are not among the collected regions are skipped
            in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
            if order == 'reading-order-only':
                ret = in_reading_order
            else:
                # identity lookup in a set instead of scanning the list per region
                placed = {id(region) for region in in_reading_order}
                ret = in_reading_order + [r for r in ret if id(r) not in placed]
    return ret
87