|
@@ 13812-14072 (lines=261) @@
|
| 13809 |
|
# end class MathsRegionType |
| 13810 |
|
|
| 13811 |
|
|
| 13812 |
|
class SeparatorRegionType(RegionType):
    """SeparatorRegionType --
    Separators are lines that lie between columns and
    paragraphs and can be used to logically separate
    different articles from each other.

    * orientation --
      The angle the rectangle encapsulating a region
      has to be rotated in clockwise direction
      in order to correct the present skew
      (negative values indicate anti-clockwise rotation).
      Range: -179.999,180

    * colour --
      The colour of the separator

    """
    # Descriptors for the two XML attributes this type adds on top of
    # RegionType; both are declared optional.
    member_data_items_ = [
        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
        MemberSpec_('colour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'colour'}),
    ]
    # Optional replacement class consulted by factory().
    subclass = None
    superclass = RegionType

    def __init__(self, id=None, custom=None, comments=None, continuation=None, AlternativeImage=None, Coords=None, UserDefined=None, Labels=None, Roles=None, TextRegion=None, ImageRegion=None, LineDrawingRegion=None, GraphicRegion=None, TableRegion=None, ChartRegion=None, SeparatorRegion=None, MathsRegion=None, ChemRegion=None, MusicRegion=None, AdvertRegion=None, NoiseRegion=None, UnknownRegion=None, CustomRegion=None, orientation=None, colour=None, gds_collector_=None, **kwargs_):
        """Create a separator region.

        All arguments up to ``CustomRegion`` are forwarded positionally to
        ``RegionType.__init__``; ``orientation`` and ``colour`` are the
        attributes specific to this type. ``parent_object_`` may be supplied
        through ``kwargs_``.
        """
        self.gds_collector_ = gds_collector_
        self.gds_elementtree_node_ = None
        self.original_tagname_ = None
        self.parent_object_ = kwargs_.get('parent_object_')
        self.ns_prefix_ = "pc"
        # The class is looked up by name at call time rather than referenced directly.
        super(globals().get("SeparatorRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
        # Assigned directly (not via set_orientation), so no AlternativeImage
        # invalidation is triggered during construction.
        self.orientation = _cast(float, orientation)
        self.orientation_nsprefix_ = "pc"
        self.colour = _cast(None, colour)
        self.colour_nsprefix_ = "pc"

    @staticmethod
    def factory(*args_, **kwargs_):
        """Instantiate this type, preferring a registered replacement class.

        Order of precedence: a class obtained from ``CurrentSubclassModule_``
        (when that module is set), then ``SeparatorRegionType.subclass``,
        then ``SeparatorRegionType`` itself.
        """
        if CurrentSubclassModule_ is not None:
            subclass = getSubclassFromModule_(
                CurrentSubclassModule_, SeparatorRegionType)
            if subclass is not None:
                return subclass(*args_, **kwargs_)
        if SeparatorRegionType.subclass:
            return SeparatorRegionType.subclass(*args_, **kwargs_)
        return SeparatorRegionType(*args_, **kwargs_)

    def get_ns_prefix_(self):
        """Return the namespace prefix stored on this object."""
        return self.ns_prefix_

    def set_ns_prefix_(self, ns_prefix):
        """Store `ns_prefix` as this object's namespace prefix."""
        self.ns_prefix_ = ns_prefix

    def get_orientation(self):
        """Return the ``orientation`` attribute (may be ``None``)."""
        return self.orientation

    # NOTE: the class previously defined set_orientation twice; the plain
    # setter was shadowed by the invalidating version below and has been
    # dropped. Runtime behaviour is unchanged.
    def set_orientation(self, orientation):
        """
        Set deskewing angle to given `orientation` number.
        Moreover, invalidate self's ``pc:AlternativeImage``s
        (because they will have been rotated and enlarged
        with the angle of the previous value).
        """
        if hasattr(self, 'invalidate_AlternativeImage'):
            # PageType, RegionType:
            self.invalidate_AlternativeImage(feature_selector='deskewed')
        self.orientation = orientation

    def get_colour(self):
        """Return the ``colour`` attribute (may be ``None``)."""
        return self.colour

    def set_colour(self, colour):
        """Set the ``colour`` attribute; the value is not validated here."""
        self.colour = colour

    def validate_ColourSimpleType(self, value):
        """Validate `value` against pc:ColourSimpleType, a restriction on string.

        Validation only happens when `value` is not ``None``, the module flag
        ``Validate_simpletypes_`` is set and a message collector is attached;
        otherwise the value is accepted as-is. Problems are reported through
        ``self.gds_collector_.add_message``.

        Returns:
            ``False`` if `value` is not a ``str`` or is not one of the
            enumerated colours, ``True`` otherwise. (Previously the
            enumeration failure was recorded in an unused local and the
            method fell through returning ``None``.)
        """
        result = True
        if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None:
            if not isinstance(value, str):
                lineno = self.gds_get_node_lineno_()
                self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, })
                return False
            enumerations = ['black', 'blue', 'brown', 'cyan', 'green', 'grey', 'indigo', 'magenta', 'orange', 'pink', 'red', 'turquoise', 'violet', 'white', 'yellow', 'other']
            if value not in enumerations:
                lineno = self.gds_get_node_lineno_()
                self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} )
                result = False
        return result

    def has__content(self):
        """Return ``True`` iff the superclass reports content.

        This type adds attributes only, so it contributes no content of
        its own to the check.
        """
        return bool(super(SeparatorRegionType, self).has__content())

    def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='SeparatorRegionType', pretty_print=True):
        """Write this object as an XML element to `outfile`.

        Arguments:
            outfile: object with a ``write`` method receiving the markup
            level (int): indentation level handed to ``showIndent``
            namespaceprefix_ (str): prefix (including the colon) put in
                front of the tag name
            namespacedef_ (str): namespace declaration written into the
                opening tag; replaced by an entry of
                ``GenerateDSNamespaceDefs_`` for this type when one exists
            name_ (str): tag name; replaced by ``original_tagname_`` when
                that is set and `name_` is still the default
            pretty_print (bool): whether to emit newlines
        """
        imported_ns_def_ = GenerateDSNamespaceDefs_.get('SeparatorRegionType')
        if imported_ns_def_ is not None:
            namespacedef_ = imported_ns_def_
        eol_ = '\n' if pretty_print else ''
        if self.original_tagname_ is not None and name_ == 'SeparatorRegionType':
            name_ = self.original_tagname_
        if UseCapturedNS_ and self.ns_prefix_:
            namespaceprefix_ = self.ns_prefix_ + ':'
        showIndent(outfile, level, pretty_print)
        outfile.write('<%s%s%s' % (namespaceprefix_, name_, ' ' + namespacedef_ if namespacedef_ else '', ))
        already_processed = set()
        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType')
        if self.has__content():
            outfile.write('>%s' % (eol_, ))
            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='SeparatorRegionType', pretty_print=pretty_print)
            showIndent(outfile, level, pretty_print)
            outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
        else:
            # No content: emit a self-closing tag.
            outfile.write('/>%s' % (eol_, ))

    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='SeparatorRegionType'):
        """Write the inherited attributes, then ``orientation`` and ``colour``.

        `already_processed` is a set of attribute names already written; an
        attribute is emitted only if it is not ``None`` and not yet in the set.
        """
        super(SeparatorRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='SeparatorRegionType')
        if self.orientation is not None and 'orientation' not in already_processed:
            already_processed.add('orientation')
            outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation'))
        if self.colour is not None and 'colour' not in already_processed:
            already_processed.add('colour')
            # No quotes in the format string here; presumably quote_attrib
            # supplies them -- confirm against its definition.
            outfile.write(' colour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.colour), input_name='colour')), ))

    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='SeparatorRegionType', fromsubclass_=False, pretty_print=True):
        """Delegate child export to the superclass (no children of its own).

        ``True`` is passed in the superclass's ``fromsubclass_`` position.
        """
        super(SeparatorRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)

    def to_etree(self, parent_element=None, name_='SeparatorRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
        """Build an element for this object via the superclass and add its attributes.

        When given, `mapping_` receives ``id(self) -> element`` and
        `reverse_mapping_` receives ``element -> self``.

        Returns:
            the element produced by the superclass ``to_etree``, with
            ``orientation`` / ``colour`` set when they are not ``None``.
        """
        element = super(SeparatorRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
        if self.orientation is not None:
            element.set('orientation', self.gds_format_float(self.orientation))
        if self.colour is not None:
            element.set('colour', self.gds_format_string(self.colour))
        if mapping_ is not None:
            mapping_[id(self)] = element
        if reverse_mapping_ is not None:
            reverse_mapping_[element] = self
        return element

    def build(self, node, gds_collector_=None):
        """Populate this object from the element `node` and return ``self``.

        Attributes are read first, then every child of `node` is handed to
        ``_buildChildren`` together with its tag name as extracted by
        ``Tag_pattern_``.
        """
        self.gds_collector_ = gds_collector_
        if SaveElementTreeNode:
            self.gds_elementtree_node_ = node
        already_processed = set()
        self.ns_prefix_ = node.prefix
        self._buildAttributes(node, node.attrib, already_processed)
        for child in node:
            nodeName_ = Tag_pattern_.match(child.tag).groups()[-1]
            self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
        return self

    def _buildAttributes(self, node, attrs, already_processed):
        """Read ``orientation`` and ``colour`` from `node`, then the inherited attributes.

        ``orientation`` is parsed with ``gds_parse_float``; ``colour`` is
        stored as found and then passed to ``validate_ColourSimpleType``
        (whose return value is not used here).
        """
        value = find_attr_value_('orientation', node)
        if value is not None and 'orientation' not in already_processed:
            already_processed.add('orientation')
            value = self.gds_parse_float(value, node, 'orientation')
            # Direct assignment: parsing does not go through set_orientation.
            self.orientation = value
        value = find_attr_value_('colour', node)
        if value is not None and 'colour' not in already_processed:
            already_processed.add('colour')
            self.colour = value
            self.validate_ColourSimpleType(self.colour)    # validate type ColourSimpleType
        super(SeparatorRegionType, self)._buildAttributes(node, attrs, already_processed)

    def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
        """Delegate child parsing to the superclass (no children of its own).

        NOTE(review): `gds_collector_` is accepted but not forwarded to the
        superclass call -- confirm whether that is intended.
        """
        super(SeparatorRegionType, self)._buildChildren(child_, node, nodeName_, True)

    # NOTE: a class-level ``__hash__ = GeneratedsSuper.__hash__`` used to
    # precede this definition; it was shadowed by it and has been dropped.
    def __hash__(self):
        """Hash the object by its ``id`` attribute."""
        return hash(self.id)

    # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
    def _region_class(self, x): # pylint: disable=unused-argument
        """Return the class name of `x` with ``'RegionType'`` removed (e.g. ``'Text'``)."""
        return x.__class__.__name__.replace('RegionType', '')

    def _get_recursive_regions(self, regions, level, classes=None):
        """Collect `regions` and their nested sub-regions.

        Arguments:
            regions (list): objects to start from
            level (int): ``1`` stops descending and only filters; ``0``
                descends until no further sub-regions are found; larger
                values are decremented on each descent
            classes (list): if non-empty, keep only regions whose
                ``_region_class`` name is listed

        Returns:
            a flat list in which each region is followed by its descendants;
            without `classes`, a leading ``PageType`` object is dropped.
        """
        from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
        if level == 1:
            # stop recursion, filter classes
            if classes:
                return [r for r in regions if self._region_class(r) in classes]
            if regions and regions[0].__class__.__name__ == 'PageType':
                regions = regions[1:]
            return regions
        # find more regions recursively
        more_regions = []
        for region in regions:
            more_regions.append([])
            for class_ in PAGE_REGION_TYPES:
                if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
                    # 'Map' is not recursive in 2019 schema
                    continue
                more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
        if not any(more_regions):
            # Nothing nested below this level: finish with the filtering pass.
            return self._get_recursive_regions(regions, 1, classes)
        ret = []
        for r, more in zip(regions, more_regions):
            ret.append(r)
            ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
        return self._get_recursive_regions(ret, 1, classes)

    def _get_recursive_reading_order(self, rogroup):
        """Return the ``regionRef`` values of `rogroup`'s members, depth first.

        For ordered groups the members come from ``get_AllIndexed()``; for
        unordered groups they are the region refs followed by the nested
        ordered and unordered groups. Each nested group contributes its own
        ``regionRef`` and then, recursively, those of its members. Any other
        kind of object yields an empty list (previously this case raised
        ``UnboundLocalError``).
        """
        elements = []
        if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
            elements = rogroup.get_AllIndexed()
        if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
            elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
        regionrefs = list()
        for elem in elements:
            regionrefs.append(elem.get_regionRef())
            if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
                regionrefs.extend(self._get_recursive_reading_order(elem))
        return regionrefs

    def get_AllRegions(self, classes=None, order='document', depth=0):
        """
        Get all the ``*Region`` elements, or only those provided by `classes`.
        Return in document order, unless the top element is ``Page`` and
        `order` is ``reading-order``.

        Arguments:
            classes (list): Classes of regions that shall be returned, \
                e.g. ``['Text', 'Image']``
            order ("document"|"reading-order"|"reading-order-only"): Whether to \
                return regions sorted by document order (``document``, default) or by
                reading order with regions not in the reading order at the end of the
                returned list (``reading-order``) or regions not in the reading order
                omitted (``reading-order-only``). The latter two are only available
                on page level.
            depth (int): Recursive depth to look for regions at, set to `0` for \
                all regions at any depth. Default: 0

        Returns:
            a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
                :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
                :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
                :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
                :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
                :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
                :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
                and/or :py:class:`CustomRegionType`

        Raises:
            ValueError: if `order` is not one of the three accepted values or
                `depth` is negative (a subclass of the ``Exception``
                raised previously, so existing handlers keep working).

        For example, to get all text anywhere on the page in reading order, use:
        ::
            '\\n'.join(line.get_TextEquiv()[0].Unicode
                      for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                      for line in region.get_TextLine())
        """
        if order not in ('document', 'reading-order', 'reading-order-only'):
            raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
        if depth < 0:
            raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
        # depth 0 means "unlimited" and maps to level 0; otherwise level is depth + 1.
        ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
        if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
            reading_order = self.get_ReadingOrder()
            if reading_order:
                reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
            if reading_order:
                reading_order = self._get_recursive_reading_order(reading_order)
            if reading_order:
                id2region = {region.id: region for region in ret}
                # Referenced ids that were not collected above are skipped.
                in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
                if order == 'reading-order-only':
                    ret = in_reading_order
                else:
                    ret = in_reading_order + [r for r in ret if r not in in_reading_order]
        return ret
# end class SeparatorRegionType
| 14074 |
|
|
| 14075 |
|
|
|
@@ 13549-13808 (lines=260) @@
|
| 13546 |
|
# end class ChemRegionType |
| 13547 |
|
|
| 13548 |
|
|
| 13549 |
|
class MathsRegionType(RegionType): |
| 13550 |
|
"""MathsRegionType -- |
| 13551 |
|
Regions containing equations and mathematical symbols |
| 13552 |
|
should be marked as maths regions. |
| 13553 |
|
|
| 13554 |
|
* orientation -- |
| 13555 |
|
The angle the rectangle encapsulating a region |
| 13556 |
|
has to be rotated in clockwise direction |
| 13557 |
|
in order to correct the present skew |
| 13558 |
|
(negative values indicate anti-clockwise rotation). |
| 13559 |
|
Range: -179.999,180 |
| 13560 |
|
|
| 13561 |
|
* bgColour -- |
| 13562 |
|
The background colour of the region |
| 13563 |
|
|
| 13564 |
|
""" |
| 13565 |
|
__hash__ = GeneratedsSuper.__hash__ |
| 13566 |
|
member_data_items_ = [ |
| 13567 |
|
MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), |
| 13568 |
|
MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), |
| 13569 |
|
] |
| 13570 |
|
subclass = None |
| 13571 |
|
superclass = RegionType |
| 13572 |
|
def __init__(self, id=None, custom=None, comments=None, continuation=None, AlternativeImage=None, Coords=None, UserDefined=None, Labels=None, Roles=None, TextRegion=None, ImageRegion=None, LineDrawingRegion=None, GraphicRegion=None, TableRegion=None, ChartRegion=None, SeparatorRegion=None, MathsRegion=None, ChemRegion=None, MusicRegion=None, AdvertRegion=None, NoiseRegion=None, UnknownRegion=None, CustomRegion=None, orientation=None, bgColour=None, gds_collector_=None, **kwargs_): |
| 13573 |
|
self.gds_collector_ = gds_collector_ |
| 13574 |
|
self.gds_elementtree_node_ = None |
| 13575 |
|
self.original_tagname_ = None |
| 13576 |
|
self.parent_object_ = kwargs_.get('parent_object_') |
| 13577 |
|
self.ns_prefix_ = "pc" |
| 13578 |
|
super(globals().get("MathsRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) |
| 13579 |
|
self.orientation = _cast(float, orientation) |
| 13580 |
|
self.orientation_nsprefix_ = "pc" |
| 13581 |
|
self.bgColour = _cast(None, bgColour) |
| 13582 |
|
self.bgColour_nsprefix_ = "pc" |
| 13583 |
|
def factory(*args_, **kwargs_): |
| 13584 |
|
if CurrentSubclassModule_ is not None: |
| 13585 |
|
subclass = getSubclassFromModule_( |
| 13586 |
|
CurrentSubclassModule_, MathsRegionType) |
| 13587 |
|
if subclass is not None: |
| 13588 |
|
return subclass(*args_, **kwargs_) |
| 13589 |
|
if MathsRegionType.subclass: |
| 13590 |
|
return MathsRegionType.subclass(*args_, **kwargs_) |
| 13591 |
|
else: |
| 13592 |
|
return MathsRegionType(*args_, **kwargs_) |
| 13593 |
|
factory = staticmethod(factory) |
| 13594 |
|
def get_ns_prefix_(self): |
| 13595 |
|
return self.ns_prefix_ |
| 13596 |
|
def set_ns_prefix_(self, ns_prefix): |
| 13597 |
|
self.ns_prefix_ = ns_prefix |
| 13598 |
|
def get_orientation(self): |
| 13599 |
|
return self.orientation |
| 13600 |
|
def set_orientation(self, orientation): |
| 13601 |
|
self.orientation = orientation |
| 13602 |
|
def get_bgColour(self): |
| 13603 |
|
return self.bgColour |
| 13604 |
|
def set_bgColour(self, bgColour): |
| 13605 |
|
self.bgColour = bgColour |
| 13606 |
|
def validate_ColourSimpleType(self, value): |
| 13607 |
|
# Validate type pc:ColourSimpleType, a restriction on string. |
| 13608 |
|
if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: |
| 13609 |
|
if not isinstance(value, str): |
| 13610 |
|
lineno = self.gds_get_node_lineno_() |
| 13611 |
|
self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) |
| 13612 |
|
return False |
| 13613 |
|
value = value |
| 13614 |
|
enumerations = ['black', 'blue', 'brown', 'cyan', 'green', 'grey', 'indigo', 'magenta', 'orange', 'pink', 'red', 'turquoise', 'violet', 'white', 'yellow', 'other'] |
| 13615 |
|
if value not in enumerations: |
| 13616 |
|
lineno = self.gds_get_node_lineno_() |
| 13617 |
|
self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) |
| 13618 |
|
result = False |
| 13619 |
|
def has__content(self): |
| 13620 |
|
if ( |
| 13621 |
|
super(MathsRegionType, self).has__content() |
| 13622 |
|
): |
| 13623 |
|
return True |
| 13624 |
|
else: |
| 13625 |
|
return False |
| 13626 |
|
def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MathsRegionType', pretty_print=True): |
| 13627 |
|
imported_ns_def_ = GenerateDSNamespaceDefs_.get('MathsRegionType') |
| 13628 |
|
if imported_ns_def_ is not None: |
| 13629 |
|
namespacedef_ = imported_ns_def_ |
| 13630 |
|
if pretty_print: |
| 13631 |
|
eol_ = '\n' |
| 13632 |
|
else: |
| 13633 |
|
eol_ = '' |
| 13634 |
|
if self.original_tagname_ is not None and name_ == 'MathsRegionType': |
| 13635 |
|
name_ = self.original_tagname_ |
| 13636 |
|
if UseCapturedNS_ and self.ns_prefix_: |
| 13637 |
|
namespaceprefix_ = self.ns_prefix_ + ':' |
| 13638 |
|
showIndent(outfile, level, pretty_print) |
| 13639 |
|
outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) |
| 13640 |
|
already_processed = set() |
| 13641 |
|
self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType') |
| 13642 |
|
if self.has__content(): |
| 13643 |
|
outfile.write('>%s' % (eol_, )) |
| 13644 |
|
self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MathsRegionType', pretty_print=pretty_print) |
| 13645 |
|
showIndent(outfile, level, pretty_print) |
| 13646 |
|
outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_)) |
| 13647 |
|
else: |
| 13648 |
|
outfile.write('/>%s' % (eol_, )) |
| 13649 |
|
def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MathsRegionType'): |
| 13650 |
|
super(MathsRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MathsRegionType') |
| 13651 |
|
if self.orientation is not None and 'orientation' not in already_processed: |
| 13652 |
|
already_processed.add('orientation') |
| 13653 |
|
outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) |
| 13654 |
|
if self.bgColour is not None and 'bgColour' not in already_processed: |
| 13655 |
|
already_processed.add('bgColour') |
| 13656 |
|
outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), )) |
| 13657 |
|
def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MathsRegionType', fromsubclass_=False, pretty_print=True): |
| 13658 |
|
super(MathsRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) |
| 13659 |
|
def to_etree(self, parent_element=None, name_='MathsRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): |
| 13660 |
|
element = super(MathsRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) |
| 13661 |
|
if self.orientation is not None: |
| 13662 |
|
element.set('orientation', self.gds_format_float(self.orientation)) |
| 13663 |
|
if self.bgColour is not None: |
| 13664 |
|
element.set('bgColour', self.gds_format_string(self.bgColour)) |
| 13665 |
|
if mapping_ is not None: |
| 13666 |
|
mapping_[id(self)] = element |
| 13667 |
|
if reverse_mapping_ is not None: |
| 13668 |
|
reverse_mapping_[element] = self |
| 13669 |
|
return element |
| 13670 |
|
def build(self, node, gds_collector_=None): |
| 13671 |
|
self.gds_collector_ = gds_collector_ |
| 13672 |
|
if SaveElementTreeNode: |
| 13673 |
|
self.gds_elementtree_node_ = node |
| 13674 |
|
already_processed = set() |
| 13675 |
|
self.ns_prefix_ = node.prefix |
| 13676 |
|
self._buildAttributes(node, node.attrib, already_processed) |
| 13677 |
|
for child in node: |
| 13678 |
|
nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] |
| 13679 |
|
self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) |
| 13680 |
|
return self |
| 13681 |
|
def _buildAttributes(self, node, attrs, already_processed): |
| 13682 |
|
value = find_attr_value_('orientation', node) |
| 13683 |
|
if value is not None and 'orientation' not in already_processed: |
| 13684 |
|
already_processed.add('orientation') |
| 13685 |
|
value = self.gds_parse_float(value, node, 'orientation') |
| 13686 |
|
self.orientation = value |
| 13687 |
|
value = find_attr_value_('bgColour', node) |
| 13688 |
|
if value is not None and 'bgColour' not in already_processed: |
| 13689 |
|
already_processed.add('bgColour') |
| 13690 |
|
self.bgColour = value |
| 13691 |
|
self.validate_ColourSimpleType(self.bgColour) # validate type ColourSimpleType |
| 13692 |
|
super(MathsRegionType, self)._buildAttributes(node, attrs, already_processed) |
| 13693 |
|
def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): |
| 13694 |
|
super(MathsRegionType, self)._buildChildren(child_, node, nodeName_, True) |
| 13695 |
|
pass |
| 13696 |
|
def __hash__(self): |
| 13697 |
|
return hash(self.id) |
| 13698 |
|
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring |
| 13699 |
|
def _region_class(self, x): # pylint: disable=unused-argument |
| 13700 |
|
return x.__class__.__name__.replace('RegionType', '') |
| 13701 |
|
|
| 13702 |
|
def _get_recursive_regions(self, regions, level, classes=None): |
| 13703 |
|
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel |
| 13704 |
|
if level == 1: |
| 13705 |
|
# stop recursion, filter classes |
| 13706 |
|
if classes: |
| 13707 |
|
return [r for r in regions if self._region_class(r) in classes] |
| 13708 |
|
if regions and regions[0].__class__.__name__ == 'PageType': |
| 13709 |
|
regions = regions[1:] |
| 13710 |
|
return regions |
| 13711 |
|
# find more regions recursively |
| 13712 |
|
more_regions = [] |
| 13713 |
|
for region in regions: |
| 13714 |
|
more_regions.append([]) |
| 13715 |
|
for class_ in PAGE_REGION_TYPES: |
| 13716 |
|
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable |
| 13717 |
|
# 'Map' is not recursive in 2019 schema |
| 13718 |
|
continue |
| 13719 |
|
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() |
| 13720 |
|
if not any(more_regions): |
| 13721 |
|
return self._get_recursive_regions(regions, 1, classes) |
| 13722 |
|
ret = [] |
| 13723 |
|
for r, more in zip(regions, more_regions): |
| 13724 |
|
ret.append(r) |
| 13725 |
|
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes) |
| 13726 |
|
return self._get_recursive_regions(ret, 1, classes) |
| 13727 |
|
|
| 13728 |
|
def _get_recursive_reading_order(self, rogroup): |
| 13729 |
|
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable |
| 13730 |
|
elements = rogroup.get_AllIndexed() |
| 13731 |
|
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable |
| 13732 |
|
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) |
| 13733 |
|
regionrefs = list() |
| 13734 |
|
for elem in elements: |
| 13735 |
|
regionrefs.append(elem.get_regionRef()) |
| 13736 |
|
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable |
| 13737 |
|
regionrefs.extend(self._get_recursive_reading_order(elem)) |
| 13738 |
|
return regionrefs |
| 13739 |
|
|
| 13740 |
|
def get_AllRegions(self, classes=None, order='document', depth=0): |
| 13741 |
|
""" |
| 13742 |
|
Get all the ``*Region`` elements, or only those provided by `classes`. |
| 13743 |
|
Return in document order, unless the top element is ``Page`` and |
| 13744 |
|
`order` is ``reading-order``. |
| 13745 |
|
|
| 13746 |
|
Arguments: |
| 13747 |
|
classes (list): Classes of regions that shall be returned, \ |
| 13748 |
|
e.g. ``['Text', 'Image']`` |
| 13749 |
|
order ("document"|"reading-order"|"reading-order-only"): Whether to \ |
| 13750 |
|
return regions sorted by document order (``document``, default) or by |
| 13751 |
|
reading order with regions not in the reading order at the end of the |
| 13752 |
|
returned list (``reading-order``) or regions not in the reading order |
| 13753 |
|
omitted (``reading-order-only``). The latter two are only available |
| 13754 |
|
on page level. |
| 13755 |
|
depth (int): Recursive depth to look for regions at, set to `0` for \ |
| 13756 |
|
all regions at any depth. Default: 0 |
| 13757 |
|
|
| 13758 |
|
Returns: |
| 13759 |
|
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \ |
| 13760 |
|
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \ |
| 13761 |
|
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \ |
| 13762 |
|
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \ |
| 13763 |
|
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \ |
| 13764 |
|
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \ |
| 13765 |
|
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \ |
| 13766 |
|
and/or :py:class:`CustomRegionType` |
| 13767 |
|
|
| 13768 |
|
For example, to get all text anywhere on the page in reading order, use: |
| 13769 |
|
:: |
| 13770 |
|
'\\n'.join(line.get_TextEquiv()[0].Unicode |
| 13771 |
|
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order') |
| 13772 |
|
for line in region.get_TextLine()) |
| 13773 |
|
""" |
| 13774 |
|
if order not in ['document', 'reading-order', 'reading-order-only']: |
| 13775 |
|
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order)) |
| 13776 |
|
if depth < 0: |
| 13777 |
|
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth)) |
| 13778 |
|
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes) |
| 13779 |
|
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'): |
| 13780 |
|
reading_order = self.get_ReadingOrder() |
| 13781 |
|
if reading_order: |
| 13782 |
|
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() |
| 13783 |
|
if reading_order: |
| 13784 |
|
reading_order = self._get_recursive_reading_order(reading_order) |
| 13785 |
|
if reading_order: |
| 13786 |
|
id2region = {region.id: region for region in ret} |
| 13787 |
|
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region] |
| 13788 |
|
# print("ret: {} / in_ro: {} / not-in-ro: {}".format( |
| 13789 |
|
# len(ret), |
| 13790 |
|
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]), |
| 13791 |
|
# len([r for r in ret if r not in in_reading_order]) |
| 13792 |
|
# )) |
| 13793 |
|
if order == 'reading-order-only': |
| 13794 |
|
ret = in_reading_order |
| 13795 |
|
else: |
| 13796 |
|
ret = in_reading_order + [r for r in ret if r not in in_reading_order] |
| 13797 |
|
return ret |
| 13798 |
|
def set_orientation(self, orientation):
    """
    Store `orientation` as the new deskewing angle.

    Before the value is stored, objects that offer an
    ``invalidate_AlternativeImage`` method (PageType, RegionType) are
    asked to invalidate their ``deskewed`` ``pc:AlternativeImage``s,
    because those were rotated and enlarged with the previous angle.
    """
    can_invalidate = hasattr(self, 'invalidate_AlternativeImage')
    if can_invalidate:
        self.invalidate_AlternativeImage(feature_selector='deskewed')
    self.orientation = orientation
| 13809 |
|
# end class MathsRegionType |
| 13810 |
|
|
| 13811 |
|
|
|
@@ 13286-13545 (lines=260) @@
|
| 13283 |
|
# end class MapRegionType |
| 13284 |
|
|
| 13285 |
|
|
| 13286 |
|
class ChemRegionType(RegionType):
    """ChemRegionType --
    Regions containing chemical formulas.

    * orientation --
      The angle the rectangle encapsulating a
      region has to be rotated in clockwise
      direction in order to correct the present
      skew (negative values indicate
      anti-clockwise rotation). Range:
      -179.999,180

    * bgColour --
      The background colour of the region

    """
    # XML attributes this type declares in addition to those of RegionType.
    member_data_items_ = [
        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
        MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}),
    ]
    # Optional replacement class consulted by factory().
    subclass = None
    superclass = RegionType

    def __init__(self, id=None, custom=None, comments=None, continuation=None, AlternativeImage=None, Coords=None, UserDefined=None, Labels=None, Roles=None, TextRegion=None, ImageRegion=None, LineDrawingRegion=None, GraphicRegion=None, TableRegion=None, ChartRegion=None, SeparatorRegion=None, MathsRegion=None, ChemRegion=None, MusicRegion=None, AdvertRegion=None, NoiseRegion=None, UnknownRegion=None, CustomRegion=None, orientation=None, bgColour=None, gds_collector_=None, **kwargs_):
        """
        Create a chemical-formula region.

        All arguments up to and including `CustomRegion` are passed on
        positionally to the ``RegionType`` initialiser; `orientation` and
        `bgColour` are the attributes specific to this type.
        `parent_object_` may be supplied through `kwargs_`.
        """
        self.gds_collector_ = gds_collector_
        self.gds_elementtree_node_ = None
        self.original_tagname_ = None
        self.parent_object_ = kwargs_.get('parent_object_')
        self.ns_prefix_ = "pc"
        # The class is looked up by name at call time rather than referenced directly.
        super(globals().get("ChemRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
        # presumably _cast converts to the given type unless the value is None -- TODO confirm
        self.orientation = _cast(float, orientation)
        self.orientation_nsprefix_ = "pc"
        self.bgColour = _cast(None, bgColour)
        self.bgColour_nsprefix_ = "pc"

    @staticmethod
    def factory(*args_, **kwargs_):
        """
        Instantiate this type or a registered replacement for it.

        A subclass found in ``CurrentSubclassModule_`` takes precedence,
        then ``ChemRegionType.subclass``, then ``ChemRegionType`` itself.
        """
        if CurrentSubclassModule_ is not None:
            subclass = getSubclassFromModule_(
                CurrentSubclassModule_, ChemRegionType)
            if subclass is not None:
                return subclass(*args_, **kwargs_)
        if ChemRegionType.subclass:
            return ChemRegionType.subclass(*args_, **kwargs_)
        return ChemRegionType(*args_, **kwargs_)

    def get_ns_prefix_(self):
        """Return the namespace prefix stored on this object."""
        return self.ns_prefix_

    def set_ns_prefix_(self, ns_prefix):
        """Store `ns_prefix` as the namespace prefix of this object."""
        self.ns_prefix_ = ns_prefix

    def get_orientation(self):
        """Return the stored deskewing angle (may be None)."""
        return self.orientation

    def get_bgColour(self):
        """Return the stored background colour (may be None)."""
        return self.bgColour

    def set_bgColour(self, bgColour):
        """Store `bgColour` as the background colour; no validation is done here."""
        self.bgColour = bgColour

    def validate_ColourSimpleType(self, value):
        """
        Validate `value` against pc:ColourSimpleType, a restriction on string.

        Validation only happens when `value` is not None, the module flag
        ``Validate_simpletypes_`` is set and a collector is attached;
        problems are reported through ``self.gds_collector_.add_message``.

        Returns:
            False if `value` is not a ``str`` or is not one of the
            enumerated colours, True otherwise (including when validation
            is skipped).
        """
        result = True
        if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None:
            if not isinstance(value, str):
                lineno = self.gds_get_node_lineno_()
                self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, })
                return False
            enumerations = ['black', 'blue', 'brown', 'cyan', 'green', 'grey', 'indigo', 'magenta', 'orange', 'pink', 'red', 'turquoise', 'violet', 'white', 'yellow', 'other']
            if value not in enumerations:
                lineno = self.gds_get_node_lineno_()
                self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value": encode_str_2_3(value), "lineno": lineno})
                result = False
        return result

    def has__content(self):
        """Return True if the base class reports content, else False (this type adds no child elements)."""
        return bool(super(ChemRegionType, self).has__content())

    def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChemRegionType', pretty_print=True):
        """
        Write this region as an XML element to `outfile`.

        Arguments:
            outfile: object with a ``write`` method receiving the text
            level (int): indentation level handed to ``showIndent``
            namespaceprefix_ (str): prefix written in front of the tag name; \
                replaced by ``self.ns_prefix_ + ':'`` when ``UseCapturedNS_`` is set
            namespacedef_ (str): namespace declaration written into the start tag; \
                replaced by an entry in ``GenerateDSNamespaceDefs_`` if one exists
            name_ (str): tag name; replaced by ``self.original_tagname_`` \
                when that is set and `name_` is the default
            pretty_print (bool): whether to emit indentation and line breaks
        """
        imported_ns_def_ = GenerateDSNamespaceDefs_.get('ChemRegionType')
        if imported_ns_def_ is not None:
            namespacedef_ = imported_ns_def_
        eol_ = '\n' if pretty_print else ''
        if self.original_tagname_ is not None and name_ == 'ChemRegionType':
            name_ = self.original_tagname_
        if UseCapturedNS_ and self.ns_prefix_:
            namespaceprefix_ = self.ns_prefix_ + ':'
        showIndent(outfile, level, pretty_print)
        outfile.write('<%s%s%s' % (namespaceprefix_, name_, ' ' + namespacedef_ if namespacedef_ else '', ))
        already_processed = set()
        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType')
        if self.has__content():
            outfile.write('>%s' % (eol_, ))
            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='ChemRegionType', pretty_print=pretty_print)
            showIndent(outfile, level, pretty_print)
            outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
        else:
            # No content: emit a self-closing tag.
            outfile.write('/>%s' % (eol_, ))

    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='ChemRegionType'):
        """
        Write the base-class attributes, then ``orientation`` and ``bgColour``.

        Attributes that are None or whose name is already contained in
        `already_processed` are skipped; written names are added to it.
        """
        super(ChemRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='ChemRegionType')
        if self.orientation is not None and 'orientation' not in already_processed:
            already_processed.add('orientation')
            outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation'))
        if self.bgColour is not None and 'bgColour' not in already_processed:
            already_processed.add('bgColour')
            # No quotes in the format string here; presumably quote_attrib supplies them -- TODO confirm
            outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), ))

    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='ChemRegionType', fromsubclass_=False, pretty_print=True):
        """Delegate child export to the base class, passing True in the `fromsubclass_` position."""
        super(ChemRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)

    def to_etree(self, parent_element=None, name_='ChemRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
        """
        Build the element through the base class and add this type's attributes.

        If given, `mapping_` receives ``id(self) -> element`` and
        `reverse_mapping_` receives ``element -> self``.

        Returns:
            the element produced by the base class ``to_etree``
        """
        element = super(ChemRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
        if self.orientation is not None:
            element.set('orientation', self.gds_format_float(self.orientation))
        if self.bgColour is not None:
            element.set('bgColour', self.gds_format_string(self.bgColour))
        if mapping_ is not None:
            mapping_[id(self)] = element
        if reverse_mapping_ is not None:
            reverse_mapping_[element] = self
        return element

    def build(self, node, gds_collector_=None):
        """
        Populate this object from the element `node` and return self.

        The collector is stored, the node itself is kept when
        ``SaveElementTreeNode`` is set, the namespace prefix is taken from
        ``node.prefix``, then attributes and all child nodes are processed.
        """
        self.gds_collector_ = gds_collector_
        if SaveElementTreeNode:
            self.gds_elementtree_node_ = node
        already_processed = set()
        self.ns_prefix_ = node.prefix
        self._buildAttributes(node, node.attrib, already_processed)
        for child in node:
            # Last group of the tag pattern; presumably the tag name without namespace -- TODO confirm
            nodeName_ = Tag_pattern_.match(child.tag).groups()[-1]
            self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
        return self

    def _buildAttributes(self, node, attrs, already_processed):
        """
        Read ``orientation`` and ``bgColour`` from `node`, then let the base class read the rest.

        ``orientation`` is parsed as a float; ``bgColour`` is stored as found
        and afterwards checked with :py:meth:`validate_ColourSimpleType`.
        """
        value = find_attr_value_('orientation', node)
        if value is not None and 'orientation' not in already_processed:
            already_processed.add('orientation')
            value = self.gds_parse_float(value, node, 'orientation')
            self.orientation = value
        value = find_attr_value_('bgColour', node)
        if value is not None and 'bgColour' not in already_processed:
            already_processed.add('bgColour')
            self.bgColour = value
            self.validate_ColourSimpleType(self.bgColour)    # validate type ColourSimpleType
        super(ChemRegionType, self)._buildAttributes(node, attrs, already_processed)

    def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
        """Delegate child parsing to the base class, passing True in the `fromsubclass_` position."""
        # NOTE(review): gds_collector_ is not forwarded to the base class -- confirm this is intended.
        super(ChemRegionType, self)._buildChildren(child_, node, nodeName_, True)

    def __hash__(self):
        """Hash by the region's ``id`` attribute."""
        return hash(self.id)

    # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
    def _region_class(self, x): # pylint: disable=unused-argument
        """Return the class name of `x` with ``RegionType`` removed, e.g. ``Chem`` for a ChemRegionType."""
        return x.__class__.__name__.replace('RegionType', '')

    def _get_recursive_regions(self, regions, level, classes=None):
        """
        Collect `regions` and the regions nested below them.

        Arguments:
            regions (list): objects to start from
            level (int): ``1`` stops descending and applies the `classes` filter; \
                ``0`` stays ``0`` while descending, i.e. there is no depth limit; \
                any other value is decremented per nesting level
            classes (list): names as produced by :py:meth:`_region_class`; \
                a false value disables filtering

        Returns:
            a list in which every region is directly followed by its descendants
        """
        from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
        if level == 1:
            # stop recursion, filter classes
            if classes:
                return [r for r in regions if self._region_class(r) in classes]
            # A leading PageType is the starting point itself, not a region.
            if regions and regions[0].__class__.__name__ == 'PageType':
                regions = regions[1:]
            return regions
        # find more regions recursively
        more_regions = []
        for region in regions:
            more_regions.append([])
            for class_ in PAGE_REGION_TYPES:
                if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
                    # 'Map' is not recursive in 2019 schema
                    continue
                more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))()
        if not any(more_regions):
            # Nothing nested anywhere: finish with the filtering step.
            return self._get_recursive_regions(regions, 1, classes)
        ret = []
        for r, more in zip(regions, more_regions):
            ret.append(r)
            ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
        return self._get_recursive_regions(ret, 1, classes)

    def _get_recursive_reading_order(self, rogroup):
        """
        Return the ``regionRef`` values found in `rogroup`, descending into nested groups.

        For ordered groups the elements come from ``get_AllIndexed()``; for
        unordered groups region refs, ordered groups and unordered groups
        are concatenated in that order. Each element's own ``regionRef`` is
        listed before the refs of its descendants.
        """
        if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
            elements = rogroup.get_AllIndexed()
        if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
            elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
        regionrefs = []
        for elem in elements:
            regionrefs.append(elem.get_regionRef())
            if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
                regionrefs.extend(self._get_recursive_reading_order(elem))
        return regionrefs

    def get_AllRegions(self, classes=None, order='document', depth=0):
        """
        Get all the ``*Region`` elements, or only those provided by `classes`.
        Return in document order, unless the top element is ``Page`` and
        `order` is ``reading-order``.

        Arguments:
            classes (list): Classes of regions that shall be returned, \
                e.g. ``['Text', 'Image']``
            order ("document"|"reading-order"|"reading-order-only"): Whether to \
                return regions sorted by document order (``document``, default) or by
                reading order with regions not in the reading order at the end of the
                returned list (``reading-order``) or regions not in the reading order
                omitted (``reading-order-only``). The latter two are only available
                on page level.
            depth (int): Recursive depth to look for regions at, set to `0` for \
                all regions at any depth. Default: 0

        Returns:
            a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
                :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
                :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
                :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
                :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
                :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
                :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
                and/or :py:class:`CustomRegionType`

        Raises:
            ValueError: if `order` is not one of the accepted values or \
                `depth` is negative

        For example, to get all text anywhere on the page in reading order, use:
        ::
            '\\n'.join(line.get_TextEquiv()[0].Unicode
                       for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                       for line in region.get_TextLine())
        """
        if order not in ['document', 'reading-order', 'reading-order-only']:
            raise ValueError("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
        if depth < 0:
            raise ValueError("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
        # depth 0 means unlimited; otherwise one extra level accounts for self as the starting point.
        ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
        if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
            reading_order = self.get_ReadingOrder()
            if reading_order:
                reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
            if reading_order:
                reading_order = self._get_recursive_reading_order(reading_order)
            if reading_order:
                id2region = {region.id: region for region in ret}
                in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
                if order == 'reading-order-only':
                    ret = in_reading_order
                else:
                    ret = in_reading_order + [r for r in ret if r not in in_reading_order]
        return ret

    def set_orientation(self, orientation):
        """
        Set deskewing angle to given `orientation` number.
        Moreover, invalidate self's ``pc:AlternativeImage``s
        (because they will have been rotated and enlarged
        with the angle of the previous value).
        """
        if hasattr(self, 'invalidate_AlternativeImage'):
            # PageType, RegionType:
            self.invalidate_AlternativeImage(feature_selector='deskewed')
        self.orientation = orientation
# end class ChemRegionType
| 13547 |
|
|
| 13548 |
|
|
|
@@ 12794-13052 (lines=259) @@
|
| 12791 |
|
# end class AdvertRegionType |
| 12792 |
|
|
| 12793 |
|
|
| 12794 |
|
class MusicRegionType(RegionType): |
| 12795 |
|
"""MusicRegionType -- |
| 12796 |
|
Regions containing musical notations. |
| 12797 |
|
|
| 12798 |
|
* orientation -- |
| 12799 |
|
The angle the rectangle encapsulating a region |
| 12800 |
|
has to be rotated in clockwise direction |
| 12801 |
|
in order to correct the present skew |
| 12802 |
|
(negative values indicate anti-clockwise rotation). |
| 12803 |
|
Range: -179.999,180 |
| 12804 |
|
|
| 12805 |
|
* bgColour -- |
| 12806 |
|
The background colour of the region |
| 12807 |
|
|
| 12808 |
|
""" |
| 12809 |
|
__hash__ = GeneratedsSuper.__hash__ |
| 12810 |
|
member_data_items_ = [ |
| 12811 |
|
MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}), |
| 12812 |
|
MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}), |
| 12813 |
|
] |
| 12814 |
|
subclass = None |
| 12815 |
|
superclass = RegionType |
| 12816 |
|
def __init__(self, id=None, custom=None, comments=None, continuation=None, AlternativeImage=None, Coords=None, UserDefined=None, Labels=None, Roles=None, TextRegion=None, ImageRegion=None, LineDrawingRegion=None, GraphicRegion=None, TableRegion=None, ChartRegion=None, SeparatorRegion=None, MathsRegion=None, ChemRegion=None, MusicRegion=None, AdvertRegion=None, NoiseRegion=None, UnknownRegion=None, CustomRegion=None, orientation=None, bgColour=None, gds_collector_=None, **kwargs_): |
| 12817 |
|
self.gds_collector_ = gds_collector_ |
| 12818 |
|
self.gds_elementtree_node_ = None |
| 12819 |
|
self.original_tagname_ = None |
| 12820 |
|
self.parent_object_ = kwargs_.get('parent_object_') |
| 12821 |
|
self.ns_prefix_ = "pc" |
| 12822 |
|
super(globals().get("MusicRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_) |
| 12823 |
|
self.orientation = _cast(float, orientation) |
| 12824 |
|
self.orientation_nsprefix_ = "pc" |
| 12825 |
|
self.bgColour = _cast(None, bgColour) |
| 12826 |
|
self.bgColour_nsprefix_ = "pc" |
| 12827 |
|
def factory(*args_, **kwargs_): |
| 12828 |
|
if CurrentSubclassModule_ is not None: |
| 12829 |
|
subclass = getSubclassFromModule_( |
| 12830 |
|
CurrentSubclassModule_, MusicRegionType) |
| 12831 |
|
if subclass is not None: |
| 12832 |
|
return subclass(*args_, **kwargs_) |
| 12833 |
|
if MusicRegionType.subclass: |
| 12834 |
|
return MusicRegionType.subclass(*args_, **kwargs_) |
| 12835 |
|
else: |
| 12836 |
|
return MusicRegionType(*args_, **kwargs_) |
| 12837 |
|
factory = staticmethod(factory) |
| 12838 |
|
def get_ns_prefix_(self): |
| 12839 |
|
return self.ns_prefix_ |
| 12840 |
|
def set_ns_prefix_(self, ns_prefix): |
| 12841 |
|
self.ns_prefix_ = ns_prefix |
| 12842 |
|
def get_orientation(self): |
| 12843 |
|
return self.orientation |
| 12844 |
|
def set_orientation(self, orientation): |
| 12845 |
|
self.orientation = orientation |
| 12846 |
|
def get_bgColour(self): |
| 12847 |
|
return self.bgColour |
| 12848 |
|
def set_bgColour(self, bgColour): |
| 12849 |
|
self.bgColour = bgColour |
| 12850 |
|
def validate_ColourSimpleType(self, value): |
| 12851 |
|
# Validate type pc:ColourSimpleType, a restriction on string. |
| 12852 |
|
if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None: |
| 12853 |
|
if not isinstance(value, str): |
| 12854 |
|
lineno = self.gds_get_node_lineno_() |
| 12855 |
|
self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, }) |
| 12856 |
|
return False |
| 12857 |
|
value = value |
| 12858 |
|
enumerations = ['black', 'blue', 'brown', 'cyan', 'green', 'grey', 'indigo', 'magenta', 'orange', 'pink', 'red', 'turquoise', 'violet', 'white', 'yellow', 'other'] |
| 12859 |
|
if value not in enumerations: |
| 12860 |
|
lineno = self.gds_get_node_lineno_() |
| 12861 |
|
self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} ) |
| 12862 |
|
result = False |
| 12863 |
|
def has__content(self): |
| 12864 |
|
if ( |
| 12865 |
|
super(MusicRegionType, self).has__content() |
| 12866 |
|
): |
| 12867 |
|
return True |
| 12868 |
|
else: |
| 12869 |
|
return False |
| 12870 |
|
def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MusicRegionType', pretty_print=True): |
| 12871 |
|
imported_ns_def_ = GenerateDSNamespaceDefs_.get('MusicRegionType') |
| 12872 |
|
if imported_ns_def_ is not None: |
| 12873 |
|
namespacedef_ = imported_ns_def_ |
| 12874 |
|
if pretty_print: |
| 12875 |
|
eol_ = '\n' |
| 12876 |
|
else: |
| 12877 |
|
eol_ = '' |
| 12878 |
|
if self.original_tagname_ is not None and name_ == 'MusicRegionType': |
| 12879 |
|
name_ = self.original_tagname_ |
| 12880 |
|
if UseCapturedNS_ and self.ns_prefix_: |
| 12881 |
|
namespaceprefix_ = self.ns_prefix_ + ':' |
| 12882 |
|
showIndent(outfile, level, pretty_print) |
| 12883 |
|
outfile.write('<%s%s%s' % (namespaceprefix_, name_, namespacedef_ and ' ' + namespacedef_ or '', )) |
| 12884 |
|
already_processed = set() |
| 12885 |
|
self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType') |
| 12886 |
|
if self.has__content(): |
| 12887 |
|
outfile.write('>%s' % (eol_, )) |
| 12888 |
|
self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='MusicRegionType', pretty_print=pretty_print) |
| 12889 |
|
showIndent(outfile, level, pretty_print) |
| 12890 |
|
outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_)) |
| 12891 |
|
else: |
| 12892 |
|
outfile.write('/>%s' % (eol_, )) |
| 12893 |
|
def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='MusicRegionType'): |
| 12894 |
|
super(MusicRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='MusicRegionType') |
| 12895 |
|
if self.orientation is not None and 'orientation' not in already_processed: |
| 12896 |
|
already_processed.add('orientation') |
| 12897 |
|
outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation')) |
| 12898 |
|
if self.bgColour is not None and 'bgColour' not in already_processed: |
| 12899 |
|
already_processed.add('bgColour') |
| 12900 |
|
outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), )) |
| 12901 |
|
def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='MusicRegionType', fromsubclass_=False, pretty_print=True): |
| 12902 |
|
super(MusicRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print) |
| 12903 |
|
def to_etree(self, parent_element=None, name_='MusicRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None): |
| 12904 |
|
element = super(MusicRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_) |
| 12905 |
|
if self.orientation is not None: |
| 12906 |
|
element.set('orientation', self.gds_format_float(self.orientation)) |
| 12907 |
|
if self.bgColour is not None: |
| 12908 |
|
element.set('bgColour', self.gds_format_string(self.bgColour)) |
| 12909 |
|
if mapping_ is not None: |
| 12910 |
|
mapping_[id(self)] = element |
| 12911 |
|
if reverse_mapping_ is not None: |
| 12912 |
|
reverse_mapping_[element] = self |
| 12913 |
|
return element |
| 12914 |
|
def build(self, node, gds_collector_=None): |
| 12915 |
|
self.gds_collector_ = gds_collector_ |
| 12916 |
|
if SaveElementTreeNode: |
| 12917 |
|
self.gds_elementtree_node_ = node |
| 12918 |
|
already_processed = set() |
| 12919 |
|
self.ns_prefix_ = node.prefix |
| 12920 |
|
self._buildAttributes(node, node.attrib, already_processed) |
| 12921 |
|
for child in node: |
| 12922 |
|
nodeName_ = Tag_pattern_.match(child.tag).groups()[-1] |
| 12923 |
|
self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_) |
| 12924 |
|
return self |
| 12925 |
|
def _buildAttributes(self, node, attrs, already_processed): |
| 12926 |
|
value = find_attr_value_('orientation', node) |
| 12927 |
|
if value is not None and 'orientation' not in already_processed: |
| 12928 |
|
already_processed.add('orientation') |
| 12929 |
|
value = self.gds_parse_float(value, node, 'orientation') |
| 12930 |
|
self.orientation = value |
| 12931 |
|
value = find_attr_value_('bgColour', node) |
| 12932 |
|
if value is not None and 'bgColour' not in already_processed: |
| 12933 |
|
already_processed.add('bgColour') |
| 12934 |
|
self.bgColour = value |
| 12935 |
|
self.validate_ColourSimpleType(self.bgColour) # validate type ColourSimpleType |
| 12936 |
|
super(MusicRegionType, self)._buildAttributes(node, attrs, already_processed) |
| 12937 |
|
def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None): |
| 12938 |
|
super(MusicRegionType, self)._buildChildren(child_, node, nodeName_, True) |
| 12939 |
|
pass |
| 12940 |
|
def __hash__(self): |
| 12941 |
|
return hash(self.id) |
| 12942 |
|
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring |
| 12943 |
|
def _region_class(self, x): # pylint: disable=unused-argument |
| 12944 |
|
return x.__class__.__name__.replace('RegionType', '') |
| 12945 |
|
|
| 12946 |
|
def _get_recursive_regions(self, regions, level, classes=None): |
| 12947 |
|
from .constants import PAGE_REGION_TYPES # pylint: disable=relative-beyond-top-level,import-outside-toplevel |
| 12948 |
|
if level == 1: |
| 12949 |
|
# stop recursion, filter classes |
| 12950 |
|
if classes: |
| 12951 |
|
return [r for r in regions if self._region_class(r) in classes] |
| 12952 |
|
if regions and regions[0].__class__.__name__ == 'PageType': |
| 12953 |
|
regions = regions[1:] |
| 12954 |
|
return regions |
| 12955 |
|
# find more regions recursively |
| 12956 |
|
more_regions = [] |
| 12957 |
|
for region in regions: |
| 12958 |
|
more_regions.append([]) |
| 12959 |
|
for class_ in PAGE_REGION_TYPES: |
| 12960 |
|
if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable |
| 12961 |
|
# 'Map' is not recursive in 2019 schema |
| 12962 |
|
continue |
| 12963 |
|
more_regions[-1] += getattr(region, 'get_{}Region'.format(class_))() |
| 12964 |
|
if not any(more_regions): |
| 12965 |
|
return self._get_recursive_regions(regions, 1, classes) |
| 12966 |
|
ret = [] |
| 12967 |
|
for r, more in zip(regions, more_regions): |
| 12968 |
|
ret.append(r) |
| 12969 |
|
ret += self._get_recursive_regions(more, level - 1 if level else 0, classes) |
| 12970 |
|
return self._get_recursive_regions(ret, 1, classes) |
| 12971 |
|
|
| 12972 |
|
def _get_recursive_reading_order(self, rogroup): |
| 12973 |
|
if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable |
| 12974 |
|
elements = rogroup.get_AllIndexed() |
| 12975 |
|
if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable |
| 12976 |
|
elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup()) |
| 12977 |
|
regionrefs = list() |
| 12978 |
|
for elem in elements: |
| 12979 |
|
regionrefs.append(elem.get_regionRef()) |
| 12980 |
|
if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable |
| 12981 |
|
regionrefs.extend(self._get_recursive_reading_order(elem)) |
| 12982 |
|
return regionrefs |
| 12983 |
|
|
| 12984 |
|
def get_AllRegions(self, classes=None, order='document', depth=0): |
| 12985 |
|
""" |
| 12986 |
|
Get all the ``*Region`` elements, or only those provided by `classes`. |
| 12987 |
|
Return in document order, unless the top element is ``Page`` and |
| 12988 |
|
`order` is ``reading-order``. |
| 12989 |
|
|
| 12990 |
|
Arguments: |
| 12991 |
|
classes (list): Classes of regions that shall be returned, \ |
| 12992 |
|
e.g. ``['Text', 'Image']`` |
| 12993 |
|
order ("document"|"reading-order"|"reading-order-only"): Whether to \ |
| 12994 |
|
return regions sorted by document order (``document``, default) or by |
| 12995 |
|
reading order with regions not in the reading order at the end of the |
| 12996 |
|
returned list (``reading-order``) or regions not in the reading order |
| 12997 |
|
omitted (``reading-order-only``). The latter two are only available |
| 12998 |
|
on page level. |
| 12999 |
|
depth (int): Recursive depth to look for regions at, set to `0` for \ |
| 13000 |
|
all regions at any depth. Default: 0 |
| 13001 |
|
|
| 13002 |
|
Returns: |
| 13003 |
|
a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \ |
| 13004 |
|
:py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \ |
| 13005 |
|
:py:class:`TableRegionType`, :py:class:`ChartRegionType`, \ |
| 13006 |
|
:py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \ |
| 13007 |
|
:py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \ |
| 13008 |
|
:py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \ |
| 13009 |
|
:py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \ |
| 13010 |
|
and/or :py:class:`CustomRegionType` |
| 13011 |
|
|
| 13012 |
|
For example, to get all text anywhere on the page in reading order, use: |
| 13013 |
|
:: |
| 13014 |
|
'\\n'.join(line.get_TextEquiv()[0].Unicode |
| 13015 |
|
for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order') |
| 13016 |
|
for line in region.get_TextLine()) |
| 13017 |
|
""" |
| 13018 |
|
if order not in ['document', 'reading-order', 'reading-order-only']: |
| 13019 |
|
raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order)) |
| 13020 |
|
if depth < 0: |
| 13021 |
|
raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth)) |
| 13022 |
|
ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes) |
| 13023 |
|
if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'): |
| 13024 |
|
reading_order = self.get_ReadingOrder() |
| 13025 |
|
if reading_order: |
| 13026 |
|
reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup() |
| 13027 |
|
if reading_order: |
| 13028 |
|
reading_order = self._get_recursive_reading_order(reading_order) |
| 13029 |
|
if reading_order: |
| 13030 |
|
id2region = {region.id: region for region in ret} |
| 13031 |
|
in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region] |
| 13032 |
|
# print("ret: {} / in_ro: {} / not-in-ro: {}".format( |
| 13033 |
|
# len(ret), |
| 13034 |
|
# len([id2region[region_id] for region_id in reading_order if region_id in id2region]), |
| 13035 |
|
# len([r for r in ret if r not in in_reading_order]) |
| 13036 |
|
# )) |
| 13037 |
|
if order == 'reading-order-only': |
| 13038 |
|
ret = in_reading_order |
| 13039 |
|
else: |
| 13040 |
|
ret = in_reading_order + [r for r in ret if r not in in_reading_order] |
| 13041 |
|
return ret |
| 13042 |
|
def set_orientation(self, orientation):
    """
    Store `orientation` as the new deskewing angle of this element.

    If the element provides ``invalidate_AlternativeImage``, it is
    called first with ``feature_selector='deskewed'``: existing
    ``pc:AlternativeImage`` entries were rotated and enlarged using
    the previous angle and no longer match the new one.

    Arguments:
        orientation: the new angle value, stored as given
    """
    try:
        invalidate = self.invalidate_AlternativeImage
    except AttributeError:
        # This element type has no alternative images to invalidate.
        pass
    else:
        # PageType, RegionType:
        invalidate(feature_selector='deskewed')
    self.orientation = orientation
| 13053 |
|
# end class MusicRegionType |
| 13054 |
|
|
| 13055 |
|
|
|
@@ 12532-12790 (lines=259) @@
|
| 12529 |
|
# end class NoiseRegionType |
| 12530 |
|
|
| 12531 |
|
|
| 12532 |
|
class AdvertRegionType(RegionType):
    """AdvertRegionType --
    Regions containing advertisements.

    * orientation --
      The angle the rectangle encapsulating a region
      has to be rotated in clockwise direction
      in order to correct the present skew
      (negative values indicate anti-clockwise rotation).
      Range: -179.999,180

    * bgColour --
      The background colour of the region

    """
    # Attributes this type adds on top of those of RegionType.
    member_data_items_ = [
        MemberSpec_('orientation', 'float', 0, 1, {'use': 'optional', 'name': 'orientation'}),
        MemberSpec_('bgColour', 'pc:ColourSimpleType', 0, 1, {'use': 'optional', 'name': 'bgColour'}),
    ]
    # Optional replacement class consulted by factory().
    subclass = None
    superclass = RegionType

    def __init__(self, id=None, custom=None, comments=None, continuation=None, AlternativeImage=None, Coords=None, UserDefined=None, Labels=None, Roles=None, TextRegion=None, ImageRegion=None, LineDrawingRegion=None, GraphicRegion=None, TableRegion=None, ChartRegion=None, SeparatorRegion=None, MathsRegion=None, ChemRegion=None, MusicRegion=None, AdvertRegion=None, NoiseRegion=None, UnknownRegion=None, CustomRegion=None, orientation=None, bgColour=None, gds_collector_=None, **kwargs_):
        """
        Initialise the region.

        All region-generic arguments are forwarded positionally to the
        ``RegionType`` constructor; only `orientation` (cast to float) and
        `bgColour` are stored by this class itself.
        """
        self.gds_collector_ = gds_collector_
        self.gds_elementtree_node_ = None
        self.original_tagname_ = None
        self.parent_object_ = kwargs_.get('parent_object_')
        self.ns_prefix_ = "pc"
        # The class is looked up by name at call time rather than referenced directly.
        super(globals().get("AdvertRegionType"), self).__init__(id, custom, comments, continuation, AlternativeImage, Coords, UserDefined, Labels, Roles, TextRegion, ImageRegion, LineDrawingRegion, GraphicRegion, TableRegion, ChartRegion, SeparatorRegion, MathsRegion, ChemRegion, MusicRegion, AdvertRegion, NoiseRegion, UnknownRegion, CustomRegion, **kwargs_)
        self.orientation = _cast(float, orientation)
        self.orientation_nsprefix_ = "pc"
        self.bgColour = _cast(None, bgColour)
        self.bgColour_nsprefix_ = "pc"

    @staticmethod
    def factory(*args_, **kwargs_):
        """
        Instantiate the class to use for ``AdvertRegionType``.

        Preference order: a subclass found in ``CurrentSubclassModule_``,
        then ``AdvertRegionType.subclass``, then ``AdvertRegionType`` itself.
        """
        if CurrentSubclassModule_ is not None:
            subclass = getSubclassFromModule_(
                CurrentSubclassModule_, AdvertRegionType)
            if subclass is not None:
                return subclass(*args_, **kwargs_)
        if AdvertRegionType.subclass:
            return AdvertRegionType.subclass(*args_, **kwargs_)
        return AdvertRegionType(*args_, **kwargs_)

    def get_ns_prefix_(self):
        """Return the namespace prefix stored for this element."""
        return self.ns_prefix_

    def set_ns_prefix_(self, ns_prefix):
        """Store `ns_prefix` as the namespace prefix for this element."""
        self.ns_prefix_ = ns_prefix

    def get_orientation(self):
        """Return the stored ``orientation`` value."""
        return self.orientation

    def get_bgColour(self):
        """Return the stored ``bgColour`` value."""
        return self.bgColour

    def set_bgColour(self, bgColour):
        """Store `bgColour` without validating it."""
        self.bgColour = bgColour

    def validate_ColourSimpleType(self, value):
        """
        Validate `value` against ``pc:ColourSimpleType``, a restriction on string.

        Validation only happens when `value` is not None, simple-type
        validation is enabled (``Validate_simpletypes_``) and a message
        collector is attached; otherwise the value is accepted unchecked.
        Violations are reported through ``self.gds_collector_``.

        Returns:
            ``False`` if a violation was reported, ``True`` otherwise.
        """
        result = True
        if value is not None and Validate_simpletypes_ and self.gds_collector_ is not None:
            if not isinstance(value, str):
                lineno = self.gds_get_node_lineno_()
                self.gds_collector_.add_message('Value "%(value)s"%(lineno)s is not of the correct base simple type (str)' % {"value": value, "lineno": lineno, })
                return False
            enumerations = ['black', 'blue', 'brown', 'cyan', 'green', 'grey', 'indigo', 'magenta', 'orange', 'pink', 'red', 'turquoise', 'violet', 'white', 'yellow', 'other']
            if value not in enumerations:
                lineno = self.gds_get_node_lineno_()
                self.gds_collector_.add_message('Value "%(value)s"%(lineno)s does not match xsd enumeration restriction on ColourSimpleType' % {"value" : encode_str_2_3(value), "lineno": lineno} )
                # Previously this outcome was assigned to a local and then
                # dropped, so callers could not tell it from success.
                result = False
        return result

    def has__content(self):
        """Return True if the ``RegionType`` part reports content; this class adds no child elements."""
        return bool(super(AdvertRegionType, self).has__content())

    def export(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AdvertRegionType', pretty_print=True):
        """
        Write this element as XML text to `outfile`.

        Arguments:
            outfile: object with a ``write`` method receiving the XML text
            level (int): indentation level passed to ``showIndent``
            namespaceprefix_ (str): prefix written before the tag name
            namespacedef_ (str): namespace declaration(s) written on the tag
            name_ (str): tag name; replaced by the tag name captured at
                parse time when left at its default
            pretty_print (bool): whether to emit indentation and newlines
        """
        imported_ns_def_ = GenerateDSNamespaceDefs_.get('AdvertRegionType')
        if imported_ns_def_ is not None:
            namespacedef_ = imported_ns_def_
        eol_ = '\n' if pretty_print else ''
        if self.original_tagname_ is not None and name_ == 'AdvertRegionType':
            name_ = self.original_tagname_
        if UseCapturedNS_ and self.ns_prefix_:
            namespaceprefix_ = self.ns_prefix_ + ':'
        showIndent(outfile, level, pretty_print)
        outfile.write('<%s%s%s' % (namespaceprefix_, name_, ' ' + namespacedef_ if namespacedef_ else '', ))
        already_processed = set()
        self._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType')
        if self.has__content():
            outfile.write('>%s' % (eol_, ))
            self._exportChildren(outfile, level + 1, namespaceprefix_, namespacedef_, name_='AdvertRegionType', pretty_print=pretty_print)
            showIndent(outfile, level, pretty_print)
            outfile.write('</%s%s>%s' % (namespaceprefix_, name_, eol_))
        else:
            # No content: emit a self-closing tag.
            outfile.write('/>%s' % (eol_, ))

    def _exportAttributes(self, outfile, level, already_processed, namespaceprefix_='', name_='AdvertRegionType'):
        """Write the inherited attributes, then ``orientation`` and ``bgColour`` when set and not yet written."""
        super(AdvertRegionType, self)._exportAttributes(outfile, level, already_processed, namespaceprefix_, name_='AdvertRegionType')
        if self.orientation is not None and 'orientation' not in already_processed:
            already_processed.add('orientation')
            outfile.write(' orientation="%s"' % self.gds_format_float(self.orientation, input_name='orientation'))
        if self.bgColour is not None and 'bgColour' not in already_processed:
            already_processed.add('bgColour')
            outfile.write(' bgColour=%s' % (self.gds_encode(self.gds_format_string(quote_attrib(self.bgColour), input_name='bgColour')), ))

    def _exportChildren(self, outfile, level, namespaceprefix_='', namespacedef_='xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"', name_='AdvertRegionType', fromsubclass_=False, pretty_print=True):
        """Write the child elements; all of them are handled by ``RegionType``."""
        super(AdvertRegionType, self)._exportChildren(outfile, level, namespaceprefix_, namespacedef_, name_, True, pretty_print=pretty_print)

    def to_etree(self, parent_element=None, name_='AdvertRegionType', mapping_=None, reverse_mapping_=None, nsmap_=None):
        """
        Build the element-tree node for this region and return it.

        The node is created by ``RegionType.to_etree``; this method adds the
        ``orientation`` and ``bgColour`` attributes when set and records the
        object/node pair in `mapping_` and `reverse_mapping_` when given.
        """
        element = super(AdvertRegionType, self).to_etree(parent_element, name_, mapping_, reverse_mapping_, nsmap_)
        if self.orientation is not None:
            element.set('orientation', self.gds_format_float(self.orientation))
        if self.bgColour is not None:
            element.set('bgColour', self.gds_format_string(self.bgColour))
        if mapping_ is not None:
            mapping_[id(self)] = element
        if reverse_mapping_ is not None:
            reverse_mapping_[element] = self
        return element

    def build(self, node, gds_collector_=None):
        """
        Populate this object from the parsed element `node` and return self.

        Attributes are read first, then every child of `node` is dispatched
        to ``_buildChildren`` under its tag name as extracted by ``Tag_pattern_``.
        """
        self.gds_collector_ = gds_collector_
        if SaveElementTreeNode:
            self.gds_elementtree_node_ = node
        already_processed = set()
        self.ns_prefix_ = node.prefix
        self._buildAttributes(node, node.attrib, already_processed)
        for child in node:
            nodeName_ = Tag_pattern_.match(child.tag).groups()[-1]
            self._buildChildren(child, node, nodeName_, gds_collector_=gds_collector_)
        return self

    def _buildAttributes(self, node, attrs, already_processed):
        """Read ``orientation`` and ``bgColour`` from `node`, then let ``RegionType`` read the rest."""
        value = find_attr_value_('orientation', node)
        if value is not None and 'orientation' not in already_processed:
            already_processed.add('orientation')
            value = self.gds_parse_float(value, node, 'orientation')
            self.orientation = value
        value = find_attr_value_('bgColour', node)
        if value is not None and 'bgColour' not in already_processed:
            already_processed.add('bgColour')
            self.bgColour = value
            self.validate_ColourSimpleType(self.bgColour)    # validate type ColourSimpleType
        super(AdvertRegionType, self)._buildAttributes(node, attrs, already_processed)

    def _buildChildren(self, child_, node, nodeName_, fromsubclass_=False, gds_collector_=None):
        """Delegate child parsing to ``RegionType``; this class has no children of its own."""
        super(AdvertRegionType, self)._buildChildren(child_, node, nodeName_, True)

    def __hash__(self):
        """Hash by the region's ``id``."""
        return hash(self.id)

    # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
    def _region_class(self, x): # pylint: disable=unused-argument
        """Return the class name of `x` with ``RegionType`` removed, e.g. ``Text`` for ``TextRegionType``."""
        return x.__class__.__name__.replace('RegionType', '')

    def _get_recursive_regions(self, regions, level, classes=None):
        """
        Collect `regions` together with their nested regions.

        Arguments:
            regions (list): elements to start from
            level (int): ``1`` stops the recursion and returns the (filtered)
                `regions`; ``0`` is passed on unchanged to nested calls, any
                other value is decremented per nesting level
            classes (list): if non-empty, keep only elements whose
                ``_region_class`` is listed

        Returns:
            a list in which each element is followed by its nested regions
        """
        from .constants import PAGE_REGION_TYPES  # pylint: disable=relative-beyond-top-level,import-outside-toplevel
        if level == 1:
            # stop recursion, filter classes
            if classes:
                return [r for r in regions if self._region_class(r) in classes]
            if regions and regions[0].__class__.__name__ == 'PageType':
                # A leading PageType element is dropped from the result.
                regions = regions[1:]
            return regions
        # find more regions recursively
        more_regions = []
        for region in regions:
            children = []
            for class_ in PAGE_REGION_TYPES:
                if class_ == 'Map' and not isinstance(region, PageType): # pylint: disable=undefined-variable
                    # 'Map' is not recursive in 2019 schema
                    continue
                children += getattr(region, 'get_{}Region'.format(class_))()
            more_regions.append(children)
        if not any(more_regions):
            return self._get_recursive_regions(regions, 1, classes)
        ret = []
        for r, more in zip(regions, more_regions):
            ret.append(r)
            ret += self._get_recursive_regions(more, level - 1 if level else 0, classes)
        return self._get_recursive_regions(ret, 1, classes)

    def _get_recursive_reading_order(self, rogroup):
        """
        Flatten a reading-order group into a list of ``regionRef`` values.

        Each member contributes its own ``regionRef``; members that are not
        plain region references are groups and are descended into.

        Raises:
            TypeError: if `rogroup` is neither an ordered nor an unordered group
        """
        if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): # pylint: disable=undefined-variable
            elements = rogroup.get_AllIndexed()
        elif isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)): # pylint: disable=undefined-variable
            elements = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() + rogroup.get_UnorderedGroup())
        else:
            # Previously this case failed with an UnboundLocalError on `elements`.
            raise TypeError("Expected an ordered or unordered reading order group, not '{}'".format(type(rogroup).__name__))
        regionrefs = []
        for elem in elements:
            regionrefs.append(elem.get_regionRef())
            if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): # pylint: disable=undefined-variable
                regionrefs.extend(self._get_recursive_reading_order(elem))
        return regionrefs

    def get_AllRegions(self, classes=None, order='document', depth=0):
        """
        Get all the ``*Region`` elements, or only those provided by `classes`.
        Return in document order, unless the top element is ``Page`` and
        `order` is ``reading-order``.

        Arguments:
            classes (list): Classes of regions that shall be returned, \
                e.g. ``['Text', 'Image']``
            order ("document"|"reading-order"|"reading-order-only"): Whether to \
                return regions sorted by document order (``document``, default) or by
                reading order with regions not in the reading order at the end of the
                returned list (``reading-order``) or regions not in the reading order
                omitted (``reading-order-only``). The latter two are only available
                on page level.
            depth (int): Recursive depth to look for regions at, set to `0` for \
                all regions at any depth. Default: 0

        Returns:
            a list of :py:class:`TextRegionType`, :py:class:`ImageRegionType`, \
                :py:class:`LineDrawingRegionType`, :py:class:`GraphicRegionType`, \
                :py:class:`TableRegionType`, :py:class:`ChartRegionType`, \
                :py:class:`MapRegionType`, :py:class:`SeparatorRegionType`, \
                :py:class:`MathsRegionType`, :py:class:`ChemRegionType`, \
                :py:class:`MusicRegionType`, :py:class:`AdvertRegionType`, \
                :py:class:`NoiseRegionType`, :py:class:`UnknownRegionType`, \
                and/or :py:class:`CustomRegionType`

        Raises:
            Exception: if `order` is not one of the three allowed values or
                `depth` is negative

        For example, to get all text anywhere on the page in reading order, use:
        ::
            '\\n'.join(line.get_TextEquiv()[0].Unicode
                       for region in page.get_AllRegions(classes=['Text'], depth=0, order='reading-order')
                       for line in region.get_TextLine())
        """
        if order not in ['document', 'reading-order', 'reading-order-only']:
            raise Exception("Argument 'order' must be either 'document', 'reading-order' or 'reading-order-only', not '{}'".format(order))
        if depth < 0:
            raise Exception("Argument 'depth' must be an integer greater-or-equal 0, not '{}'".format(depth))
        # depth 0 means "unlimited" and is passed on as level 0; otherwise the
        # starting element itself occupies one level.
        ret = self._get_recursive_regions([self], depth + 1 if depth else 0, classes)
        # Reordering only happens when this object's class is named PageType.
        if self.__class__.__name__ == 'PageType' and order.startswith('reading-order'):
            reading_order = self.get_ReadingOrder()
            if reading_order:
                reading_order = reading_order.get_OrderedGroup() or reading_order.get_UnorderedGroup()
            if reading_order:
                reading_order = self._get_recursive_reading_order(reading_order)
            if reading_order:
                id2region = {region.id: region for region in ret}
                # References to ids that were not collected are ignored.
                in_reading_order = [id2region[region_id] for region_id in reading_order if region_id in id2region]
                if order == 'reading-order-only':
                    ret = in_reading_order
                else:
                    ret = in_reading_order + [r for r in ret if r not in in_reading_order]
        return ret

    def set_orientation(self, orientation):
        """
        Set deskewing angle to given `orientation` number.
        Moreover, invalidate self's ``pc:AlternativeImage``s
        (because they will have been rotated and enlarged
        with the angle of the previous value).
        """
        if hasattr(self, 'invalidate_AlternativeImage'):
            # PageType, RegionType:
            self.invalidate_AlternativeImage(feature_selector='deskewed')
        self.orientation = orientation
| 12791 |
|
# end class AdvertRegionType |
| 12792 |
|
|
| 12793 |
|
|