Passed
Pull Request — master (#1329)
by
unknown
03:57 queued 01:46
created

ocrd_models.constants   A

Complexity

Total Complexity 1

Size/Duplication

Total Lines 101
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 1
eloc 81
dl 0
loc 101
rs 10
c 0
b 0
f 0
1
"""
2
Constants for ocrd_models.
3
"""
4
from re import Pattern
5
from enum import Enum, auto
6
from dataclasses import dataclass, field
7
from abc import ABC, abstractmethod
8
from typing import Any, List, Optional, Union
9
from ocrd_utils import resource_string
10
11
# Public API of this module: the names bound by ``from <module> import *``.
# The abstract base METS_DIV_ATTRIBUTE_PATTERN is listed alongside its
# concrete subclasses so that star-importers can refer to the common supertype.
__all__ = [
    'IDENTIFIER_PRIORITY',
    'METS_XML_EMPTY',
    'NAMESPACES',
    'TAG_METS_AGENT',
    'TAG_METS_DIV',
    'TAG_METS_FILE',
    'TAG_METS_FILEGRP',
    'TAG_METS_FILESEC',
    'TAG_METS_FPTR',
    'TAG_METS_FLOCAT',
    'TAG_METS_METSHDR',
    'TAG_METS_NAME',
    'TAG_METS_NOTE',
    'TAG_METS_STRUCTMAP',
    'TAG_MODS_IDENTIFIER',
    'TAG_PAGE_ALTERNATIVEIMAGE',
    'TAG_PAGE_COORDS',
    'TAG_PAGE_READINGORDER',
    'TAG_PAGE_REGIONREFINDEXED',
    'TAG_PAGE_TEXTLINE',
    'TAG_PAGE_TEXTEQUIV',
    'TAG_PAGE_TEXTREGION',
    'METS_PAGE_DIV_ATTRIBUTE',
    'METS_STRUCT_DIV_ATTRIBUTE',
    'METS_DIV_ATTRIBUTE_PATTERN',
    'METS_DIV_ATTRIBUTE_ATOM_PATTERN',
    'METS_DIV_ATTRIBUTE_RANGE_PATTERN',
    'METS_DIV_ATTRIBUTE_REGEX_PATTERN',
    'PAGE_REGION_TYPES',
    'PAGE_ALTIMG_FEATURES',
]
42
43
44
# Identifier type names, in list order.
# NOTE(review): presumably earlier entries take precedence when several
# identifier types are available -- confirm against the callers.
IDENTIFIER_PRIORITY = ['purl', 'urn', 'doi', 'url']

# Loaded once at import time from this package's ``mets-empty.xml`` resource
# via ``ocrd_utils.resource_string``.
METS_XML_EMPTY = resource_string(__package__, 'mets-empty.xml')
47
48
# XML namespace prefix -> namespace URI.
NAMESPACES = {
    'mets': "http://www.loc.gov/METS/",
    'mods': "http://www.loc.gov/mods/v3",
    'xlink': "http://www.w3.org/1999/xlink",
    'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
    # NOTE(review): the canonical XSLT namespace URI has no trailing '#';
    # the value is kept unchanged because callers may rely on it -- confirm.
    'xsl': 'http://www.w3.org/1999/XSL/Transform#',
    'ocrd': 'https://ocr-d.de',
}

# Fully qualified element names in ``{namespace-uri}localname`` form,
# built from the NAMESPACES mapping above.

# METS elements
TAG_METS_AGENT            = '{%s}agent' % NAMESPACES['mets']
TAG_METS_DIV              = '{%s}div' % NAMESPACES['mets']
TAG_METS_FILE             = '{%s}file' % NAMESPACES['mets']
TAG_METS_FILEGRP          = '{%s}fileGrp' % NAMESPACES['mets']
TAG_METS_FILESEC          = '{%s}fileSec' % NAMESPACES['mets']
TAG_METS_FPTR             = '{%s}fptr' % NAMESPACES['mets']
TAG_METS_FLOCAT           = '{%s}FLocat' % NAMESPACES['mets']
TAG_METS_METSHDR          = '{%s}metsHdr' % NAMESPACES['mets']
TAG_METS_NAME             = '{%s}name' % NAMESPACES['mets']
TAG_METS_NOTE             = '{%s}note' % NAMESPACES['mets']
TAG_METS_STRUCTMAP        = '{%s}structMap' % NAMESPACES['mets']

# MODS elements
TAG_MODS_IDENTIFIER       = '{%s}identifier' % NAMESPACES['mods']

# PAGE elements
TAG_PAGE_ALTERNATIVEIMAGE = '{%s}AlternativeImage' % NAMESPACES['page']
TAG_PAGE_COORDS           = '{%s}Coords' % NAMESPACES['page']
TAG_PAGE_READINGORDER     = '{%s}ReadingOrder' % NAMESPACES['page']
TAG_PAGE_REGIONREFINDEXED = '{%s}RegionRefIndexed' % NAMESPACES['page']
TAG_PAGE_TEXTLINE         = '{%s}TextLine' % NAMESPACES['page']
TAG_PAGE_TEXTEQUIV        = '{%s}TextEquiv' % NAMESPACES['page']
TAG_PAGE_TEXTREGION       = '{%s}TextRegion' % NAMESPACES['page']
78
79
# Region type names.
# NOTE(review): presumably each is the part preceding "Region" in a PAGE
# element name (cf. 'Text' and TAG_PAGE_TEXTREGION) -- confirm against callers.
PAGE_REGION_TYPES = [
    'Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image',
    'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
    'Separator', 'Table', 'Text', 'Unknown'
]

# Feature names for alternative images.
# NOTE(review): presumably the vocabulary describing which processing steps
# an AlternativeImage has undergone -- confirm against callers.
PAGE_ALTIMG_FEATURES = [
    'binarized',
    'grayscale_normalized',
    'despeckled',
    'cropped',
    'deskewed',
    'rotated-90',
    'rotated-180',
    'rotated-270',
    'dewarped',
    'clipped',
]
97
98
99
class METS_PAGE_DIV_ATTRIBUTE(Enum):
    """page selection attributes of PHYSICAL mets:structMap//mets:div"""
    ID = auto()
    ORDER = auto()
    ORDERLABEL = auto()
    LABEL = auto()
    CONTENTIDS = auto()

    @classmethod
    def names(cls) -> List[str]:
        """names of all attribute types, in definition order"""
        return [member.name for member in cls]

    @classmethod
    def type_prefix(cls) -> str:
        """disambiguation prefix to use for all subtypes"""
        return "physical:"

    def prefix(self) -> str:
        """disambiguation prefix to use for this attribute type

        Composed of the type prefix, the lower-cased member name and a colon,
        e.g. ``physical:orderlabel:`` for ``ORDERLABEL``.
        """
        return f"{self.type_prefix()}{self.name.lower()}:"
117
118
class METS_STRUCT_DIV_ATTRIBUTE(Enum):
    """page selection attributes of LOGICAL mets:structMap//mets:div"""
    ID = auto()
    DMDID = auto()
    TYPE = auto()
    LABEL = auto()

    @classmethod
    def names(cls) -> List[str]:
        """names of all attribute types, in definition order"""
        return [member.name for member in cls]

    @classmethod
    def type_prefix(cls) -> str:
        """disambiguation prefix to use for all subtypes"""
        return "logical:"

    def prefix(self) -> str:
        """disambiguation prefix to use for this attribute type

        Composed of the type prefix, the lower-cased member name and a colon,
        e.g. ``logical:dmdid:`` for ``DMDID``.
        """
        return f"{self.type_prefix()}{self.name.lower()}:"
135
136
@dataclass
class METS_DIV_ATTRIBUTE_PATTERN(ABC):
    """page selection pattern (abstract supertype)

    Subclasses implement :py:meth:`_matches`; callers use :py:meth:`matches`,
    which additionally records a successful match in :py:attr:`has_matched`.
    """

    expr: Any
    """pattern value to match a mets:div against"""
    attr: List[Union[METS_PAGE_DIV_ATTRIBUTE, METS_STRUCT_DIV_ATTRIBUTE]] = field(
        default_factory=lambda: list(METS_PAGE_DIV_ATTRIBUTE) + list(METS_STRUCT_DIV_ATTRIBUTE))
    """attribute type(s) to match a mets:div for
    (pre-disambiguated with prefix syntax, or filled upon first match)
    """
    has_matched: bool = field(init=False, default=False)
    """whether this pattern has already been matched"""

    def attr_prefix(self) -> str:
        """attribute type disambiguation prefix corresponding to the current state of disambiguation

        Returns the empty string while ``attr`` still holds all physical plus
        all logical attribute types, the type prefix when it holds exactly all
        physical or exactly all logical ones, and otherwise the prefix of the
        single remaining attribute type.
        """
        physical = list(METS_PAGE_DIV_ATTRIBUTE)
        logical = list(METS_STRUCT_DIV_ATTRIBUTE)
        # list equality is order-sensitive, so only the canonical orderings count
        if self.attr == physical + logical:
            return ""
        if self.attr == physical:
            return METS_PAGE_DIV_ATTRIBUTE.type_prefix()
        if self.attr == logical:
            return METS_STRUCT_DIV_ATTRIBUTE.type_prefix()
        # any other state must be fully disambiguated to one attribute type
        assert len(self.attr) == 1, "unexpected type ambiguity: %s" % repr(self.attr)
        return self.attr[0].prefix()

    @abstractmethod
    def _matches(self, input) -> bool:
        """subtype-specific test of the attribute value against ``expr``"""
        return

    def matches(self, input) -> bool:
        """does the selection pattern match on the given attribute value?"""
        matched = self._matches(input)
        if matched:
            # remember that this pattern was hit at least once
            self.has_matched = True
        return matched
169
170
@dataclass
class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
    """page selection pattern for literal (single value) matching"""

    expr: str

    def __repr__(self):
        """attribute prefix (if any) followed by the literal value"""
        return "%s%s" % (self.attr_prefix(), self.expr)

    def _matches(self, input):
        """true iff the attribute value equals the literal value"""
        return input == self.expr
179
180
@dataclass
class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
    """page selection pattern for interval (list expansion) matching"""

    expr: List[str]
    start: str = field(init=False)
    """first value of the range after expansion, before matching-exhausting"""
    stop: str = field(init=False)
    """last value of the range after expansion, before matching-exhausting"""

    def __post_init__(self):
        """capture the first and last value of the expanded range

        Raises ``IndexError`` if ``expr`` is empty.
        """
        self.start = self.expr[0]
        self.stop = self.expr[-1]

    def __repr__(self):
        """attribute prefix (if any) followed by ``start..stop``"""
        return "%s%s..%s" % (self.attr_prefix(), self.start, self.stop)

    def _matches(self, input):
        """true iff the attribute value is among the expanded range values"""
        return input in self.expr
196
197
@dataclass
class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
    """page selection pattern for regular expression matching"""

    expr: Pattern

    def __repr__(self):
        """attribute prefix (if any) followed by ``//`` and the regex source"""
        return "%s//%s" % (self.attr_prefix(), self.expr.pattern)

    def _matches(self, input):
        """true iff the regex matches the entire attribute value"""
        return bool(self.expr.fullmatch(input))
206