1
|
|
|
""" |
2
|
|
|
Constants for ocrd_models. |
3
|
|
|
""" |
4
|
|
|
from re import Pattern |
5
|
|
|
from enum import Enum, auto |
6
|
|
|
from dataclasses import dataclass, field |
7
|
|
|
from abc import ABC, abstractmethod |
8
|
|
|
from typing import Any, List, Optional, Union |
9
|
|
|
from ocrd_utils import resource_string |
10
|
|
|
|
11
|
|
|
# Public names of this module, i.e. what ``from <module> import *`` exports.
# Every entry is defined further down in this file.
__all__ = [
    'IDENTIFIER_PRIORITY',
    'METS_XML_EMPTY',
    'NAMESPACES',
    'TAG_METS_AGENT',
    'TAG_METS_DIV',
    'TAG_METS_FILE',
    'TAG_METS_FILEGRP',
    'TAG_METS_FILESEC',
    'TAG_METS_FPTR',
    'TAG_METS_FLOCAT',
    'TAG_METS_METSHDR',
    'TAG_METS_NAME',
    'TAG_METS_NOTE',
    'TAG_METS_STRUCTMAP',
    'TAG_MODS_IDENTIFIER',
    'TAG_PAGE_ALTERNATIVEIMAGE',
    'TAG_PAGE_COORDS',
    'TAG_PAGE_READINGORDER',
    'TAG_PAGE_REGIONREFINDEXED',
    'TAG_PAGE_TEXTLINE',
    'TAG_PAGE_TEXTEQUIV',
    'TAG_PAGE_TEXTREGION',
    'METS_PAGE_DIV_ATTRIBUTE',
    'METS_STRUCT_DIV_ATTRIBUTE',
    'METS_DIV_ATTRIBUTE_ATOM_PATTERN',
    'METS_DIV_ATTRIBUTE_RANGE_PATTERN',
    'METS_DIV_ATTRIBUTE_REGEX_PATTERN',
    'PAGE_REGION_TYPES',
    'PAGE_ALTIMG_FEATURES',
]
42
|
|
|
|
43
|
|
|
|
44
|
|
|
# Identifier type names, listed by priority.
# NOTE(review): presumably the order of preference when several
# mods:identifier elements are present -- confirm against the callers.
IDENTIFIER_PRIORITY = ['purl', 'urn', 'doi', 'url']

# Content of the resource file 'mets-empty.xml' shipped with this package,
# loaded once at import time via ocrd_utils.resource_string.
METS_XML_EMPTY = resource_string(__package__, 'mets-empty.xml')

# XML namespace URIs, keyed by their short prefix.
NAMESPACES = {
    'mets': "http://www.loc.gov/METS/",
    'mods': "http://www.loc.gov/mods/v3",
    'xlink': "http://www.w3.org/1999/xlink",
    'page': "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15",
    'xsl': 'http://www.w3.org/1999/XSL/Transform#',
    'ocrd': 'https://ocr-d.de',
}

# Namespace-qualified element names in '{namespace-uri}localname' form,
# built from the 'mets' entry of NAMESPACES.
TAG_METS_AGENT = '{%s}agent' % NAMESPACES['mets']
TAG_METS_DIV = '{%s}div' % NAMESPACES['mets']
TAG_METS_FILE = '{%s}file' % NAMESPACES['mets']
TAG_METS_FILEGRP = '{%s}fileGrp' % NAMESPACES['mets']
TAG_METS_FILESEC = '{%s}fileSec' % NAMESPACES['mets']
TAG_METS_FPTR = '{%s}fptr' % NAMESPACES['mets']
TAG_METS_FLOCAT = '{%s}FLocat' % NAMESPACES['mets']
TAG_METS_METSHDR = '{%s}metsHdr' % NAMESPACES['mets']
TAG_METS_NAME = '{%s}name' % NAMESPACES['mets']
TAG_METS_NOTE = '{%s}note' % NAMESPACES['mets']
TAG_METS_STRUCTMAP = '{%s}structMap' % NAMESPACES['mets']

# Same form, built from the 'mods' entry of NAMESPACES.
TAG_MODS_IDENTIFIER = '{%s}identifier' % NAMESPACES['mods']

# Same form, built from the 'page' entry of NAMESPACES.
TAG_PAGE_ALTERNATIVEIMAGE = '{%s}AlternativeImage' % NAMESPACES['page']
TAG_PAGE_COORDS = '{%s}Coords' % NAMESPACES['page']
TAG_PAGE_READINGORDER = '{%s}ReadingOrder' % NAMESPACES['page']
TAG_PAGE_REGIONREFINDEXED = '{%s}RegionRefIndexed' % NAMESPACES['page']
TAG_PAGE_TEXTLINE = '{%s}TextLine' % NAMESPACES['page']
TAG_PAGE_TEXTEQUIV = '{%s}TextEquiv' % NAMESPACES['page']
TAG_PAGE_TEXTREGION = '{%s}TextRegion' % NAMESPACES['page']

# Region type names.
# NOTE(review): presumably each name plus the suffix 'Region' is a PAGE
# element name (cf. TAG_PAGE_TEXTREGION) -- confirm against the callers.
PAGE_REGION_TYPES = [
    'Advert', 'Chart', 'Chem', 'Custom', 'Graphic', 'Image',
    'LineDrawing', 'Map', 'Maths', 'Music', 'Noise',
    'Separator', 'Table', 'Text', 'Unknown'
]

# Feature names for alternative images.
# NOTE(review): presumably the keywords used to describe how a derived
# AlternativeImage was produced -- confirm against the callers.
PAGE_ALTIMG_FEATURES = [
    'binarized',
    'grayscale_normalized',
    'despeckled',
    'cropped',
    'deskewed',
    'rotated-90',
    'rotated-180',
    'rotated-270',
    'dewarped',
    'clipped',
]
97
|
|
|
|
98
|
|
|
|
99
|
|
|
class METS_PAGE_DIV_ATTRIBUTE(Enum):
    """Attributes of a mets:div in the PHYSICAL mets:structMap usable for page selection."""
    ID = auto()
    ORDER = auto()
    ORDERLABEL = auto()
    LABEL = auto()
    CONTENTIDS = auto()

    @classmethod
    def names(cls):
        """Return the member names as a list, in definition order."""
        # no aliases are defined, so __members__ holds exactly the members
        return list(cls.__members__)

    @classmethod
    def type_prefix(cls):
        """Return the disambiguation prefix shared by all members of this type."""
        return "physical:"

    def prefix(self):
        """Return the disambiguation prefix of this particular attribute."""
        return f"{self.type_prefix()}{self.name.lower()}:"
117
|
|
|
|
118
|
|
|
class METS_STRUCT_DIV_ATTRIBUTE(Enum):
    """Attributes of a mets:div in the LOGICAL mets:structMap usable for page selection."""
    ID = auto()
    DMDID = auto()
    TYPE = auto()
    LABEL = auto()

    @classmethod
    def names(cls):
        """Return the member names as a list, in definition order."""
        # no aliases are defined, so __members__ holds exactly the members
        return list(cls.__members__)

    @classmethod
    def type_prefix(cls):
        """Return the disambiguation prefix shared by all members of this type."""
        return "logical:"

    def prefix(self):
        """Return the disambiguation prefix of this particular attribute."""
        return f"{self.type_prefix()}{self.name.lower()}:"
135
|
|
|
|
136
|
|
|
@dataclass
class METS_DIV_ATTRIBUTE_PATTERN(ABC):
    """Abstract base type of all page selection patterns."""

    expr: Any
    """value of the pattern that a mets:div is compared with"""
    attr: List[Union[METS_PAGE_DIV_ATTRIBUTE, METS_STRUCT_DIV_ATTRIBUTE]] = field(
        default_factory=lambda: [*METS_PAGE_DIV_ATTRIBUTE, *METS_STRUCT_DIV_ATTRIBUTE])
    """candidate attribute type(s) of the mets:div to compare with
    (pre-disambiguated with prefix syntax, or filled upon first match)
    """
    has_matched: bool = field(init=False, default=False)
    """set once the pattern has matched at least one value"""

    def attr_prefix(self):
        """Return the disambiguation prefix for the current candidate attribute types.

        Gives the empty string while all types of both enums are still
        candidates, the type prefix when exactly one complete enum is left,
        and the prefix of the single attribute otherwise.
        """
        physical = list(METS_PAGE_DIV_ATTRIBUTE)
        logical = list(METS_STRUCT_DIV_ATTRIBUTE)
        if self.attr == physical + logical:
            # nothing has been narrowed down yet
            return ""
        for candidates, attr_type in ((physical, METS_PAGE_DIV_ATTRIBUTE),
                                      (logical, METS_STRUCT_DIV_ATTRIBUTE)):
            if self.attr == candidates:
                return attr_type.type_prefix()
        assert len(self.attr) == 1, "unexpected type ambiguity: %s" % repr(self.attr)
        return self.attr[0].prefix()

    @abstractmethod
    def _matches(self, input) -> bool:
        """Subtype-specific comparison; every concrete pattern overrides this."""
        return None

    def matches(self, input) -> bool:
        """Tell whether the pattern matches the given attribute value.

        A successful match is recorded in ``has_matched``.
        """
        matched = self._matches(input)
        if matched:
            self.has_matched = True
        return matched
169
|
|
|
|
170
|
|
|
@dataclass
class METS_DIV_ATTRIBUTE_ATOM_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
    """Page selection pattern that matches one literal value."""

    expr: str

    def __repr__(self):
        """Render as the disambiguation prefix followed by the literal."""
        return f"{self.attr_prefix()}{self.expr}"

    def _matches(self, input):
        """Tell whether ``input`` equals the literal."""
        return input == self.expr
179
|
|
|
|
180
|
|
|
@dataclass
class METS_DIV_ATTRIBUTE_RANGE_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
    """Page selection pattern that matches any value of an expanded interval."""

    expr: List[str]
    start: str = field(init=False)
    """first value of the range after expansion, before matching-exhausting"""
    stop: str = field(init=False)
    """last value of the range after expansion, before matching-exhausting"""

    def __post_init__(self):
        """Remember both ends of the expanded list for display purposes."""
        self.start, self.stop = self.expr[0], self.expr[-1]

    def __repr__(self):
        """Render as the disambiguation prefix followed by ``start..stop``."""
        return f"{self.attr_prefix()}{self.start}..{self.stop}"

    def _matches(self, input):
        """Tell whether ``input`` is contained in the expanded list."""
        return input in self.expr
196
|
|
|
|
197
|
|
|
@dataclass
class METS_DIV_ATTRIBUTE_REGEX_PATTERN(METS_DIV_ATTRIBUTE_PATTERN):
    """Page selection pattern that matches by regular expression."""

    expr: Pattern

    def __repr__(self):
        """Render as the disambiguation prefix, ``//`` and the regex source."""
        return f"{self.attr_prefix()}//{self.expr.pattern}"

    def _matches(self, input):
        """Tell whether the regex matches ``input`` in its entirety."""
        # fullmatch yields a match object on success and None otherwise
        return self.expr.fullmatch(input) is not None
206
|
|
|
|