"""
Utility functions for strings, paths and URLs.
"""
import re
import json
from typing import List
from warnings import warn

from numpy import array_split

from .constants import REGEX_FILE_ID, SPARKLINE_CHARS
from .deprecate import deprecation_warning

__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'generate_range',
    'get_local_filename',
    'is_local_filename',
    'is_string',
    'make_file_id',
    'make_xml_id',
    'nth_url_segment',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'partition_list',
    'remove_non_path_from_url',
    'safe_filename',
    'sparkline',
]

def assert_file_grp_cardinality(grps, n, msg=None):
    """
    Assert that a string of comma-separated fileGrps contains exactly ``n`` entries.
    """
    if isinstance(grps, str):
        grps = grps.split(',')
    assert len(grps) == n, \
        "Expected exactly %d output file group%s%s, but '%s' has %d" % (
            n,
            '' if n == 1 else 's',
            ' (%s)' % msg if msg else '',
            grps,
            len(grps)
        )
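
# Illustration (hypothetical fileGrp names, not from the original module):
#   assert_file_grp_cardinality('OCR-D-IMG,OCR-D-SEG', 2)  # passes
#   assert_file_grp_cardinality('OCR-D-IMG', 2)            # raises AssertionError
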
def concat_padded(base, *args):
    """
    Concatenate a string with any number of arguments, zero-padding
    each numeric argument to 4 digits.
    """
    ret = base
    for n in args:
        if is_string(n):
            ret = "%s_%s" % (ret, n)
        else:
            ret = "%s_%04i" % (ret, n)
    return ret
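
# Illustration (hypothetical values): numbers are zero-padded, strings kept as-is:
#   concat_padded('FILE', 1)       -> 'FILE_0001'
#   concat_padded('FILE', 2, 'b')  -> 'FILE_0002_b'
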
def remove_non_path_from_url(url):
    """
    Remove everything from a URL after the path: the query, the fragment
    identifier, and any trailing slashes.
    """
    url = url.split('?', 1)[0]    # query
    url = url.split('#', 1)[0]    # fragment identifier
    url = re.sub(r"/+$", "", url) # trailing slashes
    return url
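
# Illustration (hypothetical URL):
#   remove_non_path_from_url('http://host/path/?q=1#frag')  -> 'http://host/path'
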
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file ``ocrd_file``
    and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
    If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
    Else, if ``ocrd_file`` has a ``pageId`` that is not contained in ``ocrd_file.ID``, then
    concatenate ``output_file_grp`` and ``ocrd_file.pageId``.
    Otherwise, concatenate ``output_file_grp`` with ``ocrd_file.ID``.

    Note: ``make_file_id`` cannot guarantee that the new ID is unique within an actual
    :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
    The caller is responsible for ensuring uniqueness of files to be added.
    Ultimately, ID conflicts will lead to :py:meth:`ocrd_models.ocrd_mets.OcrdMets.add_file`
    raising an exception.
    This can be avoided if all processors use ``make_file_id`` consistently for ID generation.

    Note: ``make_file_id`` generates page-specific IDs. For IDs representing page segments
    or ``pc:AlternativeImage`` files, the output of ``make_file_id`` may need to be concatenated
    with a unique string for that sub-page element, such as ``".IMG"`` or the segment ID.
    """
    # considerations for this behaviour:
    # - uniqueness (in spite of different METS and processor conventions)
    # - predictability (i.e. output name can be anticipated from the input name)
    # - stability (i.e. output at least as much sorted and consistent as the input)
    # ... and all this in spite of --page-id selection and --overwrite
    #     (i.e. --overwrite should target the existing ID, and input vs output
    #      IDs should be different, except when overwriting the input fileGrp)
    ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    if ret == ocrd_file.ID and output_file_grp != ocrd_file.fileGrp:
        if ocrd_file.pageId and ocrd_file.pageId not in ocrd_file.ID:
            ret = output_file_grp + '_' + ocrd_file.pageId
        else:
            ret = output_file_grp + '_' + ocrd_file.ID
    return make_xml_id(ret)
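
# Illustration (hypothetical OcrdFile attribute values):
#   ID='OCR-D-IMG_0001', fileGrp='OCR-D-IMG', output_file_grp='OCR-D-BIN'
#     -> 'OCR-D-BIN_0001' (fileGrp name substituted within the ID)
#   ID='img1', fileGrp='OCR-D-IMG', pageId='phys0001', output_file_grp='OCR-D-BIN'
#     -> 'OCR-D-BIN_phys0001' (pageId appended, since the ID contains neither)
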
def make_xml_id(idstr: str) -> str:
    """
    Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``,
    prepending ``id_`` if it does not start with a letter or ``_``, and removing
    every character that is not alphanumeric, ``_``, ``.`` or ``-``.
    """
    ret = idstr
    if not REGEX_FILE_ID.fullmatch(ret):
        ret = ret.replace(':', '_')
        ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
        ret = re.sub(r'[^\w.-]', r'', ret)
    return ret
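
# Illustration (assuming REGEX_FILE_ID only matches already-valid xml:id strings):
#   make_xml_id('FOO:BAR 1')  -> 'FOO_BAR1'
#   make_xml_id('1st')        -> 'id_1st'
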
def nth_url_segment(url, n=-1):
    """
    Return the n-th /-delimited segment of a URL-like string (the last by default).

    Arguments:
        url (string): URL-like string to split into segments
        n (integer): index of the segment, default: -1 (i.e. the last)
    """
    segments = remove_non_path_from_url(url).split('/')
    try:
        return segments[n]
    except IndexError:
        return ''
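
# Illustration (hypothetical URL):
#   nth_url_segment('http://host/a/b')    -> 'b'
#   nth_url_segment('http://host/a/b', 2) -> 'host'
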
def get_local_filename(url, start=None):
    """
    Return a local filename, optionally relative to ``start``.

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. Raise an exception if not a prefix of ``url``.
    """
    if url.startswith(('http:', 'https:')):
        raise ValueError("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces file:/ URLs; they are always absolute
    if url.startswith('file:/'):
        url = url[len('file:'):]
    if start:
        if not url.startswith(start):
            raise ValueError("Cannot remove prefix %s from url %s" % (start, url))
        if not start.endswith('/'):
            start += '/'
        url = url[len(start):]
    return url
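
# Illustration (hypothetical paths):
#   get_local_filename('file:///tmp/foo.txt')         -> '/tmp/foo.txt'
#   get_local_filename('/tmp/foo.txt', start='/tmp')  -> 'foo.txt'
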
def is_local_filename(url):
    """
    Whether ``url`` is a local filename.
    """
    # deprecation_warning("Deprecated so we spot inconsistent URL/file handling")
    return url.startswith('file://') or '://' not in url
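
# Illustration (hypothetical values):
#   is_local_filename('file:///tmp/foo')  -> True
#   is_local_filename('/tmp/foo')         -> True
#   is_local_filename('http://host/foo')  -> False
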
def is_string(val):
    """
    Return whether a value is a ``str``.
    """
    return isinstance(val, str)

def parse_json_string_with_comments(val):
    """
    Parse a string of JSON interspersed with #-prefixed full-line comments.
    """
    jsonstr = re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE)
    return json.loads(jsonstr)
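
# Illustration (hypothetical input):
#   parse_json_string_with_comments('# a comment\n{"dpi": 300}')  -> {'dpi': 300}
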
def parse_json_string_or_file(*values, resolve_preset_file=None):
    """
    Parse each string in ``values`` as either the path to a JSON object or a
    literal JSON object, and merge the results into one ``dict``.

    Empty strings are equivalent to '{}'.
    """
    ret = {}
    for value in values:
        err = None
        value_parsed = None
        if re.fullmatch(r"\s*", value):
            continue
        try:
            try:
                path = value
                if callable(resolve_preset_file):
                    path = resolve_preset_file(value) or value
                with open(path, 'r') as f:
                    value_parsed = parse_json_string_with_comments(f.read())
            except OSError:
                value_parsed = parse_json_string_with_comments(value.strip())
            if not isinstance(value_parsed, dict):
                err = ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, value_parsed))
        except json.decoder.JSONDecodeError as e:
            err = ValueError("Error parsing '%s': %s" % (value, e))
        if err:
            raise err # pylint: disable=raising-bad-type
        ret = {**ret, **value_parsed}
    return ret
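
# Illustration (hypothetical parameters): later keys override earlier ones:
#   parse_json_string_or_file('{"dpi": 300}', '{"dpi": 600, "level": "page"}')
#     -> {'dpi': 600, 'level': 'page'}
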
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file.
    """
    ret = re.sub(r'[^\w]+', '_', url)
    ret = re.sub(r'^\.*', '', ret)
    ret = re.sub(r'\.\.*', '.', ret)
    # print('safe filename: %s -> %s' % (url, ret))
    return ret
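
# Illustration (hypothetical URL): runs of non-word characters collapse to '_':
#   safe_filename('http://host/foo bar.pdf')  -> 'http_host_foo_bar_pdf'
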
def generate_range(start: str, end: str) -> List[str]:
    """
    Generate a list of strings by incrementing the number part of ``start``
    up to and including ``end``.
    """
    ret = []
    try:
        start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1]
    except IndexError:
        raise ValueError("Range '%s..%s': could not find numeric part" % (start, end))
    if start[:-len(start_num)] != end[:-len(end_num)]:
        raise ValueError(f"Range '{start}..{end}': non-numeric parts differ: "
                         f"'{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'")
    if start_num == end_num:
        warn("Range '%s..%s': evaluates to the same number" % (start, end))
    for i in range(int(start_num), int(end_num) + 1):
        # substitute the numeric suffix (not the first occurrence of start_num,
        # which may also appear earlier in the string)
        ret.append(start[:-len(start_num)] + str(i).zfill(len(start_num)))
    return ret
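
# Illustration (hypothetical page IDs):
#   generate_range('PHYS_0001', 'PHYS_0003')
#     -> ['PHYS_0001', 'PHYS_0002', 'PHYS_0003']
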
def partition_list(lst, chunks, chunk_index=None):
    """
    Partition a list into roughly equally-sized chunks.

    Args:
        lst (list): list to partition
        chunks (int): number of chunks to generate (not the size per chunk!)

    Keyword Args:
        chunk_index (None|int): If provided, return only a list consisting of this chunk

    Returns:
        list(list())
    """
    if not lst:
        return []
    # Catch potential empty ranges returned by numpy.array_split,
    # which are problematic in the OCR-D scope
    if chunks > len(lst):
        raise ValueError("Number of chunks greater than list size")
    ret = [x.tolist() for x in array_split(lst, chunks)]
    if chunk_index is not None:
        return [ret[chunk_index]]
    return ret
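
# Illustration (hypothetical values):
#   partition_list([1, 2, 3, 4, 5], 2)                -> [[1, 2, 3], [4, 5]]
#   partition_list([1, 2, 3, 4, 5], 2, chunk_index=1) -> [[4, 5]]
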
def sparkline(values: List[int]) -> str:
    """
    Render a list of points with block characters.
    """
    if any(x is None or not isinstance(x, (int, float)) or x < 0 for x in values):
        # return an empty string on negative or non-numeric values; better not to
        # output a sparkline than to cancel execution due to problematic input
        return ''
    if not values:
        return ''
    max_value = max(values)
    if max_value == 0:
        # an all-zero input would divide by zero below; render nothing
        return ''
    max_mapping = len(SPARKLINE_CHARS) - 1
    # normalize to 0..1 and convert to index in SPARKLINE_CHARS
    mapped = [int(x / max_value * max_mapping) for x in values]
    return ''.join(SPARKLINE_CHARS[x] for x in mapped)
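
# Illustration (output depends on the SPARKLINE_CHARS constant, assumed here to
# be a scale of block characters such as ' ▁▂▃▄▅▆▇█'):
#   sparkline([1, 4, 8])  -> e.g. '▁▄█'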