Passed
Push — master ( 604178...18ecf8 )
by Konstantin
03:39
created

ocrd_utils.str.make_xml_id()   A

Complexity

Conditions 2

Size

Total Lines 10
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 10
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""
2
Utility functions for strings, paths and URLs.
3
"""
4
5
import re
6
import json
7
from typing import List, Union
8
from .constants import REGEX_FILE_ID, SPARKLINE_CHARS
9
from .deprecate import deprecation_warning
10
from warnings import warn
11
from numpy import array_split
12
13
# Names exported by ``from <this module> import *``. Every public helper
# defined in this module is listed, including ``generate_range`` and
# ``sparkline``.
__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'get_local_filename',
    'is_local_filename',
    'partition_list',
    'is_string',
    'make_file_id',
    'make_xml_id',
    'nth_url_segment',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'remove_non_path_from_url',
    'safe_filename',
    'generate_range',
    'sparkline',
]
28
29
30
def assert_file_grp_cardinality(grps, n, msg=None):
    """
    Check that ``grps`` names exactly ``n`` file groups.

    Arguments:
        grps (string|list): comma-separated fileGrp names, or an already split sequence
        n (integer): required number of entries
        msg (string): optional hint that is added in parentheses to the failure message

    Raises:
        AssertionError: if the number of entries differs from ``n``
    """
    groups = grps.split(',') if isinstance(grps, str) else grps
    found = len(groups)
    # The message is only built when the assertion actually fails.
    assert found == n, (
        "Expected exactly %d output file group%s%s, but '%s' has %d"
        % (n, '' if n == 1 else 's', ' (%s)' % msg if msg else '', groups, found)
    )
44
45
def concat_padded(base, *args):
    """
    Append every item of ``args`` to ``base``, each preceded by an underscore.

    String items are appended verbatim; any other item is formatted as an
    integer, zero-padded to at least 4 digits. Without ``args``, ``base`` is
    returned unchanged.
    """
    result = base
    for item in args:
        suffix = item if is_string(item) else "%04i" % item
        result = "%s_%s" % (result, suffix)
    return result
56
57
def remove_non_path_from_url(url):
    """
    Reduce a URL to the part up to and including its path.

    Cuts off the query (from the first ``?``), then the fragment identifier
    (from the first ``#``) and finally strips trailing slashes.
    """
    without_query, _, _ = url.partition('?')
    without_fragment, _, _ = without_query.partition('#')
    return re.sub(r"/+$", "", without_fragment)
65
66
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive the ID of an output file from the input file ``ocrd_file`` and the
    name of the output file's ``fileGrp/@USE``, ``output_file_grp``.

    - If ``ocrd_file.ID`` contains the input file's fileGrp name, that name is
      replaced by ``output_file_grp``.
    - Else, if ``ocrd_file`` has a ``pageId`` which is not contained in
      ``ocrd_file.ID``, ``output_file_grp`` and ``ocrd_file.pageId`` are concatenated.
    - Otherwise ``output_file_grp`` and ``ocrd_file.ID`` are concatenated.

    The result is passed through ``make_xml_id``.

    Note: ``make_file_id`` cannot guarantee that the new ID is unique within an actual
    :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
    The caller is responsible for ensuring uniqueness of files to be added.
    Ultimately, ID conflicts will lead to :py:meth:`ocrd_models.ocrd_mets.OcrdMets.add_file`
    raising an exception.
    This can be avoided if all processors use ``make_file_id`` consistently for ID generation.

    Note: ``make_file_id`` generates page-specific IDs. For IDs representing page segments
    or ``pc:AlternativeImage`` files, the output of ``make_file_id`` may need to be concatenated
    with a unique string for that sub-page element, such as `".IMG"` or the segment ID.
    """
    # Design goals of this scheme:
    # - uniqueness (in spite of different METS and processor conventions)
    # - predictability (i.e. output name can be anticipated from the input name)
    # - stability (i.e. output at least as much sorted and consistent as the input)
    # ... and all this in spite of --page-id selection and --overwrite
    # (i.e. --overwrite should target the existing ID, and input vs output
    #  IDs should be different, except when overwriting the input fileGrp)
    input_id = ocrd_file.ID
    input_grp = ocrd_file.fileGrp
    new_id = input_id.replace(input_grp, output_file_grp)
    substitution_had_no_effect = new_id == input_id
    if substitution_had_no_effect and output_file_grp != input_grp:
        page_id = ocrd_file.pageId
        # prefer the page ID, unless it is missing or already part of the input ID
        stem = page_id if page_id and page_id not in input_id else input_id
        new_id = output_file_grp + '_' + stem
    return make_xml_id(new_id)
100
101
def make_xml_id(idstr: str) -> str:
    """
    Turn ``idstr`` into a valid ``xml:id`` literal.

    Strings fully matching ``REGEX_FILE_ID`` are returned as they are.
    Otherwise every ``:`` is replaced with ``_``, ``id_`` is prepended if the
    first character is neither an ASCII letter nor ``_``, and finally all
    characters other than word characters, ``.`` and ``-`` are removed.
    """
    if REGEX_FILE_ID.fullmatch(idstr):
        return idstr
    sanitized = idstr.replace(':', '_')
    sanitized = re.sub(r'^([^a-zA-Z_])', r'id_\1', sanitized)
    return re.sub(r'[^\w.-]', r'', sanitized)
111
    
112
def nth_url_segment(url, n=-1):
    """
    Return the ``n``-th /-delimited segment of a URL-like string, by default the last one.

    Query, fragment identifier and trailing slashes are removed (via
    ``remove_non_path_from_url``) before splitting. If there is no segment
    with index ``n``, the empty string is returned.

    Arguments:
        url (string):
        n (integer): index of segment, default: -1
    """
    parts = remove_non_path_from_url(url).split('/')
    try:
        segment = parts[n]
    except IndexError:
        segment = ''
    return segment
125
126
def get_local_filename(url, start=None):
    """
    Return local filename, optionally relative to ``start``

    A leading ``file://`` is removed. For the ``file:/`` form only ``file:`` is
    removed, so that the remaining path stays absolute.

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. It must be a
            complete leading path component of the filename, i.e.
            ``/foo/bar`` is a valid base of ``/foo/bar/baz`` but not of
            ``/foo/barbaz``. A trailing slash is optional.

    Raises:
        ValueError: if ``url`` is an http(s) URL, or if ``start`` is not a
            path prefix of the filename
    """
    if url.startswith(('https://', 'http:')):
        raise ValueError("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces those, they are always absolute
    if url.startswith('file:/'):
        url = url[len('file:'):]
    if start:
        if url == start:
            # the filename is the base path itself, nothing remains
            return ''
        # Compare against the slash-terminated base, so that only whole path
        # components match ('/foo/bar' must not match '/foo/barbaz').
        prefix = start if start.endswith('/') else start + '/'
        if not url.startswith(prefix):
            raise ValueError("Cannot remove prefix %s from url %s" % (start, url))
        url = url[len(prefix):]
    return url
148
149
def is_local_filename(url):
    """
    Tell whether ``url`` denotes a local file.

    That is the case if it contains no ``://`` at all, or if it starts with ``file://``.
    """
    # deprecation_warning("Deprecated so we spot inconsistent URL/file handling")
    if '://' not in url:
        return True
    return url.startswith('file://')
155
156
def is_string(val) -> bool:
    """
    Tell whether ``val`` is an instance of ``str`` (``bytes`` do not count).
    """
    result = isinstance(val, str)
    return result
161
162
163
def parse_json_string_with_comments(val):
    """
    Parse JSON text that may contain full-line comments.

    Every line whose first non-whitespace character is ``#`` is blanked out
    before the remainder is handed to ``json.loads``.
    """
    comment_line = re.compile(r'^\s*#.*$', flags=re.MULTILINE)
    return json.loads(comment_line.sub('', val))
169
170
def parse_json_string_or_file(*values, resolve_preset_file=None):
    """
    Parse each string as either the path to a JSON object or a literal JSON object.

    Every value is first tried as the path of a file (read as UTF-8); if
    that file cannot be opened, the value itself is parsed as JSON. Both
    variants may contain #-prefixed full-line comments. The resulting objects
    are merged in the order given, so keys of later values override those of
    earlier ones.

    Empty strings are equivalent to '{}'

    Arguments:
        *values (string): paths of JSON files or literal JSON objects

    Keyword Args:
        resolve_preset_file (callable): if callable, it is called with the
            value and its result, unless falsy, is used as the path to open

    Returns:
        dict: the merged JSON objects

    Raises:
        ValueError: if a value is not valid JSON or is not a JSON object
    """
    ret = {}
    for value in values:
        if re.fullmatch(r"\s*", value):
            continue
        try:
            try:
                path = value
                if callable(resolve_preset_file):
                    path = resolve_preset_file(value) or value
                # JSON text is UTF-8, do not depend on the locale's default encoding
                with open(path, 'r', encoding='utf-8') as f:
                    value_parsed = parse_json_string_with_comments(f.read())
            except OSError:
                # not a readable file (this includes FileNotFoundError),
                # so treat the value itself as JSON
                value_parsed = parse_json_string_with_comments(value.strip())
        except json.JSONDecodeError as e:
            raise ValueError("Error parsing '%s': %s" % (value, e)) from e
        if not isinstance(value_parsed, dict):
            raise ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, value_parsed))
        ret.update(value_parsed)
    return ret
199
200
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file.

    Every run of characters that are not word characters (``\\w``) is replaced
    with a single underscore. This includes dots, slashes and whitespace.

    Arguments:
        url (string): arbitrary string, e.g. a URL

    Returns:
        string: the sanitized name
    """
    # NOTE: the dot-normalising substitutions that used to follow here
    # (stripping leading dots, collapsing runs of dots) could never match,
    # because no dot survives this replacement.
    return re.sub(r'[^\w]+', '_', url)
209
210
def generate_range(start : str, end : str) -> List[str]:
    """
    Generate a list of strings by incrementing the number part of ``start`` until including ``end``.

    The number part is the last run of digits in each string. Generated
    numbers are zero-padded to the width of the number part of ``start``;
    everything else in ``start`` is kept as it is.

    Arguments:
        start (string): first element of the range
        end (string): last element of the range

    Returns:
        list of strings (empty if the number in ``end`` is smaller than the one in ``start``)

    Raises:
        ValueError: if either string contains no digits, or if the parts
            preceding the numbers differ
    """
    try:
        start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1]
    except IndexError:
        raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) from None
    start_prefix, end_prefix = start[:-len(start_num)], end[:-len(end_num)]
    if start_prefix != end_prefix:
        raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: '{start_prefix}' != '{end_prefix}'")
    if start_num == end_num:
        warn("Range '%s..%s': evaluates to the same number" % (start, end))
    # Substitute only the last run of digits. No digits follow that run, so
    # the right-most occurrence of ``start_num`` is exactly that run; earlier
    # occurrences of the same digits (as in 'p1_1') must stay untouched.
    num_pos = start.rfind(start_num)
    width = len(start_num)
    head, tail = start[:num_pos], start[num_pos + width:]
    return [head + str(i).zfill(width) + tail
            for i in range(int(start_num), int(end_num) + 1)]
226
227
228
def partition_list(lst, chunks, chunk_index=None):
    """
    Partition a list into roughly equally-sized chunks

    The chunks are consecutive slices of ``lst``. Their sizes differ by at
    most one, with the larger chunks coming first. The elements themselves
    are passed through untouched.

    Args:
        lst (list): list to partition
        chunks (int): number of chunks to generate (not per chunk!)

    Keyword Args:
        chunk_index (None|int): If provided, return only a list consisting of this chunk

    Returns:
        list(list())

    Raises:
        ValueError: if ``chunks`` is larger than the list size or not positive
        IndexError: if ``chunk_index`` is out of range
    """
    if not lst:
        return []
    # Empty chunks are problematic in the ocr-d scope
    if chunks > len(lst):
        raise ValueError("Amount of chunks bigger than list size")
    chunks = int(chunks)
    if chunks <= 0:
        raise ValueError("Amount of chunks must be larger than 0")
    # Plain slicing instead of a round-trip through a numpy array, which would
    # coerce heterogeneous elements to a common type (e.g. [1, 'a'] to strings).
    base_size, extras = divmod(len(lst), chunks)
    ret = []
    pos = 0
    for idx in range(chunks):
        # the first ``extras`` chunks take one of the leftover elements each
        size = base_size + 1 if idx < extras else base_size
        ret.append(list(lst[pos:pos + size]))
        pos += size
    if chunk_index is not None:
        return [ret[chunk_index]]
    return ret
252
253
def sparkline(values : List[int]) -> str:
    """
    Render a list of points with block characters

    Each value is scaled relative to the maximum of ``values`` and mapped to
    one entry of ``SPARKLINE_CHARS``.

    Arguments:
        values (list): non-negative numbers

    Returns:
        string: one character per value, or the empty string if ``values`` is
        empty or contains anything but non-negative numbers
    """
    values = list(values)
    if not values:
        # nothing to render (and max() would raise on an empty sequence)
        return ''
    if any(x is None or not isinstance(x, (int, float)) or x < 0 for x in values):
        # return an empty string on negative or non-numeric values, better not
        # to output a sparkline than to cancel execution due to problematic input
        return ''
    max_value = max(values)
    if max_value == 0:
        # all values are zero: avoid dividing by zero, use the lowest level throughout
        return ''.join(SPARKLINE_CHARS[0] for _ in values)
    max_mapping = len(SPARKLINE_CHARS) - 1
    # normalize to 0..1 and convert to index in SPARKLINE_CHARS
    return ''.join(SPARKLINE_CHARS[int(x / max_value * max_mapping)] for x in values)
266
267