Passed
Push — master ( a2259f...ef70d1 )
by Konstantin
02:37
created

ocrd_utils.str.safe_filename()   A

Complexity

Conditions 1

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 9
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Utility functions for strings, paths and URL.
3
"""
4
5
import re
6
import json
7
from .constants import REGEX_FILE_ID
8
from .deprecate import deprecation_warning
9
from warnings import warn
10
from math import ceil
11
import sys
12
from itertools import islice
13
14
if sys.version_info >= (3, 12):
    # itertools.batched is available (and implemented in C) from Python 3.12 on
    from itertools import batched
else:
    def batched(iterable, chunk_size):
        """
        Backport of :py:func:`itertools.batched` for Python < 3.12:
        yield successive tuples of at most ``chunk_size`` items from ``iterable``.

        Raises:
            ValueError: if ``chunk_size`` is less than 1, mirroring the
                behaviour of the Python 3.12 original (previously a
                non-positive size silently yielded nothing or raised from
                ``islice`` instead).
        """
        if chunk_size < 1:
            raise ValueError('n must be at least one')
        iterator = iter(iterable)
        while True:
            chunk = tuple(islice(iterator, chunk_size))
            if not chunk:
                return
            yield chunk
25
26
# Public API of this module; 'generate_range' was defined below but missing
# from the export list, so `from ocrd_utils.str import *` did not provide it.
__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'generate_range',
    'get_local_filename',
    'is_local_filename',
    'is_string',
    'make_file_id',
    'nth_url_segment',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'partition_list',
    'remove_non_path_from_url',
    'safe_filename',
]
40
41
42
def assert_file_grp_cardinality(grps, n, msg=None):
    """
    Assert that a string of comma-separated fileGrps contains exactly ``n`` entries.

    Arguments:
        grps (string|list): comma-separated fileGrp string (or an already-split list)
        n (integer): expected number of entries
        msg (string): optional extra context to include in the error message

    Raises:
        AssertionError: if the number of fileGrps differs from ``n``
    """
    if isinstance(grps, str):
        grps = grps.split(',')
    # Raise explicitly instead of using the ``assert`` statement, so the
    # check is not silently stripped when running under ``python -O``.
    # The exception type stays AssertionError for backward compatibility.
    if len(grps) != n:
        raise AssertionError(
            "Expected exactly %d output file group%s%s, but '%s' has %d" % (
                n,
                '' if n == 1 else 's',
                ' (%s)' % msg if msg else '',
                grps,
                len(grps)
            ))
56
57
def concat_padded(base, *args):
    """
    Concatenate string and zero-padded 4 digit number
    """
    # String components are appended verbatim, numeric ones zero-padded
    # to 4 digits; with no args, ``base`` is returned unchanged.
    result = base
    for value in args:
        suffix = value if isinstance(value, str) else '%04i' % value
        result = '%s_%s' % (result, suffix)
    return result
68
69
def remove_non_path_from_url(url):
    """
    Remove everything from URL after path.
    """
    # Drop the query string first, then the fragment identifier,
    # then any trailing slashes.
    for separator in ('?', '#'):
        url, _, _ = url.partition(separator)
    return url.rstrip('/')
77
78
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file ``ocrd_file``
    and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
    If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
    Else if ``ocrd_file`` has a ``pageId`` but it is not contained in the ``ocrd_file.ID``, then
        concatenate ``output_file_grp`` and ``ocrd_file.pageId``.
    Otherwise concatenate ``output_file_grp`` with the ``ocrd_file.ID``.

    Note: ``make_file_id`` cannot guarantee that the new ID is unique within an actual
    :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
    The caller is responsible for ensuring uniqueness of files to be added.
    Ultimately, ID conflicts will lead to :py:meth:`ocrd_models.ocrd_mets.OcrdMets.add_file`
    raising an exception.
    This can be avoided if all processors use ``make_file_id`` consistently for ID generation.

    Note: ``make_file_id`` generates page-specific IDs. For IDs representing page segments
    or ``pc:AlternativeImage`` files, the output of ``make_file_id`` may need to be concatenated
    with a unique string for that sub-page element, such as `".IMG"` or the segment ID.
    """
    # Design goals behind this scheme:
    # - uniqueness (in spite of different METS and processor conventions)
    # - predictability (i.e. output name can be anticipated from the input name)
    # - stability (i.e. output at least as much sorted and consistent as the input)
    # ... all in spite of --page-id selection and --overwrite
    # (i.e. --overwrite should target the existing ID, and input vs output
    #  IDs should be different, except when overwriting the input fileGrp)
    new_id = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    if new_id == ocrd_file.ID and output_file_grp != ocrd_file.fileGrp:
        # the fileGrp name did not occur in the ID; fall back to concatenation
        if ocrd_file.pageId and ocrd_file.pageId not in ocrd_file.ID:
            new_id = '%s_%s' % (output_file_grp, ocrd_file.pageId)
        else:
            new_id = '%s_%s' % (output_file_grp, ocrd_file.ID)
    # sanitize into a valid xml:id if necessary
    if not REGEX_FILE_ID.fullmatch(new_id):
        new_id = new_id.replace(':', '_')
        new_id = re.sub(r'^([^a-zA-Z_])', r'id_\1', new_id)
        new_id = re.sub(r'[^\w.-]', r'', new_id)
    return new_id
116
117
def nth_url_segment(url, n=-1):
    """
    Return the n-th /-delimited segment of a URL-like string

    Arguments:
        url (string):
        n (integer): index of segment, default: -1

    Returns an empty string when ``n`` is out of range.
    """
    segments = remove_non_path_from_url(url).split('/')
    # split() always yields at least one element, so only an out-of-range
    # index needs to be guarded against
    if -len(segments) <= n < len(segments):
        return segments[n]
    return ''
130
131
def get_local_filename(url, start=None):
    """
    Return local filename, optionally relative to ``start``

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. Raise an exception if not a prefix of url

    Raises:
        ValueError: if ``url`` is a http(s) URL, or ``start`` is not a prefix of ``url``
    """
    # Reject both remote schemes consistently: the previous check compared
    # against 'https://' but 'http:', so a malformed 'https:/...' slipped
    # through while 'http:/...' was rejected.
    if url.startswith(('http:', 'https:')):
        raise ValueError("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces 'file:/...' (single slash); those are always absolute
    if url.startswith('file:/'):
        url = url[len('file:'):]
    if start:
        if not url.startswith(start):
            raise ValueError("Cannot remove prefix %s from url %s" % (start, url))
        # make the prefix removal include the path separator
        if not start.endswith('/'):
            start += '/'
        url = url[len(start):]
    return url
153
154
def is_local_filename(url):
    """
    Whether a url is a local filename.
    """
    # deprecation_warning("Deprecated so we spot inconsistent URL/file handling")
    # file:// URLs are local; anything else counts as local unless it
    # carries an explicit scheme separator
    if url.startswith('file://'):
        return True
    return '://' not in url
160
161
def is_string(val):
    """
    Check whether ``val`` is an instance of ``str``.
    """
    return isinstance(val, str)
166
167
168
def parse_json_string_with_comments(val):
    """
    Parse a string of JSON interspersed with #-prefixed full-line comments
    """
    # blank out comment lines (optionally indented, starting with '#')
    # before handing the remainder to the strict JSON parser
    return json.loads(re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE))
174
175
def parse_json_string_or_file(*values):    # pylint: disable=unused-argument
    """
    Parse a string as either the path to a JSON object or a literal JSON object.

    Empty strings are equivalent to '{}'

    Arguments:
        *values (string): any number of file paths or JSON object literals;
            later values override earlier ones key by key

    Returns:
        dict: the merged result of all parsed objects

    Raises:
        ValueError: if a value parses to something other than a JSON object,
            or cannot be parsed at all
    """
    ret = {}
    for value in values:
        err = None
        value_parsed = None
        # blank values contribute nothing (equivalent to '{}')
        if re.fullmatch(r"\s*", value):
            continue
        try:
            try:
                # first try to interpret the value as a file name
                with open(value, 'r') as f:
                    value_parsed = parse_json_string_with_comments(f.read())
            except OSError:
                # not a (readable) file — FileNotFoundError is a subclass of
                # OSError, so a single handler suffices; treat the value as a
                # JSON literal instead
                value_parsed = parse_json_string_with_comments(value.strip())
            if not isinstance(value_parsed, dict):
                err = ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, value_parsed))
        except json.decoder.JSONDecodeError as e:
            err = ValueError("Error parsing '%s': %s" % (value, e))
        # raise outside the try block so the ValueError is not re-caught
        if err:
            raise err       # pylint: disable=raising-bad-type
        ret = {**ret, **value_parsed}
    return ret
201
202
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file.

    Every run of non-word characters (anything except letters, digits and
    underscore — including dots and path separators) is replaced with a
    single underscore.
    """
    # NOTE: the previous implementation additionally stripped leading dots
    # and collapsed dot runs, but both steps were dead code: the substitution
    # below already replaces every '.' (a non-\w character) with '_', so no
    # dots can remain afterwards. Behavior is unchanged.
    return re.sub(r'[^\w]+', '_', url)
211
212
def generate_range(start, end):
    """
    Generate a list of strings by incrementing the number part of ``start`` until including ``end``.

    Arguments:
        start (string): first value of the range, e.g. ``'PHYS_0001'``
        end (string): last value of the range, e.g. ``'PHYS_0005'``

    Returns:
        list of strings from ``start`` up to and including ``end``

    Raises:
        ValueError: if ``start`` or ``end`` contains no numeric part
    """
    ret = []
    try:
        # the *last* run of digits in each string is the counter
        start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1]
    except IndexError:
        # suppress the chained IndexError — it is an implementation detail
        raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) from None
    if start_num == end_num:
        # BUGFIX: the format arguments were missing before, so the warning
        # printed the literal '%s..%s' placeholders
        warn("Range '%s..%s': evaluates to the same number" % (start, end))
    # NOTE: str.replace substitutes *every* occurrence of start_num, so a
    # start value whose prefix repeats the numeric part may be mangled
    for i in range(int(start_num), int(end_num) + 1):
        ret.append(start.replace(start_num, str(i).zfill(len(start_num))))
    return ret
226
227
def partition_list(lst, chunks, chunk_index=None):
    """
    Partition a list into roughly equally-sized chunks

    Args:
        lst (list): list to partition
        chunks (int): number of chunks to generate (not per chunk!)

    Keyword Args:
        chunk_index (None|int): If provided, return only a list consisting of this chunk

    Returns:
        list(list())
    """
    if not lst:
        return []
    # round the chunk size up so that no item is dropped
    # (NOTE: for some inputs this can yield fewer than ``chunks`` chunks,
    #  e.g. 5 items in 4 chunks gives [2, 2, 1] — presumably acceptable
    #  under "roughly equally-sized"; confirm with callers)
    chunk_size = ceil(len(lst) / chunks)
    partitions = [list(batch) for batch in batched(lst, chunk_size)]
    if chunk_index is None:
        return partitions
    return [partitions[chunk_index]]
248