Passed
Pull Request — master (#672)
by Konstantin
02:17
created

ocrd_utils.str.generate_range()   A

Complexity

Conditions 4

Size

Total Lines 12
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 12
rs 9.95
c 0
b 0
f 0
cc 4
nop 2
1
"""
2
Utility functions for strings, paths and URLs.
3
"""
4
5
import re
6
import json
7
from .constants import REGEX_FILE_ID
8
9
# Public API of this module (kept in alphabetical order).
__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'generate_range',
    'get_local_filename',
    'is_local_filename',
    'is_string',
    'make_file_id',
    'nth_url_segment',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'remove_non_path_from_url',
    'safe_filename',
]
22
23
24
def assert_file_grp_cardinality(grps, n, msg=None):
    """
    Assert that a string of comma-separated fileGrps contains exactly ``n`` entries.

    Arguments:
        grps (string or list): comma-separated fileGrp names, or an already split sequence
        n (integer): expected number of fileGrps
        msg (string): optional hint, appended in parentheses to the error message

    Raises:
        AssertionError: if the number of fileGrps differs from ``n``
    """
    if isinstance(grps, str):
        grps = grps.split(',')
    # Raise explicitly instead of using an ``assert`` statement, so the check
    # is not silently stripped when Python runs with ``-O``.
    if len(grps) != n:
        raise AssertionError(
            "Expected exactly %d output file group%s%s, but '%s' has %d" % (
                n,
                '' if n == 1 else 's',
                ' (%s)' % msg if msg else '',
                grps,
                len(grps)
            )
        )
38
39
def concat_padded(base, *args):
    """
    Append every element of ``args`` to ``base``, separated by underscores.

    String arguments are appended verbatim; all other arguments are
    formatted as integers, zero-padded to at least 4 digits.
    Without any ``args``, ``base`` is returned unchanged.
    """
    result = base
    for part in args:
        # strings go in as-is, numbers get the zero-padded integer format
        template = "%s_%s" if isinstance(part, str) else "%s_%04i"
        result = template % (result, part)
    return result
50
51
def remove_non_path_from_url(url):
    """
    Strip the query string, the fragment identifier and any trailing
    slashes from ``url``, in that order.
    """
    before_query = url.partition('?')[0]
    before_fragment = before_query.partition('#')[0]
    return re.sub(r"/+$", "", before_fragment)
59
60
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file ``ocrd_file``
    and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.

    If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
    Otherwise combine ``output_file_grp`` with a counter: the first run of at least
    3 digits in ``ocrd_file``'s pageId if there is one, else the position of ``ocrd_file``
    among the files of the same fileGrp and mimetype. Increment the counter until there
    is no more ID conflict.

    Finally, if the result does not match ``REGEX_FILE_ID``, sanitize it: replace colons
    by underscores, prefix ``id_`` if the first character is neither a letter nor an
    underscore, and drop every character that is not a word character, dot or hyphen.
    """
    file_id = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    if file_id == ocrd_file.ID:
        # Replacing the fileGrp name changed nothing, so fall back to a counter.
        page_number = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '')
        if page_number is not None:
            counter = int(page_number.group(1))
        else:
            sibling_ids = [
                sibling.ID
                for sibling in ocrd_file.mets.find_files(
                    fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)
            ]
            try:
                counter = 1 + sibling_ids.index(ocrd_file.ID)
            except ValueError:
                counter = len(sibling_ids)
        while True:
            file_id = concat_padded(output_file_grp, counter)
            # stop as soon as no file with that ID is found
            if not next(ocrd_file.mets.find_files(ID=file_id), None):
                break
            counter += 1
    if REGEX_FILE_ID.fullmatch(file_id):
        return file_id
    file_id = file_id.replace(':', '_')
    file_id = re.sub(r'^([^a-zA-Z_])', r'id_\1', file_id)
    return re.sub(r'[^\w.-]', r'', file_id)
88
89
def nth_url_segment(url, n=-1):
    """
    Return the ``n``-th /-delimited segment of a URL-like string
    (by default the last one), or the empty string if there is no such segment.

    Query string, fragment identifier and trailing slashes are removed
    before splitting.

    Arguments:
        url (string): URL or path to split
        n (integer): index of segment, default: -1
    """
    path_parts = remove_non_path_from_url(url).split('/')
    try:
        segment = path_parts[n]
    except IndexError:
        segment = ''
    return segment
102
103
def get_local_filename(url, start=None):
    """
    Return local filename, optionally relative to ``start``

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. Raise an exception if not a prefix of url

    Raises:
        Exception: if ``url`` is an http(s) URL, if it is a ``file:/`` URL
            without the double slash, or if ``start`` is not a leading
            path of ``url``
    """
    if url.startswith(('https://', 'http:')):
        raise Exception("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces those, they are always absolute
    if url.startswith('file:/'):
        raise Exception("Invalid (java) URL: %s" % url)
    if start:
        if url == start:
            # nothing is left once the prefix is removed
            return ''
        # Compare against the prefix *including* its trailing slash, so that
        # e.g. start='/foo' is not accepted as a prefix of '/foobar/baz'
        # (which would otherwise yield the truncated result 'ar/baz').
        prefix = start if start.endswith('/') else start + '/'
        if not url.startswith(prefix):
            raise Exception("Cannot remove prefix %s from url %s" % (start, url))
        url = url[len(prefix):]
    return url
125
126
def is_local_filename(url):
    """
    Whether a url is a local filename, i.e. whether it either starts
    with ``file://`` or contains no ``://`` at all.
    """
    if url.startswith('file://'):
        return True
    return '://' not in url
131
132
def is_string(val):
    """
    Tell whether ``val`` is an instance of ``str`` (or of a subclass of it).
    """
    return isinstance(val, str)
137
138
139
def parse_json_string_with_comments(val):
    """
    Parse a JSON string that may contain full-line comments.

    Every line whose first non-whitespace character is ``#`` is blanked
    out before the remainder is handed to the JSON parser.
    """
    comment_line = re.compile(r'^\s*#.*$', re.MULTILINE)
    return json.loads(comment_line.sub('', val))
145
146
def parse_json_string_or_file(*values):
    """
    Parse each string as either the path to a JSON object or a literal JSON object
    and merge the results into a single dict (later values override earlier ones).

    Empty strings are equivalent to '{}'

    Both files and literals may contain #-prefixed full-line comments.

    Arguments:
        *values (string): file paths or JSON literals

    Raises:
        ValueError: if a value cannot be parsed as JSON, or if it parses to
            something other than a JSON object
    """
    ret = {}
    for value in values:
        if re.fullmatch(r"\s*", value):
            continue
        try:
            try:
                # JSON text is UTF-8 by specification; do not depend on the locale
                with open(value, 'r', encoding='utf-8') as f:
                    value_parsed = parse_json_string_with_comments(f.read())
            except OSError:
                # not a readable file (missing, a directory, name too long, ...):
                # interpret the value itself as the JSON literal
                value_parsed = parse_json_string_with_comments(value.strip())
        except json.decoder.JSONDecodeError as e:
            raise ValueError("Error parsing '%s': %s" % (value, e)) from e
        if not isinstance(value_parsed, dict):
            raise ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, value_parsed))
        ret.update(value_parsed)
    return ret
172
173
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file:
    every run of characters other than ASCII letters and digits is
    collapsed into a single dot.
    """
    unsafe_run = re.compile('[^A-Za-z0-9]+')
    return unsafe_run.sub('.', url)
180
181
def generate_range(start, end):
    """
    Generate a list of strings by incrementing the number part of ``start`` until including ``end``.

    The number part is the first run of digits in each string. Only that run
    is substituted in ``start``; the generated numbers are zero-padded to its width.

    Arguments:
        start (string): first element of the range, e.g. ``PHYS_0001``
        end (string): last element of the range, e.g. ``PHYS_0005``

    Returns:
        list of strings; empty if the number in ``end`` is smaller than the one in ``start``

    Raises:
        ValueError: if ``start`` or ``end`` contains no digits
    """
    start_match, end_match = re.search(r'\d+', start), re.search(r'\d+', end)
    if not (start_match and end_match):
        raise ValueError("Unable to generate range %s .. %s, could not detect number part" % (start, end))
    start_num, end_num = start_match.group(0), end_match.group(0)
    # Rebuild from the matched span rather than with str.replace(), which
    # would also rewrite any later occurrence of the same digits in ``start``.
    prefix, suffix = start[:start_match.start()], start[start_match.end():]
    width = len(start_num)
    return [
        prefix + str(i).zfill(width) + suffix
        for i in range(int(start_num), int(end_num) + 1)
    ]
193