Passed
Pull Request — master (#536)
by Konstantin
01:40
created

ocrd_utils.str   A

Complexity

Total Complexity 34

Size/Duplication

Total Lines 171
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 34
eloc 101
dl 0
loc 171
rs 9.68
c 0
b 0
f 0

11 Functions

Rating   Name   Duplication   Size   Complexity  
A assert_file_grp_cardinality() 0 9 3
A is_local_filename() 0 5 1
A remove_non_path_from_url() 0 8 1
A is_string() 0 5 1
A nth_url_segment() 0 13 2
B get_local_filename() 0 22 8
B parse_json_string_or_file() 0 26 8
A safe_filename() 0 7 1
A parse_json_string_with_comments() 0 6 1
A concat_padded() 0 11 3
A make_file_id() 0 24 5
1
"""
2
Utility functions for strings, paths and URLs.
3
"""
4
5
import re
6
import json
7
8
# Public API of this module.
__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'get_local_filename',
    'is_local_filename',
    'is_string',
    'make_file_id',
    'nth_url_segment',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'remove_non_path_from_url',
    'safe_filename',
    # NOTE(review): 'set_json_key_value_overrides' is exported here but is not
    # defined anywhere in this module view — confirm it is defined or
    # re-exported elsewhere, otherwise `from ... import *` will fail.
    'set_json_key_value_overrides',
]
22
23
24
def assert_file_grp_cardinality(grps, n):
    """
    Assert that ``grps`` — a comma-separated string or a list of fileGrp
    names — contains exactly ``n`` entries.

    Raises:
        AssertionError: if the number of entries differs from ``n``.
    """
    groups = grps.split(',') if isinstance(grps, str) else grps
    count = len(groups)
    plural = '' if n == 1 else 's'
    assert count == n, \
        "Expected exactly %d output file group%s, but '%s' has %d" % (n, plural, groups, count)
33
34
def concat_padded(base, *args):
    """
    Concatenate ``base`` and each further argument with ``_``; string
    arguments are appended verbatim, numbers are incremented by one and
    zero-padded to 4 digits.
    """
    ret = base
    for value in args:
        # strings pass through; numbers become 1-based, 4-digit counters
        ret = "%s_%s" % (ret, value) if isinstance(value, str) else "%s_%04i" % (ret, value + 1)
    return ret
45
46
def remove_non_path_from_url(url):
    """
    Strip everything after the path part of a URL-like string:
    query, fragment identifier, and any trailing slashes.
    """
    for separator in ('?', '#'):
        url = url.partition(separator)[0]
    return re.sub(r"/+$", "", url)
54
55
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file ``ocrd_file``
    and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
    If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
    Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp
    (as a fallback counter). Increment counter until there is no more ID conflict.
    """
    ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    if ret == ocrd_file.ID:
        # Substitution did not change anything, so fall back to a counter:
        # prefer a >=3-digit number taken from the page ID (e.g. 'PHYS_0007') ...
        m = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '')
        if m:
            n = m.group(1)
        else:
            # ... else use the file's position within its fileGrp (same mimetype)
            ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
            try:
                n = ids.index(ocrd_file.ID)
            except ValueError:
                n = len(ids)
        ret = concat_padded(output_file_grp, n)
        while ocrd_file.mets.find_files(ID=ret):
            # BUGFIX: ``n`` is a *string* when taken from the page ID above, so
            # the original ``n += 1`` raised TypeError on the first ID conflict.
            # Converting first keeps the integer branch's behavior unchanged.
            n = int(n) + 1
            ret = concat_padded(output_file_grp, n)
    return ret
79
80
def nth_url_segment(url, n=-1):
    """
    Return the ``n``-th /-delimited segment of a URL-like string
    (after stripping query, fragment and trailing slashes),
    or ``''`` when the index is out of range.

    Arguments:
        url (string):
        n (integer): index of segment, default: -1
    """
    segments = remove_non_path_from_url(url).split('/')
    if -len(segments) <= n < len(segments):
        return segments[n]
    return ''
93
94
def get_local_filename(url, start=None):
    """
    Return local filename, optionally relative to ``start``

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. Raise an exception if not a prefix of url
    """
    if url.startswith(('https://', 'http:')):
        raise Exception("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces those, they are always absolute
    if url.startswith('file:/'):
        raise Exception("Invalid (java) URL: %s" % url)
    if not start:
        return url
    if not url.startswith(start):
        raise Exception("Cannot remove prefix %s from url %s" % (start, url))
    prefix = start if start.endswith('/') else start + '/'
    return url[len(prefix):]
116
117
def is_local_filename(url):
    """
    Whether a url is a local filename (a ``file://`` URL, or any
    string without a scheme separator).
    """
    if url.startswith('file://'):
        return True
    return '://' not in url
122
123
def is_string(val):
    """
    Test whether ``val`` is an instance of ``str``.
    """
    return isinstance(val, str)
128
129
130
def parse_json_string_with_comments(val):
    """
    Parse a string of JSON interspersed with #-prefixed full-line comments
    """
    # blank out comment lines, then hand the remainder to the JSON parser
    uncommented = re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE)
    return json.loads(uncommented)
136
137
def parse_json_string_or_file(*values):
    """
    Parse each value as either the path of a JSON file or a literal JSON
    object, merging all results into one ``dict`` (later values override
    earlier keys).

    Empty (or whitespace-only) strings are equivalent to '{}'

    Raises:
        ValueError: if a value parses to something other than a JSON object,
            or cannot be parsed at all.
    """
    ret = {}
    for value in values:
        err = None
        value_parsed = None
        if re.fullmatch(r"\s*", value):
            # empty/whitespace-only input contributes nothing
            continue
        try:
            try:
                # first try to interpret the value as a file name ...
                with open(value, 'r') as f:
                    value_parsed = parse_json_string_with_comments(f.read())
            except OSError:
                # (FileNotFoundError is a subclass of OSError, so listing both
                #  as the original did was redundant; literal JSON typically
                #  fails here with ENOENT or ENAMETOOLONG)
                # ... otherwise parse the value itself as JSON
                value_parsed = parse_json_string_with_comments(value.strip())
            if not isinstance(value_parsed, dict):
                err = ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, value_parsed))
        except json.decoder.JSONDecodeError as e:
            err = ValueError("Error parsing '%s': %s" % (value, e))
        if err:
            raise err       # pylint: disable=raising-bad-type
        ret.update(value_parsed)
    return ret
163
164
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file.

    Every run of characters outside ``[A-Za-z0-9]`` is collapsed into a
    single ``.``.
    """
    # (removed commented-out debug print)
    return re.sub('[^A-Za-z0-9]+', '.', url)
171
172