ocrd_utils.str   C
last analyzed

Complexity

Total Complexity 53

Size/Duplication

Total Lines 277
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 53
eloc 149
dl 0
loc 277
rs 6.96
c 0
b 0
f 0

16 Functions

Rating   Name   Duplication   Size   Complexity  
A assert_file_grp_cardinality() 0 14 4
A remove_non_path_from_url() 0 8 1
A make_file_id() 0 34 5
A concat_padded() 0 11 3
A is_local_filename() 0 6 1
A is_string() 0 5 1
A sparkline() 0 13 4
A generate_range() 0 16 5
A partition_list() 0 24 4
A nth_url_segment() 0 13 2
A parse_json_file_with_comments() 0 6 2
A make_xml_id() 0 11 2
B get_local_filename() 0 22 8
C parse_json_string_or_file() 0 29 9
A safe_filename() 0 9 1
A parse_json_string_with_comments() 0 6 1

How to fix   Complexity   

Complexity

Complex modules like ocrd_utils.str often do a lot of different things. To break such a module down, we need to identify a cohesive component within it. A common approach to find such a component is to look for functions that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""
2
Utility functions for strings, paths and URL.
3
"""
4
5
import re
6
import json
7
from typing import List
8
from .constants import REGEX_FILE_ID, SPARKLINE_CHARS
9
#from .deprecate import deprecation_warning
10
from deprecated import deprecated
11
from warnings import warn
12
from numpy import array_split
13
14
# Public API of this module. Sorted alphabetically; includes every helper
# defined below (generate_range and sparkline were previously missing, so
# `from ocrd_utils.str import *` did not export them).
__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'generate_range',
    'get_local_filename',
    'is_local_filename',
    'is_string',
    'make_file_id',
    'make_xml_id',
    'nth_url_segment',
    'parse_json_file_with_comments',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'partition_list',
    'remove_non_path_from_url',
    'safe_filename',
    'sparkline',
]
30
31
32
@deprecated(version='3.0', reason='specify input and output file_grp_cardinality in ocrd-tool.json instead')
def assert_file_grp_cardinality(grps, n, msg=None):
    """
    Assert that a string of comma-separated fileGrps contains exactly ``n`` entries.

    Arguments:
        grps (string|list): comma-separated string (or list) of fileGrp names
        n (int): expected number of entries
    Keyword Arguments:
        msg (string): optional extra context appended to the error message
    Raises:
        AssertionError: if the number of entries differs from ``n``
    """
    if isinstance(grps, str):
        grps = grps.split(',')
    # Raise explicitly instead of using the `assert` statement, which is
    # silently stripped when Python runs with optimizations (-O) — the check
    # would then never fire. Same exception type as before for callers.
    if len(grps) != n:
        raise AssertionError(
            "Expected exactly %d output file group%s%s, but '%s' has %d" % (
                n,
                '' if n == 1 else 's',
                ' (%s)' % msg if msg else '',
                grps,
                len(grps)
            ))
47
48
def concat_padded(base, *args):
    """
    Concatenate a base string with any number of suffixes, joining with ``_``
    and zero-padding each numeric suffix to 4 digits.
    """
    result = base
    for arg in args:
        # strings are appended verbatim; anything else is padded as a number
        suffix = arg if isinstance(arg, str) else "%04i" % arg
        result = "%s_%s" % (result, suffix)
    return result
59
60
def remove_non_path_from_url(url):
    """
    Strip the query, fragment identifier, and any trailing slashes
    from a URL-like string, leaving only the path part.
    """
    # everything after '?' (query) or '#' (fragment) is not part of the path
    for separator in ('?', '#'):
        url = url.partition(separator)[0]
    return re.sub(r"/+$", "", url)
68
69
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file ``ocrd_file``
    and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
    If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
    Else if ``ocrd_file`` has a ``pageId`` but it is not contained in the ``ocrd_file.ID``, then
        concatenate ``output_file_grp`` and ``ocrd_file.pageId``.
    Otherwise concatenate ``output_file_grp`` with the ``ocrd_file.ID``.

    Note: ``make_file_id`` cannot guarantee that the new ID is unique within an actual
    :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
    The caller is responsible for ensuring uniqueness of files to be added.
    Ultimately, ID conflicts will lead to :py:meth:`ocrd_models.ocrd_mets.OcrdMets.add_file`
    raising an exception.
    This can be avoided if all processors use ``make_file_id`` consistently for ID generation.

    Note: ``make_file_id`` generates page-specific IDs. For IDs representing page segments
    or ``pc:AlternativeImage`` files, the output of ``make_file_id`` may need to be concatenated
    with a unique string for that sub-page element, such as `".IMG"` or the segment ID.
    """
    # considerations for this behaviour:
    # - uniqueness (in spite of different METS and processor conventions)
    # - predictability (i.e. output name can be anticipated from the input name)
    # - stability (i.e. output at least as much sorted and consistent as the input)
    # ... and all this in spite of --page-id selection and --overwrite
    # (i.e. --overwrite should target the existing ID, and input vs output
    #  IDs should be different, except when overwriting the input fileGrp)
    new_id = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    substitution_failed = new_id == ocrd_file.ID
    if substitution_failed and output_file_grp != ocrd_file.fileGrp:
        # fall back to prefixing with the output fileGrp; prefer the pageId
        # over the raw ID when the pageId is not already encoded in the ID
        if ocrd_file.pageId and ocrd_file.pageId not in ocrd_file.ID:
            new_id = '%s_%s' % (output_file_grp, ocrd_file.pageId)
        else:
            new_id = '%s_%s' % (output_file_grp, ocrd_file.ID)
    return make_xml_id(new_id)
103
104
def make_xml_id(idstr: str) -> str:
    """
    Turn ``idstr`` into a valid ``xml:id`` literal by replacing ``:`` with ``_``, removing everything non-alphanumeric, ``.`` and ``-`` and prepending `id_` if ``idstr`` starts with a number.
    """
    # already a valid xml:id — return unchanged
    if REGEX_FILE_ID.fullmatch(idstr):
        return idstr
    sanitized = idstr.replace(':', '_').replace('/', '_')
    # a leading non-letter/underscore gets an 'id_' prefix
    sanitized = re.sub(r'^([^a-zA-Z_])', r'id_\1', sanitized)
    # drop anything that is not word char, '.' or '-'
    return re.sub(r'[^\w.-]', r'', sanitized)
115
116
def nth_url_segment(url, n=-1):
    """
    Return the ``n``-th /-delimited segment of a URL-like string
    (by default the last one).

    Arguments:
        url (string):
        n (integer): index of segment, default: -1
    """
    # strip query, fragment identifier, and trailing slashes before splitting
    path = url.partition('?')[0].partition('#')[0]
    path = re.sub(r"/+$", "", path)
    segments = path.split('/')
    try:
        return segments[n]
    except IndexError:
        return ''
129
130
def get_local_filename(url, start=None):
    """
    Return local filename, optionally relative to ``start``

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. Raise an exception if not a prefix of url
    """
    if url.startswith(('https://', 'http:')):
        raise ValueError("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces those, they are always absolute
    if url.startswith('file:/'):
        url = url[len('file:'):]
    if not start:
        return url
    if not url.startswith(start):
        raise ValueError("Cannot remove prefix %s from url %s" % (start, url))
    # ensure the separator is consumed along with the prefix
    prefix = start if start.endswith('/') else start + '/'
    return url[len(prefix):]
152
153
def is_local_filename(url):
    """
    Whether a url is a local filename.
    """
    # deprecation_warning("Deprecated so we spot inconsistent URL/file handling")
    if url.startswith('file://'):
        return True
    return '://' not in url
159
160
def is_string(val):
    """
    Test whether ``val`` is an instance of ``str``.
    """
    return isinstance(val, str)
165
166
167
def parse_json_file_with_comments(val):
    """
    Parse a file of JSON interspersed with #-prefixed full-line comments
    """
    with open(val, 'r', encoding='utf-8') as inputf:
        raw = inputf.read()
    # drop full-line comments, then parse the remainder as plain JSON
    return json.loads(re.sub(r'^\s*#.*$', '', raw, flags=re.MULTILINE))
173
174
def parse_json_string_with_comments(val):
    """
    Parse a string of JSON interspersed with #-prefixed full-line comments
    """
    # a comment line is one whose first non-whitespace character is '#'
    comment_line = re.compile(r'^\s*#.*$', flags=re.MULTILINE)
    return json.loads(comment_line.sub('', val))
180
181
def parse_json_string_or_file(*values, resolve_preset_file=None):    # pylint: disable=unused-argument
    """
    Parse each value as either the path of a (commented) JSON file or a
    literal (commented) JSON object, and merge the results into one dict.

    Empty (or whitespace-only) strings are equivalent to '{}'. Later values
    override earlier ones key by key.

    Keyword Arguments:
        resolve_preset_file (callable): optional hook mapping a value to a
            file path; a falsy result falls back to the value itself.

    Raises:
        ValueError: if a value can be parsed neither as a file nor as a JSON
            literal, or parses to something other than a JSON object.
    """
    ret = {}
    for value in values:
        # empty string is equivalent to '{}'
        if not value.strip():
            continue
        try:
            try:
                path = value
                if callable(resolve_preset_file):
                    path = resolve_preset_file(value) or value
                # explicit utf-8, consistent with parse_json_file_with_comments
                with open(path, 'r', encoding='utf-8') as f:
                    value_parsed = parse_json_string_with_comments(f.read())
            except OSError:
                # not a (readable) file — FileNotFoundError is a subclass of
                # OSError, so one except suffices; try as a JSON literal
                value_parsed = parse_json_string_with_comments(value.strip())
        except json.decoder.JSONDecodeError as e:
            raise ValueError("Error parsing '%s': %s" % (value, e)) from e
        if not isinstance(value_parsed, dict):
            raise ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, value_parsed))
        # later values override earlier ones
        ret = {**ret, **value_parsed}
    return ret
210
211
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file.
    """
    # applied in order: collapse runs of non-word chars to '_',
    # strip leading dots, collapse runs of dots to a single dot
    rules = (
        (r'[^\w]+', '_'),
        (r'^\.*', ''),
        (r'\.\.*', '.'),
    )
    sanitized = url
    for pattern, replacement in rules:
        sanitized = re.sub(pattern, replacement, sanitized)
    return sanitized
220
221
def generate_range(start : str, end : str) -> List[str]:
    """
    Generate a list of strings by incrementing the number part of ``start`` until including ``end``.

    The number part is the last run of digits in each string; both strings
    must agree on the surrounding non-numeric part. The zero-padding width
    of ``start`` is preserved.

    Raises:
        ValueError: if either string has no numeric part, or the
            non-numeric parts differ.
    """
    ret = []
    try:
        start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1]
    except IndexError:
        raise ValueError("Range '%s..%s': could not find numeric part" % (start, end))
    if start[:-len(start_num)] != end[:-len(end_num)]:
        raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: '{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'")
    if start_num == end_num:
        # pass the format arguments (previously missing, so the message
        # showed literal '%s..%s' placeholders)
        warn("Range '%s..%s': evaluates to the same number" % (start, end))
    # Splice the counter in at the position of the numeric part instead of
    # str.replace, which replaces ALL occurrences and corrupted ranges like
    # '0001_0001'..'0001_0003' (yielding '0002_0002' etc.).
    pos = start.rfind(start_num)
    pad = len(start_num)
    for i in range(int(start_num), int(end_num) + 1):
        ret.append(start[:pos] + str(i).zfill(pad) + start[pos + pad:])
    return ret
237
238
239
def partition_list(lst, chunks, chunk_index=None):
    """
    Partition a list into roughly equally-sized chunks

    Args:
        lst (list): list to partition
        chunks (int): number of chunks to generate (not per chunk!)

    Keyword Args:
        chunk_index (None|int): If provided, return only a list consisting of this chunk

    Returns:
        list(list())
    """
    if not lst:
        return []
    # numpy.array_split would happily return empty chunks when asked for
    # more chunks than elements — problematic in the ocr-d scope, so refuse
    if chunks > len(lst):
        raise ValueError("Amount of chunks bigger than list size")
    partitions = [part.tolist() for part in array_split(lst, chunks)]
    if chunk_index is None:
        return partitions
    return [partitions[chunk_index]]
263
264
def sparkline(values : List[int]) -> str:
    """
    Render a list of points with block characters.

    Returns an empty string for any problematic input (empty, negative,
    non-numeric, or all-zero values) rather than raising, so callers can
    always interpolate the result safely.
    """
    if any(x is None or not isinstance(x, (int, float)) or x < 0 for x in values):
        # return an empty string on non-positive-int values, better not to
        # output a sparkline than to cancel execution due to problematic input
        return ''
    # default=0 avoids ValueError on empty input; the explicit zero check
    # avoids ZeroDivisionError on all-zero input (which passed the guard
    # above and then crashed on x / max_value)
    max_value = max(values, default=0)
    if max_value == 0:
        return ''
    max_mapping = len(SPARKLINE_CHARS) - 1
    # normalize to 0..1 and convert to index in SPARKLINE_CHARS
    mapped = [int(x / max_value * max_mapping) for x in values]
    return ''.join(SPARKLINE_CHARS[x] for x in mapped)
277