ocrd_utils.str.sparkline()   A
last analyzed

Complexity

Conditions 4

Size

Total Lines 13
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 13
rs 10
c 0
b 0
f 0
cc 4
nop 1
1
"""
2
Utility functions for strings, paths and URL.
3
"""
4
5
import re
6
import json
7
from typing import List
8
from .constants import REGEX_FILE_ID, SPARKLINE_CHARS
9
#from .deprecate import deprecation_warning
10
from deprecated import deprecated
11
from warnings import warn
12
from numpy import array_split
13
14
# Public API of this module; kept sorted alphabetically.
# FIX: generate_range and sparkline are public functions defined below
# but were missing from __all__.
__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'generate_range',
    'get_local_filename',
    'is_local_filename',
    'is_string',
    'make_file_id',
    'make_xml_id',
    'nth_url_segment',
    'parse_json_file_with_comments',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'partition_list',
    'remove_non_path_from_url',
    'safe_filename',
    'sparkline',
]
30
31
32
@deprecated(version='3.0', reason='specify input and output file_grp_cardinality in ocrd-tool.json instead')
def assert_file_grp_cardinality(grps, n, msg=None):
    """
    Assert that ``grps`` (a list, or a string of comma-separated fileGrps)
    contains exactly ``n`` entries, with an optional extra hint ``msg``.
    """
    if isinstance(grps, str):
        grps = grps.split(',')
    count = len(grps)
    # note: uses assert, so this check disappears under `python -O`
    assert count == n, \
        "Expected exactly %d output file group%s%s, but '%s' has %d" % (
            n,
            '' if n == 1 else 's',
            ' (%s)' % msg if msg else '',
            grps,
            count,
        )
47
48
49
def concat_padded(base, *args):
    """
    Concatenate ``base`` with each argument, separated by underscores.

    String arguments are appended verbatim; everything else is rendered
    as a zero-padded 4-digit number. With no ``args``, returns ``base``
    unchanged.
    """
    ret = base
    for arg in args:
        part = arg if isinstance(arg, str) else '%04i' % arg
        ret = '%s_%s' % (ret, part)
    return ret
60
61
62
def remove_non_path_from_url(url):
    """
    Strip everything after the path of a URL-like string:
    the query, the fragment identifier and any trailing slashes.
    """
    url = url.partition('?')[0]   # drop query
    url = url.partition('#')[0]   # drop fragment identifier
    return url.rstrip('/')        # drop trailing slashes
70
71
72
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file
    ``ocrd_file`` and the name of the output file's ``fileGrp/@USE``,
    ``output_file_grp``.

    If ``ocrd_file``'s ID contains the input file's fileGrp name, that part is
    replaced by ``output_file_grp``. Otherwise, if ``ocrd_file`` has a
    ``pageId`` not already contained in ``ocrd_file.ID``, the result is
    ``output_file_grp`` concatenated with ``ocrd_file.pageId``; else it is
    ``output_file_grp`` concatenated with ``ocrd_file.ID``.

    Note: ``make_file_id`` cannot guarantee that the new ID is unique within
    an actual :py:class:`ocrd_models.ocrd_mets.OcrdMets`. The caller is
    responsible for ensuring uniqueness of files to be added. Ultimately, ID
    conflicts will lead to :py:meth:`ocrd_models.ocrd_mets.OcrdMets.add_file`
    raising an exception. This can be avoided if all processors use
    ``make_file_id`` consistently for ID generation.

    Note: ``make_file_id`` generates page-specific IDs. For IDs representing
    page segments or ``pc:AlternativeImage`` files, the output of
    ``make_file_id`` may need to be concatenated with a unique string for that
    sub-page element, such as `".IMG"` or the segment ID.
    """
    # rationale for this behaviour:
    # - uniqueness (in spite of different METS and processor conventions)
    # - predictability (i.e. output name can be anticipated from the input name)
    # - stability (i.e. output at least as much sorted and consistent as the input)
    # ... and all this in spite of --page-id selection and --overwrite
    # (i.e. --overwrite should target the existing ID, and input vs output
    #  IDs should be different, except when overwriting the input fileGrp)
    new_id = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    if new_id == ocrd_file.ID and output_file_grp != ocrd_file.fileGrp:
        # the fileGrp substitution was a no-op: fall back to concatenation
        if ocrd_file.pageId and ocrd_file.pageId not in ocrd_file.ID:
            suffix = ocrd_file.pageId
        else:
            suffix = ocrd_file.ID
        new_id = '%s_%s' % (output_file_grp, suffix)
    return make_xml_id(new_id)
106
107
108
def make_xml_id(idstr: str) -> str:
    """
    Turn ``idstr`` into a valid ``xml:id`` literal: replace ``:`` and ``/``
    with ``_``, prepend ``id_`` if the first character cannot start an
    ``xml:id``, and strip every remaining character that is not
    alphanumeric, ``_``, ``.`` or ``-``.
    """
    # fast path: already a valid file ID, leave untouched
    if REGEX_FILE_ID.fullmatch(idstr):
        return idstr
    sanitized = idstr.replace(':', '_').replace('/', '_')
    # prefix with 'id_' when the first character is not a letter or '_'
    sanitized = re.sub(r'^([^a-zA-Z_])', r'id_\1', sanitized)
    # drop anything outside [A-Za-z0-9_.-]
    return re.sub(r'[^\w.-]', '', sanitized)
121
122
123
def nth_url_segment(url, n=-1):
    """
    Return the ``n``-th /-delimited segment of a URL-like string
    (after stripping query, fragment and trailing slashes).

    Arguments:
        url (string):
        n (integer): index of segment, default: -1

    Returns the empty string when ``n`` is out of range.
    """
    # strip query, fragment identifier and trailing slashes before splitting
    path = re.sub(r"/+$", "", url.split('?', 1)[0].split('#', 1)[0])
    segments = path.split('/')
    if -len(segments) <= n < len(segments):
        return segments[n]
    return ''
136
137
138
def get_local_filename(url, start=None):
    """
    Return local filename, optionally relative to ``start``

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. Raise an exception if not a prefix of url
    """
    if url.startswith(('https://', 'http:')):
        raise ValueError("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces 'file:/...' URLs; those are always absolute
    if url.startswith('file:/'):
        url = url[len('file:'):]
    if start:
        if not url.startswith(start):
            raise ValueError("Cannot remove prefix %s from url %s" % (start, url))
        prefix = start if start.endswith('/') else start + '/'
        url = url[len(prefix):]
    return url
160
161
162
def is_local_filename(url):
    """
    Whether ``url`` refers to a local file (either a plain path without a
    URL scheme, or an explicit ``file://`` URL).
    """
    if '://' in url:
        return url.startswith('file://')
    return True
168
169
170
def is_string(val):
    """
    Whether ``val`` is an instance of ``str``.
    """
    return isinstance(val, str)
175
176
177
def parse_json_file_with_comments(val):
    """
    Parse a file of JSON interspersed with #-prefixed full-line comments
    """
    with open(val, 'r', encoding='utf-8') as inputf:
        content = inputf.read()
    # blank out full-line comments, then parse as plain JSON
    return json.loads(re.sub(r'^\s*#.*$', '', content, flags=re.MULTILINE))
183
184
185
def parse_json_string_with_comments(val):
    """
    Parse a string of JSON interspersed with #-prefixed full-line comments
    """
    # blank out any line whose first non-whitespace character is '#'
    without_comments = re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE)
    return json.loads(without_comments)
191
192
193
def parse_json_string_or_file(*values, resolve_preset_file=None):    # pylint: disable=unused-argument
    """
    Parse each of ``values`` as either the path to a JSON object or a literal
    JSON object (with #-prefixed full-line comments allowed), merging the
    results left to right.

    Empty strings are equivalent to '{}'.

    Raises:
        ValueError: if a value parses to something other than a JSON object,
            or cannot be parsed at all.
    """
    def _loads_with_comments(jsonstr):
        # blank out #-prefixed full-line comments before parsing
        return json.loads(re.sub(r'^\s*#.*$', '', jsonstr, flags=re.MULTILINE))

    ret = {}
    for value in values:
        if re.fullmatch(r"\s*", value):
            # whitespace-only value contributes nothing
            continue
        err = None
        parsed = None
        try:
            try:
                # first, try to treat the value as a (possibly resolvable) path
                path = value
                if callable(resolve_preset_file):
                    path = resolve_preset_file(value) or value
                with open(path, 'r') as f:
                    parsed = _loads_with_comments(f.read())
            except (FileNotFoundError, OSError):
                # not a readable file: treat the value as literal JSON
                parsed = _loads_with_comments(value.strip())
            if not isinstance(parsed, dict):
                err = ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, parsed))
        except json.decoder.JSONDecodeError as e:
            err = ValueError("Error parsing '%s': %s" % (value, e))
        if err:
            raise err       # pylint: disable=raising-bad-type
        ret = {**ret, **parsed}
    return ret
222
223
224
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file.
    """
    # collapse every run of non-word characters (including '.') into one '_'
    sanitized = re.sub(r'[^\w]+', '_', url)
    # NOTE(review): the first substitution already replaced all dots, so the
    # next two passes look like no-ops; kept for behavioral fidelity.
    sanitized = re.sub(r'^\.*', '', sanitized)   # strip leading dots
    return re.sub(r'\.\.*', '.', sanitized)      # collapse runs of dots
233
234
235
def generate_range(start: str, end: str) -> List[str]:
    """
    Generate a list of strings by incrementing the number part of ``start`` until including ``end``.

    The last run of digits in each string is interpreted as the counter; it is
    replaced by every value from the start counter up to and including the end
    counter, zero-padded to the counter's original width.

    Raises:
        ValueError: if either string contains no digits, or if the parts
            before the counter differ between ``start`` and ``end``.
    """
    try:
        # use the last run of digits in each string as the counter
        start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1]
    except IndexError:
        raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) from None
    if start[:-len(start_num)] != end[:-len(end_num)]:
        raise ValueError(f"Range '{start}..{end}' differ in their non-numeric part: "
                         f"'{start[:-len(start_num)]}' != '{end[:-len(end_num)]}'")
    if start_num == end_num:
        # FIX: the warning string previously lacked its % (start, end) arguments
        warn("Range '%s..%s': evaluates to the same number" % (start, end))
    # FIX: substitute the *last* occurrence of the counter instead of
    # str.replace (which replaces the first occurrence and would corrupt
    # e.g. '1_1'..'1_3' into ['2_1', ...])
    pos = start.rfind(start_num)
    prefix, suffix = start[:pos], start[pos + len(start_num):]
    width = len(start_num)
    return [prefix + str(i).zfill(width) + suffix
            for i in range(int(start_num), int(end_num) + 1)]
252
253
254
def partition_list(lst, chunks, chunk_index=None):
    """
    Partition a list into roughly equally-sized chunks

    Args:
        lst (list): list to partition
        chunks (int): number of chunks to generate (not per chunk!)

    Keyword Args:
        chunk_index (None|int): If provided, return only a list consisting of this chunk

    Returns:
        list(list())
    """
    if not lst:
        return []
    # numpy.array_split would happily return empty chunks when there are more
    # chunks than elements, which is problematic in the ocr-d scope
    if chunks > len(lst):
        raise ValueError("Amount of chunks bigger than list size")
    partitions = [part.tolist() for part in array_split(lst, chunks)]
    if chunk_index is None:
        return partitions
    return [partitions[chunk_index]]
278
279
280
def sparkline(values: List[int]) -> str:
    """
    Render a list of points with block characters

    Returns an empty string instead of raising whenever the input cannot be
    rendered (non-numeric or negative values, or no values at all): better
    not to output a sparkline than to cancel execution due to bad input.
    """
    # FIX: max() on an empty sequence would raise ValueError
    if not values:
        return ''
    if any(not isinstance(x, (int, float)) or x < 0 for x in values):
        return ''
    # FIX: guard against ZeroDivisionError when all values are 0;
    # an all-zero input now maps every point to the lowest block character
    max_value = max(values) or 1
    max_mapping = len(SPARKLINE_CHARS) - 1
    # normalize to 0..1 and convert to index in SPARKLINE_CHARS
    mapped = [int(x / max_value * max_mapping) for x in values]
    return ''.join(SPARKLINE_CHARS[x] for x in mapped)
293