Passed
Push — master ( a2259f...ef70d1 )
by Konstantin
02:37
created

ocrd_utils.str.safe_filename()   A

Complexity

Conditions 1

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 9
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Utility functions for strings, paths and URL.
3
"""
4
5
import re
6
import json
7
from .constants import REGEX_FILE_ID
8
from .deprecate import deprecation_warning
9
from warnings import warn
10
from math import ceil
11
import sys
12
from itertools import islice
13
14
if sys.version_info >= (3, 12):
    # itertools.batched is available (and implemented in C) from Python 3.12 on
    from itertools import batched
else:
    def batched(iterable, chunk_size):
        """
        Backport of :py:func:`itertools.batched` for Python < 3.12:
        yield successive tuples of at most ``chunk_size`` items from ``iterable``.

        Raises:
            ValueError: if ``chunk_size`` is less than 1, mirroring the
                behaviour of the Python 3.12 original (previously a
                non-positive size silently yielded nothing or raised from
                ``islice`` instead).
        """
        if chunk_size < 1:
            raise ValueError('n must be at least one')
        iterator = iter(iterable)
        while True:
            chunk = tuple(islice(iterator, chunk_size))
            if not chunk:
                return
            yield chunk
25
26
# Public API of this module; 'generate_range' was defined below but missing
# from the export list, so `from ocrd_utils.str import *` did not provide it.
__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'generate_range',
    'get_local_filename',
    'is_local_filename',
    'is_string',
    'make_file_id',
    'nth_url_segment',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'partition_list',
    'remove_non_path_from_url',
    'safe_filename',
]
40
41
42
def assert_file_grp_cardinality(grps, n, msg=None):
    """
    Assert that a string of comma-separated fileGrps contains exactly ``n`` entries.

    Arguments:
        grps (string|list): comma-separated fileGrp string (or an already-split list)
        n (integer): expected number of entries
        msg (string): optional extra context to include in the error message

    Raises:
        AssertionError: if the number of fileGrps differs from ``n``
    """
    if isinstance(grps, str):
        grps = grps.split(',')
    # Raise explicitly instead of using the ``assert`` statement, so the
    # check is not silently stripped when running under ``python -O``.
    # The exception type stays AssertionError for backward compatibility.
    if len(grps) != n:
        raise AssertionError(
            "Expected exactly %d output file group%s%s, but '%s' has %d" % (
                n,
                '' if n == 1 else 's',
                ' (%s)' % msg if msg else '',
                grps,
                len(grps)
            ))
56
57
def concat_padded(base, *args):
    """
    Concatenate string and zero-padded 4 digit number
    """
    # String components are appended verbatim, numeric ones zero-padded
    # to 4 digits; with no args, ``base`` is returned unchanged.
    result = base
    for value in args:
        suffix = value if isinstance(value, str) else '%04i' % value
        result = '%s_%s' % (result, suffix)
    return result
68
69
def remove_non_path_from_url(url):
    """
    Remove everything from URL after path.
    """
    # Drop the query string first, then the fragment identifier,
    # then any trailing slashes.
    for separator in ('?', '#'):
        url, _, _ = url.partition(separator)
    return url.rstrip('/')
77
78
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file ``ocrd_file``
    and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
    If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
    Else if ``ocrd_file`` has a ``pageId`` but it is not contained in the ``ocrd_file.ID``, then
        concatenate ``output_file_grp`` and ``ocrd_file.pageId``.
    Otherwise concatenate ``output_file_grp`` with the ``ocrd_file.ID``.

    Note: ``make_file_id`` cannot guarantee that the new ID is unique within an actual
    :py:class:`ocrd_models.ocrd_mets.OcrdMets`.
    The caller is responsible for ensuring uniqueness of files to be added.
    Ultimately, ID conflicts will lead to :py:meth:`ocrd_models.ocrd_mets.OcrdMets.add_file`
    raising an exception.
    This can be avoided if all processors use ``make_file_id`` consistently for ID generation.

    Note: ``make_file_id`` generates page-specific IDs. For IDs representing page segments
    or ``pc:AlternativeImage`` files, the output of ``make_file_id`` may need to be concatenated
    with a unique string for that sub-page element, such as `".IMG"` or the segment ID.
    """
    # Design goals behind this scheme:
    # - uniqueness (in spite of different METS and processor conventions)
    # - predictability (i.e. output name can be anticipated from the input name)
    # - stability (i.e. output at least as much sorted and consistent as the input)
    # ... all in spite of --page-id selection and --overwrite
    # (i.e. --overwrite should target the existing ID, and input vs output
    #  IDs should be different, except when overwriting the input fileGrp)
    new_id = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    if new_id == ocrd_file.ID and output_file_grp != ocrd_file.fileGrp:
        # the fileGrp name did not occur in the ID; fall back to concatenation
        if ocrd_file.pageId and ocrd_file.pageId not in ocrd_file.ID:
            new_id = '%s_%s' % (output_file_grp, ocrd_file.pageId)
        else:
            new_id = '%s_%s' % (output_file_grp, ocrd_file.ID)
    # sanitize into a valid xml:id if necessary
    if not REGEX_FILE_ID.fullmatch(new_id):
        new_id = new_id.replace(':', '_')
        new_id = re.sub(r'^([^a-zA-Z_])', r'id_\1', new_id)
        new_id = re.sub(r'[^\w.-]', r'', new_id)
    return new_id
116
117
def nth_url_segment(url, n=-1):
    """
    Return the n-th /-delimited segment of a URL-like string

    Arguments:
        url (string):
        n (integer): index of segment, default: -1

    Returns an empty string when ``n`` is out of range.
    """
    segments = remove_non_path_from_url(url).split('/')
    # split() always yields at least one element, so only an out-of-range
    # index needs to be guarded against
    if -len(segments) <= n < len(segments):
        return segments[n]
    return ''
130
131
def get_local_filename(url, start=None):
    """
    Return local filename, optionally relative to ``start``

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. Raise an exception if not a prefix of url

    Raises:
        ValueError: if ``url`` is a http(s) URL, or ``start`` is not a prefix of ``url``
    """
    # Reject both remote schemes consistently: the previous check compared
    # against 'https://' but 'http:', so a malformed 'https:/...' slipped
    # through while 'http:/...' was rejected.
    if url.startswith(('http:', 'https:')):
        raise ValueError("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces 'file:/...' (single slash); those are always absolute
    if url.startswith('file:/'):
        url = url[len('file:'):]
    if start:
        if not url.startswith(start):
            raise ValueError("Cannot remove prefix %s from url %s" % (start, url))
        # make the prefix removal include the path separator
        if not start.endswith('/'):
            start += '/'
        url = url[len(start):]
    return url
153
154
def is_local_filename(url):
    """
    Whether a url is a local filename.
    """
    # deprecation_warning("Deprecated so we spot inconsistent URL/file handling")
    # file:// URLs are local; anything else counts as local unless it
    # carries an explicit scheme separator
    if url.startswith('file://'):
        return True
    return '://' not in url
160
161
def is_string(val):
    """
    Check whether ``val`` is an instance of ``str``.
    """
    return isinstance(val, str)
166
167
168
def parse_json_string_with_comments(val):
    """
    Parse a string of JSON interspersed with #-prefixed full-line comments
    """
    # blank out comment lines (optionally indented, starting with '#')
    # before handing the remainder to the strict JSON parser
    return json.loads(re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE))
174
175
def parse_json_string_or_file(*values):    # pylint: disable=unused-argument
    """
    Parse a string as either the path to a JSON object or a literal JSON object.

    Empty strings are equivalent to '{}'

    Arguments:
        *values (string): any number of file paths or JSON object literals;
            later values override earlier ones key by key

    Returns:
        dict: the merged result of all parsed objects

    Raises:
        ValueError: if a value parses to something other than a JSON object,
            or cannot be parsed at all
    """
    ret = {}
    for value in values:
        err = None
        value_parsed = None
        # blank values contribute nothing (equivalent to '{}')
        if re.fullmatch(r"\s*", value):
            continue
        try:
            try:
                # first try to interpret the value as a file name
                with open(value, 'r') as f:
                    value_parsed = parse_json_string_with_comments(f.read())
            except OSError:
                # not a (readable) file — FileNotFoundError is a subclass of
                # OSError, so a single handler suffices; treat the value as a
                # JSON literal instead
                value_parsed = parse_json_string_with_comments(value.strip())
            if not isinstance(value_parsed, dict):
                err = ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, value_parsed))
        except json.decoder.JSONDecodeError as e:
            err = ValueError("Error parsing '%s': %s" % (value, e))
        # raise outside the try block so the ValueError is not re-caught
        if err:
            raise err       # pylint: disable=raising-bad-type
        ret = {**ret, **value_parsed}
    return ret
201
202
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file.

    Every run of non-word characters (anything except letters, digits and
    underscore — including dots and path separators) is replaced with a
    single underscore.
    """
    # NOTE: the previous implementation additionally stripped leading dots
    # and collapsed dot runs, but both steps were dead code: the substitution
    # below already replaces every '.' (a non-\w character) with '_', so no
    # dots can remain afterwards. Behavior is unchanged.
    return re.sub(r'[^\w]+', '_', url)
211
212
def generate_range(start, end):
    """
    Generate a list of strings by incrementing the number part of ``start`` until including ``end``.

    Arguments:
        start (string): first value of the range, e.g. ``'PHYS_0001'``
        end (string): last value of the range, e.g. ``'PHYS_0005'``

    Returns:
        list of strings from ``start`` up to and including ``end``

    Raises:
        ValueError: if ``start`` or ``end`` contains no numeric part
    """
    ret = []
    try:
        # the *last* run of digits in each string is the counter
        start_num, end_num = re.findall(r'\d+', start)[-1], re.findall(r'\d+', end)[-1]
    except IndexError:
        # suppress the chained IndexError — it is an implementation detail
        raise ValueError("Range '%s..%s': could not find numeric part" % (start, end)) from None
    if start_num == end_num:
        # BUGFIX: the format arguments were missing before, so the warning
        # printed the literal '%s..%s' placeholders
        warn("Range '%s..%s': evaluates to the same number" % (start, end))
    # NOTE: str.replace substitutes *every* occurrence of start_num, so a
    # start value whose prefix repeats the numeric part may be mangled
    for i in range(int(start_num), int(end_num) + 1):
        ret.append(start.replace(start_num, str(i).zfill(len(start_num))))
    return ret
226
227
def partition_list(lst, chunks, chunk_index=None):
    """
    Partition a list into roughly equally-sized chunks

    Args:
        lst (list): list to partition
        chunks (int): number of chunks to generate (not per chunk!)

    Keyword Args:
        chunk_index (None|int): If provided, return only a list consisting of this chunk

    Returns:
        list(list())
    """
    if not lst:
        return []
    # round the chunk size up so that no item is dropped
    # (NOTE: for some inputs this can yield fewer than ``chunks`` chunks,
    #  e.g. 5 items in 4 chunks gives [2, 2, 1] — presumably acceptable
    #  under "roughly equally-sized"; confirm with callers)
    chunk_size = ceil(len(lst) / chunks)
    partitions = [list(batch) for batch in batched(lst, chunk_size)]
    if chunk_index is None:
        return partitions
    return [partitions[chunk_index]]
248