Passed
Pull Request — master (#536)
by Konstantin
01:40
created

ocrd_utils.str   A

Complexity

Total Complexity 34

Size/Duplication

Total Lines 171
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 34
eloc 101
dl 0
loc 171
rs 9.68
c 0
b 0
f 0

11 Functions

Rating   Name   Duplication   Size   Complexity  
A assert_file_grp_cardinality() 0 9 3
A is_local_filename() 0 5 1
A remove_non_path_from_url() 0 8 1
A is_string() 0 5 1
A nth_url_segment() 0 13 2
B get_local_filename() 0 22 8
B parse_json_string_or_file() 0 26 8
A safe_filename() 0 7 1
A parse_json_string_with_comments() 0 6 1
A concat_padded() 0 11 3
A make_file_id() 0 24 5
1
"""
2
Utility functions for strings, paths and URLs.
3
"""
4
5
import re
6
import json
7
8
# Public API of this module.
__all__ = [
    'assert_file_grp_cardinality',
    'concat_padded',
    'get_local_filename',
    'is_local_filename',
    'is_string',
    'make_file_id',
    'nth_url_segment',
    'parse_json_string_or_file',
    'parse_json_string_with_comments',
    'remove_non_path_from_url',
    'safe_filename',
    # NOTE(review): 'set_json_key_value_overrides' is exported here but is not
    # defined anywhere in this module view — confirm it is defined or
    # re-exported elsewhere, otherwise `from ... import *` will fail.
    'set_json_key_value_overrides',
]
22
23
24
def assert_file_grp_cardinality(grps, n):
    """
    Assert that ``grps`` — a comma-separated string or a list of fileGrp
    names — contains exactly ``n`` entries.

    Raises:
        AssertionError: if the number of entries differs from ``n``.
    """
    groups = grps.split(',') if isinstance(grps, str) else grps
    count = len(groups)
    plural = '' if n == 1 else 's'
    assert count == n, \
        "Expected exactly %d output file group%s, but '%s' has %d" % (n, plural, groups, count)
33
34
def concat_padded(base, *args):
    """
    Concatenate ``base`` and each further argument with ``_``; string
    arguments are appended verbatim, numbers are incremented by one and
    zero-padded to 4 digits.
    """
    ret = base
    for value in args:
        # strings pass through; numbers become 1-based, 4-digit counters
        ret = "%s_%s" % (ret, value) if isinstance(value, str) else "%s_%04i" % (ret, value + 1)
    return ret
45
46
def remove_non_path_from_url(url):
    """
    Strip everything after the path part of a URL-like string:
    query, fragment identifier, and any trailing slashes.
    """
    for separator in ('?', '#'):
        url = url.partition(separator)[0]
    return re.sub(r"/+$", "", url)
54
55
def make_file_id(ocrd_file, output_file_grp):
    """
    Derive a new file ID for an output file from an existing input file ``ocrd_file``
    and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
    If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
    Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp
    (as a fallback counter). Increment counter until there is no more ID conflict.
    """
    ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
    if ret == ocrd_file.ID:
        # Substitution did not change anything, so fall back to a counter:
        # prefer a >=3-digit number taken from the page ID (e.g. 'PHYS_0007') ...
        m = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '')
        if m:
            n = m.group(1)
        else:
            # ... else use the file's position within its fileGrp (same mimetype)
            ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
            try:
                n = ids.index(ocrd_file.ID)
            except ValueError:
                n = len(ids)
        ret = concat_padded(output_file_grp, n)
        while ocrd_file.mets.find_files(ID=ret):
            # BUGFIX: ``n`` is a *string* when taken from the page ID above, so
            # the original ``n += 1`` raised TypeError on the first ID conflict.
            # Converting first keeps the integer branch's behavior unchanged.
            n = int(n) + 1
            ret = concat_padded(output_file_grp, n)
    return ret
79
80
def nth_url_segment(url, n=-1):
    """
    Return the ``n``-th /-delimited segment of a URL-like string
    (after stripping query, fragment and trailing slashes),
    or ``''`` when the index is out of range.

    Arguments:
        url (string):
        n (integer): index of segment, default: -1
    """
    segments = remove_non_path_from_url(url).split('/')
    if -len(segments) <= n < len(segments):
        return segments[n]
    return ''
93
94
def get_local_filename(url, start=None):
    """
    Return local filename, optionally relative to ``start``

    Arguments:
        url (string): filename or URL
        start (string): Base path to remove from filename. Raise an exception if not a prefix of url
    """
    if url.startswith(('https://', 'http:')):
        raise Exception("Can't determine local filename of http(s) URL")
    if url.startswith('file://'):
        url = url[len('file://'):]
    # Goobi/Kitodo produces those, they are always absolute
    if url.startswith('file:/'):
        raise Exception("Invalid (java) URL: %s" % url)
    if not start:
        return url
    if not url.startswith(start):
        raise Exception("Cannot remove prefix %s from url %s" % (start, url))
    prefix = start if start.endswith('/') else start + '/'
    return url[len(prefix):]
116
117
def is_local_filename(url):
    """
    Whether a url is a local filename (a ``file://`` URL, or any
    string without a scheme separator).
    """
    if url.startswith('file://'):
        return True
    return '://' not in url
122
123
def is_string(val):
    """
    Test whether ``val`` is an instance of ``str``.
    """
    return isinstance(val, str)
128
129
130
def parse_json_string_with_comments(val):
    """
    Parse a string of JSON interspersed with #-prefixed full-line comments
    """
    # blank out comment lines, then hand the remainder to the JSON parser
    uncommented = re.sub(r'^\s*#.*$', '', val, flags=re.MULTILINE)
    return json.loads(uncommented)
136
137
def parse_json_string_or_file(*values):
    """
    Parse each value as either the path of a JSON file or a literal JSON
    object, merging all results into one ``dict`` (later values override
    earlier keys).

    Empty (or whitespace-only) strings are equivalent to '{}'

    Raises:
        ValueError: if a value parses to something other than a JSON object,
            or cannot be parsed at all.
    """
    ret = {}
    for value in values:
        err = None
        value_parsed = None
        if re.fullmatch(r"\s*", value):
            # empty/whitespace-only input contributes nothing
            continue
        try:
            try:
                # first try to interpret the value as a file name ...
                with open(value, 'r') as f:
                    value_parsed = parse_json_string_with_comments(f.read())
            except OSError:
                # (FileNotFoundError is a subclass of OSError, so listing both
                #  as the original did was redundant; literal JSON typically
                #  fails here with ENOENT or ENAMETOOLONG)
                # ... otherwise parse the value itself as JSON
                value_parsed = parse_json_string_with_comments(value.strip())
            if not isinstance(value_parsed, dict):
                err = ValueError("Not a valid JSON object: '%s' (parsed as '%s')" % (value, value_parsed))
        except json.decoder.JSONDecodeError as e:
            err = ValueError("Error parsing '%s': %s" % (value, e))
        if err:
            raise err       # pylint: disable=raising-bad-type
        ret.update(value_parsed)
    return ret
163
164
def safe_filename(url):
    """
    Sanitize input to be safely used as the basename of a local file.

    Every run of characters outside ``[A-Za-z0-9]`` is collapsed into a
    single ``.``.
    """
    # (removed commented-out debug print)
    return re.sub('[^A-Za-z0-9]+', '.', url)
171
172