StringUtils.roman_to_arabic()   B
last analyzed

Complexity

Conditions 7

Size

Total Lines 40
Code Lines 26

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 26
dl 0
loc 40
rs 7.856
c 0
b 0
f 0
cc 7
nop 4
1
# SPDX-FileCopyrightText: Copyright 2020-2023, Contributors to pocketutils
2
# SPDX-PackageHomePage: https://github.com/dmyersturnbull/pocketutils
3
# SPDX-License-Identifier: Apache-2.0
4
"""
5
6
"""
7
8
import re
9
from collections.abc import Callable, Iterable, Mapping, Sequence
10
from dataclasses import dataclass
11
from typing import Any, Self, TypeVar
12
13
import orjson
14
import regex
15
16
from pocketutils.core.exceptions import ValueIllegalError, ValueOutOfRangeError
17
18
__all__ = ["StringUtils", "StringTools"]
19
20
K_contra = TypeVar("K_contra", contravariant=True)
21
V_co = TypeVar("V_co", covariant=True)
22
_control_chars = regex.compile(r"\p{C}", flags=regex.VERSION1)
23
24
25
def is_true_iterable(s: Any) -> bool:
    """
    Tells whether `s` is an iterable that is not string-like.

    `None`, `str`, `bytes`, `bytearray`, and `memoryview` are all reported as False.
    """
    if s is None or isinstance(s, str):
        return False
    if isinstance(s, (bytes, bytearray, memoryview)):
        return False
    return isinstance(s, Iterable)
32
33
34
@dataclass(slots=True, frozen=True)
35
class StringUtils:
36
    def pretty_dict(self: Self, dct: Mapping[Any, Any]) -> str:
        """
        Serializes a mapping to JSON text indented by two spaces.

        Serialization is delegated to `orjson`, so it fails on values that are not JSON-serializable.
        """
        encoded = orjson.dumps(dct, option=orjson.OPT_INDENT_2)
        return encoded.decode(encoding="utf-8")
42
43
    def join_to_str(self: Self, *items: Any, last: str, sep: str = ", ") -> str:
44
        """
45
        Joins items to something like "cat, dog, and pigeon" or "cat, dog, or pigeon".
46
47
        Args:
48
            *items: Items to join; `str(item) for item in items` will be used
49
            last: Probably "and", "or", "and/or", or ""
50
                    Spaces are added/removed as needed if `suffix` is alphanumeric
51
                    or "and/or", after stripping whitespace off the ends.
52
            sep: Used to separate all words; include spaces as desired
53
54
        Examples:
55
            - `join_to_str(["cat", "dog", "elephant"], last="and")  # cat, dog, and elephant`
56
            - `join_to_str(["cat", "dog"], last="and")  # cat and dog`
57
            - `join_to_str(["cat", "dog", "elephant"], last="", sep="/")  # cat/dog/elephant`
58
        """
59
60
    def strip_control_chars(self: Self, s: str) -> str:
        """
        Removes every character matched by the module-level `_control_chars` pattern,
        which matches the Unicode general category 'C' (Other).
        """
        cleaned = _control_chars.sub("", s)
        return cleaned
65
66
    def roman_to_arabic(self: Self, roman: str, min_val: int | None = None, max_val: int | None = None) -> int:
67
        """
68
        Converts roman numerals to an integer.
69
70
        Args:
71
            roman: A string like "MCIV"
72
            min_val: Raise a ValueError if the parsed value is less than this
73
            max_val: Raise a ValueError if the parsed value is more than this
74
75
        Returns:
76
            The arabic numeral as a Python int
77
        """
78
        # this order is IMPORTANT!
79
        mp = {
80
            "IV": 4,
81
            "IX": 9,
82
            "XL": 40,
83
            "XC": 90,
84
            "CD": 400,
85
            "CM": 900,
86
            "I": 1,
87
            "V": 5,
88
            "X": 10,
89
            "L": 50,
90
            "C": 100,
91
            "D": 500,
92
            "M": 1000,
93
        }
94
        for k, v in mp.items():
95
            roman = roman.replace(k, str(v))
96
        # it'll just error if it's empty
97
        try:
98
            value = sum(int(num) for num in roman)
99
        except (ValueError, StopIteration):
100
            msg = f"Cannot parse roman numerals '{roman}'"
101
            raise ValueIllegalError(msg, value=roman)
102
        if min_val is not None and value < min_val or max_val is not None and value > max_val:
103
            msg = f"Value {roman} (int={value}) is out of range ({min_val}, {max_val})"
104
            raise ValueIllegalError(msg, value=roman)
105
        return value
106
107
    def tabs_to_list(self: Self, s: str) -> Sequence[str]:
108
        """
109
        Splits by tabs, but preserving quoted tabs, stripping quotes.
110
        In other words, will not split within a quoted substring.
111
        Double and single quotes are handled.
112
        """
113
        pat = re.compile(r"""((?:[^\t"']|"[^"]*"|'[^']*')+)""")
114
115
        # Don't strip double 2x quotes: ex ""55"" should be "55", not 55
116
        def strip(i: str) -> str:
117
            if i.endswith(('"', "'")):
118
                i = i[:-1]
119
            if i.startswith(('"', "'")):
120
                i = i[1:]
121
            return i.strip()
122
123
        return [strip(i) for i in pat.findall(s)]
124
125
    def truncate(self: Self, s: str | None, n: int = 40, *, null: str | None = None) -> str | None:
126
        """
127
        Truncates a string and adds ellipses, if needed.
128
129
        Returns a string if it has `n` or fewer characters;
130
        otherwise truncates to length `n-1` and appends `…` (UTF character).
131
        If `s` is None and `always_dots` is True, returns `n` copies of `.` (as a string).
132
        If `s` is None otherwise, returns None.
133
134
        Args:
135
            s: The string
136
            n: The maximum length, inclusive
137
            null: Replace `None` with this string
138
139
        Returns:
140
            A string or None
141
        """
142
        if s is None:
143
            return null
144
        if len(s) > n:
145
            nx = max(0, n - 1)
146
            return s[:nx] + "…"
147
        return s
148
149
    def strip_any_ends(self: Self, s: str, prefixes: str | Sequence[str], suffixes: str | Sequence[str]) -> str:
        """
        Flexible variant that strips any number of prefixes and any number of suffixes.
        Also less type-safe than more specific variants.

        Each prefix, and then each suffix, is tried exactly once in the order given,
        so the order of the prefixes (or suffixes) DOES matter.
        A single str (or any value that `is_true_iterable` rejects) is treated as one prefix/suffix.
        All values, including `s`, are converted with `str()`.
        An empty prefix or suffix leaves the string unchanged.

        Args:
            s: The string to strip
            prefixes: One prefix, or a sequence of prefixes
            suffixes: One suffix, or a sequence of suffixes

        Returns:
            The stripped string
        """
        prefixes = [str(z) for z in prefixes] if is_true_iterable(prefixes) else [str(prefixes)]
        suffixes = [str(z) for z in suffixes] if is_true_iterable(suffixes) else [str(suffixes)]
        s = str(s)
        for pre in prefixes:
            s = s.removeprefix(pre)
        for suf in suffixes:
            # removesuffix ignores an empty suffix; slicing with `s[: -len(suf)]` would empty `s`
            s = s.removesuffix(suf)
        return s
165
166
    def strip_brackets(self: Self, text: str) -> str:
167
        """
168
        Strips any and all pairs of brackets from start and end of a string, but only if they're paired.
169
170
        See Also:
171
             strip_paired
172
        """
173
        pieces = [
174
            "()",
175
            "[]",
176
            "[]",
177
            "{}",
178
            "<>",
179
            "⦗⦘",
180
            "⟨⟩",
181
            "⸨⸩",
182
            "⟦〛",
183
            "《》",
184
            "〘〙",
185
        ]
186
        return StringTools.strip_paired(text, pieces)
187
188
    def strip_quotes(self: Self, text: str) -> str:
189
        """
190
        Strips any and all pairs of quotes from start and end of a string, but only if they're paired.
191
192
        See Also:
193
            strip_paired
194
        """
195
        pieces = [
196
            "`",
197
            "`",
198
            "”“",
199
            "''",
200
            '""',
201
        ]
202
        return StringTools.strip_paired(text, pieces)
203
204
    def strip_brackets_and_quotes(self: Self, text: str) -> str:
205
        """
206
        Strips any and all pairs of brackets and quotes from start and end of a string, but only if they're paired.
207
208
        See Also:
209
            strip_paired
210
        """
211
        pieces = [
212
            "()",
213
            "[]",
214
            "[]",
215
            "{}",
216
            "<>",
217
            "⦗⦘",
218
            "⟨⟩",
219
            "⸨⸩",
220
            "⟦〛",
221
            "《》",
222
            "〘〙",
223
            "`",
224
            "`",
225
            "”“",
226
            "''",
227
            '""',
228
        ]
229
        return StringTools.strip_paired(text, pieces)
230
231
    def strip_paired(self: Self, text: str, pieces: Iterable[tuple[str, str] | str]) -> str:
232
        """
233
        Strips pairs of (start, end) from the ends of strings.
234
235
        Example:
236
237
            StringTools.strip_paired("[(abc]", [("()"), ("[]"))  # returns "(abc"
238
239
        See Also:
240
            [`strip_brackets`](pocketutils.tools.string_tools.StringUtils.strip_brackets)
241
        """
242
        if any(a for a in pieces if len(a) != 2):
243
            msg = f"Each item must be a string of length 2: (stard, end); got {pieces}"
244
            raise ValueIllegalError(msg, value=str(pieces))
245
        text = str(text)
246
        while len(text) > 0:
247
            yes = False
248
            for a, b in pieces:
249
                while text.startswith(a) and text.endswith(b):
250
                    text = text[1:-1]
251
                    yes = True
252
            if not yes:
253
                break
254
        return text
255
256
    def replace_digits_with_superscript_chars(self: Self, s: str | float) -> str:
257
        """
258
        Replaces digits, +, =, (, and ) with equivalent Unicode superscript chars (ex ¹).
259
        """
260
        return "".join(dict(zip("0123456789-+=()", "⁰¹²³⁴⁵⁶⁷⁸⁹⁻⁺⁼⁽⁾")).get(c, c) for c in s)
261
262
    def replace_digits_with_subscript_chars(self: Self, s: str | float) -> str:
263
        """
264
        Replaces digits, +, =, (, and ) with equivalent Unicode subscript chars (ex ₁).
265
        """
266
        return "".join(dict(zip("0123456789+-=()", "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎")).get(c, c) for c in s)
267
268
    def replace_superscript_chars_with_digits(self: Self, s: str | float) -> str:
269
        """
270
        Replaces Unicode superscript digits, +, =, (, and ) with normal chars.
271
        """
272
        return "".join(dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹⁻⁺⁼⁽⁾", "0123456789-+=()")).get(c, c) for c in s)
273
274
    def replace_subscript_chars_with_digits(self: Self, s: str | float) -> str:
275
        """
276
        Replaces Unicode superscript digits, +, =, (, and ) with normal chars.
277
        """
278
        return "".join(dict(zip("₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎", "0123456789+-=()")).get(c, c) for c in s)
279
280
    def pretty_float(self: Self, v: float | int, n_sigfigs: int | None = 5) -> str:
281
        """
282
        Represents a float as a string, with symbols for NaN and infinity.
283
        The returned string always has a minus or + prepended. Strip off the plus with `.lstrip('+')`.
284
        If v is an integer (by isinstance), makes sure to display without a decimal point.
285
        If `n_sigfigs < 2`, will never have a
286
287
        For example:
288
            - StringTools.pretty_float(.2222222)       # '+0.22222'
289
            - StringTools.pretty_float(-.2222222)      # '-0.22222' (Unicode minus)
290
            - StringTools.pretty_float(-float('inf'))  # '-∞'
291
            - StringTools.pretty_float(np.NaN)         # 'NaN'
292
        """
293
        # TODO this seems absurdly long for what it does
294
        if n_sigfigs is None or n_sigfigs < 1:
295
            msg = f"Sigfigs of {n_sigfigs} is nonpositive"
296
            raise ValueOutOfRangeError(
297
                msg,
298
                value=n_sigfigs,
299
                minimum=1,
300
            )
301
        # first, handle NaN and infinities
302
        if v == float("-Inf"):
303
            return "-∞"
304
        if v == float("Inf"):
305
            return "+∞"
306
        elif not isinstance(v, str) and str(v) in ["nan", "na", "NaN"]:
307
            return "NaN"
308
        elif not isinstance(v, str) and str(v) == "NaT":
309
            return "NaT"
310
        # sweet. it's a regular float or int.
311
        if n_sigfigs is None:
312
            s = str(v).removesuffix(".0")
313
        else:
314
            # yes, this is weird. we need to convert from str to float then back to str
315
            s = str(float(str(("%." + str(n_sigfigs) + "g") % v)))
316
        # remove the .0 if the precision doesn't support it
317
        # if v >= 1 and n_sigfigs<2, it couldn't have a decimal
318
        # and if n_sigfigs<1, it definitely can't
319
        # and ... %g does this.
320
        if isinstance(v, int) or n_sigfigs is not None and n_sigfigs < 2:
321
            s = s.removesuffix(".0")
322
        # prepend + or - (unless 0)
323
        if float(s) == 0.0:
324
            return s
325
        s = s.replace("-", "-")
326
        if not s.startswith("-"):
327
            s = "+" + s[1:]
328
        if len(s) > 1 and s[1] == ".":
329
            s = s[0] + "0." + s[2:]
330
        return s
331
332
    def pretty_function(self: Self, function: Callable, *, with_addr: bool = False) -> str:
333
        n_args = str(function.__code__.co_argcount) if hasattr(function, "__code__") else "?"
334
        pat = re.compile(r"^<bound method [^ .]+\.([^ ]+) of (.+)>$")
335
        boundmatch = pat.fullmatch(str(function))
336
        addr = " @ " + hex(id(function)) if with_addr else ""
337
        # if isinstance(function, FunctionType):
338
        #    # simplify lambda functions!
339
        #    return "⟨" + "λ(" + n_args + ")" + addr + "⟩"
340
        if boundmatch is not None:
341
            # it's a method (bound function)
342
            # don't show the address of the instance AND its method
343
            pat = re.compile(r"@ ?0x[0-9a-hA-H]+\)?$")
344
            s = pat.sub("", boundmatch.group(2)).strip()
345
            return "⟨" + "`" + s + "`." + boundmatch.group(1) + "(" + n_args + ")" + addr + "⟩"
346
        elif callable(function):
347
            # it's an actual function
348
            name = function.__name__
349
            if name is None:
350
                return "⟨<fn>" + addr + "⟩"
351
            if name == "<lambda>":
352
                return "⟨λ" + addr + "⟩"
353
            return "⟨" + function.__name__ + addr + "⟩"
354
        msg = f"Wrong type {type(function)} for '{function}"
355
        raise ValueIllegalError(msg, value=type(function).__name__)
356
357
    def pretty_object(self: Self, thing: Any, *, with_addr: bool = False) -> str:
358
        """
359
        Get a better and shorter name for a function than str(function).
360
        Ex: `pprint_function(lambda s: s)  == '<λ>'`
361
362
        - Instead of '<bound method ...', you'll get '<name(nargs)>'
363
        - Instead of 'lambda ...', you'll get '<λ(nargs)>'
364
        - etc.
365
366
        Note:
367
          - If function is None, returns '⌀'
368
          - If function does not have __name__, returns prefix + type(function) + <address> + suffix
369
          - If it's a primitive, returns str(function)
370
371
        Args:
372
            thing: Can be anything, but especially useful for functions
373
            with_addr: Include `@ hex-mem-addr` in the name
374
        """
375
        addr = " @ " + hex(id(thing)) if with_addr else ""
376
        pat = re.compile(r"<([A-Za-z0-9_.<>]+)[ ']*object")
377
        objmatch = pat.search(str(thing))  # instance of global or local class
378
        if thing is None:
379
            return "⌀"
380
        if isinstance(thing, type):
381
            # it's a class
382
            return "⟨" + "type:" + thing.__name__ + "⟩"
383
        elif callable(thing):
384
            return self.pretty_function(thing, with_addr=with_addr)
385
        elif hasattr(thing, "__dict__") and len(thing.__dict__) > 0:
386
            # it's a member with attributes
387
            # it's interesting enough that it may have a good __str__
388
            # strip prefix and suffix because we'll re-add it
389
            s = str(thing).removeprefix("⟨").removesuffix("⟩")
390
            return "⟨" + s + addr + "⟩"
391
        elif objmatch is not None:
392
            # it's an instance without attributes
393
            s = objmatch.group(1)
394
            if "." in s:
395
                s = s[s.rindex(".") + 1 :]
396
            return "⟨" + s + addr + "⟩"
397
        # it's a primitive, etc
398
        return str(thing)
399
400
    def greek_chars_to_letter_names(self: Self) -> Mapping[str, str]:
        """
        Builds a new dict that maps each Greek Unicode char (lowercase and uppercase) to its full name.
        """
        return {char: name for char, name in StringTools._greek_alphabet.items()}
405
406
    def greek_letter_names_to_chars(self: Self) -> Mapping[str, str]:
        """
        Builds a new dict that maps each Greek letter name (lowercase and uppercase) to its Unicode char.
        """
        table = StringTools._greek_alphabet
        return dict(zip(table.values(), table.keys()))
411
412
    def replace_greek_letter_names_with_chars(self: Self, s: str, lowercase: bool = False) -> str:
413
        """
414
        Replaces Greek letter names with their Unicode equivalents.
415
        Does this correctly by replacing superstrings before substrings.
416
        Ex: '1-beta' is '1-β' rather than '1-bη'
417
        If lowercase is True: Replaces Beta, BeTa, and BETA with β
418
        Else: Replaces Beta with a capital Greek Beta and ignores BETA and BeTa.
419
        """
420
        # Clever if I may say so:
421
        # If we just sort from longest to shortest, we can't replace substrings by accident
422
        # For example we'll replace 'beta' before 'eta', so '1-beta' won't become '1-bη'
423
        greek = sorted(
424
            [(v, k) for k, v in StringTools._greek_alphabet.items()],
425
            key=lambda t: -len(t[1]),
426
        )
427
        for k, v in greek:
428
            if k[0].isupper() and lowercase:
429
                continue
430
            s = re.compile(k | regex.IGNORECASE).sub(v, s) if lowercase else s.replace(k, v)
431
        return s
432
433
    def dict_to_compact_str(self: Self, seq: Mapping[K_contra, V_co], *, eq: str = "=", sep: str = ", ") -> str:
434
        return self.dict_to_str(seq, sep=sep, eq=eq)
435
436
    def dict_to_quote_str(self: Self, seq: Mapping[K_contra, V_co], *, eq: str = ": ", sep: str = "; ") -> str:
437
        return self.dict_to_str(seq, sep=sep, eq=eq, prefix="'", suffix="'")
438
439
    def dict_to_str(
440
        self: Self,
441
        seq: Mapping[K_contra, V_co],
442
        *,
443
        sep: str = "\t",
444
        eq: str = "=",
445
        prefix: str = "",
446
        suffix: str = "",
447
    ) -> str:
448
        """
449
        Joins dict elements into a str like 'a=1, b=2, c=3`.
450
        Won't break with ValueError if the keys or values aren't strs.
451
452
        Args:
453
            seq: Dict-like, with `items()`
454
            sep: Delimiter
455
            eq: Separates a key with its value
456
            prefix: Prepend before every key
457
            suffix: Append after every value
458
        """
459
        return sep.join([prefix + str(k) + eq + str(v) + suffix for k, v in seq.items()])
460
461
    # Maps each Greek Unicode char to its English letter name.
    # The 24 uppercase letters come first (capitalized names), then the 24 lowercase letters.
    # There is no entry for U+03C2 (final sigma).
    _greek_alphabet = {
        "\u0391": "Alpha",
        "\u0392": "Beta",
        "\u0393": "Gamma",
        "\u0394": "Delta",
        "\u0395": "Epsilon",
        "\u0396": "Zeta",
        "\u0397": "Eta",
        "\u0398": "Theta",
        "\u0399": "Iota",
        "\u039A": "Kappa",
        "\u039B": "Lambda",
        "\u039C": "Mu",
        "\u039D": "Nu",
        "\u039E": "Xi",
        "\u039F": "Omicron",
        "\u03A0": "Pi",
        "\u03A1": "Rho",
        "\u03A3": "Sigma",
        "\u03A4": "Tau",
        "\u03A5": "Upsilon",
        "\u03A6": "Phi",
        "\u03A7": "Chi",
        "\u03A8": "Psi",
        "\u03A9": "Omega",
        "\u03B1": "alpha",
        "\u03B2": "beta",
        "\u03B3": "gamma",
        "\u03B4": "delta",
        "\u03B5": "epsilon",
        "\u03B6": "zeta",
        "\u03B7": "eta",
        "\u03B8": "theta",
        "\u03B9": "iota",
        "\u03BA": "kappa",
        "\u03BB": "lambda",
        "\u03BC": "mu",
        "\u03BD": "nu",
        "\u03BE": "xi",
        "\u03BF": "omicron",
        "\u03C0": "pi",
        "\u03C1": "rho",
        "\u03C3": "sigma",
        "\u03C4": "tau",
        "\u03C5": "upsilon",
        "\u03C6": "phi",
        "\u03C7": "chi",
        "\u03C8": "psi",
        "\u03C9": "omega",
    }
511
512
513
# Shared module-level instance of `StringUtils`; several methods of the class refer to it by this name.
StringTools = StringUtils()
514