pocketutils.tools.string_tools.StringTools.pretty_repr() - Code Metrics - Inspection of "feat: revamp" - dmyersturnbull/pocketutils - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( ed7d21...87238c )

by Douglas

created 2023-08-26 01:04 UTC

StringTools.pretty_repr() C

↳ Parent: pocketutils.tools.string_tools

Complexity

Conditions

Size

Total Lines	45
Code Lines	24

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	24
dl	0
loc	45
rs	6.6666
c	0
b	0
f	0
cc	9
nop	4

import re
from collections.abc import ByteString, Callable, Iterable, Mapping, Sequence
from typing import Any, Self, TypeVar

import orjson
import regex

from pocketutils.core.exceptions import OutOfRangeError, XTypeError, XValueError

# Generic placeholders used in the annotations of the helpers below.
T = TypeVar("T")
V = TypeVar("V")
# Compiled once at import time; matches one character with the Unicode property `\p{C}`.
_control_chars = regex.compile(r"\p{C}", flags=regex.V1)


def is_true_iterable(s: Any) -> bool:
    """
    Returns whether `s` is iterable but is not a text or bytes-like value.

    Args:
        s: Any object, including None

    Returns:
        True for iterables such as lists, tuples, dicts, sets, and generators;
        False for `str`, `bytes`, `bytearray`, None, and non-iterables.
    """
    # `collections.abc.ByteString` is deprecated and scheduled for removal;
    # `bytes` and `bytearray` are the builtin types it covered.
    if isinstance(s, (str, bytes, bytearray)):
        return False
    # None is not an Iterable, so no separate None check is needed
    return isinstance(s, Iterable)


def _is_lambda(function: Any) -> bool:
    """
    Returns whether `function` looks like a lambda expression.

    An object counts as a lambda if it has the same type and `__name__` as a real lambda,
    or if its `str()` has the form `<function <lambda> at ...>`.
    Objects without a `__name__` attribute are never lambdas.
    """
    if not hasattr(function, "__name__"):
        return False  # not a function
    reference = lambda: 0  # noqa: E731
    if isinstance(function, type(reference)) and function.__name__ == reference.__name__:
        return True
    # fall back to the textual form
    return str(function).startswith("<function <lambda> at ") and str(function).endswith(">")


class StringTools:
    @classmethod
    def pretty_dict(cls: type[Self], dct: Mapping[Any, Any]) -> str:
        """
        Serializes a mapping as indented JSON text.

        The mapping is dumped with `orjson` using 2-space indentation,
        and the resulting bytes are decoded as UTF-8.
        Values that `orjson` cannot serialize make the call fail.

        Args:
            dct: The mapping to render

        Returns:
            The JSON text
        """
        encoded = orjson.dumps(dct, option=orjson.OPT_INDENT_2)
        return encoded.decode(encoding="utf-8")

    @classmethod
    def extract_group(
        cls: type[Self],
        pattern: str | re.Pattern | regex.Pattern,
        value: str | None,
        *,
        group: int = 0,
    ) -> str | None:
        """
        Extracts a capture group from a regex full-match.
        Returns None if there was no match or if `value` is None.
        **Always** uses https://pypi.org/project/regex with `flags=regex.V1`.

        Note:
            A string, a stdlib `re.Pattern`, or a `regex.Pattern` compiled without V1
            is recompiled from its pattern text with `flags=regex.V1` only;
            any other flags the compiled pattern carried are not transferred.

        Args:
            pattern: Regex pattern
            value: The target string, or None
            group: The group number

        Returns:
            The capture group, or None
        """
        if value is None:
            # the signature permits None; there is nothing to match against
            return None
        if isinstance(pattern, str):
            pattern = regex.compile(pattern, flags=regex.V1)
        elif isinstance(pattern, re.Pattern) or (
            isinstance(pattern, regex.Pattern) and not pattern.flags & regex.V1
        ):
            pattern = regex.compile(pattern.pattern, flags=regex.V1)
        match = pattern.fullmatch(value)
        if match is None:
            return None
        return match.group(group)

    @classmethod
    def join_to_str(cls: type[Self], *items: Any, last: str, sep: str = ", ") -> str:
        """
        Joins items to something like "cat, dog, and pigeon" or "cat, dog, or pigeon".

        Args:
            *items: Items to join; `str(item) for item in items` will be used
            last: Probably "and", "or", "and/or", or ""
                    Spaces are added/removed as needed if `suffix` is alphanumeric
                    or "and/or", after stripping whitespace off the ends.
            sep: Used to separate all words; include spaces as desired

        Examples:
            - `join_to_str(["cat", "dog", "elephant"], last="and")  # cat, dog, and elephant`
            - `join_to_str(["cat", "dog"], last="and")  # cat and dog`
            - `join_to_str(["cat", "dog", "elephant"], last="", sep="/")  # cat/dog/elephant`
        """
        if last.strip().isalpha() or last.strip() == "and/or":
            last = last.strip() + " "
        items = [str(s).strip("'" + '"' + " ") for s in items]
        if len(items) > 2:
            return sep.join(items[:-1]) + sep + last + items[-1]
        else:
            return (" " + last + " ").join(items)

    @classmethod
    def strip_control_chars(cls: type[Self], s: str) -> str:
        """
        Removes every character matched by the module-level `_control_chars` pattern.

        That pattern is `\\p{C}`, i.e. the whole Unicode "C" (Other) general category,
        not only the 'Cc' subcategory.
        NOTE(review): confirm whether only 'Cc' was intended.
        """
        return _control_chars.sub("", s)

    @classmethod
    def roman_to_arabic(
        cls: type[Self],
        roman: str,
        min_val: int | None = None,
        max_val: int | None = None,
    ) -> int:
        """
        Converts roman numerals to an integer.

        Args:
            roman: A string like "MCIV"
            min_val: Raise a ValueError if the parsed value is less than this
            max_val: Raise a ValueError if the parsed value is more than this

        Returns:
            The arabic numeral as a Python int
        """
        # this order is IMPORTANT!
        mp = {
            "IV": 4,
            "IX": 9,
            "XL": 40,
            "XC": 90,
            "CD": 400,
            "CM": 900,
            "I": 1,
            "V": 5,
            "X": 10,
            "L": 50,
            "C": 100,
            "D": 500,
            "M": 1000,
        }
        for k, v in mp.items():
            roman = roman.replace(k, str(v))
        # it'll just error if it's empty
        try:
            value = sum(int(num) for num in roman)
        except (ValueError, StopIteration):
            msg = f"Cannot parse roman numerals '{roman}'"
            raise XValueError(msg, value=roman)
        if min_val is not None and value < min_val or max_val is not None and value > max_val:
            msg = f"Value {roman} (int={value}) is out of range ({min_val}, {max_val})"
            raise XValueError(
                msg,
                value=roman,
            )
        return value

    @classmethod
    def retab(cls: type[Self], s: str, n_spaces: int) -> str:
        """
        Rewrites leading runs of spaces on each line as tabs.

        Every full group of `n_spaces` leading spaces becomes one tab;
        any leftover spaces (fewer than `n_spaces`) are kept after the tabs.

        Args:
            s: The string to convert
            n_spaces: A tab is this number of spaces
        """

        def to_tabs(match):
            width = len(match.group(1))
            n_tabs, leftover = divmod(width, n_spaces)
            return "\t" * n_tabs + " " * leftover

        # MULTILINE makes ^ anchor at the start of every line
        return regex.sub("^( +)", to_tabs, s, flags=regex.V1 | regex.MULTILINE)

    @classmethod
    def strip_empty_decimal(cls: type[Self], num: float | str) -> str:
        """
        Replaces prefix . with 0. and strips trailing .0 and trailing .
        """
        try:
            float(num)
        except TypeError:
            if not isinstance(num, str):
                msg = "Must be either str or float-like"
                raise TypeError(msg) from None
        t = str(num)
        if t.startswith("."):
            t = "0" + t
        if "." in t:
            return t.rstrip("0").rstrip(".")
        else:
            return t

    @classmethod
    def tabs_to_list(cls: type[Self], s: str) -> Sequence[str]:
        """
        Splits a string on tabs, except for tabs inside single- or double-quoted substrings.

        From each resulting field, at most one quote character is removed from each end,
        and then surrounding whitespace is stripped.
        """
        field = regex.compile(r"""((?:[^\t"']|"[^"]*"|'[^']*')+)""", flags=regex.V1)
        quote_chars = ('"', "'")
        cleaned = []
        for raw in field.findall(s):
            # Remove only one quote per side: ""55"" should become "55", not 55
            if raw.endswith(quote_chars):
                raw = raw[:-1]
            if raw.startswith(quote_chars):
                raw = raw[1:]
            cleaned.append(raw.strip())
        return cleaned

    @classmethod
    def truncate(
        cls: type[Self],
        s: str | None,
        n: int = 40,
        *,
        null: str | None = None,
    ) -> str | None:
        """
        Truncates a string and adds ellipses, if needed.

        Returns a string if it has `n` or fewer characters;
        otherwise truncates to length `n-1` and appends `…` (UTF character).
        If `s` is None and `always_dots` is True, returns `n` copies of `.` (as a string).
        If `s` is None otherwise, returns None.

        Args:
            s: The string
            n: The maximum length, inclusive
            null: Replace `None` with this string

        Returns:
            A string or None
        """
        if s is None:
            return null
        if len(s) > n:
            nx = max(0, n - 1)
            return s[:nx] + "…"
        return s

    # these are provided to avoid having to call with labdas or functools.partial
    @classmethod
    def truncating(
        cls: type[Self],
        n: int = 40,
        always_dots: bool = False,
        *,
        null: str | None = None,
    ) -> Callable[[str], str]:
        # pretty much functools.partial
        def trunc(s: str) -> str:
            return cls.truncate(s, n, null=null)

        trunc.__name__ = f"truncate({n},{'…' if always_dots else ''})"
        return trunc

    @classmethod
    def longest(cls: type[Self], parts: Iterable[T]) -> T:
        """
        Returns an element with the highest `len`.
        """
        mx = ""
        for _i, x in enumerate(parts):
            if len(x) > len(mx):
                mx = x
        return mx

    @classmethod
    def strip_any_ends(
        cls: type[Self],
        s: str,
        prefixes: str | Sequence[str],
        suffixes: str | Sequence[str],
    ) -> str:
        """
        Flexible variant that strips any number of prefixes and any number of suffixes.
        Also less type-safe than more specific variants.
        Note that the order of the prefixes (or suffixes) DOES matter:
        each one is tried once, in order, against the string as left by the previous ones.
        Empty prefixes and suffixes are ignored.

        Args:
            s: The string to strip; `str(s)` is used
            prefixes: A single prefix or a sequence of prefixes
            suffixes: A single suffix or a sequence of suffixes

        Returns:
            The stripped string
        """
        prefix_list = [str(z) for z in prefixes] if is_true_iterable(prefixes) else [str(prefixes)]
        suffix_list = [str(z) for z in suffixes] if is_true_iterable(suffixes) else [str(suffixes)]
        text = str(s)
        for pre in prefix_list:
            text = text.removeprefix(pre)
        for suf in suffix_list:
            # removesuffix("") is a no-op, whereas text[: -len("")] would wipe the whole string
            text = text.removesuffix(suf)
        return text

    @classmethod
    def strip_brackets(cls: type[Self], text: str) -> str:
        """
        Strips any and all pairs of brackets from start and end of a string, but only if they're paired.

        See Also:
             strip_paired
        """
        pieces = [
            "()",
            "[]",
            "[]",
            "{}",
            "<>",
            "⦗⦘",
            "⟨⟩",
            "⸨⸩",
            "⟦〛",
            "《》",
            "〘〙",
        ]
        return StringTools.strip_paired(text, pieces)

    @classmethod
    def strip_quotes(cls: type[Self], text: str) -> str:
        """
        Strips any and all pairs of quotes from start and end of a string, but only if they're paired.

        See Also:
            strip_paired
        """
        pieces = [
            "`",
            "``",
            "”“",
            "''",
            '""',
        ]
        return StringTools.strip_paired(text, pieces)

    @classmethod
    def strip_brackets_and_quotes(cls: type[Self], text: str) -> str:
        """
        Strips any and all pairs of brackets and quotes from start and end of a string, but only if they're paired.

        See Also:
            strip_paired
        """
        pieces = [
            "()",
            "[]",
            "[]",
            "{}",
            "<>",
            "⦗⦘",
            "⟨⟩",
            "⸨⸩",
            "⟦〛",
            "《》",
            "〘〙",
            "`",
            "``",
            "”“",
            "''",
            '""',
        ]
        return StringTools.strip_paired(text, pieces)

    @classmethod
    def strip_paired(cls: type[Self], text: str, pieces: Iterable[tuple[str, str] | str]) -> str:
        """
        Strips pairs of (start, end) from the ends of strings.

        Example:
            .. code-block::
                StringTools.strip_paired("[(abc]", [("()"), ("[]"))  # returns "(abc"

        See Also:
            strip_brackets
        """
        if any(a for a in pieces if len(a) != 2):
            msg = f"Each item must be a string of length 2: (stard, end); got {pieces}"
            raise XValueError(
                msg,
                value=str(pieces),
            )
        text = str(text)
        while len(text) > 0:
            yes = False
            for a, b in pieces:
                while text.startswith(a) and text.endswith(b):
                    text = text[1:-1]
                    yes = True
            if not yes:
                break
        return text

    @classmethod
    def superscript(cls: type[Self], s: str | float) -> str:
        """
        Replaces digits, +, =, (, and ) with equivalent Unicode superscript chars (ex ¹).
        """
        return "".join(dict(zip("0123456789-+=()", "⁰¹²³⁴⁵⁶⁷⁸⁹⁻⁺⁼⁽⁾")).get(c, c) for c in s)

    @classmethod
    def subscript(cls: type[Self], s: str | float) -> str:
        """
        Replaces digits, +, =, (, and ) with equivalent Unicode subscript chars (ex ₁).
        """
        return "".join(dict(zip("0123456789+-=()", "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎")).get(c, c) for c in s)

    @classmethod
    def unsuperscript(cls: type[Self], s: str | float) -> str:
        """
        Replaces Unicode superscript digits, +, =, (, and ) with normal chars.
        """
        return "".join(dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹⁻⁺⁼⁽⁾", "0123456789-+=()")).get(c, c) for c in s)

    @classmethod
    def unsubscript(cls: type[Self], s: str | float) -> str:
        """
        Replaces Unicode superscript digits, +, =, (, and ) with normal chars.
        """
        return "".join(dict(zip("₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎", "0123456789+-=()")).get(c, c) for c in s)

    @classmethod
    def pretty_float(cls: type[Self], v: float | int, n_sigfigs: int | None = 5) -> str:
        """
        Represents a float as a string, with symbols for NaN and infinity.
        The returned string always has a minus or + prepended. Strip off the plus with .lstrip('+').
        If v is an integer (by isinstance), makes sure to display without a decimal point.
        If n_sigfigs < 2, will never have a
        For ex:
            - StringTools.pretty_float(.2222222)       # '+0.22222'
            - StringTools.pretty_float(-.2222222)      # '-0.22222' (Unicode minus)
            - StringTools.pretty_float(-float('inf'))  # '-∞'
            - StringTools.pretty_float(np.NaN)         # 'NaN'
        """
        # TODO this seems absurdly long for what it does
        if n_sigfigs is None or n_sigfigs < 1:
            msg = f"Sigfigs of {n_sigfigs} is nonpositive"
            raise OutOfRangeError(
                msg,
                value=n_sigfigs,
                minimum=1,
            )
        # first, handle NaN and infinities
        if str(v) in {"nan", "NaN"}:
            return "⌀"
        if v == float("-Inf"):
            return "-∞"
        if v == float("Inf"):
            return "+∞"
        elif not isinstance(v, str) and str(v) in ["nan", "na", "NaN"]:
            return "NaN"
        elif not isinstance(v, str) and str(v) == "NaT":
            return "NaT"
        # sweet. it's a regular float or int.
        if n_sigfigs is None:
            s = cls.strip_empty_decimal(str(v))
        else:
            # yes, this is weird. we need to convert from str to float then back to str
            s = str(float(str(("%." + str(n_sigfigs) + "g") % v)))
        # remove the .0 if the precision doesn't support it
        # if v >= 1 and n_sigfigs<2, it couldn't have a decimal
        # and if n_sigfigs<1, it definitely can't
        # and ... %g does this.
        if isinstance(v, int) or n_sigfigs is not None and n_sigfigs < 2:
            s = cls.strip_empty_decimal(s)
        # prepend + or - (unless 0)
        if float(s) == 0.0:
            return s
        s = s.replace("-", "-")
        if not s.startswith("-"):
            s = "+" + s[1:]
        if len(s) > 1 and s[1] == ".":
            s = s[0] + "0." + s[2:]
        return s

    @classmethod
    def pretty_function(
        cls: type[Self],
        function: Callable,
        *,
        with_address: bool = False,
    ) -> str:
        """
        Returns a short display name for a callable, wrapped in ⟨ ⟩.

        - Lambdas become `⟨λ(nargs)⟩`
        - Bound methods become ⟨`owner`.name(nargs)⟩, with any trailing address removed from the owner
        - Other callables become `⟨name⟩`, using `__name__` if present and the type name otherwise

        `nargs` is `__code__.co_argcount` if the callable has `__code__`, and `?` otherwise.

        Args:
            function: The callable to describe
            with_address: Include `@ hex-mem-addr` in the name

        Returns:
            The display name

        Raises:
            XTypeError: If `function` is not callable
        """
        n_args = str(function.__code__.co_argcount) if hasattr(function, "__code__") else "?"
        pat = regex.compile(r"^<bound method [^ .]+\.([^ ]+) of (.+)>$", flags=regex.V1)
        boundmatch = pat.fullmatch(str(function))
        addr = " @ " + hex(id(function)) if with_address else ""
        if _is_lambda(function):
            # simplify lambda functions!
            return "⟨" + "λ(" + n_args + ")" + addr + "⟩"
        if boundmatch is not None:
            # it's a method (bound function)
            # don't show the address of the instance AND its method
            pat = regex.compile(r"@ ?0x[0-9a-hA-H]+\)?$", flags=regex.V1)
            owner = pat.sub("", boundmatch.group(2)).strip()
            return "⟨" + "`" + owner + "`." + boundmatch.group(1) + "(" + n_args + ")" + addr + "⟩"
        if callable(function):
            # callables such as functools.partial objects have no __name__; use the type name
            name = getattr(function, "__name__", None) or type(function).__name__
            return "⟨" + name + addr + "⟩"
        msg = f"Wrong type {type(function)} for '{function}'"
        raise XTypeError(msg, actual=type(function).__name__)

    @classmethod
    def pretty_repr(
        cls: type[Self],
        function: Any,
        *,
        with_address: bool = False,
    ) -> str:
        """
        Gets a shorter, more readable name for an object than `str(function)`.

        The first matching rule applies:
        1. None gives '⌀'
        2. A class gives '⟨type:Name⟩'
        3. Any other callable is delegated to `pretty_function`
        4. An object with a non-empty `__dict__` gives its `str()`, wrapped in ⟨ ⟩
        5. An object whose `str()` looks like '<pkg.Name object ...>' gives '⟨Name⟩'
        6. Anything else gives `str(function)`

        Args:
            function: Can be anything, but especially useful for functions
            with_address: Include `@ hex-mem-addr` in the name (rules 4 and 5; passed on for rule 3)
        """
        addr = " @ " + hex(id(function)) if with_address else ""
        obj_pattern = regex.compile(r"<([A-Za-z0-9_.<>]+)[ ']*object", flags=regex.V1)
        objmatch = obj_pattern.search(str(function))  # instance of global or local class
        if function is None:
            return "⌀"
        if isinstance(function, type):
            # it's a class
            return f"⟨type:{function.__name__}⟩"
        if callable(function):
            return cls.pretty_function(function, with_address=with_address)
        if hasattr(function, "__dict__") and len(function.__dict__) > 0:
            # it has attributes, so it may well have a meaningful __str__;
            # drop any existing ⟨ ⟩ because they are re-added below
            inner = str(function).removeprefix("⟨").removesuffix("⟩")
            return "⟨" + inner + addr + "⟩"
        if objmatch is None:
            # it's a primitive, etc
            return str(function)
        # it's an instance without attributes: keep only the unqualified class name
        short = objmatch.group(1).rpartition(".")[2]
        return "⟨" + short + addr + "⟩"

    @classmethod
    def greek_to_name(cls: type[Self]) -> Mapping[str, str]:
        """
        Returns a dict from Greek lowercase+uppercase Unicode chars to their full names.
        """
        return dict(StringTools._greek_alphabet)

    @classmethod
    def name_to_greek(cls: type[Self]) -> Mapping[str, str]:
        """
        Returns a dict from Greek lowercase+uppercase letter names to their Unicode chars.
        """
        return {v: k for k, v in StringTools._greek_alphabet.items()}

    @classmethod
    def fix_greek(cls: type[Self], s: str, lowercase: bool = False) -> str:
        """
        Replaces Greek letter names with their Unicode equivalents.
        Does this correctly by replacing superstrings before substrings.
        Ex: '1-beta' is '1-β' rather than '1-bη'
        If lowercase is True: Replaces Beta, BeTa, and BETA with β
        Else: Replaces Beta with a capital Greek Beta and ignores BETA and BeTa.
        """
        # Clever if I may say so:
        # If we just sort from longest to shortest, we can't replace substrings by accident
        # For example we'll replace 'beta' before 'eta', so '1-beta' won't become '1-bη'
        greek = sorted(
            [(v, k) for k, v in StringTools._greek_alphabet.items()],
            key=lambda t: -len(t[1]),
        )
        for k, v in greek:
            if k[0].isupper() and lowercase:
                continue
            s = regex.compile(k, flags=regex.V1 | regex.IGNORECASE).sub(v, s) if lowercase else s.replace(k, v)
        return s

    @classmethod
    def join(
        cls: type[Self],
        seq: Iterable[T],
        *,
        sep: str = "\t",
        attr: str | None = None,
        prefix: str = "",
        suffix: str = "",
    ) -> str:
        """
        Join elements into a str more easily than ''.join. Just simplifies potentially long expressions.
        Won't break with ValueError if the elements aren't strs.

        Example:
            ``python
                - StringTools.join([1,2,3])  # "1    2    3"
                - StringTools.join(cars, sep=',', attr='make', prefix="(", suffix=")")`  # "(Ford),(Ford),(BMW)"
            ``

        Args:
            seq: Sequence of elements
            sep: Delimiter
            attr: Get this attribute from each element (in `seq`), or use the element itself if None
            prefix: Prefix before each item
            suffix: Suffix after each item

        Returns:
            A string
        """
        if attr is None:
            return sep.join([prefix + str(s) + suffix for s in seq])
        else:
            return sep.join([prefix + str(getattr(s, attr)) + suffix for s in seq])

    @classmethod
    def join_kv_neat(cls: type[Self], seq: Mapping[T, V], *, eq: str = "=", sep: str = ", ") -> str:
        return cls.join_kv(seq, sep=sep, eq=eq)

    @classmethod
    def join_kv_spaced(cls: type[Self], seq: Mapping[T, V], *, eq: str = ": ", sep: str = "; ") -> str:
        return cls.join_kv(seq, sep=sep, eq=eq)

    @classmethod
    def join_kv_quoted(cls: type[Self], seq: Mapping[T, V], *, eq: str = ": ", sep: str = "; ") -> str:
        return cls.join_kv(seq, sep=sep, eq=eq, prefix="'", suffix="'")

    @classmethod
    def join_kv(
        cls: type[Self],
        seq: Mapping[T, V],
        *,
        sep: str = "\t",
        eq: str = "=",
        prefix: str = "",
        suffix: str = "",
    ) -> str:
        """
        Joins dict elements into a str like 'a=1, b=2, c=3`.
        Won't break with ValueError if the keys or values aren't strs.

        Args:
            seq: Dict-like, with `items()`
            sep: Delimiter
            eq: Separates a key with its value
            prefix: Prepend before every key
            suffix: Append after every value

        Returns:
            A string
        """
        return sep.join([prefix + str(k) + eq + str(v) + suffix for k, v in seq.items()])

    # Maps each Greek letter (as a Unicode escape) to its English name.
    # The 24 uppercase letters come first, with capitalized names, followed by the
    # 24 lowercase letters, with lowercase names.
    _greek_alphabet = {
        "\u0391": "Alpha",
        "\u0392": "Beta",
        "\u0393": "Gamma",
        "\u0394": "Delta",
        "\u0395": "Epsilon",
        "\u0396": "Zeta",
        "\u0397": "Eta",
        "\u0398": "Theta",
        "\u0399": "Iota",
        "\u039A": "Kappa",
        "\u039B": "Lambda",
        "\u039C": "Mu",
        "\u039D": "Nu",
        "\u039E": "Xi",
        "\u039F": "Omicron",
        "\u03A0": "Pi",
        "\u03A1": "Rho",
        "\u03A3": "Sigma",
        "\u03A4": "Tau",
        "\u03A5": "Upsilon",
        "\u03A6": "Phi",
        "\u03A7": "Chi",
        "\u03A8": "Psi",
        "\u03A9": "Omega",
        "\u03B1": "alpha",
        "\u03B2": "beta",
        "\u03B3": "gamma",
        "\u03B4": "delta",
        "\u03B5": "epsilon",
        "\u03B6": "zeta",
        "\u03B7": "eta",
        "\u03B8": "theta",
        "\u03B9": "iota",
        "\u03BA": "kappa",
        "\u03BB": "lambda",
        "\u03BC": "mu",
        "\u03BD": "nu",
        "\u03BE": "xi",
        "\u03BF": "omicron",
        "\u03C0": "pi",
        "\u03C1": "rho",
        "\u03C3": "sigma",
        "\u03C4": "tau",
        "\u03C5": "upsilon",
        "\u03C6": "phi",
        "\u03C7": "chi",
        "\u03C8": "psi",
        "\u03C9": "omega",
    }


# Only StringTools is exported by `from <module> import *`.
__all__ = ["StringTools"]


1			import re
2			from collections.abc import ByteString, Callable, Iterable, Mapping, Sequence
3			from typing import Any, Self, TypeVar
4
5			import orjson
6			import regex
7
8			from pocketutils.core.exceptions import OutOfRangeError, XTypeError, XValueError
9
10			T = TypeVar("T")
11			V = TypeVar("V")
12			_control_chars = regex.compile(r"\p{C}", flags=regex.V1)
13
14
15			def is_true_iterable(s: Any) -> bool:
16			return s is not None and isinstance(s, Iterable) and not isinstance(s, str) and not isinstance(s, ByteString)
17
18
19			def _is_lambda(function: Any) -> bool:
20			# noinspection PyPep8Naming
21			LAMBDA = lambda: 0 # noqa: E731
22			if not hasattr(function, "__name__"):
23			return False # not a function
24			return (
25			isinstance(function, type(LAMBDA))
26			and function.__name__ == LAMBDA.__name__
27			or str(function).startswith("<function <lambda> at ")
28			and str(function).endswith(">")
29			)
30
31
32			class StringTools:
33			@classmethod
34			def pretty_dict(cls: type[Self], dct: Mapping[Any, Any]) -> str:
35			"""
36			Returns a pretty-printed dict, complete with indentation. Will fail on non-JSON-serializable datatypes.
37			"""
38			# return Pretty.condensed(dct)
39			return orjson.dumps(dct, option=orjson.OPT_INDENT_2).decode(encoding="utf-8")
40
41			@classmethod
42			def extract_group(
43			cls: type[Self],
44			pattern: str \| re.Pattern \| regex.Pattern,
45			value: str \| None,
46			*,
47			group: int = 0,
48			) -> str \| None:
49			"""
50			Extracts a capture group from a regex full-match.
51			Returns None if there was no match.
52			Always uses https://pypi.org/project/regex with `flags=regex.V1`.
53
54			Args:
55			pattern: Regex pattern
56			value: The target string
57			group: The group number
58
59			Returns The capture group, or None
60			"""
61			if isinstance(pattern, re.Pattern):
62			pattern = regex.compile(pattern.pattern, flags=regex.V1)
63			elif isinstance(pattern, str):
64			pattern = regex.compile(pattern, flags=regex.V1)
65			elif isinstance(pattern, regex.Pattern) and not pattern.flags & regex.V1:
66			pattern = regex.compile(pattern.pattern, flags=regex.V1)
67			match = pattern.fullmatch(value)
68			if match is None:
69			return None
70			return match.group(group)
71
72			@classmethod
73			def join_to_str(cls: type[Self], *items: Any, last: str, sep: str = ", ") -> str:
74			"""
75			Joins items to something like "cat, dog, and pigeon" or "cat, dog, or pigeon".
76
77			Args:
78			*items: Items to join; `str(item) for item in items` will be used
79			last: Probably "and", "or", "and/or", or ""
80			Spaces are added/removed as needed if `suffix` is alphanumeric
81			or "and/or", after stripping whitespace off the ends.
82			sep: Used to separate all words; include spaces as desired
83
84			Examples:
85			- `join_to_str(["cat", "dog", "elephant"], last="and") # cat, dog, and elephant`
86			- `join_to_str(["cat", "dog"], last="and") # cat and dog`
87			- `join_to_str(["cat", "dog", "elephant"], last="", sep="/") # cat/dog/elephant`
88			"""
89			if last.strip().isalpha() or last.strip() == "and/or":
90			last = last.strip() + " "
91			items = [str(s).strip("'" + '"' + " ") for s in items]
92			if len(items) > 2:
93			return sep.join(items[:-1]) + sep + last + items[-1]
94			else:
95			return (" " + last + " ").join(items)
96
97			@classmethod
98			def strip_control_chars(cls: type[Self], s: str) -> str:
99			"""
100			Strips all characters under the Unicode 'Cc' category.
101			"""
102			return _control_chars.sub("", s)
103
104			@classmethod
105			def roman_to_arabic(
106			cls: type[Self],
107			roman: str,
108			min_val: int \| None = None,
109			max_val: int \| None = None,
110			) -> int:
111			"""
112			Converts roman numerals to an integer.
113
114			Args:
115			roman: A string like "MCIV"
116			min_val: Raise a ValueError if the parsed value is less than this
117			max_val: Raise a ValueError if the parsed value is more than this
118
119			Returns:
120			The arabic numeral as a Python int
121			"""
122			# this order is IMPORTANT!
123			mp = {
124			"IV": 4,
125			"IX": 9,
126			"XL": 40,
127			"XC": 90,
128			"CD": 400,
129			"CM": 900,
130			"I": 1,
131			"V": 5,
132			"X": 10,
133			"L": 50,
134			"C": 100,
135			"D": 500,
136			"M": 1000,
137			}
138			for k, v in mp.items():
139			roman = roman.replace(k, str(v))
140			# it'll just error if it's empty
141			try:
142			value = sum(int(num) for num in roman)
143			except (ValueError, StopIteration):
144			msg = f"Cannot parse roman numerals '{roman}'"
145			raise XValueError(msg, value=roman)
146			if min_val is not None and value < min_val or max_val is not None and value > max_val:
147			msg = f"Value {roman} (int={value}) is out of range ({min_val}, {max_val})"
148			raise XValueError(
149			msg,
150			value=roman,
151			)
152			return value
153
154			@classmethod
155			def retab(cls: type[Self], s: str, n_spaces: int) -> str:
156			"""
157			Converts indentation with spaces to tab indentation.
158
159			Args:
160			s: The string to convert
161			n_spaces: A tab is this number of spaces
162			"""
163
164			def fix(m):
165			n = len(m.group(1)) // n_spaces
166			return "\t" * n + " " * (len(m.group(1)) % n_spaces)
167
168			return regex.sub("^( +)", fix, s, flags=regex.V1 \| regex.MULTILINE)
169
170			@classmethod
171			def strip_empty_decimal(cls: type[Self], num: float \| str) -> str:
172			"""
173			Replaces prefix . with 0. and strips trailing .0 and trailing .
174			"""
175			try:
176			float(num)
177			except TypeError:
178			if not isinstance(num, str):
179			msg = "Must be either str or float-like"
180			raise TypeError(msg) from None
181			t = str(num)
182			if t.startswith("."):
183			t = "0" + t
184			if "." in t:
185			return t.rstrip("0").rstrip(".")
186			else:
187			return t
188
189			@classmethod
190			def tabs_to_list(cls: type[Self], s: str) -> Sequence[str]:
191			"""
192			Splits by tabs, but preserving quoted tabs, stripping quotes.
193			In other words, will not split within a quoted substring.
194			Double and single quotes are handled.
195			"""
196			pat = regex.compile(r"""((?:[^\t"']\|"[^"]"\|'[^']')+)""", flags=regex.V1)
197
198			# Don't strip double 2x quotes: ex ""55"" should be "55", not 55
199			def strip(i: str) -> str:
200			if i.endswith(('"', "'")):
201			i = i[:-1]
202			if i.startswith(('"', "'")):
203			i = i[1:]
204			return i.strip()
205
206			return [strip(i) for i in pat.findall(s)]
207
208			@classmethod
209			def truncate(
210			cls: type[Self],
211			s: str \| None,
212			n: int = 40,
213			*,
214			null: str \| None = None,
215			) -> str \| None:
216			"""
217			Truncates a string and adds ellipses, if needed.
218
219			Returns a string if it has `n` or fewer characters;
220			otherwise truncates to length `n-1` and appends `…` (UTF character).
221			If `s` is None and `always_dots` is True, returns `n` copies of `.` (as a string).
222			If `s` is None otherwise, returns None.
223
224			Args:
225			s: The string
226			n: The maximum length, inclusive
227			null: Replace `None` with this string
228
229			Returns:
230			A string or None
231			"""
232			if s is None:
233			return null
234			if len(s) > n:
235			nx = max(0, n - 1)
236			return s[:nx] + "…"
237			return s
238
239			# these are provided to avoid having to call with labdas or functools.partial
240			@classmethod
241			def truncating(
242			cls: type[Self],
243			n: int = 40,
244			always_dots: bool = False,
245			*,
246			null: str \| None = None,
247			) -> Callable[[str], str]:
248			# pretty much functools.partial
249			def trunc(s: str) -> str:
250			return cls.truncate(s, n, null=null)
251
252			trunc.__name__ = f"truncate({n},{'…' if always_dots else ''})"
253			return trunc
254
255			@classmethod
256			def longest(cls: type[Self], parts: Iterable[T]) -> T:
257			"""
258			Returns an element with the highest `len`.
259			"""
260			mx = ""
261			for _i, x in enumerate(parts):
262			if len(x) > len(mx):
263			mx = x
264			return mx
265
@classmethod
def strip_any_ends(
    cls: type[Self],
    s: str,
    prefixes: str | Sequence[str],
    suffixes: str | Sequence[str],
) -> str:
    """
    Flexible variant that strips any number of prefixes and any number of suffixes.
    Also less type-safe than more specific variants: every argument is converted with `str`.
    Note that the order of the prefixes (or suffixes) DOES matter:
    each one is tried once, in order, against what remains of the string.

    Args:
        s: The string to strip
        prefixes: A single prefix, or several
        suffixes: A single suffix, or several

    Returns:
        The stripped string
    """
    prefixes = [str(z) for z in prefixes] if is_true_iterable(prefixes) else [str(prefixes)]
    suffixes = [str(z) for z in suffixes] if is_true_iterable(suffixes) else [str(suffixes)]
    s = str(s)
    for pre in prefixes:
        s = s.removeprefix(pre)
    for suf in suffixes:
        # removesuffix is a no-op for "", whereas slicing with s[: -len("")] would empty the string
        s = s.removesuffix(suf)
    return s
288
289			@classmethod
290			def strip_brackets(cls: type[Self], text: str) -> str:
291			"""
292			Strips any and all pairs of brackets from start and end of a string, but only if they're paired.
293
294			See Also:
295			strip_paired
296			"""
297			pieces = [
298			"()",
299			"[]",
300			"[]",
301			"{}",
302			"<>",
303			"⦗⦘",
304			"⟨⟩",
305			"⸨⸩",
306			"⟦〛",
307			"《》",
308			"〘〙",
309			]
310			return StringTools.strip_paired(text, pieces)
311
312			@classmethod
313			def strip_quotes(cls: type[Self], text: str) -> str:
314			"""
315			Strips any and all pairs of quotes from start and end of a string, but only if they're paired.
316
317			See Also:
318			strip_paired
319			"""
320			pieces = [
321			"`",
322			"``",
323			"”“",
324			"''",
325			'""',
326			]
327			return StringTools.strip_paired(text, pieces)
328
329			@classmethod
330			def strip_brackets_and_quotes(cls: type[Self], text: str) -> str:
331			"""
332			Strips any and all pairs of brackets and quotes from start and end of a string, but only if they're paired.
333
334			See Also:
335			strip_paired
336			"""
337			pieces = [
338			"()",
339			"[]",
340			"[]",
341			"{}",
342			"<>",
343			"⦗⦘",
344			"⟨⟩",
345			"⸨⸩",
346			"⟦〛",
347			"《》",
348			"〘〙",
349			"`",
350			"``",
351			"”“",
352			"''",
353			'""',
354			]
355			return StringTools.strip_paired(text, pieces)
356
357			@classmethod
358			def strip_paired(cls: type[Self], text: str, pieces: Iterable[tuple[str, str] \| str]) -> str:
359			"""
360			Strips pairs of (start, end) from the ends of strings.
361
362			Example:
363			.. code-block::
364			StringTools.strip_paired("[(abc]", [("()"), ("[]")) # returns "(abc"
365
366			See Also:
367			strip_brackets
368			"""
369			if any(a for a in pieces if len(a) != 2):
370			msg = f"Each item must be a string of length 2: (stard, end); got {pieces}"
371			raise XValueError(
372			msg,
373			value=str(pieces),
374			)
375			text = str(text)
376			while len(text) > 0:
377			yes = False
378			for a, b in pieces:
379			while text.startswith(a) and text.endswith(b):
380			text = text[1:-1]
381			yes = True
382			if not yes:
383			break
384			return text
385
386			@classmethod
387			def superscript(cls: type[Self], s: str \| float) -> str:
388			"""
389			Replaces digits, +, =, (, and ) with equivalent Unicode superscript chars (ex ¹).
390			"""
391			return "".join(dict(zip("0123456789-+=()", "⁰¹²³⁴⁵⁶⁷⁸⁹⁻⁺⁼⁽⁾")).get(c, c) for c in s)
392
393			@classmethod
394			def subscript(cls: type[Self], s: str \| float) -> str:
395			"""
396			Replaces digits, +, =, (, and ) with equivalent Unicode subscript chars (ex ₁).
397			"""
398			return "".join(dict(zip("0123456789+-=()", "₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎")).get(c, c) for c in s)
399
400			@classmethod
401			def unsuperscript(cls: type[Self], s: str \| float) -> str:
402			"""
403			Replaces Unicode superscript digits, +, =, (, and ) with normal chars.
404			"""
405			return "".join(dict(zip("⁰¹²³⁴⁵⁶⁷⁸⁹⁻⁺⁼⁽⁾", "0123456789-+=()")).get(c, c) for c in s)
406
407			@classmethod
408			def unsubscript(cls: type[Self], s: str \| float) -> str:
409			"""
410			Replaces Unicode superscript digits, +, =, (, and ) with normal chars.
411			"""
412			return "".join(dict(zip("₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎", "0123456789+-=()")).get(c, c) for c in s)
413
414			@classmethod
415			def pretty_float(cls: type[Self], v: float \| int, n_sigfigs: int \| None = 5) -> str:
416			"""
417			Represents a float as a string, with symbols for NaN and infinity.
418			The returned string always has a minus or + prepended. Strip off the plus with .lstrip('+').
419			If v is an integer (by isinstance), makes sure to display without a decimal point.
420			If n_sigfigs < 2, will never have a
421			For ex:
422			- StringTools.pretty_float(.2222222) # '+0.22222'
423			- StringTools.pretty_float(-.2222222) # '-0.22222' (Unicode minus)
424			- StringTools.pretty_float(-float('inf')) # '-∞'
425			- StringTools.pretty_float(np.NaN) # 'NaN'
426			"""
427			# TODO this seems absurdly long for what it does
428			if n_sigfigs is None or n_sigfigs < 1:
429			msg = f"Sigfigs of {n_sigfigs} is nonpositive"
430			raise OutOfRangeError(
431			msg,
432			value=n_sigfigs,
433			minimum=1,
434			)
435			# first, handle NaN and infinities
436			if str(v) in {"nan", "NaN"}:
437			return "⌀"
438			if v == float("-Inf"):
439			return "-∞"
440			if v == float("Inf"):
441			return "+∞"
442			elif not isinstance(v, str) and str(v) in ["nan", "na", "NaN"]:
443			return "NaN"
444			elif not isinstance(v, str) and str(v) == "NaT":
445			return "NaT"
446			# sweet. it's a regular float or int.
447			if n_sigfigs is None:
448			s = cls.strip_empty_decimal(str(v))
449			else:
450			# yes, this is weird. we need to convert from str to float then back to str
451			s = str(float(str(("%." + str(n_sigfigs) + "g") % v)))
452			# remove the .0 if the precision doesn't support it
453			# if v >= 1 and n_sigfigs<2, it couldn't have a decimal
454			# and if n_sigfigs<1, it definitely can't
455			# and ... %g does this.
456			if isinstance(v, int) or n_sigfigs is not None and n_sigfigs < 2:
457			s = cls.strip_empty_decimal(s)
458			# prepend + or - (unless 0)
459			if float(s) == 0.0:
460			return s
461			s = s.replace("-", "-")
462			if not s.startswith("-"):
463			s = "+" + s[1:]
464			if len(s) > 1 and s[1] == ".":
465			s = s[0] + "0." + s[2:]
466			return s
467
@classmethod
def pretty_function(
    cls: type[Self],
    function: Callable,
    *,
    with_address: bool = False,
) -> str:
    """
    Returns a short name for a callable, wrapped in `⟨` and `⟩`.

    - A lambda becomes `⟨λ(nargs)⟩`
    - A bound method becomes `⟨`instance`.name(nargs)⟩`
    - Any other callable becomes `⟨name⟩`, using the name of its type if it has no `__name__`

    `nargs` is `__code__.co_argcount`, or `?` if the callable has no `__code__`.

    Args:
        function: The callable to describe
        with_address: Include `@ hex-mem-addr` in the name

    Returns:
        The short name

    Raises:
        XTypeError: If `function` is not callable
    """
    n_args = str(function.__code__.co_argcount) if hasattr(function, "__code__") else "?"
    pat = regex.compile(r"^<bound method [^ .]+\.([^ ]+) of (.+)>$", flags=regex.V1)
    boundmatch = pat.fullmatch(str(function))
    addr = " @ " + hex(id(function)) if with_address else ""
    if _is_lambda(function):
        # simplify lambda functions!
        return "⟨" + "λ(" + n_args + ")" + addr + "⟩"
    if boundmatch is not None:
        # it's a method (bound function)
        # don't show the address of the instance AND its method
        pat = regex.compile(r"@ ?0x[0-9a-fA-F]+\)?$", flags=regex.V1)
        s = pat.sub("", boundmatch.group(2)).strip()
        return "⟨" + "`" + s + "`." + boundmatch.group(1) + "(" + n_args + ")" + addr + "⟩"
    if callable(function):
        # it's an actual function, or a callable object (ex: functools.partial) with no __name__
        name = getattr(function, "__name__", None) or type(function).__name__
        return "⟨" + name + addr + "⟩"
    msg = f"Wrong type {type(function)} for '{function}'"
    raise XTypeError(msg, actual=type(function).__name__)
493
@classmethod
def pretty_repr(
    cls: type[Self],
    function: Any,
    *,
    with_address: bool = False,
) -> str:
    """
    Gets a better and shorter name for an object than `str(function)`.

    The cases are tried in this order:
        1. None: returns '⌀'
        2. A class: returns '⟨type:name⟩'
        3. Any other callable: delegates to `pretty_function`
        4. An object with a non-empty `__dict__`: returns its `str`, wrapped in '⟨' and '⟩'
        5. An object whose `str` looks like '<... object': returns the unqualified class name, wrapped
        6. Anything else (ex: a primitive): returns `str(function)`

    Args:
        function: Can be anything, but especially useful for functions
        with_address: Include `@ hex-mem-addr` in the name (cases 3-5)

    Returns:
        The short name
    """
    address = " @ " + hex(id(function)) if with_address else ""
    instance_pat = regex.compile(r"<([A-Za-z0-9_.<>]+)[ ']*object", flags=regex.V1)
    # matches an instance of a global or local class; only consulted in case 5
    instance_match = instance_pat.search(str(function))
    if function is None:
        return "⌀"
    if isinstance(function, type):
        # it's a class
        return f"⟨type:{function.__name__}⟩"
    if callable(function):
        return cls.pretty_function(function, with_address=with_address)
    if hasattr(function, "__dict__") and len(function.__dict__) > 0:
        # it's a member with attributes, so it may well have a good __str__;
        # drop an existing ⟨ ⟩ wrapper because it is re-added here
        inner = str(function).removeprefix("⟨").removesuffix("⟩")
        return f"⟨{inner}{address}⟩"
    if instance_match is not None:
        # it's an instance without attributes: keep only the part after the last dot
        class_name = instance_match.group(1).rpartition(".")[2]
        return f"⟨{class_name}{address}⟩"
    # it's a primitive, etc
    return str(function)
539
540			@classmethod
541			def greek_to_name(cls: type[Self]) -> Mapping[str, str]:
542			"""
543			Returns a dict from Greek lowercase+uppercase Unicode chars to their full names.
544			"""
545			return dict(StringTools._greek_alphabet)
546
547			@classmethod
548			def name_to_greek(cls: type[Self]) -> Mapping[str, str]:
549			"""
550			Returns a dict from Greek lowercase+uppercase letter names to their Unicode chars.
551			"""
552			return {v: k for k, v in StringTools._greek_alphabet.items()}
553
554			@classmethod
555			def fix_greek(cls: type[Self], s: str, lowercase: bool = False) -> str:
556			"""
557			Replaces Greek letter names with their Unicode equivalents.
558			Does this correctly by replacing superstrings before substrings.
559			Ex: '1-beta' is '1-β' rather than '1-bη'
560			If lowercase is True: Replaces Beta, BeTa, and BETA with β
561			Else: Replaces Beta with a capital Greek Beta and ignores BETA and BeTa.
562			"""
563			# Clever if I may say so:
564			# If we just sort from longest to shortest, we can't replace substrings by accident
565			# For example we'll replace 'beta' before 'eta', so '1-beta' won't become '1-bη'
566			greek = sorted(
567			[(v, k) for k, v in StringTools._greek_alphabet.items()],
568			key=lambda t: -len(t[1]),
569			)
570			for k, v in greek:
571			if k[0].isupper() and lowercase:
572			continue
573			s = regex.compile(k, flags=regex.V1 \| regex.IGNORECASE).sub(v, s) if lowercase else s.replace(k, v)
574			return s
575
576			@classmethod
577			def join(
578			cls: type[Self],
579			seq: Iterable[T],
580			*,
581			sep: str = "\t",
582			attr: str \| None = None,
583			prefix: str = "",
584			suffix: str = "",
585			) -> str:
586			"""
587			Join elements into a str more easily than ''.join. Just simplifies potentially long expressions.
588			Won't break with ValueError if the elements aren't strs.
589
590			Example:
591			``python
592			- StringTools.join([1,2,3]) # "1 2 3"
593			- StringTools.join(cars, sep=',', attr='make', prefix="(", suffix=")")` # "(Ford),(Ford),(BMW)"
594			``
595
596			Args:
597			seq: Sequence of elements
598			sep: Delimiter
599			attr: Get this attribute from each element (in `seq`), or use the element itself if None
600			prefix: Prefix before each item
601			suffix: Suffix after each item
602
603			Returns:
604			A string
605			"""
606			if attr is None:
607			return sep.join([prefix + str(s) + suffix for s in seq])
608			else:
609			return sep.join([prefix + str(getattr(s, attr)) + suffix for s in seq])
610
611			@classmethod
612			def join_kv_neat(cls: type[Self], seq: Mapping[T, V], *, eq: str = "=", sep: str = ", ") -> str:
613			return cls.join_kv(seq, sep=sep, eq=eq)
614
615			@classmethod
616			def join_kv_spaced(cls: type[Self], seq: Mapping[T, V], *, eq: str = ": ", sep: str = "; ") -> str:
617			return cls.join_kv(seq, sep=sep, eq=eq)
618
619			@classmethod
620			def join_kv_quoted(cls: type[Self], seq: Mapping[T, V], *, eq: str = ": ", sep: str = "; ") -> str:
621			return cls.join_kv(seq, sep=sep, eq=eq, prefix="'", suffix="'")
622
623			@classmethod
624			def join_kv(
625			cls: type[Self],
626			seq: Mapping[T, V],
627			*,
628			sep: str = "\t",
629			eq: str = "=",
630			prefix: str = "",
631			suffix: str = "",
632			) -> str:
633			"""
634			Joins dict elements into a str like 'a=1, b=2, c=3`.
635			Won't break with ValueError if the keys or values aren't strs.
636
637			Args:
638			seq: Dict-like, with `items()`
639			sep: Delimiter
640			eq: Separates a key with its value
641			prefix: Prepend before every key
642			suffix: Append after every value
643
644			Returns:
645			A string
646			"""
647			return sep.join([prefix + str(k) + eq + str(v) + suffix for k, v in seq.items()])
648
# Maps each Greek letter to its English name: the 24 uppercase letters first, then the 24 lowercase.
# There is no entry for U+03A2 or for U+03C2, so the code points are not contiguous around sigma.
# Read by greek_to_name, name_to_greek, and fix_greek.
_greek_alphabet = {
    # uppercase
    "\u0391": "Alpha",
    "\u0392": "Beta",
    "\u0393": "Gamma",
    "\u0394": "Delta",
    "\u0395": "Epsilon",
    "\u0396": "Zeta",
    "\u0397": "Eta",
    "\u0398": "Theta",
    "\u0399": "Iota",
    "\u039A": "Kappa",
    "\u039B": "Lambda",
    "\u039C": "Mu",
    "\u039D": "Nu",
    "\u039E": "Xi",
    "\u039F": "Omicron",
    "\u03A0": "Pi",
    "\u03A1": "Rho",
    "\u03A3": "Sigma",
    "\u03A4": "Tau",
    "\u03A5": "Upsilon",
    "\u03A6": "Phi",
    "\u03A7": "Chi",
    "\u03A8": "Psi",
    "\u03A9": "Omega",
    # lowercase
    "\u03B1": "alpha",
    "\u03B2": "beta",
    "\u03B3": "gamma",
    "\u03B4": "delta",
    "\u03B5": "epsilon",
    "\u03B6": "zeta",
    "\u03B7": "eta",
    "\u03B8": "theta",
    "\u03B9": "iota",
    "\u03BA": "kappa",
    "\u03BB": "lambda",
    "\u03BC": "mu",
    "\u03BD": "nu",
    "\u03BE": "xi",
    "\u03BF": "omicron",
    "\u03C0": "pi",
    "\u03C1": "rho",
    "\u03C3": "sigma",
    "\u03C4": "tau",
    "\u03C5": "upsilon",
    "\u03C6": "phi",
    "\u03C7": "chi",
    "\u03C8": "psi",
    "\u03C9": "omega",
}
699
700
# Names exported by `from <this module> import *`.
__all__ = ["StringTools"]
702

dmyersturnbull / pocketutils

Push — main ( ed7d21...87238c )

StringTools.pretty_repr() C

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like