1
|
|
|
# SPDX-FileCopyrightText: Copyright 2020-2023, Contributors to typed-dfs |
2
|
|
|
# SPDX-PackageHomePage: https://github.com/dmyersturnbull/typed-dfs |
3
|
|
|
# SPDX-License-Identifier: Apache-2.0 |
4
|
|
|
""" |
5
|
|
|
Defines a builder pattern for ``TypedDf``. |
6
|
|
|
""" |
7
|
|
|
from __future__ import annotations |
8
|
|
|
|
9
|
|
|
from collections import defaultdict |
10
|
|
|
from typing import TYPE_CHECKING, Any, Optional |
11
|
|
|
|
12
|
|
|
import pandas as pd |
13
|
|
|
|
14
|
|
|
from typeddfs.df_errors import ClashError, DfTypeConstructionError |
15
|
|
|
from typeddfs.df_typing import DfTyping, IoTyping |
16
|
|
|
from typeddfs.file_formats import FileFormat |
17
|
|
|
from typeddfs.matrix_dfs import AffinityMatrixDf, MatrixDf |
18
|
|
|
from typeddfs.typed_dfs import TypedDf |
19
|
|
|
from typeddfs.utils import Utils |
20
|
|
|
from typeddfs.utils._utils import ( |
21
|
|
|
_AUTO_DROPPED_NAMES, |
22
|
|
|
_DEFAULT_ATTRS_SUFFIX, |
23
|
|
|
_DEFAULT_HASH_ALG, |
24
|
|
|
_FORBIDDEN_NAMES, |
25
|
|
|
_PICKLE_VR, |
26
|
|
|
) |
27
|
|
|
from typeddfs.utils.checksums import Checksums |
28
|
|
|
|
29
|
|
|
if TYPE_CHECKING: |
30
|
|
|
from collections.abc import Callable, Mapping, Sequence |
31
|
|
|
from pathlib import Path |
32
|
|
|
|
33
|
|
|
from typeddfs.base_dfs import BaseDf |
34
|
|
|
|
35
|
|
|
|
36
|
|
|
class _GenericBuilder: |
37
|
|
|
    def __init__(self, name: str, doc: str | None = None) -> None:
        """
        Constructs a new builder.

        Args:
            name: The name of the resulting class
            doc: The docstring of the resulting class

        Raises:
            TypeError: If ``name`` is not a str
                (NOTE(review): the original doc also claimed ``doc`` is checked,
                but only ``name`` is validated below)
        """
        if not isinstance(name, str):
            msg = f"Class name {name} is a {type(name)}, not str"
            raise TypeError(msg)
        # identity of the generated class
        self._name = name
        self._doc = doc
        # primary DataFrame base class plus extra mixin bases (see subclass())
        self._clazz = None
        self._classes = []
        # per-suffix overrides consulted by read_file/write_file (see suffix())
        self._remapped_suffixes = {}
        self._remapped_read_kwargs = {}
        self._remapped_write_kwargs = {}
        # text-IO settings (see encoding())
        self._encoding = "utf-8"
        self._errors = "strict"
        # per-format keyword args merged into the delegated read_/to_ calls
        self._read_kwargs = defaultdict(dict)
        self._write_kwargs = defaultdict(dict)
        # methods/classmethods attached to the generated class in _build()
        self._methods = {}
        self._classmethods = {}
        # conversion hook (post()) and extra user checks (verify())
        self._post_processing = None
        self._verifications = []
        # required/reserved index names ("meta") and columns
        self._req_meta = []
        self._res_meta = []
        self._req_cols = []
        self._res_cols = []
        self._dtypes = {}
        self._value_dtype = None
        # columns auto-dropped on conversion
        self._drop = []
        self._strict_meta = False
        self._strict_cols = False
        # hashing configuration (see hash())
        self._hash_alg = _DEFAULT_HASH_ALG
        self._hash_file = False
        self._hash_dir = False
        # False = leave series names untouched (see TypedDfBuilder.series_names)
        self._index_series_name = False
        self._column_series_name = False
        # security/recommendation gates on file formats
        self._secure = False
        self._recommended = False
        # annotation fixed: a bare ``Optional`` is not a valid type; these hold bools
        self._req_hash: bool = False
        self._req_order: bool = False
        # attrs-file configuration (see attrs())
        self._attr_suffix = None
        self._attr_json_kwargs = {}
        self._custom_formats = {}
        # make these use an explicit version
        # the user can override if needed
        self.add_read_kwargs("pickle", protocol=_PICKLE_VR)
        self.add_write_kwargs("pickle", protocol=_PICKLE_VR)
91
|
|
|
|
92
|
|
|
def subclass(self, clazz: type[Any]) -> __qualname__: |
93
|
|
|
""" |
94
|
|
|
Make the class inherit from some type. |
95
|
|
|
May only subclass from a single subclass of DataFrame. |
96
|
|
|
If ``class`` is a ``DataFrame`` subclass, replaces the existing subclass. |
97
|
|
|
Otherwise, adds an additional superclass for multiple inheritance. |
98
|
|
|
|
99
|
|
|
Returns: |
100
|
|
|
This builder for chaining |
101
|
|
|
""" |
102
|
|
|
if issubclass(clazz, pd.DataFrame): |
103
|
|
|
self._clazz = clazz |
104
|
|
|
else: |
105
|
|
|
self._classes.append(clazz) |
106
|
|
|
return self |
107
|
|
|
|
108
|
|
|
def doc(self, s: str) -> __qualname__: |
109
|
|
|
""" |
110
|
|
|
Sets the docstring. |
111
|
|
|
This has the same effect as setting doc in :meth:`__init__`. |
112
|
|
|
|
113
|
|
|
Returns: |
114
|
|
|
This builder for chaining |
115
|
|
|
""" |
116
|
|
|
self._doc = s |
117
|
|
|
return self |
118
|
|
|
|
119
|
|
|
    def add_methods(
        self,
        *args: Callable[[BaseDf, ...], Any],
        **kwargs: Callable[[BaseDf, ...], Any],
    ) -> __qualname__:
        """
        Attaches methods to the class.

        Args:
            args: Functions whose names are used directly
            kwargs: Mapping from function names to functions (the keys will be the method names)

        Example:
            ``add_methods(summary=lambda df: f"{len(df)} rows")``

        Returns:
            This builder for chaining
        """
        # positional functions are keyed by their own __name__
        self._methods.update({m.__name__: m for m in args})
        self._methods.update(**kwargs)
        return self
140
|
|
|
|
141
|
|
|
def add_classmethods(self, **kwargs: Callable[[type[BaseDf], ...], Any]) -> __qualname__: |
142
|
|
|
""" |
143
|
|
|
Attaches classmethods to the class. |
144
|
|
|
Mostly useful for factory methods. |
145
|
|
|
|
146
|
|
|
Example: |
147
|
|
|
``add_classmethods(flat_instance=lambda t, value: MyClass(value))`` |
148
|
|
|
|
149
|
|
|
Returns: |
150
|
|
|
This builder for chaining |
151
|
|
|
""" |
152
|
|
|
self._classmethods.update(**kwargs) |
153
|
|
|
return self |
154
|
|
|
|
155
|
|
|
def post(self, fn: Callable[[BaseDf], BaseDf]) -> __qualname__: |
156
|
|
|
""" |
157
|
|
|
Adds a method that is called on the converted DataFrame. |
158
|
|
|
It is called immediately before final optional conditions (``verify``) are checked. |
159
|
|
|
The function must return a new DataFrame. |
160
|
|
|
|
161
|
|
|
Returns: |
162
|
|
|
This builder for chaining |
163
|
|
|
""" |
164
|
|
|
self._post_processing = fn |
165
|
|
|
return self |
166
|
|
|
|
167
|
|
|
def verify(self, *conditions: Callable[[pd.DataFrame], str, bool | None]) -> __qualname__: |
168
|
|
|
""" |
169
|
|
|
Adds additional requirement(s) for the DataFrames. |
170
|
|
|
|
171
|
|
|
Returns: |
172
|
|
|
this builder for chaining |
173
|
|
|
|
174
|
|
|
Args: |
175
|
|
|
conditions: Functions of the DataFrame that return None if the condition is met, or an error message |
176
|
|
|
|
177
|
|
|
Returns: |
178
|
|
|
This builder for chaining |
179
|
|
|
""" |
180
|
|
|
self._verifications.extend(conditions) |
181
|
|
|
return self |
182
|
|
|
|
183
|
|
|
def suffix( |
184
|
|
|
self, |
185
|
|
|
suffix: str, |
186
|
|
|
fmt: FileFormat | str, |
187
|
|
|
*, |
188
|
|
|
read: Mapping[str, Any] | None = None, |
189
|
|
|
write: Mapping[str, Any] | None = None, |
190
|
|
|
) -> __qualname__: |
191
|
|
|
""" |
192
|
|
|
Makes read_files and write_files interpret a filename suffix differently. |
193
|
|
|
Suffixes like .gz, .zip, etc. are also included for text formats that are provided. |
194
|
|
|
|
195
|
|
|
Args: |
196
|
|
|
suffix: e.g. .txt (a prepended '.' is ignored) |
197
|
|
|
fmt: The FileFormat used to map to read/write methods |
198
|
|
|
read: Custom params to pass to the read function |
199
|
|
|
write: Custom params to pass to the write function |
200
|
|
|
|
201
|
|
|
Returns: |
202
|
|
|
This builder for chaining |
203
|
|
|
""" |
204
|
|
|
if not suffix.startswith("."): |
205
|
|
|
suffix = "." + suffix |
206
|
|
|
fmt = FileFormat.of(fmt) |
207
|
|
|
for s in fmt.compressed_variants(suffix): |
208
|
|
|
self._remapped_suffixes[s] = fmt |
209
|
|
|
if read is not None: |
210
|
|
|
self._remapped_read_kwargs[suffix] = read |
211
|
|
|
if write is not None: |
212
|
|
|
self._remapped_write_kwargs[suffix] = write |
213
|
|
|
return self |
214
|
|
|
|
215
|
|
|
    def hash(
        self,
        *,
        alg: str = "sha256",
        file: bool = True,
        directory: bool = False,
    ) -> __qualname__:
        """
        Write a hash file (e.g. .sha256) alongside files.
        Performed when calling :meth:`typeddfs.abs_dfs.AbsDf.write_file`.
        The hash files will be in the `sha1sum <https://en.wikipedia.org/wiki/Sha1sum>`_ format,
        with the hex digest, followed by ``" *"``, followed by the filename.
        (Fixed: the original text said "the filename, followed by ' *', followed
        by the filename", which contradicts the sha1sum format it links to.)

        Note that this affects the default behavior of :meth:`typeddfs.abs_dfs.AbsDf.write_file`,
        which can be called with ``file_hash=False`` and/or ``dir_hash=False``.

        Args:
            alg: The name of the algorithm in ``hashlib``;
                The final name will ignore any hyphens and be converted to lowercase,
                and the suffix will be ``"." + alg``.
            file: Alongside a file ``"my_file.csv.gz"``,
                write a file ``"my_file.csv.gz."+alg`` alongside.
            directory: Alongside a file ``"my_file.csv.gz"`` in ``"my_dir"``,
                append to a file ``"my_dir/my_dir."+alg`` (suffix per ``alg`` above),
                which presumably should contain hashes for files in that directory.

        Returns:
            This builder for chaining
        """
        # resolve_algorithm normalizes the name (lowercase, hyphens stripped)
        self._hash_alg = Checksums.resolve_algorithm(alg)
        self._hash_file = file
        self._hash_dir = directory
        return self
248
|
|
|
|
249
|
|
|
def attrs( |
250
|
|
|
self, |
251
|
|
|
*, |
252
|
|
|
suffix: str = ".attrs.json", # leave like this for clear documentation |
253
|
|
|
preserve_inf: bool = True, |
254
|
|
|
sort: bool = False, |
255
|
|
|
indent: bool = True, |
256
|
|
|
fallback: Callable[[Any], Any] | None = None, |
257
|
|
|
) -> __qualname__: |
258
|
|
|
""" |
259
|
|
|
Sets ``pd.DataFrame.attrs`` to be read and written by default. |
260
|
|
|
|
261
|
|
|
Args: |
262
|
|
|
suffix: Will be appended to the filename of the DataFrame; |
263
|
|
|
must end with .json, .json.gz, etc. |
264
|
|
|
preserve_inf: Convert numpy ``ndarray`` values, ``float("inf")``, |
265
|
|
|
and ``float("-inf")`` to str when writing JSON |
266
|
|
|
sort: Sort JSON before writing |
267
|
|
|
indent: Indent JSON before writing |
268
|
|
|
fallback: Try this method to serialize to JSON if all others fail |
269
|
|
|
|
270
|
|
|
Returns: |
271
|
|
|
This builder for chaining |
272
|
|
|
|
273
|
|
|
Raises: |
274
|
|
|
ValueError: If the format is not JSON |
275
|
|
|
""" |
276
|
|
|
fmt = FileFormat.from_path(suffix) |
277
|
|
|
if fmt is not FileFormat.json: |
278
|
|
|
msg = f"File format must be JSON ({suffix}" |
279
|
|
|
raise ValueError(msg) |
280
|
|
|
self._attr_suffix = suffix |
281
|
|
|
self._attr_json_kwargs = { |
282
|
|
|
"preserve_inf": preserve_inf, |
283
|
|
|
"sort": sort, |
284
|
|
|
"indent": indent, |
285
|
|
|
"fallbacks": [fallback], |
286
|
|
|
} |
287
|
|
|
return self |
288
|
|
|
|
289
|
|
|
def secure(self) -> __qualname__: |
290
|
|
|
""" |
291
|
|
|
Bans IO with insecure formats. |
292
|
|
|
This includes Pickle and Excel formats that support macros. |
293
|
|
|
|
294
|
|
|
Returns: |
295
|
|
|
This builder for chaining |
296
|
|
|
""" |
297
|
|
|
self._secure = True |
298
|
|
|
return self |
299
|
|
|
|
300
|
|
|
def recommended_only(self) -> __qualname__: |
301
|
|
|
""" |
302
|
|
|
Bans IO with non-recommended formats. |
303
|
|
|
This includes all insecure formats along with fixed-width, HDF5, INI, TOML, .properties, etc. |
304
|
|
|
|
305
|
|
|
Returns: |
306
|
|
|
This builder for chaining |
307
|
|
|
""" |
308
|
|
|
self._recommended = True |
309
|
|
|
return self |
310
|
|
|
|
311
|
|
|
def encoding(self, encoding: str = "utf-8") -> __qualname__: |
312
|
|
|
""" |
313
|
|
|
Has pandas-defined text read/write functions use UTF-8. |
314
|
|
|
UTF-8 was the default when the builder was constructed. |
315
|
|
|
|
316
|
|
|
Arguments: |
317
|
|
|
encoding: Use this encoding. |
318
|
|
|
Values are case-insensitive and ignore hyphen. |
319
|
|
|
(i.e. ``utf-8(bom)`` and ``utf8(bom)`` are the same. |
320
|
|
|
Special values are ``platform`` and ``utf-8(bom)``. |
321
|
|
|
"platform" is equivalent to ``sys.getdefaultencoding()``. |
322
|
|
|
"utf8(bom)" changes the encoding depending on the platform at the time of writing. |
323
|
|
|
(I.e. The read/write functions will work as expected when pickled.) |
324
|
|
|
If ``utf8(bom)``, will use utf-8-sig if the platform is Windows ('nt'). |
325
|
|
|
Some applications will otherwise assume the default encoding (and break). |
326
|
|
|
(Note: ``utf16(bom)`` will also work.) |
327
|
|
|
|
328
|
|
|
Returns: |
329
|
|
|
This builder for chaining |
330
|
|
|
""" |
331
|
|
|
self._encoding = encoding.lower().replace("-", "") |
332
|
|
|
return self |
333
|
|
|
|
334
|
|
|
def add_custom_format( |
335
|
|
|
self, |
336
|
|
|
suffix: str, |
337
|
|
|
reader: Callable[[Path], pd.DataFrame], |
338
|
|
|
writer: Callable[[pd.DataFrame], Path], |
339
|
|
|
*, |
340
|
|
|
replace: bool = False, |
341
|
|
|
) -> __qualname__: |
342
|
|
|
""" |
343
|
|
|
Adds custom readers and writers for read_file and write_file. |
344
|
|
|
""" |
345
|
|
|
if not replace: |
346
|
|
|
fmt = FileFormat.from_path_or_none(suffix) |
347
|
|
|
if fmt is not None: |
348
|
|
|
msg = f"Cannot override suffix {suffix} for format {fmt.name}" |
349
|
|
|
raise ValueError(msg) |
350
|
|
|
self._custom_formats[suffix] = (reader, writer) |
351
|
|
|
return self |
352
|
|
|
|
353
|
|
|
def add_read_kwargs(self, fmt: FileFormat | str, **kwargs) -> __qualname__: |
354
|
|
|
""" |
355
|
|
|
Adds keyword arguments that are passed to ``read_`` methods when called from ``read_file``. |
356
|
|
|
Rarely needed. |
357
|
|
|
|
358
|
|
|
Arguments: |
359
|
|
|
fmt: The file format (which corresponds to the delegated method) |
360
|
|
|
kwargs: key-value pairs that are used for the specified format |
361
|
|
|
|
362
|
|
|
Returns: |
363
|
|
|
This builder for chaining |
364
|
|
|
""" |
365
|
|
|
fmt = FileFormat.of(fmt) |
366
|
|
|
for k, v in kwargs.items(): |
367
|
|
|
self._read_kwargs[fmt][k] = v |
368
|
|
|
return self |
369
|
|
|
|
370
|
|
|
def add_write_kwargs(self, fmt: FileFormat | str, **kwargs) -> __qualname__: |
371
|
|
|
""" |
372
|
|
|
Adds keyword arguments that are passed to ``to_`` methods when called from ``to_file``. |
373
|
|
|
Rarely needed. |
374
|
|
|
|
375
|
|
|
Example: |
376
|
|
|
.. code:: |
377
|
|
|
|
378
|
|
|
TypedDfs.typed("x").add_write_kwargs() |
379
|
|
|
|
380
|
|
|
Arguments: |
381
|
|
|
fmt: The file format (which corresponds to the delegated method) |
382
|
|
|
kwargs: key-value pairs that are used for the specified format |
383
|
|
|
|
384
|
|
|
Returns: |
385
|
|
|
This builder for chaining |
386
|
|
|
""" |
387
|
|
|
fmt = FileFormat.of(fmt) |
388
|
|
|
for k, v in kwargs.items(): |
389
|
|
|
self._write_kwargs[fmt][k] = v |
390
|
|
|
return self |
391
|
|
|
|
392
|
|
|
    def _build(self) -> type[BaseDf]:
        """
        Assembles the final DataFrame subclass from everything configured so far.

        Runs final validation, freezes the IO and typing metadata, then creates
        a new class deriving from the primary base (``self._clazz``) plus any
        extra mixins, with the registered methods/classmethods attached.

        Returns:
            The newly created class

        Raises:
            DfTypeConstructionError: If an insecure hash algorithm was
                combined with :meth:`secure` (subclasses may raise more
                via :meth:`_check_final`)
        """
        if self._secure and self._hash_alg in Utils.insecure_hash_functions():
            msg = f"Hash algorithm {self._hash_alg} forbidden by .secure()"
            raise DfTypeConstructionError(msg)
        # subclass-specific validation hook
        self._check_final()

        # Copies of the mutable dicts are taken so later edits to this builder
        # cannot leak into an already-built class.
        _io_typing = IoTyping(
            _remap_suffixes=dict(self._remapped_suffixes),
            _text_encoding=self._encoding,
            _read_kwargs=dict(self._read_kwargs),
            _write_kwargs=dict(self._write_kwargs),
            _hash_alg=self._hash_alg,
            _save_hash_file=self._hash_file,
            _save_hash_dir=self._hash_dir,
            _secure=self._secure,
            _recommended=self._recommended,
            _attrs_suffix=_DEFAULT_ATTRS_SUFFIX if self._attr_suffix is None else self._attr_suffix,
            _use_attrs=self._attr_suffix is not None,
            _attrs_json_kwargs=self._attr_json_kwargs,
            _custom_readers={k: v[0] for k, v in self._custom_formats.items()},
            _custom_writers={k: v[1] for k, v in self._custom_formats.items()},
        )

        # strict flags are inverted here: strict == "no *more* names allowed"
        _typing = DfTyping(
            _io_typing=_io_typing,
            _auto_dtypes=dict(self._dtypes),
            _post_processing=self._post_processing,
            _verifications=self._verifications,
            _more_index_names_allowed=not self._strict_meta,
            _more_columns_allowed=not self._strict_cols,
            _required_columns=list(self._req_cols),
            _required_index_names=list(self._req_meta),
            _reserved_columns=list(self._res_cols),
            _reserved_index_names=list(self._res_meta),
            _columns_to_drop=set(self._drop),
            _index_series_name=self._index_series_name,
            _column_series_name=self._column_series_name,
            _value_dtype=self._value_dtype,
        )

        # The generated class closes over _typing; get_typing is how the
        # typeddfs machinery reads the frozen spec back.
        class New(self._clazz, *self._classes):
            @classmethod
            def get_typing(cls) -> DfTyping:
                return _typing

        New.__name__ = self._name
        New.__doc__ = self._doc
        for k, v in self._methods.items():
            setattr(New, k, v)
        for k, v in self._classmethods.items():
            setattr(New, k, classmethod(v))
        return New
444
|
|
|
|
445
|
|
|
def _check_final(self) -> None: |
446
|
|
|
raise NotImplementedError() |
447
|
|
|
|
448
|
|
|
|
449
|
|
|
class MatrixDfBuilder(_GenericBuilder): |
450
|
|
|
""" |
451
|
|
|
A builder pattern for :class:`typeddfs.matrix_dfs.MatrixDf`. |
452
|
|
|
""" |
453
|
|
|
|
454
|
|
|
def __init__(self, name: str, doc: str | None = None) -> None: |
455
|
|
|
super().__init__(name, doc) |
456
|
|
|
self._clazz = MatrixDf |
457
|
|
|
self._index_series_name = "row" |
458
|
|
|
self._column_series_name = "column" |
459
|
|
|
self._req_meta.append("row") |
460
|
|
|
|
461
|
|
|
    def build(self) -> type[MatrixDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.matrix_dfs.MatrixDf`.

        Raises:
            ClashError: If there is a contradiction in the specification
            FormatInsecureError: If :meth:`hash` set an insecure
                hash format and :meth:`secure` was set.
            DfTypeConstructionError: For some other specification errors

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()
482
|
|
|
|
483
|
|
|
def dtype(self, dt: type[Any]) -> __qualname__: |
484
|
|
|
""" |
485
|
|
|
Sets the type of value for all matrix elements. |
486
|
|
|
This should almost certainly be a numeric type, |
487
|
|
|
and it must be ordered. |
488
|
|
|
|
489
|
|
|
.. caution: |
490
|
|
|
Never use a mutable type for ``dt``. |
491
|
|
|
Doing so can result in hard-to-detect and potentially serious bugs. |
492
|
|
|
|
493
|
|
|
Returns: |
494
|
|
|
This builder for chaining |
495
|
|
|
""" |
496
|
|
|
self._value_dtype = dt |
497
|
|
|
if not hasattr(dt, "__lt__"): |
498
|
|
|
msg = f"Dtype {dt} is unordered" |
499
|
|
|
raise DfTypeConstructionError(msg) |
500
|
|
|
return self |
501
|
|
|
|
502
|
|
|
def _check_final(self) -> None: |
503
|
|
|
pass |
504
|
|
|
|
505
|
|
|
|
506
|
|
|
class AffinityMatrixDfBuilder(MatrixDfBuilder): |
507
|
|
|
""" |
508
|
|
|
A builder pattern for :class:`typeddfs.matrix_dfs.AffinityMatrixDf`. |
509
|
|
|
""" |
510
|
|
|
|
511
|
|
|
def __init__(self, name: str, doc: str | None = None) -> None: |
512
|
|
|
super().__init__(name, doc) |
513
|
|
|
self._clazz = AffinityMatrixDf |
514
|
|
|
|
515
|
|
|
    def build(self) -> type[AffinityMatrixDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.matrix_dfs.AffinityMatrixDf`.

        Raises:
            typeddfs.df_errors.ClashError: If there is a contradiction in the specification
            typeddfs.df_errors.FormatInsecureError: If :meth:`hash` set an insecure
                hash format and :meth:`secure` was set.

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()
533
|
|
|
|
534
|
|
|
|
535
|
|
|
class TypedDfBuilder(_GenericBuilder): |
536
|
|
|
""" |
537
|
|
|
A builder pattern for :class:`typeddfs.typed_dfs.TypedDf`. |
538
|
|
|
|
539
|
|
|
Example: |
540
|
|
|
``TypedDfBuilder.typed().require("name").build()`` |
541
|
|
|
""" |
542
|
|
|
|
543
|
|
|
def __init__(self, name: str, doc: str | None = None) -> None: |
544
|
|
|
super().__init__(name, doc) |
545
|
|
|
self._clazz = TypedDf |
546
|
|
|
|
547
|
|
|
def series_names( |
548
|
|
|
self, |
549
|
|
|
index: None | bool | str = False, |
550
|
|
|
columns: None | bool | str = False, |
551
|
|
|
) -> __qualname__: |
552
|
|
|
""" |
553
|
|
|
Sets ``pd.DataFrame.index.name`` and/or ``pd.DataFrame.columns.name``. |
554
|
|
|
Valid values are ``False`` to not set (default), ``None`` to set to ``None``, |
555
|
|
|
or a string to set to. |
556
|
|
|
|
557
|
|
|
Returns: |
558
|
|
|
This builder for chaining |
559
|
|
|
""" |
560
|
|
|
self._index_series_name = index |
561
|
|
|
self._column_series_name = columns |
562
|
|
|
return self |
563
|
|
|
|
564
|
|
|
    def build(self) -> type[TypedDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.typed_dfs.TypedDf`.

        Raises:
            DfTypeConstructionError: If there is a contradiction in the specification

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()
580
|
|
|
|
581
|
|
|
def require(self, *names: str, dtype: type | None = None, index: bool = False) -> __qualname__: |
582
|
|
|
""" |
583
|
|
|
Requires column(s) or index name(s). |
584
|
|
|
DataFrames will fail if they are missing any of these. |
585
|
|
|
|
586
|
|
|
Args: |
587
|
|
|
names: A varargs list of columns or index names |
588
|
|
|
dtype: An automatically applied transformation of the column values using ``.astype`` |
589
|
|
|
index: If True, put these in the index |
590
|
|
|
|
591
|
|
|
Returns: |
592
|
|
|
This builder for chaining |
593
|
|
|
|
594
|
|
|
Raises: |
595
|
|
|
typeddfs.df_errors.ClashError: If a name was already added or is forbidden |
596
|
|
|
""" |
597
|
|
|
self._check(names) |
598
|
|
|
if index: |
599
|
|
|
self._req_meta.extend(names) |
600
|
|
|
else: |
601
|
|
|
self._req_cols.extend(names) |
602
|
|
|
if dtype is not None: |
603
|
|
|
for name in names: |
604
|
|
|
self._dtypes[name] = dtype |
605
|
|
|
return self |
606
|
|
|
|
607
|
|
|
def reserve(self, *names: str, dtype: type | None = None, index: bool = False) -> __qualname__: |
608
|
|
|
""" |
609
|
|
|
Reserves column(s) or index name(s) for optional inclusion. |
610
|
|
|
A reserved column will be accepted even if ``strict`` is set. |
611
|
|
|
A reserved index will be accepted even if ``strict`` is set; |
612
|
|
|
additionally, it will be automatically moved from the list of columns to the list of index names. |
613
|
|
|
|
614
|
|
|
Args: |
615
|
|
|
names: A varargs list of columns or index names |
616
|
|
|
dtype: An automatically applied transformation of the column values using ``.astype`` |
617
|
|
|
index: If True, put these in the index |
618
|
|
|
|
619
|
|
|
Returns: |
620
|
|
|
This builder for chaining |
621
|
|
|
|
622
|
|
|
Raises: |
623
|
|
|
typeddfs.df_errors.ClashError: If a name was already added or is forbidden |
624
|
|
|
""" |
625
|
|
|
self._check(names) |
626
|
|
|
if index: |
627
|
|
|
self._res_meta.extend(names) |
628
|
|
|
else: |
629
|
|
|
self._res_cols.extend(names) |
630
|
|
|
if dtype is not None: |
631
|
|
|
for name in names: |
632
|
|
|
self._dtypes[name] = dtype |
633
|
|
|
return self |
634
|
|
|
|
635
|
|
|
def drop(self, *names: str) -> __qualname__: |
636
|
|
|
""" |
637
|
|
|
Adds columns (and index names) that should be automatically dropped. |
638
|
|
|
|
639
|
|
|
Args: |
640
|
|
|
names: Varargs list of names |
641
|
|
|
|
642
|
|
|
Returns: |
643
|
|
|
This builder for chaining |
644
|
|
|
""" |
645
|
|
|
self._drop.extend(names) |
646
|
|
|
return self |
647
|
|
|
|
648
|
|
|
def strict(self, index: bool = True, cols: bool = True) -> __qualname__: |
649
|
|
|
""" |
650
|
|
|
Disallows any columns or index names not in the lists of reserved/required. |
651
|
|
|
|
652
|
|
|
Args: |
653
|
|
|
index: Disallow additional names in the index |
654
|
|
|
cols: Disallow additional columns |
655
|
|
|
|
656
|
|
|
Returns: |
657
|
|
|
This builder for chaining |
658
|
|
|
""" |
659
|
|
|
self._strict_meta = index |
660
|
|
|
self._strict_cols = cols |
661
|
|
|
return self |
662
|
|
|
|
663
|
|
|
def _check_final(self) -> None: |
664
|
|
|
""" |
665
|
|
|
Final method in the chain. |
666
|
|
|
Creates a new subclass of ``TypedDf``. |
667
|
|
|
|
668
|
|
|
Returns: |
669
|
|
|
The new class |
670
|
|
|
|
671
|
|
|
Raises: |
672
|
|
|
typeddfs.df_errors.ClashError: If there is a contradiction in the specification |
673
|
|
|
""" |
674
|
|
|
all_names = [*self._req_cols, *self._req_meta, *self._res_cols, *self._res_meta] |
675
|
|
|
problem_names = [name for name in all_names if name in self._drop] |
676
|
|
|
if len(problem_names) > 0: |
677
|
|
|
msg = f"Required/reserved column/index names {problem_names} are auto-dropped" |
678
|
|
|
raise ClashError( |
679
|
|
|
msg, |
680
|
|
|
keys=set(problem_names), |
681
|
|
|
) |
682
|
|
|
|
683
|
|
|
def _check(self, names: Sequence[str]) -> None: |
684
|
|
|
if any(name in _AUTO_DROPPED_NAMES for name in names): |
685
|
|
|
msg = f"Columns {','.join(_AUTO_DROPPED_NAMES)} are auto-dropped" |
686
|
|
|
raise ClashError( |
687
|
|
|
msg, |
688
|
|
|
keys=_AUTO_DROPPED_NAMES, |
689
|
|
|
) |
690
|
|
|
if any(name in _FORBIDDEN_NAMES for name in names): |
691
|
|
|
msg = f"{','.join(_FORBIDDEN_NAMES)} are forbidden names" |
692
|
|
|
raise ClashError( |
693
|
|
|
msg, |
694
|
|
|
keys=_FORBIDDEN_NAMES, |
695
|
|
|
) |
696
|
|
|
for name in names: |
697
|
|
|
if name in [*self._req_cols, *self._req_meta, *self._res_cols, *self._res_meta]: |
698
|
|
|
msg = f"Column {name} for {self._name} already exists" |
699
|
|
|
raise ClashError(msg, keys={name}) |
700
|
|
|
|
701
|
|
|
|
702
|
|
|
# Public API of this module: the three concrete builder classes.
__all__ = ["TypedDfBuilder", "MatrixDfBuilder", "AffinityMatrixDfBuilder"]
703
|
|
|
|