typeddfs.builders.TypedDfBuilder.reserve()   A
last analyzed

Complexity

Conditions 4

Size

Total Lines 27
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 9
nop 4
dl 0
loc 27
rs 9.95
c 0
b 0
f 0
1
# SPDX-FileCopyrightText: Copyright 2020-2023, Contributors to typed-dfs
2
# SPDX-PackageHomePage: https://github.com/dmyersturnbull/typed-dfs
3
# SPDX-License-Identifier: Apache-2.0
4
"""
5
Defines a builder pattern for ``TypedDf``.
6
"""
7
from __future__ import annotations
8
9
from collections import defaultdict
10
from typing import TYPE_CHECKING, Any, Optional
11
12
import pandas as pd
13
14
from typeddfs.df_errors import ClashError, DfTypeConstructionError
15
from typeddfs.df_typing import DfTyping, IoTyping
16
from typeddfs.file_formats import FileFormat
17
from typeddfs.matrix_dfs import AffinityMatrixDf, MatrixDf
18
from typeddfs.typed_dfs import TypedDf
19
from typeddfs.utils import Utils
20
from typeddfs.utils._utils import (
21
    _AUTO_DROPPED_NAMES,
22
    _DEFAULT_ATTRS_SUFFIX,
23
    _DEFAULT_HASH_ALG,
24
    _FORBIDDEN_NAMES,
25
    _PICKLE_VR,
26
)
27
from typeddfs.utils.checksums import Checksums
28
29
if TYPE_CHECKING:
30
    from collections.abc import Callable, Mapping, Sequence
31
    from pathlib import Path
32
33
    from typeddfs.base_dfs import BaseDf
34
35
36
class _GenericBuilder:
    """
    Abstract builder shared by all typed-DataFrame builders.

    Accumulates class configuration (required/reserved columns, IO options,
    attached methods, etc.) via chained calls, then materializes a new
    DataFrame subclass in :meth:`_build`.
    """

    def __init__(self, name: str, doc: str | None = None) -> None:
        """
        Constructs a new builder.

        Args:
            name: The name of the resulting class
            doc: The docstring of the resulting class

        Raises:
            TypeError: If ``name`` or ``doc`` non-string
        """
        if not isinstance(name, str):
            msg = f"Class name {name} is a {type(name)}, not str"
            raise TypeError(msg)
        self._name = name
        self._doc = doc
        self._clazz = None  # the single pd.DataFrame subclass to inherit from
        self._classes = []  # additional (non-DataFrame) superclasses
        self._remapped_suffixes = {}
        self._remapped_read_kwargs = {}
        self._remapped_write_kwargs = {}
        self._encoding = "utf-8"
        self._errors = "strict"
        self._read_kwargs = defaultdict(dict)
        self._write_kwargs = defaultdict(dict)
        self._methods = {}
        self._classmethods = {}
        self._post_processing = None
        self._verifications = []
        self._req_meta = []
        self._res_meta = []
        self._req_cols = []
        self._res_cols = []
        self._dtypes = {}
        self._value_dtype = None
        self._drop = []
        self._strict_meta = False
        self._strict_cols = False
        self._hash_alg = _DEFAULT_HASH_ALG
        self._hash_file = False
        self._hash_dir = False
        self._index_series_name = False
        self._column_series_name = False
        self._secure = False
        self._recommended = False
        self._req_hash: bool = False
        self._req_order: bool = False
        self._attr_suffix = None
        self._attr_json_kwargs = {}
        self._custom_formats = {}
        # make these use an explicit version
        # the user can override if needed
        self.add_read_kwargs("pickle", protocol=_PICKLE_VR)
        self.add_write_kwargs("pickle", protocol=_PICKLE_VR)

    def subclass(self, clazz: type[Any]) -> __qualname__:
        """
        Make the class inherit from some type.
        May only subclass from a single subclass of DataFrame.
        If ``class`` is a ``DataFrame`` subclass, replaces the existing subclass.
        Otherwise, adds an additional superclass for multiple inheritance.

        Returns:
            This builder for chaining
        """
        if issubclass(clazz, pd.DataFrame):
            self._clazz = clazz
        else:
            self._classes.append(clazz)
        return self

    def doc(self, s: str) -> __qualname__:
        """
        Sets the docstring.
        This has the same effect as setting doc in :meth:`__init__`.

        Returns:
            This builder for chaining
        """
        self._doc = s
        return self

    def add_methods(
        self,
        *args: Callable[[BaseDf, ...], Any],
        **kwargs: Callable[[BaseDf, ...], Any],
    ) -> __qualname__:
        """
        Attaches methods to the class.

        Args:
            args: Functions whose names are used directly
            kwargs: Mapping from function names to functions (the keys will be the method names)

        Example:
            ``add_methods(summary=lambda df: f"{len(df)} rows")``

        Returns:
            This builder for chaining
        """
        self._methods.update({m.__name__: m for m in args})
        self._methods.update(**kwargs)
        return self

    def add_classmethods(self, **kwargs: Callable[[type[BaseDf], ...], Any]) -> __qualname__:
        """
        Attaches classmethods to the class.
        Mostly useful for factory methods.

        Example:
            ``add_classmethods(flat_instance=lambda t, value: MyClass(value))``

        Returns:
            This builder for chaining
        """
        self._classmethods.update(**kwargs)
        return self

    def post(self, fn: Callable[[BaseDf], BaseDf]) -> __qualname__:
        """
        Adds a method that is called on the converted DataFrame.
        It is called immediately before final optional conditions (``verify``) are checked.
        The function must return a new DataFrame.

        Returns:
            This builder for chaining
        """
        self._post_processing = fn
        return self

    def verify(self, *conditions: Callable[[pd.DataFrame], str | bool | None]) -> __qualname__:
        """
        Adds additional requirement(s) for the DataFrames.

        Args:
            conditions: Functions of the DataFrame that return None if the condition is met, or an error message

        Returns:
            This builder for chaining
        """
        self._verifications.extend(conditions)
        return self

    def suffix(
        self,
        suffix: str,
        fmt: FileFormat | str,
        *,
        read: Mapping[str, Any] | None = None,
        write: Mapping[str, Any] | None = None,
    ) -> __qualname__:
        """
        Makes read_files and write_files interpret a filename suffix differently.
        Suffixes like .gz, .zip, etc. are also included for text formats that are provided.

        Args:
            suffix: e.g. .txt (a prepended '.' is ignored)
            fmt: The FileFormat used to map to read/write methods
            read: Custom params to pass to the read function
            write: Custom params to pass to the write function

        Returns:
            This builder for chaining
        """
        if not suffix.startswith("."):
            suffix = "." + suffix
        fmt = FileFormat.of(fmt)
        for s in fmt.compressed_variants(suffix):
            self._remapped_suffixes[s] = fmt
        if read is not None:
            self._remapped_read_kwargs[suffix] = read
        if write is not None:
            self._remapped_write_kwargs[suffix] = write
        return self

    def hash(
        self,
        *,
        alg: str = "sha256",
        file: bool = True,
        directory: bool = False,
    ) -> __qualname__:
        """
        Write a hash file (e.g. .sha256) alongside files.
        Performed when calling :meth:`typeddfs.abs_dfs.AbsDf.write_file`.
        The hash files will be in the `sha1sum <https://en.wikipedia.org/wiki/Sha1sum>`_ format,
        with the hex digest, followed by ``" *"``, followed by the filename.

        Note that this affects the default behavior of :meth:`typeddfs.abs_dfs.AbsDf.write_file`,
        which can be called with ``file_hash=False`` and/or ``dir_hash=False``.

        Args:
            alg: The name of the algorithm in ``hashlib``;
                 The final name will ignore any hyphens and be converted to lowercase,
                 and the suffix will be ``"." + alg``.
            file: Alongside a file ``"my_file.csv.gz"``,
                  write a file ``"my_file.csv.gz."+alg`` alongside.
            directory: Alongside a file ``"my_file.csv.gz"`` in ``"my_dir"``,
                       append to a file ``"my_dir/my_dir"+alg``,
                       which presumably should contain hashes for files in that directory.

        Returns:
            This builder for chaining
        """
        self._hash_alg = Checksums.resolve_algorithm(alg)
        self._hash_file = file
        self._hash_dir = directory
        return self

    def attrs(
        self,
        *,
        suffix: str = ".attrs.json",  # leave like this for clear documentation
        preserve_inf: bool = True,
        sort: bool = False,
        indent: bool = True,
        fallback: Callable[[Any], Any] | None = None,
    ) -> __qualname__:
        """
        Sets ``pd.DataFrame.attrs`` to be read and written by default.

        Args:
            suffix: Will be appended to the filename of the DataFrame;
                    must end with .json, .json.gz, etc.
            preserve_inf: Convert numpy ``ndarray`` values, ``float("inf")``,
                          and  ``float("-inf")`` to str when writing JSON
            sort: Sort JSON before writing
            indent: Indent JSON before writing
            fallback: Try this method to serialize to JSON if all others fail

        Returns:
            This builder for chaining

        Raises:
            ValueError: If the format is not JSON
        """
        fmt = FileFormat.from_path(suffix)
        if fmt is not FileFormat.json:
            # fixed: the original message was missing the closing parenthesis
            msg = f"File format must be JSON ({suffix})"
            raise ValueError(msg)
        self._attr_suffix = suffix
        self._attr_json_kwargs = {
            "preserve_inf": preserve_inf,
            "sort": sort,
            "indent": indent,
            "fallbacks": [fallback],
        }
        return self

    def secure(self) -> __qualname__:
        """
        Bans IO with insecure formats.
        This includes Pickle and Excel formats that support macros.

        Returns:
            This builder for chaining
        """
        self._secure = True
        return self

    def recommended_only(self) -> __qualname__:
        """
        Bans IO with non-recommended formats.
        This includes all insecure formats along with fixed-width, HDF5, INI, TOML, .properties, etc.

        Returns:
            This builder for chaining
        """
        self._recommended = True
        return self

    def encoding(self, encoding: str = "utf-8") -> __qualname__:
        """
        Has pandas-defined text read/write functions use UTF-8.
        UTF-8 was the default when the builder was constructed.

        Arguments:
            encoding: Use this encoding.
                      Values are case-insensitive and ignore hyphen
                      (i.e. ``utf-8(bom)`` and ``utf8(bom)`` are the same).
                      Special values are ``platform`` and ``utf-8(bom)``.
                      "platform" is equivalent to ``sys.getdefaultencoding()``.
                      "utf8(bom)" changes the encoding depending on the platform at the time of writing.
                      (I.e. The read/write functions will work as expected when pickled.)
                      If ``utf8(bom)``, will use utf-8-sig if the platform is Windows ('nt').
                      Some applications will otherwise assume the default encoding (and break).
                      (Note: ``utf16(bom)`` will also work.)

        Returns:
            This builder for chaining
        """
        self._encoding = encoding.lower().replace("-", "")
        return self

    def add_custom_format(
        self,
        suffix: str,
        reader: Callable[[Path], pd.DataFrame],
        writer: Callable[[pd.DataFrame], Path],
        *,
        replace: bool = False,
    ) -> __qualname__:
        """
        Adds custom readers and writers for read_file and write_file.

        Args:
            suffix: The filename suffix to register the reader/writer under
            reader: Reads a path into a DataFrame
            writer: Writes a DataFrame to a path
            replace: Allow overriding a suffix that a built-in format already claims

        Returns:
            This builder for chaining

        Raises:
            ValueError: If ``suffix`` maps to a built-in format and ``replace`` is False
        """
        if not replace:
            fmt = FileFormat.from_path_or_none(suffix)
            if fmt is not None:
                msg = f"Cannot override suffix {suffix} for format {fmt.name}"
                raise ValueError(msg)
        self._custom_formats[suffix] = (reader, writer)
        return self

    def add_read_kwargs(self, fmt: FileFormat | str, **kwargs) -> __qualname__:
        """
        Adds keyword arguments that are passed to ``read_`` methods when called from ``read_file``.
        Rarely needed.

        Arguments:
            fmt: The file format (which corresponds to the delegated method)
            kwargs: key-value pairs that are used for the specified format

        Returns:
            This builder for chaining
        """
        fmt = FileFormat.of(fmt)
        self._read_kwargs[fmt].update(kwargs)
        return self

    def add_write_kwargs(self, fmt: FileFormat | str, **kwargs) -> __qualname__:
        """
        Adds keyword arguments that are passed to ``to_`` methods when called from ``to_file``.
        Rarely needed.

        Example:
            .. code::

                TypedDfs.typed("x").add_write_kwargs()

        Arguments:
            fmt: The file format (which corresponds to the delegated method)
            kwargs: key-value pairs that are used for the specified format

        Returns:
            This builder for chaining
        """
        fmt = FileFormat.of(fmt)
        self._write_kwargs[fmt].update(kwargs)
        return self

    def _build(self) -> type[BaseDf]:
        # Materializes the configured class; called by the public build() methods.
        if self._secure and self._hash_alg in Utils.insecure_hash_functions():
            msg = f"Hash algorithm {self._hash_alg} forbidden by .secure()"
            raise DfTypeConstructionError(msg)
        self._check_final()

        _io_typing = IoTyping(
            _remap_suffixes=dict(self._remapped_suffixes),
            _text_encoding=self._encoding,
            _read_kwargs=dict(self._read_kwargs),
            _write_kwargs=dict(self._write_kwargs),
            _hash_alg=self._hash_alg,
            _save_hash_file=self._hash_file,
            _save_hash_dir=self._hash_dir,
            _secure=self._secure,
            _recommended=self._recommended,
            _attrs_suffix=_DEFAULT_ATTRS_SUFFIX if self._attr_suffix is None else self._attr_suffix,
            _use_attrs=self._attr_suffix is not None,
            _attrs_json_kwargs=self._attr_json_kwargs,
            _custom_readers={k: v[0] for k, v in self._custom_formats.items()},
            _custom_writers={k: v[1] for k, v in self._custom_formats.items()},
        )

        _typing = DfTyping(
            _io_typing=_io_typing,
            _auto_dtypes=dict(self._dtypes),
            _post_processing=self._post_processing,
            _verifications=self._verifications,
            _more_index_names_allowed=not self._strict_meta,
            _more_columns_allowed=not self._strict_cols,
            _required_columns=list(self._req_cols),
            _required_index_names=list(self._req_meta),
            _reserved_columns=list(self._res_cols),
            _reserved_index_names=list(self._res_meta),
            _columns_to_drop=set(self._drop),
            _index_series_name=self._index_series_name,
            _column_series_name=self._column_series_name,
            _value_dtype=self._value_dtype,
        )

        class New(self._clazz, *self._classes):
            @classmethod
            def get_typing(cls) -> DfTyping:
                return _typing

        New.__name__ = self._name
        New.__doc__ = self._doc
        for k, v in self._methods.items():
            setattr(New, k, v)
        for k, v in self._classmethods.items():
            setattr(New, k, classmethod(v))
        return New

    def _check_final(self) -> None:
        # Subclasses perform any final consistency checks here.
        raise NotImplementedError()
447
448
449
class MatrixDfBuilder(_GenericBuilder):
    """
    A builder pattern for :class:`typeddfs.matrix_dfs.MatrixDf`.
    """

    def __init__(self, name: str, doc: str | None = None) -> None:
        super().__init__(name, doc)
        self._clazz = MatrixDf
        # matrices always index rows by "row" and name columns "column"
        self._index_series_name = "row"
        self._column_series_name = "column"
        self._req_meta.append("row")

    def build(self) -> type[MatrixDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.matrix_dfs.MatrixDf`.

        Raises:
            ClashError: If there is a contradiction in the specification
            FormatInsecureError: If :meth:`hash` set an insecure
                                 hash format and :meth:`secure` was set.
            DfTypeConstructionError: for some errors

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()

    def dtype(self, dt: type[Any]) -> __qualname__:
        """
        Sets the type of value for all matrix elements.
        This should almost certainly be a numeric type,
        and it must be ordered.

        .. caution::
            Never use a mutable type for ``dt``.
            Doing so can result in hard-to-detect and potentially serious bugs.

        Returns:
            This builder for chaining

        Raises:
            DfTypeConstructionError: If ``dt`` does not define ``__lt__``
        """
        # Validate first so a failed call leaves the builder unmodified.
        if not hasattr(dt, "__lt__"):
            msg = f"Dtype {dt} is unordered"
            raise DfTypeConstructionError(msg)
        self._value_dtype = dt
        return self

    def _check_final(self) -> None:
        # Matrix builders have no extra cross-field constraints to verify.
        pass
504
505
506
class AffinityMatrixDfBuilder(MatrixDfBuilder):
    """
    A builder pattern for :class:`typeddfs.matrix_dfs.AffinityMatrixDf`.
    """

    def __init__(self, name: str, doc: str | None = None) -> None:
        super().__init__(name, doc)
        # Swap in the affinity-matrix base; everything else is inherited.
        self._clazz = AffinityMatrixDf

    def build(self) -> type[AffinityMatrixDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.matrix_dfs.AffinityMatrixDf`.

        Raises:
            typeddfs.df_errors.ClashError: If there is a contradiction in the specification
            typeddfs.df_errors.FormatInsecureError: If :meth:`hash` set an insecure
                                                    hash format and :meth:`secure` was set.

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()
533
534
535
class TypedDfBuilder(_GenericBuilder):
    """
    A builder pattern for :class:`typeddfs.typed_dfs.TypedDf`.

    Example:
        ``TypedDfBuilder.typed().require("name").build()``
    """

    def __init__(self, name: str, doc: str | None = None) -> None:
        super().__init__(name, doc)
        self._clazz = TypedDf

    def series_names(
        self,
        index: None | bool | str = False,
        columns: None | bool | str = False,
    ) -> __qualname__:
        """
        Sets ``pd.DataFrame.index.name`` and/or ``pd.DataFrame.columns.name``.
        Valid values are ``False`` to not set (default), ``None`` to set to ``None``,
        or a string to set to.

        Returns:
            This builder for chaining
        """
        self._index_series_name = index
        self._column_series_name = columns
        return self

    def build(self) -> type[TypedDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.typed_dfs.TypedDf`.

        Raises:
            DfTypeConstructionError: If there is a contradiction in the specification

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()

    def require(self, *names: str, dtype: type | None = None, index: bool = False) -> __qualname__:
        """
        Requires column(s) or index name(s).
        DataFrames will fail if they are missing any of these.

        Args:
            names: A varargs list of columns or index names
            dtype: An automatically applied transformation of the column values using ``.astype``
            index: If True, put these in the index

        Returns:
            This builder for chaining

        Raises:
            typeddfs.df_errors.ClashError: If a name was already added or is forbidden
        """
        return self._register(names, dtype=dtype, index=index, required=True)

    def reserve(self, *names: str, dtype: type | None = None, index: bool = False) -> __qualname__:
        """
        Reserves column(s) or index name(s) for optional inclusion.
        A reserved column will be accepted even if ``strict`` is set.
        A reserved index will be accepted even if ``strict`` is set;
        additionally, it will be automatically moved from the list of columns to the list of index names.

        Args:
            names: A varargs list of columns or index names
            dtype: An automatically applied transformation of the column values using ``.astype``
            index: If True, put these in the index

        Returns:
            This builder for chaining

        Raises:
            typeddfs.df_errors.ClashError: If a name was already added or is forbidden
        """
        return self._register(names, dtype=dtype, index=index, required=False)

    def _register(
        self,
        names: Sequence[str],
        *,
        dtype: type | None,
        index: bool,
        required: bool,
    ) -> __qualname__:
        # Shared implementation behind require() and reserve():
        # validate the names, append to the appropriate list, and record dtypes.
        self._check(names)
        if required:
            target = self._req_meta if index else self._req_cols
        else:
            target = self._res_meta if index else self._res_cols
        target.extend(names)
        if dtype is not None:
            for name in names:
                self._dtypes[name] = dtype
        return self

    def drop(self, *names: str) -> __qualname__:
        """
        Adds columns (and index names) that should be automatically dropped.

        Args:
            names: Varargs list of names

        Returns:
            This builder for chaining
        """
        self._drop.extend(names)
        return self

    def strict(self, index: bool = True, cols: bool = True) -> __qualname__:
        """
        Disallows any columns or index names not in the lists of reserved/required.

        Args:
            index: Disallow additional names in the index
            cols: Disallow additional columns

        Returns:
            This builder for chaining
        """
        self._strict_meta = index
        self._strict_cols = cols
        return self

    def _check_final(self) -> None:
        """
        Verifies that the specification is self-consistent
        (no required/reserved name is also scheduled for auto-drop).
        Called by :meth:`_build` immediately before class creation.

        Raises:
            typeddfs.df_errors.ClashError: If there is a contradiction in the specification
        """
        all_names = [*self._req_cols, *self._req_meta, *self._res_cols, *self._res_meta]
        problem_names = [name for name in all_names if name in self._drop]
        if len(problem_names) > 0:
            msg = f"Required/reserved column/index names {problem_names} are auto-dropped"
            raise ClashError(
                msg,
                keys=set(problem_names),
            )

    def _check(self, names: Sequence[str]) -> None:
        # Rejects auto-dropped, forbidden, and duplicate names before registration.
        if any(name in _AUTO_DROPPED_NAMES for name in names):
            msg = f"Columns {','.join(_AUTO_DROPPED_NAMES)} are auto-dropped"
            raise ClashError(
                msg,
                keys=_AUTO_DROPPED_NAMES,
            )
        if any(name in _FORBIDDEN_NAMES for name in names):
            msg = f"{','.join(_FORBIDDEN_NAMES)} are forbidden names"
            raise ClashError(
                msg,
                keys=_FORBIDDEN_NAMES,
            )
        for name in names:
            if name in [*self._req_cols, *self._req_meta, *self._res_cols, *self._res_meta]:
                msg = f"Column {name} for {self._name} already exists"
                raise ClashError(msg, keys={name})
700
701
702
# Public API; _GenericBuilder is deliberately excluded.
__all__ = ["TypedDfBuilder", "MatrixDfBuilder", "AffinityMatrixDfBuilder"]
703