1
|
|
|
# SPDX-FileCopyrightText: Copyright 2020-2023, Contributors to typed-dfs |
2
|
|
|
# SPDX-PackageHomePage: https://github.com/dmyersturnbull/typed-dfs |
3
|
|
|
# SPDX-License-Identifier: Apache-2.0 |
4
|
|
|
""" |
5
|
|
|
Defines a builder pattern for ``TypedDf``. |
6
|
|
|
""" |
7
|
|
|
from __future__ import annotations |
8
|
|
|
|
9
|
|
|
from collections import defaultdict |
10
|
|
|
from typing import TYPE_CHECKING, Any, Optional |
11
|
|
|
|
12
|
|
|
import pandas as pd |
13
|
|
|
|
14
|
|
|
from typeddfs.df_errors import ClashError, DfTypeConstructionError |
15
|
|
|
from typeddfs.df_typing import DfTyping, IoTyping |
16
|
|
|
from typeddfs.file_formats import FileFormat |
17
|
|
|
from typeddfs.matrix_dfs import AffinityMatrixDf, MatrixDf |
18
|
|
|
from typeddfs.typed_dfs import TypedDf |
19
|
|
|
from typeddfs.utils import Utils |
20
|
|
|
from typeddfs.utils._utils import ( |
21
|
|
|
_AUTO_DROPPED_NAMES, |
22
|
|
|
_DEFAULT_ATTRS_SUFFIX, |
23
|
|
|
_DEFAULT_HASH_ALG, |
24
|
|
|
_FORBIDDEN_NAMES, |
25
|
|
|
_PICKLE_VR, |
26
|
|
|
) |
27
|
|
|
from typeddfs.utils.checksums import Checksums |
28
|
|
|
|
29
|
|
|
if TYPE_CHECKING: |
30
|
|
|
from collections.abc import Callable, Mapping, Sequence |
31
|
|
|
from pathlib import Path |
32
|
|
|
|
33
|
|
|
from typeddfs.base_dfs import BaseDf |
34
|
|
|
|
35
|
|
|
|
36
|
|
|
class _GenericBuilder: |
37
|
|
|
    def __init__(self, name: str, doc: str | None = None) -> None:
        """
        Constructs a new builder.

        Args:
            name: The name of the resulting class
            doc: The docstring of the resulting class

        Raises:
            TypeError: If ``name`` is not a str
                (NOTE(review): the original doc also claimed ``doc`` is checked,
                but only ``name`` is validated below)
        """
        if not isinstance(name, str):
            msg = f"Class name {name} is a {type(name)}, not str"
            raise TypeError(msg)
        # identity of the generated class
        self._name = name
        self._doc = doc
        # primary DataFrame base class plus extra mixin bases (see subclass())
        self._clazz = None
        self._classes = []
        # per-suffix overrides consulted by read_file/write_file (see suffix())
        self._remapped_suffixes = {}
        self._remapped_read_kwargs = {}
        self._remapped_write_kwargs = {}
        # text-IO settings (see encoding())
        self._encoding = "utf-8"
        self._errors = "strict"
        # per-format keyword args merged into the delegated read_/to_ calls
        self._read_kwargs = defaultdict(dict)
        self._write_kwargs = defaultdict(dict)
        # methods/classmethods attached to the generated class in _build()
        self._methods = {}
        self._classmethods = {}
        # conversion hook (post()) and extra user checks (verify())
        self._post_processing = None
        self._verifications = []
        # required/reserved index names ("meta") and columns
        self._req_meta = []
        self._res_meta = []
        self._req_cols = []
        self._res_cols = []
        self._dtypes = {}
        self._value_dtype = None
        # columns auto-dropped on conversion
        self._drop = []
        self._strict_meta = False
        self._strict_cols = False
        # hashing configuration (see hash())
        self._hash_alg = _DEFAULT_HASH_ALG
        self._hash_file = False
        self._hash_dir = False
        # False = leave series names untouched (see TypedDfBuilder.series_names)
        self._index_series_name = False
        self._column_series_name = False
        # security/recommendation gates on file formats
        self._secure = False
        self._recommended = False
        # annotation fixed: a bare ``Optional`` is not a valid type; these hold bools
        self._req_hash: bool = False
        self._req_order: bool = False
        # attrs-file configuration (see attrs())
        self._attr_suffix = None
        self._attr_json_kwargs = {}
        self._custom_formats = {}
        # make these use an explicit version
        # the user can override if needed
        self.add_read_kwargs("pickle", protocol=_PICKLE_VR)
        self.add_write_kwargs("pickle", protocol=_PICKLE_VR)
91
|
|
|
|
92
|
|
|
def subclass(self, clazz: type[Any]) -> __qualname__: |
93
|
|
|
""" |
94
|
|
|
Make the class inherit from some type. |
95
|
|
|
May only subclass from a single subclass of DataFrame. |
96
|
|
|
If ``class`` is a ``DataFrame`` subclass, replaces the existing subclass. |
97
|
|
|
Otherwise, adds an additional superclass for multiple inheritance. |
98
|
|
|
|
99
|
|
|
Returns: |
100
|
|
|
This builder for chaining |
101
|
|
|
""" |
102
|
|
|
if issubclass(clazz, pd.DataFrame): |
103
|
|
|
self._clazz = clazz |
104
|
|
|
else: |
105
|
|
|
self._classes.append(clazz) |
106
|
|
|
return self |
107
|
|
|
|
108
|
|
|
def doc(self, s: str) -> __qualname__: |
109
|
|
|
""" |
110
|
|
|
Sets the docstring. |
111
|
|
|
This has the same effect as setting doc in :meth:`__init__`. |
112
|
|
|
|
113
|
|
|
Returns: |
114
|
|
|
This builder for chaining |
115
|
|
|
""" |
116
|
|
|
self._doc = s |
117
|
|
|
return self |
118
|
|
|
|
119
|
|
|
    def add_methods(
        self,
        *args: Callable[[BaseDf, ...], Any],
        **kwargs: Callable[[BaseDf, ...], Any],
    ) -> __qualname__:
        """
        Attaches methods to the class.

        Args:
            args: Functions whose names are used directly
            kwargs: Mapping from function names to functions (the keys will be the method names)

        Example:
            ``add_methods(summary=lambda df: f"{len(df)} rows")``

        Returns:
            This builder for chaining
        """
        # positional functions are keyed by their own __name__
        self._methods.update({m.__name__: m for m in args})
        self._methods.update(**kwargs)
        return self
140
|
|
|
|
141
|
|
|
def add_classmethods(self, **kwargs: Callable[[type[BaseDf], ...], Any]) -> __qualname__: |
142
|
|
|
""" |
143
|
|
|
Attaches classmethods to the class. |
144
|
|
|
Mostly useful for factory methods. |
145
|
|
|
|
146
|
|
|
Example: |
147
|
|
|
``add_classmethods(flat_instance=lambda t, value: MyClass(value))`` |
148
|
|
|
|
149
|
|
|
Returns: |
150
|
|
|
This builder for chaining |
151
|
|
|
""" |
152
|
|
|
self._classmethods.update(**kwargs) |
153
|
|
|
return self |
154
|
|
|
|
155
|
|
|
def post(self, fn: Callable[[BaseDf], BaseDf]) -> __qualname__: |
156
|
|
|
""" |
157
|
|
|
Adds a method that is called on the converted DataFrame. |
158
|
|
|
It is called immediately before final optional conditions (``verify``) are checked. |
159
|
|
|
The function must return a new DataFrame. |
160
|
|
|
|
161
|
|
|
Returns: |
162
|
|
|
This builder for chaining |
163
|
|
|
""" |
164
|
|
|
self._post_processing = fn |
165
|
|
|
return self |
166
|
|
|
|
167
|
|
|
def verify(self, *conditions: Callable[[pd.DataFrame], str, bool | None]) -> __qualname__: |
168
|
|
|
""" |
169
|
|
|
Adds additional requirement(s) for the DataFrames. |
170
|
|
|
|
171
|
|
|
Returns: |
172
|
|
|
this builder for chaining |
173
|
|
|
|
174
|
|
|
Args: |
175
|
|
|
conditions: Functions of the DataFrame that return None if the condition is met, or an error message |
176
|
|
|
|
177
|
|
|
Returns: |
178
|
|
|
This builder for chaining |
179
|
|
|
""" |
180
|
|
|
self._verifications.extend(conditions) |
181
|
|
|
return self |
182
|
|
|
|
183
|
|
|
def suffix( |
184
|
|
|
self, |
185
|
|
|
suffix: str, |
186
|
|
|
fmt: FileFormat | str, |
187
|
|
|
*, |
188
|
|
|
read: Mapping[str, Any] | None = None, |
189
|
|
|
write: Mapping[str, Any] | None = None, |
190
|
|
|
) -> __qualname__: |
191
|
|
|
""" |
192
|
|
|
Makes read_files and write_files interpret a filename suffix differently. |
193
|
|
|
Suffixes like .gz, .zip, etc. are also included for text formats that are provided. |
194
|
|
|
|
195
|
|
|
Args: |
196
|
|
|
suffix: e.g. .txt (a prepended '.' is ignored) |
197
|
|
|
fmt: The FileFormat used to map to read/write methods |
198
|
|
|
read: Custom params to pass to the read function |
199
|
|
|
write: Custom params to pass to the write function |
200
|
|
|
|
201
|
|
|
Returns: |
202
|
|
|
This builder for chaining |
203
|
|
|
""" |
204
|
|
|
if not suffix.startswith("."): |
205
|
|
|
suffix = "." + suffix |
206
|
|
|
fmt = FileFormat.of(fmt) |
207
|
|
|
for s in fmt.compressed_variants(suffix): |
208
|
|
|
self._remapped_suffixes[s] = fmt |
209
|
|
|
if read is not None: |
210
|
|
|
self._remapped_read_kwargs[suffix] = read |
211
|
|
|
if write is not None: |
212
|
|
|
self._remapped_write_kwargs[suffix] = write |
213
|
|
|
return self |
214
|
|
|
|
215
|
|
|
    def hash(
        self,
        *,
        alg: str = "sha256",
        file: bool = True,
        directory: bool = False,
    ) -> __qualname__:
        """
        Write a hash file (e.g. .sha256) alongside files.
        Performed when calling :meth:`typeddfs.abs_dfs.AbsDf.write_file`.
        The hash files will be in the `sha1sum <https://en.wikipedia.org/wiki/Sha1sum>`_ format,
        with the hex digest, followed by ``" *"``, followed by the filename.
        (Fixed: the original text said "the filename, followed by ' *', followed
        by the filename", which contradicts the sha1sum format it links to.)

        Note that this affects the default behavior of :meth:`typeddfs.abs_dfs.AbsDf.write_file`,
        which can be called with ``file_hash=False`` and/or ``dir_hash=False``.

        Args:
            alg: The name of the algorithm in ``hashlib``;
                The final name will ignore any hyphens and be converted to lowercase,
                and the suffix will be ``"." + alg``.
            file: Alongside a file ``"my_file.csv.gz"``,
                write a file ``"my_file.csv.gz."+alg`` alongside.
            directory: Alongside a file ``"my_file.csv.gz"`` in ``"my_dir"``,
                append to a file ``"my_dir/my_dir."+alg`` (suffix per ``alg`` above),
                which presumably should contain hashes for files in that directory.

        Returns:
            This builder for chaining
        """
        # resolve_algorithm normalizes the name (lowercase, hyphens stripped)
        self._hash_alg = Checksums.resolve_algorithm(alg)
        self._hash_file = file
        self._hash_dir = directory
        return self
248
|
|
|
|
249
|
|
|
def attrs( |
250
|
|
|
self, |
251
|
|
|
*, |
252
|
|
|
suffix: str = ".attrs.json", # leave like this for clear documentation |
253
|
|
|
preserve_inf: bool = True, |
254
|
|
|
sort: bool = False, |
255
|
|
|
indent: bool = True, |
256
|
|
|
fallback: Callable[[Any], Any] | None = None, |
257
|
|
|
) -> __qualname__: |
258
|
|
|
""" |
259
|
|
|
Sets ``pd.DataFrame.attrs`` to be read and written by default. |
260
|
|
|
|
261
|
|
|
Args: |
262
|
|
|
suffix: Will be appended to the filename of the DataFrame; |
263
|
|
|
must end with .json, .json.gz, etc. |
264
|
|
|
preserve_inf: Convert numpy ``ndarray`` values, ``float("inf")``, |
265
|
|
|
and ``float("-inf")`` to str when writing JSON |
266
|
|
|
sort: Sort JSON before writing |
267
|
|
|
indent: Indent JSON before writing |
268
|
|
|
fallback: Try this method to serialize to JSON if all others fail |
269
|
|
|
|
270
|
|
|
Returns: |
271
|
|
|
This builder for chaining |
272
|
|
|
|
273
|
|
|
Raises: |
274
|
|
|
ValueError: If the format is not JSON |
275
|
|
|
""" |
276
|
|
|
fmt = FileFormat.from_path(suffix) |
277
|
|
|
if fmt is not FileFormat.json: |
278
|
|
|
msg = f"File format must be JSON ({suffix}" |
279
|
|
|
raise ValueError(msg) |
280
|
|
|
self._attr_suffix = suffix |
281
|
|
|
self._attr_json_kwargs = { |
282
|
|
|
"preserve_inf": preserve_inf, |
283
|
|
|
"sort": sort, |
284
|
|
|
"indent": indent, |
285
|
|
|
"fallbacks": [fallback], |
286
|
|
|
} |
287
|
|
|
return self |
288
|
|
|
|
289
|
|
|
def secure(self) -> __qualname__: |
290
|
|
|
""" |
291
|
|
|
Bans IO with insecure formats. |
292
|
|
|
This includes Pickle and Excel formats that support macros. |
293
|
|
|
|
294
|
|
|
Returns: |
295
|
|
|
This builder for chaining |
296
|
|
|
""" |
297
|
|
|
self._secure = True |
298
|
|
|
return self |
299
|
|
|
|
300
|
|
|
def recommended_only(self) -> __qualname__: |
301
|
|
|
""" |
302
|
|
|
Bans IO with non-recommended formats. |
303
|
|
|
This includes all insecure formats along with fixed-width, HDF5, INI, TOML, .properties, etc. |
304
|
|
|
|
305
|
|
|
Returns: |
306
|
|
|
This builder for chaining |
307
|
|
|
""" |
308
|
|
|
self._recommended = True |
309
|
|
|
return self |
310
|
|
|
|
311
|
|
|
def encoding(self, encoding: str = "utf-8") -> __qualname__: |
312
|
|
|
""" |
313
|
|
|
Has pandas-defined text read/write functions use UTF-8. |
314
|
|
|
UTF-8 was the default when the builder was constructed. |
315
|
|
|
|
316
|
|
|
Arguments: |
317
|
|
|
encoding: Use this encoding. |
318
|
|
|
Values are case-insensitive and ignore hyphen. |
319
|
|
|
(i.e. ``utf-8(bom)`` and ``utf8(bom)`` are the same. |
320
|
|
|
Special values are ``platform`` and ``utf-8(bom)``. |
321
|
|
|
"platform" is equivalent to ``sys.getdefaultencoding()``. |
322
|
|
|
"utf8(bom)" changes the encoding depending on the platform at the time of writing. |
323
|
|
|
(I.e. The read/write functions will work as expected when pickled.) |
324
|
|
|
If ``utf8(bom)``, will use utf-8-sig if the platform is Windows ('nt'). |
325
|
|
|
Some applications will otherwise assume the default encoding (and break). |
326
|
|
|
(Note: ``utf16(bom)`` will also work.) |
327
|
|
|
|
328
|
|
|
Returns: |
329
|
|
|
This builder for chaining |
330
|
|
|
""" |
331
|
|
|
self._encoding = encoding.lower().replace("-", "") |
332
|
|
|
return self |
333
|
|
|
|
334
|
|
|
def add_custom_format( |
335
|
|
|
self, |
336
|
|
|
suffix: str, |
337
|
|
|
reader: Callable[[Path], pd.DataFrame], |
338
|
|
|
writer: Callable[[pd.DataFrame], Path], |
339
|
|
|
*, |
340
|
|
|
replace: bool = False, |
341
|
|
|
) -> __qualname__: |
342
|
|
|
""" |
343
|
|
|
Adds custom readers and writers for read_file and write_file. |
344
|
|
|
""" |
345
|
|
|
if not replace: |
346
|
|
|
fmt = FileFormat.from_path_or_none(suffix) |
347
|
|
|
if fmt is not None: |
348
|
|
|
msg = f"Cannot override suffix {suffix} for format {fmt.name}" |
349
|
|
|
raise ValueError(msg) |
350
|
|
|
self._custom_formats[suffix] = (reader, writer) |
351
|
|
|
return self |
352
|
|
|
|
353
|
|
|
def add_read_kwargs(self, fmt: FileFormat | str, **kwargs) -> __qualname__: |
354
|
|
|
""" |
355
|
|
|
Adds keyword arguments that are passed to ``read_`` methods when called from ``read_file``. |
356
|
|
|
Rarely needed. |
357
|
|
|
|
358
|
|
|
Arguments: |
359
|
|
|
fmt: The file format (which corresponds to the delegated method) |
360
|
|
|
kwargs: key-value pairs that are used for the specified format |
361
|
|
|
|
362
|
|
|
Returns: |
363
|
|
|
This builder for chaining |
364
|
|
|
""" |
365
|
|
|
fmt = FileFormat.of(fmt) |
366
|
|
|
for k, v in kwargs.items(): |
367
|
|
|
self._read_kwargs[fmt][k] = v |
368
|
|
|
return self |
369
|
|
|
|
370
|
|
|
def add_write_kwargs(self, fmt: FileFormat | str, **kwargs) -> __qualname__: |
371
|
|
|
""" |
372
|
|
|
Adds keyword arguments that are passed to ``to_`` methods when called from ``to_file``. |
373
|
|
|
Rarely needed. |
374
|
|
|
|
375
|
|
|
Example: |
376
|
|
|
.. code:: |
377
|
|
|
|
378
|
|
|
TypedDfs.typed("x").add_write_kwargs() |
379
|
|
|
|
380
|
|
|
Arguments: |
381
|
|
|
fmt: The file format (which corresponds to the delegated method) |
382
|
|
|
kwargs: key-value pairs that are used for the specified format |
383
|
|
|
|
384
|
|
|
Returns: |
385
|
|
|
This builder for chaining |
386
|
|
|
""" |
387
|
|
|
fmt = FileFormat.of(fmt) |
388
|
|
|
for k, v in kwargs.items(): |
389
|
|
|
self._write_kwargs[fmt][k] = v |
390
|
|
|
return self |
391
|
|
|
|
392
|
|
|
    def _build(self) -> type[BaseDf]:
        """
        Assembles the final DataFrame subclass from everything configured so far.

        Runs final validation, freezes the IO and typing metadata, then creates
        a new class deriving from the primary base (``self._clazz``) plus any
        extra mixins, with the registered methods/classmethods attached.

        Returns:
            The newly created class

        Raises:
            DfTypeConstructionError: If an insecure hash algorithm was
                combined with :meth:`secure` (subclasses may raise more
                via :meth:`_check_final`)
        """
        if self._secure and self._hash_alg in Utils.insecure_hash_functions():
            msg = f"Hash algorithm {self._hash_alg} forbidden by .secure()"
            raise DfTypeConstructionError(msg)
        # subclass-specific validation hook
        self._check_final()

        # Copies of the mutable dicts are taken so later edits to this builder
        # cannot leak into an already-built class.
        _io_typing = IoTyping(
            _remap_suffixes=dict(self._remapped_suffixes),
            _text_encoding=self._encoding,
            _read_kwargs=dict(self._read_kwargs),
            _write_kwargs=dict(self._write_kwargs),
            _hash_alg=self._hash_alg,
            _save_hash_file=self._hash_file,
            _save_hash_dir=self._hash_dir,
            _secure=self._secure,
            _recommended=self._recommended,
            _attrs_suffix=_DEFAULT_ATTRS_SUFFIX if self._attr_suffix is None else self._attr_suffix,
            _use_attrs=self._attr_suffix is not None,
            _attrs_json_kwargs=self._attr_json_kwargs,
            _custom_readers={k: v[0] for k, v in self._custom_formats.items()},
            _custom_writers={k: v[1] for k, v in self._custom_formats.items()},
        )

        # strict flags are inverted here: strict == "no *more* names allowed"
        _typing = DfTyping(
            _io_typing=_io_typing,
            _auto_dtypes=dict(self._dtypes),
            _post_processing=self._post_processing,
            _verifications=self._verifications,
            _more_index_names_allowed=not self._strict_meta,
            _more_columns_allowed=not self._strict_cols,
            _required_columns=list(self._req_cols),
            _required_index_names=list(self._req_meta),
            _reserved_columns=list(self._res_cols),
            _reserved_index_names=list(self._res_meta),
            _columns_to_drop=set(self._drop),
            _index_series_name=self._index_series_name,
            _column_series_name=self._column_series_name,
            _value_dtype=self._value_dtype,
        )

        # The generated class closes over _typing; get_typing is how the
        # typeddfs machinery reads the frozen spec back.
        class New(self._clazz, *self._classes):
            @classmethod
            def get_typing(cls) -> DfTyping:
                return _typing

        New.__name__ = self._name
        New.__doc__ = self._doc
        for k, v in self._methods.items():
            setattr(New, k, v)
        for k, v in self._classmethods.items():
            setattr(New, k, classmethod(v))
        return New
444
|
|
|
|
445
|
|
|
def _check_final(self) -> None: |
446
|
|
|
raise NotImplementedError() |
447
|
|
|
|
448
|
|
|
|
449
|
|
|
class MatrixDfBuilder(_GenericBuilder): |
450
|
|
|
""" |
451
|
|
|
A builder pattern for :class:`typeddfs.matrix_dfs.MatrixDf`. |
452
|
|
|
""" |
453
|
|
|
|
454
|
|
|
def __init__(self, name: str, doc: str | None = None) -> None: |
455
|
|
|
super().__init__(name, doc) |
456
|
|
|
self._clazz = MatrixDf |
457
|
|
|
self._index_series_name = "row" |
458
|
|
|
self._column_series_name = "column" |
459
|
|
|
self._req_meta.append("row") |
460
|
|
|
|
461
|
|
|
    def build(self) -> type[MatrixDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.matrix_dfs.MatrixDf`.

        Raises:
            ClashError: If there is a contradiction in the specification
            FormatInsecureError: If :meth:`hash` set an insecure
                hash format and :meth:`secure` was set.
            DfTypeConstructionError: For some other specification errors

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()
482
|
|
|
|
483
|
|
|
def dtype(self, dt: type[Any]) -> __qualname__: |
484
|
|
|
""" |
485
|
|
|
Sets the type of value for all matrix elements. |
486
|
|
|
This should almost certainly be a numeric type, |
487
|
|
|
and it must be ordered. |
488
|
|
|
|
489
|
|
|
.. caution: |
490
|
|
|
Never use a mutable type for ``dt``. |
491
|
|
|
Doing so can result in hard-to-detect and potentially serious bugs. |
492
|
|
|
|
493
|
|
|
Returns: |
494
|
|
|
This builder for chaining |
495
|
|
|
""" |
496
|
|
|
self._value_dtype = dt |
497
|
|
|
if not hasattr(dt, "__lt__"): |
498
|
|
|
msg = f"Dtype {dt} is unordered" |
499
|
|
|
raise DfTypeConstructionError(msg) |
500
|
|
|
return self |
501
|
|
|
|
502
|
|
|
def _check_final(self) -> None: |
503
|
|
|
pass |
504
|
|
|
|
505
|
|
|
|
506
|
|
|
class AffinityMatrixDfBuilder(MatrixDfBuilder): |
507
|
|
|
""" |
508
|
|
|
A builder pattern for :class:`typeddfs.matrix_dfs.AffinityMatrixDf`. |
509
|
|
|
""" |
510
|
|
|
|
511
|
|
|
def __init__(self, name: str, doc: str | None = None) -> None: |
512
|
|
|
super().__init__(name, doc) |
513
|
|
|
self._clazz = AffinityMatrixDf |
514
|
|
|
|
515
|
|
|
    def build(self) -> type[AffinityMatrixDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.matrix_dfs.AffinityMatrixDf`.

        Raises:
            typeddfs.df_errors.ClashError: If there is a contradiction in the specification
            typeddfs.df_errors.FormatInsecureError: If :meth:`hash` set an insecure
                hash format and :meth:`secure` was set.

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()
533
|
|
|
|
534
|
|
|
|
535
|
|
|
class TypedDfBuilder(_GenericBuilder): |
536
|
|
|
""" |
537
|
|
|
A builder pattern for :class:`typeddfs.typed_dfs.TypedDf`. |
538
|
|
|
|
539
|
|
|
Example: |
540
|
|
|
``TypedDfBuilder.typed().require("name").build()`` |
541
|
|
|
""" |
542
|
|
|
|
543
|
|
|
def __init__(self, name: str, doc: str | None = None) -> None: |
544
|
|
|
super().__init__(name, doc) |
545
|
|
|
self._clazz = TypedDf |
546
|
|
|
|
547
|
|
|
def series_names( |
548
|
|
|
self, |
549
|
|
|
index: None | bool | str = False, |
550
|
|
|
columns: None | bool | str = False, |
551
|
|
|
) -> __qualname__: |
552
|
|
|
""" |
553
|
|
|
Sets ``pd.DataFrame.index.name`` and/or ``pd.DataFrame.columns.name``. |
554
|
|
|
Valid values are ``False`` to not set (default), ``None`` to set to ``None``, |
555
|
|
|
or a string to set to. |
556
|
|
|
|
557
|
|
|
Returns: |
558
|
|
|
This builder for chaining |
559
|
|
|
""" |
560
|
|
|
self._index_series_name = index |
561
|
|
|
self._column_series_name = columns |
562
|
|
|
return self |
563
|
|
|
|
564
|
|
|
    def build(self) -> type[TypedDf]:
        """
        Builds this type.

        Returns:
            A newly created subclass of :class:`typeddfs.typed_dfs.TypedDf`.

        Raises:
            DfTypeConstructionError: If there is a contradiction in the specification

        .. note ::

            Copies, so this builder can be used to create more types without interference.
        """
        # noinspection PyTypeChecker
        return self._build()
580
|
|
|
|
581
|
|
|
def require(self, *names: str, dtype: type | None = None, index: bool = False) -> __qualname__: |
582
|
|
|
""" |
583
|
|
|
Requires column(s) or index name(s). |
584
|
|
|
DataFrames will fail if they are missing any of these. |
585
|
|
|
|
586
|
|
|
Args: |
587
|
|
|
names: A varargs list of columns or index names |
588
|
|
|
dtype: An automatically applied transformation of the column values using ``.astype`` |
589
|
|
|
index: If True, put these in the index |
590
|
|
|
|
591
|
|
|
Returns: |
592
|
|
|
This builder for chaining |
593
|
|
|
|
594
|
|
|
Raises: |
595
|
|
|
typeddfs.df_errors.ClashError: If a name was already added or is forbidden |
596
|
|
|
""" |
597
|
|
|
self._check(names) |
598
|
|
|
if index: |
599
|
|
|
self._req_meta.extend(names) |
600
|
|
|
else: |
601
|
|
|
self._req_cols.extend(names) |
602
|
|
|
if dtype is not None: |
603
|
|
|
for name in names: |
604
|
|
|
self._dtypes[name] = dtype |
605
|
|
|
return self |
606
|
|
|
|
607
|
|
|
def reserve(self, *names: str, dtype: type | None = None, index: bool = False) -> __qualname__: |
608
|
|
|
""" |
609
|
|
|
Reserves column(s) or index name(s) for optional inclusion. |
610
|
|
|
A reserved column will be accepted even if ``strict`` is set. |
611
|
|
|
A reserved index will be accepted even if ``strict`` is set; |
612
|
|
|
additionally, it will be automatically moved from the list of columns to the list of index names. |
613
|
|
|
|
614
|
|
|
Args: |
615
|
|
|
names: A varargs list of columns or index names |
616
|
|
|
dtype: An automatically applied transformation of the column values using ``.astype`` |
617
|
|
|
index: If True, put these in the index |
618
|
|
|
|
619
|
|
|
Returns: |
620
|
|
|
This builder for chaining |
621
|
|
|
|
622
|
|
|
Raises: |
623
|
|
|
typeddfs.df_errors.ClashError: If a name was already added or is forbidden |
624
|
|
|
""" |
625
|
|
|
self._check(names) |
626
|
|
|
if index: |
627
|
|
|
self._res_meta.extend(names) |
628
|
|
|
else: |
629
|
|
|
self._res_cols.extend(names) |
630
|
|
|
if dtype is not None: |
631
|
|
|
for name in names: |
632
|
|
|
self._dtypes[name] = dtype |
633
|
|
|
return self |
634
|
|
|
|
635
|
|
|
def drop(self, *names: str) -> __qualname__: |
636
|
|
|
""" |
637
|
|
|
Adds columns (and index names) that should be automatically dropped. |
638
|
|
|
|
639
|
|
|
Args: |
640
|
|
|
names: Varargs list of names |
641
|
|
|
|
642
|
|
|
Returns: |
643
|
|
|
This builder for chaining |
644
|
|
|
""" |
645
|
|
|
self._drop.extend(names) |
646
|
|
|
return self |
647
|
|
|
|
648
|
|
|
def strict(self, index: bool = True, cols: bool = True) -> __qualname__: |
649
|
|
|
""" |
650
|
|
|
Disallows any columns or index names not in the lists of reserved/required. |
651
|
|
|
|
652
|
|
|
Args: |
653
|
|
|
index: Disallow additional names in the index |
654
|
|
|
cols: Disallow additional columns |
655
|
|
|
|
656
|
|
|
Returns: |
657
|
|
|
This builder for chaining |
658
|
|
|
""" |
659
|
|
|
self._strict_meta = index |
660
|
|
|
self._strict_cols = cols |
661
|
|
|
return self |
662
|
|
|
|
663
|
|
|
def _check_final(self) -> None: |
664
|
|
|
""" |
665
|
|
|
Final method in the chain. |
666
|
|
|
Creates a new subclass of ``TypedDf``. |
667
|
|
|
|
668
|
|
|
Returns: |
669
|
|
|
The new class |
670
|
|
|
|
671
|
|
|
Raises: |
672
|
|
|
typeddfs.df_errors.ClashError: If there is a contradiction in the specification |
673
|
|
|
""" |
674
|
|
|
all_names = [*self._req_cols, *self._req_meta, *self._res_cols, *self._res_meta] |
675
|
|
|
problem_names = [name for name in all_names if name in self._drop] |
676
|
|
|
if len(problem_names) > 0: |
677
|
|
|
msg = f"Required/reserved column/index names {problem_names} are auto-dropped" |
678
|
|
|
raise ClashError( |
679
|
|
|
msg, |
680
|
|
|
keys=set(problem_names), |
681
|
|
|
) |
682
|
|
|
|
683
|
|
|
def _check(self, names: Sequence[str]) -> None: |
684
|
|
|
if any(name in _AUTO_DROPPED_NAMES for name in names): |
685
|
|
|
msg = f"Columns {','.join(_AUTO_DROPPED_NAMES)} are auto-dropped" |
686
|
|
|
raise ClashError( |
687
|
|
|
msg, |
688
|
|
|
keys=_AUTO_DROPPED_NAMES, |
689
|
|
|
) |
690
|
|
|
if any(name in _FORBIDDEN_NAMES for name in names): |
691
|
|
|
msg = f"{','.join(_FORBIDDEN_NAMES)} are forbidden names" |
692
|
|
|
raise ClashError( |
693
|
|
|
msg, |
694
|
|
|
keys=_FORBIDDEN_NAMES, |
695
|
|
|
) |
696
|
|
|
for name in names: |
697
|
|
|
if name in [*self._req_cols, *self._req_meta, *self._res_cols, *self._res_meta]: |
698
|
|
|
msg = f"Column {name} for {self._name} already exists" |
699
|
|
|
raise ClashError(msg, keys={name}) |
700
|
|
|
|
701
|
|
|
|
702
|
|
|
# Public API of this module: the three concrete builder classes.
__all__ = ["TypedDfBuilder", "MatrixDfBuilder", "AffinityMatrixDfBuilder"]
703
|
|
|
|