1
|
|
|
from abc import ABC, abstractmethod |
2
|
|
|
from io import BytesIO |
3
|
|
|
from typing import override |
4
|
|
|
|
5
|
|
|
import numpy as np |
6
|
|
|
from openpyxl import load_workbook |
7
|
|
|
from openpyxl.worksheet.worksheet import Worksheet |
8
|
|
|
|
9
|
|
|
import pandas as pd |
10
|
|
|
|
11
|
|
|
# future: remove the comment below when stubs for the library below are available |
12
|
|
|
import xlrd # type: ignore |
13
|
|
|
|
14
|
|
|
from src.new_data_processors.common import TableUpdater |
15
|
|
|
|
16
|
|
|
|
17
|
|
|
class ExcelProcessor(TableUpdater, ABC): |
18
|
|
|
def __init__(self) -> None: |
19
|
|
|
super().__init__() |
20
|
|
|
|
21
|
|
|
def process_data(self) -> None: |
22
|
|
|
self.import_data() |
23
|
|
|
self.correct_data() |
24
|
|
|
|
25
|
|
|
@abstractmethod |
26
|
|
|
def import_data(self) -> None: |
27
|
|
|
pass |
28
|
|
|
|
29
|
|
|
def correct_data(self) -> None: |
30
|
|
|
self.correct_data_manually() |
31
|
|
|
self.correct_boolean_values() |
32
|
|
|
self.correct_df_na_values_for_database() |
33
|
|
|
|
34
|
|
|
@abstractmethod |
35
|
|
|
def correct_data_manually(self) -> None: |
36
|
|
|
pass |
37
|
|
|
|
38
|
|
|
@abstractmethod |
39
|
|
|
def correct_boolean_values(self) -> None: |
40
|
|
|
pass |
41
|
|
|
|
42
|
|
|
def correct_df_na_values_for_database(self) -> None: |
43
|
|
|
self.data.replace( |
44
|
|
|
to_replace={ |
45
|
|
|
pd.NA: None, |
46
|
|
|
pd.NaT: None, |
47
|
|
|
np.nan: None, # future: remove this line when https://github.com/pandas-dev/pandas/issues/32265 is fixed |
48
|
|
|
NotImplemented: None, # TODO: remove this line in production |
49
|
|
|
}, |
50
|
|
|
inplace=True, |
51
|
|
|
) |
52
|
|
|
|
53
|
|
|
|
54
|
|
|
class ExcelProcessorSimple(ExcelProcessor, ABC): |
55
|
|
|
def __init__(self) -> None: |
56
|
|
|
super().__init__() |
57
|
|
|
|
58
|
|
|
self._data_to_process: bytes = NotImplemented |
59
|
|
|
|
60
|
|
|
def import_data(self) -> None: |
61
|
|
|
try: |
62
|
|
|
self.data = pd.read_excel(BytesIO(self._data_to_process)) |
63
|
|
|
except xlrd.compdoc.CompDocError: |
64
|
|
|
self.logger.debug("Excel data is corrupted, ignoring it...") |
65
|
|
|
workbook = xlrd.open_workbook( |
66
|
|
|
file_contents=self._data_to_process, |
67
|
|
|
ignore_workbook_corruption=True, |
68
|
|
|
) |
69
|
|
|
self.data = pd.read_excel(workbook) |
70
|
|
|
|
71
|
|
|
def correct_data_manually(self) -> None: |
72
|
|
|
pass |
73
|
|
|
|
74
|
|
|
|
75
|
|
|
class ExcelProcessorWithFormatting(ExcelProcessor, ABC): |
76
|
|
|
def __init__(self, **kwargs) -> None: |
77
|
|
|
super().__init__(**kwargs) |
78
|
|
|
|
79
|
|
|
self._file_to_be_imported: str = NotImplemented |
80
|
|
|
self._data_to_process: list[Worksheet] = NotImplemented |
81
|
|
|
|
82
|
|
|
def import_data(self) -> None: |
83
|
|
|
self._data_to_process = self.get_worksheets(self._file_to_be_imported) |
84
|
|
|
|
85
|
|
|
def get_worksheets(self, xlsx_file_location: str) -> list[Worksheet]: |
86
|
|
|
try: |
87
|
|
|
self.logger.info(f"Loading {xlsx_file_location} started!") |
88
|
|
|
xlsx_workbook = load_workbook( |
89
|
|
|
filename=xlsx_file_location, |
90
|
|
|
data_only=True, |
91
|
|
|
) |
92
|
|
|
self.logger.info(f"{xlsx_file_location} loaded!") |
93
|
|
|
return list(xlsx_workbook.worksheets) |
94
|
|
|
finally: |
95
|
|
|
self.logger.info(f"All worksheets imported from {xlsx_file_location}!") |
96
|
|
|
|
97
|
|
|
def correct_boolean_values(self) -> None: |
98
|
|
|
pass |
99
|
|
|
|
100
|
|
|
@override |
101
|
|
|
def correct_df_na_values_for_database(self) -> None: |
102
|
|
|
pass |
103
|
|
|
|