ExcelProcessorWithFormatting.__init__()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 5
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 4
nop 2
dl 0
loc 5
rs 10
c 0
b 0
f 0
1
from abc import ABC, abstractmethod
2
from io import BytesIO
3
from typing import override
4
5
import numpy as np
6
from openpyxl import load_workbook
7
from openpyxl.worksheet.worksheet import Worksheet
8
9
import pandas as pd
10
11
# future: remove the comment below when stubs for the library below are available
12
import xlrd  # type: ignore
13
14
from src.new_data_processors.common import TableUpdater
15
16
17
class ExcelProcessor(TableUpdater, ABC):
18
    def __init__(self) -> None:
19
        super().__init__()
20
21
    def process_data(self) -> None:
22
        self.import_data()
23
        self.correct_data()
24
25
    @abstractmethod
26
    def import_data(self) -> None:
27
        pass
28
29
    def correct_data(self) -> None:
30
        self.correct_data_manually()
31
        self.correct_boolean_values()
32
        self.correct_df_na_values_for_database()
33
34
    @abstractmethod
35
    def correct_data_manually(self) -> None:
36
        pass
37
38
    @abstractmethod
39
    def correct_boolean_values(self) -> None:
40
        pass
41
42
    def correct_df_na_values_for_database(self) -> None:
43
        self.data.replace(
44
            to_replace={
45
                pd.NA: None,
46
                pd.NaT: None,
47
                np.nan: None,  # future: remove this line when https://github.com/pandas-dev/pandas/issues/32265 is fixed
48
                NotImplemented: None,  # TODO: remove this line in production
49
            },
50
            inplace=True,
51
        )
52
53
54
class ExcelProcessorSimple(ExcelProcessor, ABC):
55
    def __init__(self) -> None:
56
        super().__init__()
57
58
        self._data_to_process: bytes = NotImplemented
59
60
    def import_data(self) -> None:
61
        try:
62
            self.data = pd.read_excel(BytesIO(self._data_to_process))
63
        except xlrd.compdoc.CompDocError:
64
            self.logger.debug("Excel data is corrupted, ignoring it...")
65
            workbook = xlrd.open_workbook(
66
                file_contents=self._data_to_process,
67
                ignore_workbook_corruption=True,
68
            )
69
            self.data = pd.read_excel(workbook)
70
71
    def correct_data_manually(self) -> None:
72
        pass
73
74
75
class ExcelProcessorWithFormatting(ExcelProcessor, ABC):
76
    def __init__(self, **kwargs) -> None:
77
        super().__init__(**kwargs)
78
79
        self._file_to_be_imported: str = NotImplemented
80
        self._data_to_process: list[Worksheet] = NotImplemented
81
82
    def import_data(self) -> None:
83
        self._data_to_process = self.get_worksheets(self._file_to_be_imported)
84
85
    def get_worksheets(self, xlsx_file_location: str) -> list[Worksheet]:
86
        try:
87
            self.logger.info(f"Loading {xlsx_file_location} started!")
88
            xlsx_workbook = load_workbook(
89
                filename=xlsx_file_location,
90
                data_only=True,
91
            )
92
            self.logger.info(f"{xlsx_file_location} loaded!")
93
            return list(xlsx_workbook.worksheets)
94
        finally:
95
            self.logger.info(f"All worksheets imported from {xlsx_file_location}!")
96
97
    def correct_boolean_values(self) -> None:
98
        pass
99
100
    @override
101
    def correct_df_na_values_for_database(self) -> None:
102
        pass
103