companies.CompaniesUpdater.__init__()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 8
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
from datetime import datetime
2
import re
3
from typing import ClassVar, final
4
5
from sqlalchemy import (
6
    Boolean,
7
    Column,
8
    Date,
9
    ForeignKey,
10
    MetaData,
11
    SmallInteger,
12
    String,
13
    Table,
14
    text,
15
)
16
17
from src.new_data_processors.common import (
18
    UICTableUpdater,
19
)
20
from src.new_data_processors.common_excel_processors import ExcelProcessorSimple
21
from src.new_data_processors.helper_table_updaters.countries import CountriesUpdater
22
23
24
@final
25
# TODO: add logging
26
class CompaniesUpdater(ExcelProcessorSimple, UICTableUpdater):
27
    TABLE_NAME: ClassVar[str] = "companies"
28
    database_metadata: ClassVar[MetaData] = MetaData()
29
30
    table: ClassVar[Table] = Table(
31
        TABLE_NAME,
32
        database_metadata,
33
        Column(
34
            name="code_uic",
35
            type_=SmallInteger,
36
            nullable=False,
37
            index=True,
38
            primary_key=True,
39
        ),
40
        Column(name="short_name", type_=String(255)),
41
        Column(name="name", type_=String(255), nullable=False),
42
        Column(
43
            "country_code_iso",
44
            String(2),
45
            ForeignKey(CountriesUpdater.table.c.code_iso),
46
            nullable=False,
47
        ),
48
        Column(name="allocation_date", type_=Date),
49
        Column(name="modified_date", type_=Date),
50
        Column(name="begin_of_validity", type_=Date),
51
        Column(name="end_of_validity", type_=Date),
52
        Column(name="freight", type_=Boolean, nullable=False),
53
        Column(name="passenger", type_=Boolean, nullable=False),
54
        Column(name="infrastructure", type_=Boolean, nullable=False),
55
        Column(name="holding", type_=Boolean),
56
        Column(name="integrated", type_=Boolean),
57
        Column(name="other", type_=Boolean, nullable=False),
58
        Column(name="url", type_=String(255)),
59
    )
60
61
    def __init__(self) -> None:
62
        super().__init__()
63
64
        self.DATA_URL = f"{self.DATA_BASE_URL}3023"
65
66
        self._data_to_process = self.get_data(self.DATA_URL)
67
68
        self.logger.info(f"{self.__class__.__name__} initialized!")
69
70
    def correct_column_names(self) -> None:
71
        self.replace_nonword_with_underscore()
72
        self.rename_columns_manually()
73
74
    def replace_nonword_with_underscore(self) -> None:
75
        # TODO: report wrong display of newlines in DataFrame view to pandas developers
76
        self.data.rename(
77
            columns=lambda x: re.sub(
78
                pattern=r"[\n ]",
79
                repl="_",
80
                string=x,
81
            ).lower(),
82
            inplace=True,
83
        )
84
85
    def rename_columns_manually(self) -> None:
86
        self.data.rename(
87
            columns={
88
                "code": "code_uic",
89
                "full_name": "name",
90
                "country": "country_code_iso",
91
                "infra-_structure": "infrastructure",
92
                "url": "url",
93
            },
94
            inplace=True,
95
        )
96
97
    def delete_data(self) -> None:
98
        self.remove_invalid_and_irrelevant_companies()
99
        self.drop_unnecessary_columns()
100
        self.remove_invalid_companies()
101
102
    def remove_invalid_and_irrelevant_companies(self) -> None:
103
        self.data.dropna(
104
            subset=[
105
                "allocation_date",
106
                "infrastructure",
107
            ],
108
            inplace=True,
109
        )
110
111
    def drop_unnecessary_columns(self) -> None:
112
        self.data.drop(
113
            columns=[
114
                "request_date",
115
                "recent",
116
            ],
117
            inplace=True,
118
        )
119
120
    def remove_invalid_companies(self) -> None:
121
        # future: report bug below to JetBrains or pandas developers
122
        # noinspection PyUnusedLocal
123
        today = datetime.today()
124
        self.data.query(
125
            expr="(begin_of_validity.isnull() or begin_of_validity <= @today) and (end_of_validity.isnull() or @today <= end_of_validity)",
126
            inplace=True,
127
        )
128
129
    def correct_boolean_values(self) -> None:
130
        boolean_columns = [
131
            "freight",
132
            "passenger",
133
            "infrastructure",
134
            "holding",
135
            "integrated",
136
            "other",
137
        ]
138
        for column in boolean_columns:
139
            self.data[column] = self.data[column].apply(lambda x: x == "x" or x == "X")
140
141
    def add_data(self) -> None:
142
        with self.database.engine.begin() as connection:
143
            queries = [
144
                """
145
                insert ignore into companies (
146
                    code_uic,
147
                    short_name,
148
                    name,
149
                    country_code_iso,
150
                    allocation_date,
151
                    modified_date,
152
                    begin_of_validity,
153
                    end_of_validity,
154
                    freight,
155
                    passenger,
156
                    infrastructure,
157
                    holding,
158
                    integrated,
159
                    other,
160
                    url
161
                )
162
                values (
163
                    :code_uic,
164
                    :short_name,
165
                    :name,
166
                    :country_code_iso,
167
                    :allocation_date,
168
                    :modified_date,
169
                    :begin_of_validity,
170
                    :end_of_validity,
171
                    :freight,
172
                    :passenger,
173
                    :infrastructure,
174
                    :holding,
175
                    :integrated,
176
                    :other,
177
                    :url
178
                )
179
                """,
180
                """
181
                update companies
182
                set
183
                    short_name = :short_name,
184
                    name = :name,
185
                    country_code_iso = :country_code_iso,
186
                    allocation_date = :allocation_date,
187
                    modified_date = :modified_date,
188
                    begin_of_validity = :begin_of_validity,
189
                    end_of_validity = :end_of_validity,
190
                    freight = :freight,
191
                    passenger = :passenger,
192
                    infrastructure = :infrastructure,
193
                    holding = :holding,
194
                    integrated = :integrated,
195
                    other = :other,
196
                    url = :url
197
                where
198
                    code_uic = :code_uic
199
                """,
200
            ]
201
202
            for index, row in self.data.iterrows():
203
                for query in queries:
204
                    connection.execute(
205
                        text(query),
206
                        row.to_dict(),
207
                    )
208