Passed
Pull Request — dev (#1369)
by
unknown
01:59
created

data.datasets.mastr.download_mastr_geocoding()   A

Complexity

Conditions 3

Size

Total Lines 18
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 13
dl 0
loc 18
rs 9.75
c 0
b 0
f 0
cc 3
nop 0
1
"""
2
Download Marktstammdatenregister (MaStR) from Zenodo.
3
4
"""
5
6
from pathlib import Path
7
from urllib.request import urlretrieve
8
import os
9
import zipfile
10
11
import pandas as pd
12
13
from egon.data.datasets import Dataset
14
import egon.data.config
15
16
# Working directory (relative to the current working directory) into which
# the MaStR dump archive is downloaded and extracted, and into which the
# preprocessed CSV files of this module are written.
WORKING_DIR_MASTR_NEW = Path(".", "bnetza_mastr", "dump_2025-02-09")
17
18
19
def download_mastr_data():
    """Download the MaStR dump archive from Zenodo.

    The parameters are read from the ``mastr_new`` section of
    ``egon.data.config.datasets()``: ``deposit_id`` selects the Zenodo
    record and ``dump_name`` (with ``.zip`` appended) is the file to fetch.
    The archive is stored in ``WORKING_DIR_MASTR_NEW``, which is created
    if it does not exist yet. If the archive is already present in that
    directory, the download is skipped.

    Returns
    -------
    None
        The archive is written to disk.
    """

    def download(dataset_name, download_dir):
        """Fetch the ZIP dump of ``dataset_name`` into ``download_dir``."""
        print(f"Downloading dataset {dataset_name} to {download_dir} ...")
        # Get parameters from config and set download URL
        data_config = egon.data.config.datasets()[dataset_name]
        zenodo_files_url = (
            f"https://zenodo.org/record/{data_config['deposit_id']}/files/"
        )

        dump_file_name = data_config["dump_name"] + ".zip"
        target_path = download_dir / dump_file_name

        # Look for the archive where it is actually stored. Testing the
        # bare file name would resolve against the current working
        # directory and never find a previous download.
        if not target_path.is_file():
            urlretrieve(zenodo_files_url + dump_file_name, target_path)

    # ``exist_ok=True`` makes a separate existence check unnecessary.
    WORKING_DIR_MASTR_NEW.mkdir(exist_ok=True, parents=True)

    download(dataset_name="mastr_new", download_dir=WORKING_DIR_MASTR_NEW)
42
43
44
def download_mastr_geocoding():
    """Download the MaStR geocoding file from Zenodo.

    The file is stored in the directory ``./mastr_geocoding`` (created if
    necessary) under the name configured as ``dump_geocoding_name`` in the
    ``mastr_new`` section of ``egon.data.config.datasets()``. If a file
    with that name already exists there, the download is skipped.

    Returns
    -------
    None
        The file is written to disk.
    """
    # The URL addresses one specific file and already carries its query
    # string, so it is requested as is. Appending the local file name to it
    # would only corrupt the ``download`` query parameter.
    # NOTE(review): the remote file name is hard-coded here while the local
    # name comes from the config -- confirm that both refer to the same
    # dump.
    zenodo_file_url = (
        "https://zenodo.org/records/17279317/files/"
        "mastr_geocoding_dump_2025-02-09_14783581.gpkg?download=1"
    )
    WORKING_DIR_MASTR_GEOCODING = Path(".", "mastr_geocoding")
    dump_file_name = egon.data.config.datasets()["mastr_new"][
        "dump_geocoding_name"
    ]
    target_path = WORKING_DIR_MASTR_GEOCODING / dump_file_name

    WORKING_DIR_MASTR_GEOCODING.mkdir(exist_ok=True, parents=True)

    if target_path.is_file():
        print("mastr_geocoding was already present. Download skipped")
        return

    print("Downloading dataset mastr_geocoding")
    urlretrieve(zenodo_file_url, target_path)
62
63
64
def extract_and_preprocess_mastr():
    """
    Unpack the downloaded MaStR dump and write harmonized CSV files.

    The ZIP archive ``<dump_name>.zip`` (``dump_name`` is taken from the
    ``mastr_new`` section of ``egon.data.config.datasets()``) is expected
    in ``WORKING_DIR_MASTR_NEW``; it is the file fetched by
    :func:`download_mastr_data`. The function performs these steps:

    1) Extract the archive into ``WORKING_DIR_MASTR_NEW``
    2) Read the raw unit tables ``bnetza_mastr_<technology>_raw.csv`` from
       the folder ``<dump_name>`` for the technologies wind, solar,
       biomass, hydro, gsgk, storage, combustion and nuclear, followed by
       ``bnetza_mastr_locations_extended_raw.csv`` and
       ``bnetza_mastr_grid_connections_raw.csv``
    3) Left-join the grid connection points to the locations, shorten the
       labels in column ``Spannungsebene`` and write the result to
       ``location_elec_generation_raw.csv``
    4) Solar-specific fixes: build ``Standort`` from postal code and town,
       copy ``Bruttoleistung`` to ``Bruttoleistung_extended`` and
       ``InstallierteLeistung`` and rename the inverter power column
    5) For every technology rename ``MastrNummer`` to ``MaStRNummer`` and
       normalize the values of ``Bundesland`` and ``EinheitBetriebsstatus``
    6) Write ``bnetza_mastr_<technology>_cleaned.csv`` for each technology
       to ``WORKING_DIR_MASTR_NEW`` (UTF-8 encoding, ``index=None``)

    Returns
    -------
    None
        Results are written to disk as CSV files (see list above).
    """
    dump_name = egon.data.config.datasets()["mastr_new"]["dump_name"]
    raw_dir = WORKING_DIR_MASTR_NEW / dump_name
    archive_path = WORKING_DIR_MASTR_NEW / (dump_name + ".zip")

    # Step 1: unpack the dump next to the archive
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(WORKING_DIR_MASTR_NEW)

    # Step 2: load the raw tables. The tuple order fixes the order in which
    # files are read here and written in the export loop at the end.
    technologies = (
        "wind",
        "solar",
        "biomass",
        "hydro",
        "gsgk",
        "storage",
        "combustion",
        "nuclear",
    )
    units = {
        tech: pd.read_csv(raw_dir / f"bnetza_mastr_{tech}_raw.csv")
        for tech in technologies
    }
    locations = pd.read_csv(
        raw_dir / "bnetza_mastr_locations_extended_raw.csv"
    )
    grid_connections = pd.read_csv(
        raw_dir / "bnetza_mastr_grid_connections_raw.csv"
    )

    # Step 3: locations joined with their grid connection points
    voltage_level_renaming = {
        "Niederspannung (= Hausanschluss/Haushaltsstrom)": "Niederspannung",
        "Umspannebene Mittelspannung/Niederspannung": "UmspannungZurNiederspannung",
        "Umspannebene Hochspannung/Mittelspannung": "UmspannungZurMittelspannung",
        "Umspannebene Höchstspannung/Hochspannung": "UmspannungZurHochspannung",
    }
    loc_vlevel = locations.merge(
        grid_connections,
        left_on="Netzanschlusspunkte",
        right_on="NetzanschlusspunktMastrNummer",
        how="left",
    )
    loc_vlevel = loc_vlevel.replace(
        {"Spannungsebene": voltage_level_renaming}
    )

    id_column_renaming = {"MastrNummer": "MaStRNummer"}
    loc_vlevel.rename(columns=id_column_renaming).to_csv(
        WORKING_DIR_MASTR_NEW / "location_elec_generation_raw.csv",
        index=None,
        encoding="UTF-8",
    )

    # Step 4: solar fixes, applied in place so that the export loop below
    # picks them up from ``units``
    solar = units["solar"]
    solar["Standort"] = solar.Postleitzahl.apply(str) + " " + solar.Ort
    solar["Bruttoleistung_extended"] = solar.Bruttoleistung
    solar["InstallierteLeistung"] = solar.Bruttoleistung
    solar.rename(
        columns={
            "ZugeordneteWirkleistungWechselrichter": "zugeordneteWirkleistungWechselrichter"
        },
        inplace=True,
    )

    # Step 5: value normalization shared by all technologies
    values_renaming = {
        "Bundesland": {
            "Thüringen": "Thueringen",
            "Schleswig-Holstein": "SchleswigHolstein",
            "Nordrhein-Westfalen": "NordrheinWestfalen",
            "Rheinland-Pfalz": "RheinlandPfalz",
            "Baden-Württemberg": "BadenWuerttemberg",
            "Sachsen-Anhalt": "SachsenAnhalt",
            "Mecklenburg-Vorpommern": "MecklenburgVorpommern",
            "Ausschließliche Wirtschaftszone": "AusschliesslicheWirtschaftszone",
        },
        "EinheitBetriebsstatus": {
            "In Betrieb": "InBetrieb",
            "Vorübergehend stillgelegt": "VoruebergehendStillgelegt",
            "Endgültig stillgelegt": "DauerhaftStillgelegt",
            "In Planung": "InPlanung",
        },
    }

    # Step 6: export one cleaned file per technology
    for tech, frame in units.items():
        cleaned = frame.rename(columns=id_column_renaming).replace(
            values_renaming
        )
        cleaned.to_csv(
            WORKING_DIR_MASTR_NEW / f"bnetza_mastr_{tech}_cleaned.csv",
            index=None,
            encoding="UTF-8",
        )
248
249
250
class mastr_data_setup(Dataset):
    """
    Dataset that downloads and preprocesses Marktstammdatenregister (MaStR)
    data from Zenodo.

    *Dependencies*
      * :py:func:`Setup <egon.data.datasets.database.setup>`

    The downloaded data incorporates two different datasets:

    Dump 2021-04-30
      * Source: https://zenodo.org/records/10480930
      * Used technologies: PV plants, wind turbines, biomass, hydro plants,
        combustion, nuclear, gsgk, storage
      * Data is further processed in the :py:class:`PowerPlants
        <egon.data.datasets.power_plants.PowerPlants>` dataset

    Dump 2022-11-17
      * Source: https://zenodo.org/records/10480958
      * Used technologies: PV plants, wind turbines, biomass, hydro plants
      * Data is further processed in module :py:mod:`mastr
        <egon.data.datasets.power_plants.mastr>` and :py:class:`PowerPlants
        <egon.data.datasets.power_plants.PowerPlants>`

    See documentation section :ref:`mastr-ref` for more information.

    .. note::
       NOTE(review): the dump dates listed above differ from the working
       directory ``dump_2025-02-09`` used by this module -- confirm which
       dumps are current and update this list accordingly.

    """

    #:
    name: str = "MastrData"
    #:
    version: str = "0.0.3"
    #:
    tasks = (
        download_mastr_data,
        extract_and_preprocess_mastr,
        download_mastr_geocoding,
    )

    def __init__(self, dependencies):
        # Hand the class-level metadata and the caller's dependencies on to
        # the ``Dataset`` base class.
        dataset_kwargs = {
            "name": self.name,
            "version": self.version,
            "dependencies": dependencies,
            "tasks": self.tasks,
        }
        super().__init__(**dataset_kwargs)
295