data.datasets.power_plants.mastr.import_mastr() - Code Metrics - Inspection of "Features/#1095 mastr status quo" - openego/eGon-data - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — dev (#1112)

unknown

created 2023-03-20 08:11 UTC

data.datasets.power_plants.mastr.import_mastr() D

↳ Parent: data.datasets.power_plants.mastr

Complexity

Conditions

Size

Total Lines	372
Code Lines	247

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	247
dl	0
loc	372
rs	4.6666
c	0
b	0
f	0
cc	9
nop	0

How to fix Long Method

"""Import MaStR dataset and write to DB tables

Data dump from Marktstammdatenregister (2022-11-17) is imported into the
database. Only some technologies are taken into account and written to the
following tables:

* PV: table `supply.egon_power_plants_pv`
* wind turbines: table `supply.egon_power_plants_wind`
* biomass/biogas plants: table `supply.egon_power_plants_biomass`
* hydro plants: table `supply.egon_power_plants_hydro`

Handling of empty source data in MaStr dump:
* `voltage_level`: inferred based on nominal power (`capacity`) using the
  ranges from
  https://redmine.iks.cs.ovgu.de/oe/projects/ego-n/wiki/Definition_of_thresholds_for_voltage_level_assignment
  which results in True in column `voltage_level_inferred`. Remaining datasets
  are set to -1 (which only occurs if `capacity` is empty).
* `supply.egon_power_plants_*.bus_id`: set to -1 (only if not within grid
  districts or no geom available, e.g. for units with nom. power <30 kW)
* `supply.egon_power_plants_hydro.plant_type`: NaN

The data is used especially for the generation of status quo grids by ding0.
"""
from __future__ import annotations

from pathlib import Path

from loguru import logger
import geopandas as gpd
import numpy as np
import pandas as pd

from egon.data import config, db
from egon.data.datasets.mastr import WORKING_DIR_MASTR_NEW
from egon.data.datasets.power_plants.mastr_db_classes import (
    EgonMastrGeocoded,
    EgonPowerPlantsBiomass,
    EgonPowerPlantsCombustion,
    EgonPowerPlantsGsgk,
    EgonPowerPlantsHydro,
    EgonPowerPlantsNuclear,
    EgonPowerPlantsPv,
    EgonPowerPlantsStorage,
    EgonPowerPlantsWind,
)
from egon.data.datasets.power_plants.pv_rooftop_buildings import (
    federal_state_data,
)

# True when the "--dataset-boundary" setting is "Everything". Any other value
# is treated as test mode by import_mastr(), which then keeps only units
# located in Schleswig-Holstein.
TESTMODE_OFF = (
    config.settings()["egon-data"]["--dataset-boundary"] == "Everything"
)


def isfloat(num: str) -> bool:
    """
    Determine if a value can be converted to float.

    Parameters
    -----------
    num : str
        String to parse.

    Returns
    -------
    bool
        True if ``float(num)`` succeeds, False otherwise. Values that
        ``float()`` rejects with a TypeError (e.g. ``None``) also yield
        False instead of propagating the exception.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: unsupported object type
        return False

    return True


def zip_and_municipality_from_standort(
    standort: str,
) -> tuple[str, bool]:
    """
    Extract the zip code and everything following it from a Standort string.

    The string is split on whitespace and the first token that is exactly
    five characters long and numeric is taken as the zip code.

    Parameters
    -----------
    standort : str
        Standort as given from MaStR data.

    Returns
    -------
    tuple of (str, bool)
        If a zip code was found: the tokens from the zip code onwards joined
        by single spaces, and True. Otherwise: the unchanged input string and
        False (a warning is logged in this case).
    """
    tokens = standort.split()

    # Position of the first token that looks like a zip code, if any
    zip_pos = next(
        (
            pos
            for pos, token in enumerate(tokens)
            if len(token) == 5 and token.isnumeric()
        ),
        None,
    )

    if zip_pos is None:
        logger.warning(
            "Couldn't identify zip code. This entry will be dropped."
            f" Original standort: {standort}."
        )

        return standort, False

    return " ".join(tokens[zip_pos:]), True


def infer_voltage_level(
    units_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Infer missing voltage levels from the generator capacity.

    Rows without a voltage level get one assigned based on the value in
    column `Nettonennleistung` and are marked with True in the new column
    `voltage_level_inferred`. Rows that lack both voltage level and capacity
    cannot be inferred: their voltage level stays NaN and they are not
    marked as inferred.

    The frame passed in is modified in place and returned.

    Parameters
    -----------
    units_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing units with voltage levels from MaStR.
        Requires the columns `voltage_level` and `Nettonennleistung`.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with voltage levels filled in where a capacity is
        available and the additional column `voltage_level_inferred`.
    """

    def voltage_levels(p: float) -> int:
        # Upper capacity bounds (inclusive) per voltage level
        if p <= 100:
            return 7
        elif p <= 200:
            return 6
        elif p <= 5500:
            return 5
        elif p <= 20000:
            return 4
        elif p <= 120000:
            return 3
        return 1

    capacity = units_gdf.Nettonennleistung

    # NaN capacities must be excluded explicitly: every comparison with NaN
    # is False, so they would otherwise fall through to voltage level 1.
    mask = units_gdf.voltage_level.isna() & capacity.notna()

    units_gdf["voltage_level_inferred"] = mask
    units_gdf.loc[mask, "voltage_level"] = capacity.loc[mask].apply(
        voltage_levels
    )

    return units_gdf


# MaStR column names mapped to the DB column names. "all" holds the columns
# shared by every technology, the remaining keys the technology-specific ones.
_COLS_MAPPING = {
    "all": {
        "EinheitMastrNummer": "gens_id",
        "EinheitBetriebsstatus": "status",
        "Inbetriebnahmedatum": "commissioning_date",
        "Postleitzahl": "postcode",
        "Ort": "city",
        "Gemeinde": "municipality",
        "Bundesland": "federal_state",
        "Nettonennleistung": "capacity",
        "Einspeisungsart": "feedin_type",
    },
    "pv": {
        "Lage": "site_type",
        "Standort": "site",
        "Nutzungsbereich": "usage_sector",
        "Hauptausrichtung": "orientation_primary",
        "HauptausrichtungNeigungswinkel": "orientation_primary_angle",
        "Nebenausrichtung": "orientation_secondary",
        "NebenausrichtungNeigungswinkel": "orientation_secondary_angle",
        "EinheitlicheAusrichtungUndNeigungswinkel": "orientation_uniform",
        "AnzahlModule": "module_count",
        "zugeordneteWirkleistungWechselrichter": "capacity_inverter",
    },
    "wind": {
        "Lage": "site_type",
        "Hersteller": "manufacturer_name",
        "Typenbezeichnung": "type_name",
        "Nabenhoehe": "hub_height",
        "Rotordurchmesser": "rotor_diameter",
    },
    "biomass": {
        "Technologie": "technology",
        "Hauptbrennstoff": "main_fuel",
        "Biomasseart": "fuel_type",
        "ThermischeNutzleistung": "th_capacity",
    },
    "hydro": {
        "ArtDerWasserkraftanlage": "plant_type",
        "ArtDesZuflusses": "water_origin",
    },
    "combustion": {
        "Energietraeger": "carrier",
        "Hauptbrennstoff": "main_fuel",
        "WeitererHauptbrennstoff": "other_main_fuel",
        "Technologie": "technology",
        "ThermischeNutzleistung": "th_capacity",
    },
    "gsgk": {
        "Energietraeger": "carrier",
        "Technologie": "technology",
    },
    "nuclear": {
        "Energietraeger": "carrier",
        "Technologie": "technology",
    },
    "storage": {
        "Energietraeger": "carrier",
        "Technologie": "technology",
        "Batterietechnologie": "battery_type",
        "Pumpspeichertechnologie": "pump_storage_type",
    },
}

# MaStR voltage level names mapped to numeric voltage levels.
_VLEVEL_MAPPING = {
    "Höchstspannung": 1,
    "UmspannungZurHochspannung": 2,
    "Hochspannung": 3,
    "UmspannungZurMittelspannung": 4,
    "Mittelspannung": 5,
    "UmspannungZurNiederspannung": 6,
    "Niederspannung": 7,
}

# DB table class per technology; the key order defines the import order.
_TARGET_TABLES = {
    "pv": EgonPowerPlantsPv,
    "wind": EgonPowerPlantsWind,
    "biomass": EgonPowerPlantsBiomass,
    "hydro": EgonPowerPlantsHydro,
    "combustion": EgonPowerPlantsCombustion,
    "gsgk": EgonPowerPlantsGsgk,
    "nuclear": EgonPowerPlantsNuclear,
    "storage": EgonPowerPlantsStorage,
}


def _import_geocoding(engine) -> gpd.GeoDataFrame:
    """Read the geocoding results, write them to the DB and return them.

    Raises
    ------
    AssertionError
        If the deposit ID encoded in the geocoding file name differs from
        the deposit ID of the MaStR dataset in the config.
    """
    cfg = config.datasets()["mastr_new"]
    path_parts = cfg["geocoding_path"]
    path = Path(*["."] + path_parts).resolve()
    # NOTE(review): takes the first directory entry; iterdir() does not
    # guarantee an order, so this presumably relies on the directory holding
    # exactly one file - confirm.
    path = list(path.iterdir())[0]

    # The deposit ID is the last "_"-separated part of the file name stem
    deposit_id_geocoding = int(path.parts[-1].split(".")[0].split("_")[-1])
    deposit_id_mastr = cfg["deposit_id"]

    if deposit_id_geocoding != deposit_id_mastr:
        raise AssertionError(
            f"The zenodo (sandbox) deposit ID {deposit_id_mastr} for the MaStR"
            f" dataset is not matching with the geocoding version "
            f"{deposit_id_geocoding}. Make sure to hermonize the data. When "
            f"the MaStR dataset is updated also update the geocoding and "
            f"update the egon data bundle. The geocoding can be done using: "
            f"https://github.com/RLI-sandbox/mastr-geocoding"
        )

    geocoding_gdf = gpd.read_file(path)

    # remove failed requests
    geocoding_gdf = geocoding_gdf.loc[geocoding_gdf.geometry.is_valid]

    # recreate the table so that reruns start from a clean state
    EgonMastrGeocoded.__table__.drop(bind=engine, checkfirst=True)
    EgonMastrGeocoded.__table__.create(bind=engine, checkfirst=True)

    geocoding_gdf.to_postgis(
        name=EgonMastrGeocoded.__tablename__,
        con=engine,
        if_exists="append",
        schema=EgonMastrGeocoded.__table_args__["schema"],
        index=True,
    )

    return geocoding_gdf


def _add_zip_and_municipality(
    units: gpd.GeoDataFrame, tech: str
) -> gpd.GeoDataFrame:
    """Add column `zip_and_municipality`, the key for the geocoding lookup.

    The key is built from `Postleitzahl` and `Gemeinde` where both are
    usable; otherwise it is parsed from `Standort` if that column exists.
    Rows for which neither works keep NaN. `units` is modified in place.
    """
    mask = (
        units.Postleitzahl.apply(isfloat)
        & ~units.Postleitzahl.isna()
        & ~units.Gemeinde.isna()
    )
    # object dtype keeps the merge key type stable even if no row gets a
    # string assigned
    units["zip_and_municipality"] = pd.Series(
        np.nan, index=units.index, dtype="object"
    )
    ok_units = units.loc[mask]

    units.loc[mask, "zip_and_municipality"] = (
        ok_units.Postleitzahl.astype(int).astype(str).str.zfill(5)
        + " "
        + ok_units.Gemeinde.astype(str).str.strip()
        + ", Deutschland"
    )

    # get zip and municipality from Standort
    parse_df = units.loc[~mask]

    if not parse_df.empty and "Standort" in parse_df.columns:
        init_len = len(parse_df)

        logger.info(
            f"Parsing ZIP code and municipality from Standort for "
            f"{init_len} values for {tech}."
        )

        parsed = pd.DataFrame(
            parse_df.Standort.astype(str)
            .apply(zip_and_municipality_from_standort)
            .tolist(),
            index=parse_df.index,
            columns=["zip_and_municipality", "found"],
        )
        # keep only the rows for which a zip code was identified
        parsed = parsed.loc[parsed.found.astype(bool)]

        if not parsed.empty:
            # NOTE(review): unlike the keys built above, these strings get no
            # ", Deutschland" suffix - confirm they can match the geocoding
            # keys.
            units.loc[
                parsed.index, "zip_and_municipality"
            ] = parsed.zip_and_municipality

    return units


def import_mastr() -> None:
    """Import MaStR data into database

    The geocoding results are written to the DB first. Then, for each
    technology, the units are read from CSV, filtered (country, commissioning
    date, operating status, Schleswig-Holstein only in test mode), enriched
    with voltage level, geometry and bus id, and appended to the respective
    target table.

    Raises
    ------
    AssertionError
        If the deposit IDs of the MaStR dataset and the geocoding differ.
    """
    engine = db.engine()

    # import geocoded data
    geocoding_gdf = _import_geocoding(engine)
    # lookup table used to fill in missing geometries (same for all techs)
    geocoding_lookup = geocoding_gdf[
        ["zip_and_municipality", "geometry"]
    ].rename(columns={"geometry": "temp"})

    cfg = config.datasets()["power_plants"]

    source_files = {
        tech: WORKING_DIR_MASTR_NEW / cfg["sources"][f"mastr_{tech}"]
        for tech in _TARGET_TABLES
    }

    # import locations
    locations = pd.read_csv(
        WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_location"],
        index_col=None,
    )

    # import grid districts
    mv_grid_districts = db.select_geodataframe(
        f"""
        SELECT * FROM {cfg['sources']['egon_mv_grid_district']}
        """,
        epsg=4326,
    )

    # get boundary (identical for every technology, hence computed once)
    boundary = federal_state_data(geocoding_gdf.crs).dissolve().at[0, "geom"]

    # reference date from cfg (eGon2021 scenario)
    ts = pd.Timestamp(config.datasets()["mastr_new"]["egon2021_date_max"])

    # import units
    for tech, target_table in _TARGET_TABLES.items():
        # read units; the columns keep their MaStR names until the
        # reformatting step further below
        logger.info(f"===== Importing MaStR dataset: {tech} =====")
        logger.debug("Reading CSV and filtering data...")
        units = pd.read_csv(
            source_files[tech],
            usecols=(
                ["LokationMastrNummer", "Laengengrad", "Breitengrad", "Land"]
                + list(_COLS_MAPPING["all"].keys())
                + list(_COLS_MAPPING[tech].keys())
            ),
            index_col=None,
            dtype={"Postleitzahl": str},
            low_memory=False,
        )

        # drop units outside of Germany
        len_old = len(units)
        units = units.loc[units.Land == "Deutschland"]
        logger.debug(
            f"{len_old - len(units)} units outside of Germany dropped..."
        )

        # drop units installed after reference date
        len_old = len(units)
        units = units.loc[pd.to_datetime(units.Inbetriebnahmedatum) <= ts]
        logger.debug(
            f"{len_old - len(units)} units installed after {ts} dropped..."
        )

        # drop not operating units
        len_old = len(units)
        units = units.loc[
            units.EinheitBetriebsstatus.isin(
                ["InBetrieb", "VoruebergehendStillgelegt"]
            )
        ]
        logger.debug(f"{len_old - len(units)} not operating units dropped...")

        # filter for SH units if in testmode
        if not TESTMODE_OFF:
            logger.info(
                "TESTMODE: Dropping all units outside of Schleswig-Holstein..."
            )
            units = units.loc[units.Bundesland == "SchleswigHolstein"]

        # merge and rename voltage level
        logger.debug("Merging with locations and allocate voltage level...")
        units = units.merge(
            locations[["MaStRNummer", "Spannungsebene"]],
            left_on="LokationMastrNummer",
            right_on="MaStRNummer",
            how="left",
        )
        # convert voltage levels to numbers
        units["voltage_level"] = units.Spannungsebene.replace(_VLEVEL_MAPPING)
        # set voltage level for nan values
        units = infer_voltage_level(units)

        # add geometry
        logger.debug("Adding geometries...")
        units = gpd.GeoDataFrame(
            units,
            geometry=gpd.points_from_xy(
                units["Laengengrad"], units["Breitengrad"], crs=4326
            ),
            crs=4326,
        )

        # A unit needs a geocoded geometry if either coordinate is missing
        # or the resulting point is invalid.
        units["geometry_geocoded"] = (
            units.Laengengrad.isna()
            | units.Breitengrad.isna()
            | ~units.geometry.is_valid
        )

        units_wo_geom = units["geometry_geocoded"].sum()

        logger.debug(
            f"{units_wo_geom}/{len(units)} units do not have a geometry!"
            " Adding geocoding results."
        )

        # determine zip and municipality string
        units = _add_zip_and_municipality(units, tech)

        # add geocoding to missing
        units = units.merge(
            right=geocoding_lookup,
            how="left",
            on="zip_and_municipality",
        )

        units.loc[units.geometry_geocoded, "geometry"] = units.loc[
            units.geometry_geocoded, "temp"
        ]

        init_len = len(units)

        logger.info(
            "Dropping units outside boundary by geometry or without geometry"
            "..."
        )

        units.dropna(subset=["geometry"], inplace=True)

        # copy so that the in-place operations below act on an own frame
        units = units.loc[units.geometry.within(boundary)].copy()

        # guard against a technology without any units (e.g. in testmode)
        n_dropped = init_len - len(units)
        share_dropped = (n_dropped / init_len) * 100 if init_len else 0.0
        logger.debug(
            f"{n_dropped}/{init_len} " f"({share_dropped: g} %) dropped."
        )

        # drop unnecessary and rename columns
        logger.debug("Reformatting...")
        units.drop(
            columns=[
                "LokationMastrNummer",
                "MaStRNummer",
                "Laengengrad",
                "Breitengrad",
                "Spannungsebene",
                "Land",
                "temp",
            ],
            inplace=True,
        )
        mapping = _COLS_MAPPING["all"].copy()
        mapping.update(_COLS_MAPPING[tech])
        mapping.update({"geometry": "geom"})
        units.rename(columns=mapping, inplace=True)
        # voltage levels that could not be determined are written as -1
        units["voltage_level"] = units.voltage_level.fillna(-1).astype(int)

        units.set_geometry("geom", inplace=True)
        units["id"] = range(len(units))

        # change capacity unit: kW to MW
        for col in ("capacity", "capacity_inverter", "th_capacity"):
            if col in units.columns:
                units[col] = units[col] / 1e3

        # assign bus ids
        logger.debug("Assigning bus ids...")
        units = units.assign(
            bus_id=units.loc[~units.geom.x.isna()]
            .sjoin(mv_grid_districts[["bus_id", "geom"]], how="left")
            .drop(columns=["index_right"])
            .bus_id
        )
        # units outside of all grid districts get bus id -1
        units["bus_id"] = units.bus_id.fillna(-1).astype(int)

        # write to DB
        logger.info(f"Writing {len(units)} units to DB...")

        units.to_postgis(
            name=target_table.__tablename__,
            con=engine,
            if_exists="append",
            schema=target_table.__table_args__["schema"],
        )


1		"""Import MaStR dataset and write to DB tables
2
3		Data dump from Marktstammdatenregister (2022-11-17) is imported into the
4		database. Only some technologies are taken into account and written to the
5		following tables:
6
7		* PV: table `supply.egon_power_plants_pv`
8		* wind turbines: table `supply.egon_power_plants_wind`
9		* biomass/biogas plants: table `supply.egon_power_plants_biomass`
10		* hydro plants: table `supply.egon_power_plants_hydro`
11
12		Handling of empty source data in MaStr dump:
13		* `voltage_level`: inferred based on nominal power (`capacity`) using the
14		ranges from
15		https://redmine.iks.cs.ovgu.de/oe/projects/ego-n/wiki/Definition_of_thresholds_for_voltage_level_assignment
16		which results in True in column `voltage_level_inferred`. Remaining datasets
17		are set to -1 (which only occurs if `capacity` is empty).
18		* `supply.egon_power_plants_*.bus_id`: set to -1 (only if not within grid
19		districts or no geom available, e.g. for units with nom. power <30 kW)
20		* `supply.egon_power_plants_hydro.plant_type`: NaN
21
22		The data is used especially for the generation of status quo grids by ding0.
23		"""
24		from __future__ import annotations
25
26		from pathlib import Path
27
28		from loguru import logger
29		import geopandas as gpd
30		import numpy as np
31		import pandas as pd
32
33		from egon.data import config, db
34		from egon.data.datasets.mastr import WORKING_DIR_MASTR_NEW
35		from egon.data.datasets.power_plants.mastr_db_classes import (
36		EgonMastrGeocoded,
37		EgonPowerPlantsBiomass,
38		EgonPowerPlantsCombustion,
39		EgonPowerPlantsGsgk,
40		EgonPowerPlantsHydro,
41		EgonPowerPlantsNuclear,
42		EgonPowerPlantsPv,
43		EgonPowerPlantsStorage,
44		EgonPowerPlantsWind,
45		)
46		from egon.data.datasets.power_plants.pv_rooftop_buildings import (
47		federal_state_data,
48		)
49
50		TESTMODE_OFF = (
51		config.settings()["egon-data"]["--dataset-boundary"] == "Everything"
52		)
53
54
55		def isfloat(num: str):
56		"""
57		Determine if string can be converted to float.
58		Parameters
59		-----------
60		num : str
61		String to parse.
62		Returns
63		-------
64		bool
65		Returns True in string can be parsed to float.
66		"""
67		try:
68		float(num)
69		return True
70		except ValueError:
71		return False
72
73
74		def zip_and_municipality_from_standort(
75		standort: str,
76		) -> tuple[str, bool]:
77		"""
78		Get zip code and municipality from Standort string split into a list.
79		Parameters
80		-----------
81		standort : str
82		Standort as given from MaStR data.
83		Returns
84		-------
85		str
86		Standort with only the zip code and municipality
87		as well a ', Germany' added.
88		"""
89		standort_list = standort.split()
90
91		found = False
92		count = 0
93
94		for count, elem in enumerate(standort_list):
95		if len(elem) != 5:
96		continue
97		if not elem.isnumeric():
98		continue
99
100		found = True
101
102		break
103
104		if found:
105		cleaned_str = " ".join(standort_list[count:])
106
107		return cleaned_str, found
108
109		logger.warning(
110		"Couldn't identify zip code. This entry will be dropped."
111		f" Original standort: {standort}."
112		)
113
114		return standort, found
115
116
117	View Code Duplication	def infer_voltage_level(
		0 ignored issues – show Duplication introduced 2022-12-07 16:34 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
118		units_gdf: gpd.GeoDataFrame,
119		) -> gpd.GeoDataFrame:
120		"""
121		Infer nan values in voltage level derived from generator capacity to
122		the power plants.
123
124		Parameters
125		-----------
126		units_gdf : geopandas.GeoDataFrame
127		GeoDataFrame containing units with voltage levels from MaStR
128		Returnsunits_gdf: gpd.GeoDataFrame
129		-------
130		geopandas.GeoDataFrame
131		GeoDataFrame containing units all having assigned a voltage level.
132		"""
133
134		def voltage_levels(p: float) -> int:
135		if p <= 100:
136		return 7
137		elif p <= 200:
138		return 6
139		elif p <= 5500:
140		return 5
141		elif p <= 20000:
142		return 4
143		elif p <= 120000:
144		return 3
145		return 1
146
147		units_gdf["voltage_level_inferred"] = False
148		mask = units_gdf.voltage_level.isna()
149		units_gdf.loc[mask, "voltage_level_inferred"] = True
150		units_gdf.loc[mask, "voltage_level"] = units_gdf.loc[
151		mask
152		].Nettonennleistung.apply(voltage_levels)
153
154		return units_gdf
155
156
157		def import_mastr() -> None:
158		"""Import MaStR data into database"""
159		engine = db.engine()
160
161		# import geocoded data
162		cfg = config.datasets()["mastr_new"]
163		path_parts = cfg["geocoding_path"]
164		path = Path(*["."] + path_parts).resolve()
165		path = list(path.iterdir())[0]
166
167		deposit_id_geocoding = int(path.parts[-1].split(".")[0].split("_")[-1])
168		deposit_id_mastr = cfg["deposit_id"]
169
170		if deposit_id_geocoding != deposit_id_mastr:
171		raise AssertionError(
172		f"The zenodo (sandbox) deposit ID {deposit_id_mastr} for the MaStR"
173		f" dataset is not matching with the geocoding version "
174		f"{deposit_id_geocoding}. Make sure to hermonize the data. When "
175		f"the MaStR dataset is updated also update the geocoding and "
176		f"update the egon data bundle. The geocoding can be done using: "
177		f"https://github.com/RLI-sandbox/mastr-geocoding"
178		)
179
180		geocoding_gdf = gpd.read_file(path)
181
182		# remove failed requests
183		geocoding_gdf = geocoding_gdf.loc[geocoding_gdf.geometry.is_valid]
184
185		EgonMastrGeocoded.__table__.drop(bind=engine, checkfirst=True)
186		EgonMastrGeocoded.__table__.create(bind=engine, checkfirst=True)
187
188		geocoding_gdf.to_postgis(
189		name=EgonMastrGeocoded.__tablename__,
190		con=engine,
191		if_exists="append",
192		schema=EgonMastrGeocoded.__table_args__["schema"],
193		index=True,
194		)
195
196		cfg = config.datasets()["power_plants"]
197
198		cols_mapping = {
199		"all": {
200		"EinheitMastrNummer": "gens_id",
201		"EinheitBetriebsstatus": "status",
202		"Inbetriebnahmedatum": "commissioning_date",
203		"Postleitzahl": "postcode",
204		"Ort": "city",
205		"Gemeinde": "municipality",
206		"Bundesland": "federal_state",
207		"Nettonennleistung": "capacity",
208		"Einspeisungsart": "feedin_type",
209		},
210		"pv": {
211		"Lage": "site_type",
212		"Standort": "site",
213		"Nutzungsbereich": "usage_sector",
214		"Hauptausrichtung": "orientation_primary",
215		"HauptausrichtungNeigungswinkel": "orientation_primary_angle",
216		"Nebenausrichtung": "orientation_secondary",
217		"NebenausrichtungNeigungswinkel": "orientation_secondary_angle",
218		"EinheitlicheAusrichtungUndNeigungswinkel": "orientation_uniform",
219		"AnzahlModule": "module_count",
220		"zugeordneteWirkleistungWechselrichter": "capacity_inverter",
221		},
222		"wind": {
223		"Lage": "site_type",
224		"Hersteller": "manufacturer_name",
225		"Typenbezeichnung": "type_name",
226		"Nabenhoehe": "hub_height",
227		"Rotordurchmesser": "rotor_diameter",
228		},
229		"biomass": {
230		"Technologie": "technology",
231		"Hauptbrennstoff": "main_fuel",
232		"Biomasseart": "fuel_type",
233		"ThermischeNutzleistung": "th_capacity",
234		},
235		"hydro": {
236		"ArtDerWasserkraftanlage": "plant_type",
237		"ArtDesZuflusses": "water_origin",
238		},
239		"combustion": {
240		"Energietraeger": "carrier",
241		"Hauptbrennstoff": "main_fuel",
242		"WeitererHauptbrennstoff": "other_main_fuel",
243		"Technologie": "technology",
244		"ThermischeNutzleistung": "th_capacity",
245		},
246		"gsgk": {
247		"Energietraeger": "carrier",
248		"Technologie": "technology",
249		},
250		"nuclear": {
251		"Energietraeger": "carrier",
252		"Technologie": "technology",
253		},
254		"storage": {
255		"Energietraeger": "carrier",
256		"Technologie": "technology",
257		"Batterietechnologie": "battery_type",
258		"Pumpspeichertechnologie": "pump_storage_type",
259		},
260		}
261
262		source_files = {
263		"pv": WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_pv"],
264		"wind": WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_wind"],
265		"biomass": WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_biomass"],
266		"hydro": WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_hydro"],
267		"combustion": WORKING_DIR_MASTR_NEW
268		/ cfg["sources"]["mastr_combustion"],
269		"gsgk": WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_gsgk"],
270		"nuclear": WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_nuclear"],
271		"storage": WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_storage"],
272		}
273
274		target_tables = {
275		"pv": EgonPowerPlantsPv,
276		"wind": EgonPowerPlantsWind,
277		"biomass": EgonPowerPlantsBiomass,
278		"hydro": EgonPowerPlantsHydro,
279		"combustion": EgonPowerPlantsCombustion,
280		"gsgk": EgonPowerPlantsGsgk,
281		"nuclear": EgonPowerPlantsNuclear,
282		"storage": EgonPowerPlantsStorage,
283		}
284
285		vlevel_mapping = {
286		"Höchstspannung": 1,
287		"UmspannungZurHochspannung": 2,
288		"Hochspannung": 3,
289		"UmspannungZurMittelspannung": 4,
290		"Mittelspannung": 5,
291		"UmspannungZurNiederspannung": 6,
292		"Niederspannung": 7,
293		}
294
295		# import locations
296		locations = pd.read_csv(
297		WORKING_DIR_MASTR_NEW / cfg["sources"]["mastr_location"],
298		index_col=None,
299		)
300
301		# import grid districts
302		mv_grid_districts = db.select_geodataframe(
303		f"""
304		SELECT * FROM {cfg['sources']['egon_mv_grid_district']}
305		""",
306		epsg=4326,
307		)
308
309		# import units
310		technologies = [
311		"pv",
312		"wind",
313		"biomass",
314		"hydro",
315		"combustion",
316		"gsgk",
317		"nuclear",
318		"storage",
319		]
320
321		for tech in technologies:
322		# read units
323		logger.info(f"===== Importing MaStR dataset: {tech} =====")
324		logger.debug("Reading CSV and filtering data...")
325		units = pd.read_csv(
326		source_files[tech],
327		usecols=(
328		["LokationMastrNummer", "Laengengrad", "Breitengrad", "Land"]
329		+ list(cols_mapping["all"].keys())
330		+ list(cols_mapping[tech].keys())
331		),
332		index_col=None,
333		dtype={"Postleitzahl": str},
334		low_memory=False,
335		).rename(columns=cols_mapping)
336
337		# drop units outside of Germany
338		len_old = len(units)
339		units = units.loc[units.Land == "Deutschland"]
340		logger.debug(
341		f"{len_old - len(units)} units outside of Germany dropped..."
342		)
343
344		# get boundary
345		boundary = (
346		federal_state_data(geocoding_gdf.crs).dissolve().at[0, "geom"]
347		)
348
349		# drop units installed after reference date from cfg
350		# (eGon2021 scenario)
351		len_old = len(units)
352		ts = pd.Timestamp(config.datasets()["mastr_new"]["egon2021_date_max"])
353		units = units.loc[pd.to_datetime(units.Inbetriebnahmedatum) <= ts]
354		logger.debug(
355		f"{len_old - len(units)} units installed after {ts} dropped..."
356		)
357
358		# drop not operating units
359		len_old = len(units)
360		units = units.loc[
361		units.EinheitBetriebsstatus.isin(
362		["InBetrieb", "VoruebergehendStillgelegt"]
363		)
364		]
365		logger.debug(f"{len_old - len(units)} not operating units dropped...")
366
367		# filter for SH units if in testmode
368		if not TESTMODE_OFF:
369		logger.info(
370		"TESTMODE: Dropping all units outside of Schleswig-Holstein..."
371		)
372		units = units.loc[units.Bundesland == "SchleswigHolstein"]
373
374		# merge and rename voltage level
375		logger.debug("Merging with locations and allocate voltage level...")
376		units = units.merge(
377		locations[["MaStRNummer", "Spannungsebene"]],
378		left_on="LokationMastrNummer",
379		right_on="MaStRNummer",
380		how="left",
381		)
382		# convert voltage levels to numbers
383		units["voltage_level"] = units.Spannungsebene.replace(vlevel_mapping)
384		# set voltage level for nan values
385		units = infer_voltage_level(units)
386
387		# add geometry
388		logger.debug("Adding geometries...")
389		units = gpd.GeoDataFrame(
390		units,
391		geometry=gpd.points_from_xy(
392		units["Laengengrad"], units["Breitengrad"], crs=4326
393		),
394		crs=4326,
395		)
396
397		units["geometry_geocoded"] = (
398		units.Laengengrad.isna() | units.Laengengrad.isna()
399		)
400
401		units.loc[~units.geometry_geocoded, "geometry_geocoded"] = ~units.loc[
402		~units.geometry_geocoded, "geometry"
403		].is_valid
404
405		units_wo_geom = units["geometry_geocoded"].sum()
406
407		logger.debug(
408		f"{units_wo_geom}/{len(units)} units do not have a geometry!"
409		" Adding geocoding results."
410		)
411
412		# determine zip and municipality string
413		mask = (
414		units.Postleitzahl.apply(isfloat)
415		& ~units.Postleitzahl.isna()
416		& ~units.Gemeinde.isna()
417		)
418		units["zip_and_municipality"] = np.nan
419		ok_units = units.loc[mask]
420
421		units.loc[mask, "zip_and_municipality"] = (
422		ok_units.Postleitzahl.astype(int).astype(str).str.zfill(5)
423		+ " "
424		+ ok_units.Gemeinde.astype(str).str.rstrip().str.lstrip()
425		+ ", Deutschland"
426		)
427
428		# get zip and municipality from Standort
429		parse_df = units.loc[~mask]
430
431		if not parse_df.empty and "Standort" in parse_df.columns:
432		init_len = len(parse_df)
433
434		logger.info(
435		f"Parsing ZIP code and municipality from Standort for "
436		f"{init_len} values for {tech}."
437		)
438
439		parse_df[["zip_and_municipality", "drop_this"]] = (
440		parse_df.Standort.astype(str)
441		.apply(zip_and_municipality_from_standort)
442		.tolist()
443		)
444
445		parse_df = parse_df.loc[parse_df.drop_this]
446
447		if not parse_df.empty:
448		units.loc[
449		parse_df.index, "zip_and_municipality"
450		] = parse_df.zip_and_municipality
451
452		# add geocoding to missing
453		units = units.merge(
454		right=geocoding_gdf[["zip_and_municipality", "geometry"]].rename(
455		columns={"geometry": "temp"}
456		),
457		how="left",
458		on="zip_and_municipality",
459		)
460
461		units.loc[units.geometry_geocoded, "geometry"] = units.loc[
462		units.geometry_geocoded, "temp"
463		]
464
465		init_len = len(units)
466
467		logger.info(
468		"Dropping units outside boundary by geometry or without geometry"
469		"..."
470		)
471
472		units.dropna(subset=["geometry"], inplace=True)
473
474		units = units.loc[units.geometry.within(boundary)]
475
476		logger.debug(
477		f"{init_len - len(units)}/{init_len} "
478		f"({((init_len - len(units)) / init_len) * 100: g} %) dropped."
479		)
480
481		# drop unnecessary and rename columns
482		logger.debug("Reformatting...")
483		units.drop(
484		columns=[
485		"LokationMastrNummer",
486		"MaStRNummer",
487		"Laengengrad",
488		"Breitengrad",
489		"Spannungsebene",
490		"Land",
491		"temp",
492		],
493		inplace=True,
494		)
495		mapping = cols_mapping["all"].copy()
496		mapping.update(cols_mapping[tech])
497		mapping.update({"geometry": "geom"})
498		units.rename(columns=mapping, inplace=True)
499		units["voltage_level"] = units.voltage_level.fillna(-1).astype(int)
500
501		units.set_geometry("geom", inplace=True)
502		units["id"] = range(0, len(units))
503
504		# change capacity unit: kW to MW
505		units["capacity"] = units["capacity"] / 1e3
506		if "capacity_inverter" in units.columns:
507		units["capacity_inverter"] = units["capacity_inverter"] / 1e3
508		if "th_capacity" in units.columns:
509		units["th_capacity"] = units["th_capacity"] / 1e3
510
511		# assign bus ids
512		logger.debug("Assigning bus ids...")
513		units = units.assign(
514		bus_id=units.loc[~units.geom.x.isna()]
515		.sjoin(mv_grid_districts[["bus_id", "geom"]], how="left")
516		.drop(columns=["index_right"])
517		.bus_id
518		)
519		units["bus_id"] = units.bus_id.fillna(-1).astype(int)
520
521		# write to DB
522		logger.info(f"Writing {len(units)} units to DB...")
523
524		units.to_postgis(
525		name=target_tables[tech].__tablename__,
526		con=engine,
527		if_exists="append",
528		schema=target_tables[tech].__table_args__["schema"],
529		)
530

openego / eGon-data

Pull Request — dev (#1112)

data.datasets.power_plants.mastr.import_mastr() D

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like