"""
Distribute MaStR PV rooftop capacities to OSM and synthetic buildings. Generate new
PV rooftop generators for the scenarios eGon2035 and eGon100RE.

Data cleaning: Drop duplicates and entries with missing critical data. Determine the
most plausible capacity from the multiple values given in the MaStR data. Drop
generators without any plausible capacity data (0.1 kW < P <= 23.5 MW). Randomly and
weighted, add a start-up date where it is missing. Extract the zip code and
municipality from the 'Standort' given in the MaStR data. Geocode unique zip code and
municipality combinations with Nominatim (one request per second). Drop generators for
which geocoding failed or which are located outside the municipalities of Germany. Add
some visual sanity checks for the cleaned data.

Allocation of MaStR data: Allocate each generator to an existing building from OSM.
Determine the quantile each generator and building is in, depending on the capacity of
the generator and the area of the building polygon. Randomly distribute generators
within each municipality, preferably within the same building area quantile as the
generators are capacity-wise. If not enough buildings exist within a municipality and
quantile, additional buildings from other quantiles are chosen randomly.

Disaggregation of PV rooftop scenarios: The scenario data per federal state is
distributed linearly to the MV grid districts according to the PV rooftop potential
per MV grid district. The rooftop potential is estimated from the building area given
by the OSM buildings. Grid districts which are located in several federal states are
allocated PV capacity according to their respective roof potential in the individual
federal states. The disaggregation of PV plants within a grid district respects
existing plants from MaStR which have not reached their end of life. New PV plants are
generated randomly and weighted, using a breakdown of MaStR data as a generator basis.
Plant metadata (e.g. plant orientation) is also added randomly and weighted, using
MaStR data as a basis.
"""
from __future__ import annotations

from collections import Counter
from functools import wraps
from pathlib import Path
from time import perf_counter
from typing import Any

from geoalchemy2 import Geometry
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from loguru import logger
from numpy.random import RandomState, default_rng
from pyproj.crs.crs import CRS
from sqlalchemy import BigInteger, Column, Float, Integer, String
from sqlalchemy.dialects.postgresql import HSTORE
from sqlalchemy.ext.declarative import declarative_base
import geopandas as gpd
import numpy as np
import pandas as pd

from egon.data import config, db
from egon.data.datasets.electricity_demand_timeseries.hh_buildings import (
    OsmBuildingsSynthetic,
)
from egon.data.datasets.scenario_capacities import EgonScenarioCapacities
from egon.data.datasets.zensus_vg250 import Vg250Gem

engine = db.engine()
Base = declarative_base()
SEED = int(config.settings()["egon-data"]["--random-seed"])

# TODO: move to yml
# mastr data
MASTR_RELEVANT_COLS = [
    "EinheitMastrNummer",
    "Bruttoleistung",
    "StatisikFlag",
    "Bruttoleistung_extended",
    "Nettonennleistung",
    "InstallierteLeistung",
    "zugeordneteWirkleistungWechselrichter",
    "EinheitBetriebsstatus",
    "Standort",
    "Bundesland",
    "Land",
    "Landkreis",
    "Gemeinde",
    "Postleitzahl",
    "Ort",
    "GeplantesInbetriebnahmedatum",
    "Inbetriebnahmedatum",
    "GemeinsamerWechselrichterMitSpeicher",
    "Lage",
    "Leistungsbegrenzung",
    "EinheitlicheAusrichtungUndNeigungswinkel",
    "Hauptausrichtung",
    "HauptausrichtungNeigungswinkel",
    "Nebenausrichtung",
]

MASTR_DTYPES = {
    "EinheitMastrNummer": str,
    "Bruttoleistung": float,
    "StatisikFlag": str,
    "Bruttoleistung_extended": float,
    "Nettonennleistung": float,
    "InstallierteLeistung": float,
    "zugeordneteWirkleistungWechselrichter": float,
    "EinheitBetriebsstatus": str,
    "Standort": str,
    "Bundesland": str,
    "Land": str,
    "Landkreis": str,
    "Gemeinde": str,
    # "Postleitzahl": int,  # fails because of nan values
    "Ort": str,
    "GemeinsamerWechselrichterMitSpeicher": str,
    "Lage": str,
    "Leistungsbegrenzung": str,
    # this would parse nan values as False, which is not always correct
    # "EinheitlicheAusrichtungUndNeigungswinkel": bool,
    "Hauptausrichtung": str,
    "HauptausrichtungNeigungswinkel": str,
    "Nebenausrichtung": str,
    "NebenausrichtungNeigungswinkel": str,
}

MASTR_PARSE_DATES = [
    "GeplantesInbetriebnahmedatum",
    "Inbetriebnahmedatum",
]

MASTR_INDEX_COL = "EinheitMastrNummer"

EPSG = 4326
SRID = 3035

# data cleaning
MAX_REALISTIC_PV_CAP = 23500
MIN_REALISTIC_PV_CAP = 0.1
ROUNDING = 1

# geopy
MIN_DELAY_SECONDS = 1
USER_AGENT = "rli_kh_geocoder"

# show additional logging information
VERBOSE = False

EXPORT_DIR = Path(__name__).resolve().parent / "data"
EXPORT_FILE = "mastr_geocoded.gpkg"
EXPORT_PATH = EXPORT_DIR / EXPORT_FILE
DRIVER = "GPKG"

# Number of quantiles
Q = 5

# Scenario Data
CARRIER = "solar_rooftop"
SCENARIOS = ["eGon2035", "eGon100RE"]
SCENARIO_TIMESTAMP = {
    "eGon2035": pd.Timestamp("2035-01-01", tz="UTC"),
    "eGon100RE": pd.Timestamp("2050-01-01", tz="UTC"),
}
PV_ROOFTOP_LIFETIME = pd.Timedelta(20 * 365, unit="D")

# Example module: Trina Vertex S TSM-400DE09M.08, 400 Wp
# https://www.photovoltaik4all.de/media/pdf/92/64/68/Trina_Datasheet_VertexS_DE09-08_2021_A.pdf
MODUL_CAP = 0.4  # kWp
MODUL_SIZE = 1.096 * 1.754  # m²
PV_CAP_PER_SQ_M = MODUL_CAP / MODUL_SIZE

# Estimation of usable roof area
# Factor for the conversion of building area to roof area
# estimation mean roof pitch: 35°
# estimation usable roof share: 80%
# estimation that only the south side of the building is used for pv
# see https://mediatum.ub.tum.de/doc/%20969497/969497.pdf
# AREA_FACTOR = 1.221
# USABLE_ROOF_SHARE = 0.8
# SOUTH_SHARE = 0.5
# ROOF_FACTOR = AREA_FACTOR * USABLE_ROOF_SHARE * SOUTH_SHARE
ROOF_FACTOR = 0.5

CAP_RANGES = [
    (0, 30),
    (30, 100),
    (100, float("inf")),
]

MIN_BUILDING_SIZE = 10.0
UPPER_QUNATILE = 0.95
LOWER_QUANTILE = 0.05

COLS_TO_RENAME = {
    "EinheitlicheAusrichtungUndNeigungswinkel": (
        "einheitliche_ausrichtung_und_neigungswinkel"
    ),
    "Hauptausrichtung": "hauptausrichtung",
    "HauptausrichtungNeigungswinkel": "hauptausrichtung_neigungswinkel",
}

COLS_TO_EXPORT = [
    "scenario",
    "bus_id",
    "building_id",
    "gens_id",
    "capacity",
    "einheitliche_ausrichtung_und_neigungswinkel",
    "hauptausrichtung",
    "hauptausrichtung_neigungswinkel",
    "voltage_level",
    "weather_cell_id",
]

# TODO
INCLUDE_SYNTHETIC_BUILDINGS = True
ONLY_BUILDINGS_WITH_DEMAND = True
TEST_RUN = False
DIRTY_FIX = False


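# Back-of-the-envelope check of the constants above (rounded values):
# PV_CAP_PER_SQ_M = 0.4 / (1.096 * 1.754) ≈ 0.208 kWp/m², so a 100 m²
# building with ROOF_FACTOR 0.5 yields roughly 100 * 0.5 * 0.208
# ≈ 10.4 kWp of installable capacity.
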
def timer_func(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        start_time = perf_counter()
        result = func(*args, **kwargs)
        end_time = perf_counter()
        total_time = end_time - start_time
        logger.debug(
            f"Function {func.__name__} took {total_time:.4f} seconds."
        )
        return result

    return timeit_wrapper


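# Illustrative usage of `timer_func` (sketch, not part of the pipeline):
# the decorator logs the runtime of any wrapped callable at debug level.
#
#     >>> @timer_func
#     ... def slow_sum(n: int) -> int:
#     ...     return sum(range(n))
#     >>> total = slow_sum(10**6)  # logs "Function slow_sum took ... seconds."
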
@timer_func
def mastr_data(
    index_col: str | int | list[str] | list[int],
    usecols: list[str],
    dtype: dict[str, Any] | None,
    parse_dates: list[str] | None,
) -> pd.DataFrame:
    """
    Read MaStR data from CSV.

    Parameters
    -----------
    index_col : str, int or list of str or int
        Column(s) to use as the row labels of the DataFrame.
    usecols : list of str
        Return a subset of the columns.
    dtype : dict of column (str) -> type (any), optional
        Data type for data or columns.
    parse_dates : list of names (str), optional
        Try to parse given columns to datetime.
    Returns
    -------
    pandas.DataFrame
        DataFrame containing MaStR data.
    """
    mastr_path = Path(
        config.datasets()["power_plants"]["sources"]["mastr_pv"]
    ).resolve()

    mastr_df = pd.read_csv(
        mastr_path,
        index_col=index_col,
        usecols=usecols,
        dtype=dtype,
        parse_dates=parse_dates,
    )

    mastr_df = mastr_df.loc[
        (mastr_df.StatisikFlag == "B")
        & (mastr_df.EinheitBetriebsstatus == "InBetrieb")
        & (mastr_df.Land == "Deutschland")
        & (mastr_df.Lage == "BaulicheAnlagen")
    ]

    if (
        config.settings()["egon-data"]["--dataset-boundary"]
        == "Schleswig-Holstein"
    ):
        init_len = len(mastr_df)

        mastr_df = mastr_df.loc[mastr_df.Bundesland == "SchleswigHolstein"]

        logger.info(
            f"Using only MaStR data within Schleswig-Holstein. "
            f"{init_len - len(mastr_df)} of {init_len} generators are dropped."
        )

    logger.debug("MaStR data loaded.")

    return mastr_df


@timer_func
def clean_mastr_data(
    mastr_df: pd.DataFrame,
    max_realistic_pv_cap: int | float,
    min_realistic_pv_cap: int | float,
    rounding: int,
    seed: int,
) -> pd.DataFrame:
    """
    Clean the MaStR data from implausible data.

    * Drop MaStR ID duplicates.
    * Drop generators with implausible capacities.
    * Drop generators without any kind of start-up date.
    * Clean up Standort column and capacity.

    Parameters
    -----------
    mastr_df : pandas.DataFrame
        DataFrame containing MaStR data.
    max_realistic_pv_cap : int or float
        Maximum capacity which is considered to be realistic.
    min_realistic_pv_cap : int or float
        Minimum capacity which is considered to be realistic.
    rounding : int
        Rounding to use when cleaning up capacity. E.g. when
        rounding is 1 a capacity of 9.93 will be rounded to 9.9.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    Returns
    -------
    pandas.DataFrame
        DataFrame containing cleaned MaStR data.
    """
    init_len = len(mastr_df)

    # drop duplicates
    mastr_df = mastr_df.loc[~mastr_df.index.duplicated()]

    # drop invalid entries in standort
    index_to_drop = mastr_df.loc[
        (mastr_df.Standort.isna()) | (mastr_df.Standort.isnull())
    ].index

    mastr_df = mastr_df.loc[~mastr_df.index.isin(index_to_drop)]

    df = mastr_df[
        [
            "Bruttoleistung",
            "Bruttoleistung_extended",
            "Nettonennleistung",
            "zugeordneteWirkleistungWechselrichter",
            "InstallierteLeistung",
        ]
    ].round(rounding)

    # determine the most plausible capacity if multiple ratings are given
    mastr_df = mastr_df.assign(
        capacity=[
            most_plausible(p_tub, min_realistic_pv_cap)
            for p_tub in df.itertuples(index=False)
        ]
    )

    # drop generators without any capacity info
    # and capacity of zero
    # and if the capacity is > 23.5 MW, because
    # Germany's largest rooftop PV is 23 MW
    # https://www.iwr.de/news/groesste-pv-dachanlage-europas-wird-in-sachsen-anhalt-gebaut-news37379
    mastr_df = mastr_df.loc[
        (~mastr_df.capacity.isna())
        & (mastr_df.capacity <= max_realistic_pv_cap)
        & (mastr_df.capacity > min_realistic_pv_cap)
    ]

    # get zip and municipality
    mastr_df[["zip_and_municipality", "drop_this"]] = pd.DataFrame(
        mastr_df.Standort.astype(str)
        .apply(
            zip_and_municipality_from_standort,
            args=(VERBOSE,),
        )
        .tolist(),
        index=mastr_df.index,
    )

    # drop invalid entries
    mastr_df = mastr_df.loc[mastr_df.drop_this].drop(columns="drop_this")

    # add ", Deutschland" just in case
    mastr_df = mastr_df.assign(
        zip_and_municipality=(mastr_df.zip_and_municipality + ", Deutschland")
    )

    # get consistent start-up date
    mastr_df = mastr_df.assign(
        start_up_date=mastr_df.Inbetriebnahmedatum,
    )

    mastr_df.loc[mastr_df.start_up_date.isna()] = mastr_df.loc[
        mastr_df.start_up_date.isna()
    ].assign(
        start_up_date=mastr_df.GeplantesInbetriebnahmedatum.loc[
            mastr_df.start_up_date.isna()
        ]
    )

    # randomly fill missing start-up dates (implicitly weighted, since
    # the pool contains duplicates)
    pool = mastr_df.loc[
        ~mastr_df.start_up_date.isna()
    ].start_up_date.to_numpy()

    size = len(mastr_df) - len(pool)

    if size > 0:
        np.random.seed(seed)

        choice = np.random.choice(
            pool,
            size=size,
            replace=False,
        )

        mastr_df.loc[mastr_df.start_up_date.isna()] = mastr_df.loc[
            mastr_df.start_up_date.isna()
        ].assign(start_up_date=choice)

        logger.info(
            f"Randomly and weighted added start-up date to {size} generators."
        )

    mastr_df = mastr_df.assign(
        start_up_date=pd.to_datetime(mastr_df.start_up_date, utc=True)
    )

    end_len = len(mastr_df)
    logger.debug(
        f"Dropped {init_len - end_len} "
        f"({((init_len - end_len) / init_len) * 100:g}%)"
        f" of {init_len} rows from MaStR DataFrame."
    )

    return mastr_df


def zip_and_municipality_from_standort(
    standort: str,
    verbose: bool = False,
) -> tuple[str, bool]:
    """
    Get zip code and municipality from Standort string split into a list.
    Parameters
    -----------
    standort : str
        Standort as given from MaStR data.
    verbose : bool
        Logs additional info if True.
    Returns
    -------
    tuple of str and bool
        Standort with only the zip code and municipality, and a flag
        indicating whether a zip code was found.
    """
    if verbose:
        logger.debug(f"Uncleaned String: {standort}")

    standort_list = standort.split()

    found = False
    count = 0

    for count, elem in enumerate(standort_list):
        if len(elem) != 5:
            continue
        if not elem.isnumeric():
            continue

        found = True

        break

    if found:
        cleaned_str = " ".join(standort_list[count:])

        if verbose:
            logger.debug(f"Cleaned String:   {cleaned_str}")

        return cleaned_str, found

    logger.warning(
        "Couldn't identify zip code. This entry will be dropped."
        f" Original standort: {standort}."
    )

    return standort, found


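# Illustrative sketch with a made-up Standort (not from MaStR): the
# parser keeps everything from the first five-digit numeric token on.
#
#     >>> zip_and_municipality_from_standort(
#     ...     "Teststraße 5 12345 Beispielstadt"
#     ... )
#     ('12345 Beispielstadt', True)
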
def most_plausible(
    p_tub: tuple,
    min_realistic_pv_cap: int | float,
) -> float:
    """
    Try to determine the most plausible capacity.
    Compares the (up to three) most common capacity ratings of a MaStR
    generator and returns the smallest one, falling back to the largest
    if the smallest is below the realistic minimum.
    Parameters
    -----------
    p_tub : tuple
        Tuple containing the different capacities given in
        the MaStR data.
    min_realistic_pv_cap : int or float
        Minimum capacity which is considered to be realistic.
    Returns
    -------
    float
        Capacity of the generator estimated as the most realistic.
    """
    count = Counter(p_tub).most_common(3)

    if len(count) == 1:
        return count[0][0]

    val1 = count[0][0]
    val2 = count[1][0]

    if len(count) == 2:
        min_val = min(val1, val2)
        max_val = max(val1, val2)
    else:
        val3 = count[2][0]

        min_val = min(val1, val2, val3)
        max_val = max(val1, val2, val3)

    if min_val < min_realistic_pv_cap:
        return max_val

    return min_val


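# Illustrative sketch with made-up ratings: the smallest value (0.0) is
# below the realistic minimum of 0.1 kW, so the largest value wins.
#
#     >>> most_plausible((9.9, 9.9, 0.0, 9.9, 9.9), 0.1)
#     9.9
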
def geocoder(
    user_agent: str,
    min_delay_seconds: int,
) -> RateLimiter:
    """
    Set up the Nominatim geocoding class.
    Parameters
    -----------
    user_agent : str
        The app name.
    min_delay_seconds : int
        Delay in seconds to use between requests to Nominatim.
        A minimum of 1 is advised.
    Returns
    -------
    geopy.extra.rate_limiter.RateLimiter
        Nominatim RateLimiter geocoding class to use for geocoding.
    """
    locator = Nominatim(user_agent=user_agent)
    return RateLimiter(
        locator.geocode,
        min_delay_seconds=min_delay_seconds,
    )


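# Illustrative usage (sketch; issues real requests against the public
# Nominatim API when run, one request per second):
#
#     >>> geocode = geocoder(USER_AGENT, MIN_DELAY_SECONDS)
#     >>> location = geocode("10115 Berlin, Deutschland")  # doctest: +SKIP
#     >>> (location.latitude, location.longitude)  # doctest: +SKIP
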
def geocoding_data(
    clean_mastr_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Set up the DataFrame to geocode.
    Parameters
    -----------
    clean_mastr_df : pandas.DataFrame
        DataFrame containing cleaned MaStR data.
    Returns
    -------
    pandas.DataFrame
        DataFrame containing all unique combinations of
        zip codes with municipalities for geocoding.
    """
    return pd.DataFrame(
        data=clean_mastr_df.zip_and_municipality.unique(),
        columns=["zip_and_municipality"],
    )


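# Illustrative sketch (toy data): duplicate entries collapse to unique
# zip-and-municipality combinations before geocoding.
#
#     >>> geocoding_data(
#     ...     pd.DataFrame({"zip_and_municipality": ["a", "a", "b"]})
#     ... ).zip_and_municipality.tolist()
#     ['a', 'b']
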
@timer_func
def geocode_data(
    geocoding_df: pd.DataFrame,
    ratelimiter: RateLimiter,
    epsg: int,
) -> gpd.GeoDataFrame:
    """
    Geocode zip code and municipality.
    Extract latitude, longitude and altitude.
    Transform latitude and longitude to a shapely
    Point and return a geopandas GeoDataFrame.
    Parameters
    -----------
    geocoding_df : pandas.DataFrame
        DataFrame containing all unique combinations of
        zip codes with municipalities for geocoding.
    ratelimiter : geopy.extra.rate_limiter.RateLimiter
        Nominatim RateLimiter geocoding class to use for geocoding.
    epsg : int
        EPSG ID to use as CRS.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing all unique combinations of
        zip codes with municipalities with matching geolocation.
    """
    logger.info(f"Geocoding {len(geocoding_df)} locations.")

    geocode_df = geocoding_df.assign(
        location=geocoding_df.zip_and_municipality.apply(ratelimiter)
    )

    geocode_df = geocode_df.assign(
        point=geocode_df.location.apply(
            lambda loc: tuple(loc.point) if loc else None
        )
    )

    geocode_df[["latitude", "longitude", "altitude"]] = pd.DataFrame(
        geocode_df.point.tolist(), index=geocode_df.index
    )

    return gpd.GeoDataFrame(
        geocode_df,
        geometry=gpd.points_from_xy(geocode_df.longitude, geocode_df.latitude),
        crs=f"EPSG:{epsg}",
    )


def merge_geocode_with_mastr(
    clean_mastr_df: pd.DataFrame, geocode_gdf: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Merge the geometry to the original MaStR data.
    Parameters
    -----------
    clean_mastr_df : pandas.DataFrame
        DataFrame containing cleaned MaStR data.
    geocode_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing all unique combinations of
        zip codes with municipalities with matching geolocation.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing cleaned MaStR data with
        matching geolocation from geocoding.
    """
    return gpd.GeoDataFrame(
        clean_mastr_df.merge(
            geocode_gdf[["zip_and_municipality", "geometry"]],
            how="left",
            on="zip_and_municipality",
        ),
        crs=geocode_gdf.crs,
    ).set_index(clean_mastr_df.index)


def drop_invalid_entries_from_gdf(
    gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop invalid entries from a geopandas GeoDataFrame.
    TODO: how to omit the logging from GEOS here?
    Parameters
    -----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame to be checked for validity.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with rows with invalid geometries
        dropped.
    """
    valid_gdf = gdf.loc[gdf.is_valid]

    logger.debug(
        f"{len(gdf) - len(valid_gdf)} "
        f"({(len(gdf) - len(valid_gdf)) / len(gdf) * 100:g}%) "
        f"of {len(gdf)} values were invalid and are dropped."
    )

    return valid_gdf


@timer_func
def municipality_data() -> gpd.GeoDataFrame:
    """
    Get municipality data from eGo^n Database.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with municipality data.
    """
    with db.session_scope() as session:
        query = session.query(Vg250Gem.ags, Vg250Gem.geometry.label("geom"))

    return gpd.read_postgis(
        query.statement, query.session.bind, index_col="ags"
    )


@timer_func
def add_ags_to_gens(
    valid_mastr_gdf: gpd.GeoDataFrame,
    municipalities_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Add information about AGS ID to generators.
    Parameters
    -----------
    valid_mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame with valid and cleaned MaStR data.
    municipalities_gdf : geopandas.GeoDataFrame
        GeoDataFrame with municipality data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with valid and cleaned MaStR data
        with AGS ID added.
    """
    return valid_mastr_gdf.sjoin(
        municipalities_gdf,
        how="left",
        predicate="intersects",
    ).rename(columns={"index_right": "ags"})


def drop_gens_outside_muns(
    valid_mastr_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop all generators outside of municipalities.
    Parameters
    -----------
    valid_mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame with valid and cleaned MaStR data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with valid and cleaned MaStR data
        with generators without an AGS ID dropped.
    """
    gdf = valid_mastr_gdf.loc[~valid_mastr_gdf.ags.isna()]

    logger.debug(
        f"{len(valid_mastr_gdf) - len(gdf)} "
        f"({(len(valid_mastr_gdf) - len(gdf)) / len(valid_mastr_gdf) * 100:g}%) "
        f"of {len(valid_mastr_gdf)} values are outside of the municipalities"
        " and are therefore dropped."
    )

    return gdf


class EgonMastrPvRoofGeocoded(Base):
    __tablename__ = "egon_mastr_pv_roof_geocoded"
    __table_args__ = {"schema": "supply"}

    zip_and_municipality = Column(String, primary_key=True, index=True)
    location = Column(String)
    point = Column(String)
    latitude = Column(Float)
    longitude = Column(Float)
    altitude = Column(Float)
    geometry = Column(Geometry(srid=EPSG))


def create_geocoded_table(geocode_gdf):
    """
    Create the geocoded MaStR PV rooftop table.
    Parameters
    -----------
    geocode_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoding information on PV rooftop locations.
    """
    EgonMastrPvRoofGeocoded.__table__.drop(bind=engine, checkfirst=True)
    EgonMastrPvRoofGeocoded.__table__.create(bind=engine, checkfirst=True)

    geocode_gdf.to_postgis(
        name=EgonMastrPvRoofGeocoded.__table__.name,
        schema=EgonMastrPvRoofGeocoded.__table__.schema,
        con=db.engine(),
        if_exists="append",
        index=False,
        # dtype={}
    )


def geocoded_data_from_db(
    epsg: str | int,
) -> gpd.GeoDataFrame:
    """
    Read geocoded PV rooftop data from eGo^n Database.
    Parameters
    -----------
    epsg : str or int
        EPSG ID to use as CRS.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing geocoded PV rooftop locations.
    """
    with db.session_scope() as session:
        query = session.query(
            EgonMastrPvRoofGeocoded.zip_and_municipality,
            EgonMastrPvRoofGeocoded.geometry,
        )

    return gpd.read_postgis(
        query.statement, query.session.bind, geom_col="geometry"
    ).to_crs(f"EPSG:{epsg}")


def load_mastr_data():
    """Read PV rooftop data from the MaStR CSV.
    Note: the source will be replaced as soon as the MaStR data is available
    in the DB.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data with geocoded locations.
    """
    mastr_df = mastr_data(
        MASTR_INDEX_COL,
        MASTR_RELEVANT_COLS,
        MASTR_DTYPES,
        MASTR_PARSE_DATES,
    )

    clean_mastr_df = clean_mastr_data(
        mastr_df,
        max_realistic_pv_cap=MAX_REALISTIC_PV_CAP,
        min_realistic_pv_cap=MIN_REALISTIC_PV_CAP,
        seed=SEED,
        rounding=ROUNDING,
    )

    geocode_gdf = geocoded_data_from_db(EPSG)

    mastr_gdf = merge_geocode_with_mastr(clean_mastr_df, geocode_gdf)

    valid_mastr_gdf = drop_invalid_entries_from_gdf(mastr_gdf)

    municipalities_gdf = municipality_data()

    valid_mastr_gdf = add_ags_to_gens(valid_mastr_gdf, municipalities_gdf)

    return drop_gens_outside_muns(valid_mastr_gdf)


class OsmBuildingsFiltered(Base):
    __tablename__ = "osm_buildings_filtered"
    __table_args__ = {"schema": "openstreetmap"}

    osm_id = Column(BigInteger)
    amenity = Column(String)
    building = Column(String)
    name = Column(String)
    geom = Column(Geometry(srid=SRID), index=True)
    area = Column(Float)
    geom_point = Column(Geometry(srid=SRID), index=True)
    tags = Column(HSTORE)
    id = Column(BigInteger, primary_key=True, index=True)


@timer_func
def osm_buildings(
    to_crs: CRS,
) -> gpd.GeoDataFrame:
    """
    Read OSM buildings data from eGo^n Database.
    Parameters
    -----------
    to_crs : pyproj.crs.crs.CRS
        CRS to transform geometries to.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    """
    with db.session_scope() as session:
        query = session.query(
            OsmBuildingsFiltered.id,
            OsmBuildingsFiltered.area,
            OsmBuildingsFiltered.geom_point.label("geom"),
        )

    return gpd.read_postgis(
        query.statement, query.session.bind, index_col="id"
    ).to_crs(to_crs)


@timer_func
def synthetic_buildings(
    to_crs: CRS,
) -> gpd.GeoDataFrame:
    """
    Read synthetic buildings data from eGo^n Database.
    Parameters
    -----------
    to_crs : pyproj.crs.crs.CRS
        CRS to transform geometries to.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing synthetic buildings data.
    """
    with db.session_scope() as session:
        query = session.query(
            OsmBuildingsSynthetic.id,
            OsmBuildingsSynthetic.area,
            OsmBuildingsSynthetic.geom_point.label("geom"),
        )

    return gpd.read_postgis(
        query.statement, query.session.bind, index_col="id"
    ).to_crs(to_crs)


@timer_func
def add_ags_to_buildings(
    buildings_gdf: gpd.GeoDataFrame,
    municipalities_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Add information about AGS ID to buildings.
    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    municipalities_gdf : geopandas.GeoDataFrame
        GeoDataFrame with municipality data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data
        with AGS ID added.
    """
    return buildings_gdf.sjoin(
        municipalities_gdf,
        how="left",
        predicate="intersects",
    ).rename(columns={"index_right": "ags"})


def drop_buildings_outside_muns(
    buildings_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop all buildings outside of municipalities.
    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data
        with buildings without an AGS ID dropped.
    """
    gdf = buildings_gdf.loc[~buildings_gdf.ags.isna()]

    logger.debug(
        f"{len(buildings_gdf) - len(gdf)} "
        f"({(len(buildings_gdf) - len(gdf)) / len(buildings_gdf) * 100:g}%) "
        f"of {len(buildings_gdf)} values are outside of the municipalities "
        "and are therefore dropped."
    )

    return gdf


def egon_building_peak_loads():
    """Get IDs of buildings with an eGon2035 electricity peak load."""
    sql = """
    SELECT building_id
    FROM demand.egon_building_electricity_peak_loads
    WHERE scenario = 'eGon2035'
    """

    return (
        db.select_dataframe(sql).building_id.astype(int).sort_values().unique()
    )


@timer_func
def load_building_data():
    """
    Read buildings from the DB.
    Tables:

    * `openstreetmap.osm_buildings_filtered` (from OSM)
    * `openstreetmap.osm_buildings_synthetic` (synthetic, created by us)

    Use column `id` for both, as it is unique, and hence both datasets can be
    concatenated. If INCLUDE_SYNTHETIC_BUILDINGS is False, synthetic buildings
    will not be loaded.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with buildings without an AGS ID
        dropped.
    """

    municipalities_gdf = municipality_data()

    osm_buildings_gdf = osm_buildings(municipalities_gdf.crs)

    if INCLUDE_SYNTHETIC_BUILDINGS:
        synthetic_buildings_gdf = synthetic_buildings(municipalities_gdf.crs)

        buildings_gdf = gpd.GeoDataFrame(
            pd.concat(
                [
                    osm_buildings_gdf,
                    synthetic_buildings_gdf,
                ]
            ),
            geometry="geom",
            crs=osm_buildings_gdf.crs,
        ).rename(columns={"area": "building_area"})

        buildings_gdf.index = buildings_gdf.index.astype(int)

    else:
        buildings_gdf = osm_buildings_gdf.rename(
            columns={"area": "building_area"}
        )

    if ONLY_BUILDINGS_WITH_DEMAND:
        building_ids = egon_building_peak_loads()

        init_len = len(building_ids)

        building_ids = np.intersect1d(
            list(map(int, building_ids)),
            list(map(int, buildings_gdf.index.to_numpy())),
        )

        end_len = len(building_ids)

        logger.debug(
            f"{end_len / init_len * 100:g} % ({end_len} / {init_len}) of buildings "
            f"have a peak load."
        )

        buildings_gdf = buildings_gdf.loc[building_ids]

    buildings_ags_gdf = add_ags_to_buildings(buildings_gdf, municipalities_gdf)

    buildings_ags_gdf = drop_buildings_outside_muns(buildings_ags_gdf)

    grid_districts_gdf = grid_districts(EPSG)

    federal_state_gdf = federal_state_data(grid_districts_gdf.crs)

    grid_federal_state_gdf = overlay_grid_districts_with_counties(
        grid_districts_gdf,
        federal_state_gdf,
    )

    buildings_overlay_gdf = add_overlay_id_to_buildings(
        buildings_ags_gdf,
        grid_federal_state_gdf,
    )

    logger.debug("Loaded buildings.")

    buildings_overlay_gdf = drop_buildings_outside_grids(buildings_overlay_gdf)

    # overwrite bus_id with data from the new mapping table
    sql = "SELECT building_id, bus_id FROM boundaries.egon_map_zensus_mvgd_buildings"
    map_building_bus_df = db.select_dataframe(sql)

    building_ids = np.intersect1d(
        list(map(int, map_building_bus_df.building_id.unique())),
        list(map(int, buildings_overlay_gdf.index.to_numpy())),
    )

    buildings_within_gdf = buildings_overlay_gdf.loc[building_ids]

    gdf = (
        buildings_within_gdf.reset_index()
        .drop(columns=["bus_id"])
        .merge(
            how="left",
            right=map_building_bus_df,
            left_on="id",
            right_on="building_id",
        )
        .drop(columns=["building_id"])
        .set_index("id")
        .sort_index()
    )

    return gdf[~gdf.index.duplicated(keep="first")]


@timer_func
def sort_and_qcut_df(
    df: pd.DataFrame | gpd.GeoDataFrame,
    col: str,
    q: int,
) -> pd.DataFrame | gpd.GeoDataFrame:
    """
    Determine the quantile of a given attribute in a (Geo)DataFrame.
    Sort the (Geo)DataFrame in ascending order for the given attribute.
    Parameters
    -----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        (Geo)DataFrame to sort and qcut.
    col : str
        Name of the attribute to sort and qcut the (Geo)DataFrame on.
    q : int
        Number of quantiles.
    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        Sorted and qcut (Geo)DataFrame.
    """
    df = df.sort_values(col, ascending=True)

    return df.assign(
        quant=pd.qcut(
            df[col],
            q=q,
            labels=range(q),
        )
    )


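# Illustrative sketch (toy data): with q=2 the two smallest values land
# in quantile bin 0 and the two largest in bin 1.
#
#     >>> toy_df = pd.DataFrame({"capacity": [1.0, 2.0, 3.0, 4.0]})
#     >>> sort_and_qcut_df(toy_df, col="capacity", q=2).quant.tolist()
#     [0, 0, 1, 1]
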
@timer_func
def allocate_pv(
    q_mastr_gdf: gpd.GeoDataFrame,
    q_buildings_gdf: gpd.GeoDataFrame,
    seed: int,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """
    Allocate the MaStR PV generators to the OSM buildings.
    This determines a building for each PV generator if there are more
    buildings than generators within a given AGS. Generators are primarily
    distributed to buildings within the same quantile. Multiple assignment
    is excluded.
    Parameters
    -----------
    q_mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded and qcut MaStR data.
    q_buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing qcut OSM buildings data.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    Returns
    -------
    tuple with two geopandas.GeoDataFrame s
        GeoDataFrame containing MaStR data allocated to building IDs.
        GeoDataFrame containing building data allocated to MaStR IDs.
    """
    rng = default_rng(seed=seed)

    q_buildings_gdf = q_buildings_gdf.assign(gens_id=np.nan).sort_values(
        by=["ags", "quant"]
    )
    q_mastr_gdf = q_mastr_gdf.assign(building_id=np.nan).sort_values(
        by=["ags", "quant"]
    )

    ags_list = q_buildings_gdf.ags.unique()

    if TEST_RUN:
        ags_list = ags_list[:250]

    num_ags = len(ags_list)

    t0 = perf_counter()

    for count, ags in enumerate(ags_list):

        buildings = q_buildings_gdf.loc[q_buildings_gdf.ags == ags]
        gens = q_mastr_gdf.loc[q_mastr_gdf.ags == ags]

        len_build = len(buildings)
        len_gens = len(gens)

        if len_build < len_gens:
            gens = gens.sample(len_build, random_state=RandomState(seed=seed))
            logger.error(
                f"There are {len_gens} generators and only {len_build}"
                f" buildings in AGS {ags}. {len_gens - len(gens)} "
                "generators were truncated to match the amount of buildings."
            )

            assert len_build == len(gens)

        for quant in gens.quant.unique():
            q_buildings = buildings.loc[buildings.quant == quant]
            q_gens = gens.loc[gens.quant == quant]

            len_build = len(q_buildings)
            len_gens = len(q_gens)

            if len_build < len_gens:
                delta = len_gens - len_build

                logger.warning(
                    f"There are {len_gens} generators and only {len_build} "
                    f"buildings in AGS {ags} and quantile {quant}. {delta} "
                    f"buildings from AGS {ags} will be added randomly."
                )

                add_buildings = pd.Index(
                    rng.choice(
                        list(set(buildings.index) - set(q_buildings.index)),
                        size=delta,
                        replace=False,
                    )
                )

                chosen_buildings = q_buildings.index.append(add_buildings)

            else:
                chosen_buildings = rng.choice(
                    q_buildings.index,
                    size=len_gens,
                    replace=False,
                )

            q_buildings_gdf.loc[chosen_buildings, "gens_id"] = q_gens.index
            buildings = buildings.drop(chosen_buildings)

        if count % 500 == 0:
            logger.debug(
                f"Allocation of {count / num_ags * 100:g} % of AGS done. It took "
                f"{perf_counter() - t0:g} seconds."
            )

            t0 = perf_counter()

    assigned_buildings = q_buildings_gdf.loc[~q_buildings_gdf.gens_id.isna()]

    assert len(assigned_buildings) == len(assigned_buildings.gens_id.unique())

    q_mastr_gdf.loc[
        assigned_buildings.gens_id, "building_id"
    ] = assigned_buildings.index

    assigned_gens = q_mastr_gdf.loc[~q_mastr_gdf.building_id.isna()]

    assert len(assigned_buildings) == len(assigned_gens)

    logger.debug("Allocated status quo generators to buildings.")

    return frame_to_numeric(q_mastr_gdf), frame_to_numeric(q_buildings_gdf)


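# Illustrative sketch of the sampling primitive used above: building IDs
# are drawn without replacement from a seeded generator, so the
# allocation is reproducible for a fixed seed (output depends on the
# seed, hence skipped here).
#
#     >>> rng = default_rng(seed=SEED)
#     >>> rng.choice([101, 102, 103], size=2, replace=False)  # doctest: +SKIP
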
def frame_to_numeric(
    df: pd.DataFrame | gpd.GeoDataFrame,
) -> pd.DataFrame | gpd.GeoDataFrame:
    """
    Try to convert all columns of a DataFrame to numeric, ignoring errors.
    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
    """
    if str(df.index.dtype) == "object":
        df.index = pd.to_numeric(df.index, errors="ignore")

    for col in df.columns:
        if str(df[col].dtype) == "object":
            df[col] = pd.to_numeric(df[col], errors="ignore")

    return df


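# Illustrative sketch (toy data): object columns holding numeric strings
# are converted, everything else is left untouched.
#
#     >>> frame_to_numeric(pd.DataFrame({"a": ["1", "2"]})).a.tolist()
#     [1, 2]
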
def validate_output(
    desagg_mastr_gdf: pd.DataFrame | gpd.GeoDataFrame,
    desagg_buildings_gdf: pd.DataFrame | gpd.GeoDataFrame,
) -> None:
    """
    Validate output.

    * Validate that there are exactly as many buildings with a PV system as there are
      PV systems with a building
    * Validate that the building IDs with a PV system are the same building IDs as
      assigned to the PV systems
    * Validate that the PV system IDs with a building are the same PV system IDs as
      assigned to the buildings

    Parameters
    -----------
    desagg_mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data allocated to building IDs.
    desagg_buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing building data allocated to MaStR IDs.
    """
    assert len(
        desagg_mastr_gdf.loc[~desagg_mastr_gdf.building_id.isna()]
    ) == len(desagg_buildings_gdf.loc[~desagg_buildings_gdf.gens_id.isna()])
    assert (
        np.sort(
            desagg_mastr_gdf.loc[
                ~desagg_mastr_gdf.building_id.isna()
            ].building_id.unique()
        )
        == np.sort(
            desagg_buildings_gdf.loc[
                ~desagg_buildings_gdf.gens_id.isna()
            ].index.unique()
        )
    ).all()
    assert (
        np.sort(
            desagg_mastr_gdf.loc[
                ~desagg_mastr_gdf.building_id.isna()
            ].index.unique()
        )
        == np.sort(
            desagg_buildings_gdf.loc[
                ~desagg_buildings_gdf.gens_id.isna()
            ].gens_id.unique()
        )
    ).all()

    logger.debug("Validated output.")


def drop_unallocated_gens(
    gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop generators which did not get allocated.

    Parameters
    -----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data allocated to building IDs.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data with generators dropped which did not get
        allocated.
    """
    init_len = len(gdf)
    gdf = gdf.loc[~gdf.building_id.isna()]
    end_len = len(gdf)

    logger.debug(
        f"Dropped {init_len - end_len} "
        f"({((init_len - end_len) / init_len) * 100:g}%)"
        f" of {init_len} unallocated rows from MaStR DataFrame."
    )

    return gdf


@timer_func
def allocate_to_buildings(
    mastr_gdf: gpd.GeoDataFrame,
    buildings_gdf: gpd.GeoDataFrame,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """
    Allocate status quo PV rooftop generators to buildings.
    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data with geocoded locations.
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with buildings without an AGS ID
        dropped.
    Returns
    -------
    tuple with two geopandas.GeoDataFrame s
        GeoDataFrame containing MaStR data allocated to building IDs.
        GeoDataFrame containing building data allocated to MaStR IDs.
    """
    logger.debug("Starting allocation of status quo.")

    q_mastr_gdf = sort_and_qcut_df(mastr_gdf, col="capacity", q=Q)
    q_buildings_gdf = sort_and_qcut_df(buildings_gdf, col="building_area", q=Q)

    desagg_mastr_gdf, desagg_buildings_gdf = allocate_pv(
        q_mastr_gdf, q_buildings_gdf, SEED
    )

    validate_output(desagg_mastr_gdf, desagg_buildings_gdf)

    return drop_unallocated_gens(desagg_mastr_gdf), desagg_buildings_gdf


@timer_func
def grid_districts(
    epsg: int,
) -> gpd.GeoDataFrame:
    """
    Load MV grid district geo data from eGo^n Database as
    geopandas.GeoDataFrame.
    Parameters
    -----------
    epsg : int
        EPSG ID to use as CRS.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing MV grid district ID and geo shapes data.
    """
    gdf = db.select_geodataframe(
        """
        SELECT bus_id, geom
        FROM grid.egon_mv_grid_district
        ORDER BY bus_id
        """,
        index_col="bus_id",
        geom_col="geom",
        epsg=epsg,
    )

    gdf.index = gdf.index.astype(int)

    logger.debug("Grid districts loaded.")

    return gdf


def scenario_data(
    carrier: str = "solar_rooftop",
    scenario: str = "eGon2035",
) -> pd.DataFrame:
    """
    Get scenario capacity data from eGo^n Database.
    Parameters
    -----------
    carrier : str
        Carrier type to filter table by.
    scenario : str
        Scenario to filter table by.
    Returns
    -------
    pandas.DataFrame
        DataFrame with scenario capacity data in GW.
    """
    with db.session_scope() as session:
        query = session.query(EgonScenarioCapacities).filter(
            EgonScenarioCapacities.carrier == carrier,
            EgonScenarioCapacities.scenario_name == scenario,
        )

    df = pd.read_sql(
        query.statement, query.session.bind, index_col="index"
    ).sort_index()

    logger.debug("Scenario capacity data loaded.")

    return df


class Vg250Lan(Base):
    __tablename__ = "vg250_lan"
    __table_args__ = {"schema": "boundaries"}

    id = Column(BigInteger, primary_key=True, index=True)
    ade = Column(BigInteger)
    gf = Column(BigInteger)
    bsg = Column(BigInteger)
    ars = Column(String)
    ags = Column(String)
    sdv_ars = Column(String)
    gen = Column(String)
    bez = Column(String)
    ibz = Column(BigInteger)
    bem = Column(String)
    nbd = Column(String)
    sn_l = Column(String)
    sn_r = Column(String)
    sn_k = Column(String)
    sn_v1 = Column(String)
    sn_v2 = Column(String)
    sn_g = Column(String)
    fk_s3 = Column(String)
    nuts = Column(String)
    ars_0 = Column(String)
    ags_0 = Column(String)
    wsk = Column(String)
    debkg_id = Column(String)
    rs = Column(String)
    sdv_rs = Column(String)
    rs_0 = Column(String)
    geometry = Column(Geometry(srid=EPSG), index=True)


def federal_state_data(to_crs: CRS) -> gpd.GeoDataFrame:
    """
    Get federal state data from eGo^n Database.
    Parameters
    -----------
    to_crs : pyproj.crs.crs.CRS
        CRS to transform geometries to.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with federal state data.
    """
    with db.session_scope() as session:
        query = session.query(
            Vg250Lan.id, Vg250Lan.nuts, Vg250Lan.geometry.label("geom")
        )

        gdf = gpd.read_postgis(
            query.statement, session.connection(), index_col="id"
        ).to_crs(to_crs)

    logger.debug("Federal State data loaded.")

    return gdf


@timer_func
def overlay_grid_districts_with_counties(
    mv_grid_district_gdf: gpd.GeoDataFrame,
    federal_state_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Calculate the intersections of MV grid districts and counties.
    Parameters
    -----------
    mv_grid_district_gdf : gpd.GeoDataFrame
        GeoDataFrame containing MV grid district ID and geo shapes data.
    federal_state_gdf : gpd.GeoDataFrame
        GeoDataFrame with federal state data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing the intersection shapes of MV grid districts
        and counties.
    """
    logger.debug(
        "Calculating intersection overlay between MV grid districts and "
        "counties. This may take a while..."
    )

    gdf = gpd.overlay(
        federal_state_gdf.to_crs(mv_grid_district_gdf.crs),
        mv_grid_district_gdf.reset_index(),
        how="intersection",
        keep_geom_type=True,
    )

    logger.debug("Done!")

    return gdf


@timer_func
def add_overlay_id_to_buildings(
    buildings_gdf: gpd.GeoDataFrame,
    grid_federal_state_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Add information about overlay ID to buildings.
    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    grid_federal_state_gdf : geopandas.GeoDataFrame
        GeoDataFrame with intersection shapes between counties and grid districts.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with overlay ID added.
    """
    gdf = (
        buildings_gdf.to_crs(grid_federal_state_gdf.crs)
        .sjoin(
            grid_federal_state_gdf,
            how="left",
            predicate="intersects",
        )
        .rename(columns={"index_right": "overlay_id"})
    )

    logger.debug("Added overlay ID to OSM buildings.")

    return gdf


def drop_buildings_outside_grids(
    buildings_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop all buildings outside of grid areas.
    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data
        with buildings without a bus ID dropped.
    """
    gdf = buildings_gdf.loc[~buildings_gdf.bus_id.isna()]

    logger.debug(
        f"{len(buildings_gdf) - len(gdf)} "
        f"({(len(buildings_gdf) - len(gdf)) / len(buildings_gdf) * 100:g}%) "
        f"of {len(buildings_gdf)} values are outside of the grid areas "
        "and are therefore dropped."
    )

    return gdf


def cap_per_bus_id(
    scenario: str,
) -> pd.DataFrame:
    """
    Get table with total PV rooftop capacity per grid district.

    Parameters
    -----------
    scenario : str
        Scenario name.
    Returns
    -------
    pandas.DataFrame
        DataFrame with total rooftop capacity per MV grid.
    """
    targets = config.datasets()["solar_rooftop"]["targets"]

    sql = f"""
    SELECT bus as bus_id, control, p_nom as capacity
    FROM {targets['generators']['schema']}.{targets['generators']['table']}
    WHERE carrier = 'solar_rooftop'
    AND scn_name = '{scenario}'
    """
    # TODO: where do the Slack rows come from?

    df = db.select_dataframe(sql, index_col="bus_id")

    return df.loc[df.control != "Slack"]

    # overlay_gdf = overlay_gdf.assign(capacity=np.nan)
    #
    # for cap, nuts in scenario_df[["capacity", "nuts"]].itertuples(index=False):
    #     nuts_gdf = overlay_gdf.loc[overlay_gdf.nuts == nuts]
    #
    #     capacity = nuts_gdf.building_area.multiply(
    #         cap / nuts_gdf.building_area.sum()
    #     )
    #
    #     overlay_gdf.loc[nuts_gdf.index] = overlay_gdf.loc[
    #         nuts_gdf.index
    #     ].assign(capacity=capacity.multiply(conversion).to_numpy())
    #
    # return overlay_gdf[["bus_id", "capacity"]].groupby("bus_id").sum()


def determine_end_of_life_gens(
    mastr_gdf: gpd.GeoDataFrame,
    scenario_timestamp: pd.Timestamp,
    pv_rooftop_lifetime: pd.Timedelta,
) -> gpd.GeoDataFrame:
    """
    Determine if an old PV system has reached its end of life.
    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    scenario_timestamp : pandas.Timestamp
        Timestamp at which the scenario takes place.
    pv_rooftop_lifetime : pandas.Timedelta
        Average expected lifetime of PV rooftop systems.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data and info if the system
        has reached its end of life.
    """
    before = mastr_gdf.capacity.sum()

    mastr_gdf = mastr_gdf.assign(
        age=scenario_timestamp - mastr_gdf.start_up_date
    )

    mastr_gdf = mastr_gdf.assign(
        end_of_life=pv_rooftop_lifetime < mastr_gdf.age
    )

    after = mastr_gdf.loc[~mastr_gdf.end_of_life].capacity.sum()

    logger.debug(
        "Determined if PV rooftop systems reached their end of life.\nTotal capacity: "
        f"{before}\nActive capacity: {after}"
    )

    return mastr_gdf


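# Quick sanity check of the age comparison, using the module constants:
# a system started mid-2010 is past its 20-year lifetime by the eGon2035
# scenario timestamp.
#
#     >>> age = SCENARIO_TIMESTAMP["eGon2035"] - pd.Timestamp(
#     ...     "2010-06-01", tz="UTC"
#     ... )
#     >>> PV_ROOFTOP_LIFETIME < age
#     True
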
def calculate_max_pv_cap_per_building(
    buildings_gdf: gpd.GeoDataFrame,
    mastr_gdf: gpd.GeoDataFrame,
    pv_cap_per_sq_m: float | int,
    roof_factor: float | int,
) -> gpd.GeoDataFrame:
    """
    Calculate the estimated maximum possible PV capacity per building.
    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    pv_cap_per_sq_m : float, int
        Average expected installable PV capacity per square meter.
    roof_factor : float, int
        Average share of the building area that is usable for PV.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with estimated maximum PV
        capacity.
    """
    gdf = (
        buildings_gdf.reset_index()
        .rename(columns={"index": "id"})
        .merge(
            mastr_gdf[
                [
                    "capacity",
                    "end_of_life",
                    "building_id",
                    "EinheitlicheAusrichtungUndNeigungswinkel",
                    "Hauptausrichtung",
                    "HauptausrichtungNeigungswinkel",
                ]
            ],
            how="left",
            left_on="id",
            right_on="building_id",
        )
        .set_index("id")
        .drop(columns="building_id")
    )

    return gdf.assign(
        max_cap=gdf.building_area.multiply(roof_factor * pv_cap_per_sq_m),
        end_of_life=gdf.end_of_life.fillna(True).astype(bool),
        bus_id=gdf.bus_id.astype(int),
    )


1744
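# The estimate above is plain arithmetic: max_cap = building_area
# * roof_factor * pv_cap_per_sq_m. With toy values, a 100 m² building, a
# usable roof share of 0.5 and 0.17 kW/m² yields
# 100 * 0.5 * 0.17 = 8.5 kW of installable capacity.
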
def calculate_building_load_factor(
    mastr_gdf: gpd.GeoDataFrame,
    buildings_gdf: gpd.GeoDataFrame,
    rounding: int = 4,
) -> gpd.GeoDataFrame:
    """
    Calculate the roof load factor from existing PV systems.
    Parameters
    ----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    rounding : int
        Rounding to use for load factor.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data with calculated load
        factor.
    """
    gdf = mastr_gdf.merge(
        buildings_gdf[["max_cap", "building_area"]]
        .loc[~buildings_gdf["max_cap"].isna()]
        .reset_index(),
        how="left",
        left_on="building_id",
        right_on="id",
    ).set_index("id")

    return gdf.assign(load_factor=(gdf.capacity / gdf.max_cap).round(rounding))


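# Sketch of the resulting metric (toy values): a 5 kW MaStR plant matched to
# a building with max_cap = 8.5 kW gets
# load_factor = round(5 / 8.5, 4) = 0.5882, i.e. the share of the estimated
# roof potential the existing plant actually uses.
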
def get_probability_for_property(
    mastr_gdf: gpd.GeoDataFrame,
    cap_range: tuple[int | float, int | float],
    prop: str,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Calculate the probability of the different options of a property of the
    existing PV plants.
    Parameters
    ----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_range : tuple(int, int)
        Capacity range of PV plants to look at.
    prop : str
        Property to calculate probabilities for. String needs to be in
        columns of mastr_gdf.
    Returns
    -------
    tuple
        numpy.ndarray
            Unique values of the property.
        numpy.ndarray
            Probabilities per unique value.
    """
    cap_range_gdf = mastr_gdf.loc[
        (mastr_gdf.capacity > cap_range[0])
        & (mastr_gdf.capacity <= cap_range[1])
    ]

    if prop == "load_factor":
        cap_range_gdf = cap_range_gdf.loc[cap_range_gdf[prop] <= 1]

    count = Counter(
        cap_range_gdf[prop].loc[
            ~cap_range_gdf[prop].isna()  # isna() also covers isnull(), its alias
            & ~(cap_range_gdf[prop] == "None")
        ]
    )

    values = np.array(list(count.keys()))

    probabilities = np.fromiter(count.values(), dtype=float)
    probabilities = probabilities / np.sum(probabilities)

    return values, probabilities


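# Illustrative sketch (toy frame, assumed column values): within the
# (0, 30] kW range below, "Süd" occurs twice and "West" once, so the
# returned probabilities are [2/3, 1/3]; the 50 kW row falls outside the
# range and is ignored.
#
# >>> toy = gpd.GeoDataFrame(
# ...     {
# ...         "capacity": [5.0, 20.0, 50.0, 25.0],
# ...         "Hauptausrichtung": ["Süd", "Süd", "Ost", "West"],
# ...     }
# ... )
# >>> values, probs = get_probability_for_property(
# ...     toy, (0, 30), "Hauptausrichtung"
# ... )  # values -> ['Süd', 'West'], probs -> [0.667, 0.333]
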
@timer_func
def probabilities(
    mastr_gdf: gpd.GeoDataFrame,
    cap_ranges: list[tuple[int | float, int | float]] | None = None,
    properties: list[str] | None = None,
) -> dict:
    """
    Calculate the probability of the different options of properties of the
    existing PV plants.
    Parameters
    ----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_ranges : list(tuple(int, int))
        List of capacity ranges to distinguish between. The first tuple
        should start with a zero and the last one should end with infinity.
    properties : list(str)
        List of properties to calculate probabilities for. Strings need to
        be in columns of mastr_gdf.
    Returns
    -------
    dict
        Dictionary with values and probabilities per capacity range.
    """
    if cap_ranges is None:
        cap_ranges = [
            (0, 30),
            (30, 100),
            (100, float("inf")),
        ]
    if properties is None:
        properties = [
            "EinheitlicheAusrichtungUndNeigungswinkel",
            "Hauptausrichtung",
            "HauptausrichtungNeigungswinkel",
            "load_factor",
        ]

    prob_dict = {}

    for cap_range in cap_ranges:
        prob_dict[cap_range] = {
            "values": {},
            "probabilities": {},
        }

        for prop in properties:
            v, p = get_probability_for_property(
                mastr_gdf,
                cap_range,
                prop,
            )

            prob_dict[cap_range]["values"][prop] = v
            prob_dict[cap_range]["probabilities"][prop] = p

    return prob_dict


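# Shape of the returned mapping (sketch):
#
#     {
#         (0, 30): {
#             "values": {"Hauptausrichtung": np.array([...]), ...},
#             "probabilities": {"Hauptausrichtung": np.array([...]), ...},
#         },
#         (30, 100): {...},
#         (100, inf): {...},
#     }
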
def cap_share_per_cap_range(
    mastr_gdf: gpd.GeoDataFrame,
    cap_ranges: list[tuple[int | float, int | float]] | None = None,
) -> dict[tuple[int | float, int | float], float]:
    """
    Calculate the share of PV capacity from the total PV capacity within
    capacity ranges.
    Parameters
    ----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_ranges : list(tuple(int, int))
        List of capacity ranges to distinguish between. The first tuple
        should start with a zero and the last one should end with infinity.
    Returns
    -------
    dict
        Dictionary with the share of PV capacity from the total PV capacity
        within capacity ranges.
    """
    if cap_ranges is None:
        cap_ranges = [
            (0, 30),
            (30, 100),
            (100, float("inf")),
        ]

    cap_share_dict = {}

    total_cap = mastr_gdf.capacity.sum()

    for cap_range in cap_ranges:
        cap_share = (
            mastr_gdf.loc[
                (mastr_gdf.capacity > cap_range[0])
                & (mastr_gdf.capacity <= cap_range[1])
            ].capacity.sum()
            / total_cap
        )

        cap_share_dict[cap_range] = cap_share

    return cap_share_dict


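# Toy check of the share arithmetic: capacities [10, 20, 70, 200] kW sum to
# 300 kW, so the shares are (0, 30] -> 30 / 300 = 0.1,
# (30, 100] -> 70 / 300 ≈ 0.233 and (100, inf) -> 200 / 300 ≈ 0.667.
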
def mean_load_factor_per_cap_range(
    mastr_gdf: gpd.GeoDataFrame,
    cap_ranges: list[tuple[int | float, int | float]] | None = None,
) -> dict[tuple[int | float, int | float], float]:
    """
    Calculate the mean roof load factor per capacity range from existing PV
    plants.
    Parameters
    ----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_ranges : list(tuple(int, int))
        List of capacity ranges to distinguish between. The first tuple
        should start with a zero and the last one should end with infinity.
    Returns
    -------
    dict
        Dictionary with the mean roof load factor per capacity range.
    """
    if cap_ranges is None:
        cap_ranges = [
            (0, 30),
            (30, 100),
            (100, float("inf")),
        ]

    load_factor_dict = {}

    for cap_range in cap_ranges:
        load_factor = mastr_gdf.loc[
            (mastr_gdf.load_factor <= 1)
            & (mastr_gdf.capacity > cap_range[0])
            & (mastr_gdf.capacity <= cap_range[1])
        ].load_factor.mean()

        load_factor_dict[cap_range] = load_factor

    return load_factor_dict


def building_area_range_per_cap_range(
    mastr_gdf: gpd.GeoDataFrame,
    cap_ranges: list[tuple[int | float, int | float]] | None = None,
    min_building_size: int | float = 10.0,
    upper_quantile: float = 0.95,
    lower_quantile: float = 0.05,
) -> dict[tuple[int | float, int | float], tuple[int | float, int | float]]:
    """
    Estimate the normal building area range per capacity range from existing
    PV plants.
    Parameters
    ----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_ranges : list(tuple(int, int))
        List of capacity ranges to distinguish between. The first tuple
        should start with a zero and the last one should end with infinity.
    min_building_size : int, float
        Minimum building size to consider for PV plants.
    upper_quantile : float
        Upper quantile to estimate the maximum building size per capacity
        range.
    lower_quantile : float
        Lower quantile to estimate the minimum building size per capacity
        range.
    Returns
    -------
    dict
        Dictionary with the estimated normal building area range per
        capacity range.
    """
    if cap_ranges is None:
        cap_ranges = [
            (0, 30),
            (30, 100),
            (100, float("inf")),
        ]

    building_area_range_dict = {}

    n_ranges = len(cap_ranges)

    for count, cap_range in enumerate(cap_ranges):
        cap_range_gdf = mastr_gdf.loc[
            (mastr_gdf.capacity > cap_range[0])
            & (mastr_gdf.capacity <= cap_range[1])
        ]

        if count == 0:
            building_area_range_dict[cap_range] = (
                min_building_size,
                cap_range_gdf.building_area.quantile(upper_quantile),
            )
        elif count == n_ranges - 1:
            building_area_range_dict[cap_range] = (
                cap_range_gdf.building_area.quantile(lower_quantile),
                float("inf"),
            )
        else:
            building_area_range_dict[cap_range] = (
                cap_range_gdf.building_area.quantile(lower_quantile),
                cap_range_gdf.building_area.quantile(upper_quantile),
            )

    values = list(building_area_range_dict.values())

    building_area_range_normed_dict = {}

    for count, (cap_range, (min_area, max_area)) in enumerate(
        building_area_range_dict.items()
    ):
        if count == 0:
            building_area_range_normed_dict[cap_range] = (
                min_area,
                np.mean((values[count + 1][0], max_area)),
            )
        elif count == n_ranges - 1:
            building_area_range_normed_dict[cap_range] = (
                np.mean((values[count - 1][1], min_area)),
                max_area,
            )
        else:
            building_area_range_normed_dict[cap_range] = (
                np.mean((values[count - 1][1], min_area)),
                np.mean((values[count + 1][0], max_area)),
            )

    return building_area_range_normed_dict


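# Boundary smoothing sketch (assumed quantile results): if the raw ranges
# come out as (10, 120), (80, 600) and (400, inf), the normed dict stitches
# neighbouring ranges together at their midpoints:
#
#     (0, 30]    -> (10, mean(80, 120))             = (10, 100)
#     (30, 100]  -> (mean(120, 80), mean(400, 600)) = (100, 500)
#     (100, inf) -> (mean(600, 400), inf)           = (500, inf)
#
# so every building area maps to exactly one capacity range.
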
@timer_func
def desaggregate_pv_in_mv_grid(
    buildings_gdf: gpd.GeoDataFrame,
    pv_cap: float | int,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Desaggregate PV capacity on buildings within a given grid district.
    Parameters
    ----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing buildings within the grid district.
    pv_cap : float, int
        PV capacity to desaggregate.
    Other Parameters
    ----------------
    prob_dict : dict
        Dictionary with values and probabilities per capacity range.
    cap_share_dict : dict
        Dictionary with share of PV capacity from the total PV capacity
        within capacity ranges.
    building_area_range_dict : dict
        Dictionary with estimated normal building area range per capacity
        range.
    load_factor_dict : dict
        Dictionary with mean roof load factor per capacity range.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    pv_cap_per_sq_m : float, int
        Average expected, installable PV capacity per square meter.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with desaggregated PV
        plants.
    """
    bus_id = int(buildings_gdf.bus_id.iat[0])

    rng = default_rng(seed=kwargs["seed"])
    random_state = RandomState(seed=kwargs["seed"])

    results_df = pd.DataFrame(columns=buildings_gdf.columns)

    for cap_range, share in kwargs["cap_share_dict"].items():
        pv_cap_range = pv_cap * share

        b_area_min, b_area_max = kwargs["building_area_range_dict"][cap_range]

        cap_range_buildings_gdf = buildings_gdf.loc[
            ~buildings_gdf.index.isin(results_df.index)
            & (buildings_gdf.building_area > b_area_min)
            & (buildings_gdf.building_area <= b_area_max)
        ]

        mean_load_factor = kwargs["load_factor_dict"][cap_range]
        cap_range_buildings_gdf = cap_range_buildings_gdf.assign(
            mean_cap=cap_range_buildings_gdf.max_cap * mean_load_factor,
            load_factor=np.nan,
            capacity=np.nan,
        )

        total_mean_cap = cap_range_buildings_gdf.mean_cap.sum()

        if total_mean_cap == 0:
            logger.warning(
                f"There are no matching roofs for capacity range {cap_range} "
                f"kW in grid {bus_id}. Using all buildings as fallback."
            )

            cap_range_buildings_gdf = buildings_gdf.loc[
                ~buildings_gdf.index.isin(results_df.index)
            ]

            if len(cap_range_buildings_gdf) == 0:
                logger.warning(
                    "There are no roofs available for capacity range "
                    f"{cap_range} kW in grid {bus_id}. Allowing dual use."
                )
                cap_range_buildings_gdf = buildings_gdf.copy()

            cap_range_buildings_gdf = cap_range_buildings_gdf.assign(
                mean_cap=cap_range_buildings_gdf.max_cap * mean_load_factor,
                load_factor=np.nan,
                capacity=np.nan,
            )

            total_mean_cap = cap_range_buildings_gdf.mean_cap.sum()

        elif total_mean_cap < pv_cap_range:
            logger.warning(
                f"Average roof utilization of the roof area in grid {bus_id} "
                f"and capacity range {cap_range} kW is not sufficient. The "
                "roof utilization will be above average."
            )

        frac = max(
            pv_cap_range / total_mean_cap,
            1 / len(cap_range_buildings_gdf),
        )

        samples_gdf = cap_range_buildings_gdf.sample(
            frac=min(1, frac),
            random_state=random_state,
        )

        cap_range_dict = kwargs["prob_dict"][cap_range]

        values_dict = cap_range_dict["values"]
        p_dict = cap_range_dict["probabilities"]

        load_factors = rng.choice(
            a=values_dict["load_factor"],
            size=len(samples_gdf),
            p=p_dict["load_factor"],
        )

        samples_gdf = samples_gdf.assign(
            load_factor=load_factors,
            capacity=(
                samples_gdf.building_area
                * load_factors
                * kwargs["pv_cap_per_sq_m"]
            ).clip(lower=0.4),
        )

        missing_factor = pv_cap_range / samples_gdf.capacity.sum()

        samples_gdf = samples_gdf.assign(
            capacity=(samples_gdf.capacity * missing_factor),
            load_factor=(samples_gdf.load_factor * missing_factor),
        )

        assert np.isclose(
            samples_gdf.capacity.sum(),
            pv_cap_range,
            rtol=1e-03,
        ), f"{samples_gdf.capacity.sum()} != {pv_cap_range}"

        results_df = pd.concat(
            [
                results_df,
                samples_gdf,
            ],
        )

    total_missing_factor = pv_cap / results_df.capacity.sum()

    results_df = results_df.assign(
        capacity=(results_df.capacity * total_missing_factor),
    )

    assert np.isclose(
        results_df.capacity.sum(),
        pv_cap,
        rtol=1e-03,
    ), f"{results_df.capacity.sum()} != {pv_cap}"

    return gpd.GeoDataFrame(
        results_df,
        # the sampled frames all stem from buildings_gdf, so its CRS is used
        # here; this also avoids an unbound `samples_gdf` in case the loop
        # above is never entered.
        crs=buildings_gdf.crs,
        geometry="geom",
    )


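# Rescaling sketch: if the sampled plants sum to 80 kW but the range target
# pv_cap_range is 100 kW, missing_factor = 100 / 80 = 1.25 and both capacity
# and load_factor are multiplied by it, which is why the assert above can
# require the per-range total to match the target within rtol=1e-3.
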
@timer_func
def desaggregate_pv(
    buildings_gdf: gpd.GeoDataFrame,
    cap_df: pd.DataFrame,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Desaggregate PV capacity on buildings within a given grid district.
    Parameters
    ----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    cap_df : pandas.DataFrame
        DataFrame with total rooftop capacity per mv grid.
    Other Parameters
    ----------------
    prob_dict : dict
        Dictionary with values and probabilities per capacity range.
    cap_share_dict : dict
        Dictionary with share of PV capacity from the total PV capacity
        within capacity ranges.
    building_area_range_dict : dict
        Dictionary with estimated normal building area range per capacity
        range.
    load_factor_dict : dict
        Dictionary with mean roof load factor per capacity range.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    pv_cap_per_sq_m : float, int
        Average expected, installable PV capacity per square meter.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with desaggregated PV
        plants.
    """
    allocated_buildings_gdf = buildings_gdf.loc[~buildings_gdf.end_of_life]

    building_bus_ids = set(buildings_gdf.bus_id)
    cap_bus_ids = set(cap_df.index)

    logger.debug(
        f"Bus IDs from buildings: {len(building_bus_ids)}\nBus IDs from "
        f"capacity: {len(cap_bus_ids)}"
    )

    if len(building_bus_ids) > len(cap_bus_ids):
        missing = building_bus_ids - cap_bus_ids
    else:
        missing = cap_bus_ids - building_bus_ids

    logger.debug(str(missing))

    bus_ids = np.intersect1d(list(building_bus_ids), list(cap_bus_ids))

    # assert set(buildings_gdf.bus_id.unique()) == set(cap_df.index)

    for bus_id in bus_ids:
        buildings_grid_gdf = buildings_gdf.loc[buildings_gdf.bus_id == bus_id]

        pv_installed_gdf = buildings_grid_gdf.loc[
            ~buildings_grid_gdf.end_of_life
        ]

        pv_installed = pv_installed_gdf.capacity.sum()

        pot_buildings_gdf = buildings_grid_gdf.drop(
            index=pv_installed_gdf.index
        )

        if len(pot_buildings_gdf) == 0:
            logger.error(
                f"In grid {bus_id} there are no potential buildings to "
                "allocate PV capacity to. The grid is skipped. This message "
                "should only appear during test runs with few buildings."
            )

            continue

        pv_target = cap_df.at[bus_id, "capacity"] * 1000  # MW -> kW

        logger.debug(f"pv_target: {pv_target}")

        pv_missing = pv_target - pv_installed

        if pv_missing <= 0:
            logger.warning(
                f"In grid {bus_id} there is more PV installed "
                f"({pv_installed: g} kW) in the status quo than allocated "
                f"within the scenario ({pv_target: g} kW). No new generators "
                "are added."
            )

            continue

        if pot_buildings_gdf.max_cap.sum() < pv_missing:
            logger.error(
                f"In grid {bus_id} there is less PV potential ("
                f"{pot_buildings_gdf.max_cap.sum():g} kW) than allocated PV "
                f"capacity ({pv_missing:g} kW). The average roof utilization "
                "will be very high."
            )

        gdf = desaggregate_pv_in_mv_grid(
            buildings_gdf=pot_buildings_gdf,
            pv_cap=pv_missing,
            **kwargs,
        )

        logger.debug(f"New cap in grid {bus_id}: {gdf.capacity.sum()}")
        logger.debug(f"Installed cap in grid {bus_id}: {pv_installed}")
        logger.debug(
            f"Total cap in grid {bus_id}: {gdf.capacity.sum() + pv_installed}"
        )

        if not np.isclose(
            gdf.capacity.sum() + pv_installed, pv_target, rtol=1e-3
        ):
            logger.warning(
                f"The desired capacity and actual capacity in grid {bus_id} "
                f"differ.\nDesired cap: {pv_target}\nActual cap: "
                f"{gdf.capacity.sum() + pv_installed}"
            )

        pre_cap = allocated_buildings_gdf.capacity.sum()
        new_cap = gdf.capacity.sum()

        allocated_buildings_gdf = pd.concat(
            [
                allocated_buildings_gdf,
                gdf,
            ]
        )

        total_cap = allocated_buildings_gdf.capacity.sum()

        assert np.isclose(pre_cap + new_cap, total_cap)

    logger.debug("Desaggregated scenario.")
    logger.debug(f"Scenario capacity: {cap_df.capacity.sum(): g}")
    logger.debug(
        f"Generator capacity: {allocated_buildings_gdf.capacity.sum() / 1000: g}"
    )

    return gpd.GeoDataFrame(
        allocated_buildings_gdf,
        # all concatenated frames stem from buildings_gdf, so its CRS is used
        # here; this also avoids an unbound `gdf` when the loop body never
        # assigns one.
        crs=buildings_gdf.crs,
        geometry="geom",
    )


@timer_func
def add_buildings_meta_data(
    buildings_gdf: gpd.GeoDataFrame,
    prob_dict: dict,
    seed: int,
) -> gpd.GeoDataFrame:
    """
    Randomly add additional metadata to desaggregated PV plants.
    Parameters
    ----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with desaggregated PV
        plants.
    prob_dict : dict
        Dictionary with values and probabilities per capacity range.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with desaggregated PV
        plants.
    """
    rng = default_rng(seed=seed)
    buildings_gdf = buildings_gdf.reset_index().rename(
        columns={
            "index": "building_id",
        }
    )

    for (min_cap, max_cap), cap_range_prob_dict in prob_dict.items():
        cap_range_gdf = buildings_gdf.loc[
            (buildings_gdf.capacity >= min_cap)
            & (buildings_gdf.capacity < max_cap)
        ]

        for key, values in cap_range_prob_dict["values"].items():
            if key == "load_factor":
                continue

            # rows where the property is missing or the literal string "None"
            gdf = cap_range_gdf.loc[
                cap_range_gdf[key].isna()  # isna() also covers isnull(), its alias
                | (cap_range_gdf[key] == "None")
            ]

            key_vals = rng.choice(
                a=values,
                size=len(gdf),
                p=cap_range_prob_dict["probabilities"][key],
            )

            buildings_gdf.loc[gdf.index, key] = key_vals

    return buildings_gdf


def add_voltage_level(
    buildings_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Add voltage level derived from generator capacity to the power plants.
    Parameters
    ----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with desaggregated PV
        plants.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with voltage level per
        generator.
    """

    def voltage_levels(p: float) -> int:
        if p < 100:
            return 7
        elif p < 200:
            return 6
        elif p < 5500:
            return 5
        elif p < 20000:
            return 4
        elif p < 120000:
            return 3
        return 1

    return buildings_gdf.assign(
        voltage_level=buildings_gdf.capacity.apply(voltage_levels)
    )


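# Threshold sketch (capacities in kW): 30 -> level 7, 150 -> 6, 1_000 -> 5,
# 10_000 -> 4, 100_000 -> 3 and anything >= 120_000 -> 1.
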
def add_start_up_date(
    buildings_gdf: gpd.GeoDataFrame,
    start: pd.Timestamp,
    end: pd.Timestamp,
    seed: int,
):
    """
    Randomly (uniformly distributed) add a start-up date to new PV
    generators.
    Parameters
    ----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with desaggregated PV
        plants.
    start : pandas.Timestamp
        Minimum Timestamp to use.
    end : pandas.Timestamp
        Maximum Timestamp to use.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with start-up date added.
    """
    rng = default_rng(seed=seed)

    date_range = pd.date_range(start=start, end=end, freq="1D")

    return buildings_gdf.assign(
        start_up_date=rng.choice(date_range, size=len(buildings_gdf))
    )


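# Sketch: with start = 2022-01-01 and end = 2035-01-01 each new generator
# draws one of the ~4750 days in the range uniformly; the seeded default_rng
# keeps the draw reproducible across runs.
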
@timer_func
def allocate_scenarios(
    mastr_gdf: gpd.GeoDataFrame,
    valid_buildings_gdf: gpd.GeoDataFrame,
    last_scenario_gdf: gpd.GeoDataFrame,
    scenario: str,
):
    """
    Desaggregate and allocate scenario PV rooftop ramp-ups onto buildings.
    Parameters
    ----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    valid_buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    last_scenario_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings matched with PV generators
        from the temporally preceding scenario.
    scenario : str
        Scenario to desaggregate and allocate.
    Returns
    -------
    tuple
        geopandas.GeoDataFrame
            GeoDataFrame containing OSM buildings matched with PV
            generators.
        pandas.DataFrame
            DataFrame containing PV rooftop capacity per grid id.
    """
    cap_per_bus_id_df = cap_per_bus_id(scenario)

    logger.debug(
        f"cap_per_bus_id_df total capacity: {cap_per_bus_id_df.capacity.sum()}"
    )

    last_scenario_gdf = determine_end_of_life_gens(
        last_scenario_gdf,
        SCENARIO_TIMESTAMP[scenario],
        PV_ROOFTOP_LIFETIME,
    )

    buildings_gdf = calculate_max_pv_cap_per_building(
        valid_buildings_gdf,
        last_scenario_gdf,
        PV_CAP_PER_SQ_M,
        ROOF_FACTOR,
    )

    mastr_gdf = calculate_building_load_factor(
        mastr_gdf,
        buildings_gdf,
    )

    probabilities_dict = probabilities(
        mastr_gdf,
        cap_ranges=CAP_RANGES,
    )

    cap_share_dict = cap_share_per_cap_range(
        mastr_gdf,
        cap_ranges=CAP_RANGES,
    )

    load_factor_dict = mean_load_factor_per_cap_range(
        mastr_gdf,
        cap_ranges=CAP_RANGES,
    )

    building_area_range_dict = building_area_range_per_cap_range(
        mastr_gdf,
        cap_ranges=CAP_RANGES,
        min_building_size=MIN_BUILDING_SIZE,
        upper_quantile=UPPER_QUNATILE,
        lower_quantile=LOWER_QUANTILE,
    )

    allocated_buildings_gdf = desaggregate_pv(
        buildings_gdf=buildings_gdf,
        cap_df=cap_per_bus_id_df,
        prob_dict=probabilities_dict,
        cap_share_dict=cap_share_dict,
        building_area_range_dict=building_area_range_dict,
        load_factor_dict=load_factor_dict,
        seed=SEED,
        pv_cap_per_sq_m=PV_CAP_PER_SQ_M,
    )

    allocated_buildings_gdf = allocated_buildings_gdf.assign(scenario=scenario)

    meta_buildings_gdf = frame_to_numeric(
        add_buildings_meta_data(
            allocated_buildings_gdf,
            probabilities_dict,
            SEED,
        )
    )

    return (
        add_start_up_date(
            meta_buildings_gdf,
            start=last_scenario_gdf.start_up_date.max(),
            end=SCENARIO_TIMESTAMP[scenario],
            seed=SEED,
        ),
        cap_per_bus_id_df,
    )


class EgonPowerPlantPvRoofBuildingScenario(Base):
    __tablename__ = "egon_power_plants_pv_roof_building"
    __table_args__ = {"schema": "supply"}

    index = Column(Integer, primary_key=True, index=True)
    scenario = Column(String)
    bus_id = Column(Integer, nullable=True)
    building_id = Column(Integer)
    gens_id = Column(String, nullable=True)
    capacity = Column(Float)
    einheitliche_ausrichtung_und_neigungswinkel = Column(Float)
    hauptausrichtung = Column(String)
    hauptausrichtung_neigungswinkel = Column(String)
    voltage_level = Column(Integer)
    weather_cell_id = Column(Integer)


def create_scenario_table(buildings_gdf):
    """Create mapping table pv_unit <-> building for scenario"""
    EgonPowerPlantPvRoofBuildingScenario.__table__.drop(
        bind=engine, checkfirst=True
    )
    EgonPowerPlantPvRoofBuildingScenario.__table__.create(
        bind=engine, checkfirst=True
    )

    buildings_gdf.rename(columns=COLS_TO_RENAME).assign(
        capacity=buildings_gdf.capacity.div(10**3)  # kW -> MW
    )[COLS_TO_EXPORT].reset_index().to_sql(
        name=EgonPowerPlantPvRoofBuildingScenario.__table__.name,
        schema=EgonPowerPlantPvRoofBuildingScenario.__table__.schema,
        con=db.engine(),
        if_exists="append",
        index=False,
    )


def geocode_mastr_data():
    """
    Read PV rooftop data from MaStR CSV.
    TODO: the source will be replaced as soon as the MaStR data is
    available in DB.
    """
    mastr_df = mastr_data(
        MASTR_INDEX_COL,
        MASTR_RELEVANT_COLS,
        MASTR_DTYPES,
        MASTR_PARSE_DATES,
    )

    clean_mastr_df = clean_mastr_data(
        mastr_df,
        max_realistic_pv_cap=MAX_REALISTIC_PV_CAP,
        min_realistic_pv_cap=MIN_REALISTIC_PV_CAP,
        seed=SEED,
        rounding=ROUNDING,
    )

    geocoding_df = geocoding_data(clean_mastr_df)

    ratelimiter = geocoder(USER_AGENT, MIN_DELAY_SECONDS)

    geocode_gdf = geocode_data(geocoding_df, ratelimiter, EPSG)

    create_geocoded_table(geocode_gdf)


def add_weather_cell_id(buildings_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Map each building to its weather cell via the zensus population id."""
    sql = """
    SELECT building_id, zensus_population_id
    FROM boundaries.egon_map_zensus_mvgd_buildings
    """

    buildings_gdf = buildings_gdf.merge(
        right=db.select_dataframe(sql).drop_duplicates(subset="building_id"),
        how="left",
        on="building_id",
    )

    sql = """
    SELECT zensus_population_id, w_id as weather_cell_id
    FROM boundaries.egon_map_zensus_weather_cell
    """

    buildings_gdf = buildings_gdf.merge(
        right=db.select_dataframe(sql).drop_duplicates(
            subset="zensus_population_id"
        ),
        how="left",
        on="zensus_population_id",
    )

    if buildings_gdf.weather_cell_id.isna().any():
        missing = buildings_gdf.loc[
            buildings_gdf.weather_cell_id.isna()
        ].building_id.tolist()

        raise ValueError(
            f"The following buildings don't have a weather cell id: {missing}"
        )

    return buildings_gdf


def pv_rooftop_to_buildings():
    """Main script, executed as task"""

    mastr_gdf = load_mastr_data()

    if DIRTY_FIX:
        mastr_gdf = mastr_gdf.reset_index()

        df_list = [mastr_gdf.copy()]

        truncated_gdf = mastr_gdf.sort_values(by="capacity", ascending=False)

        cap_before = truncated_gdf.capacity.sum()

        size = int(0.05 * len(truncated_gdf))

        truncated_gdf = truncated_gdf.iloc[size:]

        cap_after = truncated_gdf.capacity.sum()

        logger.debug(
            f"Capacity of all MaStR gens: {cap_before / 1000: g} MW\n"
            f"Capacity of the 95% smallest MaStR gens: {cap_after / 1000: g} MW"
        )

        target = 34
        actual = 6

        share = cap_after / cap_before

        # n >= 1 is guaranteed here since target > actual and share <= 1, so
        # the loop below always runs and df_append is always bound.
        n = int(round(target / actual / share, 0))

        logger.debug(f"N: {n}")

        for i in range(n):
            df_append = truncated_gdf.copy()
            df_append[MASTR_INDEX_COL] += f"_{i}"

            df_list.append(df_append)

        mastr_gdf = pd.concat(df_list, ignore_index=True).set_index(
            MASTR_INDEX_COL
        )

        del df_list, df_append

    buildings_gdf = load_building_data()

    desagg_mastr_gdf, desagg_buildings_gdf = allocate_to_buildings(
        mastr_gdf, buildings_gdf
    )

    all_buildings_gdf = (
        desagg_mastr_gdf.assign(scenario="status_quo")
        .reset_index()
        .rename(columns={"geometry": "geom", "EinheitMastrNummer": "gens_id"})
    )

    scenario_buildings_gdf = all_buildings_gdf.copy()

    cap_per_bus_id_df = pd.DataFrame()

    for scenario in SCENARIOS:
        logger.debug(f"Desaggregating scenario {scenario}.")
        (
            scenario_buildings_gdf,
            cap_per_bus_id_scenario_df,
        ) = allocate_scenarios(  # noqa: F841
            desagg_mastr_gdf,
            desagg_buildings_gdf,
            scenario_buildings_gdf,
            scenario,
        )

        all_buildings_gdf = gpd.GeoDataFrame(
            pd.concat(
                [all_buildings_gdf, scenario_buildings_gdf], ignore_index=True
            ),
            crs=scenario_buildings_gdf.crs,
            geometry="geom",
        )

        cap_per_bus_id_df = pd.concat(
            [cap_per_bus_id_df, cap_per_bus_id_scenario_df]
        )

    # add weather cell
    all_buildings_gdf = add_weather_cell_id(all_buildings_gdf)

    # export scenario
    create_scenario_table(add_voltage_level(all_buildings_gdf))