Passed
Pull Request — dev (#1112)
by
unknown
04:32
created

drop_gens_outside_muns()   A

Complexity

Conditions 1

Size

Total Lines 25
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 25
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""
2
Distribute MaStR PV rooftop capacities to OSM and synthetic buildings. Generate
3
new PV rooftop generators for scenarios eGon2035 and eGon100RE.
4
5
Data cleaning and inference:
6
* Drop duplicates and entries with missing critical data.
7
* Determine most plausible capacity from multiple values given in MaStR data.
8
* Drop generators which don't have any plausible capacity data
9
  (23.5MW > P > 0.1).
10
* Randomly and weighted add a start-up date if it is missing.
11
* Extract zip and municipality from 'site' given in MaStR data.
12
* Geocode unique zip and municipality combinations with Nominatim (1 sec
13
  delay). Drop generators for which geocoding failed or which are located
14
  outside the municipalities of Germany.
15
* Add some visual sanity checks for cleaned data.
16
17
Allocation of MaStR data:
18
* Allocate each generator to an existing building from OSM.
19
* Determine the quantile each generator and building is in depending on the
20
  capacity of the generator and the area of the polygon of the building.
21
* Randomly distribute generators within each municipality preferably within
22
  the same building area quantile as the generators are capacity wise.
23
* If not enough buildings exists within a municipality and quantile additional
24
  buildings from other quantiles are chosen randomly.
25
26
Desegregation of pv rooftop scenarios:
27
* The scenario data per federal state is linearly distributed to the mv grid
28
  districts according to the pv rooftop potential per mv grid district.
29
* The rooftop potential is estimated from the building area given from the OSM
30
  buildings.
31
* Grid districts, which are located in several federal states, are allocated
32
  PV capacity according to their respective roof potential in the individual
33
  federal states.
34
* The desegregation of PV plants within a grid districts respects existing
35
  plants from MaStR, which did not reach their end of life.
36
* New PV plants are randomly and weighted generated using a breakdown of MaStR
37
  data as generator basis.
38
* Plant metadata (e.g. plant orientation) is also added random and weighted
39
  from MaStR data as basis.
40
"""
41
from __future__ import annotations
42
43
from collections import Counter
44
from functools import wraps
45
from time import perf_counter
46
47
from geoalchemy2 import Geometry
48
from loguru import logger
49
from numpy.random import RandomState, default_rng
50
from pyproj.crs.crs import CRS
51
from sqlalchemy import BigInteger, Column, Float, Integer, String
52
from sqlalchemy.dialects.postgresql import HSTORE
53
from sqlalchemy.ext.declarative import declarative_base
54
import geopandas as gpd
55
import numpy as np
56
import pandas as pd
57
58
from egon.data import config, db
59
from egon.data.datasets.electricity_demand_timeseries.hh_buildings import (
60
    OsmBuildingsSynthetic,
61
)
62
from egon.data.datasets.power_plants.mastr_db_classes import EgonPowerPlantsPv
63
from egon.data.datasets.scenario_capacities import EgonScenarioCapacities
64
from egon.data.datasets.zensus_vg250 import Vg250Gem
65
66
# Shared database engine and declarative base for the ORM models below.
engine = db.engine()
Base = declarative_base()
# Global random seed taken from the egon-data CLI settings.
SEED = int(config.settings()["egon-data"]["--random-seed"])

# TODO: move to yml
# MaStR data
MASTR_INDEX_COL = "gens_id"

EPSG = 4326
SRID = 3035

# Data cleaning: capacity bounds (in MW) considered realistic for rooftop PV.
MAX_REALISTIC_PV_CAP = 23500 / 10**3
MIN_REALISTIC_PV_CAP = 0.1 / 10**3

# Show additional logging information.
VERBOSE = False

# Number of quantiles used to classify generator capacity / building area.
Q = 5

# Scenario data.
SCENARIOS = ["eGon2035", "eGon100RE"]
SCENARIO_TIMESTAMP = {
    "eGon2035": pd.Timestamp("2035-01-01", tz="UTC"),
    "eGon100RE": pd.Timestamp("2050-01-01", tz="UTC"),
}
# Assumed operational lifetime of a PV rooftop plant (20 years).
PV_ROOFTOP_LIFETIME = pd.Timedelta(20 * 365, unit="D")

# Example module: Trina Vertex S TSM-400DE09M.08 400 Wp
# https://www.photovoltaik4all.de/media/pdf/92/64/68/Trina_Datasheet_VertexS_DE09-08_2021_A.pdf
MODUL_CAP = 0.4  # kWp
MODUL_SIZE = 1.096 * 1.754  # m²
PV_CAP_PER_SQ_M = MODUL_CAP / MODUL_SIZE

# Estimation of usable roof area.
# Factor for the conversion of building area to roof area:
# * estimated mean roof pitch: 35°
# * estimated usable roof share: 80%
# * estimation that only the south side of the building is used for PV
# see https://mediatum.ub.tum.de/doc/%20969497/969497.pdf
# AREA_FACTOR = 1.221
# USABLE_ROOF_SHARE = 0.8
# SOUTH_SHARE = 0.5
# ROOF_FACTOR = AREA_FACTOR * USABLE_ROOF_SHARE * SOUTH_SHARE
ROOF_FACTOR = 0.5

# Capacity ranges used to categorise generators (unit as in the MaStR
# capacity column; presumably kW — confirm against usage).
CAP_RANGES = [
    (0, 30),
    (30, 100),
    (100, float("inf")),
]

MIN_BUILDING_SIZE = 10.0
UPPER_QUANTILE = 0.95
LOWER_QUANTILE = 0.05

# Columns exported to the result table.
COLS_TO_EXPORT = [
    "scenario",
    "bus_id",
    "building_id",
    "gens_id",
    "capacity",
    "orientation_uniform",
    "orientation_primary",
    "orientation_primary_angle",
    "voltage_level",
    "weather_cell_id",
]

# TODO
INCLUDE_SYNTHETIC_BUILDINGS = True
ONLY_BUILDINGS_WITH_DEMAND = True
TEST_RUN = False
141
142
143
def timer_func(func):
    """Decorator logging the wall-clock runtime of *func* at debug level."""

    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        # perf_counter is monotonic and high resolution - suitable for timing.
        tic = perf_counter()
        result = func(*args, **kwargs)
        elapsed = perf_counter() - tic
        logger.debug(
            f"Function {func.__name__} took {elapsed:.4f} seconds."
        )
        return result

    return timeit_wrapper
156
157
158
@timer_func
def mastr_data(
    index_col: str | int | list[str] | list[int],
) -> gpd.GeoDataFrame:
    """
    Read MaStR PV rooftop data from the database.

    Parameters
    -----------
    index_col : str, int or list of str or int
        Column(s) to use as the row labels of the GeoDataFrame.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data.
    """
    # Only operating rooftop systems are of interest.
    rooftop_filters = (
        EgonPowerPlantsPv.status == "InBetrieb",
        EgonPowerPlantsPv.site_type
        == "Bauliche Anlagen (Hausdach, Gebäude und Fassade)",
    )

    with db.session_scope() as session:
        query = session.query(EgonPowerPlantsPv).filter(*rooftop_filters)

    gdf = gpd.read_postgis(
        query.statement, query.session.bind, index_col=index_col
    )

    logger.debug("MaStR data loaded.")

    return gdf
188
189
190
@timer_func
def clean_mastr_data(
    mastr_gdf: gpd.GeoDataFrame,
    max_realistic_pv_cap: int | float,
    min_realistic_pv_cap: int | float,
    seed: int,
) -> gpd.GeoDataFrame:
    """
    Clean the MaStR data from implausible data.

    * Drop MaStR ID duplicates.
    * Drop generators with implausible capacities.
    * Fill missing start-up dates by sampling from the known ones.
    * Normalise all start-up dates to timezone-aware UTC timestamps.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data.
    max_realistic_pv_cap : int or float
        Maximum capacity which is considered to be realistic.
    min_realistic_pv_cap : int or float
        Minimum capacity which is considered to be realistic.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing cleaned MaStR data.
    """
    init_len = len(mastr_gdf)

    # Drop MaStR ID (index) duplicates, keeping the first occurrence.
    mastr_gdf = mastr_gdf.loc[~mastr_gdf.index.duplicated()]

    # Drop generators without any capacity info, with a capacity of zero and
    # with a capacity > 23.5 MW, because Germany's largest rooftop PV system
    # is 23 MW, see:
    # https://www.iwr.de/news/groesste-pv-dachanlage-europas-wird-in-sachsen-anhalt-gebaut-news37379
    mastr_gdf = mastr_gdf.loc[
        ~mastr_gdf.capacity.isna()
        & (mastr_gdf.capacity <= max_realistic_pv_cap)
        & (mastr_gdf.capacity > min_realistic_pv_cap)
    ]

    # Get a consistent start-up date: fill missing commissioning dates by
    # sampling without replacement from the pool of known dates.
    # NOTE(review): despite the log message, no weights are passed to
    # rng.choice, so the draw is uniform over the pool.
    pool = mastr_gdf.loc[
        ~mastr_gdf.commissioning_date.isna()
    ].commissioning_date.to_numpy()

    size = len(mastr_gdf) - len(pool)

    if size > 0:
        rng = default_rng(seed=seed)

        choice = rng.choice(
            pool,
            size=size,
            replace=False,
        )

        mastr_gdf.loc[mastr_gdf.commissioning_date.isna()] = mastr_gdf.loc[
            mastr_gdf.commissioning_date.isna()
        ].assign(commissioning_date=choice)

        # Bug fix: message previously misspelled "weigthed".
        logger.info(
            f"Randomly and weighted added start-up date to {size} generators."
        )

    # Normalise commissioning dates to timezone-aware UTC timestamps.
    mastr_gdf = mastr_gdf.assign(
        commissioning_date=pd.to_datetime(
            mastr_gdf.commissioning_date, utc=True
        )
    )

    end_len = len(mastr_gdf)
    logger.debug(
        f"Dropped {init_len - end_len} "
        f"({((init_len - end_len) / init_len) * 100:g}%)"
        f" of {init_len} rows from MaStR DataFrame."
    )

    return mastr_gdf
275
276
277
@timer_func
def municipality_data() -> gpd.GeoDataFrame:
    """
    Get municipality data from the eGo^n database.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with municipality data, indexed by AGS.
    """
    with db.session_scope() as session:
        query = session.query(Vg250Gem.ags, Vg250Gem.geometry.label("geom"))

    muns_gdf = gpd.read_postgis(
        query.statement, query.session.bind, index_col="ags"
    )

    return muns_gdf
292
293
294
@timer_func
def add_ags_to_gens(
    mastr_gdf: gpd.GeoDataFrame,
    municipalities_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Add information about the AGS ID to the generators.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame with valid and cleaned MaStR data.
    municipalities_gdf : geopandas.GeoDataFrame
        GeoDataFrame with municipality data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with valid and cleaned MaStR data with the AGS ID added.
    """
    # A left spatial join keeps every generator; the municipality index
    # (AGS) arrives as "index_right" and is renamed accordingly.
    joined_gdf = mastr_gdf.sjoin(
        municipalities_gdf,
        how="left",
        predicate="intersects",
    )

    return joined_gdf.rename(columns={"index_right": "ags"})
318
319
320
def drop_gens_outside_muns(
    mastr_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop all generators outside of municipalities.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame with valid and cleaned MaStR data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with valid and cleaned MaStR data
        with generators without an AGS ID dropped.
    """
    gdf = mastr_gdf.loc[~mastr_gdf.ags.isna()]

    # Guard the log statement: with an empty input frame the percentage
    # calculation below would raise a ZeroDivisionError.
    if len(mastr_gdf) > 0:
        logger.debug(
            f"{len(mastr_gdf) - len(gdf)} ("
            f"{(len(mastr_gdf) - len(gdf)) / len(mastr_gdf) * 100:g}%)"
            f" of {len(mastr_gdf)} values are outside of the municipalities"
            " and are therefore dropped."
        )

    return gdf
345
346
347
def load_mastr_data():
    """
    Load, clean and geo-reference the MaStR PV rooftop data.

    Note: the source will be replaced as soon as the MaStR data is available
    in the DB.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data with geocoded locations.
    """
    raw_mastr_gdf = mastr_data(
        MASTR_INDEX_COL,
    )

    # Remove duplicates, implausible capacities and missing start-up dates.
    clean_mastr_gdf = clean_mastr_data(
        raw_mastr_gdf,
        max_realistic_pv_cap=MAX_REALISTIC_PV_CAP,
        min_realistic_pv_cap=MIN_REALISTIC_PV_CAP,
        seed=SEED,
    )

    # Attach the AGS of the containing municipality and drop generators
    # which fall outside all municipalities.
    municipalities_gdf = municipality_data()
    clean_mastr_gdf = add_ags_to_gens(clean_mastr_gdf, municipalities_gdf)

    return drop_gens_outside_muns(clean_mastr_gdf)
372
373
374
class OsmBuildingsFiltered(Base):
    """
    ORM mapping of table ``openstreetmap.osm_buildings_filtered``.

    Filtered OSM buildings with their geometry (``geom``), a point
    representation (``geom_point``) and the building area (``area``).
    """

    __tablename__ = "osm_buildings_filtered"
    __table_args__ = {"schema": "openstreetmap"}

    osm_id = Column(BigInteger)  # original OSM object ID
    amenity = Column(String)
    building = Column(String)
    name = Column(String)
    geom = Column(Geometry(srid=SRID), index=True)  # building geometry
    area = Column(Float)  # building area
    geom_point = Column(Geometry(srid=SRID), index=True)  # point geometry
    tags = Column(HSTORE)  # raw OSM tags
    id = Column(BigInteger, primary_key=True, index=True)
387
388
389
@timer_func
def osm_buildings(
    to_crs: CRS,
) -> gpd.GeoDataFrame:
    """
    Read OSM buildings data (id, area and point geometry) from the
    eGo^n database.

    Parameters
    -----------
    to_crs : pyproj.crs.crs.CRS
        CRS to transform the geometries to.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    """
    with db.session_scope() as session:
        query = session.query(
            OsmBuildingsFiltered.id,
            OsmBuildingsFiltered.area,
            OsmBuildingsFiltered.geom_point.label("geom"),
        )

    buildings_gdf = gpd.read_postgis(
        query.statement, query.session.bind, index_col="id"
    )

    return buildings_gdf.to_crs(to_crs)
414
415
416
@timer_func
def synthetic_buildings(
    to_crs: CRS,
) -> gpd.GeoDataFrame:
    """
    Read synthetic buildings data (id, area and point geometry) from the
    eGo^n database.

    Parameters
    -----------
    to_crs : pyproj.crs.crs.CRS
        CRS to transform the geometries to.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing synthetic buildings data.
    """
    with db.session_scope() as session:
        query = session.query(
            OsmBuildingsSynthetic.id,
            OsmBuildingsSynthetic.area,
            OsmBuildingsSynthetic.geom_point.label("geom"),
        )

    buildings_gdf = gpd.read_postgis(
        query.statement, query.session.bind, index_col="id"
    )

    return buildings_gdf.to_crs(to_crs)
441
442
443
@timer_func
def add_ags_to_buildings(
    buildings_gdf: gpd.GeoDataFrame,
    municipalities_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Add information about the AGS ID to the buildings.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    municipalities_gdf : geopandas.GeoDataFrame
        GeoDataFrame with municipality data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with the AGS ID added.
    """
    # A left spatial join keeps every building; the municipality index
    # (AGS) arrives as "index_right" and is renamed accordingly.
    joined_gdf = buildings_gdf.sjoin(
        municipalities_gdf,
        how="left",
        predicate="intersects",
    )

    return joined_gdf.rename(columns={"index_right": "ags"})
467
468
469
def drop_buildings_outside_muns(
    buildings_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop all buildings outside of municipalities.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data
        with buildings without an AGS ID dropped.
    """
    gdf = buildings_gdf.loc[~buildings_gdf.ags.isna()]

    # Guard the log statement: with an empty input frame the percentage
    # calculation below would raise a ZeroDivisionError.
    if len(buildings_gdf) > 0:
        logger.debug(
            f"{len(buildings_gdf) - len(gdf)} "
            f"({(len(buildings_gdf) - len(gdf)) / len(buildings_gdf) * 100:g}%) "
            f"of {len(buildings_gdf)} values are outside of the municipalities "
            "and are therefore dropped."
        )

    return gdf
494
495
496
def egon_building_peak_loads():
    """
    Return the sorted unique building IDs which have an electricity peak
    load in the eGon2035 scenario.
    """
    sql = """
    SELECT building_id
    FROM demand.egon_building_electricity_peak_loads
    WHERE scenario = 'eGon2035'
    """

    building_ids = db.select_dataframe(sql).building_id

    return building_ids.astype(int).sort_values().unique()
506
507
508
@timer_func
def load_building_data():
    """
    Read buildings from DB.

    Tables:

    * `openstreetmap.osm_buildings_filtered` (from OSM)
    * `openstreetmap.osm_buildings_synthetic` (synthetic, created by us)

    Use column `id` for both as it is unique hence you concat both datasets.
    If INCLUDE_SYNTHETIC_BUILDINGS is False synthetic buildings will not be
    loaded.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with buildings without an
        AGS ID dropped.
    """

    municipalities_gdf = municipality_data()

    osm_buildings_gdf = osm_buildings(municipalities_gdf.crs)

    if INCLUDE_SYNTHETIC_BUILDINGS:
        synthetic_buildings_gdf = synthetic_buildings(municipalities_gdf.crs)

        # Concatenate OSM and synthetic buildings; `id` is unique across
        # both tables (see docstring).
        buildings_gdf = gpd.GeoDataFrame(
            pd.concat(
                [
                    osm_buildings_gdf,
                    synthetic_buildings_gdf,
                ]
            ),
            geometry="geom",
            crs=osm_buildings_gdf.crs,
        ).rename(columns={"area": "building_area"})

        buildings_gdf.index = buildings_gdf.index.astype(int)

    else:
        buildings_gdf = osm_buildings_gdf.rename(
            columns={"area": "building_area"}
        )

    if ONLY_BUILDINGS_WITH_DEMAND:
        # Restrict to buildings which have an electricity peak load.
        building_ids = egon_building_peak_loads()

        init_len = len(building_ids)

        building_ids = np.intersect1d(
            list(map(int, building_ids)),
            list(map(int, buildings_gdf.index.to_numpy())),
        )

        end_len = len(building_ids)

        logger.debug(
            f"{end_len/init_len * 100: g} % ({end_len} / {init_len}) "
            f"of buildings have peak load."
        )

        buildings_gdf = buildings_gdf.loc[building_ids]

    # Attach the AGS of the containing municipality and drop buildings
    # outside all municipalities.
    buildings_ags_gdf = add_ags_to_buildings(buildings_gdf, municipalities_gdf)

    buildings_ags_gdf = drop_buildings_outside_muns(buildings_ags_gdf)

    grid_districts_gdf = grid_districts(EPSG)

    federal_state_gdf = federal_state_data(grid_districts_gdf.crs)

    # Overlay grid districts with federal states so districts spanning
    # several states can be handled per state.
    grid_federal_state_gdf = overlay_grid_districts_with_counties(
        grid_districts_gdf,
        federal_state_gdf,
    )

    buildings_overlay_gdf = add_overlay_id_to_buildings(
        buildings_ags_gdf,
        grid_federal_state_gdf,
    )

    logger.debug("Loaded buildings.")

    buildings_overlay_gdf = drop_buildings_outside_grids(buildings_overlay_gdf)

    # overwrite bus_id with data from new table
    sql = (
        "SELECT building_id, bus_id FROM "
        "boundaries.egon_map_zensus_mvgd_buildings"
    )
    map_building_bus_df = db.select_dataframe(sql)

    # Keep only buildings that appear in the mapping table.
    building_ids = np.intersect1d(
        list(map(int, map_building_bus_df.building_id.unique())),
        list(map(int, buildings_overlay_gdf.index.to_numpy())),
    )

    buildings_within_gdf = buildings_overlay_gdf.loc[building_ids]

    # Replace the bus_id column with the one from the mapping table.
    gdf = (
        buildings_within_gdf.reset_index()
        .drop(columns=["bus_id"])
        .merge(
            how="left",
            right=map_building_bus_df,
            left_on="id",
            right_on="building_id",
        )
        .drop(columns=["building_id"])
        .set_index("id")
        .sort_index()
    )

    # The merge may duplicate index entries; keep the first occurrence.
    return gdf[~gdf.index.duplicated(keep="first")]
623
624
625
@timer_func
626
def sort_and_qcut_df(
    df: pd.DataFrame | gpd.GeoDataFrame,
    col: str,
    q: int,
) -> pd.DataFrame | gpd.GeoDataFrame:
    """
    Determine the quantile of a given attribute in a (Geo)DataFrame.

    The frame is sorted in ascending order by *col* and a ``quant`` column
    holding the quantile label (0 .. q-1) is added.

    Parameters
    -----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        (Geo)DataFrame to sort and qcut.
    col : str
        Name of the attribute to sort and qcut the (Geo)DataFrame on.
    q : int
        Number of quantiles.
    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        Sorted and qcut (Geo)DataFrame.
    """
    ordered = df.sort_values(col, ascending=True)

    quantile_labels = pd.qcut(
        ordered[col],
        q=q,
        labels=range(q),
    )

    return ordered.assign(quant=quantile_labels)
656
657
658
@timer_func
def allocate_pv(
    q_mastr_gdf: gpd.GeoDataFrame,
    q_buildings_gdf: gpd.GeoDataFrame,
    seed: int,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """
    Allocate the MaStR pv generators to the OSM buildings.

    This will determine a building for each pv generator if there are more
    buildings than generators within a given AGS. Primarily generators are
    distributed within the same quantile as the buildings. Multiple
    assignment is excluded.

    Parameters
    -----------
    q_mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded and qcut MaStR data.
    q_buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing qcut OSM buildings data.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    Returns
    -------
    tuple with two geopandas.GeoDataFrame s
        GeoDataFrame containing MaStR data allocated to building IDs.
        GeoDataFrame containing building data allocated to MaStR IDs.
    """
    rng = default_rng(seed=seed)

    # Initialise the allocation columns with NaN and sort both frames by
    # AGS and quantile.
    q_buildings_gdf = q_buildings_gdf.assign(gens_id=np.nan).sort_values(
        by=["ags", "quant"]
    )
    q_mastr_gdf = q_mastr_gdf.assign(building_id=np.nan).sort_values(
        by=["ags", "quant"]
    )

    ags_list = q_buildings_gdf.ags.unique()

    # In test mode only process the first 250 municipalities.
    if TEST_RUN:
        ags_list = ags_list[:250]

    num_ags = len(ags_list)

    t0 = perf_counter()

    # Allocate generators to buildings municipality by municipality.
    for count, ags in enumerate(ags_list):

        buildings = q_buildings_gdf.loc[q_buildings_gdf.ags == ags]
        gens = q_mastr_gdf.loc[q_mastr_gdf.ags == ags]

        len_build = len(buildings)
        len_gens = len(gens)

        # More generators than buildings in this AGS: randomly truncate the
        # generators to match the number of buildings.
        if len_build < len_gens:
            gens = gens.sample(len_build, random_state=RandomState(seed=seed))
            logger.error(
                f"There are {len_gens} generators and only {len_build}"
                f" buildings in AGS {ags}. {len_gens - len(gens)} "
                "generators were truncated to match the amount of buildings."
            )

            assert len_build == len(gens)

        # Match generators and buildings within the same quantile first.
        for quant in gens.quant.unique():
            q_buildings = buildings.loc[buildings.quant == quant]
            q_gens = gens.loc[gens.quant == quant]

            len_build = len(q_buildings)
            len_gens = len(q_gens)

            if len_build < len_gens:
                # Not enough buildings in this quantile: top up with random,
                # not-yet-chosen buildings from the other quantiles.
                delta = len_gens - len_build

                logger.warning(
                    f"There are {len_gens} generators and only {len_build} "
                    f"buildings in AGS {ags} and quantile {quant}. {delta} "
                    f"buildings from AGS {ags} will be added randomly."
                )

                add_buildings = pd.Index(
                    rng.choice(
                        list(set(buildings.index) - set(q_buildings.index)),
                        size=delta,
                        replace=False,
                    )
                )

                chosen_buildings = q_buildings.index.append(add_buildings)

            else:
                # Enough buildings: pick len_gens of them at random.
                chosen_buildings = rng.choice(
                    q_buildings.index,
                    size=len_gens,
                    replace=False,
                )

            # Record the generator IDs on the chosen buildings and remove
            # those buildings from the pool to exclude double assignment.
            q_buildings_gdf.loc[chosen_buildings, "gens_id"] = q_gens.index
            buildings = buildings.drop(chosen_buildings)

        # Progress logging every 500 municipalities.
        if count % 500 == 0:
            logger.debug(
                f"Allocation of {count / num_ags * 100:g} % of AGS done. "
                f"It took {perf_counter() - t0:g} seconds."
            )

            t0 = perf_counter()

    assigned_buildings = q_buildings_gdf.loc[~q_buildings_gdf.gens_id.isna()]

    # Every assigned building must reference a distinct generator.
    assert len(assigned_buildings) == len(assigned_buildings.gens_id.unique())

    # Mirror the allocation onto the generator frame.
    q_mastr_gdf.loc[
        assigned_buildings.gens_id, "building_id"
    ] = assigned_buildings.index

    assigned_gens = q_mastr_gdf.loc[~q_mastr_gdf.building_id.isna()]

    assert len(assigned_buildings) == len(assigned_gens)

    logger.debug("Allocated status quo generators to buildings.")

    return frame_to_numeric(q_mastr_gdf), frame_to_numeric(q_buildings_gdf)
779
780
781
def frame_to_numeric(
    df: pd.DataFrame | gpd.GeoDataFrame,
) -> pd.DataFrame | gpd.GeoDataFrame:
    """
    Try to convert all columns of a DataFrame to numeric, ignoring errors.

    Object-dtyped columns (and an object-dtyped index) which cannot be fully
    converted are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
    """

    # pd.to_numeric(errors="ignore") is deprecated since pandas 2.2;
    # replicate its behaviour explicitly: keep the original values whenever
    # the conversion fails.
    def _to_numeric_or_keep(values):
        try:
            return pd.to_numeric(values)
        except (ValueError, TypeError):
            return values

    if str(df.index.dtype) == "object":
        df.index = _to_numeric_or_keep(df.index)

    for col in df.columns:
        if str(df[col].dtype) == "object":
            df[col] = _to_numeric_or_keep(df[col])

    return df
801
802
803
def validate_output(
    desagg_mastr_gdf: pd.DataFrame | gpd.GeoDataFrame,
    desagg_buildings_gdf: pd.DataFrame | gpd.GeoDataFrame,
) -> None:
    """
    Validate output.

    * Validate that there are exactly as many buildings with a pv system as
      there are pv systems with a building.
    * Validate that the building IDs with a pv system are the same building
      IDs as assigned to the pv systems.
    * Validate that the pv system IDs with a building are the same pv system
      IDs as assigned to the buildings.

    Parameters
    -----------
    desagg_mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data allocated to building IDs.
    desagg_buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing building data allocated to MaStR IDs.
    """
    allocated_gens = desagg_mastr_gdf.loc[
        ~desagg_mastr_gdf.building_id.isna()
    ]
    allocated_buildings = desagg_buildings_gdf.loc[
        ~desagg_buildings_gdf.gens_id.isna()
    ]

    # Same number of allocated generators and buildings.
    assert len(allocated_gens) == len(allocated_buildings)

    # The sets of building IDs agree on both sides.
    assert (
        np.sort(allocated_gens.building_id.unique())
        == np.sort(allocated_buildings.index.unique())
    ).all()

    # The sets of generator IDs agree on both sides.
    assert (
        np.sort(allocated_gens.index.unique())
        == np.sort(allocated_buildings.gens_id.unique())
    ).all()

    logger.debug("Validated output.")
853
854
855
def drop_unallocated_gens(
    gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop generators which did not get allocated.

    Parameters
    -----------
    gdf : geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data allocated to building IDs.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data with generators dropped which did
        not get allocated.
    """
    init_len = len(gdf)
    gdf = gdf.loc[~gdf.building_id.isna()]
    end_len = len(gdf)

    # Guard the log statement: with an empty input frame the percentage
    # calculation below would raise a ZeroDivisionError.
    if init_len > 0:
        logger.debug(
            f"Dropped {init_len - end_len} "
            f"({((init_len - end_len) / init_len) * 100:g}%)"
            f" of {init_len} unallocated rows from MaStR DataFrame."
        )

    return gdf
882
883
884
@timer_func
def allocate_to_buildings(
    mastr_gdf: gpd.GeoDataFrame,
    buildings_gdf: gpd.GeoDataFrame,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """
    Allocate status quo pv rooftop generators to buildings.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing MaStR data with geocoded locations.
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with buildings without an
        AGS ID dropped.
    Returns
    -------
    tuple with two geopandas.GeoDataFrame s
        GeoDataFrame containing MaStR data allocated to building IDs.
        GeoDataFrame containing building data allocated to MaStR IDs.
    """
    logger.debug("Starting allocation of status quo.")

    # Classify generators by capacity and buildings by area into Q
    # quantiles each.
    quantiled_mastr_gdf = sort_and_qcut_df(mastr_gdf, col="capacity", q=Q)
    quantiled_buildings_gdf = sort_and_qcut_df(
        buildings_gdf, col="building_area", q=Q
    )

    desagg_mastr_gdf, desagg_buildings_gdf = allocate_pv(
        quantiled_mastr_gdf, quantiled_buildings_gdf, SEED
    )

    validate_output(desagg_mastr_gdf, desagg_buildings_gdf)

    return drop_unallocated_gens(desagg_mastr_gdf), desagg_buildings_gdf
916
917
918
@timer_func
def grid_districts(
    epsg: int,
) -> gpd.GeoDataFrame:
    """
    Load mv grid district geo data from eGo^n Database as
    geopandas.GeoDataFrame.

    Parameters
    -----------
    epsg : int
        EPSG ID to use as CRS.
    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing mv grid district ID (bus_id, as index) and
        geo shapes data.
    """
    gdf = db.select_geodataframe(
        """
        SELECT bus_id, geom
        FROM grid.egon_mv_grid_district
        ORDER BY bus_id
        """,
        index_col="bus_id",
        geom_col="geom",
        epsg=epsg,
    )

    # Ensure the bus_id index is of integer dtype.
    gdf.index = gdf.index.astype(int)

    logger.debug("Grid districts loaded.")

    return gdf
950
951
952
def scenario_data(
    carrier: str = "solar_rooftop",
    scenario: str = "eGon2035",
) -> pd.DataFrame:
    """
    Get scenario capacity data from the eGo^n database.

    Parameters
    -----------
    carrier : str
        Carrier type to filter the table by.
    scenario : str
        Scenario to filter the table by.
    Returns
    -------
    pandas.DataFrame
        DataFrame with scenario capacity data in GW, sorted by index.
    """
    with db.session_scope() as session:
        query = session.query(EgonScenarioCapacities).filter(
            EgonScenarioCapacities.carrier == carrier,
            EgonScenarioCapacities.scenario_name == scenario,
        )

    capacities_df = pd.read_sql(
        query.statement, query.session.bind, index_col="index"
    )

    logger.debug("Scenario capacity data loaded.")

    return capacities_df.sort_index()
982
983
984 View Code Duplication
class Vg250Lan(Base):
    """
    ORM model for the federal state table ``boundaries.vg250_lan``.

    The columns mirror the VG250 attribute schema; semantics of the short
    attribute codes (ade, gf, ars, ags, ...) follow the official VG250
    data documentation.
    """

    __tablename__ = "vg250_lan"
    __table_args__ = {"schema": "boundaries"}

    # Primary key of the table.
    id = Column(BigInteger, primary_key=True, index=True)
    ade = Column(BigInteger)
    gf = Column(BigInteger)
    bsg = Column(BigInteger)
    ars = Column(String)
    ags = Column(String)
    sdv_ars = Column(String)
    gen = Column(String)
    bez = Column(String)
    ibz = Column(BigInteger)
    bem = Column(String)
    nbd = Column(String)
    sn_l = Column(String)
    sn_r = Column(String)
    sn_k = Column(String)
    sn_v1 = Column(String)
    sn_v2 = Column(String)
    sn_g = Column(String)
    fk_s3 = Column(String)
    # NUTS code of the federal state (used by federal_state_data below).
    nuts = Column(String)
    ars_0 = Column(String)
    ags_0 = Column(String)
    wsk = Column(String)
    debkg_id = Column(String)
    rs = Column(String)
    sdv_rs = Column(String)
    rs_0 = Column(String)
    # State geometry in the module-wide EPSG; spatially indexed.
    geometry = Column(Geometry(srid=EPSG), index=True)
1016
1017
1018
def federal_state_data(to_crs: CRS) -> gpd.GeoDataFrame:
    """
    Get federal state data from eGo^n Database.

    Parameters
    -----------
    to_crs : pyproj.crs.crs.CRS
        CRS to transform geometries to.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with federal state data.
    """
    # Only ID, NUTS code and geometry are needed; the geometry column is
    # labelled "geom" to match the other tables' conventions.
    with db.session_scope() as session:
        query = session.query(
            Vg250Lan.id, Vg250Lan.nuts, Vg250Lan.geometry.label("geom")
        )

        states_gdf = gpd.read_postgis(
            query.statement, session.connection(), index_col="id"
        ).to_crs(to_crs)

    logger.debug("Federal State data loaded.")

    return states_gdf
1042
1043
1044
@timer_func
def overlay_grid_districts_with_counties(
    mv_grid_district_gdf: gpd.GeoDataFrame,
    federal_state_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Calculate the intersections of mv grid districts and counties.

    Parameters
    -----------
    mv_grid_district_gdf : gpd.GeoDataFrame
        GeoDataFrame containing mv grid district ID and geo shapes data.
    federal_state_gdf : gpd.GeoDataFrame
        GeoDataFrame with federal state data.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame with one row per intersection of a federal state and
        a mv grid district, carrying the attributes of both inputs.
    """
    logger.debug(
        "Calculating intersection overlay between mv grid districts and "
        "counties. This may take a while..."
    )

    # Reproject the states into the grid district CRS so the overlay runs
    # in one coordinate system; keep_geom_type drops lower-dimensional
    # intersection artifacts (points/lines).
    gdf = gpd.overlay(
        federal_state_gdf.to_crs(mv_grid_district_gdf.crs),
        mv_grid_district_gdf.reset_index(),
        how="intersection",
        keep_geom_type=True,
    )

    logger.debug("Done!")

    return gdf
1077
1078
1079
@timer_func
def add_overlay_id_to_buildings(
    buildings_gdf: gpd.GeoDataFrame,
    grid_federal_state_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Add information about overlay ID to buildings.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    grid_federal_state_gdf : geopandas.GeoDataFrame
        GeoDataFrame with intersection shapes between counties and grid
        districts.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with overlay ID added.
    """
    # Spatially join every building to the intersecting overlay shape.
    joined_gdf = buildings_gdf.to_crs(grid_federal_state_gdf.crs).sjoin(
        grid_federal_state_gdf,
        how="left",
        predicate="intersects",
    )

    # The sjoin stores the matched overlay index in "index_right".
    result_gdf = joined_gdf.rename(columns={"index_right": "overlay_id"})

    logger.debug("Added overlay ID to OSM buildings.")

    return result_gdf
1111
1112
1113
def drop_buildings_outside_grids(
    buildings_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop all buildings outside of grid areas.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with buildings without
        a bus ID dropped.
    """
    # A missing bus ID means the building did not fall into any grid area.
    kept_gdf = buildings_gdf.loc[~buildings_gdf.bus_id.isna()]

    n_total = len(buildings_gdf)
    n_dropped = n_total - len(kept_gdf)

    logger.debug(
        f"{n_dropped} "
        f"({n_dropped / n_total * 100:g}%) "
        f"of {n_total} values are outside of the grid areas "
        "and are therefore dropped."
    )

    return kept_gdf
1138
1139
1140
def cap_per_bus_id(
    scenario: str,
) -> pd.DataFrame:
    """
    Get table with total pv rooftop capacity per grid district.

    Parameters
    -----------
    scenario : str
        Scenario name.

    Returns
    -------
    pandas.DataFrame
        DataFrame with total rooftop capacity per mv grid.
    """
    targets = config.datasets()["solar_rooftop"]["targets"]

    # NOTE: table/schema names are interpolated from trusted project
    # config, the scenario name from a trusted caller.
    sql = f"""
    SELECT bus as bus_id, control, p_nom as capacity
    FROM {targets['generators']['schema']}.{targets['generators']['table']}
    WHERE carrier = 'solar_rooftop'
    AND scn_name = '{scenario}'
    """
    # TODO: where do the Slack rows come from???

    cap_df = db.select_dataframe(sql, index_col="bus_id")

    # Slack buses carry no real generator capacity and are discarded.
    return cap_df.loc[cap_df.control != "Slack"]
1168
1169
1170
def determine_end_of_life_gens(
    mastr_gdf: gpd.GeoDataFrame,
    scenario_timestamp: pd.Timestamp,
    pv_rooftop_lifetime: pd.Timedelta,
) -> gpd.GeoDataFrame:
    """
    Determine if an old PV system has reached its end of life.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    scenario_timestamp : pandas.Timestamp
        Timestamp at which the scenario takes place.
    pv_rooftop_lifetime : pandas.Timedelta
        Average expected lifetime of PV rooftop systems.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data and info if the system
        has reached its end of life.
    """
    total_cap = mastr_gdf.capacity.sum()

    # Age of each system at the scenario date; a system is end-of-life
    # once that age exceeds the expected lifetime.
    age = scenario_timestamp - mastr_gdf.commissioning_date

    mastr_gdf = mastr_gdf.assign(
        age=age,
        end_of_life=pv_rooftop_lifetime < age,
    )

    active_cap = mastr_gdf.loc[~mastr_gdf.end_of_life].capacity.sum()

    logger.debug(
        f"Determined if pv rooftop systems reached their end of life.\nTotal "
        f"capacity: {total_cap}\nActive capacity: {active_cap}"
    )

    return mastr_gdf
1209
1210
1211
def calculate_max_pv_cap_per_building(
    buildings_gdf: gpd.GeoDataFrame,
    mastr_gdf: gpd.GeoDataFrame,
    pv_cap_per_sq_m: float | int,
    roof_factor: float | int,
) -> gpd.GeoDataFrame:
    """
    Calculate the estimated maximum possible PV capacity per building.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    pv_cap_per_sq_m : float, int
        Average expected, installable PV capacity per square meter.
    roof_factor : float, int
        Average for PV usable roof area share.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with estimated maximum
        PV capacity.
    """
    mastr_cols = [
        "capacity",
        "end_of_life",
        "building_id",
        "orientation_uniform",
        "orientation_primary",
        "orientation_primary_angle",
    ]

    # Attach the existing MaStR plants to their buildings via building ID.
    merged_gdf = (
        buildings_gdf.reset_index()
        .rename(columns={"index": "id"})
        .merge(
            mastr_gdf[mastr_cols],
            how="left",
            left_on="id",
            right_on="building_id",
        )
        .set_index("id")
        .drop(columns="building_id")
    )

    # Roof potential: usable share of the footprint times the average
    # installable capacity per square meter.
    specific_cap = roof_factor * pv_cap_per_sq_m

    return merged_gdf.assign(
        max_cap=merged_gdf.building_area.multiply(specific_cap),
        # Buildings without a matching MaStR entry carry no active plant.
        end_of_life=merged_gdf.end_of_life.fillna(True).astype(bool),
        bus_id=merged_gdf.bus_id.astype(int),
    )
1262
1263
1264
def calculate_building_load_factor(
    mastr_gdf: gpd.GeoDataFrame,
    buildings_gdf: gpd.GeoDataFrame,
    rounding: int = 4,
) -> gpd.GeoDataFrame:
    """
    Calculate the roof load factor from existing PV systems.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    rounding : int
        Rounding to use for load factor.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data with calculated load
        factor.
    """
    # Buildings with a known roof potential; their index is materialised
    # as the "id" column for the merge below.
    capped_buildings_df = (
        buildings_gdf[["max_cap", "building_area"]]
        .loc[~buildings_gdf["max_cap"].isna()]
        .reset_index()
    )

    merged_gdf = mastr_gdf.merge(
        capped_buildings_df,
        how="left",
        left_on="building_id",
        right_on="id",
    ).set_index("id")

    # Load factor: installed capacity relative to the roof potential.
    load_factor = (merged_gdf.capacity / merged_gdf.max_cap).round(rounding)

    return merged_gdf.assign(load_factor=load_factor)
1295
1296
1297
def get_probability_for_property(
    mastr_gdf: gpd.GeoDataFrame,
    cap_range: tuple[int | float, int | float],
    prop: str,
) -> tuple[np.array, np.array]:
    """
    Calculate the probability of the different options of a property of the
    existing PV plants.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_range : tuple(int, int)
        Capacity range of PV plants to look at; rows with
        ``cap_range[0] < capacity <= cap_range[1]`` are considered.
    prop : str
        Property to calculate probabilities for. String needs to be in
        columns of mastr_gdf.

    Returns
    -------
    tuple
        numpy.array
            Unique values of property.
        numpy.array
            Probabilities per unique value.
    """
    cap_range_gdf = mastr_gdf.loc[
        (mastr_gdf.capacity > cap_range[0])
        & (mastr_gdf.capacity <= cap_range[1])
    ]

    if prop == "load_factor":
        # Load factors above 1 are implausible and excluded from the
        # empirical distribution.
        cap_range_gdf = cap_range_gdf.loc[cap_range_gdf[prop] <= 1]

    # Count occurrences of each valid value. `isnull` is an alias of
    # `isna`, so a single NaN check suffices; the literal string "None"
    # occurs in raw MaStR data and is treated as missing too.
    count = Counter(
        cap_range_gdf[prop].loc[
            ~cap_range_gdf[prop].isna()
            & ~(cap_range_gdf[prop] == "None")
        ]
    )

    values = np.array(list(count.keys()))
    probabilities = np.fromiter(count.values(), dtype=float)
    # Normalize counts to probabilities summing to 1.
    probabilities = probabilities / np.sum(probabilities)

    return values, probabilities
1343
1344
1345
@timer_func
def probabilities(
    mastr_gdf: gpd.GeoDataFrame,
    cap_ranges: list[tuple[int | float, int | float]] | None = None,
    properties: list[str] | None = None,
) -> dict:
    """
    Calculate the probability of the different options of properties of the
    existing PV plants.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_ranges : list(tuple(int, int))
        List of capacity ranges to distinguish between. The first tuple
        should start with a zero and the last one should end with infinite.
    properties : list(str)
        List of properties to calculate probabilities for. Strings need to
        be in columns of mastr_gdf.

    Returns
    -------
    dict
        Dictionary with values and probabilities per capacity range.
    """
    if cap_ranges is None:
        cap_ranges = [(0, 30), (30, 100), (100, float("inf"))]
    if properties is None:
        properties = [
            "orientation_uniform",
            "orientation_primary",
            "orientation_primary_angle",
            "load_factor",
        ]

    prob_dict = {}

    for cap_range in cap_ranges:
        # Collect per-property value/probability arrays for this range.
        values_dict = {}
        probabilities_dict = {}

        for prop in properties:
            prop_values, prop_probs = get_probability_for_property(
                mastr_gdf,
                cap_range,
                prop,
            )

            values_dict[prop] = prop_values
            probabilities_dict[prop] = prop_probs

        prob_dict[cap_range] = {
            "values": values_dict,
            "probabilities": probabilities_dict,
        }

    return prob_dict
1402
1403
1404
def cap_share_per_cap_range(
    mastr_gdf: gpd.GeoDataFrame,
    cap_ranges: list[tuple[int | float, int | float]] | None = None,
) -> dict[tuple[int | float, int | float], float]:
    """
    Calculate the share of PV capacity from the total PV capacity within
    capacity ranges.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_ranges : list(tuple(int, int))
        List of capacity ranges to distinguish between. The first tuple
        should start with a zero and the last one should end with infinite.

    Returns
    -------
    dict
        Dictionary with share of PV capacity from the total PV capacity
        within capacity ranges.
    """
    if cap_ranges is None:
        cap_ranges = [(0, 30), (30, 100), (100, float("inf"))]

    total_cap = mastr_gdf.capacity.sum()

    # Share per range: capacity within (low, high] over total capacity.
    return {
        (low, high): mastr_gdf.loc[
            (mastr_gdf.capacity > low) & (mastr_gdf.capacity <= high)
        ].capacity.sum()
        / total_cap
        for low, high in cap_ranges
    }
1447
1448
1449
def mean_load_factor_per_cap_range(
    mastr_gdf: gpd.GeoDataFrame,
    cap_ranges: list[tuple[int | float, int | float]] | None = None,
) -> dict[tuple[int | float, int | float], float]:
    """
    Calculate the mean roof load factor per capacity range from existing PV
    plants.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_ranges : list(tuple(int, int))
        List of capacity ranges to distinguish between. The first tuple
        should start with a zero and the last one should end with infinite.

    Returns
    -------
    dict
        Dictionary with mean roof load factor per capacity range.
    """
    if cap_ranges is None:
        cap_ranges = [(0, 30), (30, 100), (100, float("inf"))]

    # Only plausible load factors (<= 1) enter the mean; rows with a
    # missing load factor drop out of this filter as well.
    plausible_gdf = mastr_gdf.loc[mastr_gdf.load_factor <= 1]

    return {
        (low, high): plausible_gdf.loc[
            (plausible_gdf.capacity > low)
            & (plausible_gdf.capacity <= high)
        ].load_factor.mean()
        for low, high in cap_ranges
    }
1487
1488
1489
def building_area_range_per_cap_range(
    mastr_gdf: gpd.GeoDataFrame,
    cap_ranges: list[tuple[int | float, int | float]] | None = None,
    min_building_size: int | float = 10.0,
    upper_quantile: float = 0.95,
    lower_quantile: float = 0.05,
) -> dict[tuple[int | float, int | float], tuple[int | float, int | float]]:
    """
    Estimate normal building area range per capacity range.

    Raw bounds are taken from quantiles of the building areas of existing
    PV plants per capacity range and then smoothed so that neighbouring
    ranges meet halfway.

    Parameters
    -----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    cap_ranges : list(tuple(int, int))
        List of capacity ranges to distinguish between. The first tuple
        should start with a zero and the last one should end with infinite.
    min_building_size : int, float
        Minimal building size to consider for PV plants.
    upper_quantile : float
        Upper quantile to estimate maximum building size per capacity range.
    lower_quantile : float
        Lower quantile to estimate minimum building size per capacity range.

    Returns
    -------
    dict
        Dictionary with estimated normal building area range per capacity
        range.
    """
    if cap_ranges is None:
        cap_ranges = [(0, 30), (30, 100), (100, float("inf"))]

    last = len(cap_ranges) - 1

    # Raw building area bounds per capacity range: the first range is
    # clipped at the minimal building size, the last one is open-ended.
    raw_bounds = []

    for pos, (cap_min, cap_max) in enumerate(cap_ranges):
        areas = mastr_gdf.loc[
            (mastr_gdf.capacity > cap_min)
            & (mastr_gdf.capacity <= cap_max)
        ].building_area

        if pos == 0:
            raw_bounds.append(
                (min_building_size, areas.quantile(upper_quantile))
            )
        elif pos == last:
            raw_bounds.append(
                (areas.quantile(lower_quantile), float("inf"))
            )
        else:
            raw_bounds.append(
                (
                    areas.quantile(lower_quantile),
                    areas.quantile(upper_quantile),
                )
            )

    # Smooth the bounds: adjacent ranges meet halfway between the upper
    # bound of one range and the lower bound of the next.
    smoothed = {}

    for pos, (cap_range, (lower, upper)) in enumerate(
        zip(cap_ranges, raw_bounds)
    ):
        if pos == 0:
            smoothed[cap_range] = (
                lower,
                np.mean((raw_bounds[pos + 1][0], upper)),
            )
        elif pos == last:
            smoothed[cap_range] = (
                np.mean((raw_bounds[pos - 1][1], lower)),
                upper,
            )
        else:
            smoothed[cap_range] = (
                np.mean((raw_bounds[pos - 1][1], lower)),
                np.mean((raw_bounds[pos + 1][0], upper)),
            )

    return smoothed
1576
1577
1578
@timer_func
def desaggregate_pv_in_mv_grid(
    buildings_gdf: gpd.GeoDataFrame,
    pv_cap: float | int,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Desaggregate PV capacity on buildings within a given grid district.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing buildings within the grid district.
    pv_cap : float, int
        PV capacity to desaggregate.

    Other Parameters
    -----------
    prob_dict : dict
        Dictionary with values and probabilities per capacity range.
    cap_share_dict : dict
        Dictionary with share of PV capacity from the total PV capacity
        within capacity ranges.
    building_area_range_dict : dict
        Dictionary with estimated normal building area range per capacity
        range.
    load_factor_dict : dict
        Dictionary with mean roof load factor per capacity range.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    pv_cap_per_sq_m : float, int
        Average expected, installable PV capacity per square meter.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with desaggregated PV
        plants.
    """
    bus_id = int(buildings_gdf.bus_id.iat[0])

    rng = default_rng(seed=kwargs["seed"])
    random_state = RandomState(seed=kwargs["seed"])

    results_df = pd.DataFrame(columns=buildings_gdf.columns)

    for cap_range, share in kwargs["cap_share_dict"].items():
        # Capacity to be allocated within this capacity range.
        pv_cap_range = pv_cap * share

        b_area_min, b_area_max = kwargs["building_area_range_dict"][cap_range]

        # Candidate buildings: not yet used and with a plausible roof area
        # for this capacity range.
        cap_range_buildings_gdf = buildings_gdf.loc[
            ~buildings_gdf.index.isin(results_df.index)
            & (buildings_gdf.building_area > b_area_min)
            & (buildings_gdf.building_area <= b_area_max)
        ]

        mean_load_factor = kwargs["load_factor_dict"][cap_range]
        cap_range_buildings_gdf = cap_range_buildings_gdf.assign(
            mean_cap=cap_range_buildings_gdf.max_cap * mean_load_factor,
            load_factor=np.nan,
            capacity=np.nan,
        )

        total_mean_cap = cap_range_buildings_gdf.mean_cap.sum()

        if total_mean_cap == 0:
            logger.warning(
                f"There are no matching roof for capacity range {cap_range} "
                f"kW in grid {bus_id}. Using all buildings as fallback."
            )

            # Fallback 1: ignore the area constraint, use all unused
            # buildings of the grid.
            cap_range_buildings_gdf = buildings_gdf.loc[
                ~buildings_gdf.index.isin(results_df.index)
            ]

            if len(cap_range_buildings_gdf) == 0:
                logger.warning(
                    "There are no roofs available for capacity range "
                    f"{cap_range} kW in grid {bus_id}. Allowing dual use."
                )
                # Fallback 2: allow buildings to be used more than once.
                cap_range_buildings_gdf = buildings_gdf.copy()

            cap_range_buildings_gdf = cap_range_buildings_gdf.assign(
                mean_cap=cap_range_buildings_gdf.max_cap * mean_load_factor,
                load_factor=np.nan,
                capacity=np.nan,
            )

            total_mean_cap = cap_range_buildings_gdf.mean_cap.sum()

        elif total_mean_cap < pv_cap_range:
            logger.warning(
                f"Average roof utilization of the roof area in grid {bus_id} "
                f"and capacity range {cap_range} kW is not sufficient. The "
                "roof utilization will be above average."
            )

        # Sample enough buildings to cover the target capacity at the mean
        # load factor; at least one building is always drawn.
        frac = max(
            pv_cap_range / total_mean_cap,
            1 / len(cap_range_buildings_gdf),
        )

        samples_gdf = cap_range_buildings_gdf.sample(
            frac=min(1, frac),
            random_state=random_state,
        )

        cap_range_dict = kwargs["prob_dict"][cap_range]

        values_dict = cap_range_dict["values"]
        p_dict = cap_range_dict["probabilities"]

        # Draw a load factor per sampled building from the empirical
        # distribution of existing plants.
        load_factors = rng.choice(
            a=values_dict["load_factor"],
            size=len(samples_gdf),
            p=p_dict["load_factor"],
        )

        samples_gdf = samples_gdf.assign(
            load_factor=load_factors,
            # Clip at 0.4 kW, the minimal plausible generator size.
            capacity=(
                samples_gdf.building_area
                * load_factors
                * kwargs["pv_cap_per_sq_m"]
            ).clip(lower=0.4),
        )

        # Rescale so the sampled capacities hit the range target exactly.
        missing_factor = pv_cap_range / samples_gdf.capacity.sum()

        samples_gdf = samples_gdf.assign(
            capacity=(samples_gdf.capacity * missing_factor),
            load_factor=(samples_gdf.load_factor * missing_factor),
        )

        assert np.isclose(
            samples_gdf.capacity.sum(),
            pv_cap_range,
            rtol=1e-03,
        ), f"{samples_gdf.capacity.sum()} != {pv_cap_range}"

        results_df = pd.concat(
            [
                results_df,
                samples_gdf,
            ],
        )

    # Final rescaling to compensate accumulated rounding drift.
    total_missing_factor = pv_cap / results_df.capacity.sum()

    results_df = results_df.assign(
        capacity=(results_df.capacity * total_missing_factor),
    )

    assert np.isclose(
        results_df.capacity.sum(),
        pv_cap,
        rtol=1e-03,
    ), f"{results_df.capacity.sum()} != {pv_cap}"

    # Use the CRS of the input buildings instead of the loop-local
    # samples_gdf, which would be undefined if cap_share_dict were empty.
    return gpd.GeoDataFrame(
        results_df,
        crs=buildings_gdf.crs,
        geometry="geom",
    )
1740
1741
1742
@timer_func
def desaggregate_pv(
    buildings_gdf: gpd.GeoDataFrame,
    cap_df: pd.DataFrame,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Desaggregate PV capacity on buildings within a given grid district.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    cap_df : pandas.DataFrame
        DataFrame with total rooftop capacity per mv grid.

    Other Parameters
    -----------
    prob_dict : dict
        Dictionary with values and probabilities per capacity range.
    cap_share_dict : dict
        Dictionary with share of PV capacity from the total PV capacity
        within capacity ranges.
    building_area_range_dict : dict
        Dictionary with estimated normal building area range per capacity
        range.
    load_factor_dict : dict
        Dictionary with mean roof load factor per capacity range.
    seed : int
        Seed to use for random operations with NumPy and pandas.
    pv_cap_per_sq_m : float, int
        Average expected, installable PV capacity per square meter.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with desaggregated PV
        plants.
    """
    # Start from the buildings that already carry an active (not
    # end-of-life) PV system.
    allocated_buildings_gdf = buildings_gdf.loc[~buildings_gdf.end_of_life]

    building_bus_ids = set(buildings_gdf.bus_id)
    cap_bus_ids = set(cap_df.index)

    logger.debug(
        f"Bus IDs from buildings: {len(building_bus_ids)}\nBus IDs from "
        f"capacity: {len(cap_bus_ids)}"
    )

    if len(building_bus_ids) > len(cap_bus_ids):
        missing = building_bus_ids - cap_bus_ids
    else:
        missing = cap_bus_ids - building_bus_ids

    logger.debug(str(missing))

    # Only grids present in both data sets can be processed.
    bus_ids = np.intersect1d(list(building_bus_ids), list(cap_bus_ids))

    for bus_id in bus_ids:
        buildings_grid_gdf = buildings_gdf.loc[buildings_gdf.bus_id == bus_id]

        pv_installed_gdf = buildings_grid_gdf.loc[
            ~buildings_grid_gdf.end_of_life
        ]

        pv_installed = pv_installed_gdf.capacity.sum()

        # Buildings without an active PV system are candidates for new
        # generators.
        pot_buildings_gdf = buildings_grid_gdf.drop(
            index=pv_installed_gdf.index
        )

        if len(pot_buildings_gdf) == 0:
            logger.error(
                f"In grid {bus_id} there are no potential buildings to "
                f"allocate PV capacity to. The grid is skipped. This message "
                f"should only appear doing test runs with few buildings."
            )

            continue

        # Scenario capacity is given in MW, generator capacities in kW.
        pv_target = cap_df.at[bus_id, "capacity"] * 1000

        logger.debug(f"pv_target: {pv_target}")

        pv_missing = pv_target - pv_installed

        if pv_missing <= 0:
            logger.warning(
                f"In grid {bus_id} there is more PV installed "
                f"({pv_installed: g} kW) in status Quo than allocated within "
                f"the scenario ({pv_target: g} kW). "
                f"No new generators are added."
            )

            continue

        if pot_buildings_gdf.max_cap.sum() < pv_missing:
            logger.error(
                f"In grid {bus_id} there is less PV potential ("
                f"{pot_buildings_gdf.max_cap.sum():g} kW) than allocated PV "
                f"capacity ({pv_missing:g} kW). The average roof utilization "
                f"will be very high."
            )

        gdf = desaggregate_pv_in_mv_grid(
            buildings_gdf=pot_buildings_gdf,
            pv_cap=pv_missing,
            **kwargs,
        )

        logger.debug(f"New cap in grid {bus_id}: {gdf.capacity.sum()}")
        logger.debug(f"Installed cap in grid {bus_id}: {pv_installed}")
        logger.debug(
            f"Total cap in grid {bus_id}: {gdf.capacity.sum() + pv_installed}"
        )

        if not np.isclose(
            gdf.capacity.sum() + pv_installed, pv_target, rtol=1e-3
        ):
            logger.warning(
                f"The desired capacity and actual capacity in grid {bus_id} "
                f"differ.\n"
                f"Desired cap: {pv_target}\nActual cap: "
                f"{gdf.capacity.sum() + pv_installed}"
            )

        # Sanity check: concatenation must preserve the capacity total.
        pre_cap = allocated_buildings_gdf.capacity.sum()
        new_cap = gdf.capacity.sum()

        allocated_buildings_gdf = pd.concat(
            [
                allocated_buildings_gdf,
                gdf,
            ]
        )

        total_cap = allocated_buildings_gdf.capacity.sum()

        assert np.isclose(pre_cap + new_cap, total_cap)

    logger.debug("Desaggregated scenario.")
    logger.debug(f"Scenario capacity: {cap_df.capacity.sum(): g}")
    logger.debug(
        f"Generator capacity: "
        f"{allocated_buildings_gdf.capacity.sum() / 1000: g}"
    )

    # Use the CRS of the input buildings instead of the loop-local `gdf`,
    # which would be undefined if every grid were skipped above.
    return gpd.GeoDataFrame(
        allocated_buildings_gdf,
        crs=buildings_gdf.crs,
        geometry="geom",
    )
1893
1894
1895
@timer_func
def add_buildings_meta_data(
    buildings_gdf: gpd.GeoDataFrame,
    prob_dict: dict,
    seed: int,
) -> gpd.GeoDataFrame:
    """
    Randomly add additional metadata to desaggregated PV plants.

    For every capacity range in `prob_dict`, rows whose metadata column is
    missing get a value drawn randomly according to the observed
    probabilities for that range.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with desaggregated PV
        plants.
    prob_dict : dict
        Dictionary with values and probabilities per capacity range.
    seed : int
        Seed to use for random operations with NumPy and pandas.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with desaggregated PV
        plants.
    """
    rng = default_rng(seed=seed)
    buildings_gdf = buildings_gdf.reset_index().rename(
        columns={
            "index": "building_id",
        }
    )

    for (min_cap, max_cap), cap_range_prob_dict in prob_dict.items():
        # All generators within the current capacity range.
        cap_range_gdf = buildings_gdf.loc[
            (buildings_gdf.capacity >= min_cap)
            & (buildings_gdf.capacity < max_cap)
        ]

        for key, values in cap_range_prob_dict["values"].items():
            if key == "load_factor":
                # load_factor is a derived statistic, not a metadata
                # column to back-fill.
                continue

            # Rows with missing metadata. `isna` already covers `isnull`
            # (pandas documents them as aliases), so a single check
            # suffices; some columns carry the literal string "None" for
            # missing values.
            gdf = cap_range_gdf.loc[
                cap_range_gdf[key].isna() | (cap_range_gdf[key] == "None")
            ]

            # Draw replacement values weighted by the observed
            # probabilities of this capacity range.
            key_vals = rng.choice(
                a=values,
                size=len(gdf),
                p=cap_range_prob_dict["probabilities"][key],
            )

            buildings_gdf.loc[gdf.index, key] = key_vals

    return buildings_gdf
1952
def add_voltage_level(
    buildings_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Get voltage level data from mastr table and assign to units. Infer missing
    values derived from generator capacity to the power plants.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with desaggregated PV
        plants.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with voltage level per
        generator.
    """

    def voltage_levels(p: float) -> int:
        # Map generator capacity p to a voltage level (7 = lowest level,
        # 1 = highest). Thresholds are presumably in kW, matching the
        # `capacity` column — TODO confirm unit against callers.
        if p <= 100:
            return 7
        elif p <= 200:
            return 6
        elif p <= 5500:
            return 5
        elif p <= 20000:
            return 4
        elif p <= 120000:
            return 3
        return 1

    # Join mastr table
    with db.session_scope() as session:
        query = session.query(
            EgonPowerPlantsPv.gens_id,
            EgonPowerPlantsPv.voltage_level,
        )
    # NOTE(review): the read appears to happen after the session scope has
    # exited; this relies on `query.session.bind` still being usable —
    # confirm this is intended rather than moving the read into the block.
    voltage_levels_df = pd.read_sql(
        query.statement, query.session.bind, index_col=None
    )
    # Attach the known voltage level of each MaStR unit to its building.
    buildings_gdf = buildings_gdf.merge(
        voltage_levels_df,
        left_on="gens_id",
        right_on="gens_id",
        how="left",
    )

    # Infer missing values
    mask = buildings_gdf.voltage_level.isna()
    buildings_gdf.loc[mask, "voltage_level"] = buildings_gdf.loc[
        mask
    ].capacity.apply(voltage_levels)

    return buildings_gdf
2009
def add_commissioning_date(
    buildings_gdf: gpd.GeoDataFrame,
    start: pd.Timestamp,
    end: pd.Timestamp,
    seed: int,
):
    """
    Randomly assign a start-up date to the new PV generators, drawn
    uniformly from the days between `start` and `end`.

    Parameters
    ----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with desaggregated PV
        plants.
    start : pandas.Timestamp
        Minimum Timestamp to use.
    end : pandas.Timestamp
        Maximum Timestamp to use.
    seed : int
        Seed to use for random operations with NumPy and pandas.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with start-up date added.
    """
    random_state = default_rng(seed=seed)

    # One candidate per day in the allowed window; each row gets one
    # uniformly drawn day.
    candidate_days = pd.date_range(start=start, end=end, freq="1D")
    drawn_dates = random_state.choice(candidate_days, size=len(buildings_gdf))

    return buildings_gdf.assign(commissioning_date=drawn_dates)
2042
@timer_func
def allocate_scenarios(
    mastr_gdf: gpd.GeoDataFrame,
    valid_buildings_gdf: gpd.GeoDataFrame,
    last_scenario_gdf: gpd.GeoDataFrame,
    scenario: str,
):
    """
    Desaggregate and allocate scenario pv rooftop ramp-ups onto buildings.

    Parameters
    ----------
    mastr_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing geocoded MaStR data.
    valid_buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data.
    last_scenario_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings matched with pv generators from
        temporally preceding scenario.
    scenario : str
        Scenario to desaggregate and allocate.

    Returns
    -------
    tuple
        geopandas.GeoDataFrame
            GeoDataFrame containing OSM buildings matched with pv generators.
        pandas.DataFrame
            DataFrame containing pv rooftop capacity per grid id.
    """
    # Target capacity per MV grid district for this scenario.
    cap_per_bus_id_df = cap_per_bus_id(scenario)

    logger.debug(
        f"cap_per_bus_id_df total capacity: {cap_per_bus_id_df.capacity.sum()}"
    )

    # Retire units of the preceding scenario that exceed their lifetime
    # at the scenario timestamp.
    last_scenario_gdf = determine_end_of_life_gens(
        last_scenario_gdf,
        SCENARIO_TIMESTAMP[scenario],
        PV_ROOFTOP_LIFETIME,
    )

    # Remaining PV potential per building.
    buildings_gdf = calculate_max_pv_cap_per_building(
        valid_buildings_gdf,
        last_scenario_gdf,
        PV_CAP_PER_SQ_M,
        ROOF_FACTOR,
    )

    mastr_gdf = calculate_building_load_factor(mastr_gdf, buildings_gdf)

    # Statistics derived from the MaStR data, all keyed by capacity range.
    prob_dict = probabilities(mastr_gdf, cap_ranges=CAP_RANGES)

    share_dict = cap_share_per_cap_range(mastr_gdf, cap_ranges=CAP_RANGES)

    factor_dict = mean_load_factor_per_cap_range(
        mastr_gdf, cap_ranges=CAP_RANGES
    )

    area_range_dict = building_area_range_per_cap_range(
        mastr_gdf,
        cap_ranges=CAP_RANGES,
        min_building_size=MIN_BUILDING_SIZE,
        upper_quantile=UPPER_QUANTILE,
        lower_quantile=LOWER_QUANTILE,
    )

    # Distribute the scenario capacity onto individual buildings.
    allocated_buildings_gdf = desaggregate_pv(
        buildings_gdf=buildings_gdf,
        cap_df=cap_per_bus_id_df,
        prob_dict=prob_dict,
        cap_share_dict=share_dict,
        building_area_range_dict=area_range_dict,
        load_factor_dict=factor_dict,
        seed=SEED,
        pv_cap_per_sq_m=PV_CAP_PER_SQ_M,
    ).assign(scenario=scenario)

    # Fill in missing metadata and coerce columns to numeric dtypes.
    meta_buildings_gdf = frame_to_numeric(
        add_buildings_meta_data(allocated_buildings_gdf, prob_dict, SEED)
    )

    # New units go online between the last known commissioning date and
    # the scenario timestamp.
    scenario_buildings_gdf = add_commissioning_date(
        meta_buildings_gdf,
        start=last_scenario_gdf.commissioning_date.max(),
        end=SCENARIO_TIMESTAMP[scenario],
        seed=SEED,
    )

    return scenario_buildings_gdf, cap_per_bus_id_df
2149
class EgonPowerPlantPvRoofBuildingScenario(Base):
    """Mapping table PV unit <-> building per scenario."""

    __tablename__ = "egon_power_plants_pv_roof_building"
    __table_args__ = {"schema": "supply"}

    index = Column(Integer, primary_key=True, index=True)
    # Scenario the unit belongs to, e.g. "status_quo" or a ramp-up scenario.
    scenario = Column(String)
    # MV grid district bus; nullable because it is only filled for some rows.
    bus_id = Column(Integer, nullable=True)
    building_id = Column(Integer)
    # MaStR generator id; null for synthetic (desaggregated) units.
    gens_id = Column(String, nullable=True)
    # Installed capacity; exported in MW (converted from kW on write).
    capacity = Column(Float)
    # MaStR metadata: uniform orientation and tilt angle — presumably
    # matching the original MaStR column semantics; verify against source.
    einheitliche_ausrichtung_und_neigungswinkel = Column(Float)
    # MaStR metadata: main orientation.
    hauptausrichtung = Column(String)
    # MaStR metadata: tilt angle of the main orientation.
    hauptausrichtung_neigungswinkel = Column(String)
    voltage_level = Column(Integer)
    weather_cell_id = Column(Integer)
2166
def create_scenario_table(buildings_gdf):
    """Create mapping table pv_unit <-> building for scenario"""
    table = EgonPowerPlantPvRoofBuildingScenario.__table__

    # Recreate the target table from scratch before exporting.
    table.drop(bind=engine, checkfirst=True)
    table.create(bind=engine, checkfirst=True)

    export_df = buildings_gdf.assign(
        capacity=buildings_gdf.capacity.div(10**3)  # kW -> MW
    )[COLS_TO_EXPORT].reset_index()

    export_df.to_sql(
        name=table.name,
        schema=table.schema,
        con=db.engine(),
        if_exists="append",
        index=False,
    )
2186
def add_weather_cell_id(buildings_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Attach a weather cell id to every building.

    Buildings are first mapped to their census cell, and the census cell
    to its weather cell. Raises if any building ends up without one.
    """
    # Building -> census cell mapping.
    building_zensus_sql = """
    SELECT building_id, zensus_population_id
    FROM boundaries.egon_map_zensus_mvgd_buildings
    """

    zensus_map_df = db.select_dataframe(building_zensus_sql).drop_duplicates(
        subset="building_id"
    )

    buildings_gdf = buildings_gdf.merge(
        right=zensus_map_df,
        how="left",
        on="building_id",
    )

    # Census cell -> weather cell mapping.
    weather_sql = """
    SELECT zensus_population_id, w_id as weather_cell_id
    FROM boundaries.egon_map_zensus_weather_cell
    """

    weather_map_df = db.select_dataframe(weather_sql).drop_duplicates(
        subset="zensus_population_id"
    )

    buildings_gdf = buildings_gdf.merge(
        right=weather_map_df,
        how="left",
        on="zensus_population_id",
    )

    # Every building must resolve to a weather cell.
    missing_mask = buildings_gdf.weather_cell_id.isna()

    if missing_mask.any():
        missing = buildings_gdf.loc[missing_mask].building_id.tolist()

        raise ValueError(
            f"Following buildings don't have a weather cell id: {missing}"
        )

    return buildings_gdf
2223
def add_bus_ids_sq(
    buildings_gdf: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """Add bus ids for status_quo units

    Status-quo rows are spatially joined against the MV grid districts;
    the matched district index becomes the unit's bus id.

    Parameters
    -----------
    buildings_gdf : geopandas.GeoDataFrame
        GeoDataFrame containing OSM buildings data with desaggregated PV
        plants.

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing OSM building data with bus_id per
        generator.
    """
    districts_gdf = grid_districts(EPSG)

    # Only status-quo units need a bus id here; scenario units already
    # carry one.
    is_status_quo = buildings_gdf.scenario == "status_quo"
    sq_buildings_gdf = buildings_gdf.loc[is_status_quo]

    joined_gdf = sq_buildings_gdf.sjoin(districts_gdf, how="left")
    buildings_gdf.loc[is_status_quo, "bus_id"] = joined_gdf.index_right

    return buildings_gdf
2251
def pv_rooftop_to_buildings():
    """Main script, executed as task"""
    mastr_gdf = load_mastr_data()
    buildings_gdf = load_building_data()

    desagg_mastr_gdf, desagg_buildings_gdf = allocate_to_buildings(
        mastr_gdf, buildings_gdf
    )

    # Status-quo units are the starting point for the scenario ramp-ups.
    status_quo_gdf = (
        desagg_mastr_gdf.assign(scenario="status_quo")
        .reset_index()
        .rename(columns={"geometry": "geom", "gens_id": "gens_id"})
    )

    all_buildings_gdf = status_quo_gdf
    scenario_buildings_gdf = status_quo_gdf.copy()
    cap_per_bus_id_df = pd.DataFrame()

    for scenario in SCENARIOS:
        logger.debug(f"Desaggregating scenario {scenario}.")

        # Each scenario builds on the buildings of the preceding one.
        (
            scenario_buildings_gdf,
            cap_per_bus_id_scenario_df,
        ) = allocate_scenarios(  # noqa: F841
            desagg_mastr_gdf,
            desagg_buildings_gdf,
            scenario_buildings_gdf,
            scenario,
        )

        combined_df = pd.concat(
            [all_buildings_gdf, scenario_buildings_gdf], ignore_index=True
        )

        all_buildings_gdf = gpd.GeoDataFrame(
            combined_df,
            crs=scenario_buildings_gdf.crs,
            geometry="geom",
        )

        cap_per_bus_id_df = pd.concat(
            [cap_per_bus_id_df, cap_per_bus_id_scenario_df]
        )

    # add weather cell
    all_buildings_gdf = add_weather_cell_id(all_buildings_gdf)

    # add bus IDs for status quo scenario
    all_buildings_gdf = add_bus_ids_sq(all_buildings_gdf)

    # export scenario
    create_scenario_table(add_voltage_level(all_buildings_gdf))