"""
Household electricity demand time series for the scenarios eGon2035 and
eGon100RE are set up at census cell level.

Electricity demand data for households in Germany is provided in 1-hourly
resolution for an entire year. Spatially, the data is resolved to
100 x 100 m cells and provides individual and distinct time series for each
household in a cell. The cells are defined by the Zensus 2011 dataset.

"""
11
from itertools import cycle, product
12
from pathlib import Path
13
import os
14
import random
15
16
from airflow.operators.python import PythonOperator
17
from sqlalchemy import ARRAY, Column, Float, Integer, String
18
from sqlalchemy.dialects.postgresql import CHAR, INTEGER, REAL
19
from sqlalchemy.ext.declarative import declarative_base
20
import numpy as np
21
import pandas as pd
22
23
from egon.data import db
24
from egon.data.datasets import Dataset
25
from egon.data.datasets.scenario_parameters import get_scenario_year
26
from egon.data.datasets.zensus_mv_grid_districts import MapZensusGridDistricts
27
import egon.data.config
28
29
Base = declarative_base()
30
engine = db.engine()
31
32
33
# Get random seed from config
34
RANDOM_SEED = egon.data.config.settings()["egon-data"]["--random-seed"]
35
36
37
class IeeHouseholdLoadProfiles(Base):
38
    """
39
    Class definition of table demand.iee_household_load_profiles.
40
    """
41
    __tablename__ = "iee_household_load_profiles"
42
    __table_args__ = {"schema": "demand"}
43
44
    id = Column(INTEGER, primary_key=True)
45
    type = Column(CHAR(8), index=True)
46
    load_in_wh = Column(ARRAY(REAL))
47
48
49
class HouseholdElectricityProfilesInCensusCells(Base):
50
    """
51
    Class definition of table demand.egon_household_electricity_profile_in_census_cell.
52
53
    Lists references and scaling parameters of time series data for each
54
    household in a cell by identifiers. This table is fundamental for creating
55
    subsequent data like demand profiles on MV grid level or for determining
56
    the peak load at load area level.
57
58
    """
59
    __tablename__ = "egon_household_electricity_profile_in_census_cell"
60
    __table_args__ = {"schema": "demand"}
61
62
    cell_id = Column(Integer, primary_key=True)
63
    grid_id = Column(String)
64
    cell_profile_ids = Column(ARRAY(String, dimensions=1))
65
    nuts3 = Column(String)
66
    nuts1 = Column(String)
67
    factor_2019 = Column(Float)
68
    factor_2023 = Column(Float)
69
    factor_2035 = Column(Float)
70
    factor_2050 = Column(Float)
71
72
73
class EgonDestatisZensusHouseholdPerHaRefined(Base):
74
    """
75
    Class definition of table society.egon_destatis_zensus_household_per_ha_refined.
76
    """
77
    __tablename__ = "egon_destatis_zensus_household_per_ha_refined"
78
    __table_args__ = {"schema": "society"}
79
80
    id = Column(INTEGER, primary_key=True)
81
    cell_id = Column(Integer, index=True)
82
    grid_id = Column(String, index=True)
83
    nuts3 = Column(String)
84
    nuts1 = Column(String)
85
    characteristics_code = Column(Integer)
86
    hh_5types = Column(Integer)
87
    hh_type = Column(CHAR(2))
88
    hh_10types = Column(Integer)
89
90
91
class EgonEtragoElectricityHouseholds(Base):
92
    """
93
    Class definition of table demand.egon_etrago_electricity_households.
94
95
    The table contains household electricity demand profiles aggregated at MV grid
96
    district level in MWh.
97
    """
98
    __tablename__ = "egon_etrago_electricity_households"
99
    __table_args__ = {"schema": "demand"}
100
101
    bus_id = Column(Integer, primary_key=True)
102
    scn_name = Column(String, primary_key=True)
103
    p_set = Column(ARRAY(Float))
104
    q_set = Column(ARRAY(Float))
105
106
107
class HouseholdDemands(Dataset):
108
    """
    Household electricity demand time series for the scenarios eGon2035 and
    eGon100RE are set up at census cell level.

    Electricity demand data for households in Germany is provided in 1-hourly
    resolution for an entire year. Spatially, the data is resolved to
    100 x 100 m cells and provides individual and distinct time series for
    each household in a cell. The cells are defined by the Zensus 2011
    dataset.
116
117
    *Dependencies*
118
      * :py:class:`DemandRegio <egon.data.datasets.demandregio.DemandRegio>`
119
      * :py:class:`Vg250 <egon.data.datasets.vg250.Vg250>`
120
      * :py:class:`OsmBuildingsStreets <egon.data.datasets.osm_buildings_streets.OsmBuildingsStreets>`
121
      * :py:func:`create_buildings_residential_zensus_mapping <egon.data.datasets.osm_buildings_streets.create_buildings_residential_zensus_mapping>`
122
      * :py:class:`ZensusMiscellaneous <egon.data.datasets.zensus.ZensusMiscellaneous>`
123
      * :py:class:`ZensusMvGridDistricts <egon.data.datasets.zensus_mv_grid_districts.ZensusMvGridDistricts>`
124
      * :py:class:`ZensusVg250 <egon.data.datasets.zensus_vg250.ZensusVg250>`
125
126
    *Resulting tables*
127
      * :py:class:`demand.iee_household_load_profiles
128
        <IeeHouseholdLoadProfiles>` is created and filled
129
      * :py:class:`demand.egon_household_electricity_profile_in_census_cell
130
        <HouseholdElectricityProfilesInCensusCells>` is created and filled
131
      * :py:class:`society.egon_destatis_zensus_household_per_ha_refined
132
        <EgonDestatisZensusHouseholdPerHaRefined>` is created and filled
133
      * :py:class:`demand.egon_etrago_electricity_households
134
        <EgonEtragoElectricityHouseholds>` is created and filled
135
136
    The following datasets are used for creating the data:
137
138
    * Electricity demand time series for household categories
139
      produced by demand profile generator (DPG) from Fraunhofer IEE
140
      (see :func:`get_iee_hh_demand_profiles_raw`)
141
    * Spatial information about people living in households by Zensus 2011 at
142
      federal state level
143
144
        * Type of household (family status)
145
        * Age
146
        * Number of people
147
    * Spatial information about number of households per ha, categorized by type
148
      of household (family status) with 5 categories (also from Zensus 2011)
149
    * Demand-Regio annual household demand at NUTS3 level
150
151
    *What is the goal?*

    To use the electricity demand time series from the `demand profile
    generator` to create spatially referenced household demand time series
    for Germany at a resolution of 100 x 100 m cells.

    *What is the challenge?*

    The electricity demand time series produced by the demand profile
    generator offer 12 different household profile categories.
    To use most of them, the spatial information about the number of
    households per cell (5 categories) needs to be enriched with supplementary
    data to match the specifications of the household demand profile
    categories. Hence, 10 out of 12 household profile categories can be
    distinguished by increasing the number of categories of the cell-level
    household data.
166
167
    *How are these datasets combined?*

    * Spatial information about people living in households by Zensus (2011)
      at federal state (NUTS1) level `df_zensus` is aggregated to be
      compatible with the IEE household profile specifications.

        * exclude kids and reduce to adults and seniors
        * group as defined in `HH_TYPES`
        * convert data from people living in households to number of
          households by `mapping_people_in_households`
        * calculate the fraction of fine household types (10) within the
          subgroups of rough household types (5) `df_dist_households`
    * Spatial information about the number of households per ha
      `df_census_households_nuts3` is mapped to NUTS1 and NUTS3 level.
      Data is refined with household subgroups via
      `df_dist_households` to `df_census_households_grid_refined`.
    * The enriched 100 x 100 m household dataset is used to sample and
      aggregate household profiles. A table including individual profile ids
      for each cell and scaling factors to match the Demand-Regio annual sum
      projections for 2035 and 2050 at NUTS3 level is created in the database
      as `demand.household_electricity_profiles_in_census_cells`.
188
189
    *What are central assumptions during the data processing?*

    * Mapping zensus data to IEE household categories is not trivial. In the
      conversion from persons in households to numbers of households, the
      number of inhabitants of multi-person households is estimated as a
      weighted average in `OO_factor`
    * The distribution used to refine household types at cell level is assumed
      to be the same within each federal state
    * Refining household types leads to a non-integer number of profiles to be
      drawn per cell, which is rounded to the nearest integer by np.rint()
    * 100 x 100 m cells are matched to NUTS regions via their centroid
      location
    * Cells with households in unpopulated areas are removed
201
202
    *Drawbacks and limitations of the data*

    * The distribution used to refine household types at cell level is the
      same within each federal state
    * The aggregated annual demand of the household profiles matches the
      Demand Regio demand at NUTS-3 level, but it does not match the Demand
      Regio time series profile
    * Due to secrecy, some census data is heavily modified for certain
      attributes (quantity_q = 2). This cell data is not corrected but
      excluded.
    * The census data deviates from table to table because the statistical
      methods are not consistent. Hence, there are cases in which the data is
      contradictory.
    * Census data with attribute 'HHTYP_FAM' is missing for some cells with a
      small number of households. This data is generated using the average
      share of household types of cells with a similar number of households.
      For some cells the summed number of households per type deviates from
      the total number given by attribute 'INSGESAMT'. As the profiles are
      scaled with demand-regio data at NUTS-3 level, the impact at higher
      aggregation levels is negligible. For the sake of simplicity, the data
      is not corrected.
    * There are cells without household data but with a population. A randomly
      chosen household distribution is taken from a subgroup of cells with the
      same population value and applied to all cells with a missing household
      distribution and that population value.
223
224
    *Helper functions*
225
226
    * To access the DB and select specific profiles at various aggregation
      levels, use :py:func:`get_hh_profiles_from_db`
    * To access the DB, select specific profiles at various aggregation levels
      and scale them, use :py:func:`get_scaled_profiles_from_db`
230
231
232
    """
233
234
    #:
235
    name: str = "Household Demands"
236
    #:
237
    version: str = "0.0.12"
238
239
    def __init__(self, dependencies):
240
        tasks = (create_table, houseprofiles_in_census_cells,)
241
242
        if (
243
            "status2019"
244
            in egon.data.config.settings()["egon-data"]["--scenarios"]
245
        ):
246
            mv_hh_electricity_load_2019 = PythonOperator(
247
                task_id="MV-hh-electricity-load-2019",
248
                python_callable=mv_grid_district_HH_electricity_load,
249
                op_args=["status2019", 2019],
250
            )
251
252
            tasks = tasks + (mv_hh_electricity_load_2019,)
253
254
        if (
255
            "status2023"
256
            in egon.data.config.settings()["egon-data"]["--scenarios"]
257
        ):
258
            mv_hh_electricity_load_2023 = PythonOperator(
259
                task_id="MV-hh-electricity-load-2023",
260
                python_callable=mv_grid_district_HH_electricity_load,
261
                op_args=["status2023", 2023],
262
            )
263
264
            tasks = tasks + (mv_hh_electricity_load_2023,)
265
266
        if (
267
            "eGon2035"
268
            in egon.data.config.settings()["egon-data"]["--scenarios"]
269
        ):
270
            mv_hh_electricity_load_2035 = PythonOperator(
271
                task_id="MV-hh-electricity-load-2035",
272
                python_callable=mv_grid_district_HH_electricity_load,
273
                op_args=["eGon2035", 2035],
274
            )
275
276
            tasks = tasks + (mv_hh_electricity_load_2035,)
277
278
        if (
279
            "eGon100RE"
280
            in egon.data.config.settings()["egon-data"]["--scenarios"]
281
        ):
282
            mv_hh_electricity_load_2050 = PythonOperator(
283
                task_id="MV-hh-electricity-load-2050",
284
                python_callable=mv_grid_district_HH_electricity_load,
285
                op_args=["eGon100RE", 2050],
286
            )
287
288
            tasks = tasks + (mv_hh_electricity_load_2050,)
289
290
        super().__init__(
291
            name=self.name,
292
            version=self.version,
293
            dependencies=dependencies,
294
            tasks=tasks,
295
        )
296
297
def create_table():
298
    EgonEtragoElectricityHouseholds.__table__.drop(
299
        bind=engine, checkfirst=True
300
    )
301
    EgonEtragoElectricityHouseholds.__table__.create(
302
        bind=engine, checkfirst=True
303
    )
304
305
def clean(x):
306
    """Clean zensus household data row-wise
307
308
    Clean dataset by
309
310
    * converting '.' and '-' to str(0)
311
    * removing brackets
312
313
    Table can be converted to int/floats afterwards
314
315
    Parameters
316
    ----------
317
    x: pd.Series
318
        It is meant to be used with :code:`df.applymap()`
319
320
    Returns
321
    -------
322
    pd.Series
323
        Re-formatted data row
324
    """
325
    x = str(x).replace("-", str(0))
326
    x = str(x).replace(".", str(0))
327
    x = x.strip("()")
328
    return x
329
330
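# Illustrative usage sketch (not part of the pipeline): as noted in the
# docstring, clean() is applied element-wise before casting the raw zensus
# strings to integers, e.g.
#
#     df_raw = pd.DataFrame({"hh": ["(123)", "-", "."]})
#     df_int = df_raw.applymap(clean).applymap(int)
#     # -> 123, 0, 0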
331
def write_hh_profiles_to_db(hh_profiles):
332
    """Write HH demand profiles of IEE into db. One row per profile type.
333
    The annual load profile timeseries is an array.
334
335
    schema: demand
336
    tablename: iee_household_load_profiles
337
338
339
340
    Parameters
341
    ----------
342
    hh_profiles: pd.DataFrame
343
        It is meant to be used with :code:`df.applymap()`
344
345
    Returns
346
    -------
347
    """
348
    hh_profiles = hh_profiles.rename_axis("type", axis=1)
349
    hh_profiles = hh_profiles.rename_axis("timestep", axis=0)
350
    hh_profiles = hh_profiles.stack().rename("load_in_wh")
351
    hh_profiles = hh_profiles.to_frame().reset_index()
352
    hh_profiles = hh_profiles.groupby("type").load_in_wh.apply(tuple)
353
    hh_profiles = hh_profiles.reset_index()
354
355
    IeeHouseholdLoadProfiles.__table__.drop(bind=engine, checkfirst=True)
356
    IeeHouseholdLoadProfiles.__table__.create(bind=engine)
357
358
    hh_profiles.to_sql(
359
        name=IeeHouseholdLoadProfiles.__table__.name,
360
        schema=IeeHouseholdLoadProfiles.__table__.schema,
361
        con=engine,
362
        if_exists="append",
363
        method="multi",
364
        chunksize=100,
365
        index=False,
366
        dtype={
367
            "load_in_wh": IeeHouseholdLoadProfiles.load_in_wh.type,
368
            "type": IeeHouseholdLoadProfiles.type.type,
369
            "id": IeeHouseholdLoadProfiles.id.type,
370
        },
371
    )
372
373
374
def get_iee_hh_demand_profiles_raw():
375
    """Gets and returns household electricity demand profiles from the
376
    egon-data-bundle.
377
378
    Household electricity demand profiles generated by Fraunhofer IEE.
379
    Methodology is described in
380
    `Erzeugung zeitlich hochaufgelöster Stromlastprofile für verschiedene Haushaltstypen
381
    <https://www.researchgate.net/publication/273775902_Erzeugung_zeitlich_hochaufgeloster_Stromlastprofile_fur_verschiedene_Haushaltstypen>`_.
382
    It is used and further described in the following theses by:
383
384
    * Jonas Haack:
385
      "Auswirkungen verschiedener Haushaltslastprofile auf PV-Batterie-Systeme"
386
      (confidential)
387
    * Simon Ruben Drauz
388
      "Synthesis of a heat and electrical load profile for single and
389
      multi-family houses used for subsequent performance tests of a
390
      multi-component energy system",
391
      http://dx.doi.org/10.13140/RG.2.2.13959.14248
392
393
    Notes
394
    -----
395
    The household electricity demand profiles have been generated for 2016
396
    which is a leap year (8784 hours) starting on a Friday. The weather year
397
    is 2011 and the heat timeseries 2011 are generated for 2011 too (cf.
398
    dataset :mod:`egon.data.datasets.heat_demand_timeseries.HTS`), having
399
    8760h and starting on a Saturday. To align the profiles, the first day of
400
    the IEE profiles are deleted, resulting in 8760h starting on Saturday.
401
402
    Returns
403
    -------
404
    pd.DataFrame
405
        Table with profiles in columns and time as index. A pd.MultiIndex is
406
        used to distinguish load profiles from different EUROSTAT household
407
        types.
408
    """
409
    data_config = egon.data.config.datasets()
410
    pa_config = data_config["hh_demand_profiles"]
411
412
    def ve(s):
413
        raise (ValueError(s))
414
415
    dataset = egon.data.config.settings()["egon-data"]["--dataset-boundary"]
416
417
    file_section = (
418
        "path"
419
        if dataset == "Everything"
420
        else "path_testmode"
421
        if dataset == "Schleswig-Holstein"
422
        else ve(f"'{dataset}' is not a valid dataset boundary.")
423
    )
424
425
    file_path = pa_config["sources"]["household_electricity_demand_profiles"][
426
        file_section
427
    ]
428
429
    download_directory = os.path.join(
430
        "data_bundle_egon_data", "household_electricity_demand_profiles"
431
    )
432
433
    hh_profiles_file = (
434
        Path(".") / Path(download_directory) / Path(file_path).name
435
    )
436
437
    df_hh_profiles = pd.read_hdf(hh_profiles_file)
438
439
    # aggregate profile types O2, O1 and O0 as there is no differentiation
440
    # possible at cell level see :func:`regroup_nuts1_census_data`.
441
    merge_profiles = [i for i in df_hh_profiles.columns if "O1" in i[:3]]
442
    merge_profiles += [i for i in df_hh_profiles.columns if "O2" in i[:3]]
443
    merge_profiles += [i for i in df_hh_profiles.columns if "O0" in i[:3]]
444
    mapping = {f"{old}": f"O0a{i:05d}" for i, old in enumerate(merge_profiles)}
445
    df_hh_profiles.rename(columns=mapping, inplace=True)
446
447
    return df_hh_profiles
448
449
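# Illustrative sketch (assumption, not done in this function): the alignment
# described in the Notes above, i.e. dropping the first day of the 8784 h
# leap-year profiles so that 8760 h remain, starting on a Saturday:
#
#     df_profiles = get_iee_hh_demand_profiles_raw()   # 8784 hourly values
#     df_aligned = df_profiles.iloc[24:].reset_index(drop=True)  # 8760 values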
450
def set_multiindex_to_profiles(hh_profiles):
451
    """The profile id is split into type and number and set as multiindex.
452
453
    Parameters
454
    ----------
455
    hh_profiles: pd.DataFrame
456
        Profiles
457
    Returns
458
    -------
459
    hh_profiles: pd.DataFrame
460
        Profiles with Multiindex
461
    """
462
463
    # set multiindex to HH_types
464
    hh_profiles.columns = pd.MultiIndex.from_arrays(
465
        [hh_profiles.columns.str[:2], hh_profiles.columns.str[3:]]
466
    )
467
468
    # Cast profile ids into tuple of type and int
469
    hh_profiles.columns = pd.MultiIndex.from_tuples(
470
        [(a, int(b)) for a, b in hh_profiles.columns]
471
    )
472
473
    return hh_profiles
474
475
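# Example (illustrative, assuming profile ids of the form
# "<type><separator><number>", such as "SRa00001" or "O0a00000"): the id is
# split into the two-character type and the running number, e.g.
#
#     df = pd.DataFrame(0.0, index=range(3), columns=["SRa00001", "O0a00002"])
#     df = set_multiindex_to_profiles(df)
#     # df.columns -> MultiIndex([("SR", 1), ("O0", 2)])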
476
def get_census_households_nuts1_raw():
477
    """Get zensus age x household type data from egon-data-bundle
478
479
    Dataset about household size with information about the categories:
480
481
    * family type
482
    * age class
483
    * household size
484
485
    for Germany in spatial resolution of federal states NUTS-1.
486
487
    Data manually selected and retrieved from:
488
    https://ergebnisse2011.zensus2022.de/datenbank/online
489
    For reproducing data selection, please do:
490
491
    * Search for: "1000A-3016"
492
    * or choose topic: "Bevölkerung kompakt"
493
    * Choose table code: "1000A-3016" with title "Personen: Alter
494
      (11 Altersklassen) - Größe des privaten Haushalts - Typ des privaten
495
      Haushalts (nach Familien/Lebensform)"
496
    * Change setting "GEOLK1" to "Bundesländer (16)"
497
498
    Data would be available in higher resolution
499
    ("Landkreise und kreisfreie Städte (412)"), but only after registration.
500
501
    The downloaded file is called 'Zensus2011_Personen.csv'.
502
503
504
    Returns
505
    -------
506
    pd.DataFrame
507
        Pre-processed zensus household data
508
    """
509
    data_config = egon.data.config.datasets()
510
    pa_config = data_config["hh_demand_profiles"]
511
    file_path = pa_config["sources"]["zensus_household_types"]["path"]
512
513
    download_directory = os.path.join(
514
        "data_bundle_egon_data", "zensus_households"
515
    )
516
517
    households_file = (
518
        Path(".") / Path(download_directory) / Path(file_path).name
519
    )
520
521
    households_raw = pd.read_csv(
522
        households_file,
523
        sep=";",
524
        decimal=".",
525
        skiprows=5,
526
        skipfooter=7,
527
        index_col=[0, 1],
528
        header=[0, 1],
529
        encoding="latin1",
530
        engine="python",
531
    )
532
533
    return households_raw
534
535
536
def create_missing_zensus_data(
537
    df_households_typ, df_missing_data, missing_cells
538
):
539
    """
540
    Generate missing data as average share of the household types for cell groups with
541
    the same amount of households.
542
543
    There is missing data for specific attributes in the zensus dataset because
544
    of secrecy reasons. For some cells with only a small number of households,
    the attribute HHTYP_FAM is missing. However, the total number of
    households is known from attribute INSGESAMT. The missing data is
    generated as the average share of the household types for cell groups with
    the same number of households.
549
550
    Parameters
551
    ----------
552
    df_households_typ: pd.DataFrame
553
        Zensus households data
554
    df_missing_data: pd.DataFrame
555
        number of missing cells of group of amount of households
556
    missing_cells: dict
557
        dictionary with list of grids of the missing cells grouped by amount of
558
        households in cell
559
560
    Returns
561
    ----------
562
    df_average_split: pd.DataFrame
563
        generated dataset of missing cells
564
565
    """
566
    # grid_ids of missing cells grouped by amount of households
567
    missing_grid_ids = {
568
        group: list(df.grid_id)
569
        for group, df in missing_cells.groupby("quantity")
570
    }
571
572
    # Grid ids for cells with low household numbers
573
    df_households_typ = df_households_typ.set_index("grid_id", drop=True)
574
    hh_in_cells = df_households_typ.groupby("grid_id")["quantity"].sum()
575
    hh_index = {
576
        i: hh_in_cells.loc[hh_in_cells == i].index
577
        for i in df_missing_data.households.values
578
    }
579
580
    df_average_split = pd.DataFrame()
581
    for hh_size, index in hh_index.items():
582
        # average split of household types in cells with low household numbers
583
        split = (
584
            df_households_typ.loc[index].groupby("characteristics_code").sum()
585
            / df_households_typ.loc[index].quantity.sum()
586
        )
587
        split = split.quantity * hh_size
588
589
        # correct rounding
590
        difference = int(split.sum() - split.round().sum())
591
        if difference > 0:
592
            # add to any row
593
            split = split.round()
594
            random_row = split.sample()
595
            split[random_row.index] = random_row + difference
596
        elif difference < 0:
597
            # subtract only from rows > 0
598
            split = split.round()
599
            random_row = split[split > 0].sample()
600
            split[random_row.index] = random_row + difference
601
        else:
602
            split = split.round()
603
604
        # Dataframe with average split for each cell
605
        temp = pd.DataFrame(
606
            product(zip(split, range(1, 6)), missing_grid_ids[hh_size]),
607
            columns=["tuple", "grid_id"],
608
        )
609
        temp = pd.DataFrame(temp.tuple.tolist()).join(temp.grid_id)
610
        temp = temp.rename(columns={0: "hh_5types", 1: "characteristics_code"})
611
        temp = temp.dropna()
612
        temp = temp[(temp["hh_5types"] != 0)]
613
        # append for each cell group of households
614
        df_average_split = pd.concat(
615
            [df_average_split, temp], ignore_index=True
616
        )
617
    df_average_split["hh_5types"] = df_average_split["hh_5types"].astype(int)
618
619
    return df_average_split
620
621
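# Rounding correction in a nutshell (illustrative toy numbers): if the rounded
# average split does not add up to the target number of households, the
# residual is added to one randomly chosen row (or subtracted from a row > 0),
# e.g.
#
#     split = pd.Series([1.4, 1.4, 0.2])                    # target sum: 3
#     difference = int(split.sum() - split.round().sum())   # 3 - 2 = 1
#     split = split.round()
#     split[split.sample().index] += difference             # one row absorbs it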
622
def process_nuts1_census_data(df_census_households_raw):
623
    """Make data compatible with household demand profile categories
624
625
    Removes and reorders categories which are not needed to fit data to
626
    household types of IEE electricity demand time series generated by
627
    demand-profile-generator (DPG).
628
629
    * Kids (<15) are excluded as they are also excluded in DPG origin dataset
630
    * Adults (15 to <65)
    * Seniors (>=65)
632
633
    Parameters
634
    ----------
635
    df_census_households_raw: pd.DataFrame
636
        cleaned zensus household type x age category data
637
638
    Returns
639
    -------
640
    pd.DataFrame
641
        Aggregated zensus household data on NUTS-1 level
642
    """
643
644
    # Clean data to int only
645
    df_census_households = df_census_households_raw.applymap(clean).applymap(
646
        int
647
    )
648
649
    # Group data to fit Load Profile Generator categories
650
    # define kids/adults/seniors
651
    kids = ["Unter 3", "3 - 5", "6 - 14"]  # < 15
652
    adults = [
653
        "15 - 17",
654
        "18 - 24",
655
        "25 - 29",
656
        "30 - 39",
657
        "40 - 49",
658
        "50 - 64",
659
    ]  # 15 < x <65
660
    seniors = ["65 - 74", "75 und älter"]  # >65
661
662
    # sum groups of kids, adults and seniors and concat
663
    df_kids = (
664
        df_census_households.loc[:, (slice(None), kids)]
665
        .groupby(level=0, axis=1)
666
        .sum()
667
    )
668
    df_adults = (
669
        df_census_households.loc[:, (slice(None), adults)]
670
        .groupby(level=0, axis=1)
671
        .sum()
672
    )
673
    df_seniors = (
674
        df_census_households.loc[:, (slice(None), seniors)]
675
        .groupby(level=0, axis=1)
676
        .sum()
677
    )
678
    df_census_households = pd.concat(
679
        [df_kids, df_adults, df_seniors],
680
        axis=1,
681
        keys=["Kids", "Adults", "Seniors"],
682
        names=["age", "persons"],
683
    )
684
685
    # reduce column names to state only
686
    mapping_state = {
687
        i: i.split()[1]
688
        for i in df_census_households.index.get_level_values(level=0)
689
    }
690
691
    # rename index
692
    df_census_households = df_census_households.rename(
693
        index=mapping_state, level=0
694
    )
695
    # rename axis
696
    df_census_households = df_census_households.rename_axis(["state", "type"])
697
    # unstack
698
    df_census_households = df_census_households.unstack()
699
    # reorder levels
700
    df_census_households = df_census_households.reorder_levels(
701
        order=["type", "persons", "age"], axis=1
702
    )
703
704
    return df_census_households
705
706
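# Illustrative sketch (toy frame, not real census figures): age classes are
# summed per household type, shown here for a single adult age class only:
#
#     cols = pd.MultiIndex.from_product(
#         [["Einpersonenhaushalte (Singlehaushalte)"], ["18 - 24", "65 - 74"]]
#     )
#     df = pd.DataFrame([[5, 2]], columns=cols)
#     df.loc[:, (slice(None), ["18 - 24"])].groupby(level=0, axis=1).sum()
#     # -> 5 adults for the single-household type; seniors are summed likewise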
707
def regroup_nuts1_census_data(df_census_households_nuts1):
708
    """Regroup census data and map according to demand-profile types.
709
710
    For more information look at the respective publication:
711
    https://www.researchgate.net/publication/273775902_Erzeugung_zeitlich_hochaufgeloster_Stromlastprofile_fur_verschiedene_Haushaltstypen
712
713
714
    Parameters
715
    ----------
716
    df_census_households_nuts1: pd.DataFrame
717
        census household data on NUTS-1 level in absolute values
718
719
    Returns
720
    ----------
721
    df_hh_distribution_abs: pd.DataFrame
        Number of people by household type per federal state, in absolute
        values
723
    """
724
725
    # Mapping of census household family types to Eurostat household types
726
    # - Adults living in households type
727
    # - kids are  not included even if mentioned in household type name
728
    # **! The Eurostat data only counts adults/seniors, excluding kids <15**
729
    # Eurostat household types are used for demand-profile-generator
730
    # @iee-fraunhofer
731
    hh_types_eurostat = {
732
        "SR": [
733
            ("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Seniors"),
734
            ("Alleinerziehende Elternteile", "Insgesamt", "Seniors"),
735
        ],
736
        # Single Seniors Single Parents Seniors
737
        "SO": [
738
            ("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Adults")
739
        ],  # Single Adults
740
        "SK": [("Alleinerziehende Elternteile", "Insgesamt", "Adults")],
741
        # Single Parents Adult
742
        "PR": [
743
            ("Paare ohne Kind(er)", "2 Personen", "Seniors"),
744
            (
745
                "Mehrpersonenhaushalte ohne Kernfamilie",
746
                "2 Personen",
747
                "Seniors",
748
            ),
749
        ],
750
        # Couples without Kids Senior & same sex couples & shared flat seniors
751
        "PO": [
752
            ("Paare ohne Kind(er)", "2 Personen", "Adults"),
753
            ("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Adults"),
754
        ],
755
        # Couples without Kids adults & same sex couples & shared flat adults
756
        "P1": [("Paare mit Kind(ern)", "3 Personen", "Adults")],
757
        "P2": [("Paare mit Kind(ern)", "4 Personen", "Adults")],
758
        "P3": [
759
            ("Paare mit Kind(ern)", "5 Personen", "Adults"),
760
            ("Paare mit Kind(ern)", "6 und mehr Personen", "Adults"),
761
        ],
762
        "OR": [
763
            (
764
                "Mehrpersonenhaushalte ohne Kernfamilie",
765
                "3 Personen",
766
                "Seniors",
767
            ),
768
            (
769
                "Mehrpersonenhaushalte ohne Kernfamilie",
770
                "4 Personen",
771
                "Seniors",
772
            ),
773
            (
774
                "Mehrpersonenhaushalte ohne Kernfamilie",
775
                "5 Personen",
776
                "Seniors",
777
            ),
778
            (
779
                "Mehrpersonenhaushalte ohne Kernfamilie",
780
                "6 und mehr Personen",
781
                "Seniors",
782
            ),
783
            ("Paare mit Kind(ern)", "3 Personen", "Seniors"),
784
            ("Paare ohne Kind(er)", "3 Personen", "Seniors"),
785
            ("Paare mit Kind(ern)", "4 Personen", "Seniors"),
786
            ("Paare ohne Kind(er)", "4 Personen", "Seniors"),
787
            ("Paare mit Kind(ern)", "5 Personen", "Seniors"),
788
            ("Paare ohne Kind(er)", "5 Personen", "Seniors"),
789
            ("Paare mit Kind(ern)", "6 und mehr Personen", "Seniors"),
790
            ("Paare ohne Kind(er)", "6 und mehr Personen", "Seniors"),
791
        ],
792
        # no info about share of kids
793
        # OO, O1, O2 have the same amount, as no information about the share of
794
        # kids within census data set.
795
        "OO": [
796
            ("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Adults"),
797
            ("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Adults"),
798
            ("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Adults"),
799
            (
800
                "Mehrpersonenhaushalte ohne Kernfamilie",
801
                "6 und mehr Personen",
802
                "Adults",
803
            ),
804
            ("Paare ohne Kind(er)", "3 Personen", "Adults"),
805
            ("Paare ohne Kind(er)", "4 Personen", "Adults"),
806
            ("Paare ohne Kind(er)", "5 Personen", "Adults"),
807
            ("Paare ohne Kind(er)", "6 und mehr Personen", "Adults"),
808
        ],
809
        # no info about share of kids
810
    }
811
812
    # absolute values
813
    df_hh_distribution_abs = pd.DataFrame(
814
        (
815
            {
816
                hhtype: df_census_households_nuts1.loc[countries, codes].sum()
817
                for hhtype, codes in hh_types_eurostat.items()
818
            }
819
            for countries in df_census_households_nuts1.index
820
        ),
821
        index=df_census_households_nuts1.index,
822
    )
823
    # drop zero columns
824
    df_hh_distribution_abs = df_hh_distribution_abs.loc[
825
        :, (df_hh_distribution_abs != 0).any(axis=0)
826
    ].T
827
828
    return df_hh_distribution_abs
829
830
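# Illustrative example (hypothetical state name): for each federal state, the
# people counted under all census categories mapped to one Eurostat type are
# summed, e.g. for single seniors and senior single parents ("SR"):
#
#     codes = hh_types_eurostat["SR"]          # defined inside this function
#     # df_census_households_nuts1.loc["Hessen", codes].sum()
#     # -> total number of senior people in SR-type households in Hessen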
831
def inhabitants_to_households(df_hh_people_distribution_abs):
832
    """
833
    Convert the number of inhabitants to the number of households per type

    Takes the distribution of people living in household types and
    calculates a distribution of household types by using a
    people-in-household mapping. Results are not rounded to int as they are
    only used to calculate a relative distribution anyway.
    The data of category 'HHGROESS_KLASS' of the census households
    at grid level is used to determine an average wherever the number
    of people per household is not trivial (OR, OO). Kids are not counted.
842
843
    Parameters
844
    ----------
845
    df_hh_people_distribution_abs: pd.DataFrame
846
        Grouped census household data on NUTS-1 level in absolute values
847
848
    Returns
849
    ----------
850
    df_dist_households: pd.DataFrame
851
        Distribution of households type
852
853
    """
854
855
    # Get the number of households per household-size class, summed over all
    # census cells. As this is only used to estimate the size of OR and OO
    # households, the size classes "1 Person" and "2 Personen" are dropped
858
    df_hh_size = db.select_dataframe(
859
        sql="""
860
                SELECT characteristics_text, SUM(quantity) as summe
861
                FROM society.egon_destatis_zensus_household_per_ha as egon_d
862
                WHERE attribute = 'HHGROESS_KLASS' AND quantity_q < 2
863
                GROUP BY characteristics_text """,
864
        index_col="characteristics_text",
865
    )
866
    df_hh_size = df_hh_size.drop(index=["1 Person", "2 Personen"])
867
868
    # Define/ estimate number of persons (w/o kids) for each household category
869
    # For categories S* and P* it's clear; for multi-person households (OO,OR)
870
    # the number is estimated as average by taking remaining persons
871
    OO_factor = (
872
        sum(df_hh_size["summe"] * [3, 4, 5, 6]) / df_hh_size["summe"].sum()
873
    )
874
    mapping_people_in_households = {
875
        "SR": 1,
876
        "SO": 1,
877
        "SK": 1,  # kids are excluded
878
        "PR": 2,
879
        "PO": 2,
880
        "P1": 2,  # kids are excluded
881
        "P2": 2,  # ""
882
        "P3": 2,  # ""
883
        "OR": OO_factor,
884
        "OO": OO_factor,
885
    }
886
887
    # compare categories and remove from the mapping if there are too many
888
    diff = set(df_hh_people_distribution_abs.index) ^ set(
889
        mapping_people_in_households.keys()
890
    )
891
892
    if bool(diff):
893
        for key in diff:
894
            mapping_people_in_households = dict(mapping_people_in_households)
895
            del mapping_people_in_households[key]
896
        print(f"Removed {diff} from mapping!")
897
898
    # divide amount of people by people in household types
899
    df_dist_households = df_hh_people_distribution_abs.div(
900
        mapping_people_in_households, axis=0
901
    )
902
903
    return df_dist_households
904
905
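# Worked example for the OO/OR household size (toy numbers, not taken from the
# database): with 100 three-person, 50 four-person, 20 five-person and 10
# six-or-more-person households, the weighted average would be
#
#     OO_factor = (100*3 + 50*4 + 20*5 + 10*6) / (100 + 50 + 20 + 10)  # ~3.67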
906
def impute_missing_hh_in_populated_cells(df_census_households_grid):
907
    """
908
    Fills in missing household data in populated cells based on a random selection from
909
    a subgroup of cells with the same population value.
910
911
    There are cells without household data but a population. A randomly
912
    chosen household distribution is taken from a subgroup of cells with same
913
    population value and applied to all cells with missing household
914
    distribution and the specific population value. In the case, in which there
915
    is no subgroup with household data of the respective population value, the
916
    fallback is the subgroup with the last last smaller population value.
917
918
    Parameters
919
    ----------
920
    df_census_households_grid: pd.DataFrame
921
        census household data at 100x100m grid level
922
923
    Returns
924
    -------
925
    pd.DataFrame
926
        substituted census household data at 100x100m grid level"""
927
928
    df_w_hh = df_census_households_grid.dropna().reset_index(drop=True)
929
    df_wo_hh = df_census_households_grid.loc[
930
        df_census_households_grid.isna().any(axis=1)
931
    ].reset_index(drop=True)
932
933
    # iterate over unique population values
934
    for population in df_wo_hh["population"].sort_values().unique():
935
936
        # create fallback if no cell with specific population available
937
        if population in df_w_hh["population"].unique():
938
            fallback_value = population
939
            population_value = population
940
        # use fallback of last possible household distribution
941
        else:
942
            population_value = fallback_value
943
944
        # get cells with specific population value from cells with
945
        # household distribution
946
        df_w_hh_population_i = df_w_hh.loc[
947
            df_w_hh["population"] == population_value
948
        ]
949
        # choose random cell within this group
950
        rnd_cell_id_population_i = np.random.choice(
951
            df_w_hh_population_i["cell_id"].unique()
952
        )
953
        # get household distribution of this cell
954
        df_rand_hh_distribution = df_w_hh_population_i.loc[
955
            df_w_hh_population_i["cell_id"] == rnd_cell_id_population_i
956
        ]
957
        # get cells with specific population value from cells without
958
        # household distribution
959
        df_wo_hh_population_i = df_wo_hh.loc[
960
            df_wo_hh["population"] == population
961
        ]
962
963
        # all cells will get the same random household distribution
964
965
        # prepare size of dataframe by number of household types
966
        df_repeated = pd.concat(
967
            [df_wo_hh_population_i] * df_rand_hh_distribution.shape[0],
968
            ignore_index=True,
969
        )
970
        df_repeated = df_repeated.sort_values("cell_id").reset_index(drop=True)
971
972
        # insert random household distribution
973
        columns = ["characteristics_code", "hh_5types"]
974
        df_repeated.loc[:, columns] = pd.concat(
975
            [df_rand_hh_distribution.loc[:, columns]]
976
            * df_wo_hh_population_i.shape[0]
977
        ).values
978
        # append new cells
979
        df_w_hh = pd.concat([df_w_hh, df_repeated], ignore_index=True)
980
981
    return df_w_hh
982
983
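# Illustrative sketch of the imputation rule (toy values): cells without
# household data take the distribution of one randomly picked donor cell with
# the same population; if no donor exists, the next smaller population value
# with household data serves as fallback:
#
#     populations_with_data = {1, 2, 4}
#     # population = 3 has no donors -> fall back to the donors of population 2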
984
def get_census_households_grid():
985
    """
986
    Retrieves and adjusts census household data at 100x100m grid level, accounting for
987
    missing or divergent data.
988
989
    Query census household data at 100x100m grid level from the database.
    There is a divergence in the census household data depending on which
    attribute is used. There also exist cells without household data but with
    population data. The missing data in these cases is substituted. First,
    census household data with attribute 'HHTYP_FAM' is missing for some
    cells with a small number of households. This data is generated using the
    average share of household types of cells with a similar number of
    households. For some cells the summed number of households per type
    deviates from the total number given by attribute 'INSGESAMT'. As the
    profiles are scaled with demand-regio data at NUTS-3 level, the impact at
    higher aggregation levels is negligible. For the sake of simplicity, the
    data is not corrected.
1000
1001
    Returns
1002
    -------
1003
    pd.DataFrame
1004
        census household data at 100x100m grid level"""
1005
1006
    # Retrieve information about households for each census cell
1007
    # Only use cell-data which quality (quantity_q<2) is acceptable
1008
    df_census_households_grid = db.select_dataframe(
1009
        sql="""
1010
                SELECT grid_id, attribute, characteristics_code,
1011
                 characteristics_text, quantity
1012
                FROM society.egon_destatis_zensus_household_per_ha
1013
                WHERE attribute = 'HHTYP_FAM' AND quantity_q <2"""
1014
    )
1015
    df_census_households_grid = df_census_households_grid.drop(
1016
        columns=["attribute", "characteristics_text"]
1017
    )
1018
1019
    # Missing data is detected
1020
    df_missing_data = db.select_dataframe(
1021
        sql="""
1022
                    SELECT count(joined.quantity_gesamt) as amount,
1023
                     joined.quantity_gesamt as households
1024
                    FROM(
1025
                        SELECT t2.grid_id, quantity_gesamt, quantity_sum_fam,
1026
                         (quantity_gesamt-(case when quantity_sum_fam isnull
1027
                         then 0 else quantity_sum_fam end))
1028
                         as insgesamt_minus_fam
1029
                    FROM (
1030
                        SELECT  grid_id, SUM(quantity) as quantity_sum_fam
1031
                        FROM society.egon_destatis_zensus_household_per_ha
1032
                        WHERE attribute = 'HHTYP_FAM'
1033
                        GROUP BY grid_id) as t1
1034
                    Full JOIN (
1035
                        SELECT grid_id, sum(quantity) as quantity_gesamt
1036
                        FROM society.egon_destatis_zensus_household_per_ha
1037
                        WHERE attribute = 'INSGESAMT'
1038
                        GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id
1039
                        ) as joined
1040
                    WHERE quantity_sum_fam isnull
1041
                    Group by quantity_gesamt """
1042
    )
1043
    missing_cells = db.select_dataframe(
1044
        sql="""
1045
                    SELECT t12.grid_id, t12.quantity
1046
                    FROM (
1047
                    SELECT t2.grid_id, (case when quantity_sum_fam isnull
1048
                    then quantity_gesamt end) as quantity
1049
                    FROM (
1050
                        SELECT  grid_id, SUM(quantity) as quantity_sum_fam
1051
                        FROM society.egon_destatis_zensus_household_per_ha
1052
                        WHERE attribute = 'HHTYP_FAM'
1053
                        GROUP BY grid_id) as t1
1054
                    Full JOIN (
1055
                        SELECT grid_id, sum(quantity) as quantity_gesamt
1056
                        FROM society.egon_destatis_zensus_household_per_ha
1057
                        WHERE attribute = 'INSGESAMT'
1058
                        GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id
1059
                        ) as t12
1060
                    WHERE quantity is not null"""
1061
    )
1062
1063
    # Missing cells are substituted by average share of cells with same amount
1064
    # of households.
1065
    df_average_split = create_missing_zensus_data(
1066
        df_census_households_grid, df_missing_data, missing_cells
1067
    )
1068
1069
    df_census_households_grid = df_census_households_grid.rename(
1070
        columns={"quantity": "hh_5types"}
1071
    )
1072
1073
    df_census_households_grid = pd.concat(
1074
        [df_census_households_grid, df_average_split], ignore_index=True
1075
    )
1076
1077
    # Census cells with nuts3 and nuts1 information
1078
    df_grid_id = db.select_dataframe(
1079
        sql="""
1080
                SELECT pop.grid_id, pop.id as cell_id, pop.population,
1081
                 vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen
1082
                FROM
1083
                society.destatis_zensus_population_per_ha_inside_germany as pop
1084
                LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250
1085
                ON (pop.id=vg250.zensus_population_id)
1086
                LEFT JOIN boundaries.vg250_lan as lan
1087
                ON (LEFT(vg250.vg250_nuts3, 3) = lan.nuts)
1088
                WHERE lan.gf = 4 """
1089
    )
1090
    df_grid_id = df_grid_id.drop_duplicates()
1091
    df_grid_id = df_grid_id.reset_index(drop=True)
1092
1093
    # Merge household type and size data with considered (populated) census
1094
    # cells how='right' is used as ids of unpopulated areas are removed
1095
    # by df_grid_id or ancestors. See here:
1096
    # https://github.com/openego/eGon-data/blob/add4944456f22b8873504c5f579b61dca286e357/src/egon/data/datasets/zensus_vg250.py#L269
1097
    df_census_households_grid = pd.merge(
1098
        df_census_households_grid,
1099
        df_grid_id,
1100
        left_on="grid_id",
1101
        right_on="grid_id",
1102
        how="right",
1103
    )
1104
    df_census_households_grid = df_census_households_grid.sort_values(
1105
        ["cell_id", "characteristics_code"]
1106
    )
1107
1108
    return df_census_households_grid
1109
1110
1111
def proportionate_allocation(
1112
    df_group, dist_households_nuts1, hh_10types_cluster
1113
):
1114
    """Household distribution at nuts1 are applied at census cell within group
1115
1116
    To refine the hh_5types and keep the distribution at nuts1 level,
1117
    the household types are clustered and drawn with proportionate weighting.
1118
    The resulting pool is splitted into subgroups with sizes according to
1119
    the number of households of clusters in cells.
1120
1121
    Parameters
1122
    ----------
1123
    df_group: pd.DataFrame
1124
        Census household data at grid level for specific hh_5type cluster in
1125
        a federal state
1126
    dist_households_nuts1: pd.Series
1127
        Household distribution of hh_10types in a federal state
1128
    hh_10types_cluster: list of str
1129
        Cluster of household types to be refined to
1130
1131
    Returns
1132
    -------
1133
    pd.DataFrame
1134
        Refined household data with hh_10types of cluster at nuts1 level
1135
    """
1136
1137
    # get probability of households within hh_5types group
1138
    probability = dist_households_nuts1[hh_10types_cluster].values
1139
    # get total number of households within hh_5types group in federal state
1140
    size = df_group["hh_5types"].sum().astype(int)
1141
1142
    # random sample within hh_5types group with probability for whole federal
1143
    # state
1144
    choices = np.random.choice(
1145
        a=hh_10types_cluster, size=size, replace=True, p=probability
1146
    )
1147
    # get section sizes to split the sample pool from federal state to grid
1148
    # cells
1149
    split_sections = df_group["hh_5types"].cumsum().astype(int)[:-1]
1150
    # split into grid cell groups
1151
    samples = np.split(choices, split_sections)
1152
    # count number of hh_10types for each cell
1153
    sample_count = [np.unique(x, return_counts=True) for x in samples]
1154
1155
    df_distribution = pd.DataFrame(
1156
        sample_count, columns=["hh_type", "hh_10types"]
1157
    )
1158
    # add cell_ids
1159
    df_distribution["cell_id"] = df_group["cell_id"].unique()
1160
1161
    # unnest
1162
    df_distribution = (
1163
        df_distribution.apply(pd.Series.explode)
1164
        .reset_index(drop=True)
1165
        .dropna()
1166
    )
1167
1168
    return df_distribution
1169
1170
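# Minimal sketch of the allocation step (toy data, same numpy calls as above):
# hh_10types are drawn with the NUTS-1 probabilities for the whole pool and
# the pool is then split back into cells by their cumulative hh_5types counts:
#
#     counts = np.array([2, 3])                                # households/cell
#     pool = np.random.choice(["PR", "PO"], size=counts.sum(), p=[0.4, 0.6])
#     per_cell = np.split(pool, counts.cumsum()[:-1])          # one array/cell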
1171
def refine_census_data_at_cell_level(
1172
    df_census_households_grid,
1173
    df_census_households_nuts1,
1174
):
1175
    """
1176
    Processes and merges census data to specify household numbers and types per census
1177
    cell according to IEE profiles.
1178
1179
    The census data is processed to define the number and type of households
1180
    per zensus cell. Two subsets of the census data are merged to fit the
1181
    IEE profiles specifications. To do this, proportionate allocation is
1182
    applied at nuts1 level and within household type clusters.
1183
1184
    .. list-table:: Mapping table
1185
       :header-rows: 1
1186
1187
       * - characteristics_code
1188
         - characteristics_text
1189
         - mapping
1190
       * - 1
1191
         - Einpersonenhaushalte (Singlehaushalte)
1192
         - SR; SO
1193
       * - 2
1194
         - Paare ohne Kind(er)
1195
         - PR; PO
1196
       * - 3
1197
         - Paare mit Kind(ern)
1198
         - P1; P2; P3
1199
       * - 4
1200
         - Alleinerziehende Elternteile
1201
         - SK
1202
       * - 5
1203
         - Mehrpersonenhaushalte ohne Kernfamilie
1204
         - OR; OO
1205
1206
    Parameters
1207
    ----------
1208
    df_census_households_grid: pd.DataFrame
1209
        Aggregated zensus household data on 100x100m grid level
1210
    df_census_households_nuts1: pd.DataFrame
1211
        Aggregated zensus household data on NUTS-1 level
1212
1213
    Returns
1214
    -------
1215
    pd.DataFrame
1216
        Number of hh types per census cell
1217
    """
1218
    mapping_zensus_hh_subgroups = {
1219
        1: ["SR", "SO"],
1220
        2: ["PR", "PO"],
1221
        3: ["P1", "P2", "P3"],
1222
        4: ["SK"],
1223
        5: ["OR", "OO"],
1224
    }
1225
1226
    # Calculate fraction of fine household types within subgroup of
1227
    # rough household types
1228
    df_dist_households = df_census_households_nuts1.copy()
1229
    for value in mapping_zensus_hh_subgroups.values():
1230
        df_dist_households.loc[value] = df_census_households_nuts1.loc[
1231
            value
1232
        ].div(df_census_households_nuts1.loc[value].sum())
1233
1234
    # Refine from hh_5types to hh_10types
1235
    df_distribution_nuts0 = pd.DataFrame()
1236
    # Loop over federal states
1237
    for gen, df_nuts1 in df_census_households_grid.groupby("gen"):
1238
        # take subgroup distribution from federal state
1239
        dist_households_nuts1 = df_dist_households[gen]
1240
1241
        df_distribution_nuts1 = pd.DataFrame()
1242
        # loop over hh_5types as cluster
1243
        for (
1244
            hh_5type_cluster,
1245
            hh_10types_cluster,
1246
        ) in mapping_zensus_hh_subgroups.items():
1247
            # get census household of hh_5type and federal state
1248
            df_group = df_nuts1.loc[
1249
                df_nuts1["characteristics_code"] == hh_5type_cluster
1250
            ]
1251
1252
            # apply proportionate allocation function within cluster
1253
            df_distribution_group = proportionate_allocation(
1254
                df_group, dist_households_nuts1, hh_10types_cluster
1255
            )
1256
            df_distribution_group["characteristics_code"] = hh_5type_cluster
1257
            df_distribution_nuts1 = pd.concat(
1258
                [df_distribution_nuts1, df_distribution_group],
1259
                ignore_index=True,
1260
            )
1261
1262
        df_distribution_nuts0 = pd.concat(
1263
            [df_distribution_nuts0, df_distribution_nuts1], ignore_index=True
1264
        )
1265
1266
    df_census_households_grid_refined = df_census_households_grid.merge(
1267
        df_distribution_nuts0,
1268
        how="inner",
1269
        left_on=["cell_id", "characteristics_code"],
1270
        right_on=["cell_id", "characteristics_code"],
1271
    )
1272
1273
    df_census_households_grid_refined[
1274
        "characteristics_code"
1275
    ] = df_census_households_grid_refined["characteristics_code"].astype(int)
1276
    df_census_households_grid_refined[
1277
        "hh_5types"
1278
    ] = df_census_households_grid_refined["hh_5types"].astype(int)
1279
    df_census_households_grid_refined[
1280
        "hh_10types"
1281
    ] = df_census_households_grid_refined["hh_10types"].astype(int)
1282
1283
    return df_census_households_grid_refined
1284
1285
1286
def get_cell_demand_profile_ids(df_cell, pool_size):
1287
    """
1288
    Generates tuple of hh_type and zensus cell ids
1289
1290
    Takes a random sample of profile ids for given cell:
1291
      * if pool size >= sample size: without replacement
1292
      * if pool size < sample size: with replacement
1293
1294
1295
    Parameters
1296
    ----------
1297
    df_cell: pd.DataFrame
1298
        Household type information for a single zensus cell
1299
    pool_size: int
1300
        Number of available profiles to select from
1301
1302
    Returns
1303
    -------
1304
    list of tuple
1305
        List of (`hh_type`, `profile_id`)
1306
1307
    """
1308
    # maybe use instead
1309
    # np.random.default_rng().integers(low=0, high=pool_size[hh_type], size=sq)
1310
    # instead of random.sample use random.choices() if with replacement
1311
    # list of sample ids per hh_type in cell
1312
    cell_profile_ids = [
1313
        (hh_type, random.sample(range(pool_size[hh_type]), k=sq))
1314
        if pool_size[hh_type] >= sq
1315
        else (hh_type, random.choices(range(pool_size[hh_type]), k=sq))
1316
        for hh_type, sq in zip(
1317
            df_cell["hh_type"],
1318
            df_cell["hh_10types"],
1319
        )
1320
    ]
1321
1322
    # format to lists of tuples (hh_type, id)
1323
    cell_profile_ids = [
1324
        list(zip(cycle([hh_type]), ids)) for hh_type, ids in cell_profile_ids
1325
    ]
1326
    # reduce to list
1327
    cell_profile_ids = [a for b in cell_profile_ids for a in b]
1328
1329
    return cell_profile_ids
1330
1331
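# Illustrative sketch of the sampling rule (toy values): profiles are drawn
# without replacement while the pool of a household type is large enough,
# otherwise with replacement:
#
#     pool_size = {"SR": 100}
#     ids = random.sample(range(pool_size["SR"]), k=3)     # pool >= sample size
#     ids = random.choices(range(pool_size["SR"]), k=300)  # pool < sample size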
1332
# can be parallelized with grouping df_zensus_cells by grid_id/nuts3/nuts1
1333
def assign_hh_demand_profiles_to_cells(df_zensus_cells, df_iee_profiles):
1334
    """
1335
    Assign household demand profiles to each census cell.
1336
1337
    A table including the demand profile ids for each cell is created by using
1338
    :func:`get_cell_demand_profile_ids`. Household profiles are randomly
1339
    sampled for each cell. The profiles are not replaced to the pool within
1340
    a cell but after.
1341
1342
    Parameters
1343
    ----------
1344
    df_zensus_cells: pd.DataFrame
1345
        Household type parameters. Each row representing one household. Hence,
1346
        multiple rows per zensus cell.
1347
    df_iee_profiles: pd.DataFrame
1348
        Household load profile data
1349
1350
        * Index: Time steps as serial integers
1351
        * Columns: pd.MultiIndex with (`HH_TYPE`, `id`)
1352
1353
    Returns
1354
    -------
1355
    pd.DataFrame
1356
        Tabular data with one row represents one zensus cell.
1357
        The column `cell_profile_ids` contains
1358
        a list of tuples (see :func:`get_cell_demand_profile_ids`) providing a
1359
        reference to the actual load profiles that are associated with this
1360
        cell.
1361
    """
1362
1363
    df_hh_profiles_in_census_cells = pd.DataFrame(
1364
        index=df_zensus_cells.grid_id.unique(),
1365
        columns=[
1366
            "cell_profile_ids",
1367
            "cell_id",
1368
            "nuts3",
1369
            "nuts1",
1370
            "factor_2035",
1371
            "factor_2050",
1372
        ],
1373
    )
1374
1375
    df_hh_profiles_in_census_cells = (
1376
        df_hh_profiles_in_census_cells.rename_axis("grid_id")
1377
    )
1378
1379
    pool_size = df_iee_profiles.groupby(level=0, axis=1).size()
1380
1381
    # only use non zero entries
1382
    df_zensus_cells = df_zensus_cells.loc[df_zensus_cells["hh_10types"] != 0]
1383
    for grid_id, df_cell in df_zensus_cells.groupby(by="grid_id"):
1384
        # random sampling of household profiles for each cell
1385
        # with or without replacement (see :func:`get_cell_demand_profile_ids`)
1386
        # within cell but after number of households are rounded to the nearest
1387
        # integer if float this results in a small deviation for the course of
1388
        # the aggregated profiles.
1389
        cell_profile_ids = get_cell_demand_profile_ids(df_cell, pool_size)
1390
1391
        df_hh_profiles_in_census_cells.at[grid_id, "cell_id"] = df_cell.loc[
1392
            :, "cell_id"
1393
        ].unique()[0]
1394
        df_hh_profiles_in_census_cells.at[
1395
            grid_id, "cell_profile_ids"
1396
        ] = cell_profile_ids
1397
        df_hh_profiles_in_census_cells.at[grid_id, "nuts3"] = df_cell.loc[
1398
            :, "nuts3"
1399
        ].unique()[0]
1400
        df_hh_profiles_in_census_cells.at[grid_id, "nuts1"] = df_cell.loc[
1401
            :, "nuts1"
1402
        ].unique()[0]
1403
1404
    return df_hh_profiles_in_census_cells
1405
1406
1407
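# Illustrative shape of the result of assign_hh_demand_profiles_to_cells above
# (all values are made up): one row per populated cell, indexed by grid_id,
# e.g.
#
#   cell_profile_ids   [("SR", 7), ("SR", 1), ("PR", 0), ...]
#   cell_id            123456
#   nuts3              "DEF03"
#   nuts1              "DEF"
#   factor_2035        NaN   # filled by adjust_to_demand_regio_nuts3_annual
#   factor_2050        NaN
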
# can be parallelized with grouping df_zensus_cells by grid_id/nuts3/nuts1
def adjust_to_demand_regio_nuts3_annual(
    df_hh_profiles_in_census_cells, df_iee_profiles, df_demand_regio
):
    """
    Computes the profile scaling factor for alignment to demand regio data

    The scaling factor can be used to re-scale each load profile such that the
    sum of all load profiles within one NUTS-3 area equals the annual demand
    of the demand regio data.

    Parameters
    ----------
    df_hh_profiles_in_census_cells: pd.DataFrame
        Result of :func:`assign_hh_demand_profiles_to_cells`.
    df_iee_profiles: pd.DataFrame
        Household load profile data

        * Index: Time steps as serial integers
        * Columns: pd.MultiIndex with (`HH_TYPE`, `id`)

    df_demand_regio: pd.DataFrame
        Annual demand by demand regio for each NUTS-3 region and scenario year.
        Index is pd.MultiIndex with :code:`tuple(scenario_year, nuts3_code)`.

    Returns
    -------
    pd.DataFrame
        Returns the same data as :func:`assign_hh_demand_profiles_to_cells`,
        but with the `factor_{year}` columns filled for the configured
        scenario years.
    """
    for nuts3_id, df_nuts3 in df_hh_profiles_in_census_cells.groupby(
        by="nuts3"
    ):
        nuts3_cell_ids = df_nuts3.index
        nuts3_profile_ids = df_nuts3.loc[:, "cell_profile_ids"].sum()

        # take all profiles of one nuts3 region, aggregate and sum
        # profiles in Wh
        nuts3_profiles_sum_annual = (
            df_iee_profiles.loc[:, nuts3_profile_ids].sum().sum()
        )

        # Scaling Factor
        # ##############
        # demand regio in MWh
        # profiles in Wh

        for scn in egon.data.config.settings()["egon-data"]["--scenarios"]:
            year = get_scenario_year(scn)
            df_hh_profiles_in_census_cells.loc[
                nuts3_cell_ids, f"factor_{year}"
            ] = (
                df_demand_regio.loc[(year, nuts3_id), "demand_mwha"]
                * 1e3
                / (nuts3_profiles_sum_annual / 1e3)
            )

    return df_hh_profiles_in_census_cells

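# Worked example of the scaling factor above (numbers are invented):
# demand regio reports 2,000 MWh/a for a NUTS-3 region and the summed raw
# profiles of that region amount to 1.6e9 Wh/a. Both terms end up in kWh:
#
#   factor = (2_000 MWh * 1e3) / (1.6e9 Wh / 1e3)
#          = 2.0e6 kWh / 1.6e6 kWh
#          = 1.25
#
# i.e. every profile of that region is scaled up by 25 % for this scenario
# year.
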
def get_load_timeseries(
    df_iee_profiles,
    df_hh_profiles_in_census_cells,
    cell_ids,
    year,
    aggregate=True,
    peak_load_only=False,
):
    """
    Get the load time series or peak load for one load area in MWh

    The load is calculated in an aggregated manner for a group of zensus
    cells that belong to one load area (defined by `cell_ids`).

    Parameters
    ----------
    df_iee_profiles: pd.DataFrame
        Household load profile data in Wh

        * Index: Time steps as serial integers
        * Columns: pd.MultiIndex with (`HH_TYPE`, `id`)

        Used to calculate the load from.
    df_hh_profiles_in_census_cells: pd.DataFrame
        Return value of :func:`adjust_to_demand_regio_nuts3_annual`.
    cell_ids: list
        Zensus cell ids that define one group of zensus cells that belong to
        the same load area.
    year: int
        Scenario year. Used to select the scaling factor (`factor_{year}`)
        that aligns the annual demand to NUTS-3 data.
    aggregate: bool
        If True, all profiles are aggregated
    peak_load_only: bool
        If True, only the peak load value is returned (the type of the return
        value is `float`). Defaults to False which returns the entire time
        series as pd.Series.

    Returns
    -------
    pd.Series or float
        Aggregated time series for given `cell_ids` or peak load of this time
        series in MWh.
    """
    timesteps = len(df_iee_profiles)
    if aggregate:
        full_load = pd.Series(
            data=np.zeros(timesteps), dtype=np.float64, index=range(timesteps)
        )
    else:
        full_load = pd.DataFrame(index=range(timesteps))
    load_area_meta = df_hh_profiles_in_census_cells.loc[
        cell_ids, ["cell_profile_ids", "nuts3", f"factor_{year}"]
    ]
    # loop over nuts3 (part_load) and sum up (full_load) as the scaling factor
    # applies at nuts3 level
    for (nuts3, factor), df in load_area_meta.groupby(
        by=["nuts3", f"factor_{year}"]
    ):
        if aggregate:
            part_load = (
                df_iee_profiles.loc[:, df["cell_profile_ids"].sum()].sum(
                    axis=1
                )
                * factor
                / 1e6
            )  # from Wh to MWh
            full_load = full_load.add(part_load)
        elif not aggregate:
            part_load = (
                df_iee_profiles.loc[:, df["cell_profile_ids"].sum()]
                * factor
                / 1e6
            )  # from Wh to MWh
            full_load = pd.concat([full_load, part_load], axis=1).dropna(
                axis=1
            )
        else:
            raise KeyError("Parameter 'aggregate' needs to be bool value!")
    if peak_load_only:
        full_load = full_load.max()
    return full_load

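# Minimal usage sketch (cell ids are placeholders; assumes the scaled mapping
# produced by the functions above, i.e. the `factor_{year}` column is filled):
#
#   >>> peak = get_load_timeseries(
#   ...     df_iee_profiles=df_iee_profiles,
#   ...     df_hh_profiles_in_census_cells=df_hh_profiles_in_census_cells,
#   ...     cell_ids=[1, 2, 3],
#   ...     year=2035,
#   ...     peak_load_only=True,
#   ... )
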
def write_refinded_households_to_db(df_census_households_grid_refined):
    """Write refined census household data to the database."""
    # Write allocation table into database
    EgonDestatisZensusHouseholdPerHaRefined.__table__.drop(
        bind=engine, checkfirst=True
    )
    EgonDestatisZensusHouseholdPerHaRefined.__table__.create(
        bind=engine, checkfirst=True
    )

    with db.session_scope() as session:
        session.bulk_insert_mappings(
            EgonDestatisZensusHouseholdPerHaRefined,
            df_census_households_grid_refined.to_dict(orient="records"),
        )


def houseprofiles_in_census_cells():
    """
    Allocate household electricity demand profiles for each census cell.

    Creates table `demand.egon_household_electricity_profile_in_census_cell`
    that maps household electricity demand profiles to census cells. Each row
    represents one cell and contains a list of profile IDs. This table is
    fundamental for creating subsequent data like demand profiles on MV grid
    level or for determining the peak load at load area level.

    Use :func:`get_houseprofiles_in_census_cells` to retrieve the data from
    the database as a pandas.DataFrame.

    """

    def gen_profile_names(n):
        """Join (type, number) into profile name: (str)a(zero-padded 5-digit int)"""
        a = f"{n[0]}a{int(n[1]):05d}"
        return a

    # Init random generators using global seed
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    # Read demand profiles from egon-data-bundle
    df_iee_profiles = get_iee_hh_demand_profiles_raw()

    # Write raw profiles into db
    write_hh_profiles_to_db(df_iee_profiles)

    # Process profiles for further use
    df_iee_profiles = set_multiindex_to_profiles(df_iee_profiles)

    # Download zensus household NUTS-1 data with family type and age categories
    df_census_households_nuts1_raw = get_census_households_nuts1_raw()

    # Reduce age intervals and remove kids
    df_census_households_nuts1 = process_nuts1_census_data(
        df_census_households_nuts1_raw
    )

    # Regroup data to be compatible with categories from demand profile
    # generator.
    df_census_households_nuts1 = regroup_nuts1_census_data(
        df_census_households_nuts1
    )

    # Convert data from people living in households to households
    # using a specified number of inhabitants per household type
    df_census_households_nuts1 = inhabitants_to_households(
        df_census_households_nuts1
    )

    # Query census household grid data with family type
    df_census_households_grid = get_census_households_grid()

    # Fill cells that have a population but no household distribution with the
    # household distribution of a random cell with the same population value
    df_census_households_grid = impute_missing_hh_in_populated_cells(
        df_census_households_grid
    )

    # Refine census household grid data with additional NUTS-1 level attributes
    df_census_households_grid_refined = refine_census_data_at_cell_level(
        df_census_households_grid, df_census_households_nuts1
    )

    write_refinded_households_to_db(df_census_households_grid_refined)

    # Allocate profile ids to each cell by census data
    df_hh_profiles_in_census_cells = assign_hh_demand_profiles_to_cells(
        df_census_households_grid_refined, df_iee_profiles
    )

    # Annual household electricity demand on NUTS-3 level (demand regio)
    df_demand_regio = db.select_dataframe(
        sql="""
                SELECT year, nuts3, SUM (demand) as demand_mwha
                FROM demand.egon_demandregio_hh as egon_d
                GROUP BY nuts3, year
                ORDER BY year""",
        index_col=["year", "nuts3"],
    )

    # Scale profiles to meet demand regio annual demand projections
    df_hh_profiles_in_census_cells = adjust_to_demand_regio_nuts3_annual(
        df_hh_profiles_in_census_cells, df_iee_profiles, df_demand_regio
    )

    df_hh_profiles_in_census_cells = (
        df_hh_profiles_in_census_cells.reset_index(drop=False)
    )
    df_hh_profiles_in_census_cells["cell_id"] = df_hh_profiles_in_census_cells[
        "cell_id"
    ].astype(int)

    # Cast profile ids back to the initial str format
    df_hh_profiles_in_census_cells[
        "cell_profile_ids"
    ] = df_hh_profiles_in_census_cells["cell_profile_ids"].apply(
        lambda x: list(map(gen_profile_names, x))
    )

    # Write allocation table into database
    HouseholdElectricityProfilesInCensusCells.__table__.drop(
        bind=engine, checkfirst=True
    )
    HouseholdElectricityProfilesInCensusCells.__table__.create(
        bind=engine, checkfirst=True
    )

    with db.session_scope() as session:
        session.bulk_insert_mappings(
            HouseholdElectricityProfilesInCensusCells,
            df_hh_profiles_in_census_cells.to_dict(orient="records"),
        )

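# Illustrative example of the id format produced by the nested helper
# gen_profile_names above (the household type code "SR" is a placeholder):
#
#   >>> gen_profile_names(("SR", 12))
#   'SRa00012'
#
# i.e. the 2-character type, the letter "a" and a zero-padded 5-digit number.
# tuple_format in mv_grid_district_HH_electricity_load below reverses this
# mapping.
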
def get_houseprofiles_in_census_cells():
    """
    Retrieve household electricity demand profile mapping from database

    See Also
    --------
    :func:`houseprofiles_in_census_cells`

    Returns
    -------
    pd.DataFrame
        Mapping of household demand profiles to zensus cells
    """
    with db.session_scope() as session:
        q = session.query(HouseholdElectricityProfilesInCensusCells)

        census_profile_mapping = pd.read_sql(
            q.statement, q.session.bind, index_col="cell_id"
        )

    return census_profile_mapping


def get_cell_demand_metadata_from_db(attribute, list_of_identifiers):
    """
    Retrieve selection of household electricity demand profile mapping

    Parameters
    ----------
    attribute: str
        Attribute to filter the table by

        * nuts3
        * nuts1
        * cell_id

    list_of_identifiers: list of str/int
        nuts3/nuts1 identifiers need to be str,
        cell_id identifiers need to be int

    See Also
    --------
    :func:`houseprofiles_in_census_cells`

    Returns
    -------
    pd.DataFrame
        Selection of mapping of household demand profiles to zensus cells
    """
    attribute_options = ["nuts3", "nuts1", "cell_id"]
    if attribute not in attribute_options:
        raise ValueError(f"attribute has to be one of: {attribute_options}")

    if not isinstance(list_of_identifiers, list):
        raise KeyError("'list_of_identifiers' is not a list!")

    # Query profile ids and scaling factors for specific attributes
    with db.session_scope() as session:
        if attribute == "nuts3":
            cells_query = session.query(
                HouseholdElectricityProfilesInCensusCells.cell_id,
                HouseholdElectricityProfilesInCensusCells.cell_profile_ids,
                HouseholdElectricityProfilesInCensusCells.nuts3,
                HouseholdElectricityProfilesInCensusCells.nuts1,
                HouseholdElectricityProfilesInCensusCells.factor_2035,
                HouseholdElectricityProfilesInCensusCells.factor_2050,
            ).filter(
                HouseholdElectricityProfilesInCensusCells.nuts3.in_(
                    list_of_identifiers
                )
            )
        elif attribute == "nuts1":
            cells_query = session.query(
                HouseholdElectricityProfilesInCensusCells.cell_id,
                HouseholdElectricityProfilesInCensusCells.cell_profile_ids,
                HouseholdElectricityProfilesInCensusCells.nuts3,
                HouseholdElectricityProfilesInCensusCells.nuts1,
                HouseholdElectricityProfilesInCensusCells.factor_2035,
                HouseholdElectricityProfilesInCensusCells.factor_2050,
            ).filter(
                HouseholdElectricityProfilesInCensusCells.nuts1.in_(
                    list_of_identifiers
                )
            )
        elif attribute == "cell_id":
            cells_query = session.query(
                HouseholdElectricityProfilesInCensusCells.cell_id,
                HouseholdElectricityProfilesInCensusCells.cell_profile_ids,
                HouseholdElectricityProfilesInCensusCells.nuts3,
                HouseholdElectricityProfilesInCensusCells.nuts1,
                HouseholdElectricityProfilesInCensusCells.factor_2035,
                HouseholdElectricityProfilesInCensusCells.factor_2050,
            ).filter(
                HouseholdElectricityProfilesInCensusCells.cell_id.in_(
                    list_of_identifiers
                )
            )

    # Static-analysis note: "cells_query does not seem to be defined for all
    # execution paths"; the attribute check above restricts `attribute` to the
    # three handled cases, so the variable is always bound at this point.
    cell_demand_metadata = pd.read_sql(
        cells_query.statement, cells_query.session.bind, index_col="cell_id"
    )
    return cell_demand_metadata

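# Minimal usage sketch (the NUTS-3 code is only an example):
#
#   >>> meta = get_cell_demand_metadata_from_db(
#   ...     attribute="nuts3", list_of_identifiers=["DEF03"]
#   ... )
#   >>> list(meta.columns)
#   ['cell_profile_ids', 'nuts3', 'nuts1', 'factor_2035', 'factor_2050']
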
def get_hh_profiles_from_db(profile_ids):
    """
    Retrieve selection of household electricity demand profiles

    Parameters
    ----------
    profile_ids: list of str
        Profile ids of format (type)a(number), with the number zero-padded
        to 5 digits


    See Also
    --------
    :func:`houseprofiles_in_census_cells`

    Returns
    -------
    pd.DataFrame
         Selection of household demand profiles
    """
    # Query load profiles
    with db.session_scope() as session:
        cells_query = session.query(
            IeeHouseholdLoadProfiles.load_in_wh, IeeHouseholdLoadProfiles.type
        ).filter(IeeHouseholdLoadProfiles.type.in_(profile_ids))

    df_profile_loads = pd.read_sql(
        cells_query.statement, cells_query.session.bind, index_col="type"
    )

    # convert array column to DataFrame (one column per profile)
    df_profile_loads = pd.DataFrame.from_records(
        df_profile_loads["load_in_wh"], index=df_profile_loads.index
    ).T

    return df_profile_loads


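# Sketch of the reshaping step above (shapes are illustrative): the query
# returns one row per profile id with the whole year stored in the
# `load_in_wh` array column; `pd.DataFrame.from_records(...).T` turns this
# into a (timesteps x profiles) frame:
#
#   before:  type -> [w_0, w_1, ..., w_8759]          (one array per row)
#   after:   index 0..8759, one column per profile id, values in Wh
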
def get_demand_regio_hh_profiles_from_db(year):
    """
    Retrieve demand regio household electricity demand profiles at NUTS-3 level

    Parameters
    ----------
    year: int
        Year for which the demand profiles are retrieved

    Returns
    -------
    pd.DataFrame
         Selection of household demand profiles
    """

    # Filter the table by the requested year
    query = """SELECT * FROM demand.demandregio_household_load_profiles
    WHERE year = %(year)s"""

    df_profile_loads = pd.read_sql(
        query, db.engine(), params={"year": year}, index_col="id"
    )

    return df_profile_loads


def mv_grid_district_HH_electricity_load(scenario_name, scenario_year):
    """
    Aggregated household demand time series at HV/MV substation level

    Calculate the aggregated demand time series based on the demand profiles
    of each zensus cell inside each MV grid district. Profiles are read from
    a local hdf5 file or, for the SLP method, built from demand regio time
    series per NUTS-3 region in the database.
    Creates table `demand.egon_etrago_electricity_households` with
    household electricity demand profiles aggregated at MV grid district level
    in MWh. Primarily used to create the eTraGo data model.

    Parameters
    ----------
    scenario_name: str
        Scenario name identifier, i.e. "eGon2035"
    scenario_year: int
        Scenario year according to `scenario_name`

    Returns
    -------
    None
        The aggregated profiles are written to table
        `demand.egon_etrago_electricity_households`.
    """

    def tuple_format(x):
        """Convert profile ids from string to tuple (type, id)
        Convert from (str)a(5-digit int) to (str), (int)
        """
        return x[:2], int(x[3:])

    with db.session_scope() as session:
        cells_query = session.query(
            HouseholdElectricityProfilesInCensusCells,
            MapZensusGridDistricts.bus_id,
        ).join(
            MapZensusGridDistricts,
            HouseholdElectricityProfilesInCensusCells.cell_id
            == MapZensusGridDistricts.zensus_population_id,
        )

    cells = pd.read_sql(
        cells_query.statement, cells_query.session.bind, index_col="cell_id"
    )

    method = egon.data.config.settings()["egon-data"][
        "--household-electrical-demand-source"
    ]

    if method == "slp":
        # Import demand regio timeseries demand per nuts3 area
        dr_series = pd.read_sql_query(
            """
            SELECT year, nuts3, load_in_mwh
            FROM demand.demandregio_household_load_profiles
            """,
            con=engine,
        )
        dr_series = dr_series[dr_series["year"] == scenario_year]
        dr_series.drop(columns=["year"], inplace=True)
        dr_series.set_index("nuts3", inplace=True)
        dr_series = dr_series.squeeze()

        # Population data per cell_id is used to scale the demand per nuts3
        population = pd.read_sql_query(
            """
            SELECT grid_id, population
            FROM society.destatis_zensus_population_per_ha
            """,
            con=engine,
        )
        population.set_index("grid_id", inplace=True)
        population = population.squeeze()
        population.loc[population == -1] = 0

        cells["population"] = cells["grid_id"].map(population)

        factor_column = f"factor_{scenario_year}"

        mvgd_profiles = pd.DataFrame(
            columns=["p_set", "q_set"], index=cells.bus_id.unique()
        )
        mvgd_profiles.index.name = "bus_id"

        for nuts3, df in cells.groupby("nuts3"):
            cells.loc[df.index, factor_column] = (
                df["population"] / df["population"].sum()
            )

        for bus, df_bus in cells.groupby("bus_id"):
            load_nuts = [0] * 8760
            for nuts3, df_nuts in df_bus.groupby("nuts3"):
                factor_nuts = df_nuts[factor_column].sum()
                total_load = [x * factor_nuts for x in dr_series[nuts3]]
                load_nuts = [sum(x) for x in zip(load_nuts, total_load)]
            mvgd_profiles.at[bus, "p_set"] = load_nuts

        mvgd_profiles.reset_index(inplace=True)

    elif method == "bottom-up-profiles":
        # convert profile ids to tuple (type, id) format
        cells["cell_profile_ids"] = cells["cell_profile_ids"].apply(
            lambda x: list(map(tuple_format, x))
        )

        # Read demand profiles from egon-data-bundle
        df_iee_profiles = get_iee_hh_demand_profiles_raw()

        # Process profiles for further use
        df_iee_profiles = set_multiindex_to_profiles(df_iee_profiles)

        # Create aggregated load profile for each MV grid district
        mvgd_profiles_dict = {}
        for grid_district, data in cells.groupby("bus_id"):
            mvgd_profile = get_load_timeseries(
                df_iee_profiles=df_iee_profiles,
                df_hh_profiles_in_census_cells=data,
                cell_ids=data.index,
                year=scenario_year,
                peak_load_only=False,
            )
            mvgd_profiles_dict[grid_district] = [mvgd_profile.round(3).to_list()]
        mvgd_profiles = pd.DataFrame.from_dict(mvgd_profiles_dict, orient="index")

        # Reshape data: move the MV grid ids from the index into a column
        mvgd_profiles = mvgd_profiles.reset_index()
        mvgd_profiles.columns = ["bus_id", "p_set"]

    # Add remaining columns
    mvgd_profiles["scn_name"] = scenario_name

    # Insert data into respective database table
    # Static-analysis note: "mvgd_profiles does not seem to be defined for all
    # execution paths"; it is only created in the "slp" and
    # "bottom-up-profiles" branches above.
    mvgd_profiles.to_sql(
        name=EgonEtragoElectricityHouseholds.__table__.name,
        schema=EgonEtragoElectricityHouseholds.__table__.schema,
        con=engine,
        if_exists="append",
        method="multi",
        chunksize=10000,
        index=False,
    )

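# Worked example of the SLP branch above (numbers are invented): a bus serves
# two cells of one NUTS-3 region with populations 100 and 300 out of a
# regional total of 1000, so factor_column holds 0.1 and 0.3 and
# factor_nuts = 0.4. With the hourly regional series dr_series[nuts3], the
# bus profile becomes
#
#   p_set[t] = 0.4 * dr_series[nuts3][t]        for t in 0..8759
#
# summed over all NUTS-3 regions that intersect the grid district.
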
def get_scaled_profiles_from_db(
    attribute, list_of_identifiers, year, aggregate=True, peak_load_only=False
):
    """Retrieve selection of scaled household electricity demand profiles

    Parameters
    ----------
    attribute: str
        Attribute to filter the table by

        * nuts3
        * nuts1
        * cell_id

    list_of_identifiers: list of str/int
        nuts3/nuts1 identifiers need to be str,
        cell_id identifiers need to be int

    year: int
         * 2035
         * 2050

    aggregate: bool
        If True, all profiles are summed. This uses a lot of RAM if a high
        attribute level is chosen

    peak_load_only: bool
        If True, only the peak load value is returned

    Notes
    -----
    Setting `aggregate=False` can use a lot of RAM if many profiles are
    selected


    Returns
    -------
    pd.Series or float
        Aggregated time series for given `cell_ids` or peak load of this time
        series in MWh.
    """
    cell_demand_metadata = get_cell_demand_metadata_from_db(
        attribute=attribute, list_of_identifiers=list_of_identifiers
    )
    profile_ids = cell_demand_metadata.cell_profile_ids.sum()

    df_iee_profiles = get_hh_profiles_from_db(profile_ids)

    scaled_profiles = get_load_timeseries(
        df_iee_profiles=df_iee_profiles,
        df_hh_profiles_in_census_cells=cell_demand_metadata,
        cell_ids=cell_demand_metadata.index.to_list(),
        year=year,
        aggregate=aggregate,
        peak_load_only=peak_load_only,
    )
    return scaled_profiles
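

# Minimal usage sketch (identifiers and year are only examples):
#
#   >>> peak = get_scaled_profiles_from_db(
#   ...     attribute="nuts3",
#   ...     list_of_identifiers=["DEF03"],
#   ...     year=2035,
#   ...     aggregate=True,
#   ...     peak_load_only=True,
#   ... )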
2043