shared.prepare_input_data() - Code Metrics - Inspection of "Examples timeseries retcon paper" - oemof/oemof-solph - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — dev (#1226)

unknown

created 2025-12-15 14:57 UTC

shared.prepare_input_data() B

↳ Parent: shared

Complexity

Conditions

Size

Total Lines	121
Code Lines	77

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	77
dl	0
loc	121
rs	7.286
c	0
b	0
f	0
cc	5
nop	1

How to fix Long Method

"""
SPDX-FileCopyrightText: Patrik Schönfeldt
SPDX-FileCopyrightText: DLR e.V.

SPDX-License-Identifier: MIT
"""

from pathlib import Path
from urllib.request import urlretrieve

import demandlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from workalendar.europe import Germany


def _download_if_missing(url, target):
    """Download *url* to *target* unless *target* already exists."""
    if target.exists():
        return
    # Fetch into a sibling ".part" file and rename afterwards. Writing
    # directly to *target* would leave a truncated file behind when the
    # download is interrupted, and later runs would silently reuse it.
    partial = target.with_name(target.name + ".part")
    urlretrieve(url, partial)
    partial.replace(target)


def _read_epoch_indexed_csv(csv_file, index_col):
    """Read a CSV and convert its Unix-epoch (seconds) index to UTC."""
    df = pd.read_csv(csv_file, index_col=index_col)
    df.index = pd.to_datetime(df.index, unit="s", utc=True)
    return df


def _regularise_to_5min(df):
    """Return *df* on a gap-free 5-minute grid with numeric gaps filled."""
    # 1) Replace duplicated time stamps by the mean of their rows.
    df = df.groupby(df.index).mean()

    # 2) Build a regular 5-minute index (keeping the time zone).
    full_idx = pd.date_range(
        start=df.index.min(),
        end=df.index.max(),
        freq="5min",
        tz=df.index.tz,
    )

    # 3) Reindex to the 5-minute grid -> gaps become NaN.
    df_regular = df.reindex(full_idx)

    # 4) Time-based interpolation (respects the time distances in the
    #    index), applied to numeric columns only.
    num_cols = df_regular.select_dtypes(include="number").columns
    df_regular[num_cols] = df_regular[num_cols].interpolate(method="time")

    # 5) Close the edges that lack neighbours on both sides via ffill/bfill.
    df_regular[num_cols] = df_regular[num_cols].ffill().bfill()

    return df_regular


def _add_heat_demand(df_temperature):
    """Add the heat demand columns to *df_temperature* (in place)."""
    building_area = 120  # m² (from publication)
    specific_heat_demand = 60  # kWh/m²/a  (educated guess)
    holidays = dict(Germany().holidays(2019))

    # We estimate the heat demand from the ambient temperature using demandlib.
    # This returns energy per time step in units of kWh.
    df_temperature["heat demand (kWh)"] = demandlib.bdew.HeatBuilding(
        df_temperature.index,
        holidays=holidays,
        temperature=df_temperature["Air Temperature (°C)"],
        shlp_type="EFH",
        building_class=1,
        wind_class=1,
        annual_heat_demand=building_area * specific_heat_demand,
        name="EFH",
    ).get_bdew_profile()

    # Energy per 5-minute step (kWh) -> average power (W):
    # kWh * 1000 = Wh, divided by the step length of 5/60 h.
    df_temperature["heat demand (W)"] = (
        df_temperature["heat demand (kWh)"] * 1e3 / (5 / 60)
    )


def _plot_resampled_pv(df_energy):
    """Plot sorted PV power curves for several resampling resolutions."""
    resolutions = [
        "1 min",
        "5 min",
        "10 min",
        "15 min",
        "30 min",
        "1 h",
        "2 h",
        "3 h",
        "6 h",
    ]

    for resolution in resolutions:
        p_pv = df_energy["PV (W)"].resample(resolution).mean()
        # Values are sorted in descending order (in kW) and spread over
        # an x axis running from 0 to 8760.
        plt.plot(
            np.linspace(0, 8760, len(p_pv)),
            sorted(p_pv / 1e3)[::-1],
            label=resolution,
        )

    # Zoom in on the high-power end of the sorted curves.
    plt.xlim(-10, 510)
    plt.ylim(7, 16)
    plt.legend()
    plt.show()


def prepare_input_data(plot_resampling=False):
    """Load the temperature and energy time series used by the example.

    Both CSV files are downloaded next to this module on first use and
    read from there afterwards. The temperature data is put on a regular
    5-minute grid and extended by the columns "heat demand (kWh)" and
    "heat demand (W)", which are estimated with demandlib.

    Parameters
    ----------
    plot_resampling : bool, default False
        If True, additionally show a plot of the sorted PV power for
        several resampling resolutions.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_temperature, df_energy)``, both indexed by UTC time stamps.
    """
    url_temperature = (
        "https://oemof.org/wp-content/uploads/2025/12/temperature.csv"
    )
    url_energy = "https://oemof.org/wp-content/uploads/2025/12/energy.csv"

    print(
        "Data is licensed from M. Schlemminger, T. Ohrdes, E. Schneider,"
        " and M. Knoop. Under Creative Commons Attribution 4.0 International"
        " License. It is also available at doi: 10.5281/zenodo.5642902."
        " (We use single family home 26 plus the south-facing PV"
        " from that dataset.)"
    )

    file_path = Path(__file__).parent

    temperature_file = Path(file_path, "temperature.csv")
    _download_if_missing(url_temperature, temperature_file)
    df_temperature = _read_epoch_indexed_csv(temperature_file, "Unix Epoch")
    df_temperature = _regularise_to_5min(df_temperature)
    _add_heat_demand(df_temperature)

    energy_file = Path(file_path, "energy.csv")
    _download_if_missing(url_energy, energy_file)
    df_energy = _read_epoch_indexed_csv(energy_file, 0)

    if plot_resampling:
        _plot_resampled_pv(df_energy)

    return df_temperature, df_energy


if __name__ == "__main__":
    # When run as a script, load the data and show the resampling plot.
    prepare_input_data(plot_resampling=True)


1			"""
2			SPDX-FileCopyrightText: Patrik Schönfeldt
3			SPDX-FileCopyrightText: DLR e.V.
4
5			SPDX-License-Identifier: MIT
6			"""
7
8			from pathlib import Path
9			from urllib.request import urlretrieve
10
11			import demandlib
12			import matplotlib.pyplot as plt
13			import numpy as np
14			import pandas as pd
15			from workalendar.europe import Germany
16
17
18			def prepare_input_data(plot_resampling=False):
19			url_temperature = (
20			"https://oemof.org/wp-content/uploads/2025/12/temperature.csv"
21			)
22			url_energy = "https://oemof.org/wp-content/uploads/2025/12/energy.csv"
23
24			print(
25			"Data is licensed from M. Schlemminger, T. Ohrdes, E. Schneider,"
26			" and M. Knoop. Under Creative Commons Attribution 4.0 International"
27			" License. It is also available at doi: 10.5281/zenodo.5642902."
28			" (We use single family home 26 plus the south-facing PV"
29			" from that dataset.)"
30			)
31
32			file_path = Path(__file__).parent
33
34			temperature_file = Path(file_path, "temperature.csv")
35			if not temperature_file.exists():
36			urlretrieve(url_temperature, temperature_file)
37			df_temperature = pd.read_csv(
38			temperature_file,
39			index_col="Unix Epoch",
40			)
41
42			df_temperature.index = pd.to_datetime(
43			df_temperature.index,
44			unit="s",
45			utc=True,
46			)
47
48			# ----- clean up data --------------------------------------------------------------
49			# 1) Duplikate durch Mittelwert ersetzen
50			df_temperature = df_temperature.groupby(df_temperature.index).mean()
51
52			# 2) Regulären 5-Minuten-Index erzeugen (Zeitzone erhalten)
53			tz = df_temperature.index.tz
54			full_idx = pd.date_range(
55			start=df_temperature.index.min(),
56			end=df_temperature.index.max(),
57			freq="5min",
58			tz=tz,
59			)
60
61			# 3) Auf 5-Minuten-Raster reindizieren -> Lücken werden NaN
62			df_regular = df_temperature.reindex(full_idx)
63
64			# 4) Zeitbasierte Interpolation nur für numerische Spalten
65			num_cols = df_regular.select_dtypes(include="number").columns
66
67			# Interpolation (zeitbasiert: berücksichtigt die Zeitabstände im Index)
68			df_regular[num_cols] = df_regular[num_cols].interpolate(method="time")
69
70			# 5) Ränder ohne beidseitige Nachbarn per ffill/bfill schließen
71			df_regular[num_cols] = df_regular[num_cols].ffill().bfill()
72
73			df_temperature = df_regular
74
75			# -------------------------------------------
76
77			building_area = 120 # m² (from publication)
78			specific_heat_demand = 60 # kWh/m²/a (educated guess)
79			holidays = dict(Germany().holidays(2019))
80
81			# We estimate the heat demand from the ambient temperature using demandlib.
82			# This returns energy per time step in units of kWh.
83			df_temperature["heat demand (kWh)"] = demandlib.bdew.HeatBuilding(
84			df_temperature.index,
85			holidays=holidays,
86			temperature=df_temperature["Air Temperature (°C)"],
87			shlp_type="EFH",
88			building_class=1,
89			wind_class=1,
90			annual_heat_demand=building_area * specific_heat_demand,
91			name="EFH",
92			).get_bdew_profile()
93
94			df_temperature["heat demand (W)"] = (
95			df_temperature["heat demand (kWh)"] * 1e3 / (5 / 60)
96			)
97
98			energy_file = Path(file_path, "energy.csv")
99			if not energy_file.exists():
100			urlretrieve(url_energy, energy_file)
101			df_energy = pd.read_csv(
102			energy_file,
103			index_col=0,
104			)
105			df_energy.index = pd.to_datetime(
106			df_energy.index,
107			unit="s",
108			utc=True,
109			)
110
111			if plot_resampling:
112			p_pv = {}
113			resolutions = [
114			"1 min",
115			"5 min",
116			"10 min",
117			"15 min",
118			"30 min",
119			"1 h",
120			"2 h",
121			"3 h",
122			"6 h",
123			]
124
125			for resolution in resolutions:
126			p_pv[resolution] = df_energy["PV (W)"].resample(resolution).mean()
127			plt.plot(
128			np.linspace(0, 8760, len(p_pv[resolution])),
129			sorted(p_pv[resolution] / 1e3)[::-1],
130			label=resolution,
131			)
132
133			plt.xlim(-10, 510)
134			plt.ylim(7, 16)
135			plt.legend()
136			plt.show()
137
138			return df_temperature, df_energy
139
140
141			if __name__ == "__main__":
142			prepare_input_data(plot_resampling=True)
143

oemof / oemof-solph

Pull Request — dev (#1226)

shared.prepare_input_data() B

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like