Code Duplication - openego/eGon-data - Measure and Improve Code Quality continuously with Scrutinizer

Code Duplication Length = 90-91 lines in 2 locations

src/egon/data/validation/rules/custom/sanity/residential_electricity.py 2 locations


            )


class ResidentialElectricityHhRefinement(DataFrameRule):
    """Validate aggregated household types after refinement match original census values.

    Checks sum of aggregated household types after refinement method
    was applied and compares it to the original census values.

    Args:
        table: Primary table being validated (society.egon_destatis_zensus_household_per_ha_refined)
        rule_id: Unique identifier for this validation rule
        rtol: Relative tolerance for comparison (default: 1e-5 = 0.001%)

    Example:
        >>> validation = {
        ...     "data_quality": [
        ...         ResidentialElectricityHhRefinement(
        ...             table="society.egon_destatis_zensus_household_per_ha_refined",
        ...             rule_id="SANITY_RESIDENTIAL_HH_REFINEMENT",
        ...             rtol=1e-5
        ...         )
        ...     ]
        ... }
    """

    def __init__(self, table: str, rule_id: str, rtol: float = 1e-5, **kwargs):
        super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs)
        self.kind = "sanity"

    def get_query(self, ctx):
        return """
        SELECT refined.nuts3, refined.characteristics_code,
                refined.sum_refined::int, census.sum_census::int
        FROM(
            SELECT nuts3, characteristics_code, SUM(hh_10types) as sum_refined
            FROM society.egon_destatis_zensus_household_per_ha_refined
            GROUP BY nuts3, characteristics_code)
            AS refined
        JOIN(
            SELECT t.nuts3, t.characteristics_code, sum(orig) as sum_census
            FROM(
                SELECT nuts3, cell_id, characteristics_code,
                        sum(DISTINCT(hh_5types))as orig
                FROM society.egon_destatis_zensus_household_per_ha_refined
                GROUP BY cell_id, characteristics_code, nuts3) AS t
            GROUP BY t.nuts3, t.characteristics_code    ) AS census
        ON refined.nuts3 = census.nuts3
        AND refined.characteristics_code = census.characteristics_code
        """

    def evaluate_df(self, df, ctx):
        rtol = self.params.get("rtol", 1e-5)

        try:
            np.testing.assert_allclose(
                actual=df["sum_refined"],
                desired=df["sum_census"],
                rtol=rtol,
                verbose=False,
            )

            max_diff = ((df["sum_refined"] - df["sum_census"]) / df["sum_census"]).abs().max()

            return RuleResult(
                rule_id=self.rule_id,
                task=self.task,
                table=self.table,
                kind=self.kind,
                success=True,
                observed=float(max_diff),
                expected=rtol,
                message=f"All aggregated household types match at NUTS-3 (max deviation: {max_diff:.6%}, tolerance: {rtol:.6%})",
                schema=self.schema,
                table_name=self.table_name,
                rule_class=self.__class__.__name__
            )
        except AssertionError:
            max_diff = ((df["sum_refined"] - df["sum_census"]) / df["sum_census"]).abs().max()
            violations = df[~np.isclose(df["sum_refined"], df["sum_census"], rtol=rtol)]

            return RuleResult(
                rule_id=self.rule_id,
                task=self.task,
                table=self.table,
                kind=self.kind,
                success=False,
                observed=float(max_diff),
                expected=rtol,
                message=f"Household refinement mismatch: max deviation {max_diff:.6%} exceeds tolerance {rtol:.6%}. {len(violations)} NUTS-3/characteristic combinations have mismatches.",
                severity=Severity.ERROR,
                schema=self.schema,
                table_name=self.table_name,
                rule_class=self.__class__.__name__
            )


import numpy as np


class ResidentialElectricityAnnualSum(DataFrameRule):
    """Validate aggregated annual residential electricity demand matches DemandRegio at NUTS-3.

    Aggregates the annual demand of all census cells at NUTS3 to compare
    with initial scaling parameters from DemandRegio.

    Args:
        table: Primary table being validated (demand.egon_demandregio_zensus_electricity)
        rule_id: Unique identifier for this validation rule
        rtol: Relative tolerance for comparison (default: 0.005 = 0.5%)

    Example:
        >>> validation = {
        ...     "data_quality": [
        ...         ResidentialElectricityAnnualSum(
        ...             table="demand.egon_demandregio_zensus_electricity",
        ...             rule_id="SANITY_RESIDENTIAL_ELECTRICITY_ANNUAL_SUM",
        ...             rtol=0.005
        ...         )
        ...     ]
        ... }
    """

    def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs):
        super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs)
        self.kind = "sanity"  # Override inferred kind

    def get_query(self, ctx):
        return """
        SELECT dr.nuts3, dr.scenario, dr.demand_regio_sum, profiles.profile_sum
        FROM (
            SELECT scenario, SUM(demand) AS profile_sum, vg250_nuts3
            FROM demand.egon_demandregio_zensus_electricity AS egon,
             boundaries.egon_map_zensus_vg250 AS boundaries
            WHERE egon.zensus_population_id = boundaries.zensus_population_id
            AND sector = 'residential'
            GROUP BY vg250_nuts3, scenario
        ) AS profiles
        JOIN (
            SELECT nuts3, scenario, sum(demand) AS demand_regio_sum
            FROM demand.egon_demandregio_hh
            GROUP BY year, scenario, nuts3
        ) AS dr
        ON profiles.vg250_nuts3 = dr.nuts3 AND profiles.scenario = dr.scenario
        """

    def evaluate_df(self, df, ctx):
        rtol = self.params.get("rtol", 0.005)

        try:
            np.testing.assert_allclose(
                actual=df["profile_sum"],
                desired=df["demand_regio_sum"],
                rtol=rtol,
                verbose=False,
            )

            # Calculate actual max deviation for reporting
            max_diff = ((df["profile_sum"] - df["demand_regio_sum"]) / df["demand_regio_sum"]).abs().max()

            return RuleResult(
                rule_id=self.rule_id,
                task=self.task,
                table=self.table,
                kind=self.kind,
                success=True,
                observed=float(max_diff),
                expected=rtol,
                message=f"Aggregated annual residential electricity demand matches with DemandRegio at NUTS-3 (max deviation: {max_diff:.4%}, tolerance: {rtol:.4%})",
                schema=self.schema,
                table_name=self.table_name,
                rule_class=self.__class__.__name__
            )
        except AssertionError:
            max_diff = ((df["profile_sum"] - df["demand_regio_sum"]) / df["demand_regio_sum"]).abs().max()
            violations = df[~np.isclose(df["profile_sum"], df["demand_regio_sum"], rtol=rtol)]

            return RuleResult(
                rule_id=self.rule_id,
                task=self.task,
                table=self.table,
                kind=self.kind,
                success=False,
                observed=float(max_diff),
                expected=rtol,
                message=f"Demand mismatch: max deviation {max_diff:.4%} exceeds tolerance {rtol:.4%}. {len(violations)} NUTS-3 regions have mismatches.",
                severity=Severity.ERROR,
                schema=self.schema,
                table_name=self.table_name,
                rule_class=self.__class__.__name__
            )



		@@ 100-190 (lines=91) @@
97		)
98
99
100		class ResidentialElectricityHhRefinement(DataFrameRule):
101		"""Validate aggregated household types after refinement match original census values.
102
103		Checks sum of aggregated household types after refinement method
104		was applied and compares it to the original census values.
105
106		Args:
107		table: Primary table being validated (society.egon_destatis_zensus_household_per_ha_refined)
108		rule_id: Unique identifier for this validation rule
109		rtol: Relative tolerance for comparison (default: 1e-5 = 0.001%)
110
111		Example:
112		>>> validation = {
113		... "data_quality": [
114		... ResidentialElectricityHhRefinement(
115		... table="society.egon_destatis_zensus_household_per_ha_refined",
116		... rule_id="SANITY_RESIDENTIAL_HH_REFINEMENT",
117		... rtol=1e-5
118		... )
119		... ]
120		... }
121		"""
122
123		def __init__(self, table: str, rule_id: str, rtol: float = 1e-5, **kwargs):
124		super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs)
125		self.kind = "sanity"
126
127		def get_query(self, ctx):
128		return """
129		SELECT refined.nuts3, refined.characteristics_code,
130		refined.sum_refined::int, census.sum_census::int
131		FROM(
132		SELECT nuts3, characteristics_code, SUM(hh_10types) as sum_refined
133		FROM society.egon_destatis_zensus_household_per_ha_refined
134		GROUP BY nuts3, characteristics_code)
135		AS refined
136		JOIN(
137		SELECT t.nuts3, t.characteristics_code, sum(orig) as sum_census
138		FROM(
139		SELECT nuts3, cell_id, characteristics_code,
140		sum(DISTINCT(hh_5types))as orig
141		FROM society.egon_destatis_zensus_household_per_ha_refined
142		GROUP BY cell_id, characteristics_code, nuts3) AS t
143		GROUP BY t.nuts3, t.characteristics_code ) AS census
144		ON refined.nuts3 = census.nuts3
145		AND refined.characteristics_code = census.characteristics_code
146		"""
147
148		def evaluate_df(self, df, ctx):
149		rtol = self.params.get("rtol", 1e-5)
150
151		try:
152		np.testing.assert_allclose(
153		actual=df["sum_refined"],
154		desired=df["sum_census"],
155		rtol=rtol,
156		verbose=False,
157		)
158
159		max_diff = ((df["sum_refined"] - df["sum_census"]) / df["sum_census"]).abs().max()
160
161		return RuleResult(
162		rule_id=self.rule_id,
163		task=self.task,
164		table=self.table,
165		kind=self.kind,
166		success=True,
167		observed=float(max_diff),
168		expected=rtol,
169		message=f"All aggregated household types match at NUTS-3 (max deviation: {max_diff:.6%}, tolerance: {rtol:.6%})",
170		schema=self.schema,
171		table_name=self.table_name,
172		rule_class=self.__class__.__name__
173		)
174		except AssertionError:
175		max_diff = ((df["sum_refined"] - df["sum_census"]) / df["sum_census"]).abs().max()
176		violations = df[~np.isclose(df["sum_refined"], df["sum_census"], rtol=rtol)]
177
178		return RuleResult(
179		rule_id=self.rule_id,
180		task=self.task,
181		table=self.table,
182		kind=self.kind,
183		success=False,
184		observed=float(max_diff),
185		expected=rtol,
186		message=f"Household refinement mismatch: max deviation {max_diff:.6%} exceeds tolerance {rtol:.6%}. {len(violations)} NUTS-3/characteristic combinations have mismatches.",
187		severity=Severity.ERROR,
188		schema=self.schema,
189		table_name=self.table_name,
190		rule_class=self.__class__.__name__
191		)
192
		@@ 7-96 (lines=90) @@
4		import numpy as np
5
6
7		class ResidentialElectricityAnnualSum(DataFrameRule):
8		"""Validate aggregated annual residential electricity demand matches DemandRegio at NUTS-3.
9
10		Aggregates the annual demand of all census cells at NUTS3 to compare
11		with initial scaling parameters from DemandRegio.
12
13		Args:
14		table: Primary table being validated (demand.egon_demandregio_zensus_electricity)
15		rule_id: Unique identifier for this validation rule
16		rtol: Relative tolerance for comparison (default: 0.005 = 0.5%)
17
18		Example:
19		>>> validation = {
20		... "data_quality": [
21		... ResidentialElectricityAnnualSum(
22		... table="demand.egon_demandregio_zensus_electricity",
23		... rule_id="SANITY_RESIDENTIAL_ELECTRICITY_ANNUAL_SUM",
24		... rtol=0.005
25		... )
26		... ]
27		... }
28		"""
29
30		def __init__(self, table: str, rule_id: str, rtol: float = 0.005, **kwargs):
31		super().__init__(rule_id=rule_id, table=table, rtol=rtol, **kwargs)
32		self.kind = "sanity" # Override inferred kind
33
34		def get_query(self, ctx):
35		return """
36		SELECT dr.nuts3, dr.scenario, dr.demand_regio_sum, profiles.profile_sum
37		FROM (
38		SELECT scenario, SUM(demand) AS profile_sum, vg250_nuts3
39		FROM demand.egon_demandregio_zensus_electricity AS egon,
40		boundaries.egon_map_zensus_vg250 AS boundaries
41		WHERE egon.zensus_population_id = boundaries.zensus_population_id
42		AND sector = 'residential'
43		GROUP BY vg250_nuts3, scenario
44		) AS profiles
45		JOIN (
46		SELECT nuts3, scenario, sum(demand) AS demand_regio_sum
47		FROM demand.egon_demandregio_hh
48		GROUP BY year, scenario, nuts3
49		) AS dr
50		ON profiles.vg250_nuts3 = dr.nuts3 AND profiles.scenario = dr.scenario
51		"""
52
53		def evaluate_df(self, df, ctx):
54		rtol = self.params.get("rtol", 0.005)
55
56		try:
57		np.testing.assert_allclose(
58		actual=df["profile_sum"],
59		desired=df["demand_regio_sum"],
60		rtol=rtol,
61		verbose=False,
62		)
63
64		# Calculate actual max deviation for reporting
65		max_diff = ((df["profile_sum"] - df["demand_regio_sum"]) / df["demand_regio_sum"]).abs().max()
66
67		return RuleResult(
68		rule_id=self.rule_id,
69		task=self.task,
70		table=self.table,
71		kind=self.kind,
72		success=True,
73		observed=float(max_diff),
74		expected=rtol,
75		message=f"Aggregated annual residential electricity demand matches with DemandRegio at NUTS-3 (max deviation: {max_diff:.4%}, tolerance: {rtol:.4%})",
76		schema=self.schema,
77		table_name=self.table_name,
78		rule_class=self.__class__.__name__
79		)
80		except AssertionError:
81		max_diff = ((df["profile_sum"] - df["demand_regio_sum"]) / df["demand_regio_sum"]).abs().max()
82		violations = df[~np.isclose(df["profile_sum"], df["demand_regio_sum"], rtol=rtol)]
83
84		return RuleResult(
85		rule_id=self.rule_id,
86		task=self.task,
87		table=self.table,
88		kind=self.kind,
89		success=False,
90		observed=float(max_diff),
91		expected=rtol,
92		message=f"Demand mismatch: max deviation {max_diff:.4%} exceeds tolerance {rtol:.4%}. {len(violations)} NUTS-3 regions have mismatches.",
93		severity=Severity.ERROR,
94		schema=self.schema,
95		table_name=self.table_name,
96		rule_class=self.__class__.__name__
97		)
98
99

openego / eGon-data

Code Duplication Length = 90-91 lines in 2 locations

src/egon/data/validation/rules/custom/sanity/residential_electricity.py 2 locations