|
1
|
|
|
"""Airflow integration for egon-validation.""" |
|
2
|
|
|
|
|
3
|
|
|
from typing import Any, Dict, List |
|
4
|
|
|
from airflow.operators.python import PythonOperator |
|
5
|
|
|
from egon_validation import run_validations, RunContext |
|
6
|
|
|
from egon_validation.rules.base import Rule |
|
7
|
|
|
import logging |
|
8
|
|
|
|
|
9
|
|
|
# Module-level logger, following the stdlib convention of one logger per module.
logger = logging.getLogger(__name__)
|
10
|
|
|
|
|
11
|
|
|
|
|
12
|
|
|
def _resolve_context_value(value: Any, boundary: str) -> Any: |
|
13
|
|
|
"""Resolve a value that may be boundary-dependent. |
|
14
|
|
|
|
|
15
|
|
|
Args: |
|
16
|
|
|
value: The value to resolve. Can be: |
|
17
|
|
|
- A dict with boundary keys: {"Schleswig-Holstein": 27, "Everything": 537} |
|
18
|
|
|
- Any other value (returned as-is) |
|
19
|
|
|
boundary: Current dataset boundary setting |
|
20
|
|
|
|
|
21
|
|
|
Returns: |
|
22
|
|
|
Resolved value based on current boundary |
|
23
|
|
|
|
|
24
|
|
|
Examples: |
|
25
|
|
|
>>> _resolve_context_value({"Schleswig-Holstein": 27, "Everything": 537}, |
|
26
|
|
|
... "Schleswig-Holstein") |
|
27
|
|
|
27 |
|
28
|
|
|
|
|
29
|
|
|
>>> _resolve_context_value(42, "Everything") |
|
30
|
|
|
42 |
|
31
|
|
|
""" |
|
32
|
|
|
# If not a dict, return as-is |
|
33
|
|
|
if not isinstance(value, dict): |
|
34
|
|
|
return value |
|
35
|
|
|
|
|
36
|
|
|
# Try to resolve by boundary |
|
37
|
|
|
if boundary in value: |
|
38
|
|
|
logger.debug(f"Resolved boundary-dependent value: {boundary} -> {value[boundary]}") |
|
39
|
|
|
return value[boundary] |
|
40
|
|
|
|
|
41
|
|
|
# If dict doesn't match boundary pattern, return as-is |
|
42
|
|
|
# This handles cases like column_types dicts which are not context-dependent |
|
43
|
|
|
return value |
|
44
|
|
|
|
|
45
|
|
|
|
|
46
|
|
|
def _resolve_rule_params(rule: Rule, boundary: str) -> None:
    """Resolve boundary-dependent parameters in a rule.

    Modifies ``rule.params`` in-place: every value that is a dict containing
    ``boundary`` as a key is replaced by the boundary-specific entry; all
    other values are left untouched. Rules without a ``params`` dict are
    ignored.

    Args:
        rule: The validation rule to process
        boundary: Current dataset boundary setting
    """
    if not hasattr(rule, 'params') or not isinstance(rule.params, dict):
        return

    # Resolve all parameter values.
    for param_name, param_value in rule.params.items():
        resolved_value = _resolve_context_value(param_value, boundary)

        # _resolve_context_value returns the original object when nothing was
        # resolved, so an identity check detects an actual change.
        if resolved_value is not param_value:
            # Lazy %-style args avoid formatting when INFO is disabled.
            logger.info(
                "Rule %s: Resolved %s for boundary='%s'",
                rule.rule_id,
                param_name,
                boundary,
            )
            # Replacing an existing key while iterating is safe: the dict's
            # size does not change.
            rule.params[param_name] = resolved_value
|
70
|
|
|
|
|
71
|
|
|
def create_validation_tasks(
    validation_dict: Dict[str, List[Rule]],
    dataset_name: str,
    on_failure: str = "continue"
) -> List[PythonOperator]:
    """Convert validation dict to Airflow tasks.

    Automatically resolves boundary-dependent parameters in validation rules.
    Parameters can be specified as dicts with boundary keys:

    - Boundary-dependent: {"Schleswig-Holstein": 27, "Everything": 537}

    The appropriate value is selected based on the current configuration.

    Args:
        validation_dict: {"task_name": [Rule1(), Rule2()]}
        dataset_name: Name of dataset
        on_failure: "continue" or "fail"

    Returns:
        List of PythonOperator tasks

    Example:
        >>> validation_dict = {
        ...     "data_quality": [
        ...         RowCountValidation(
        ...             table="boundaries.vg250_krs",
        ...             rule_id="TEST_ROW_COUNT",
        ...             expected_count={"Schleswig-Holstein": 27, "Everything": 537}
        ...         )
        ...     ]
        ... }
        >>> tasks = create_validation_tasks(validation_dict, "VG250")
    """
    # Nothing to build for an empty mapping.
    if not validation_dict:
        return []

    tasks = []

    for task_name, rules in validation_dict.items():
        # Factory function: binds `rules` and `task_name` as arguments so each
        # closure captures the current loop values (avoids the late-binding
        # closure pitfall where every task would see the last iteration's values).
        def make_callable(rules, task_name):
            def run_validation(**context):
                # Imports are deferred to execution time so the DAG file can be
                # parsed without a database / egon.data environment available.
                import os
                import time
                from datetime import datetime
                from egon.data import db as egon_db
                from egon.data.config import settings

                # Use same run_id as validation report for consistency
                # This allows the validation report to collect results from all validation tasks
                # Fallback chain: env var -> Airflow context keys -> synthetic
                # time-based id. The `and`-chains guard optional context entries.
                run_id = (
                    os.environ.get('AIRFLOW_CTX_DAG_RUN_ID') or
                    context.get('run_id') or
                    (context.get('ti') and hasattr(context['ti'], 'dag_run') and context['ti'].dag_run.run_id) or
                    (context.get('dag_run') and context['dag_run'].run_id) or
                    f"airflow-{dataset_name}-{task_name}-{int(time.time())}"
                )

                # Use absolute path to ensure consistent location regardless of working directory
                # Priority: EGON_VALIDATION_DIR env var > current working directory
                out_dir = os.path.join(
                    os.environ.get('EGON_VALIDATION_DIR', os.getcwd()),
                    "validation_runs"
                )

                # Include execution timestamp in task name so retries write to separate directories
                # The validation report will filter to keep only the most recent execution per task
                execution_date = context.get('execution_date') or datetime.now()
                timestamp = execution_date.strftime('%Y%m%dT%H%M%S')
                full_task_name = f"{dataset_name}.{task_name}.{timestamp}"

                logger.info(f"Validation: {full_task_name} (run_id: {run_id})")

                # Use existing engine from egon.data.db
                engine = egon_db.engine()

                # Get current configuration context
                config = settings()["egon-data"]
                boundary = config["--dataset-boundary"]

                logger.info(f"Resolving validation parameters for boundary='{boundary}'")

                # Set task and dataset on all rules (required by Rule base class)
                # Also resolve boundary-dependent parameters
                for rule in rules:
                    if not hasattr(rule, 'task') or rule.task is None:
                        rule.task = task_name
                    if not hasattr(rule, 'dataset') or rule.dataset is None:
                        rule.dataset = dataset_name

                    # Automatically resolve boundary-dependent parameters
                    _resolve_rule_params(rule, boundary)

                ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir)
                results = run_validations(engine, ctx, rules, full_task_name)

                # Summarize pass/fail counts from the result objects.
                total = len(results)
                failed = sum(1 for r in results if not r.success)

                logger.info(f"Complete: {total - failed}/{total} passed")

                # Only abort the Airflow task when explicitly configured to.
                if failed > 0 and on_failure == "fail":
                    raise Exception(f"{failed}/{total} validations failed")

                return {"total": total, "passed": total - failed, "failed": failed}

            return run_validation

        func = make_callable(rules, task_name)
        # Give each callable a distinct name for clearer Airflow logs/UI.
        func.__name__ = f"validate_{task_name}"

        # NOTE(review): provide_context=True is an Airflow 1.x parameter;
        # confirm it is still accepted by the Airflow version in use.
        operator = PythonOperator(
            task_id=f"{dataset_name}.validate.{task_name}",
            python_callable=func,
            provide_context=True,
        )

        tasks.append(operator)

    return tasks
|
191
|
|
|
|