"""Airflow integration for egon-validation."""

import logging
from typing import Dict, List

from airflow.operators.python import PythonOperator

from egon_validation import run_validations, RunContext
from egon_validation.rules.base import Rule

logger = logging.getLogger(__name__)

def create_validation_tasks(
    validation_dict: Dict[str, List[Rule]],
    dataset_name: str,
    on_failure: str = "continue",
) -> List[PythonOperator]:
    """Convert a validation dict into Airflow tasks.

    Args:
        validation_dict: Mapping of task name to rules, e.g.
            {"task_name": [Rule1(), Rule2()]}.
        dataset_name: Name of the dataset being validated.
        on_failure: "continue" logs failed validations and lets the task
            succeed; "fail" raises and marks the Airflow task as failed.

    Returns:
        List of PythonOperator tasks, one per key in validation_dict.
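
    Example:
        Illustrative sketch of wiring the returned tasks into a DAG; the DAG,
        dataset name, ``ExampleRule``, and ``load_data`` task are placeholders
        (not part of this module), and ``DAG``/``datetime`` imports are assumed::

            with DAG(dag_id="example_dag", start_date=datetime(2024, 1, 1)) as dag:
                validation_tasks = create_validation_tasks(
                    {"sanity_checks": [ExampleRule()]},
                    dataset_name="example_dataset",
                    on_failure="continue",
                )
                load_data >> validation_tasks

        Because the operators are created without an explicit ``dag`` argument,
        call this inside a ``with DAG(...)`` block (or another active DAG
        context) so they are attached to the DAG.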
    """
    if not validation_dict:
        return []

    tasks = []

    for task_name, rules in validation_dict.items():
        # Factory function binds the current ``rules`` and ``task_name`` so the
        # callables created in this loop do not share late-bound closure variables.
        def make_callable(rules, task_name):
            def run_validation(**context):
                # Deferred imports: only needed when the task actually runs.
                import os
                import time
                from datetime import datetime
                from egon.data import db as egon_db

                # Use the same run_id as the validation report for consistency,
                # so the report can collect results from all validation tasks.
                run_id = (
                    os.environ.get('AIRFLOW_CTX_DAG_RUN_ID')
                    or context.get('run_id')
                    or getattr(getattr(context.get('ti'), 'dag_run', None), 'run_id', None)
                    or getattr(context.get('dag_run'), 'run_id', None)
                    or f"airflow-{dataset_name}-{task_name}-{int(time.time())}"
                )

                # Use an absolute path so results land in a consistent location
                # regardless of the worker's working directory.
                # Priority: EGON_VALIDATION_DIR env var, then os.getcwd().
                out_dir = os.path.join(
                    os.environ.get('EGON_VALIDATION_DIR', os.getcwd()),
                    "validation_runs",
                )

                # Include the execution timestamp in the task name so retries
                # write to separate directories; the validation report keeps
                # only the most recent execution per task.
                execution_date = context.get('execution_date') or datetime.now()
                timestamp = execution_date.strftime('%Y%m%dT%H%M%S')
                full_task_name = f"{dataset_name}.{task_name}.{timestamp}"

                logger.info(f"Validation: {full_task_name} (run_id: {run_id})")

                # Reuse the existing engine from egon.data.db
                engine = egon_db.engine()

                # Set task and dataset on all rules (required by the Rule base class)
                for rule in rules:
                    if not hasattr(rule, 'task') or rule.task is None:
                        rule.task = task_name
                    if not hasattr(rule, 'dataset') or rule.dataset is None:
                        rule.dataset = dataset_name

                ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir)
                results = run_validations(engine, ctx, rules, full_task_name)

                total = len(results)
                failed = sum(1 for r in results if not r.success)

                logger.info(f"Complete: {total - failed}/{total} passed")

                if failed > 0 and on_failure == "fail":
                    raise Exception(f"{failed}/{total} validations failed")

                return {"total": total, "passed": total - failed, "failed": failed}

            return run_validation

        func = make_callable(rules, task_name)
        # Give each callable a distinct __name__; useful in logs and tracebacks.
        func.__name__ = f"validate_{task_name}"

        operator = PythonOperator(
            task_id=f"{dataset_name}.validate.{task_name}",
            python_callable=func,
            provide_context=True,
        )

        tasks.append(operator)

    return tasks