| Metric | Value |
| --- | --- |
| Conditions | 10 |
| Total Lines | 122 |
| Code Lines | 56 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 |
Small methods make your code easier to understand, especially when combined with a good name. And if a method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name (a sketch follows).
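A hypothetical before/after sketch of that move (the function and field names are invented for illustration):

```python
# Before: a comment labels a block of work inside a longer method.
def summarize(results):
    # collect the failed validations
    failed = [r for r in results if not r["success"]]
    return f"{len(failed)}/{len(results)} failed"

# After: the commented block becomes a method, named after the comment.
def collect_failed_validations(results):
    return [r for r in results if not r["success"]]

def summarize(results):
    failed = collect_failed_validations(results)
    return f"{len(failed)}/{len(results)} failed"
```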
Commonly applied refactorings include Extract Method. If many parameters or temporary variables stand in the way of extraction, refactorings such as Replace Temp with Query or Introduce Parameter Object can help; a parameter-object sketch follows.
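For instance, an Introduce Parameter Object move could bundle the settings that `create_validation_tasks()` threads through its nested closures. The class and field names below are hypothetical, not part of the analyzed code:

```python
from dataclasses import dataclass

@dataclass
class ValidationSettings:
    """Hypothetical parameter object bundling related arguments."""
    dataset_name: str
    on_failure: str = "continue"

def create_tasks(validation_dict, settings: ValidationSettings):
    # One object travels through the call chain instead of several
    # loose parameters and temporaries.
    ...
```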
Complex code like data.validation_utils.create_validation_tasks() often does a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined the members that belong together, you can apply the Extract Class refactoring (sketched below). If the component makes sense as a subclass, Extract Subclass is also a candidate and is often quicker to apply.
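A before/after sketch of Extract Class, again with invented names:

```python
# Before: the "report_" prefix marks a cohesive component.
class ValidationRun:
    def __init__(self):
        self.report_path = "validation_runs/report.html"
        self.report_format = "html"

    def report_write(self):
        print(f"writing {self.report_format} report to {self.report_path}")

# After: the prefixed members move into their own class.
class Report:
    def __init__(self, path, fmt):
        self.path = path
        self.fmt = fmt

    def write(self):
        print(f"writing {self.fmt} report to {self.path}")

class ValidationRun:
    def __init__(self):
        self.report = Report("validation_runs/report.html", "html")
```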
| 1 | """Airflow integration for egon-validation.""" |
||
| 84 | def create_validation_tasks( |
||
| 85 | validation_dict: Dict[str, List[Rule]], |
||
| 86 | dataset_name: str, |
||
| 87 | on_failure: str = "continue" |
||
| 88 | ) -> List[PythonOperator]: |
||
| 89 | """Convert validation dict to Airflow tasks. |
||
| 90 | |||
| 91 | Automatically resolves context-dependent parameters in validation rules. |
||
| 92 | Parameters can be specified as dicts with boundary or scenario keys: |
||
| 93 | |||
| 94 | - Boundary-dependent: {"Schleswig-Holstein": 27, "Everything": 537} |
||
| 95 | - Scenario-dependent: {"eGon2035": 100, "eGon100RE": 200} |
||
| 96 | |||
| 97 | The appropriate value is selected based on the current configuration. |
||
| 98 | |||
| 99 | Args: |
||
| 100 | validation_dict: {"task_name": [Rule1(), Rule2()]} |
||
| 101 | dataset_name: Name of dataset |
||
| 102 | on_failure: "continue" or "fail" |
||
| 103 | |||
| 104 | Returns: |
||
| 105 | List of PythonOperator tasks |
||
| 106 | |||
| 107 | Example: |
||
| 108 | >>> validation_dict = { |
||
| 109 | ... "data_quality": [ |
||
| 110 | ... RowCountValidation( |
||
| 111 | ... table="boundaries.vg250_krs", |
||
| 112 | ... rule_id="TEST_ROW_COUNT", |
||
| 113 | ... expected_count={"Schleswig-Holstein": 27, "Everything": 537} |
||
| 114 | ... ) |
||
| 115 | ... ] |
||
| 116 | ... } |
||
| 117 | >>> tasks = create_validation_tasks(validation_dict, "VG250") |
||
| 118 | """ |
||
| 119 | if not validation_dict: |
||
| 120 | return [] |
||
| 121 | |||
| 122 | tasks = [] |
||
| 123 | |||
| 124 | for task_name, rules in validation_dict.items(): |
||
| 125 | def make_callable(rules, task_name): |
||
| 126 | def run_validation(**context): |
||
| 127 | import os |
||
| 128 | import time |
||
| 129 | from datetime import datetime |
||
| 130 | from egon.data import db as egon_db |
||
| 131 | from egon.data.config import settings |
||
| 132 | |||
| 133 | # Use same run_id as validation report for consistency |
||
| 134 | # This allows the validation report to collect results from all validation tasks |
||
| 135 | run_id = ( |
||
| 136 | os.environ.get('AIRFLOW_CTX_DAG_RUN_ID') or |
||
| 137 | context.get('run_id') or |
||
| 138 | (context.get('ti') and hasattr(context['ti'], 'dag_run') and context['ti'].dag_run.run_id) or |
||
| 139 | (context.get('dag_run') and context['dag_run'].run_id) or |
||
| 140 | f"airflow-{dataset_name}-{task_name}-{int(time.time())}" |
||
| 141 | ) |
||
| 142 | |||
| 143 | # Use absolute path to ensure consistent location regardless of working directory |
||
| 144 | # Priority: EGON_VALIDATION_DIR env var > current working directory |
||
| 145 | out_dir = os.path.join( |
||
| 146 | os.environ.get('EGON_VALIDATION_DIR', os.getcwd()), |
||
| 147 | "validation_runs" |
||
| 148 | ) |
||
| 149 | |||
| 150 | # Include execution timestamp in task name so retries write to separate directories |
||
| 151 | # The validation report will filter to keep only the most recent execution per task |
||
| 152 | execution_date = context.get('execution_date') or datetime.now() |
||
| 153 | timestamp = execution_date.strftime('%Y%m%dT%H%M%S') |
||
| 154 | full_task_name = f"{dataset_name}.{task_name}.{timestamp}" |
||
| 155 | |||
| 156 | logger.info(f"Validation: {full_task_name} (run_id: {run_id})") |
||
| 157 | |||
| 158 | # Use existing engine from egon.data.db |
||
| 159 | engine = egon_db.engine() |
||
| 160 | |||
| 161 | # Get current configuration context |
||
| 162 | config = settings()["egon-data"] |
||
| 163 | boundary = config["--dataset-boundary"] |
||
| 164 | scenarios = config.get("--scenarios", []) |
||
| 165 | |||
| 166 | logger.info(f"Resolving validation parameters for boundary='{boundary}', scenarios={scenarios}") |
||
| 167 | |||
| 168 | # Set task and dataset on all rules (required by Rule base class) |
||
| 169 | # Also resolve context-dependent parameters |
||
| 170 | for rule in rules: |
||
| 171 | if not hasattr(rule, 'task') or rule.task is None: |
||
| 172 | rule.task = task_name |
||
| 173 | if not hasattr(rule, 'dataset') or rule.dataset is None: |
||
| 174 | rule.dataset = dataset_name |
||
| 175 | |||
| 176 | # Automatically resolve boundary/scenario-dependent parameters |
||
| 177 | _resolve_rule_params(rule, boundary, scenarios) |
||
| 178 | |||
| 179 | ctx = RunContext(run_id=run_id, source="airflow", out_dir=out_dir) |
||
| 180 | results = run_validations(engine, ctx, rules, full_task_name) |
||
| 181 | |||
| 182 | total = len(results) |
||
| 183 | failed = sum(1 for r in results if not r.success) |
||
| 184 | |||
| 185 | logger.info(f"Complete: {total - failed}/{total} passed") |
||
| 186 | |||
| 187 | if failed > 0 and on_failure == "fail": |
||
| 188 | raise Exception(f"{failed}/{total} validations failed") |
||
| 189 | |||
| 190 | return {"total": total, "passed": total - failed, "failed": failed} |
||
| 191 | |||
| 192 | return run_validation |
||
| 193 | |||
| 194 | func = make_callable(rules, task_name) |
||
| 195 | func.__name__ = f"validate_{task_name}" |
||
| 196 | |||
| 197 | operator = PythonOperator( |
||
| 198 | task_id=f"{dataset_name}.validate.{task_name}", |
||
| 199 | python_callable=func, |
||
| 200 | provide_context=True, |
||
| 201 | ) |
||
| 202 | |||
| 203 | tasks.append(operator) |
||
| 204 | |||
| 205 | return tasks |
||
| 206 |
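`_resolve_rule_params` is called above but not included in this excerpt. Based on the docstring's description of boundary- and scenario-keyed parameter dicts, a minimal sketch might look like the following; the attribute scan and the dict-key conventions are assumptions for illustration, not the actual implementation:

```python
def _resolve_rule_params(rule, boundary, scenarios):
    """Sketch: replace dict-valued rule attributes with the entry matching
    the current boundary or scenario (assumed behavior, not the real code)."""
    for name, value in list(vars(rule).items()):
        if not isinstance(value, dict):
            continue
        if boundary in value:
            # Boundary-dependent, e.g. {"Schleswig-Holstein": 27, "Everything": 537}
            setattr(rule, name, value[boundary])
        else:
            # Scenario-dependent, e.g. {"eGon2035": 100, "eGon100RE": 200}
            matches = [s for s in scenarios if s in value]
            if matches:
                setattr(rule, name, value[matches[0]])
```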