1
|
|
|
from dataclasses import dataclass |
2
|
|
|
from typing import Union |
3
|
|
|
|
4
|
|
|
from airflow.models.dag import DAG |
5
|
|
|
|
6
|
|
|
from egon.data.datasets import Dataset, TaskGraph, Tasks |
7
|
|
|
|
8
|
|
|
|
9
|
|
|
def test_uniqueness_of_automatically_generated_final_dataset_task():
    """Check that every dataset gets a distinctly named final task.

    Regression test for issue #985. When several `Dataset`s end in parallel
    tasks and are defined in a module below the `egon.data.datasets`
    package, the code stripping the module name prefix from task ids and
    the code generating the final dataset task (the one updating the
    dataset version after all parallel tasks have finished) used to
    interact so that the generated task ids were not distinct. Tasks
    generated later then clobbered the ones generated earlier, leading to
    spurious cycles and other inconsistencies and bugs in the graph.
    """

    # Four do-nothing callables which differ only in their `__name__`.
    callables = []
    for index in range(4):
        noop = lambda: None  # noqa: E731
        noop.__name__ = f"noop-{index}"
        callables.append(noop)

    @dataclass
    class Dataset_1(Dataset):
        name: str = "DS1"
        version: str = "0.0.0"
        # Two callables in one set: the "ending in parallel tasks" case
        # described in the docstring.
        tasks: Union[Tasks, TaskGraph] = (set(callables[:2]),)

    @dataclass
    class Dataset_2(Dataset):
        name: str = "DS2"
        version: str = "0.0.0"
        tasks: Union[Tasks, TaskGraph] = (set(callables[2:]),)

    # Pretend both classes were defined in a module below the
    # `egon.data.datasets` package, which is the setup the regression
    # needs according to the docstring.
    module_name = "egon.data.datasets.test.datasets"
    for dataset_class in (Dataset_1, Dataset_2):
        dataset_class.__module__ = module_name

    with DAG(dag_id="Test-DAG", default_args={"start_date": "1111-11-11"}):
        first, second = Dataset_1(), Dataset_2()
        # The last entry of each dataset's `tasks` is what gets compared.
        final_of_first = list(first.tasks)[-1]
        final_of_second = list(second.tasks)[-1]
        message = "Expected unique names for final tasks of distinct datasets."
        assert final_of_first != final_of_second, message
46
|
|
|
|