Completed
Push — master ( da0908...c8344f )
by P.R.
01:36
created

Type2JoinHelper._intersection()   C

Complexity

Conditions 7

Size

Total Lines 32

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 18
CRAP Score 7

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 32
ccs 18
cts 18
cp 1
rs 5.5
cc 7
crap 7
1
"""
2
ETLT
3
4
Copyright 2016 Set Based IT Consultancy
5
6
Licence MIT
7
"""
8 1
from etlt.helper.Allen import Allen
9 1
from etlt.helper.Type2Helper import Type2Helper
10
11
12 1
class Type2JoinHelper(Type2Helper):
    """
    A helper class for joining data sets with date intervals.
    """

    # ------------------------------------------------------------------------------------------------------------------
    def _equal(self, row1, row2):
        """
        Returns True if two rows are identical excluding start and end date. Returns False otherwise.

        Only the keys of the first row are inspected; the second row must hold (at least) the same keys.

        :param dict[str,T] row1: The first row.
        :param dict[str,T] row2: The second row.

        :rtype: bool
        """
        date_keys = (self._key_start_date, self._key_end_date)

        return not any(row1[key] != row2[key] for key in row1 if key not in date_keys)

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _intersect(start1, end1, start2, end2):
        """
        Returns the intersection of two intervals. Returns (None,None) if the intersection is empty.

        :param int start1: The start date of the first interval.
        :param int end1: The end date of the first interval.
        :param int start2: The start date of the second interval.
        :param int end2: The end date of the second interval.

        :rtype: tuple[int|None,int|None]
        """
        start = max(start1, start2)
        end = min(end1, end2)

        if start > end:
            return None, None

        return start, end

    # ------------------------------------------------------------------------------------------------------------------
    def _additional_rows_date2int(self, keys, rows):
        """
        Replaces start and end dates of the additional date intervals in the row set with their integer representation.

        The rows are modified in place. The start and end date keys of this helper itself are skipped.

        :param list[tuple[str,str]] keys: The other keys with start and end date.
        :param list[dict[str,T]] rows: The list of rows.

        :rtype: None
        """
        own_keys = (self._key_start_date, self._key_end_date)

        for row in rows:
            for key_start_date, key_end_date in keys:
                for key in (key_start_date, key_end_date):
                    if key not in own_keys:
                        row[key] = self._date2int(row[key])

    # ------------------------------------------------------------------------------------------------------------------
    def _intersection(self, keys, rows):
        """
        Computes the intersection of the date intervals of two or more reference data sets. If the intersection is empty
        the row is removed from the group.

        For rows with a non-empty intersection the additional start and end date keys are removed from the row and the
        intersection is stored under the start and end date keys of this helper.

        :param list[tuple[str,str]] keys: The other keys with start and end date.
        :param list[dict[str,T]] rows: The list of rows.

        :rtype: list[dict[str,T]]
        """
        own_keys = (self._key_start_date, self._key_end_date)

        ret = []
        for row in rows:
            start_date = row[self._key_start_date]
            end_date = row[self._key_end_date]
            for key_start_date, key_end_date in keys:
                start_date, end_date = Type2JoinHelper._intersect(start_date,
                                                                  end_date,
                                                                  row[key_start_date],
                                                                  row[key_end_date])
                # An empty intersection is signalled with None. Test explicitly for None: 0 is a valid integer date
                # and must not be mistaken for an empty intersection.
                if start_date is None:
                    # The intersection can not become non-empty again; the row will be dropped.
                    break

                for key in (key_start_date, key_end_date):
                    if key not in own_keys:
                        del row[key]

            if start_date is not None:
                row[self._key_start_date] = start_date
                row[self._key_end_date] = end_date
                ret.append(row)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def _merge_adjacent_rows(self, rows):
        """
        Resolves adjacent and overlapping rows. With proper reference data overlapping rows MUST not occur. However,
        this method can handle overlapping rows. Overlapping rows are resolved as follows:
        * The interval with the most recent begin date prevails for the overlapping period.
        * If the begin dates are the same the interval with the most recent end date prevails.
        * If the begin and end dates are equal the last row in the data set prevails.
        Identical (excluding begin and end date) adjacent rows are replaced with a single row.

        The rows must be sorted on start date. Rows are modified in place when intervals are extended or truncated.

        :param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key).

        :rtype: list[dict[str,T]]

        :raises ValueError: If the relation between two successive rows can not occur for properly sorted rows.
        """
        key_start = self._key_start_date
        key_end = self._key_end_date

        ret = []

        prev_row = None
        for row in rows:
            if prev_row is None:
                # First row of the group: nothing to compare with yet.
                prev_row = row
                continue

            relation = Allen.relation(prev_row[key_start], prev_row[key_end], row[key_start], row[key_end])

            if relation == Allen.X_BEFORE_Y:
                # Two rows with distinct intervals.
                # prev_row: |----|
                # row:                 |-----|
                ret.append(prev_row)
                prev_row = row

            elif relation == Allen.X_MEETS_Y:
                # The two rows are adjacent.
                # prev_row: |-------|
                # row:               |-------|
                if self._equal(prev_row, row):
                    # The two rows are identical (except for start and end date) and adjacent. Combine the two rows
                    # into one row.
                    prev_row[key_end] = row[key_end]
                else:
                    # Rows are adjacent but not identical.
                    ret.append(prev_row)
                    prev_row = row

            elif relation == Allen.X_OVERLAPS_WITH_Y:
                # prev_row overlaps row. Should not occur with proper reference data.
                # prev_row: |-----------|
                # row:            |----------|
                if self._equal(prev_row, row):
                    # The two rows are identical (except for start and end date) and overlapping. Combine the two
                    # rows into one row.
                    prev_row[key_end] = row[key_end]
                else:
                    # Rows are overlapping but not identical. Truncate prev_row to the day before row starts.
                    prev_row[key_end] = row[key_start] - 1
                    ret.append(prev_row)
                    prev_row = row

            elif relation == Allen.X_STARTS_Y:
                # prev_row starts row. Should not occur with proper reference data.
                # prev_row: |------|
                # row:      |----------------|
                # Same start date: the interval with the most recent end date prevails, prev_row is discarded.
                prev_row = row

            elif relation == Allen.X_EQUAL_Y:
                # Can happen when the reference data sets are joined without respect for date intervals.
                # prev_row: |----------------|
                # row:      |----------------|
                # Same interval: the last row prevails, prev_row is discarded.
                prev_row = row

            elif relation == Allen.X_DURING_Y_INVERSE:
                # row during prev_row. Should not occur with proper reference data.
                # prev_row: |----------------|
                # row:           |------|
                # Note: the interval with the most recent start date prevails. Hence, the interval after
                # row[self._key_end_date] is discarded.
                if self._equal(prev_row, row):
                    prev_row[key_end] = row[key_end]
                else:
                    prev_row[key_end] = row[key_start] - 1
                    ret.append(prev_row)
                    prev_row = row

            elif relation == Allen.X_FINISHES_Y_INVERSE:
                # row finishes prev_row. Should not occur with proper reference data.
                # prev_row: |----------------|
                # row:                |------|
                if not self._equal(prev_row, row):
                    prev_row[key_end] = row[key_start] - 1
                    ret.append(prev_row)
                    prev_row = row

                # Note: if the two rows are identical (except for start and end date) nothing to do.

            else:
                # Note: The rows are sorted such that prev_row[self._key_start_date] <= row[self._key_start_date].
                # Hence the following relation should not occur: X_DURING_Y,  X_FINISHES_Y, X_BEFORE_Y_INVERSE,
                # X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
                # relations in Allen's interval algebra.
                raise ValueError('Data is not sorted properly. Relation: %d' % relation)

        if prev_row is not None:
            ret.append(prev_row)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def merge(self, keys):
        """
        Merges the join on natural keys of two or more reference data sets.

        For each natural key the additional date intervals are converted to integers, intersected with the interval
        of the row, and the remaining rows are sorted and merged. Natural keys without remaining rows are removed.

        :param list[tuple[str,str]] keys: For each data set the keys of the start and end date.
        """
        # Natural keys are removed after the loop: the size of a dict must not change while iterating over it.
        deletes = []
        for natural_key, rows in self.rows.items():
            self._additional_rows_date2int(keys, rows)
            rows = self._intersection(keys, rows)
            if rows:
                rows = self._rows_sort(rows)
                self.rows[natural_key] = self._merge_adjacent_rows(rows)
            else:
                deletes.append(natural_key)

        for natural_key in deletes:
            del self.rows[natural_key]
232
# ----------------------------------------------------------------------------------------------------------------------
233