Total Complexity | 55 |
Total Lines | 357 |
Duplicated Lines | 96.08 % |
Coverage | 86.92% |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like etlt.helper.Type2Helper often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | """ |
||
2 | ETLT |
||
3 | |||
4 | Copyright 2016 Set Based IT Consultancy |
||
5 | |||
6 | Licence MIT |
||
7 | """ |
||
8 | 1 | import copy |
|
9 | 1 | import datetime |
|
10 | |||
11 | 1 | from etlt.helper.Allen import Allen |
|
12 | |||
13 | |||
class Type2Helper:
    """
    A helper class for reference data with date intervals.

    Rows are dictionaries that hold a start date and an end date. The rows are grouped by their pseudo key and, while
    they are held by this class, the start and end dates are stored as integers (proleptic Gregorian ordinals for
    dates and ISO 8601 strings, the value itself for integers). Method get_rows converts the dates back to their
    original type.
    """

    # ------------------------------------------------------------------------------------------------------------------
    def __init__(self, key_start_date, key_end_date, pseudo_key):
        """
        Object constructor.

        :param str key_start_date: The key of the start date in the rows.
        :param str key_end_date: The key of the end date in the rows.
        :param list[str] pseudo_key: The keys of the columns that form the pseudo key.
        """
        self.copy = True
        """
        If set to true a copy will be made from the original rows such that the original rows are not modified.

        :type: bool
        """

        self._pseudo_key = list(pseudo_key)
        """
        The keys of the columns that form the pseudo key.

        :type: list[str]
        """

        self._key_end_date = key_end_date
        """
        The key of the end date in the rows.

        :type: str
        """

        self._key_start_date = key_start_date
        """
        The key of the start date in the rows.

        :type: str
        """

        self._rows = dict()
        """
        The data set. The keys are tuples with the values of the pseudo key, the values are lists of rows.

        :type: dict
        """

        self._date_type = ''
        """
        The type of the date fields.
        - date for datetime.date objects
        - str for strings in ISO 8601 (YYYY-MM-DD) format
        - int for integers

        :type: str
        """

    # ------------------------------------------------------------------------------------------------------------------
    def _get_pseudo_key(self, row):
        """
        Returns the pseudo key in a row.

        :param dict row: The row.

        :rtype: tuple
        """
        return tuple(row[key] for key in self._pseudo_key)

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _date2int(date):
        """
        Returns an integer representation of a date.

        Strings must be in YYYY-MM-DD format, optionally followed by a midnight time part (' 00:00:00' or
        'T00:00:00') which is ignored. Integers are returned as is.

        :param str|datetime.date|int date: The date.

        :rtype: int

        :raises ValueError: If the date is not a str, datetime.date, or int (or a string that can not be parsed).
        """
        if isinstance(date, str):
            if date.endswith((' 00:00:00', 'T00:00:00')):
                # Ignore time suffix.
                date = date[0:-9]
            return datetime.datetime.strptime(date, '%Y-%m-%d').toordinal()

        if isinstance(date, datetime.date):
            return date.toordinal()

        if isinstance(date, int):
            return date

        raise ValueError('Unexpected type {0!s}'.format(date.__class__))

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_date2int(self, rows):
        """
        Replaces (in place) start and end dates in a row set with their integer representation.

        :param list[dict[str,T]] rows: The list of rows.
        """
        for row in rows:
            # Determine the type of dates based on the first start date.
            if not self._date_type:
                self._date_type = self._get_date_type(row[self._key_start_date])

            # Convert dates to integers.
            row[self._key_start_date] = self._date2int(row[self._key_start_date])
            row[self._key_end_date] = self._date2int(row[self._key_end_date])

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_int2date(self, rows):
        """
        Replaces (in place) the integer representation of start and end dates in a row set with dates of the original
        type (i.e. the inverse of _rows_date2int).

        Values that are not integers are left untouched. Hence, converting the same rows more than once is harmless.

        :param list[dict[str,T]] rows: The list of rows.

        :raises ValueError: If the row set is not empty and the date type is unknown.
        """
        date_keys = (self._key_start_date, self._key_end_date)
        for row in rows:
            if self._date_type == 'int':
                # Nothing to do.
                continue

            if self._date_type not in ('str', 'date'):
                raise ValueError('Unexpected date type {0!s}'.format(self._date_type))

            for key in date_keys:
                value = row[key]
                if not isinstance(value, int):
                    # This date has been converted already by an earlier call.
                    continue

                value = datetime.date.fromordinal(value)
                row[key] = value.isoformat() if self._date_type == 'str' else value

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_sort(self, rows):
        """
        Returns a list of rows sorted by start and end date.

        :param list[dict[str,T]] rows: The list of rows.

        :rtype: list[dict[str,T]]
        """
        return sorted(rows, key=lambda row: (row[self._key_start_date], row[self._key_end_date]))

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _get_date_type(date):
        """
        Returns the type of a date: 'str', 'date', or 'int'.

        :param str|datetime.date|int date: The date.

        :rtype: str

        :raises ValueError: If the date is not a str, datetime.date, or int.
        """
        if isinstance(date, str):
            return 'str'

        if isinstance(date, datetime.date):
            return 'date'

        if isinstance(date, int):
            return 'int'

        raise ValueError('Unexpected type {0!s}'.format(date.__class__))

    # ------------------------------------------------------------------------------------------------------------------
    def _equal(self, row1, row2):
        """
        Returns True if two rows are identical excluding start and end date. Returns False otherwise.

        Only the keys of the first row are compared.

        :param dict[str,T] row1: The first row.
        :param dict[str,T] row2: The second row.

        :rtype: bool
        """
        date_keys = (self._key_start_date, self._key_end_date)
        for key in row1:
            if key not in date_keys and row1[key] != row2[key]:
                return False

        return True

    # ------------------------------------------------------------------------------------------------------------------
    def _merge_adjacent_rows(self, rows):
        """
        Resolves adjacent and overlapping rows. Overlapping rows are resolved as follows:
        * The interval with the most recent begin date prevails for the overlapping period.
        * If the begin dates are the same the interval with the most recent end date prevails.
        * If the begin and end dates are equal the last row in the data set prevails.
        Identical (excluding begin and end date) adjacent rows are replaced with a single row.

        The rows must be sorted by start and end date and their dates must be integers. The rows are modified in
        place.

        :param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key).

        :rtype: list[dict[str,T]]

        :raises ValueError: If the rows are not sorted properly.
        """
        ret = list()

        prev_row = None
        for row in rows:
            if prev_row:
                relation = Allen.relation(prev_row[self._key_start_date],
                                          prev_row[self._key_end_date],
                                          row[self._key_start_date],
                                          row[self._key_end_date])
                if relation is None:
                    # row holds an invalid interval (prev_row always holds a valid interval). Hence, the join is empty.
                    return []

                elif relation == Allen.X_BEFORE_Y:
                    # Two rows with distinct intervals.
                    # prev_row: |----|
                    # row:                 |-----|
                    ret.append(prev_row)
                    prev_row = row

                elif relation == Allen.X_MEETS_Y:
                    # The two rows are adjacent.
                    # prev_row: |-------|
                    # row:               |-------|
                    if self._equal(prev_row, row):
                        # The two rows are identical (except for start and end date) and adjacent. Combine the two rows
                        # into one row.
                        prev_row[self._key_end_date] = row[self._key_end_date]
                    else:
                        # Rows are adjacent but not identical.
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_OVERLAPS_WITH_Y:
                    # prev_row overlaps row. Should not occur with proper reference data.
                    # prev_row: |-----------|
                    # row:            |----------|
                    if self._equal(prev_row, row):
                        # The two rows are identical (except for start and end date) and overlapping. Combine the two
                        # rows into one row.
                        prev_row[self._key_end_date] = row[self._key_end_date]
                    else:
                        # Rows are overlapping but not identical.
                        prev_row[self._key_end_date] = row[self._key_start_date] - 1
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_STARTS_Y:
                    # prev_row starts row. Should not occur with proper reference data.
                    # prev_row: |------|
                    # row:      |----------------|
                    prev_row = row

                elif relation == Allen.X_EQUAL_Y:
                    # Can happen when the reference data sets are joined without respect for date intervals.
                    # prev_row: |----------------|
                    # row:      |----------------|
                    prev_row = row

                elif relation == Allen.X_DURING_Y_INVERSE:
                    # row during prev_row. Should not occur with proper reference data.
                    # prev_row: |----------------|
                    # row:           |------|
                    # Note: the interval with the most recent start date prevails. Hence, the interval after
                    # row[self._key_end_date] is discarded.
                    if self._equal(prev_row, row):
                        prev_row[self._key_end_date] = row[self._key_end_date]
                    else:
                        prev_row[self._key_end_date] = row[self._key_start_date] - 1
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_FINISHES_Y_INVERSE:
                    # row finishes prev_row. Should not occur with proper reference data.
                    # prev_row: |----------------|
                    # row:                |------|
                    if not self._equal(prev_row, row):
                        prev_row[self._key_end_date] = row[self._key_start_date] - 1
                        ret.append(prev_row)
                        prev_row = row

                    # Note: if the two rows are identical (except for start and end date) nothing to do.
                else:
                    # Note: The rows are sorted such that prev_row[self._key_start_date] <= row[self._key_start_date].
                    # Hence the following relation should not occur: X_DURING_Y, X_FINISHES_Y, X_BEFORE_Y_INVERSE,
                    # X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
                    # relations in Allen's interval algebra.
                    raise ValueError('Data is not sorted properly. Relation: {0}'.format(relation))

            elif row[self._key_start_date] <= row[self._key_end_date]:
                # row is the first valid row.
                prev_row = row

        if prev_row:
            ret.append(prev_row)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def enumerate(self, name, start=1):
        """
        Enumerates all rows such that the pseudo key and the ordinal number are a unique key.

        As a side effect the rows of each pseudo key are sorted by start and end date.

        :param str name: The key holding the ordinal number.
        :param int start: The start of the ordinal numbers. Foreach pseudo key the first row has this ordinal number.
        """
        for pseudo_key, rows in self._rows.items():
            rows = self._rows_sort(rows)
            for ordinal, row in enumerate(rows, start):
                row[name] = ordinal
            # Replacing the value of an existing key is safe while iterating over the dictionary.
            self._rows[pseudo_key] = rows

    # ------------------------------------------------------------------------------------------------------------------
    def get_rows(self, sort=False):
        """
        Returns the rows of this Type2Helper. The start and end dates of the rows are converted back to the type of
        the dates in the rows passed to prepare_data.

        :param bool sort: If True the rows are sorted by the pseudo key.

        :rtype: list[dict]
        """
        ret = []
        for _, rows in sorted(self._rows.items()) if sort else self._rows.items():
            self._rows_int2date(rows)
            ret.extend(rows)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def prepare_data(self, rows):
        """
        Sets and prepares the rows. The rows are stored in groups in a dictionary. A group is a list of rows with the
        same pseudo key. The key in the dictionary is a tuple with the values of the pseudo key.

        If attribute copy is true each row is copied (shallow) such that the original rows are not modified.
        Otherwise the start and end dates of the original rows are replaced with integers.

        :param list[dict] rows: The rows
        """
        self._rows = dict()
        for row in rows:
            if self.copy:
                # Copy the row itself: copying only the list would leave the original rows exposed to modifications.
                row = copy.copy(row)
            self._rows.setdefault(self._get_pseudo_key(row), list()).append(row)

        # Convert begin and end dates to integers.
        self._date_type = None
        for group in self._rows.values():
            self._rows_date2int(group)
|
357 | |||
359 |