| Total Complexity | 55 |
| Total Lines | 357 |
| Duplicated Lines | 96.08 % |
| Coverage | 86.92% |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like etlt.helper.Type2Helper often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | """ |
||
| 2 | ETLT |
||
| 3 | |||
| 4 | Copyright 2016 Set Based IT Consultancy |
||
| 5 | |||
| 6 | Licence MIT |
||
| 7 | """ |
||
| 8 | 1 | import copy |
|
| 9 | 1 | import datetime |
|
| 10 | |||
| 11 | 1 | from etlt.helper.Allen import Allen |
|
| 12 | |||
| 13 | |||
class Type2Helper:
    """
    A helper class for reference data with date intervals.

    Rows are dictionaries. Rows are grouped by their pseudo key and, while held by this helper, their start and end
    dates are stored as integers (proleptic Gregorian ordinals for dates and ISO 8601 strings, the value itself for
    integers). Method get_rows() converts the dates back to the type found in the data passed to prepare_data().
    """

    # ------------------------------------------------------------------------------------------------------------------
    def __init__(self, key_start_date, key_end_date, pseudo_key):
        """
        Object constructor.

        :param str key_start_date: The key of the start date in the rows.
        :param str key_end_date: The key of the end date in the rows.
        :param list[str] pseudo_key: The keys of the columns that form the pseudo key.
        """
        self.copy = True
        """
        If set to true a (shallow) copy is made of each original row such that the original rows are not modified.

        :type: bool
        """

        self._pseudo_key = list(pseudo_key)
        """
        The keys of the columns that form the pseudo key.

        :type: list[str]
        """

        self._key_end_date = key_end_date
        """
        The key of the end date in the rows.

        :type: str
        """

        self._key_start_date = key_start_date
        """
        The key of the start date in the rows.

        :type: str
        """

        self._rows = dict()
        """
        The data set: a map from pseudo key (tuple) to the list of rows with that pseudo key.

        :type: dict
        """

        self._date_type = ''
        """
        The type of the date fields.
        - date for datetime.date objects
        - str for strings in ISO 8601 (YYYY-MM-DD) format
        - int for integers

        :type: str
        """

    # ------------------------------------------------------------------------------------------------------------------
    def _get_pseudo_key(self, row):
        """
        Returns the pseudo key in a row.

        :param dict row: The row.

        :rtype: tuple
        """
        return tuple(row[key] for key in self._pseudo_key)

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _date2int(date):
        """
        Returns an integer representation of a date.

        :param str|datetime.date|int date: The date. Strings must be in YYYY-MM-DD format, optionally followed by a
                                           midnight time part (' 00:00:00' or 'T00:00:00').

        :rtype: int

        :raises ValueError: If the date is not a str, datetime.date, or int, or if a string cannot be parsed.
        """
        if isinstance(date, str):
            if date.endswith((' 00:00:00', 'T00:00:00')):
                # Ignore time suffix.
                date = date[0:-9]
            return datetime.datetime.strptime(date, '%Y-%m-%d').toordinal()

        if isinstance(date, datetime.date):
            return date.toordinal()

        if isinstance(date, int):
            return date

        raise ValueError('Unexpected type {0!s}'.format(date.__class__))

    # ------------------------------------------------------------------------------------------------------------------
    def _int2date(self, value):
        """
        Returns a single start or end date converted from its integer representation to the current date type.

        :param T value: The start or end date as stored in a row.

        :rtype: T

        :raises ValueError: If the current date type is not one of 'str', 'date', or 'int'.
        """
        if self._date_type == 'int':
            # Nothing to do.
            return value

        if self._date_type not in ('str', 'date'):
            raise ValueError('Unexpected date type {0!s}'.format(self._date_type))

        if not isinstance(value, int):
            # The value has been converted by an earlier call already; converting it again would fail.
            return value

        date = datetime.date.fromordinal(value)

        return date.isoformat() if self._date_type == 'str' else date

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_date2int(self, rows):
        """
        Replaces (in place) start and end dates in a row set with their integer representation.

        The date type of this helper is derived from the first start date seen when no date type has been set yet.

        :param list[dict[str,T]] rows: The list of rows.
        """
        start_key = self._key_start_date
        end_key = self._key_end_date

        for row in rows:
            # Determine the type of dates based on the first start date.
            if not self._date_type:
                self._date_type = self._get_date_type(row[start_key])

            # Convert dates to integers.
            row[start_key] = self._date2int(row[start_key])
            row[end_key] = self._date2int(row[end_key])

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_int2date(self, rows):
        """
        Replaces (in place) the integer representation of start and end dates in the row set with dates of the
        original date type. Dates that are not integers any more (already converted) are left untouched.

        :param list[dict[str,T]] rows: The list of rows.

        :raises ValueError: If the current date type is not one of 'str', 'date', or 'int'.
        """
        date_keys = (self._key_start_date, self._key_end_date)

        for row in rows:
            for key in date_keys:
                row[key] = self._int2date(row[key])

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_sort(self, rows):
        """
        Returns a list of rows sorted by start and end date.

        :param list[dict[str,T]] rows: The list of rows.

        :rtype: list[dict[str,T]]
        """
        start_key = self._key_start_date
        end_key = self._key_end_date

        return sorted(rows, key=lambda row: (row[start_key], row[end_key]))

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _get_date_type(date):
        """
        Returns the type of a date: 'str', 'date', or 'int'.

        :param str|datetime.date|int date: The date.

        :rtype: str

        :raises ValueError: If the date is not a str, datetime.date, or int.
        """
        if isinstance(date, str):
            return 'str'

        if isinstance(date, datetime.date):
            return 'date'

        if isinstance(date, int):
            return 'int'

        raise ValueError('Unexpected type {0!s}'.format(date.__class__))

    # ------------------------------------------------------------------------------------------------------------------
    def _equal(self, row1, row2):
        """
        Returns True if two rows are identical excluding start and end date. Returns False otherwise.

        Only the keys of the first row are compared.

        :param dict[str,T] row1: The first row.
        :param dict[str,T] row2: The second row.

        :rtype: bool
        """
        date_keys = (self._key_start_date, self._key_end_date)

        return not any(row1[key] != row2[key] for key in row1 if key not in date_keys)

    # ------------------------------------------------------------------------------------------------------------------
    def _merge_adjacent_rows(self, rows):
        """
        Resolves adjacent and overlapping rows. Overlapping rows are resolved as follows:
        * The interval with the most recent begin date prevails for the overlapping period.
        * If the begin dates are the same the interval with the most recent end date prevails.
        * If the begin and end dates are equal the last row in the data set prevails.
        Identical (excluding begin and end date) adjacent rows are replaced with a single row.

        The rows must be sorted by start and end date and their dates must be integers. Rows are modified in place.

        :param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key).

        :rtype: list[dict[str,T]]

        :raises ValueError: If the rows are not sorted properly.
        """
        start_key = self._key_start_date
        end_key = self._key_end_date

        ret = list()

        prev_row = None
        for row in rows:
            if prev_row is not None:
                relation = Allen.relation(prev_row[start_key], prev_row[end_key], row[start_key], row[end_key])
                if relation is None:
                    # row holds an invalid interval (prev_row always holds a valid interval). Hence, the join is empty.
                    return []

                elif relation == Allen.X_BEFORE_Y:
                    # Two rows with distinct intervals.
                    # prev_row: |----|
                    # row:                 |-----|
                    ret.append(prev_row)
                    prev_row = row

                elif relation == Allen.X_MEETS_Y:
                    # The two rows are adjacent.
                    # prev_row: |-------|
                    # row:               |-------|
                    if self._equal(prev_row, row):
                        # The two rows are identical (except for start and end date) and adjacent. Combine the two rows
                        # into one row.
                        prev_row[end_key] = row[end_key]
                    else:
                        # Rows are adjacent but not identical.
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_OVERLAPS_WITH_Y:
                    # prev_row overlaps row. Should not occur with proper reference data.
                    # prev_row: |-----------|
                    # row:            |----------|
                    if self._equal(prev_row, row):
                        # The two rows are identical (except for start and end date) and overlapping. Combine the two
                        # rows into one row.
                        prev_row[end_key] = row[end_key]
                    else:
                        # Rows are overlapping but not identical.
                        prev_row[end_key] = row[start_key] - 1
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_STARTS_Y:
                    # prev_row start row. Should not occur with proper reference data.
                    # prev_row: |------|
                    # row:      |----------------|
                    prev_row = row

                elif relation == Allen.X_EQUAL_Y:
                    # Can happen when the reference data sets are joined without respect for date intervals.
                    # prev_row: |----------------|
                    # row:      |----------------|
                    prev_row = row

                elif relation == Allen.X_DURING_Y_INVERSE:
                    # row during prev_row. Should not occur with proper reference data.
                    # prev_row: |----------------|
                    # row:           |------|
                    # Note: the interval with the most recent start date prevails. Hence, the interval after
                    # row[self._key_end_date] is discarded.
                    if self._equal(prev_row, row):
                        prev_row[end_key] = row[end_key]
                    else:
                        prev_row[end_key] = row[start_key] - 1
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_FINISHES_Y_INVERSE:
                    # row finishes prev_row. Should not occur with proper reference data.
                    # prev_row: |----------------|
                    # row:                |------|
                    if not self._equal(prev_row, row):
                        prev_row[end_key] = row[start_key] - 1
                        ret.append(prev_row)
                        prev_row = row

                    # Note: if the two rows are identical (except for start and end date) nothing to do.
                else:
                    # Note: The rows are sorted such that prev_row[self._key_start_date] <= row[self._key_start_date].
                    # Hence the following relation should not occur: X_DURING_Y, X_FINISHES_Y, X_BEFORE_Y_INVERSE,
                    # X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
                    # relations in Allen's interval algebra.
                    raise ValueError('Data is not sorted properly. Relation: {0}'.format(relation))

            elif row[start_key] <= row[end_key]:
                # row is the first valid row.
                prev_row = row

        if prev_row is not None:
            ret.append(prev_row)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def enumerate(self, name, start=1):
        """
        Enumerates all rows such that the pseudo key and the ordinal number are a unique key.

        As a side effect the rows of each pseudo key are sorted by start and end date.

        :param str name: The key holding the ordinal number.
        :param int start: The start of the ordinal numbers. Foreach pseudo key the first row has this ordinal number.
        """
        for pseudo_key, rows in self._rows.items():
            rows = self._rows_sort(rows)
            # Inside this method the name enumerate refers to the builtin function, not to this method.
            for ordinal, row in enumerate(rows, start):
                row[name] = ordinal
            # Replacing the value of an existing key is safe while iterating over the dictionary.
            self._rows[pseudo_key] = rows

    # ------------------------------------------------------------------------------------------------------------------
    def get_rows(self, sort=False):
        """
        Returns the rows of this Type2Helper with the start and end dates converted back to the original date type.

        The returned rows are the rows held by this helper (not copies); their dates are converted in place. Calling
        this method again returns the same rows.

        :param bool sort: If True the rows are sorted by the pseudo key.

        :rtype: list[dict[str,T]]
        """
        groups = sorted(self._rows.items()) if sort else self._rows.items()

        ret = []
        for _, rows in groups:
            self._rows_int2date(rows)
            ret.extend(rows)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def prepare_data(self, rows):
        """
        Sets and prepares the rows. The rows are stored in groups in a dictionary. A group is a list of rows with the
        same pseudo key. The key in the dictionary is a tuple with the values of the pseudo key.

        If attribute copy is true each row is (shallow) copied first, hence the original rows are not modified.
        Otherwise the start and end dates of the original rows are replaced with their integer representation.

        :param list[dict] rows: The rows
        """
        make_copy = self.copy

        self._rows = dict()
        for row in rows:
            if make_copy:
                # Copying the list only is not enough: the dates are replaced in the rows themselves.
                row = copy.copy(row)
            self._rows.setdefault(self._get_pseudo_key(row), []).append(row)

        # Convert begin and end dates to integers.
        self._date_type = None
        for group in self._rows.values():
            self._rows_date2int(group)
|
| 357 | |||
| 359 |