etlt.helper.Type2Helper.Type2Helper._merge_adjacent_rows() - Code Metrics - PyETLT/etlt - Measure and Improve Code Quality continuously with Scrutinizer

Type2Helper._merge_adjacent_rows() F
last analyzed 2020-10-26 17:12 UTC

↳ Parent: etlt.helper.Type2Helper

Complexity

Conditions

Size

Total Lines	109
Code Lines	46

Duplication

Lines	109
Ratio	100 %

Code Coverage

Tests	42
CRAP Score	17.0036

Importance

Changes

Metric	Value
cc	17
eloc	46
nop	2
dl	109
loc	109
ccs	42
cts	43
cp	0.9767
crap	17.0036
rs	1.8
c	0
b	0
f	0

How to fix Long Method Complexity

import copy
import datetime

from etlt.helper.Allen import Allen


class Type2Helper:

    """
    A helper class for reference data with date intervals.

    Rows are dictionaries. They are grouped by their pseudo key and, while held by this helper, their start and end
    dates are stored as integers (ordinals) so that intervals can be compared and adjusted arithmetically.
    """

    # ------------------------------------------------------------------------------------------------------------------
    def __init__(self, key_start_date, key_end_date, pseudo_key):
        """
        Object constructor.

        :param str key_start_date: The key of the start date in the rows.
        :param str key_end_date: The key of the end date in the rows.
        :param list[str] pseudo_key: The keys of the columns that form the pseudo key.
        """
        self.copy = True
        """
        If set to true a copy will be made from the original rows such that the original rows are not modified.

        :type: bool
        """

        self._pseudo_key = list(pseudo_key)
        """
        The keys of the columns that form the pseudo key.

        :type: list[str]
        """

        self._key_end_date = key_end_date
        """
        The key of the end date in the rows.

        :type: str
        """
        self._key_start_date = key_start_date
        """
        The key of the start date in the rows.

        :type: str
        """

        self._rows = dict()
        """
        The data set.

        :type: dict
        """

        self._date_type = ''
        """
        The type of the date fields.
        - date for datetime.date objects
        - str  for strings in ISO 8601 (YYYY-MM-DD) format
        - int for integers

        :type: str
        """

    # ------------------------------------------------------------------------------------------------------------------
    def _get_pseudo_key(self, row):
        """
        Returns the pseudo key in a row.

        :param dict row: The row.

        :rtype: tuple
        """
        return tuple(row[key] for key in self._pseudo_key)

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _date2int(date):
        """
        Returns an integer representation of a date.

        Strings must be in YYYY-MM-DD format, optionally followed by a midnight time part (' 00:00:00' or
        'T00:00:00') which is ignored. Dates are converted to their proleptic Gregorian ordinal. Integers are returned
        unchanged.

        :param str|datetime.date|int date: The date.

        :rtype: int

        :raises ValueError: If the date is of an unsupported type or is a string in an unexpected format.
        """
        if isinstance(date, str):
            if date.endswith((' 00:00:00', 'T00:00:00')):
                # Ignore time suffix (both suffixes are 9 characters long).
                date = date[0:-9]
            tmp = datetime.datetime.strptime(date, '%Y-%m-%d')
            return tmp.toordinal()

        if isinstance(date, datetime.date):
            return date.toordinal()

        if isinstance(date, int):
            return date

        raise ValueError('Unexpected type {0!s}'.format(date.__class__))

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_date2int(self, rows):
        """
        Replaces start and end dates in a row set with their integer representation. The rows are modified in place.

        :param list[dict[str,T]] rows: The list of rows.
        """
        key_start = self._key_start_date
        key_end = self._key_end_date

        for row in rows:
            # Determine the type of dates based on the first start date.
            if not self._date_type:
                self._date_type = self._get_date_type(row[key_start])

            # Convert dates to integers.
            row[key_start] = self._date2int(row[key_start])
            row[key_end] = self._date2int(row[key_end])

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_int2date(self, rows):
        """
        Replaces the integer representation of the start and end dates in the row set with values of the original
        date type (see _date_type). The rows are modified in place.

        :param list[dict[str,T]] rows: The list of rows.

        :raises ValueError: If the row set is not empty and the date type is not one of 'str', 'date', or 'int'.
        """
        key_start = self._key_start_date
        key_end = self._key_end_date

        for row in rows:
            if self._date_type == 'str':
                row[key_start] = datetime.date.fromordinal(row[key_start]).isoformat()
                row[key_end] = datetime.date.fromordinal(row[key_end]).isoformat()
            elif self._date_type == 'date':
                row[key_start] = datetime.date.fromordinal(row[key_start])
                row[key_end] = datetime.date.fromordinal(row[key_end])
            elif self._date_type == 'int':
                # Nothing to do.
                pass
            else:
                raise ValueError('Unexpected date type {0!s}'.format(self._date_type))

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_sort(self, rows):
        """
        Returns a list of rows sorted by start and end date.

        :param list[dict[str,T]] rows: The list of rows.

        :rtype: list[dict[str,T]]
        """
        return sorted(rows, key=lambda row: (row[self._key_start_date], row[self._key_end_date]))

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _get_date_type(date):
        """
        Returns the type of a date: 'str', 'date', or 'int'.

        :param str|datetime.date|int date: The date.

        :rtype: str

        :raises ValueError: If the date is of an unsupported type.
        """
        if isinstance(date, str):
            return 'str'

        if isinstance(date, datetime.date):
            return 'date'

        if isinstance(date, int):
            return 'int'

        raise ValueError('Unexpected type {0!s}'.format(date.__class__))

    # ------------------------------------------------------------------------------------------------------------------
    def _equal(self, row1, row2):
        """
        Returns True if two rows are identical excluding start and end date. Returns False otherwise.

        Only the keys of the first row are inspected; the second row is expected to hold the same keys.

        :param dict[str,T] row1: The first row.
        :param dict[str,T] row2: The second row.

        :rtype: bool
        """
        date_keys = (self._key_start_date, self._key_end_date)
        for key in row1:
            if key not in date_keys and row1[key] != row2[key]:
                return False

        return True

    # ------------------------------------------------------------------------------------------------------------------
    def _merge_adjacent_rows(self, rows):
        """
        Resolves adjacent and overlapping rows. Overlapping rows are resolved as follows:
        * The interval with the most recent begin date prevails for the overlapping period.
        * If the begin dates are the same the interval with the most recent end date prevails.
        * If the begin and end dates are equal the last row in the data set prevails.
        Identical (excluding begin and end date) adjacent rows are replaced with a single row.

        The rows must be sorted by start and end date (see _rows_sort) and their dates must be integers. Rows may be
        modified in place (their end date can be adjusted). Leading rows with an invalid interval (start date after
        end date) are skipped; an invalid interval after the first valid row yields an empty result.

        :param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key).

        :rtype: list[dict[str,T]]

        :raises ValueError: If the rows are not sorted properly.
        """
        key_start = self._key_start_date
        key_end = self._key_end_date

        ret = []

        prev_row = None
        for row in rows:
            if prev_row:
                relation = Allen.relation(prev_row[key_start], prev_row[key_end], row[key_start], row[key_end])
                if relation is None:
                    # row holds an invalid interval (prev_row always holds a valid interval). Hence, the join is empty.
                    return []

                elif relation == Allen.X_BEFORE_Y:
                    # Two rows with distinct intervals.
                    # prev_row: |----|
                    # row:                 |-----|
                    ret.append(prev_row)
                    prev_row = row

                elif relation == Allen.X_MEETS_Y:
                    # The two rows are adjacent.
                    # prev_row: |-------|
                    # row:               |-------|
                    if self._equal(prev_row, row):
                        # The two rows are identical (except for start and end date) and adjacent. Combine the two rows
                        # into one row.
                        prev_row[key_end] = row[key_end]
                    else:
                        # Rows are adjacent but not identical.
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_OVERLAPS_WITH_Y:
                    # prev_row overlaps row. Should not occur with proper reference data.
                    # prev_row: |-----------|
                    # row:            |----------|
                    if self._equal(prev_row, row):
                        # The two rows are identical (except for start and end date) and overlapping. Combine the two
                        # rows into one row.
                        prev_row[key_end] = row[key_end]
                    else:
                        # Rows are overlapping but not identical. End prev_row the day before row starts.
                        prev_row[key_end] = row[key_start] - 1
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_STARTS_Y:
                    # prev_row starts row. Should not occur with proper reference data.
                    # prev_row: |------|
                    # row:      |----------------|
                    prev_row = row

                elif relation == Allen.X_EQUAL_Y:
                    # Can happen when the reference data sets are joined without respect for date intervals.
                    # prev_row: |----------------|
                    # row:      |----------------|
                    prev_row = row

                elif relation == Allen.X_DURING_Y_INVERSE:
                    # row during prev_row. Should not occur with proper reference data.
                    # prev_row: |----------------|
                    # row:           |------|
                    # Note: the interval with the most recent start date prevails. Hence, the interval after
                    # row[self._key_end_date] is discarded.
                    if self._equal(prev_row, row):
                        prev_row[key_end] = row[key_end]
                    else:
                        prev_row[key_end] = row[key_start] - 1
                        ret.append(prev_row)
                        prev_row = row

                elif relation == Allen.X_FINISHES_Y_INVERSE:
                    # row finishes prev_row. Should not occur with proper reference data.
                    # prev_row: |----------------|
                    # row:                |------|
                    # Note: if the two rows are identical (except for start and end date) there is nothing to do.
                    if not self._equal(prev_row, row):
                        prev_row[key_end] = row[key_start] - 1
                        ret.append(prev_row)
                        prev_row = row

                else:
                    # Note: The rows are sorted such that prev_row[self._key_start_date] <= row[self._key_start_date].
                    # Hence the following relation should not occur: X_DURING_Y,  X_FINISHES_Y, X_BEFORE_Y_INVERSE,
                    # X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
                    # relations in Allen's interval algebra.
                    raise ValueError('Data is not sorted properly. Relation: {0}'.format(relation))

            elif row[key_start] <= row[key_end]:
                # row is the first valid row.
                prev_row = row

        if prev_row:
            ret.append(prev_row)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def enumerate(self, name, start=1):
        """
        Enumerates all rows such that the pseudo key and the ordinal number are a unique key.

        As a side effect the rows of each pseudo key are stored sorted by start and end date.

        :param str name: The key holding the ordinal number.
        :param int start: The start of the ordinal numbers. Foreach pseudo key the first row has this ordinal number.
        """
        for pseudo_key, rows in self._rows.items():
            rows = self._rows_sort(rows)
            ordinal = start
            for row in rows:
                row[name] = ordinal
                ordinal += 1
            # Replacing the value of an existing key is safe while iterating over the dictionary.
            self._rows[pseudo_key] = rows

    # ------------------------------------------------------------------------------------------------------------------
    def get_rows(self, sort=False):
        """
        Returns the rows of this Type2Helper with the start and end dates converted back to their original type.

        Note: the conversion is done in place on the rows held by this helper. Hence, for 'str' and 'date' date types
        the rows no longer hold integer dates after this method has been called.

        :param bool sort: If True the rows are sorted by the pseudo key.

        :rtype: list[dict[str,T]]
        """
        ret = []
        for _, rows in sorted(self._rows.items()) if sort else self._rows.items():
            self._rows_int2date(rows)
            ret.extend(rows)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def prepare_data(self, rows):
        """
        Sets and prepares the rows. The rows are stored in groups in a dictionary. A group is a list of rows with the
        same pseudo key. The key in the dictionary is a tuple with the values of the pseudo key.

        When self.copy is true each row is (shallow) copied first such that the original rows are not modified.
        Otherwise the start and end dates of the original rows are replaced with integers.

        :param list[dict] rows: The rows
        """
        self._rows = dict()
        for row in rows:
            if self.copy:
                # Copy the row itself: copying only the list would leave the original rows shared and they would be
                # modified by the date conversion below.
                row = copy.copy(row)
            self._rows.setdefault(self._get_pseudo_key(row), []).append(row)

        # Convert begin and end dates to integers.
        self._date_type = None
        for group in self._rows.values():
            self._rows_date2int(group)

# ----------------------------------------------------------------------------------------------------------------------


1	1		import copy
2	1		import datetime
3
4	1		from etlt.helper.Allen import Allen
5
6
7	1	View Code Duplication	class Type2Helper:
			0 ignored issues – show Duplication introduced 2020-04-19 20:27 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
8			"""
9			A helper class for reference data with date intervals.
10			"""
11
12			# ------------------------------------------------------------------------------------------------------------------
13	1		def __init__(self, key_start_date, key_end_date, pseudo_key):
14			"""
15			Object constructor.
16
17			:param str key_start_date: The key of the start date in the rows.
18			:param str key_end_date: The key of the end date in the rows.
19			:param list[str] pseudo_key: The keys of the columns that form the pseudo key.
20			"""
21	1		self.copy = True
22			"""
23			If set to true a copy will be made from the original rows such that the original rows are not modified.
24
25			:type: bool
26			"""
27
28	1		self._pseudo_key = list(pseudo_key)
29			"""
30			The keys of the columns that form the pseudo key.
31
32			:type: list[str]
33			"""
34
35	1		self._key_end_date = key_end_date
36			"""
37			The key of the end date in the rows.
38
39			:type: str
40			"""
41	1		self._key_start_date = key_start_date
42			"""
43			The key of the start date in the rows.
44
45			:type: str
46			"""
47
48	1		self._rows = dict()
49			"""
50			The data set.
51
52			:type: dict
53			"""
54
55	1		self._date_type = ''
56	1		"""
57			The type of the date fields.
58			- date for datetime.date objects
59			- str for strings in ISO 8601 (YYYY-MM-DD) format
60			- int for integers
61
62			:type: str
63			"""
64
65			# ------------------------------------------------------------------------------------------------------------------
66	1		def _get_pseudo_key(self, row):
67			"""
68			Returns the pseudo key in a row.
69
70			:param dict row: The row.
71
72			:rtype: tuple
73			"""
74	1		ret = list()
75	1		for key in self._pseudo_key:
76	1		ret.append(row[key])
77
78	1		return tuple(ret)
79
80			# ------------------------------------------------------------------------------------------------------------------
81	1		@staticmethod
82	1		def _date2int(date):
83			"""
84			Returns an integer representation of a date.
85
86			:param str\|datetime.date date: The date.
87
88			:rtype: int
89			"""
90	1		if isinstance(date, str):
91	1		if date.endswith(' 00:00:00') or date.endswith('T00:00:00'):
92			# Ignore time suffix.
93	1		date = date[0:-9]
94	1		tmp = datetime.datetime.strptime(date, '%Y-%m-%d')
95	1		return tmp.toordinal()
96
97			if isinstance(date, datetime.date):
98			return date.toordinal()
99
100			if isinstance(date, int):
101			return date
102
103			raise ValueError('Unexpected type {0!s}'.format(date.__class__))
104
105			# ------------------------------------------------------------------------------------------------------------------
106	1		def _rows_date2int(self, rows):
107			"""
108			Replaces start and end dates in a row set with their integer representation
109
110			:param list[dict[str,T]] rows: The list of rows.
111			"""
112	1		for row in rows:
113			# Determine the type of dates based on the first start date.
114	1		if not self._date_type:
115	1		self._date_type = self._get_date_type(row[self._key_start_date])
116
117			# Convert dates to integers.
118	1		row[self._key_start_date] = self._date2int(row[self._key_start_date])
119	1		row[self._key_end_date] = self._date2int(row[self._key_end_date])
120
121			# ------------------------------------------------------------------------------------------------------------------
122	1		def _rows_int2date(self, rows):
123			"""
124			Replaces the integer representation of the start and end dates in the row set with values of the original date type
125
126			:param list[dict[str,T]] rows: The list of rows.
127			"""
128	1		for row in rows:
129	1		if self._date_type == 'str':
130	1		row[self._key_start_date] = datetime.date.fromordinal(row[self._key_start_date]).isoformat()
131	1		row[self._key_end_date] = datetime.date.fromordinal(row[self._key_end_date]).isoformat()
132			elif self._date_type == 'date':
133			row[self._key_start_date] = datetime.date.fromordinal(row[self._key_start_date])
134			row[self._key_end_date] = datetime.date.fromordinal(row[self._key_end_date])
135			elif self._date_type == 'int':
136			# Nothing to do.
137			pass
138			else:
139			raise ValueError('Unexpected date type {0!s}'.format(self._date_type))
140
141			# ------------------------------------------------------------------------------------------------------------------
142	1		def _rows_sort(self, rows):
143			"""
144			Returns a list of rows sorted by start and end date.
145
146			:param list[dict[str,T]] rows: The list of rows.
147
148			:rtype: list[dict[str,T]]
149			"""
150	1		return sorted(rows, key=lambda row: (row[self._key_start_date], row[self._key_end_date]))
151
152			# ------------------------------------------------------------------------------------------------------------------
153	1		@staticmethod
154	1		def _get_date_type(date):
155			"""
156			Returns the type of a date.
157
158			:param str\|datetime.date date: The date.
159
160			:rtype: str
161			"""
162	1		if isinstance(date, str):
163	1		return 'str'
164
165			if isinstance(date, datetime.date):
166			return 'date'
167
168			if isinstance(date, int):
169			return 'int'
170
171			raise ValueError('Unexpected type {0!s}'.format(date.__class__))
172
173			# ------------------------------------------------------------------------------------------------------------------
174	1		def _equal(self, row1, row2):
175			"""
176			Returns True if two rows are identical excluding start and end date. Returns False otherwise.
177
178			:param dict[str,T] row1: The first row.
179			:param dict[str,T] row2: The second row.
180
181			:rtype: bool
182			"""
183	1		for key in row1.keys():
184	1		if key not in [self._key_start_date, self._key_end_date]:
185	1		if row1[key] != row2[key]:
186	1		return False
187
188	1		return True
189
190			# ------------------------------------------------------------------------------------------------------------------
191	1		def _merge_adjacent_rows(self, rows):
192			"""
193			Resolves adjacent and overlapping rows. Overlapping rows are resolved as follows:
194			* The interval with the most recent begin date prevails for the overlapping period.
195			* If the begin dates are the same the interval with the most recent end date prevails.
196			* If the begin and end dates are equal the last row in the data set prevails.
197			Identical (excluding begin and end date) adjacent rows are replaced with a single row.
198
199			:param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key).
200			.
201			:rtype: list[dict[str,T]]
202			"""
203	1		ret = list()
204
205	1		prev_row = None
206	1		for row in rows:
207	1		if prev_row:
208	1		relation = Allen.relation(prev_row[self._key_start_date],
209			prev_row[self._key_end_date],
210			row[self._key_start_date],
211			row[self._key_end_date])
212	1		if relation is None:
213			# row holds an invalid interval (prev_row always holds a valid interval). Hence, the join is empty.
214	1		return []
215
216	1		elif relation == Allen.X_BEFORE_Y:
217			# Two rows with distinct intervals.
218			# prev_row: \|----\|
219			# row: \|-----\|
220	1		ret.append(prev_row)
221	1		prev_row = row
222
223	1		elif relation == Allen.X_MEETS_Y:
224			# The two rows are adjacent.
225			# prev_row: \|-------\|
226			# row: \|-------\|
227	1		if self._equal(prev_row, row):
228			# The two rows are identical (except for start and end date) and adjacent. Combine the two rows
229			# into one row.
230	1		prev_row[self._key_end_date] = row[self._key_end_date]
231			else:
232			# Rows are adjacent but not identical.
233	1		ret.append(prev_row)
234	1		prev_row = row
235
236	1		elif relation == Allen.X_OVERLAPS_WITH_Y:
237			# prev_row overlaps row. Should not occur with proper reference data.
238			# prev_row: \|-----------\|
239			# row: \|----------\|
240	1		if self._equal(prev_row, row):
241			# The two rows are identical (except for start and end date) and overlapping. Combine the two
242			# rows into one row.
243	1		prev_row[self._key_end_date] = row[self._key_end_date]
244			else:
245			# Rows are overlapping but not identical.
246	1		prev_row[self._key_end_date] = row[self._key_start_date] - 1
247	1		ret.append(prev_row)
248	1		prev_row = row
249
250	1		elif relation == Allen.X_STARTS_Y:
251			# prev_row start row. Should not occur with proper reference data.
252			# prev_row: \|------\|
253			# row: \|----------------\|
254	1		prev_row = row
255
256	1		elif relation == Allen.X_EQUAL_Y:
257			# Can happen when the reference data sets are joined without respect for date intervals.
258			# prev_row: \|----------------\|
259			# row: \|----------------\|
260	1		prev_row = row
261
262	1		elif relation == Allen.X_DURING_Y_INVERSE:
263			# row during prev_row. Should not occur with proper reference data.
264			# prev_row: \|----------------\|
265			# row: \|------\|
266			# Note: the interval with the most recent start date prevails. Hence, the interval after
267			# row[self._key_end_date] is discarded.
268	1		if self._equal(prev_row, row):
269	1		prev_row[self._key_end_date] = row[self._key_end_date]
270			else:
271	1		prev_row[self._key_end_date] = row[self._key_start_date] - 1
272	1		ret.append(prev_row)
273	1		prev_row = row
274
275	1		elif relation == Allen.X_FINISHES_Y_INVERSE:
276			# row finishes prev_row. Should not occur with proper reference data.
277			# prev_row: \|----------------\|
278			# row: \|------\|
279	1		if not self._equal(prev_row, row):
280	1		prev_row[self._key_end_date] = row[self._key_start_date] - 1
281	1		ret.append(prev_row)
282	1		prev_row = row
283
284			# Note: if the two rows are identical (except for start and end date) nothing to do.
285			else:
286			# Note: The rows are sorted such that prev_row[self._key_begin_date] <= row[self._key_begin_date].
287			# Hence the following relation should not occur: X_DURING_Y, X_FINISHES_Y, X_BEFORE_Y_INVERSE,
288			# X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
289			# relations in Allen's interval algebra.
290			raise ValueError('Data is not sorted properly. Relation: {0}'.format(relation))
291
292	1		elif row[self._key_start_date] <= row[self._key_end_date]:
293			# row is the first valid row.
294	1		prev_row = row
295
296	1		if prev_row:
297	1		ret.append(prev_row)
298
299	1		return ret
300
301			# ------------------------------------------------------------------------------------------------------------------
302	1		def enumerate(self, name, start=1):
303			"""
304			Enumerates all rows such that the pseudo key and the ordinal number are a unique key.
305
306			:param str name: The key holding the ordinal number.
307			:param int start: The start of the ordinal numbers. Foreach pseudo key the first row has this ordinal number.
308			"""
309	1		for pseudo_key, rows in self._rows.items():
310	1		rows = self._rows_sort(rows)
311	1		ordinal = start
312	1		for row in rows:
313	1		row[name] = ordinal
314	1		ordinal += 1
315	1		self._rows[pseudo_key] = rows
316
317			# ------------------------------------------------------------------------------------------------------------------
318	1		def get_rows(self, sort=False):
319			"""
320			Returns the rows of this Type2Helper.
321
322			:param bool sort: If True the rows are sorted by the pseudo key.
323			"""
324	1		ret = []
325	1		for _, rows in sorted(self._rows.items()) if sort else self._rows.items():
326	1		self._rows_int2date(rows)
327	1		ret.extend(rows)
328
329	1		return ret
330
331			# ------------------------------------------------------------------------------------------------------------------
332	1		def prepare_data(self, rows):
333			"""
334			Sets and prepares the rows. The rows are stored in groups in a dictionary. A group is a list of rows with the
335			same pseudo key. The key in the dictionary is a tuple with the values of the pseudo key.
336
337			:param list[dict] rows: The rows
338			"""
339	1		self._rows = dict()
340	1		for row in copy.copy(rows) if self.copy else rows:
341	1		pseudo_key = self._get_pseudo_key(row)
342	1		if pseudo_key not in self._rows:
343	1		self._rows[pseudo_key] = list()
344	1		self._rows[pseudo_key].append(row)
345
346			# Convert begin and end dates to integers.
347	1		self._date_type = None
348	1		for pseudo_key, rows in self._rows.items():
349	1		self._rows_date2int(rows)
350
351			# ----------------------------------------------------------------------------------------------------------------------
352

PyETLT / etlt

Type2Helper._merge_adjacent_rows() F last analyzed 2020-10-26 17:12 UTC

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like

Type2Helper._merge_adjacent_rows() F
last analyzed 2020-10-26 17:12 UTC