Type2JoinHelper._merge_adjacent_rows() - Code Metrics - Inspection of "Code enhancements." - PyETLT/etlt - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( da0908...c8344f )

by P.R.

created 2016-07-24 10:10 UTC

Type2JoinHelper._merge_adjacent_rows() F

↳ Parent: Type2JoinHelper

Complexity

Conditions

Size

Total Lines

104

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	39
CRAP Score	15.0035

Importance

Changes

Metric	Value
c	0
b	0
f	0
dl	0
loc	104
ccs	39
cts	40
cp	0.975
rs	2
cc	15
crap	15.0035

How to fix Long Method Complexity

"""
ETLT

Copyright 2016 Set Based IT Consultancy

Licence MIT
"""
from etlt.helper.Allen import Allen
from etlt.helper.Type2Helper import Type2Helper


class Type2JoinHelper(Type2Helper):
    """
    A helper class for joining data sets with date intervals.

    The class works on ``self.rows``, a mapping from natural key to the list of rows of that natural key. Each row is a
    dict holding, among other columns, a start date under ``self._key_start_date`` and an end date under
    ``self._key_end_date``.
    """

    # ------------------------------------------------------------------------------------------------------------------
    def _equal(self, row1, row2):
        """
        Returns True if two rows are identical excluding start and end date. Returns False otherwise.

        Only the keys of the first row are inspected; a key of the first row that is missing in the second row raises
        a KeyError.

        :param dict[str,T] row1: The first row.
        :param dict[str,T] row2: The second row.

        :rtype: bool
        """
        date_keys = (self._key_start_date, self._key_end_date)

        return not any(row1[key] != row2[key] for key in row1 if key not in date_keys)

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _intersect(start1, end1, start2, end2):
        """
        Returns the intersection of two intervals. Returns (None,None) if the intersection is empty.

        Both intervals are closed: an interval with start equal to end is a valid, non-empty interval.

        :param int start1: The start date of the first interval.
        :param int end1: The end date of the first interval.
        :param int start2: The start date of the second interval.
        :param int end2: The end date of the second interval.

        :rtype: tuple[int|None,int|None]
        """
        start = max(start1, start2)
        end = min(end1, end2)

        if start > end:
            return None, None

        return start, end

    # ------------------------------------------------------------------------------------------------------------------
    def _additional_rows_date2int(self, keys, rows):
        """
        Replaces start and end dates of the additional date intervals in the row set with their integer representation.

        The rows are modified in place. The values under self._key_start_date and self._key_end_date are left
        untouched by this method.

        :param list[tuple[str,str]] keys: The other keys with start and end date.
        :param list[dict[str,T]] rows: The list of rows.

        :rtype: None
        """
        own_keys = (self._key_start_date, self._key_end_date)

        for row in rows:
            for key_start_date, key_end_date in keys:
                if key_start_date not in own_keys:
                    row[key_start_date] = self._date2int(row[key_start_date])
                if key_end_date not in own_keys:
                    row[key_end_date] = self._date2int(row[key_end_date])

    # ------------------------------------------------------------------------------------------------------------------
    def _intersection(self, keys, rows):
        """
        Computes the intersection of the date intervals of two or more reference data sets. If the intersection is empty
        the row is removed from the group.

        The rows are modified in place: the additional start and end date columns are removed and the start and end
        date of the row are replaced with the start and end date of the intersection.

        :param list[tuple[str,str]] keys: The other keys with start and end date.
        :param list[dict[str,T]] rows: The list of rows.

        :rtype: list[dict[str,T]]
        """
        own_keys = (self._key_start_date, self._key_end_date)

        ret = []
        for row in rows:
            start_date = row[self._key_start_date]
            end_date = row[self._key_end_date]
            for key_start_date, key_end_date in keys:
                start_date, end_date = self._intersect(start_date, end_date, row[key_start_date], row[key_end_date])

                # An empty intersection is signalled with None. Do not test for truthiness here: a start date with
                # integer value 0 is a valid date.
                if start_date is None:
                    break

                if key_start_date not in own_keys:
                    del row[key_start_date]
                if key_end_date not in own_keys:
                    del row[key_end_date]

            if start_date is not None:
                row[self._key_start_date] = start_date
                row[self._key_end_date] = end_date
                ret.append(row)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def _merge_pair(self, prev_row, row):
        """
        Resolves the relation between two consecutive rows of a group.

        Returns a tuple with two elements:
        * The row that is completed and must be added to the result, or None if no row is completed.
        * The row that must be compared with the next row in the group.

        The end date of prev_row may be modified in place.

        :param dict[str,T] prev_row: The row that is being built (the row with the earliest start date).
        :param dict[str,T] row: The next row in the group.

        :rtype: tuple[dict[str,T]|None,dict[str,T]]
        """
        key_start = self._key_start_date
        key_end = self._key_end_date

        relation = Allen.relation(prev_row[key_start], prev_row[key_end], row[key_start], row[key_end])

        if relation == Allen.X_BEFORE_Y:
            # Two rows with distinct intervals.
            # prev_row: |----|
            # row:                 |-----|
            return prev_row, row

        if relation == Allen.X_MEETS_Y:
            # The two rows are adjacent.
            # prev_row: |-------|
            # row:               |-------|
            if self._equal(prev_row, row):
                # The two rows are identical (except for start and end date) and adjacent. Combine the two rows into
                # one row.
                prev_row[key_end] = row[key_end]
                return None, prev_row

            # Rows are adjacent but not identical.
            return prev_row, row

        if relation in (Allen.X_OVERLAPS_WITH_Y, Allen.X_DURING_Y_INVERSE):
            # prev_row overlaps row or row during prev_row. Should not occur with proper reference data.
            # prev_row: |-----------|               prev_row: |----------------|
            # row:            |----------|          row:           |------|
            # Note: the interval with the most recent start date prevails. Hence, when row is during prev_row the
            # interval after row[self._key_end_date] is discarded.
            if self._equal(prev_row, row):
                # The two rows are identical (except for start and end date). Combine the two rows into one row
                # ending at the end date of row.
                prev_row[key_end] = row[key_end]
                return None, prev_row

            # Rows are not identical: prev_row ends the day before row starts.
            prev_row[key_end] = row[key_start] - 1
            return prev_row, row

        if relation in (Allen.X_STARTS_Y, Allen.X_EQUAL_Y):
            # prev_row starts row (should not occur with proper reference data) or both intervals are equal (can
            # happen when the reference data sets are joined without respect for date intervals).
            # prev_row: |------|                    prev_row: |----------------|
            # row:      |----------------|          row:      |----------------|
            # In both cases row prevails and prev_row is discarded.
            return None, row

        if relation == Allen.X_FINISHES_Y_INVERSE:
            # row finishes prev_row. Should not occur with proper reference data.
            # prev_row: |----------------|
            # row:                |------|
            if self._equal(prev_row, row):
                # The two rows are identical (except for start and end date): prev_row already covers row.
                return None, prev_row

            prev_row[key_end] = row[key_start] - 1
            return prev_row, row

        # Note: The rows are sorted such that prev_row[self._key_start_date] <= row[self._key_start_date].
        # Hence the following relation should not occur: X_DURING_Y,  X_FINISHES_Y, X_BEFORE_Y_INVERSE,
        # X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
        # relations in Allen's interval algebra.
        # %s (not %d) is used such that a relation that is not a number is still reported with a ValueError.
        raise ValueError('Data is not sorted properly. Relation: %s' % relation)

    # ------------------------------------------------------------------------------------------------------------------
    def _merge_adjacent_rows(self, rows):
        """
        Resolves adjacent and overlapping rows. With proper reference data overlapping rows MUST not occur. However,
        this method can handle overlapping rows. Overlapping rows are resolved as follows:
        * The interval with the most recent begin date prevails for the overlapping period.
        * If the begin dates are the same the interval with the most recent end date prevails.
        * If the begin and end dates are equal the last row in the data set prevails.
        Identical (excluding begin and end date) adjacent rows are replaced with a single row.

        The end dates of the rows may be modified in place.

        :param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key), sorted on start date.

        :rtype: list[dict[str,T]]

        :raises ValueError: If the rows are not sorted properly.
        """
        ret = []

        prev_row = None
        for row in rows:
            if prev_row is None:
                # The first row of the group: nothing to compare with yet.
                prev_row = row
                continue

            completed, prev_row = self._merge_pair(prev_row, row)
            if completed is not None:
                ret.append(completed)

        if prev_row is not None:
            ret.append(prev_row)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def merge(self, keys):
        """
        Merges the join on natural keys of two or more reference data sets.

        For each natural key the rows are replaced with the merged rows. Natural keys for which no row with a non-empty
        date interval remains are removed from self.rows.

        :param list[tuple[str,str]] keys: For each data set the keys of the start and end date.
        """
        # Natural keys are removed after the loop: the size of a dict must not change while iterating over it.
        deletes = []
        for natural_key, rows in self.rows.items():
            self._additional_rows_date2int(keys, rows)
            rows = self._intersection(keys, rows)
            if rows:
                rows = self._rows_sort(rows)
                self.rows[natural_key] = self._merge_adjacent_rows(rows)
            else:
                deletes.append(natural_key)

        for natural_key in deletes:
            del self.rows[natural_key]

# ----------------------------------------------------------------------------------------------------------------------


1		"""
2		ETLT
3
4		Copyright 2016 Set Based IT Consultancy
5
6		Licence MIT
7		"""
8	1	from etlt.helper.Allen import Allen
9	1	from etlt.helper.Type2Helper import Type2Helper
10
11
12	1	class Type2JoinHelper(Type2Helper):
13		"""
14		A helper class for joining data sets with date intervals.
15		"""
16
17		# ------------------------------------------------------------------------------------------------------------------
18	1	def _equal(self, row1, row2):
19		"""
20		Returns True if two rows are identical excluding start and end date. Returns False otherwise.
21
22		:param dict[str,T] row1: The first row.
23		:param dict[str,T] row2: The second row.
24
25		:rtype: bool
26		"""
27	1	for key in row1.keys():
28	1	if key not in [self._key_start_date, self._key_end_date]:
29	1	if row1[key] != row2[key]:
30	1	return False
31
32	1	return True
33
34		# ------------------------------------------------------------------------------------------------------------------
35	1	@staticmethod
36		def _intersect(start1, end1, start2, end2):
37		"""
38		Returns the intersection of two intervals. Returns (None,None) if the intersection is empty.
39
40		:param int start1: The start date of the first interval.
41		:param int end1: The end date of the first interval.
42		:param int start2: The start date of the second interval.
43		:param int end2: The end date of the second interval.
44
45		:rtype: tuple[int\|None,int\|None]
46		"""
47	1	start = max(start1, start2)
48	1	end = min(end1, end2)
49
50	1	if start > end:
51	1	return None, None
52
53	1	return start, end
54
55		# ------------------------------------------------------------------------------------------------------------------
56	1	def _additional_rows_date2int(self, keys, rows):
57		"""
58		Replaces start and end dates of the additional date intervals in the row set with their integer representation
59
60		:param list[tuple[str,str]] keys: The other keys with start and end date.
61		:param list[dict[str,T]] rows: The list of rows.
62
63		:rtype: None
64		"""
65	1	for row in rows:
66	1	for key_start_date, key_end_date in keys:
67	1	if key_start_date not in [self._key_start_date, self._key_end_date]:
68	1	row[key_start_date] = self._date2int(row[key_start_date])
69	1	if key_end_date not in [self._key_start_date, self._key_end_date]:
70	1	row[key_end_date] = self._date2int(row[key_end_date])
71
72		# ------------------------------------------------------------------------------------------------------------------
73	1	def _intersection(self, keys, rows):
74		"""
75		Computes the intersection of the date intervals of two or more reference data sets. If the intersection is empty
76		the row is removed from the group.
77
78		:param list[tuple[str,str]] keys: The other keys with start and end date.
79		:param list[dict[str,T]] rows: The list of rows.
80
81		:rtype: list[dict[str,T]]
82		"""
83	1	ret = list()
84	1	for row in rows:
85	1	start_date = row[self._key_start_date]
86	1	end_date = row[self._key_end_date]
87	1	for key_start_date, key_end_date in keys:
88	1	start_date, end_date = Type2JoinHelper._intersect(start_date,
89		end_date,
90		row[key_start_date],
91		row[key_end_date])
92	1	if not start_date:
93	1	break
94	1	if key_start_date not in [self._key_start_date, self._key_end_date]:
95	1	del row[key_start_date]
96	1	if key_end_date not in [self._key_start_date, self._key_end_date]:
97	1	del row[key_end_date]
98
99	1	if start_date:
100	1	row[self._key_start_date] = start_date
101	1	row[self._key_end_date] = end_date
102	1	ret.append(row)
103
104	1	return ret
105
106		# ------------------------------------------------------------------------------------------------------------------
107	1	def _merge_adjacent_rows(self, rows):
108		"""
109		Resolves adjacent and overlapping rows. With proper reference data overlapping rows MUST not occur. However,
110		this method can handle overlapping rows. Overlapping rows are resolved as follows:
111		* The interval with the most recent begin date prevails for the overlapping period.
112		* If the begin dates are the same the interval with the most recent end date prevails.
113		* If the begin and end dates are equal the last row in the data set prevails.
114		Identical (excluding begin and end date) adjacent rows are replaced with a single row.
115
116		:param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key).
117		.
118		:rtype: list[dict[str,T]]
119		"""
120	1	ret = list()
121
122	1	prev_row = None
123	1	for row in rows:
124	1	if prev_row:
125	1	relation = Allen.relation(prev_row[self._key_start_date],
126		prev_row[self._key_end_date],
127		row[self._key_start_date],
128		row[self._key_end_date])
129	1	if relation == Allen.X_BEFORE_Y:
130		# Two rows with distinct intervals.
131		# prev_row: \|----\|
132		# row: \|-----\|
133	1	ret.append(prev_row)
134	1	prev_row = row
135
136	1	elif relation == Allen.X_MEETS_Y:
137		# The two rows are adjacent.
138		# prev_row: \|-------\|
139		# row: \|-------\|
140	1	if self._equal(prev_row, row):
141		# The two rows are identical (except for start and end date) and adjacent. Combine the two rows
142		# into one row.
143	1	prev_row[self._key_end_date] = row[self._key_end_date]
144		else:
145		# Rows are adjacent but not identical.
146	1	ret.append(prev_row)
147	1	prev_row = row
148
149	1	elif relation == Allen.X_OVERLAPS_WITH_Y:
150		# prev_row overlaps row. Should not occur with proper reference data.
151		# prev_row: \|-----------\|
152		# row: \|----------\|
153	1	if self._equal(prev_row, row):
154		# The two rows are identical (except for start and end date) and overlapping. Combine the two
155		# rows into one row.
156	1	prev_row[self._key_end_date] = row[self._key_end_date]
157		else:
158		# Rows are overlapping but not identical.
159	1	prev_row[self._key_end_date] = row[self._key_start_date] - 1
160	1	ret.append(prev_row)
161	1	prev_row = row
162
163	1	elif relation == Allen.X_STARTS_Y:
164		# prev_row starts row. Should not occur with proper reference data.
165		# prev_row: \|------\|
166		# row: \|----------------\|
167	1	prev_row = row
168
169	1	elif relation == Allen.X_EQUAL_Y:
170		# Can happen when the reference data sets are joined without respect for date intervals.
171		# prev_row: \|----------------\|
172		# row: \|----------------\|
173	1	prev_row = row
174
175	1	elif relation == Allen.X_DURING_Y_INVERSE:
176		# row during prev_row. Should not occur with proper reference data.
177		# prev_row: \|----------------\|
178		# row: \|------\|
179		# Note: the interval with the most recent start date prevails. Hence, the interval after
180		# row[self._key_end_date] is discarded.
181	1	if self._equal(prev_row, row):
182	1	prev_row[self._key_end_date] = row[self._key_end_date]
183		else:
184	1	prev_row[self._key_end_date] = row[self._key_start_date] - 1
185	1	ret.append(prev_row)
186	1	prev_row = row
187
188	1	elif relation == Allen.X_FINISHES_Y_INVERSE:
189		# row finishes prev_row. Should not occur with proper reference data.
190		# prev_row: \|----------------\|
191		# row: \|------\|
192	1	if not self._equal(prev_row, row):
193	1	prev_row[self._key_end_date] = row[self._key_start_date] - 1
194	1	ret.append(prev_row)
195	1	prev_row = row
196
197		# Note: if the two rows are identical (except for start and end date) nothing to do.
198		else:
199		# Note: The rows are sorted such that prev_row[self._key_start_date] <= row[self._key_start_date].
200		# Hence the following relation should not occur: X_DURING_Y, X_FINISHES_Y, X_BEFORE_Y_INVERSE,
201		# X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
202		# relations in Allen's interval algebra.
203		raise ValueError('Data is not sorted properly. Relation: %d' % relation)
204		else:
205	1	prev_row = row
206
207	1	if prev_row:
208	1	ret.append(prev_row)
209
210	1	return ret
211
212		# ------------------------------------------------------------------------------------------------------------------
213	1	def merge(self, keys):
214		"""
215		Merges the join on natural keys of two or more reference data sets.
216
217		:param list[tuple[str,str]] keys: For each data set the keys of the start and end date.
218		"""
219	1	deletes = []
220	1	for natural_key, rows in self.rows.items():
221	1	self._additional_rows_date2int(keys, rows)
222	1	rows = self._intersection(keys, rows)
223	1	if rows:
224	1	rows = self._rows_sort(rows)
225	1	self.rows[natural_key] = self._merge_adjacent_rows(rows)
226		else:
227	1	deletes.append(natural_key)
228
229	1	for natural_key in deletes:
230	1	del self.rows[natural_key]
231
232		# ----------------------------------------------------------------------------------------------------------------------
233

PyETLT / etlt

Push — master ( da0908...c8344f )

Type2JoinHelper._merge_adjacent_rows() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like