etlt.helper.Type2Helper - Code Metrics - Inspection of "Move repository." - PyETLT/etlt - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Branch — master (17b603)

by P.R.

created 2020-04-19 20:26 UTC

etlt.helper.Type2Helper C

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	357
Duplicated Lines	96.08 %

Test Coverage

Coverage

86.92%

Importance

Changes

Metric	Value
eloc	134
dl	343
loc	357
ccs	113
cts	130
cp	0.8692
rs	6
c	0
b	0
f	0
wmc	55

12 Methods

Rating	Name	Duplication	Size	Complexity
A	Type2Helper.__init__()	44	44	1
A	Type2Helper.enumerate()	14	14	3
A	Type2Helper._rows_date2int()	14	14	3
A	Type2Helper._rows_sort()	9	9	2
F	Type2Helper._merge_adjacent_rows()	109	109	17
B	Type2Helper._date2int()	23	23	6
A	Type2Helper._rows_int2date()	18	18	5
A	Type2Helper._get_pseudo_key()	13	13	2
A	Type2Helper._equal()	15	15	4
A	Type2Helper.get_rows()	12	12	3
A	Type2Helper.prepare_data()	18	18	5
A	Type2Helper._get_date_type()	19	19	4

How to fix Duplicated Code Complexity

"""
ETLT

Copyright 2016 Set Based IT Consultancy

Licence MIT
"""
import copy
import datetime

from etlt.helper.Allen import Allen


class Type2Helper:

    """
    A helper class for reference data with date intervals.

    Rows are dictionaries. Rows are grouped by their pseudo key and, while they are held by this helper, their start
    and end dates are stored as integers (ordinals). The dates are converted back to their original type by get_rows().
    """

    # ------------------------------------------------------------------------------------------------------------------
    def __init__(self, key_start_date, key_end_date, pseudo_key):
        """
        Object constructor.

        :param str key_start_date: The key of the start date in the rows.
        :param str key_end_date: The key of the end date in the rows.
        :param list[str] pseudo_key: The keys of the columns that form the pseudo key.
        """
        self.copy = True
        """
        If set to true a copy will be made from the original rows such that the original rows are not modified.

         :type: bool
        """

        self._pseudo_key = list(pseudo_key)
        """
        The keys of the columns that form the pseudo key.

        :type: list[str]
        """

        self._key_end_date = key_end_date
        """
        The key of the end date in the rows.

        :type: str
        """
        self._key_start_date = key_start_date
        """
        The key of the start date in the rows.

        :type: str
        """

        self._rows = dict()
        """
        The data set. The keys are pseudo key tuples, the values are the lists of rows with that pseudo key.

        :type: dict
        """

        self._date_type = ''
        """
        The type of the date fields.
        - date for datetime.date objects
        - str  for strings in ISO 8601 (YYYY-MM-DD) format
        - int for integers

        :type: str
        """

    # ------------------------------------------------------------------------------------------------------------------
    def _get_pseudo_key(self, row):
        """
        Returns the pseudo key in a row.

        :param dict row: The row.

        :rtype: tuple
        """
        return tuple(row[key] for key in self._pseudo_key)

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _date2int(date):
        """
        Returns an integer representation of a date.

        Strings must be in YYYY-MM-DD format; a midnight time suffix (' 00:00:00' or 'T00:00:00') is ignored. Dates are
        converted to their proleptic Gregorian ordinal. Integers are returned as is.

        :param str|datetime.date|int date: The date.

        :rtype: int

        :raises ValueError: If the date is of an unsupported type or a string that is not in YYYY-MM-DD format.
        """
        if isinstance(date, str):
            if date.endswith((' 00:00:00', 'T00:00:00')):
                # Ignore time suffix.
                date = date[0:-9]
            return datetime.datetime.strptime(date, '%Y-%m-%d').toordinal()

        if isinstance(date, datetime.date):
            return date.toordinal()

        if isinstance(date, int):
            return date

        raise ValueError('Unexpected type {0!s}'.format(date.__class__))

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_date2int(self, rows):
        """
        Replaces (in place) start and end dates in a row set with their integer representation.

        :param list[dict[str,T]] rows: The list of rows.
        """
        for row in rows:
            # Determine the type of dates based on the first start date.
            if not self._date_type:
                self._date_type = self._get_date_type(row[self._key_start_date])

            # Convert dates to integers.
            row[self._key_start_date] = self._date2int(row[self._key_start_date])
            row[self._key_end_date] = self._date2int(row[self._key_end_date])

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_int2date(self, rows):
        """
        Replaces (in place) the integer start and end dates in the row set with dates of the original date type. This
        is the inverse of _rows_date2int().

        :param list[dict[str,T]] rows: The list of rows.

        :raises ValueError: If the date type of this helper is not one of 'str', 'date', or 'int'.
        """
        for row in rows:
            if self._date_type == 'str':
                row[self._key_start_date] = datetime.date.fromordinal(row[self._key_start_date]).isoformat()
                row[self._key_end_date] = datetime.date.fromordinal(row[self._key_end_date]).isoformat()
            elif self._date_type == 'date':
                row[self._key_start_date] = datetime.date.fromordinal(row[self._key_start_date])
                row[self._key_end_date] = datetime.date.fromordinal(row[self._key_end_date])
            elif self._date_type == 'int':
                # Nothing to do.
                pass
            else:
                raise ValueError('Unexpected date type {0!s}'.format(self._date_type))

    # ------------------------------------------------------------------------------------------------------------------
    def _rows_sort(self, rows):
        """
        Returns a list of rows sorted by start and end date.

        :param list[dict[str,T]] rows: The list of rows.

        :rtype: list[dict[str,T]]
        """
        return sorted(rows, key=lambda row: (row[self._key_start_date], row[self._key_end_date]))

    # ------------------------------------------------------------------------------------------------------------------
    @staticmethod
    def _get_date_type(date):
        """
        Returns the type of a date: 'str', 'date', or 'int'.

        :param str|datetime.date|int date: The date.

        :rtype: str

        :raises ValueError: If the date is of an unsupported type.
        """
        if isinstance(date, str):
            return 'str'

        if isinstance(date, datetime.date):
            return 'date'

        if isinstance(date, int):
            return 'int'

        raise ValueError('Unexpected type {0!s}'.format(date.__class__))

    # ------------------------------------------------------------------------------------------------------------------
    def _equal(self, row1, row2):
        """
        Returns True if two rows are identical excluding start and end date. Returns False otherwise.

        Only the keys of the first row are compared.

        :param dict[str,T] row1: The first row.
        :param dict[str,T] row2: The second row.

        :rtype: bool
        """
        date_keys = (self._key_start_date, self._key_end_date)
        for key in row1:
            if key not in date_keys and row1[key] != row2[key]:
                return False

        return True

    # ------------------------------------------------------------------------------------------------------------------
    def _merge_adjacent_rows(self, rows):
        """
        Resolves adjacent and overlapping rows. Overlapping rows are resolved as follows:
        * The interval with the most recent begin date prevails for the overlapping period.
        * If the begin dates are the same the interval with the most recent end date prevails.
        * If the begin and end dates are equal the last row in the data set prevails.
        Identical (excluding begin and end date) adjacent rows are replaced with a single row.

        The rows must be sorted by start and end date and must hold integer dates. The end dates of the rows may be
        modified in place.

        :param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key).

        :rtype: list[dict[str,T]]

        :raises ValueError: If the rows are not sorted properly.
        """
        ret = list()

        prev_row = None
        for row in rows:
            if prev_row is None:
                # Skip leading rows with an invalid interval; the first valid row becomes prev_row.
                if row[self._key_start_date] <= row[self._key_end_date]:
                    prev_row = row
                continue

            relation = Allen.relation(prev_row[self._key_start_date],
                                      prev_row[self._key_end_date],
                                      row[self._key_start_date],
                                      row[self._key_end_date])
            if relation is None:
                # row holds an invalid interval (prev_row always holds a valid interval). Hence, the join is empty.
                return []

            if relation == Allen.X_BEFORE_Y:
                # Two rows with distinct intervals.
                # prev_row: |----|
                # row:                 |-----|
                ret.append(prev_row)
                prev_row = row

            elif relation == Allen.X_MEETS_Y:
                # The two rows are adjacent.
                # prev_row: |-------|
                # row:               |-------|
                if self._equal(prev_row, row):
                    # The two rows are identical (except for start and end date) and adjacent. Combine the two rows
                    # into one row.
                    prev_row[self._key_end_date] = row[self._key_end_date]
                else:
                    # Rows are adjacent but not identical.
                    ret.append(prev_row)
                    prev_row = row

            elif relation in (Allen.X_OVERLAPS_WITH_Y, Allen.X_DURING_Y_INVERSE):
                # prev_row overlaps row or row during prev_row. Should not occur with proper reference data.
                # prev_row: |-----------|                 prev_row: |----------------|
                # row:            |----------|            row:           |------|
                # Note: the interval with the most recent start date prevails. Hence, when row is during prev_row the
                # interval after row[self._key_end_date] is discarded.
                if self._equal(prev_row, row):
                    # The two rows are identical (except for start and end date). Combine the two rows into one row
                    # ending at the end date of row.
                    prev_row[self._key_end_date] = row[self._key_end_date]
                else:
                    # Rows are not identical: prev_row ends the day before row starts.
                    prev_row[self._key_end_date] = row[self._key_start_date] - 1
                    ret.append(prev_row)
                    prev_row = row

            elif relation in (Allen.X_STARTS_Y, Allen.X_EQUAL_Y):
                # prev_row starts row (should not occur with proper reference data) or both rows have the same
                # interval (can happen when the reference data sets are joined without respect for date intervals).
                # prev_row: |------|                      prev_row: |----------------|
                # row:      |----------------|            row:      |----------------|
                # In both cases the last row prevails.
                prev_row = row

            elif relation == Allen.X_FINISHES_Y_INVERSE:
                # row finishes prev_row. Should not occur with proper reference data.
                # prev_row: |----------------|
                # row:                |------|
                # Note: if the two rows are identical (except for start and end date) nothing to do.
                if not self._equal(prev_row, row):
                    prev_row[self._key_end_date] = row[self._key_start_date] - 1
                    ret.append(prev_row)
                    prev_row = row

            else:
                # Note: The rows are sorted such that prev_row[self._key_start_date] <= row[self._key_start_date].
                # Hence the following relation should not occur: X_DURING_Y,  X_FINISHES_Y, X_BEFORE_Y_INVERSE,
                # X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
                # relations in Allen's interval algebra.
                raise ValueError('Data is not sorted properly. Relation: {0}'.format(relation))

        if prev_row is not None:
            ret.append(prev_row)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def enumerate(self, name, start=1):
        """
        Enumerates all rows such that the pseudo key and the ordinal number are a unique key. Per pseudo key the rows
        are sorted by start and end date before they are numbered.

        :param str name: The key holding the ordinal number.
        :param int start: The start of the ordinal numbers. Foreach pseudo key the first row has this ordinal number.
        """
        for pseudo_key, rows in self._rows.items():
            sorted_rows = self._rows_sort(rows)
            ordinal = start
            for row in sorted_rows:
                row[name] = ordinal
                ordinal += 1
            # Replacing the value of an existing key is safe while iterating over the dictionary.
            self._rows[pseudo_key] = sorted_rows

    # ------------------------------------------------------------------------------------------------------------------
    def get_rows(self, sort=False):
        """
        Returns the rows of this Type2Helper with the start and end dates converted back to their original type.

        Note: the rows held by this helper are converted in place. Hence, for string and date data this method must not
        be called a second time without calling prepare_data() first.

        :param bool sort: If True the rows are sorted by the pseudo key.

        :rtype: list[dict]
        """
        ret = []
        for _, rows in sorted(self._rows.items()) if sort else self._rows.items():
            self._rows_int2date(rows)
            ret.extend(rows)

        return ret

    # ------------------------------------------------------------------------------------------------------------------
    def prepare_data(self, rows):
        """
        Sets and prepares the rows. The rows are stored in groups in a dictionary. A group is a list of rows with the
        same pseudo key. The key in the dictionary is a tuple with the values of the pseudo key.

        If self.copy is true each row is copied such that the original rows are not modified. Otherwise the original
        rows are stored and modified in place.

        :param list[dict] rows: The rows
        """
        self._rows = dict()
        for row in rows:
            if self.copy:
                # Copy the row itself: a copy of the list only would still share (and modify) the original rows.
                row = copy.copy(row)
            self._rows.setdefault(self._get_pseudo_key(row), list()).append(row)

        # Convert begin and end dates to integers.
        self._date_type = None
        for group in self._rows.values():
            self._rows_date2int(group)

# ----------------------------------------------------------------------------------------------------------------------


1			"""
2			ETLT
3
4			Copyright 2016 Set Based IT Consultancy
5
6			Licence MIT
7			"""
8	1		import copy
9	1		import datetime
10
11	1		from etlt.helper.Allen import Allen
12
13
14	1	View Code Duplication	class Type2Helper:
			0 ignored issues – show Duplication introduced 2020-04-19 20:27 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
15			"""
16			A helper class for reference data with date intervals.
17			"""
18
19			# ------------------------------------------------------------------------------------------------------------------
20	1		def __init__(self, key_start_date, key_end_date, pseudo_key):
21			"""
22			Object constructor.
23
24			:param str key_start_date: The key of the start date in the rows.
25			:param str key_end_date: The key of the end date in the rows.
26			:param list[str] pseudo_key: The keys of the columns that form the pseudo key.
27			"""
28	1		self.copy = True
29			"""
30			If set to true a copy will be made from the original rows such that the original rows are not modified.
31
32			:type: bool
33			"""
34
35	1		self._pseudo_key = list(pseudo_key)
36			"""
37			The keys of the columns that form the pseudo key.
38
39			:type: list[str]
40			"""
41
42	1		self._key_end_date = key_end_date
43			"""
44			The key of the end date in the rows.
45
46			:type: str
47			"""
48	1		self._key_start_date = key_start_date
49			"""
50			The key of the start date in the rows.
51
52			:type: str
53			"""
54
55	1		self._rows = dict()
56			"""
57			The data set.
58
59			:type: dict
60			"""
61
62	1		self._date_type = ''
63	1		"""
64			The type of the date fields.
65			- date for datetime.date objects
66			- str for strings in ISO 8601 (YYYY-MM-DD) format
67			- int for integers
68
69			:type: str
70			"""
71
72			# ------------------------------------------------------------------------------------------------------------------
73	1		def _get_pseudo_key(self, row):
74			"""
75			Returns the pseudo key in a row.
76
77			:param dict row: The row.
78
79			:rtype: tuple
80			"""
81	1		ret = list()
82	1		for key in self._pseudo_key:
83	1		ret.append(row[key])
84
85	1		return tuple(ret)
86
87			# ------------------------------------------------------------------------------------------------------------------
88	1		@staticmethod
89	1		def _date2int(date):
90			"""
91			Returns an integer representation of a date.
92
93			:param str\|datetime.date date: The date.
94
95			:rtype: int
96			"""
97	1		if isinstance(date, str):
98	1		if date.endswith(' 00:00:00') or date.endswith('T00:00:00'):
99			# Ignore time suffix.
100	1		date = date[0:-9]
101	1		tmp = datetime.datetime.strptime(date, '%Y-%m-%d')
102	1		return tmp.toordinal()
103
104			if isinstance(date, datetime.date):
105			return date.toordinal()
106
107			if isinstance(date, int):
108			return date
109
110			raise ValueError('Unexpected type {0!s}'.format(date.__class__))
111
112			# ------------------------------------------------------------------------------------------------------------------
113	1		def _rows_date2int(self, rows):
114			"""
115			Replaces start and end dates in a row set with their integer representation
116
117			:param list[dict[str,T]] rows: The list of rows.
118			"""
119	1		for row in rows:
120			# Determine the type of dates based on the first start date.
121	1		if not self._date_type:
122	1		self._date_type = self._get_date_type(row[self._key_start_date])
123
124			# Convert dates to integers.
125	1		row[self._key_start_date] = self._date2int(row[self._key_start_date])
126	1		row[self._key_end_date] = self._date2int(row[self._key_end_date])
127
128			# ------------------------------------------------------------------------------------------------------------------
129	1		def _rows_int2date(self, rows):
130			"""
131			Replaces start and end dates in the row set with their integer representation
132
133			:param list[dict[str,T]] rows: The list of rows.
134			"""
135	1		for row in rows:
136	1		if self._date_type == 'str':
137	1		row[self._key_start_date] = datetime.date.fromordinal(row[self._key_start_date]).isoformat()
138	1		row[self._key_end_date] = datetime.date.fromordinal(row[self._key_end_date]).isoformat()
139			elif self._date_type == 'date':
140			row[self._key_start_date] = datetime.date.fromordinal(row[self._key_start_date])
141			row[self._key_end_date] = datetime.date.fromordinal(row[self._key_end_date])
142			elif self._date_type == 'int':
143			# Nothing to do.
144			pass
145			else:
146			raise ValueError('Unexpected date type {0!s}'.format(self._date_type))
147
148			# ------------------------------------------------------------------------------------------------------------------
149	1		def _rows_sort(self, rows):
150			"""
151			Returns a list of rows sorted by start and end date.
152
153			:param list[dict[str,T]] rows: The list of rows.
154
155			:rtype: list[dict[str,T]]
156			"""
157	1		return sorted(rows, key=lambda row: (row[self._key_start_date], row[self._key_end_date]))
158
159			# ------------------------------------------------------------------------------------------------------------------
160	1		@staticmethod
161	1		def _get_date_type(date):
162			"""
163			Returns the type of a date.
164
165			:param str\|datetime.date date: The date.
166
167			:rtype: str
168			"""
169	1		if isinstance(date, str):
170	1		return 'str'
171
172			if isinstance(date, datetime.date):
173			return 'date'
174
175			if isinstance(date, int):
176			return 'int'
177
178			raise ValueError('Unexpected type {0!s}'.format(date.__class__))
179
180			# ------------------------------------------------------------------------------------------------------------------
181	1		def _equal(self, row1, row2):
182			"""
183			Returns True if two rows are identical excluding start and end date. Returns False otherwise.
184
185			:param dict[str,T] row1: The first row.
186			:param dict[str,T] row2: The second row.
187
188			:rtype: bool
189			"""
190	1		for key in row1.keys():
191	1		if key not in [self._key_start_date, self._key_end_date]:
192	1		if row1[key] != row2[key]:
193	1		return False
194
195	1		return True
196
197			# ------------------------------------------------------------------------------------------------------------------
198	1		def _merge_adjacent_rows(self, rows):
199			"""
200			Resolves adjacent and overlapping rows. Overlapping rows are resolved as follows:
201			* The interval with the most recent begin date prevails for the overlapping period.
202			* If the begin dates are the same the interval with the most recent end date prevails.
203			* If the begin and end dates are equal the last row in the data set prevails.
204			Identical (excluding begin and end date) adjacent rows are replace with a single row.
205
206			:param list[dict[str,T]] rows: The rows in a group (i.e. with the same natural key).
207			.
208			:rtype: list[dict[str,T]]
209			"""
210	1		ret = list()
211
212	1		prev_row = None
213	1		for row in rows:
214	1		if prev_row:
215	1		relation = Allen.relation(prev_row[self._key_start_date],
216			prev_row[self._key_end_date],
217			row[self._key_start_date],
218			row[self._key_end_date])
219	1		if relation is None:
220			# row holds an invalid interval (prev_row always holds a valid interval). Hence, the join is empty.
221	1		return []
222
223	1		elif relation == Allen.X_BEFORE_Y:
224			# Two rows with distinct intervals.
225			# prev_row: \|----\|
226			# row: \|-----\|
227	1		ret.append(prev_row)
228	1		prev_row = row
229
230	1		elif relation == Allen.X_MEETS_Y:
231			# The two rows are adjacent.
232			# prev_row: \|-------\|
233			# row: \|-------\|
234	1		if self._equal(prev_row, row):
235			# The two rows are identical (except for start and end date) and adjacent. Combine the two rows
236			# into one row.
237	1		prev_row[self._key_end_date] = row[self._key_end_date]
238			else:
239			# Rows are adjacent but not identical.
240	1		ret.append(prev_row)
241	1		prev_row = row
242
243	1		elif relation == Allen.X_OVERLAPS_WITH_Y:
244			# prev_row overlaps row. Should not occur with proper reference data.
245			# prev_row: \|-----------\|
246			# row: \|----------\|
247	1		if self._equal(prev_row, row):
248			# The two rows are identical (except for start and end date) and overlapping. Combine the two
249			# rows into one row.
250	1		prev_row[self._key_end_date] = row[self._key_end_date]
251			else:
252			# Rows are overlapping but not identical.
253	1		prev_row[self._key_end_date] = row[self._key_start_date] - 1
254	1		ret.append(prev_row)
255	1		prev_row = row
256
257	1		elif relation == Allen.X_STARTS_Y:
258			# prev_row start row. Should not occur with proper reference data.
259			# prev_row: \|------\|
260			# row: \|----------------\|
261	1		prev_row = row
262
263	1		elif relation == Allen.X_EQUAL_Y:
264			# Can happen when the reference data sets are joined without respect for date intervals.
265			# prev_row: \|----------------\|
266			# row: \|----------------\|
267	1		prev_row = row
268
269	1		elif relation == Allen.X_DURING_Y_INVERSE:
270			# row during prev_row. Should not occur with proper reference data.
271			# prev_row: \|----------------\|
272			# row: \|------\|
273			# Note: the interval with the most recent start date prevails. Hence, the interval after
274			# row[self._key_end_date] is discarded.
275	1		if self._equal(prev_row, row):
276	1		prev_row[self._key_end_date] = row[self._key_end_date]
277			else:
278	1		prev_row[self._key_end_date] = row[self._key_start_date] - 1
279	1		ret.append(prev_row)
280	1		prev_row = row
281
282	1		elif relation == Allen.X_FINISHES_Y_INVERSE:
283			# row finishes prev_row. Should not occur with proper reference data.
284			# prev_row: \|----------------\|
285			# row: \|------\|
286	1		if not self._equal(prev_row, row):
287	1		prev_row[self._key_end_date] = row[self._key_start_date] - 1
288	1		ret.append(prev_row)
289	1		prev_row = row
290
291			# Note: if the two rows are identical (except for start and end date) nothing to do.
292			else:
293			# Note: The rows are sorted such that prev_row[self._key_begin_date] <= row[self._key_begin_date].
294			# Hence the following relation should not occur: X_DURING_Y, X_FINISHES_Y, X_BEFORE_Y_INVERSE,
295			# X_MEETS_Y_INVERSE, X_OVERLAPS_WITH_Y_INVERSE, and X_STARTS_Y_INVERSE. Hence, we covered all 13
296			# relations in Allen's interval algebra.
297			raise ValueError('Data is not sorted properly. Relation: {0}'.format(relation))
298
299	1		elif row[self._key_start_date] <= row[self._key_end_date]:
300			# row is the first valid row.
301	1		prev_row = row
302
303	1		if prev_row:
304	1		ret.append(prev_row)
305
306	1		return ret
307
308			# ------------------------------------------------------------------------------------------------------------------
309	1		def enumerate(self, name, start=1):
310			"""
311			Enumerates all rows such that the pseudo key and the ordinal number are a unique key.
312
313			:param str name: The key holding the ordinal number.
314			:param int start: The start of the ordinal numbers. Foreach pseudo key the first row has this ordinal number.
315			"""
316	1		for pseudo_key, rows in self._rows.items():
317	1		rows = self._rows_sort(rows)
318	1		ordinal = start
319	1		for row in rows:
320	1		row[name] = ordinal
321	1		ordinal += 1
322	1		self._rows[pseudo_key] = rows
323
324			# ------------------------------------------------------------------------------------------------------------------
325	1		def get_rows(self, sort=False):
326			"""
327			Returns the rows of this Type2Helper.
328
329			:param bool sort: If True the rows are sorted by the pseudo key.
330			"""
331	1		ret = []
332	1		for _, rows in sorted(self._rows.items()) if sort else self._rows.items():
333	1		self._rows_int2date(rows)
334	1		ret.extend(rows)
335
336	1		return ret
337
338			# ------------------------------------------------------------------------------------------------------------------
339	1		def prepare_data(self, rows):
340			"""
341			Sets and prepares the rows. The rows are stored in groups in a dictionary. A group is a list of rows with the
342			same pseudo key. The key in the dictionary is a tuple with the values of the pseudo key.
343
344			:param list[dict] rows: The rows
345			"""
346	1		self._rows = dict()
347	1		for row in copy.copy(rows) if self.copy else rows:
348	1		pseudo_key = self._get_pseudo_key(row)
349	1		if pseudo_key not in self._rows:
350	1		self._rows[pseudo_key] = list()
351	1		self._rows[pseudo_key].append(row)
352
353			# Convert begin and end dates to integers.
354	1		self._date_type = None
355	1		for pseudo_key, rows in self._rows.items():
356	1		self._rows_date2int(rows)
357
358			# ----------------------------------------------------------------------------------------------------------------------
359

PyETLT / etlt

Branch — master (17b603)

etlt.helper.Type2Helper C

Complexity

Size/Duplication

Test Coverage

Importance

12 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complexity

Duplication Side-by-Side

Filter issues like