sources.common.DataManipulator.DataManipulator.fn_drop_certain_columns() - Code Metrics - Inspection of "version bump and internal package added" - danielgp/db-extractor - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — development/test ( ec5267...7ab62a )

by Daniel

created 2020-04-24 01:44 UTC

DataManipulator.fn_drop_certain_columns() A

↳ Parent: sources.common.DataManipulator

Complexity

Conditions

Size

Total Lines	13
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	5
eloc	12
nop	4
dl	0
loc	13
rs	9.3333
c	0
b	0
f	0

"""
Data Manipulation class
"""
# package to handle date and times
from datetime import timedelta
# package facilitating Data Frames manipulation
import pandas as pd


class DataManipulator:

    @staticmethod
    def fn_add_days_within_column_to_data_frame(input_data_frame, dict_expression):
        input_data_frame['Days Within'] = input_data_frame[dict_expression['End Date']] - \
                                          input_data_frame[dict_expression['Start Date']] + \
                                          timedelta(days=1)
        input_data_frame['Days Within'] = input_data_frame['Days Within'] \
            .apply(lambda x: int(str(x).replace(' days 00:00:00', '')))
        return input_data_frame

    @staticmethod
    def fn_add_minimum_and_maximum_columns_to_data_frame(input_data_frame, dict_expression):
        grouped_df = input_data_frame.groupby(dict_expression['group_by']) \
            .agg({dict_expression['calculation']: ['min', 'max']})
        grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]
        grouped_df = grouped_df.reset_index()
        if 'map' in dict_expression:
            grouped_df.rename(columns=dict_expression['map'], inplace=True)
        return grouped_df

    def fn_add_timeline_evaluation_column_to_data_frame(self, in_df, dict_expression):
        # shorten last method parameter
        de = dict_expression
        # add helpful column to use on "Timeline Evaluation" column determination
        in_df['Reference Date'] = de['Reference Date']
        # actual "Timeline Evaluation" column determination
        cols = ['Reference Date', de['Start Date'], de['End Date']]
        in_df['Timeline Evaluation'] = in_df[cols] \
            .apply(lambda r: 'Current' if r[de['Start Date']]
                                          <= r['Reference Date']
                                          <= r[de['End Date']] else\
                   'Past' if r[de['Start Date']] < r['Reference Date'] else 'Future', axis=1)
        # decide if the helpful column is to be retained or not
        removal_needed = self.fn_decide_by_omission_or_specific_false(de, 'Keep Reference Date')
        if removal_needed:
            in_df.drop(columns=['Reference Date'], inplace=True)
        return in_df

    def add_value_to_dictionary(self, in_list, adding_value, adding_type, reference_column):
        add_type = adding_type.lower()
        total_columns = len(in_list)
        if reference_column is None:
            reference_indexes = {
                'add': {
                    'after': 0,
                    'before': 0,
                },
                'cycle_down_to': {
                    'after': 0,
                    'before': 0,
                },
            }
        else:
            reference_indexes = {
                'add': {
                    'after': in_list.copy().index(reference_column) + 1,
                    'before': in_list.copy().index(reference_column),
                },
                'cycle_down_to': {
                    'after': in_list.copy().index(reference_column),
                    'before': in_list.copy().index(reference_column),
                },
            }
        positions = {
            'after': {
                'cycle_down_to': reference_indexes.get('cycle_down_to').get('after'),
                'add': reference_indexes.get('add').get('after'),
            },
            'before': {
                'cycle_down_to': reference_indexes.get('cycle_down_to').get('before'),
                'add': reference_indexes.get('add').get('before'),
            },
            'first': {
                'cycle_down_to': 0,
                'add': 0,
            },
            'last': {
                'cycle_down_to': total_columns,
                'add': total_columns,
            }
        }
        return self.add_value_to_dictionary_by_position({
            'adding_value': adding_value,
            'list': in_list,
            'position_to_add': positions.get(add_type).get('add'),
            'position_to_cycle_down_to': positions.get(add_type).get('cycle_down_to'),
            'total_columns': total_columns,
        })

    @staticmethod
    def add_value_to_dictionary_by_position(adding_dictionary):
        list_with_values = adding_dictionary['list']
        list_with_values.append(adding_dictionary['total_columns'])
        for counter in range(adding_dictionary['total_columns'],
                             adding_dictionary['position_to_cycle_down_to'], -1):
            list_with_values[counter] = list_with_values[(counter - 1)]
        list_with_values[adding_dictionary['position_to_add']] = adding_dictionary['adding_value']
        return list_with_values

    @staticmethod
    def fn_add_weekday_columns_to_data_frame(input_data_frame, columns_list):
        for current_column in columns_list:
            input_data_frame['Weekday for ' + current_column] = input_data_frame[current_column] \
                .apply(lambda x: x.strftime('%A'))
        return input_data_frame

    @staticmethod
    def fn_apply_query_to_data_frame(local_logger, timmer, input_data_frame, extract_params):
        timmer.start()
        query_expression = ''
        if extract_params['filter_to_apply'] == 'equal':
            local_logger.debug('Will retain only values equal with "'
                               + extract_params['filter_values'] + '" within the field "'
                               + extract_params['column_to_filter'] + '"')
            query_expression = '`' + extract_params['column_to_filter'] + '` == "' \
                               + extract_params['filter_values'] + '"'
        elif extract_params['filter_to_apply'] == 'different':
            local_logger.debug('Will retain only values different than "'
                               + extract_params['filter_values'] + '" within the field "'
                               + extract_params['column_to_filter'] + '"')
            query_expression = '`' + extract_params['column_to_filter'] + '` != "' \
                               + extract_params['filter_values'] + '"'
        elif extract_params['filter_to_apply'] == 'multiple_match':
            local_logger.debug('Will retain only values equal with "'
                               + extract_params['filter_values'] + '" within the field "'
                               + extract_params['column_to_filter'] + '"')
            query_expression = '`' + extract_params['column_to_filter'] + '` in ["' \
                               + '", "'.join(extract_params['filter_values'].values()) \
                               + '"]'
        local_logger.debug('Query expression to apply is: ' + query_expression)
        input_data_frame.query(query_expression, inplace=True)
        timmer.stop()
        return input_data_frame

    @staticmethod
    def fn_convert_datetime_columns_to_string(input_data_frame, columns_list, columns_format):
        for current_column in columns_list:
            input_data_frame[current_column] = \
                input_data_frame[current_column].map(lambda x: x.strftime(columns_format))
        return input_data_frame

    @staticmethod
    def fn_convert_string_columns_to_datetime(input_data_frame, columns_list, columns_format):
        for current_column in columns_list:
            input_data_frame[current_column] = pd.to_datetime(input_data_frame[current_column],
                                                              format=columns_format)
        return input_data_frame

    @staticmethod
    def fn_decide_by_omission_or_specific_false(in_dictionary, key_decision_factor):
        removal_needed = False
        if key_decision_factor not in in_dictionary:
            removal_needed = True
        elif not in_dictionary[key_decision_factor]:
            removal_needed = True
        return removal_needed

    @staticmethod
    def fn_filter_data_frame_by_index(local_logger, in_data_frame, filter_rule):
        index_current = in_data_frame.query('`Timeline Evaluation` == "Current"', inplace=False)
        local_logger.info('Current index has been determined to be ' + str(index_current.index))
        if 'Deviation' in filter_rule:
            for deviation_type in filter_rule['Deviation']:
                deviation_number = filter_rule['Deviation'][deviation_type]
                if deviation_type == 'Lower':
                    index_to_apply = index_current.index - deviation_number
                    in_data_frame = in_data_frame[in_data_frame.index >= index_to_apply[0]]
                elif deviation_type == 'Upper':
                    index_to_apply = index_current.index + deviation_number
                    in_data_frame = in_data_frame[in_data_frame.index <= index_to_apply[0]]
                local_logger.info(deviation_type + ' Deviation Number is ' + str(deviation_number)
                                  + ' to be applied to Current index, became '
                                  + str(index_to_apply))

        return in_data_frame

    @staticmethod
    def get_column_index_from_dataframe(data_frame_columns, column_name_to_identify):
        column_index_to_return = 0
        for ndx, column_name in enumerate(data_frame_columns):
            if column_name == column_name_to_identify:
                column_index_to_return = ndx
        return column_index_to_return

    @staticmethod
    def fn_load_file_list_to_data_frame(local_logger, timmer, file_list, csv_delimiter):
        timmer.start()
        combined_csv = pd.concat([pd.read_csv(filepath_or_buffer=current_file,
                                              delimiter=csv_delimiter,
                                              cache_dates=True,
                                              index_col=None,
                                              memory_map=True,
                                              low_memory=False,
                                              encoding='utf-8',
                                              ) for current_file in file_list])
        local_logger.info('All relevant files were merged into a Pandas Data Frame')
        timmer.stop()
        return combined_csv

    @staticmethod
    def fn_store_data_frame_to_file(local_logger, timmer, input_data_frame, input_file_details):
        timmer.start()
        if input_file_details['format'] == 'csv':
            input_data_frame.to_csv(path_or_buf=input_file_details['name'],
                                    sep=input_file_details['field_delimiter'],
                                    header=True,
                                    index=False,
                                    encoding='utf-8')
        local_logger.info('Data frame has just been saved to file "'
                          + input_file_details['name'] + '"')
        timmer.stop()


1			"""
2			Data Manipulation class
3			"""
4			# package to handle date and times
5			from datetime import timedelta
6			# package facilitating Data Frames manipulation
7			import pandas as pd
8
9
10			class DataManipulator:
11
12			@staticmethod
13			def fn_add_days_within_column_to_data_frame(input_data_frame, dict_expression):
14			input_data_frame['Days Within'] = input_data_frame[dict_expression['End Date']] - \
15			input_data_frame[dict_expression['Start Date']] + \
16			timedelta(days=1)
17			input_data_frame['Days Within'] = input_data_frame['Days Within'] \
18			.apply(lambda x: int(str(x).replace(' days 00:00:00', '')))
19			return input_data_frame
20
21			@staticmethod
22			def fn_add_minimum_and_maximum_columns_to_data_frame(input_data_frame, dict_expression):
23			grouped_df = input_data_frame.groupby(dict_expression['group_by']) \
24			.agg({dict_expression['calculation']: ['min', 'max']})
25			grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]
26			grouped_df = grouped_df.reset_index()
27			if 'map' in dict_expression:
28			grouped_df.rename(columns=dict_expression['map'], inplace=True)
29			return grouped_df
30
31			def fn_add_timeline_evaluation_column_to_data_frame(self, in_df, dict_expression):
32			# shorten last method parameter
33			de = dict_expression
34			# add helpful column to use on "Timeline Evaluation" column determination
35			in_df['Reference Date'] = de['Reference Date']
36			# actual "Timeline Evaluation" column determination
37			cols = ['Reference Date', de['Start Date'], de['End Date']]
38			in_df['Timeline Evaluation'] = in_df[cols] \
39			.apply(lambda r: 'Current' if r[de['Start Date']]
40			<= r['Reference Date']
41			<= r[de['End Date']] else\
42			'Past' if r[de['Start Date']] < r['Reference Date'] else 'Future', axis=1)
43			# decide if the helpful column is to be retained or not
44			removal_needed = self.fn_decide_by_omission_or_specific_false(de, 'Keep Reference Date')
45			if removal_needed:
46			in_df.drop(columns=['Reference Date'], inplace=True)
47			return in_df
48
49			def add_value_to_dictionary(self, in_list, adding_value, adding_type, reference_column):
50			add_type = adding_type.lower()
51			total_columns = len(in_list)
52			if reference_column is None:
53			reference_indexes = {
54			'add': {
55			'after': 0,
56			'before': 0,
57			},
58			'cycle_down_to': {
59			'after': 0,
60			'before': 0,
61			},
62			}
63			else:
64			reference_indexes = {
65			'add': {
66			'after': in_list.copy().index(reference_column) + 1,
67			'before': in_list.copy().index(reference_column),
68			},
69			'cycle_down_to': {
70			'after': in_list.copy().index(reference_column),
71			'before': in_list.copy().index(reference_column),
72			},
73			}
74			positions = {
75			'after': {
76			'cycle_down_to': reference_indexes.get('cycle_down_to').get('after'),
77			'add': reference_indexes.get('add').get('after'),
78			},
79			'before': {
80			'cycle_down_to': reference_indexes.get('cycle_down_to').get('before'),
81			'add': reference_indexes.get('add').get('before'),
82			},
83			'first': {
84			'cycle_down_to': 0,
85			'add': 0,
86			},
87			'last': {
88			'cycle_down_to': total_columns,
89			'add': total_columns,
90			}
91			}
92			return self.add_value_to_dictionary_by_position({
93			'adding_value': adding_value,
94			'list': in_list,
95			'position_to_add': positions.get(add_type).get('add'),
96			'position_to_cycle_down_to': positions.get(add_type).get('cycle_down_to'),
97			'total_columns': total_columns,
98			})
99
100			@staticmethod
101			def add_value_to_dictionary_by_position(adding_dictionary):
102			list_with_values = adding_dictionary['list']
103			list_with_values.append(adding_dictionary['total_columns'])
104			for counter in range(adding_dictionary['total_columns'],
105			adding_dictionary['position_to_cycle_down_to'], -1):
106			list_with_values[counter] = list_with_values[(counter - 1)]
107			list_with_values[adding_dictionary['position_to_add']] = adding_dictionary['adding_value']
108			return list_with_values
109
110			@staticmethod
111			def fn_add_weekday_columns_to_data_frame(input_data_frame, columns_list):
112			for current_column in columns_list:
113			input_data_frame['Weekday for ' + current_column] = input_data_frame[current_column] \
114			.apply(lambda x: x.strftime('%A'))
115			return input_data_frame
116
117			@staticmethod
118			def fn_apply_query_to_data_frame(local_logger, timmer, input_data_frame, extract_params):
119			timmer.start()
120			query_expression = ''
121			if extract_params['filter_to_apply'] == 'equal':
122			local_logger.debug('Will retain only values equal with "'
123			+ extract_params['filter_values'] + '" within the field "'
124			+ extract_params['column_to_filter'] + '"')
125			query_expression = '`' + extract_params['column_to_filter'] + '` == "' \
126			+ extract_params['filter_values'] + '"'
127			elif extract_params['filter_to_apply'] == 'different':
128			local_logger.debug('Will retain only values different than "'
129			+ extract_params['filter_values'] + '" within the field "'
130			+ extract_params['column_to_filter'] + '"')
131			query_expression = '`' + extract_params['column_to_filter'] + '` != "' \
132			+ extract_params['filter_values'] + '"'
133			elif extract_params['filter_to_apply'] == 'multiple_match':
134			local_logger.debug('Will retain only values equal with "'
135			+ extract_params['filter_values'] + '" within the field "'
136			+ extract_params['column_to_filter'] + '"')
137			query_expression = '`' + extract_params['column_to_filter'] + '` in ["' \
138			+ '", "'.join(extract_params['filter_values'].values()) \
139			+ '"]'
140			local_logger.debug('Query expression to apply is: ' + query_expression)
141			input_data_frame.query(query_expression, inplace=True)
142			timmer.stop()
143			return input_data_frame
144
145			@staticmethod
146			def fn_convert_datetime_columns_to_string(input_data_frame, columns_list, columns_format):
147			for current_column in columns_list:
148			input_data_frame[current_column] = \
149			input_data_frame[current_column].map(lambda x: x.strftime(columns_format))
150			return input_data_frame
151
152			@staticmethod
153			def fn_convert_string_columns_to_datetime(input_data_frame, columns_list, columns_format):
154			for current_column in columns_list:
155			input_data_frame[current_column] = pd.to_datetime(input_data_frame[current_column],
156			format=columns_format)
157			return input_data_frame
158
159			@staticmethod
160			def fn_decide_by_omission_or_specific_false(in_dictionary, key_decision_factor):
161			removal_needed = False
162			if key_decision_factor not in in_dictionary:
163			removal_needed = True
164			elif not in_dictionary[key_decision_factor]:
165			removal_needed = True
166			return removal_needed
167
168			@staticmethod
169			def fn_filter_data_frame_by_index(local_logger, in_data_frame, filter_rule):
170			index_current = in_data_frame.query('`Timeline Evaluation` == "Current"', inplace=False)
171			local_logger.info('Current index has been determined to be ' + str(index_current.index))
172			if 'Deviation' in filter_rule:
173			for deviation_type in filter_rule['Deviation']:
174			deviation_number = filter_rule['Deviation'][deviation_type]
175			if deviation_type == 'Lower':
176			index_to_apply = index_current.index - deviation_number
177			in_data_frame = in_data_frame[in_data_frame.index >= index_to_apply[0]]
178			elif deviation_type == 'Upper':
179			index_to_apply = index_current.index + deviation_number
180			in_data_frame = in_data_frame[in_data_frame.index <= index_to_apply[0]]
181			local_logger.info(deviation_type + ' Deviation Number is ' + str(deviation_number)
182			+ ' to be applied to Current index, became '
183			+ str(index_to_apply))
			0 ignored issues – show introduced 2020-04-23 03:11 UTC by Report Bug Copy Issue Report The variable `index_to_apply` does not seem to be defined for all execution paths. Loading history...
184			return in_data_frame
185
186			@staticmethod
187			def get_column_index_from_dataframe(data_frame_columns, column_name_to_identify):
188			column_index_to_return = 0
189			for ndx, column_name in enumerate(data_frame_columns):
190			if column_name == column_name_to_identify:
191			column_index_to_return = ndx
192			return column_index_to_return
193
194			@staticmethod
195			def fn_load_file_list_to_data_frame(local_logger, timmer, file_list, csv_delimiter):
196			timmer.start()
197			combined_csv = pd.concat([pd.read_csv(filepath_or_buffer=current_file,
198			delimiter=csv_delimiter,
199			cache_dates=True,
200			index_col=None,
201			memory_map=True,
202			low_memory=False,
203			encoding='utf-8',
204			) for current_file in file_list])
205			local_logger.info('All relevant files were merged into a Pandas Data Frame')
206			timmer.stop()
207			return combined_csv
208
209			@staticmethod
210			def fn_store_data_frame_to_file(local_logger, timmer, input_data_frame, input_file_details):
211			timmer.start()
212			if input_file_details['format'] == 'csv':
213			input_data_frame.to_csv(path_or_buf=input_file_details['name'],
214			sep=input_file_details['field_delimiter'],
215			header=True,
216			index=False,
217			encoding='utf-8')
218			local_logger.info('Data frame has just been saved to file "'
219			+ input_file_details['name'] + '"')
220			timmer.stop()
221

danielgp / db-extractor

Push — development/test ( ec5267...7ab62a )

DataManipulator.fn_drop_certain_columns() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like