sources.db_extractor.DataManipulator - Code Metrics - Inspection of "corrected setup" - danielgp/db-extractor - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 5ef4cb...89c1da )

by Daniel

created 2020-04-12 23:09 UTC

sources.db_extractor.DataManipulator A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	93
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	80
dl	0
loc	93
rs	10
c	0
b	0
f	0
wmc	13

5 Methods

Rating	Name	Size	Complexity
A	DataManipulator.fn_build_relevant_file_list()	17	5
A	DataManipulator.fn_load_file_list_to_data_frame()	13	1
A	DataManipulator.fn_store_data_frame_to_file()	10	1
A	DataManipulator.fn_apply_query_to_data_frame()	26	4
A	DataManipulator.fn_move_files()	9	2

"""
Data Manipulation class
"""
# package to facilitate operating system operations
import os
# package facilitating object-oriented file system path handling
import pathlib
# package facilitating Data Frames manipulation
import pandas as pd


class DataManipulator:
    """
    Helpers to discover, load, filter, store and move delimited text (CSV) files

    Every method receives the logger and the timer to use as arguments,
    the class itself keeps no state.
    """

    def fn_apply_query_to_data_frame(self, local_logger, timmer, data_frame, extract_params):
        """
        Filters (in place) the rows of a Data Frame based on given extraction parameters

        :param local_logger: logger-like object, its "debug" method is used
        :param timmer: timer-like object exposing "start" and "stop" methods
        :param data_frame: Pandas Data Frame to be filtered, modified in place
        :param extract_params: dictionary with following keys:
            "filter_to_apply" = one of "equal", "different" or "multiple_match"
            "column_to_filter" = name of the column the filter is applied on
            "filter_values" = a string for "equal" and "different"
            or a dictionary having strings as values for "multiple_match"
        :return: the very same Data Frame object received, after being filtered
        :raises ValueError: when "filter_to_apply" is not one of the supported filter types
        """
        filter_type = extract_params['filter_to_apply']
        if filter_type not in ('equal', 'different', 'multiple_match'):
            # without this check an empty expression would reach Pandas
            # and fail there with a far less helpful message
            raise ValueError('Unsupported filter type "' + str(filter_type)
                             + '", expected one of: equal, different, multiple_match')
        timmer.start()
        filter_values = extract_params['filter_values']
        column_name = extract_params['column_to_filter']
        # NOTE: values are embedded as-is, a double quote within a value breaks the expression
        if filter_type == 'multiple_match':
            # filter values come as a dictionary here, so these have to be joined
            # before being used in either the log message or the query expression
            quoted_values = '"' + '", "'.join(filter_values.values()) + '"'
            local_logger.debug('Will retain only values equal with ' + quoted_values
                               + ' within the field "' + column_name + '"')
            query_expression = '`' + column_name + '` in [' + quoted_values + ']'
        else:
            if filter_type == 'equal':
                wording, comparison = 'equal with', '=='
            else:
                wording, comparison = 'different than', '!='
            quoted_values = '"' + filter_values + '"'
            local_logger.debug('Will retain only values ' + wording + ' ' + quoted_values
                               + ' within the field "' + column_name + '"')
            query_expression = '`' + column_name + '` ' + comparison + ' ' + quoted_values
        local_logger.debug('Query expression to apply is: ' + query_expression)
        data_frame.query(query_expression, inplace = True)
        timmer.stop()
        return data_frame

    def fn_build_relevant_file_list(self, local_logger, timmer, in_folder, matching_pattern):
        """
        Lists the files located directly within a folder that match a given pattern

        :param local_logger: logger-like object, its "info" method is used
        :param timmer: timer-like object exposing "start" and "stop" methods
        :param in_folder: folder to look into (sub-folders are not visited)
        :param matching_pattern: pattern given to pathlib "match" method (example: "*.csv")
        :return: list of absolute file names as strings, in the order the folder yields them;
            an empty list when the folder does not exist or nothing matches
        """
        timmer.start()
        local_logger.info('Will list all files within ' + in_folder
                          + ' folder looking for ' + matching_pattern + ' as matching pattern')
        list_files = []
        if os.path.isdir(in_folder):
            list_files = [str(current_file.absolute())
                          for current_file in pathlib.Path(in_folder).iterdir()
                          if current_file.is_file() and current_file.match(matching_pattern)]
        local_logger.info('Relevant CSV files from ' + in_folder + ' folder were identified!')
        local_logger.info(list_files)
        timmer.stop()
        return list_files

    def fn_load_file_list_to_data_frame(self, local_logger, timmer, file_list, csv_delimiter):
        """
        Reads every given delimited file and merges all content into a single Data Frame

        :param local_logger: logger-like object, its "info" method is used
        :param timmer: timer-like object exposing "start" and "stop" methods
        :param file_list: iterable of file names to read (UTF-8 encoded)
        :param csv_delimiter: field delimiter used within the files
        :return: Pandas Data Frame with the rows of all files, in the order of the file list
        """
        timmer.start()
        # an empty file list makes the concatenation below fail, as there is nothing to merge
        combined_csv = pd.concat([pd.read_csv(filepath_or_buffer = current_file,
                                              delimiter = csv_delimiter,
                                              cache_dates = True,
                                              index_col = None,
                                              memory_map = True,
                                              low_memory = False,
                                              encoding = 'utf-8',
                                              ) for current_file in file_list])
        local_logger.info('All relevant files were merged into a Pandas Data Frame')
        timmer.stop()
        return combined_csv

    def fn_move_files(self, local_logger, timmer, source_folder, match_pattern, destination_folder):
        """
        Moves the files matching a pattern from the source folder into the destination folder

        :param local_logger: logger-like object, its "info" method is used
        :param timmer: timer-like object exposing "start" and "stop" methods
        :param source_folder: folder holding the files to be moved
        :param match_pattern: pattern the file names have to match (example: "*.csv")
        :param destination_folder: folder the files are moved into, file names are kept
        """
        csv_file_names = self.fn_build_relevant_file_list(local_logger, timmer,
                                                          source_folder, match_pattern)
        timmer.start()
        for current_file in csv_file_names:
            # listed names are absolute paths of direct children of the source folder,
            # so the target is safely built from the base name alone; replacing the source
            # folder text would fail on relative folders or trailing separators
            # and would also alter file names containing that very text
            new_file_name = os.path.join(destination_folder, os.path.basename(current_file))
            os.rename(current_file, new_file_name)
            local_logger.info('File ' + current_file + ' has just been renamed as ' + new_file_name)
        timmer.stop()

    def fn_store_data_frame_to_file(self, local_logger, timmer, input_data_frame,
                                    destination_file_name, csv_delimiter):
        """
        Saves a Data Frame as UTF-8 delimited text file, with header and without index

        :param local_logger: logger-like object, its "info" method is used
        :param timmer: timer-like object exposing "start" and "stop" methods
        :param input_data_frame: Pandas Data Frame to be saved
        :param destination_file_name: name of the file to write (as string)
        :param csv_delimiter: field delimiter to use within the file
        """
        timmer.start()
        input_data_frame.to_csv(path_or_buf = destination_file_name,
                                sep = csv_delimiter,
                                header = True,
                                index = False,
                                encoding = 'utf-8')
        local_logger.info('Data frame has just been saved to file "' + destination_file_name + '"')
        timmer.stop()

1			"""
2			Data Manipulation class
3			"""
4			# package to facilitate operating system operations
5			import os
6			# package
7			import pathlib
8			# package facilitating Data Frames manipulation
9			import pandas as pd
10
11
12			class DataManipulator:
13
14			def fn_apply_query_to_data_frame(self, local_logger, timmer, data_frame, extract_params):
15			timmer.start()
16			query_expression = ''
17			if extract_params['filter_to_apply'] == 'equal':
18			local_logger.debug('Will retain only values equal with "'
19			+ extract_params['filter_values'] + '" within the field "'
20			+ extract_params['column_to_filter'] + '"')
21			query_expression = '`' + extract_params['column_to_filter'] + '` == "' \
22			+ extract_params['filter_values'] + '"'
23			elif extract_params['filter_to_apply'] == 'different':
24			local_logger.debug('Will retain only values different than "'
25			+ extract_params['filter_values'] + '" within the field "'
26			+ extract_params['column_to_filter'] + '"')
27			query_expression = '`' + extract_params['column_to_filter'] + '` != "' \
28			+ extract_params['filter_values'] + '"'
29			elif extract_params['filter_to_apply'] == 'multiple_match':
30			local_logger.debug('Will retain only values equal with "'
31			+ extract_params['filter_values'] + '" within the field "'
32			+ extract_params['column_to_filter'] + '"')
33			query_expression = '`' + extract_params['column_to_filter'] + '` in ["' \
34			+ '", "'.join(extract_params['filter_values'].values()) \
35			+ '"]'
36			local_logger.debug('Query expression to apply is: ' + query_expression)
37			data_frame.query(query_expression, inplace = True)
38			timmer.stop()
39			return data_frame
40
41			def fn_build_relevant_file_list(self, local_logger, timmer, in_folder, matching_pattern):
42			timmer.start()
43			local_logger.info('Will list all files within ' + in_folder
44			+ ' folder looking for ' + matching_pattern + ' as matching pattern')
45			list_files = []
46			file_counter = 0
47			if os.path.isdir(in_folder):
48			working_path = pathlib.Path(in_folder)
49			for current_file in working_path.iterdir():
50			if current_file.is_file() and current_file.match(matching_pattern):
51			list_files.append(file_counter)
52			list_files[file_counter] = str(current_file.absolute())
53			file_counter = file_counter + 1
54			local_logger.info('Relevant CSV files from ' + in_folder + ' folder were identified!')
55			local_logger.info(list_files)
56			timmer.stop()
57			return list_files
58
59			def fn_load_file_list_to_data_frame(self, local_logger, timmer, file_list, csv_delimiter):
60			timmer.start()
61			combined_csv = pd.concat([pd.read_csv(filepath_or_buffer = current_file,
62			delimiter = csv_delimiter,
63			cache_dates = True,
64			index_col = None,
65			memory_map = True,
66			low_memory = False,
67			encoding = 'utf-8',
68			) for current_file in file_list])
69			local_logger.info('All relevant files were merged into a Pandas Data Frame')
70			timmer.stop()
71			return combined_csv
72
73			def fn_move_files(self, local_logger, timmer, source_folder, match_pattern, destination_folder):
74			csv_file_names = self.fn_build_relevant_file_list(local_logger, timmer,
75			source_folder, match_pattern)
76			timmer.start()
77			for current_file in csv_file_names:
78			new_file_name = current_file.replace(source_folder, destination_folder)
79			os.rename(current_file, new_file_name)
80			local_logger.info('File ' + current_file + ' has just been renamed as ' + new_file_name)
81			timmer.stop()
82
83			def fn_store_data_frame_to_file(self, local_logger, timmer, input_data_frame,
84			destination_file_name, csv_delimiter):
85			timmer.start()
86			input_data_frame.to_csv(path_or_buf = destination_file_name,
87			sep = csv_delimiter,
88			header = True,
89			index = False,
90			encoding = 'utf-8')
91			local_logger.info('Data frame has just been saved to file "' + destination_file_name + '"')
92			timmer.stop()
93

danielgp / db-extractor

Push — master ( 5ef4cb...89c1da )

sources.db_extractor.DataManipulator A

Complexity

Size/Duplication

Importance

5 Methods

Duplication Side-by-Side

Filter issues like