Passed
Push — master ( 5ef4cb...89c1da )
by Daniel
01:13
created

sources.db_extractor.DataManipulator   A

Complexity

Total Complexity 13

Size/Duplication

Total Lines 93
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 80
dl 0
loc 93
rs 10
c 0
b 0
f 0
wmc 13

5 Methods

Rating   Name   Duplication   Size   Complexity  
A DataManipulator.fn_build_relevant_file_list() 0 17 5
A DataManipulator.fn_load_file_list_to_data_frame() 0 13 1
A DataManipulator.fn_store_data_frame_to_file() 0 10 1
A DataManipulator.fn_apply_query_to_data_frame() 0 26 4
A DataManipulator.fn_move_files() 0 9 2
1
"""
2
Data Manipulation class
3
"""
4
# package to facilitate operating system operations
5
import os
6
# package to handle file system paths in an object-oriented way
7
import pathlib
8
# package facilitating Data Frames manipulation
9
import pandas as pd
10
11
12
class DataManipulator:
    """
    Helpers to locate files, load them into a Pandas Data Frame, filter it,
    store it back to disk and move the processed files to another folder.

    Every method receives a logger (its `debug` / `info` methods are used)
    and a timer (its `start` / `stop` methods are used); both are supplied
    by the caller. The timer is always stopped, even when a method fails,
    so that it can be safely re-used by the next call.
    """

    def fn_apply_query_to_data_frame(self, local_logger, timmer, data_frame, extract_params):
        """
        Filter a Data Frame in place, keeping only the rows matching a rule.

        :param local_logger: logger receiving the debug messages
        :param timmer: timer started on entry and stopped on exit
        :param data_frame: Pandas Data Frame to filter (modified in place)
        :param extract_params: dictionary with following keys:
            - 'filter_to_apply': 'equal', 'different' or 'multiple_match'
            - 'column_to_filter': name of the column the rule applies to
            - 'filter_values': a string for 'equal' / 'different',
              a dictionary having strings as values for 'multiple_match'
        :return: the very same Data Frame object, after filtering
        :raises ValueError: when 'filter_to_apply' is not a supported rule
        """
        # wording used in the log message and comparison operator for each single-value rule
        single_value_rules = {
            'equal': ('equal with', '=='),
            'different': ('different than', '!='),
        }
        timmer.start()
        try:
            filter_kind = extract_params['filter_to_apply']
            if filter_kind not in ('equal', 'different', 'multiple_match'):
                # an empty expression would only fail later, inside pandas,
                # with a message unrelated to the real cause
                raise ValueError('Unsupported "filter_to_apply" value {!r}, expected one of: '
                                 'equal, different, multiple_match'.format(filter_kind))
            column_name = extract_params['column_to_filter']
            filter_values = extract_params['filter_values']
            if filter_kind == 'multiple_match':
                # values arrive as a dictionary, so they have to be joined before being
                # glued to text (for the log message as well as for the expression)
                quoted_values = '"' + '", "'.join(filter_values.values()) + '"'
                local_logger.debug('Will retain only values equal with ' + quoted_values
                                   + ' within the field "' + column_name + '"')
                query_expression = '`' + column_name + '` in [' + quoted_values + ']'
            else:
                wording, operator = single_value_rules[filter_kind]
                local_logger.debug('Will retain only values ' + wording + ' "'
                                   + filter_values + '" within the field "'
                                   + column_name + '"')
                query_expression = '`' + column_name + '` ' + operator + ' "' \
                                   + filter_values + '"'
            local_logger.debug('Query expression to apply is: ' + query_expression)
            data_frame.query(query_expression, inplace=True)
        finally:
            timmer.stop()
        return data_frame

    def fn_build_relevant_file_list(self, local_logger, timmer, in_folder, matching_pattern):
        """
        List the files located directly within a folder that match a pattern.

        :param local_logger: logger receiving the info messages
        :param timmer: timer started on entry and stopped on exit
        :param in_folder: folder to scan (sub-folders are not visited)
        :param matching_pattern: glob-style pattern, as understood by `pathlib` `match`
        :return: sorted list of absolute file names (as strings);
            empty when the folder does not exist or nothing matches
        """
        timmer.start()
        try:
            local_logger.info('Will list all files within ' + in_folder
                              + ' folder looking for ' + matching_pattern
                              + ' as matching pattern')
            list_files = []
            if os.path.isdir(in_folder):
                # iterdir() yields entries in arbitrary order, sorting keeps results repeatable
                list_files = sorted(str(current_file.absolute())
                                    for current_file in pathlib.Path(in_folder).iterdir()
                                    if current_file.is_file()
                                    and current_file.match(matching_pattern))
            local_logger.info('Relevant CSV files from ' + in_folder + ' folder were identified!')
            local_logger.info(list_files)
        finally:
            timmer.stop()
        return list_files

    def fn_load_file_list_to_data_frame(self, local_logger, timmer, file_list, csv_delimiter):
        """
        Read every CSV file from a list and stack them into a single Data Frame.

        Row labels of each file are kept as read (no re-indexing is done),
        therefore the labels of the resulting Data Frame may repeat.

        :param local_logger: logger receiving the info message
        :param timmer: timer started on entry and stopped on exit
        :param file_list: names of the UTF-8 encoded CSV files to read
        :param csv_delimiter: field separator used within the files
        :return: Pandas Data Frame holding the content of all the files
        :raises ValueError: raised by `pd.concat` when `file_list` is empty
        """
        timmer.start()
        try:
            combined_csv = pd.concat([pd.read_csv(filepath_or_buffer=current_file,
                                                  delimiter=csv_delimiter,
                                                  cache_dates=True,
                                                  index_col=None,
                                                  memory_map=True,
                                                  low_memory=False,
                                                  encoding='utf-8',
                                                  ) for current_file in file_list])
            local_logger.info('All relevant files were merged into a Pandas Data Frame')
        finally:
            timmer.stop()
        return combined_csv

    def fn_move_files(self, local_logger, timmer, source_folder, match_pattern, destination_folder):
        """
        Move the files matching a pattern from one folder to another.

        :param local_logger: logger receiving the info messages
        :param timmer: timer, used once while listing and once while moving
        :param source_folder: folder holding the files to move
        :param match_pattern: glob-style pattern selecting the files
        :param destination_folder: existing folder receiving the files
        """
        csv_file_names = self.fn_build_relevant_file_list(local_logger, timmer,
                                                          source_folder, match_pattern)
        timmer.start()
        try:
            destination_path = pathlib.Path(destination_folder).absolute()
            for current_file in csv_file_names:
                # the new name is built from the destination folder and the bare file name;
                # a text replacement within the full path is either a no-op or corrupts
                # the path whenever the source folder text is missing or repeated in it
                new_file_name = str(destination_path / pathlib.Path(current_file).name)
                os.rename(current_file, new_file_name)
                local_logger.info('File ' + current_file
                                  + ' has just been renamed as ' + new_file_name)
        finally:
            timmer.stop()

    def fn_store_data_frame_to_file(self, local_logger, timmer, input_data_frame,
                                    destination_file_name, csv_delimiter):
        """
        Save a Data Frame as a UTF-8 encoded CSV file, with header and without row labels.

        :param local_logger: logger receiving the info message
        :param timmer: timer started on entry and stopped on exit
        :param input_data_frame: Pandas Data Frame to save
        :param destination_file_name: name of the file to write
        :param csv_delimiter: field separator to use within the file
        """
        timmer.start()
        try:
            input_data_frame.to_csv(path_or_buf=destination_file_name,
                                    sep=csv_delimiter,
                                    header=True,
                                    index=False,
                                    encoding='utf-8')
            local_logger.info('Data frame has just been saved to file "'
                              + destination_file_name + '"')
        finally:
            timmer.stop()
93