
runner_battdeg.PL_samples_file_joiner()   Rating: F

Complexity

Conditions: 14

Size

Total Lines: 131
Code Lines: 57

Duplication

Lines: 11
Ratio: 8.4 %

Importance

Changes: 0

Metric                              Value
eloc (executable lines of code)     57
dl (duplicated lines)               11
loc (total lines)                   131
rs                                  3.6
c                                   0
b                                   0
f                                   0
cc (cyclomatic complexity)          14
nop (number of parameters)          3

How to fix

Long Method

Small methods make your code easier to understand, in particular when combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: pulling a coherent block of statements out into its own, well-named method (see the sketch below).
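
For instance, the concatenation loop in the listing below offsets four cumulative columns of each new file in four near-identical statements. A minimal sketch of extracting that step (the helper name _shift_cumulative_columns is hypothetical, not part of the original code):

import numpy as np

def _shift_cumulative_columns(df_next, df_out):
    # Offset df_next's cumulative columns so they continue where
    # df_out left off (suggested refactoring, not the original code).
    for col in ('Cycle', 'Time_sec', 'Charge_Ah', 'Discharge_Ah'):
        df_next[col] = np.array(df_next[col]) + max(np.array(df_out[col]))
    return df_next

The else branch of the loop would then shrink to a single call to this helper plus the pd.concat.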

Complexity

Complex code like runner_battdeg.PL_samples_file_joiner() often does a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields or methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
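
At function scale the same idea applies as Extract Method: the per-cycle capacity correction in PL_samples_file_joiner below is written out twice, once for charge and once for discharge. A minimal sketch of pulling it out (the helper name _per_cycle_deltas is hypothetical):

import numpy as np

def _per_cycle_deltas(cumulative, cycle_start_indices):
    # Convert a cumulative Ah array into per-cycle values by subtracting
    # the last value of the previous cycle from each cycle's rows
    # (suggested refactoring, not the original code).
    per_cycle = np.array(cumulative)
    for i in range(1, len(cycle_start_indices)):
        a = cycle_start_indices.iloc[i - 1]
        b = cycle_start_indices.iloc[i]
        per_cycle[a:b] = cumulative[a:b] - cumulative[a - 1]
    return per_cycle

The function could then call this helper once for 'Charge_Ah' and once for 'Discharge_Ah'.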

import pandas as pd
import numpy as np
import datetime
import os
import re
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import seaborn as sns
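
# The analyzer reports "The variable profile does not seem to be defined":
# '@profile' is normally injected by kernprof (line_profiler) at runtime.
# A no-op fallback (a suggested fix, not part of the original source) keeps
# the module importable when it is not run under kernprof:
try:
    profile
except NameError:
    def profile(func):
        return func
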
@profile
def date_time_converter(date_time_list):
    """
    This function takes an array of date_time values in MATLAB datenum
    format and returns a list of date_time values in human-readable format.
    """

    # Empty list to hold the results
    date_time_human = []

    for i in date_time_list:
        date_time_human.append(datetime.datetime.fromordinal(int(i)) +
                               datetime.timedelta(days=i % 1) -
                               datetime.timedelta(days=366))

    return date_time_human
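
# Illustrative check (not from the original source): MATLAB serial date
# numbers count days from year 0 and are 366 days ahead of Python's
# proleptic ordinals, hence the timedelta(days=366) correction above.
# For example, datenum 737426.5 corresponds to 2019-01-01 12:00:
# >>> date_time_converter([737426.5])
# [datetime.datetime(2019, 1, 1, 12, 0)]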

@profile
def PL_samples_file_joiner(data_dir, file_name_format, ignore_file_indices):
    """
    This function reads in the data for the PL Samples experiment and returns
    a dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce other files.
        ignore_file_indices (list, int): This list of ints tells which files to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for capacity in Ah.
    """

    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(ignore_file_indices, list):
        raise TypeError("ignore_file_indices should be a list")

    for i in range(len(ignore_file_indices)):
        if not isinstance(ignore_file_indices[i], int):
            raise TypeError("""ignore_file_indices elements should be
            of type integer""")

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # Get the list of files in the directory
    onlyfiles = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]

    # Extract the experiment name from the file_name_format
    exp_name = file_name_format[0:4]

    # Empty dictionary to hold the dataframes for the various files
    dict_files = {}

    # Iterate over all the files of this experiment and get the file number
    for filename in onlyfiles:
        if exp_name in filename:
            # Extract the file number from the name; a raw string avoids
            # the invalid escape sequences in the original pattern
            file_number = re.search(exp_name + r'\((.+?)\)\.csv', filename).group(1)
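            # e.g. with filename 'PL12(4).csv' this captures file_number '4'
            # (example taken from the driver values at the bottom of this file)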
            # Give a value of dataframe to each key
            dict_files[int(file_number)] = pd.read_csv(join(data_dir, filename))

    # Empty dictionary to hold the ordered dataframes
    dict_ordered = {}
    # Sort the dictionary based on keys
    for key in sorted(dict_files.keys()):
        dict_ordered[key] = dict_files[key]

    # Keys with files to keep: remove the ignored indices from all keys.
    # sorted() is a suggested fix here; a plain set difference has no
    # guaranteed order, which could concatenate the files out of order.
    wanted_keys = np.array(sorted(set(dict_ordered.keys()) - set(ignore_file_indices)))

    # Remove the ignored dataframes for characterization
    dict_ord_cycling_data = {k: dict_ordered[k] for k in wanted_keys}

    # Concatenate the dataframes to create the total dataframe
    df_out = None
    for k in wanted_keys:
        if df_out is None:
            df_next = dict_ord_cycling_data[k]
            df_out = pd.DataFrame(data=None, columns=df_next.columns)
            df_out = pd.concat([df_out, df_next])
        else:
            df_next = dict_ord_cycling_data[k]
            df_next['Cycle'] = np.array(df_next['Cycle']) + max(np.array(df_out['Cycle']))
            df_next['Time_sec'] = np.array(df_next['Time_sec']) + max(np.array(df_out['Time_sec']))
            df_next['Charge_Ah'] = np.array(df_next['Charge_Ah']) + max(np.array(df_out['Charge_Ah']))
            df_next['Discharge_Ah'] = np.array(df_next['Discharge_Ah']) + max(np.array(df_out['Discharge_Ah']))
            df_out = pd.concat([df_out, df_next])
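
    # Each appended file's cumulative columns now continue from the previous
    # maxima: e.g. if df_out ends at Cycle 50, the next file's Cycle 1 becomes
    # Cycle 51 (illustrative numbers, not from the original data).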

    ####
    # This has been commented out for performance, as we do not need date_time
    ####
    # Convert the Date_Time from matlab datenum to human readable Date_Time
    # First convert the series into a numpy array
    # date_time_matlab = df_out['Date_Time'].tolist()

    # # Apply the conversion to the numpy array
    # df_out['Date_Time_new'] = date_time_converter(date_time_matlab)

    # Reset the index and drop the old index
    df_out_indexed = df_out.reset_index(drop=True)

    # Proceed further with correcting the capacity
    df_grouped = df_out_indexed.groupby(['Cycle']).count()

    # Get the row indices where each cycle ends (cumulative row counts)
    cycle_start_indices = df_grouped['Time_sec'].cumsum()
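
    # Worked example (illustrative): for Cycle values [1, 1, 1, 2, 2] the
    # per-cycle counts are [3, 2] and their cumsum is [3, 5]; rows 3:5 then
    # belong to cycle 2, and row 2 (index a-1 below) is the last row of cycle 1.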

    # Get the charge_Ah per cycle.
    # Create a numpy array to store the old Charge_Ah column, and then
    # perform the transformation on it, rather than on the pandas series;
    # this is a lot faster in this case.
    charge_cycle_ah = np.array(df_out_indexed['Charge_Ah'])
    charge_ah = np.array(df_out_indexed['Charge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        a = cycle_start_indices.iloc[i-1]
        b = cycle_start_indices.iloc[i]
        charge_cycle_ah[a:b] = charge_ah[a:b] - charge_ah[a-1]

    df_out_indexed['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_out_indexed['Discharge_Ah'])
    discharge_ah = np.array(df_out_indexed['Discharge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        a = cycle_start_indices.iloc[i-1]
        b = cycle_start_indices.iloc[i]
        discharge_cycle_ah[a:b] = discharge_ah[a:b] - discharge_ah[a-1]

    df_out_indexed['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction.
    # It is not totally accurate, as it still has some points that go negative
    # due to incorrect discharge_Ah values every few cycles, but the machine
    # learning algorithm should treat these as outliers and hopefully get over
    # it. We can come back and correct this.
    df_out_indexed['capacity_ah'] = charge_ah - discharge_ah

    return df_out_indexed


def PL_samples_capacity_cycles(pl_df):
    """
    This function finds the capacity in each cycle from the cumulative
    capacity in the original file.

    Args:
        pl_df (pandas.DataFrame): Dataframe returned by PL_samples_file_joiner.

    Returns:
        Nothing yet; this function is still a stub.
    """

    return

data_dir = '/home/chintan/uwdirect/chintan/BattDeg/data/PL 12,14'
fnf = 'PL12(4).csv'
ignore_indices = [1, 2, 3]

out_df = PL_samples_file_joiner(data_dir, fnf, ignore_indices)
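
# To activate the @profile decorators, run this module under line_profiler,
# for example: kernprof -l -v runner_battdeg.py
# (suggested invocation; the original page does not show how the script is run)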