"""
This module can be used to read cycling data of the CX2, CS2 and PL type
cells as a dataframe. It converts cumulative values into individual values
for each cycle and determines the net charge of the battery at every
datapoint. It can also be used to train and test an LSTM model and predict
discharge capacity using the LSTM model.
"""

import datetime
import os
from os import listdir
from os.path import isfile, join
import re
# import matplotlib.pyplot as plt
# import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import load_model


# @profile
def date_time_converter(date_time_list):
    """
    This function takes a list of date_time values in matlab datenum format
    and returns a list of date_time values in human readable format.
    """
    if not isinstance(date_time_list, list):
        raise TypeError("date_time_list should be a list")

    # Empty list to hold the results
    date_time_human = []

    for i in date_time_list:
        # Matlab datenums count days from the year 0, while Python ordinals
        # count from the year 1, hence the 366-day offset; the fractional
        # part of the datenum carries the time of day.
        date_time_human.append(
            datetime.datetime.fromordinal(int(i)) +
            datetime.timedelta(days=i % 1) -
            datetime.timedelta(days=366))

    return date_time_human
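
# A minimal usage sketch (hypothetical value): matlab datenum 737000.5
# corresponds to noon on 1 Nov 2017, so the call below should return
# [datetime.datetime(2017, 11, 1, 12, 0)].
# date_time_converter([737000.5])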


# @profile
def get_dict_files(data_dir, file_name_format, ignore_file_indices):
    """
    This function finds all the files at the location matching the file name
    format specified and then creates a dictionary of dataframes after
    ignoring the list of files specified.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        ignore_file_indices (list, int): This list of ints tells which files
            to ignore.

    Returns:
        The dictionary with the data from all the files as dataframes.
    """
    # Get the list of files in the directory
    onlyfiles = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]

    # Extract the experiment name from the file_name_format
    exp_name = file_name_format[0:4]

    # Empty dictionary to hold the dataframes for the various files
    dict_files = {}

    # Iterate over all the files of a certain type and get the file number
    # from them
    for filename in onlyfiles:
        if exp_name in filename:
            # Extract the file number from the name
            file_number = re.search(
                exp_name + r'\((.+?)\).csv',
                filename).group(1)
            # Give a value of dataframe to each key
            dict_files[int(file_number)] = pd.read_csv(
                join(data_dir, filename))

    # Empty dictionary to hold the ordered dictionaries
    dict_ordered = {}
    # Sort the dictionary based on keys
    for key in sorted(dict_files.keys()):
        dict_ordered[key] = dict_files[key]

    # Keys with files to keep; remove the ignore indices from all keys
    wanted_keys = np.array(
        list(set(dict_ordered.keys()) - set(ignore_file_indices)))

    # Remove the ignored dataframes for characterization
    dict_ord_cycling_data = {k: dict_ordered[k] for k in wanted_keys}

    return dict_ord_cycling_data
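
# A minimal usage sketch (hypothetical directory and file names): for CSV
# files named 'PL11(1).csv', 'PL11(2).csv', ... this would read every file
# except 'PL11(3).csv' into a dictionary keyed by the file number.
# dict_data = get_dict_files('/path/to/data', 'PL11(1).csv', [3])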


def concat_dict_dataframes(dict_ord_cycling_data):
    """
    This function takes in a dictionary with ordered keys
    and concatenates the dataframes in the values of the
    dictionary to create a large dataframe with all the records.

    Args:
        dict_ord_cycling_data (dict):
            The dictionary with ordered integer keys and dataframes as values

    Returns:
        The dataframe after concatenation
    """
    # Raise an exception if the type of the inputs is not correct
    if not isinstance(dict_ord_cycling_data, dict):
        raise TypeError('dict_ord_cycling_data is not of type dict')

    for i in dict_ord_cycling_data.keys():
        # Raise an exception if the type of the keys is not integer
        if not isinstance(i, (int, np.int64)):
            raise TypeError('a key in the dictionary is not an integer')

    for i in dict_ord_cycling_data.values():
        # Raise an exception if the type of the values is not a dataframe
        if not isinstance(i, pd.DataFrame):
            raise TypeError('a value in the dictionary is not a pandas '
                            'dataframe')
        # Raise an exception if the necessary columns are not found in the df
        if not {'Cycle', 'Charge_Ah', 'Discharge_Ah', 'Time_sec',
                'Current_Amp', 'Voltage_Volt'}.issubset(i.columns):
            raise Exception("the dataframe doesn't have the columns 'Cycle'"
                            ", 'Charge_Ah', 'Discharge_Ah', "
                            "'Time_sec', 'Voltage_Volt', 'Current_Amp'")

    # Concatenate the dataframes to create the total dataframe
    df_out = None
    for k in dict_ord_cycling_data.keys():
        if df_out is None:
            df_next = dict_ord_cycling_data[k]
            df_out = pd.DataFrame(data=None, columns=df_next.columns)
            df_out = pd.concat([df_out, df_next])
        else:
            # Offset the cycle numbers, times and cumulative capacities of
            # the next dataframe so that they continue from the maximum
            # values of the records concatenated so far
            df_next = dict_ord_cycling_data[k]
            df_next['Cycle'] = np.array(
                df_next['Cycle']) + max(np.array(df_out['Cycle']))
            df_next['Time_sec'] = np.array(
                df_next['Time_sec']) + max(np.array(df_out['Time_sec']))
            df_next['Charge_Ah'] = np.array(
                df_next['Charge_Ah']) + max(np.array(df_out['Charge_Ah']))
            df_next['Discharge_Ah'] = np.array(
                df_next['Discharge_Ah']) + max(np.array(df_out['Discharge_Ah']))
            df_out = pd.concat([df_out, df_next])

    return df_out
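
# A small illustration of the offsetting above (hypothetical numbers): if the
# first file ends at Cycle 50 with a cumulative Charge_Ah of 120.0, the next
# file's Cycle column becomes 51, 52, ... and its Charge_Ah column becomes
# 120.0 plus its own values, so the concatenated columns stay monotonically
# increasing across file boundaries.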


def get_cycle_capacities(df_out):
    """
    This function takes the dataframe, creates a new index and then
    calculates capacities per cycle from the cumulative charge and discharge
    capacities.

    Args:
        df_out (pandas.DataFrame):
            Concatenated dataframe

    Returns:
        The dataframe with capacities per cycle
    """
    # Raise an exception if the necessary columns are not found in the df
    if not {'Cycle', 'Charge_Ah', 'Discharge_Ah', 'Time_sec', 'Current_Amp',
            'Voltage_Volt'}.issubset(df_out.columns):
        raise Exception("the dataframe doesn't have the columns 'Cycle'"
                        ", 'Charge_Ah', 'Discharge_Ah', "
                        "'Time_sec', 'Voltage_Volt', 'Current_Amp'")

    # Reset the index and drop the old index
    df_out_indexed = df_out.reset_index(drop=True)

    # Proceed further with correcting the capacity
    df_grouped = df_out_indexed.groupby(['Cycle']).count()

    # Get the indices at which a cycle starts
    cycle_start_indices = df_grouped['Time_sec'].cumsum()

    # Get the charge_Ah per cycle.
    # Create a numpy array to store the old charge_Ah column, and then
    # perform the transformation on it, rather than on the pandas series;
    # this is a lot faster in this case.
    charge_cycle_ah = np.array(df_out_indexed['Charge_Ah'])
    charge_ah = np.array(df_out_indexed['Charge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        # Subtract the cumulative value at the end of the previous cycle
        # from every point of the current cycle
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        charge_cycle_ah[begin_value:end_value] = \
            charge_ah[begin_value:end_value] - charge_ah[begin_value - 1]

    df_out_indexed['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_out_indexed['Discharge_Ah'])
    discharge_ah = np.array(df_out_indexed['Discharge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        discharge_cycle_ah[begin_value:end_value] = \
            discharge_ah[begin_value:end_value] - discharge_ah[begin_value - 1]

    df_out_indexed['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction.
    # It is not totally accurate, as it still has some points that go
    # negative, due to incorrect discharge_Ah values every few cycles.
    # But the machine learning algorithm should treat these as outliers and
    # hopefully get over them. We can come back and correct this.
    df_out_indexed['capacity_ah'] = charge_cycle_ah - discharge_cycle_ah
    df_out_indexed.rename(
        columns={'Current_Amp': 'Current(A)', 'Voltage_Volt': 'Voltage(V)'},
        inplace=True)
    return df_out_indexed
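
# A worked example of the per-cycle correction (hypothetical numbers): if
# the cumulative Charge_Ah column reads [0.0, 0.5, 1.0, 1.4, 1.9] and the
# second cycle starts at index 3, the per-cycle values become
# [0.0, 0.5, 1.0, 0.4, 0.9], i.e. every point of a cycle has the cumulative
# value at the end of the previous cycle subtracted from it.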


# @profile
def pl_samples_file_reader(data_dir, file_name_format, ignore_file_indices):
    """
    This function reads in the data for the PL Samples experiment and
    returns a nice dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        ignore_file_indices (list, int): This list of ints tells which files
            to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """
    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(ignore_file_indices, list):
        raise TypeError("ignore_file_indices should be a list")

    for ignore_file_index in ignore_file_indices:
        if not isinstance(ignore_file_index, int):
            raise TypeError("""ignore_file_indices elements should be
            of type integer""")

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    dict_ord_cycling_data = get_dict_files(
        data_dir, file_name_format, ignore_file_indices)

    df_out = concat_dict_dataframes(dict_ord_cycling_data)

    ####
    # This has been commented out for performance, as we do not need date_time
    ####
    # Convert the Date_Time from matlab datenum to human readable Date_Time
    # First convert the series into a list
    # date_time_matlab = df_out['Date_Time'].tolist()

    # # Apply the conversion to the list
    # df_out['Date_Time_new'] = date_time_converter(date_time_matlab)

    # Get the cycle capacities from the cumulative capacities
    df_out_indexed = get_cycle_capacities(df_out)

    return df_out_indexed


# Wrapping function to train the LSTM model and calculate the model loss
# and the response to the testing dataset.
def model_training(data_dir, file_name_format, sheet_name):
    """
    This function converts cumulative battery cycling data into individual
    cycle data and trains the LSTM model with the converted dataset.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string or int): Sheet name or sheet number in the excel
            file containing the relevant data.

    Returns:
        model_loss (dictionary): Returns the history dictionary (more info
            to be added)
        y_hat (array): Predicted response for the testing dataset.
    """
    # The function 'cx2_file_reader' is used to read all the excel files
    # in the given path and convert the given cumulative data into
    # individual cycle data.
    individual_cycle_data = cx2_file_reader(
        data_dir, file_name_format, sheet_name)

    # The function 'data_formatting' is used to drop the unnecessary columns
    # from the training data, i.e. only the features considered in the model
    # (Current, Voltage and Discharge capacity) are retained.
    formatted_data = data_formatting(individual_cycle_data)

    # The function 'series_to_supervised' is used to frame the time series
    # training data as a supervised learning dataset.
    learning_df = series_to_supervised(
        formatted_data, n_in=1, n_out=1, dropnan=True)

    # The function 'long_short_term_memory' is used to train the model
    # and predict the response for the test dataset.
    model_loss, y_hat = long_short_term_memory(learning_df)

    return model_loss, y_hat
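
# A minimal usage sketch (hypothetical path, file name format and sheet
# name):
# model_loss, y_hat = model_training('/path/to/data', 'CX2_33',
#                                    'Channel_1-008')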


# Function to predict the discharge capacity using the trained LSTM model.
def model_prediction(input_data):
    """
    This function can be used to forecast the discharge capacity of a
    battery using the trained LSTM model.

    Args:
        input_data (dataframe): This is the dataframe containing the
            current, voltage and discharge capacity values at a prior time,
            which can be used to forecast the discharge capacity at a later
            time.

    Returns:
        y_predicted: The forecasted values of discharge capacity.
    """
    # The function 'series_to_supervised' is used to frame the time series
    # data as a supervised learning dataset.
    learning_df = series_to_supervised(
        input_data, n_in=1, n_out=1, dropnan=True)
    learning_df = learning_df.iloc[:, 0:3].values
    # Reshape the input dataset to be 3D [samples, timesteps, features].
    learning_df = learning_df.reshape(
        (learning_df.shape[0], 1, learning_df.shape[1]))
    # Predict the discharge values using the saved LSTM model.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = join(module_dir, 'models')
    model = load_model(join(model_path, 'lstm_trained_model.h5'))
    y_predicted = model.predict(learning_df)
    return y_predicted
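
# A minimal usage sketch (hypothetical paths and names): input_data should
# hold the current, voltage and per-cycle discharge capacity history, e.g.
# the output of file_reader(); the model saved under
# models/lstm_trained_model.h5 is then used for the forecast.
# y_forecast = model_prediction(file_reader('/path/to/data', 'CX2_33',
#                                           'Channel_1-008', []))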


# Wrapping function only to merge and convert cumulative data to
# individual cycle data.
def cx2_file_reader(data_dir, file_name_format, sheet_name):
    """
    This function reads in the data for the CX2 samples experiment and
    returns a well formatted dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string): Sheet name containing the data in the excel
            file.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """
    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(sheet_name, (str, int)):
        raise TypeError('sheet_name is not of type string or integer')

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # Get the list of files in the directory
    path = join(data_dir, file_name_format)
    files = listdir(path)

    # Filter out and read the excel files in the data directory
    file_names = list(filter(lambda x: x[-5:] == '.xlsx', files))

    # Sort the file names using the 'file_name_sorting' function.
    sorted_name_list = file_name_sorting(file_names)

    # Read the dataframes in the order of the date of experimentation
    # using the 'reading_dataframes' function.
    sorted_df = reading_dataframes(sorted_name_list, sheet_name, path)

    # Merge all the dataframes and adjust the cycle index
    # using the 'concat_df' function.
    cycle_data = concat_df(sorted_df)

    # Calculate the net capacity of the battery at every datapoint
    # using the function 'capacity'.
    capacity_data = capacity(cycle_data)

    # Return the dataframe with new cycle indices and capacity data.
    return capacity_data


def file_name_sorting(file_name_list):
    """
    This function sorts all the file names according to the date
    in the file name.

    Args:
        file_name_list (list): List containing all the file names to be read

    Returns:
        An array of file names sorted according to the date in the file
        name.
    """
    filename = pd.DataFrame(data=file_name_list, columns=['file_name'])
    # Split the file name into different columns
    filename['cell_type'], filename['cell_num'], filename['month'], filename[
        'day'], filename['year'] = filename['file_name'].str.split('_', 4).str
    filename['year'], filename['ext'] = filename['year'].str.split('.', 1).str
    # Merge the year, month and day columns to create a string for the
    # DateTime object.
    filename['date'] = filename['year'].map(
        str) + filename['month'].map(str) + filename['day'].map(str)
    # Create a DateTime object.
    filename['date_time'] = pd.to_datetime(filename['date'], format="%y%m%d")
    # Sort the file names according to the created DateTime object.
    filename.sort_values(['date_time'], inplace=True)
    # Create an array of the sorted file names
    sorted_file_names = filename['file_name'].values
    return sorted_file_names
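
# For example (hypothetical file name following this convention),
# 'CX2_33_12_02_10.xlsx' splits into cell_type='CX2', cell_num='33',
# month='12', day='02' and year='10', giving the date string '101202',
# which parses to the sortable date 2010-12-02.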


def reading_dataframes(file_names, sheet_name, path):
    """
    This function reads all the files in the sorted
    file names list as dataframes.

    Args:
        file_names (list): Sorted file names list
        sheet_name (string or int): Sheet name in the excel file containing
            the data.
        path (string): Path to the directory containing the files.

    Returns:
        Dictionary of dataframes in the order of the sorted file names.
    """
    # Empty dictionary to store all the dataframes according
    # to the order in the sorted file names list
    df_raw = {}
    # Read the dataframes
    for i, filename in enumerate(file_names):
        df_raw[i] = pd.read_excel(join(path, filename),
                                  sheet_name=sheet_name)
    return df_raw


def concat_df(df_dict):
    """
    This function concatenates all the dataframes and edits
    the cycle index for the concatenated dataframes.

    Args:
        df_dict (dictionary): Dictionary of dataframes to be concatenated.

    Returns:
        A concatenated dataframe with an edited cycle index
    """
    df_concat = None
    for data in df_dict:
        if df_concat is None:
            df_next = df_dict[data]
            df_concat = pd.DataFrame(data=None, columns=df_next.columns)
            df_concat = pd.concat([df_concat, df_next])
        else:
            # Offset the cycle indices, test times and cumulative capacities
            # of the next dataframe so that they continue from the maximum
            # values of the dataframes concatenated so far
            df_next = df_dict[data]
            df_next['Cycle_Index'] = np.array(
                df_next['Cycle_Index']) + max(
                np.array(df_concat['Cycle_Index']))
            df_next['Test_Time(s)'] = np.array(
                df_next['Test_Time(s)']) + max(
                np.array(df_concat['Test_Time(s)']))
            df_next['Charge_Capacity(Ah)'] = np.array(
                df_next['Charge_Capacity(Ah)']) + max(
                np.array(df_concat['Charge_Capacity(Ah)']))
            df_next['Discharge_Capacity(Ah)'] = np.array(
                df_next['Discharge_Capacity(Ah)']) + max(
                np.array(df_concat['Discharge_Capacity(Ah)']))
            df_concat = pd.concat([df_concat, df_next])
    # Reset the index and drop the old index
    df_reset = df_concat.reset_index(drop=True)
    return df_reset


def capacity(df_data):
    """
    This function calculates the net capacity of the battery
    from the charge capacity and discharge capacity values.

    Args:
        df_data (dataframe): Concatenated dataframe which has the values of
            charge capacity and discharge capacity for which the net
            capacity has to be calculated.

    Returns:
        Dataframe with the net capacity of the battery at every point of
        the charge and discharge cycle.
    """
    # Group the rows by the cycle index.
    group = df_data.groupby(['Cycle_Index']).count()

    # Get the indices at which a cycle starts
    cycle_start_indices = group['Data_Point'].cumsum()

    # Get the charge_Ah per cycle.
    # Create a numpy array to store the old charge_Ah column, and then
    # perform the transformation on it, rather than on the pandas series;
    # this is a lot faster in this case.
    charge_cycle_ah = np.array(df_data['Charge_Capacity(Ah)'])
    charge_ah = np.array(df_data['Charge_Capacity(Ah)'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        charge_cycle_ah[begin_value:end_value] = \
            charge_ah[begin_value:end_value] - charge_ah[begin_value - 1]

    df_data['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_data['Discharge_Capacity(Ah)'])
    discharge_ah = np.array(df_data['Discharge_Capacity(Ah)'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        discharge_cycle_ah[begin_value:end_value] = \
            discharge_ah[begin_value:end_value] - discharge_ah[begin_value - 1]

    df_data['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction.
    # It is not totally accurate, as it still has some points that go
    # negative, due to incorrect discharge_Ah values every few cycles.
    # But the machine learning algorithm should treat these as outliers and
    # hopefully get over them. We can come back and correct this.
    df_data['capacity_ah'] = (df_data['charge_cycle_ah'] -
                              df_data['discharge_cycle_ah'])

    return df_data


def data_formatting(merged_df):
    """
    This function formats the merged dataframe so that it can be used to
    frame the given time series data as a supervised learning dataset.

    Args:
        merged_df (dataframe): The merged dataframe, which can be obtained
            by using the function 'cx2_file_reader'

    Returns:
        A dataframe with only the values required to frame a time series as
        a supervised learning dataset.
    """
    # Get the columns containing the text 'Current', 'Voltage' and
    # 'discharge_cycle_ah'
    merged_df = merged_df.filter(regex='Current|Voltage|discharge_cycle_ah')
    formatted_df = merged_df.astype('float32')
    return formatted_df


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Args:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean, whether or not to drop rows with NaN values.

    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if isinstance(data, list) else data.shape[1]
    df_data = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df_data.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df_data.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    sl_df = pd.concat(cols, axis=1)
    sl_df.columns = names
    # drop rows with NaN values
    if dropnan:
        sl_df.dropna(inplace=True)
    # Drop the current and voltage columns at time t, which are not used as
    # features, and give the remaining columns descriptive names
    sl_df.drop(sl_df.columns[[3, 4]], axis=1, inplace=True)
    sl_df.rename(
        columns={'var1(t-1)': 'Current(t-1)', 'var2(t-1)': 'Voltage(t-1)',
                 'var3(t-1)': 'discharge_capacity(t-1)',
                 'var3(t)': 'discharge_capacity(t)'},
        inplace=True)
    return sl_df
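
# A minimal sketch of the framing (hypothetical values): a three-column
# input such as
# df_toy = pd.DataFrame({'Current(A)': [1.0, 1.1, 1.2],
#                        'Voltage(V)': [3.0, 3.1, 3.2],
#                        'discharge_cycle_ah': [0.2, 0.3, 0.4]})
# passed to series_to_supervised(df_toy) drops the first (NaN) row and
# returns two rows with the features Current(t-1), Voltage(t-1),
# discharge_capacity(t-1) and the target discharge_capacity(t).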


def long_short_term_memory(model_data):
    """
    This function splits the input dataset into training
    and testing datasets. The keras LSTM model is then
    trained and tested using the respective datasets.

    Args:
        model_data (dataframe): Values of the input and output variables of
            the time series data framed as a supervised learning dataset.

    Returns:
        model_loss (dictionary): Returns the history dictionary (more info
            to be added)
        y_hat (array): Predicted response for the testing dataset.
    """
    # Split the input dataset into training and testing data
    train, test = train_test_split(model_data, test_size=0.2,
                                   random_state=944)
    # Split into inputs and outputs
    train_x, train_y = (train[train.columns[0:3]].values,
                        train[train.columns[3]].values)
    test_x, test_y = (test[test.columns[0:3]].values,
                      test[test.columns[3]].values)
    # Reshape the input to be 3D [samples, timesteps, features]
    train_x = train_x.reshape((train_x.shape[0], 1, train_x.shape[1]))
    test_x = test_x.reshape((test_x.shape[0], 1, test_x.shape[1]))

    # Design the network
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_x.shape[1], train_x.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    # Fit the network on the training data, validating on the testing data
    history = model.fit(
        train_x,
        train_y,
        epochs=50,
        batch_size=72,
        validation_data=(test_x, test_y),
        verbose=0,
        shuffle=False)
    model_loss = history.history
    # Prediction for the test dataset.
    yhat = model.predict(test_x)
    # model.save('lstm_trained_model.h5')
    return model_loss, yhat
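
# A minimal sketch of inspecting the returned history: with the compile/fit
# settings above, keras records 'loss' and 'val_loss' per epoch, so
# model_loss['loss'][-1] and model_loss['val_loss'][-1] give the final
# training and validation MAE.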


def file_reader(data_dir, file_name_format, sheet_name, ignore_file_indices):
    """
    This function reads PL sample, CX2 and CS2 files and returns a nice
    dataframe with cyclic values of charge and discharge capacity, with
    cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string): Sheet name containing the data in the excel
            file.
        ignore_file_indices (list, int): This list of ints tells which files
            to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """
    # For excel files (CX2 and CS2 datafiles), the function 'cx2_file_reader'
    # is used.
    if file_name_format[:3] == 'CX2' or file_name_format[:3] == 'CS2':
        df_output = cx2_file_reader(data_dir, file_name_format, sheet_name)
    else:
        df_output = pl_samples_file_reader(
            data_dir, file_name_format, ignore_file_indices)

    # The function 'data_formatting' is used to drop the unnecessary columns
    # from the training data, i.e. only the features considered in the model
    # (Current, Voltage and Discharge capacity) are retained.
    formatted_data = data_formatting(df_output)

    # The function 'series_to_supervised' can then be used to frame the time
    # series data as a supervised learning dataset.
    # df_out = series_to_supervised(
    #     formatted_data, n_in=1, n_out=1, dropnan=True)
    return formatted_data
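

# A minimal end-to-end sketch (hypothetical paths, file name format and
# sheet name):
# formatted = file_reader('/path/to/data', 'CX2_33', 'Channel_1-008', [])
# learning_df = series_to_supervised(formatted, n_in=1, n_out=1,
#                                    dropnan=True)
# model_loss, y_hat = long_short_term_memory(learning_df)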