"""
This module can be used to read cycling data of the CX2, CS2 and PL type cells
as a dataframe. It converts cumulative values into individual values for
each cycle and determines the net charge of the battery at every datapoint.
It can also be used to train and test an LSTM model and predict discharge
capacity using the LSTM model.
"""

import datetime
import os
from os import listdir
from os.path import isfile, join
import re
# import matplotlib.pyplot as plt
# import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import load_model

# @profile
def date_time_converter(date_time_list):
    """
    This function takes a list of date_time values in MATLAB datenum format
    and returns a list of date_time values in a human readable format.
    """

    if not isinstance(date_time_list, list):
        raise TypeError("date_time_list should be a list")

    # Empty list to hold the results
    date_time_human = []

    for i in date_time_list:
        date_time_human.append(
            datetime.datetime.fromordinal(int(i)) +
            datetime.timedelta(days=i % 1) -
            datetime.timedelta(days=366))

    return date_time_human
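
# A minimal example (the datenum below is made up): MATLAB datenums count
# days from year 0 while Python ordinals count from year 1, hence the
# 366-day offset above. The fractional part of the datenum is the time of
# day, so
#     date_time_converter([737061.5])
# returns [datetime.datetime(2018, 1, 1, 12, 0)].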

# @profile


def get_dict_files(data_dir, file_name_format, ignore_file_indices):
    """
    This function finds all the files at the given location that match the
    file name format as specified, and then creates a dictionary of
    dataframes after ignoring the list of files specified.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        ignore_file_indices (list, int): This list of ints tells which file
            numbers to ignore.

    Returns:
        The dictionary with the dataframes of all data from the files.
    """

    # Get the list of files in the directory
    onlyfiles = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]

    # Extract the experiment name from the file_name_format
    exp_name = file_name_format[0:4]

    # Empty dictionary to hold the dataframes for the various files
    dict_files = {}

    # Iterate over all the files of a certain type and get the file number
    # from them
    for filename in onlyfiles:
        if exp_name in filename:
            # Extract the file number from the name
            file_number = re.search(
                exp_name + r'\((.+?)\).csv',
                filename).group(1)
            # Give a value of dataframe to each key
            dict_files[int(file_number)] = pd.read_csv(
                join(data_dir, filename))

    # Empty dictionary to hold the ordered dictionaries
    dict_ordered = {}
    # Sort the dictionary based on keys
    for key in sorted(dict_files.keys()):
        dict_ordered[key] = dict_files[key]

    # Keys of the files to keep: remove the ignored indices from all keys
    wanted_keys = np.array(
        list(set(dict_ordered.keys()) - set(ignore_file_indices)))

    # Remove the ignored dataframes for characterization
    dict_ord_cycling_data = {k: dict_ordered[k] for k in wanted_keys}

    return dict_ord_cycling_data
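
# A minimal usage sketch (the path and file names here are hypothetical):
# for a directory containing PL11(1).csv, PL11(2).csv, PL11(3).csv, ...
#     get_dict_files('/path/to/pl_data', 'PL11(1).csv', [3])
# reads every PL11(*).csv into a dataframe keyed by its file number, and
# drops file number 3 from the returned dictionary.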


def concat_dict_dataframes(dict_ord_cycling_data):
    """
    This function takes in a dictionary with ordered keys
    and concatenates the dataframes in the values of the
    dictionary to create a large dataframe with all the records.

    Args:
        dict_ord_cycling_data (dict):
            The dictionary with ordered integer keys and dataframes as values

    Returns:
        The dataframe after concatenation
    """

    # Raise an exception if the type of the input is not correct
    if not isinstance(dict_ord_cycling_data, dict):
        raise TypeError('dict_ord_cycling_data is not of type dict')

    # print(dict_ord_cycling_data.keys())
    for i in dict_ord_cycling_data.keys():
        # Raise an exception if the type of the keys is not integer
        # print(type(i))
        if not isinstance(i, (int, np.int64)):
            raise TypeError('a key in the dictionary is not an integer')

    for i in dict_ord_cycling_data.values():
        # Raise an exception if the type of the values is not a dataframe
        if not isinstance(i, pd.DataFrame):
            raise TypeError('a value in the dictionary is not a pandas '
                            'dataframe')
        # print(i.columns)
        # Raise an exception if the necessary columns are not found in the df
        if not {'Cycle', 'Charge_Ah', 'Discharge_Ah', 'Time_sec',
                'Current_Amp', 'Voltage_Volt'}.issubset(i.columns):
            raise Exception("the dataframe doesn't have the columns 'Cycle'"
                            ", 'Charge_Ah', 'Discharge_Ah', 'Time_sec', "
                            "'Voltage_Volt', 'Current_Amp'")

    # Concatenate the dataframes to create the total dataframe
    df_out = None
    for k in dict_ord_cycling_data.keys():
        if df_out is None:
            df_next = dict_ord_cycling_data[k]
            df_out = pd.DataFrame(data=None, columns=df_next.columns)
            df_out = pd.concat([df_out, df_next])
        else:
            df_next = dict_ord_cycling_data[k]
            df_next['Cycle'] = np.array(
                df_next['Cycle']) + max(np.array(df_out['Cycle']))
            df_next['Time_sec'] = np.array(
                df_next['Time_sec']) + max(np.array(df_out['Time_sec']))
            df_next['Charge_Ah'] = np.array(
                df_next['Charge_Ah']) + max(np.array(df_out['Charge_Ah']))
            df_next['Discharge_Ah'] = np.array(
                df_next['Discharge_Ah']) + max(
                    np.array(df_out['Discharge_Ah']))
            df_out = pd.concat([df_out, df_next])

    return df_out
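
# To illustrate the offsetting with toy numbers: if the dataframe built so
# far ends at Cycle 50 and Charge_Ah 120.0, every 'Cycle' value in the next
# file is shifted by +50 and every 'Charge_Ah' value by +120.0, so the
# cumulative columns keep increasing monotonically across file boundaries.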


def get_cycle_capacities(df_out):
    """
    This function takes the dataframe, creates a new index and then
    calculates capacities per cycle from the cumulative charge and
    discharge capacities.

    Args:
        df_out (pandas.DataFrame):
            Concatenated dataframe

    Returns:
        The dataframe with capacities per cycle
    """

    # Raise an exception if the necessary columns are not found in the df
    if not {'Cycle', 'Charge_Ah', 'Discharge_Ah', 'Time_sec', 'Current_Amp',
            'Voltage_Volt'}.issubset(df_out.columns):
        raise Exception("the dataframe doesn't have the columns 'Cycle'"
                        ", 'Charge_Ah', 'Discharge_Ah', 'Time_sec', "
                        "'Voltage_Volt', 'Current_Amp'")

    # Reset the index and drop the old index
    df_out_indexed = df_out.reset_index(drop=True)

    # Proceed further with correcting the capacity
    df_grouped = df_out_indexed.groupby(['Cycle']).count()

    # Get the indices where a cycle starts
    cycle_start_indices = df_grouped['Time_sec'].cumsum()

    # Get the charge_Ah per cycle
    # Create numpy arrays to hold the old Charge_Ah column and perform the
    # transformation on them, rather than on the pandas series; this is a
    # lot faster in this case
    charge_cycle_ah = np.array(df_out_indexed['Charge_Ah'])
    charge_ah = np.array(df_out_indexed['Charge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        charge_cycle_ah[begin_value:end_value] = (
            charge_ah[begin_value:end_value] - charge_ah[begin_value - 1])

    df_out_indexed['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_out_indexed['Discharge_Ah'])
    discharge_ah = np.array(df_out_indexed['Discharge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        discharge_cycle_ah[begin_value:end_value] = (
            discharge_ah[begin_value:end_value] -
            discharge_ah[begin_value - 1])

    df_out_indexed['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction. It is not totally
    # accurate, as it still has some points that go negative due to
    # incorrect discharge_Ah values every few cycles, but the machine
    # learning algorithm should treat these as outliers. We can come back
    # and correct this.
    df_out_indexed['capacity_ah'] = charge_cycle_ah - discharge_cycle_ah
    df_out_indexed.rename(
        columns={'Current_Amp': 'Current(A)', 'Voltage_Volt': 'Voltage(V)'},
        inplace=True)
    return df_out_indexed
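
# A small worked example of the per-cycle correction (made-up values): if
# the cumulative Charge_Ah column is [0.2, 0.5, 0.5, 0.9] and the second
# cycle starts at index 2, the second cycle's rows have the last value of
# the first cycle (0.5) subtracted, giving
# charge_cycle_ah = [0.2, 0.5, 0.0, 0.4].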

# @profile


def pl_samples_file_reader(data_dir, file_name_format, ignore_file_indices):
    """
    This function reads in the data for the PL Samples experiment and
    returns a nice dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        ignore_file_indices (list, int): This list of ints tells which files
            to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """

    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(ignore_file_indices, list):
        raise TypeError("ignore_file_indices should be a list")

    for ignore_file_index in ignore_file_indices:
        if not isinstance(ignore_file_index, int):
            raise TypeError("""ignore_file_indices elements should be
                of type integer""")

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    dict_ord_cycling_data = get_dict_files(
        data_dir, file_name_format, ignore_file_indices)

    df_out = concat_dict_dataframes(dict_ord_cycling_data)

    ####
    # This has been commented out for performance, as we do not need
    # date_time
    ####
    # Convert the Date_Time from matlab datenum to human readable Date_Time
    # First convert the series into a list
    # date_time_matlab = df_out['Date_Time'].tolist()

    # # Apply the conversion to the list
    # df_out['Date_Time_new'] = date_time_converter(date_time_matlab)

    # Get the cycle capacities from the cumulative capacities
    df_out_indexed = get_cycle_capacities(df_out)

    return df_out_indexed
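
# Hypothetical usage (the path and file name are assumptions):
#     df = pl_samples_file_reader('/path/to/pl_data', 'PL11(1).csv', [3])
# returns a single dataframe with continuous cycle numbering and the
# per-cycle capacity columns added by get_cycle_capacities.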

# Wrapping function to train the LSTM model and calculate the model loss
# and the response to the testing dataset.


def model_training(data_dir, file_name_format, sheet_name):
    """
    This function converts cumulative battery cycling data into individual
    cycle data and trains the LSTM model with the converted dataset.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string or int): Sheet name or sheet number in the excel
            file containing the relevant data.

    Returns:
        model_loss (dictionary): Returns the history dictionary (more info
            to be added)
        y_hat (array): Predicted response for the testing dataset.
    # y_prediction(array): Predicted response for the completely new dataset
    # (The input has to be the time series cycling data including values of
    # Current, Voltage and Discharge Capacity)
    """
    # The function 'cx2_file_reader' is used to read all the excel files
    # in the given path and convert the given cumulative data into
    # individual cycle data.
    individual_cycle_data = cx2_file_reader(
        data_dir, file_name_format, sheet_name)

    # The function 'data_formatting' is used to drop the unnecessary columns
    # from the training data, i.e. only the features considered in the model
    # (Current, Voltage and Discharge capacity) are retained.
    formatted_data = data_formatting(individual_cycle_data)

    # The function 'series_to_supervised' is used to frame the time series
    # training data as a supervised learning dataset.
    learning_df = series_to_supervised(
        formatted_data, n_in=1, n_out=1, dropnan=True)

    # The function 'long_short_term_memory' is used to train the model
    # and predict the response for the new input dataset.
    model_loss, y_hat = long_short_term_memory(learning_df)

    return model_loss, y_hat
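
# Hypothetical usage (the directory, folder name and sheet name below are
# assumptions):
#     model_loss, y_hat = model_training('/path/to/data', 'CX2_16', 'Sheet1')
# model_loss['loss'] and model_loss['val_loss'] then hold the training and
# validation MAE per epoch, and y_hat is the prediction on the test split.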


# Function to predict the discharge capacity using the trained LSTM model.
def model_prediction(input_data):
    """
    This function can be used to forecast the discharge capacity of a
    battery using the trained LSTM model.

    Args:
        input_data (dataframe): This is the dataframe containing the
            current, voltage and discharge capacity values at a prior time,
            which can be used to forecast the discharge capacity at a
            later time.

    Returns:
        y_predicted: The forecasted values of discharge capacity.
    """

    # The function 'series_to_supervised' is used to frame the time series
    # data as a supervised learning dataset.
    learning_df = series_to_supervised(
        input_data, n_in=1, n_out=1, dropnan=True)
    learning_df = learning_df.iloc[:, 0:3].values
    # Reshaping the input dataset.
    learning_df = learning_df.reshape(
        (learning_df.shape[0], 1, learning_df.shape[1]))
    # Predicting the discharge values using the saved LSTM model.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = join(module_dir, 'models')
    model = load_model(join(model_path, 'lstm_trained_model.h5'))
    y_predicted = model.predict(learning_df)
    return y_predicted
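
# Note: the trained model is expected at models/lstm_trained_model.h5
# relative to this module; a model saved by long_short_term_memory (see the
# commented-out model.save call there) has to be placed in that folder
# before model_prediction can be used.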


# Wrapping function only to merge and convert cumulative data to
# individual cycle data.
def cx2_file_reader(data_dir, file_name_format, sheet_name):
    """
    This function reads in the data for the CX2 samples experiment and
    returns a well formatted dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string or int): Sheet name containing the data in the
            excel file.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """
    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(sheet_name, (str, int)):
        raise TypeError('sheet_name is not of type string or integer')

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # Get the list of files in the directory
    path = join(data_dir, file_name_format)
    files = listdir(path)

    # Extract the experiment name from the file_name_format
    # exp_name = file_name_format[0:6]

    # Filtering out and reading the excel files in the data directory
    file_names = list(filter(lambda x: x[-5:] == '.xlsx', files))

    # Sorting the file names using the
    # 'file_name_sorting' function.
    sorted_name_list = file_name_sorting(file_names)

    # Reading dataframes according to the date of experimentation
    # using the 'reading_dataframes' function.
    sorted_df = reading_dataframes(sorted_name_list, sheet_name, path)

    # Merging all the dataframes and adjusting the cycle index
    # using the 'concat_df' function.
    cycle_data = concat_df(sorted_df)

    # Calculating the net capacity of the battery at every datapoint
    # using the function 'capacity'.
    capacity_data = capacity(cycle_data)

    # Return the dataframe with new cycle indices and capacity data.
    return capacity_data
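
# Hypothetical usage (the folder layout and sheet name are assumptions):
# with excel files such as CX2_16_10_14_10.xlsx inside /path/to/data/CX2_16,
#     df = cx2_file_reader('/path/to/data', 'CX2_16', 'Sheet1')
# reads all of them in date order and returns one dataframe with per-cycle
# capacities.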


def file_name_sorting(file_name_list):
    """
    This function sorts all the file names according to the date
    in the file name.

    Args:
        file_name_list (list): List containing all the file names to be read

    Returns:
        A list of file names sorted according to the date in the file name.
    """
    filename = pd.DataFrame(data=file_name_list, columns=['file_name'])
    # Splitting the file name into different columns
    filename['cell_type'], filename['cell_num'], filename['month'], filename[
        'day'], filename['year'] = filename['file_name'].str.split('_', 4).str
    filename['year'], filename['ext'] = filename['year'].str.split('.', 1).str
    filename['date'] = ''
    # Merging the year, month and day columns to create a string for the
    # DateTime object.
    filename['date'] = filename['year'].map(
        str) + filename['month'].map(str) + filename['day'].map(str)
    # Creating a DateTime object.
    filename['date_time'] = ''
    filename['date_time'] = pd.to_datetime(filename['date'], format="%y%m%d")
    # Sorting the file names according to the
    # created DateTime object.
    filename.sort_values(['date_time'], inplace=True)
    # Create a list of the sorted file names
    sorted_file_names = filename['file_name'].values
    return sorted_file_names
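
# For example (a hypothetical name), 'CX2_16_10_14_10.xlsx' splits into cell
# type 'CX2', cell number '16', month '10', day '14' and year '10'; these
# are joined to '101014' and parsed as the date 2010-10-14 used for sorting.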


def reading_dataframes(file_names, sheet_name, path):
    """
    This function reads all the files in the sorted
    file names list as dataframes.

    Args:
        file_names (list): Sorted file names list
        sheet_name (string or int): Sheet name in the excel file containing
            the data.
        path (string): Path to the directory containing the files.

    Returns:
        Dictionary of dataframes in the order of the sorted file names.
    """
    # Empty dictionary to store all the dataframes according
    # to the order in the sorted file names list
    df_raw = {}
    # Reading the dataframes
    for i, filename in enumerate(file_names):
        df_raw[i] = pd.read_excel(
            join(path, filename), sheet_name=sheet_name)
    return df_raw


def concat_df(df_dict):
    """
    This function concatenates all the dataframes and edits
    the cycle index for the concatenated dataframes.

    Args:
        df_dict (dictionary): Dictionary of dataframes to be concatenated.

    Returns:
        A concatenated dataframe with an edited cycle index
    """
    df_concat = None
    for data in df_dict:
        if df_concat is None:
            df_next = df_dict[data]
            df_concat = pd.DataFrame(data=None, columns=df_next.columns)
            # df_next['Cycle'] = df_next['Cycle'] + max(df_pl12['Cycle'])
            df_concat = pd.concat([df_concat, df_next])
        else:
            df_next = df_dict[data]
            df_next['Cycle_Index'] = np.array(
                df_next['Cycle_Index']) + max(
                    np.array(df_concat['Cycle_Index']))
            df_next['Test_Time(s)'] = np.array(
                df_next['Test_Time(s)']) + max(
                    np.array(df_concat['Test_Time(s)']))
            df_next['Charge_Capacity(Ah)'] = np.array(
                df_next['Charge_Capacity(Ah)']) + max(
                    np.array(df_concat['Charge_Capacity(Ah)']))
            df_next['Discharge_Capacity(Ah)'] = np.array(
                df_next['Discharge_Capacity(Ah)']) + max(
                    np.array(df_concat['Discharge_Capacity(Ah)']))
            df_concat = pd.concat([df_concat, df_next])
    # Reset the index and drop the old index
    df_reset = df_concat.reset_index(drop=True)
    return df_reset


def capacity(df_data):
    """
    This function calculates the net capacity of the battery
    from the charge capacity and discharge capacity values.

    Args:
        df_data (dataframe): Concatenated dataframe with the values of
            charge capacity and discharge capacity for which the net
            capacity has to be calculated.

    Returns:
        Dataframe with the net capacity of the battery at every point of
        the charge and discharge cycle.
    """
    # Grouping rows by the cycle index.
    group = df_data.groupby(['Cycle_Index']).count()

    # Get the indices where a cycle starts
    cycle_start_indices = group['Data_Point'].cumsum()

    # Get the charge_Ah per cycle
    # Create numpy arrays to hold the old Charge_Capacity(Ah) column and
    # perform the transformation on them, rather than on the pandas series;
    # this is a lot faster in this case
    charge_cycle_ah = np.array(df_data['Charge_Capacity(Ah)'])
    charge_ah = np.array(df_data['Charge_Capacity(Ah)'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        charge_cycle_ah[begin_value:end_value] = (
            charge_ah[begin_value:end_value] - charge_ah[begin_value - 1])

    df_data['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_data['Discharge_Capacity(Ah)'])
    discharge_ah = np.array(df_data['Discharge_Capacity(Ah)'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        discharge_cycle_ah[begin_value:end_value] = (
            discharge_ah[begin_value:end_value] -
            discharge_ah[begin_value - 1])

    df_data['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction. It is not totally
    # accurate, as it still has some points that go negative due to
    # incorrect discharge_Ah values every few cycles, but the machine
    # learning algorithm should treat these as outliers. We can come back
    # and correct this.
    df_data['capacity_ah'] = (df_data['charge_cycle_ah'] -
                              df_data['discharge_cycle_ah'])

    return df_data


def data_formatting(merged_df):
    """
    This function formats the merged dataframe so that it can be used to
    frame the given time series data as a supervised learning dataset.

    Args:
        merged_df (dataframe): The merged dataframe, which can be obtained
            by using the function 'cx2_file_reader'

    Returns:
        A dataframe with only the values required to frame a time series
        as a supervised learning dataset.
    """
    # Get the columns containing the text 'Current', 'Voltage' and
    # 'discharge_cycle_ah'
    merged_df = merged_df.filter(regex='Current|Voltage|discharge_cycle_ah')
    formatted_df = merged_df.astype('float32')
    return formatted_df


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.

    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if isinstance(data, list) else data.shape[1]
    df_data = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df_data.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df_data.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    sl_df = pd.concat(cols, axis=1)
    sl_df.columns = names
    # drop rows with NaN values
    if dropnan:
        sl_df.dropna(inplace=True)
    sl_df.drop(sl_df.columns[[3, 4]], axis=1, inplace=True)
    sl_df.rename(
        columns={'var1(t-1)': 'Current(t-1)',
                 'var2(t-1)': 'Voltage(t-1)',
                 'var3(t-1)': 'discharge_capacity(t-1)',
                 'var3(t)': 'discharge_capacity(t)'},
        inplace=True)
    return sl_df
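
# With the three-column input from data_formatting and the defaults
# n_in=1, n_out=1, the frame is [var1(t-1), var2(t-1), var3(t-1), var1(t),
# var2(t), var3(t)]; columns 3 and 4 (current and voltage at time t) are
# then dropped, leaving three lagged features and the discharge_capacity(t)
# target. Note that the hard-coded drop and rename assume exactly three
# input variables.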


def long_short_term_memory(model_data):
    """
    This function splits the input dataset into training
    and testing datasets. The keras LSTM model is then
    trained and tested using the respective datasets.

    Args:
        model_data (dataframe): Values of the input and output variables
            of the time series data framed as a supervised learning
            dataset.

    Returns:
        model_loss (dictionary): Returns the history dictionary (more info
            to be added)
        y_hat (array): Predicted response for the testing dataset.
    """
    # Splitting the input dataset into training and testing data
    train, test = train_test_split(
        model_data, test_size=0.2, random_state=944)
    # Split into inputs and outputs
    train_x, train_y = (train[train.columns[0:3]].values,
                        train[train.columns[3]].values)
    test_x, test_y = (test[test.columns[0:3]].values,
                      test[test.columns[3]].values)
    # Reshape the input to be 3D [samples, timesteps, features]
    train_x = train_x.reshape((train_x.shape[0], 1, train_x.shape[1]))
    test_x = test_x.reshape((test_x.shape[0], 1, test_x.shape[1]))
    # print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

    # Designing the network
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_x.shape[1], train_x.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    # Fitting the network with the training and testing data
    history = model.fit(
        train_x,
        train_y,
        epochs=50,
        batch_size=72,
        validation_data=(test_x, test_y),
        verbose=0,
        shuffle=False)
    model_loss = history.history
    # Prediction for the test dataset.
    yhat = model.predict(test_x)
    # model.save('lstm_trained_model.h5')
    return model_loss, yhat
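
# A note on the design: the network is a single 50-unit LSTM layer followed
# by a one-unit dense output, trained with MAE loss and the Adam optimizer.
# Each sample is one timestep with three features, so the model effectively
# learns a one-step-ahead regression of discharge capacity. Uncommenting
# model.save above (and moving the file into the models/ folder) produces
# the weights that model_prediction loads.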


def file_reader(data_dir, file_name_format, sheet_name, ignore_file_indices):
    """
    This function reads PL sample, CX2 and CS2 files and returns a nice
    dataframe with cyclic values of charge and discharge capacity with
    cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string): Sheet name containing the data in the excel
            file.
        ignore_file_indices (list, int): This list of ints tells which files
            to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """

    # For excel files (CX2 and CS2 datafiles), the function 'cx2_file_reader'
    # is used.
    if file_name_format[:3] == 'CX2' or file_name_format[:3] == 'CS2':
        df_output = cx2_file_reader(data_dir, file_name_format, sheet_name)
    else:
        df_output = pl_samples_file_reader(
            data_dir, file_name_format, ignore_file_indices)

    # The function 'data_formatting' is used to drop the unnecessary columns
    # from the training data, i.e. only the features considered in the model
    # (Current, Voltage and Discharge capacity) are retained.
    formatted_data = data_formatting(df_output)

    # The function 'series_to_supervised' is used to frame the time series
    # training data as a supervised learning dataset.
    # df_out = series_to_supervised(
    #     formatted_data, n_in=1, n_out=1, dropnan=True)
    return formatted_data
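
# A hypothetical end-to-end run (paths and names are assumptions):
#     data = file_reader('/path/to/data', 'CX2_16', 'Sheet1', [])
#     learning_df = series_to_supervised(data, n_in=1, n_out=1, dropnan=True)
#     model_loss, y_hat = long_short_term_memory(learning_df)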