1 | """ |
||
2 | This module can be used to read cycling data of the CX2, CS2 and PL type cells as |
||
3 | a dataframe. It converts cumulative values into individual values for |
||
4 | each cycle and determines net charge of the battery at every datapoint. |
||
5 | It can also be used to train and test a LSTM model and predict discharge capacity |
||
6 | using the LSTM model. |
||
7 | """ |
||
8 | |||
import datetime
import os
from os import listdir
from os.path import isfile, join
import re
# import matplotlib.pyplot as plt
# import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import load_model

# @profile
def date_time_converter(date_time_list):
    """
    This function takes a list of date_time values in Matlab datenum format
    and returns a list of date_time values in human-readable format.
    """

    if not isinstance(date_time_list, list):
        raise TypeError("date_time_list should be a list")

    # Empty list to hold the results
    date_time_human = []

    # Matlab's datenum counts days from January 0 of year 0000, while
    # Python's ordinal counts from January 1 of year 1, hence the
    # 366-day offset below.
    for i in date_time_list:
        date_time_human.append(
            datetime.datetime.fromordinal(int(i))
            + datetime.timedelta(days=i % 1)
            - datetime.timedelta(days=366))

    return date_time_human
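
# A quick sanity check for date_time_converter (hypothetical value):
# Matlab datenum 737000.5 corresponds to noon on 2017-11-01, so
#     date_time_converter([737000.5])
# is expected to return [datetime.datetime(2017, 11, 1, 12, 0)].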

# @profile


def get_dict_files(data_dir, file_name_format, ignore_file_indices):
    """
    This function finds all the files in the data directory that match the
    specified file name format and then creates a dictionary of their
    dataframes, after ignoring the files at the specified indices.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        ignore_file_indices (list, int): This list of ints tells which
            files to ignore.

    Returns:
        A dictionary with the dataframes read from all the files.
    """

    # Get the list of files in the directory
    onlyfiles = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]

    # Extract the experiment name from the file_name_format
    exp_name = file_name_format[0:4]

    # Empty dictionary to hold the dataframes for the various files
    dict_files = {}

    # Iterate over all the files of a certain type and get the file number
    # from their names
    for filename in onlyfiles:
        if exp_name in filename:
            # Extract the file number from the name
            file_number = re.search(
                exp_name + r'\((.+?)\).csv',
                filename).group(1)
            # Give a value of dataframe to each key
            dict_files[int(file_number)] = pd.read_csv(
                join(data_dir, filename))

    # Empty dictionary to hold the entries in sorted key order
    dict_ordered = {}
    # Sort the dictionary based on keys
    for key in sorted(dict_files.keys()):
        dict_ordered[key] = dict_files[key]

    # Keys of files to keep; remove the ignored indices from all keys
    wanted_keys = np.array(
        list(set(dict_ordered.keys()) - set(ignore_file_indices)))

    # Remove the dataframes ignored for characterization
    dict_ord_cycling_data = {k: dict_ordered[k] for k in wanted_keys}

    return dict_ord_cycling_data


def concat_dict_dataframes(dict_ord_cycling_data):
    """
    This function takes in a dictionary with ordered keys
    and concatenates the dataframes in the values of the
    dictionary to create a large dataframe with all the records.

    Args:
        dict_ord_cycling_data (dict):
            The dictionary with ordered integer keys and dataframes as values

    Returns:
        The dataframe after concatenation

    """

    # Raise an exception if the type of the input is not correct
    if not isinstance(dict_ord_cycling_data, dict):
        raise TypeError('dict_ord_cycling_data is not of type dict')

    # print(dict_ord_cycling_data.keys())
    for i in dict_ord_cycling_data.keys():
        # Raise an exception if the type of a key is not an integer
        # print(type(i))
        if not isinstance(i, (int, np.int64)):
            raise TypeError('a key in the dictionary is not an integer')

    for i in dict_ord_cycling_data.values():
        # Raise an exception if the type of a value is not a dataframe
        if not isinstance(i, pd.DataFrame):
            raise TypeError('a value in the dictionary is not a pandas '
                            'dataframe')
        # print(i.columns)
        # Raise an exception if the necessary columns are not in the df
        if not {
                'Cycle',
                'Charge_Ah',
                'Discharge_Ah',
                'Time_sec',
                'Current_Amp',
                'Voltage_Volt'}.issubset(i.columns):
            raise Exception("the dataframe doesn't have the columns 'Cycle'"
                            ", 'Charge_Ah', 'Discharge_Ah', "
                            "'Time_sec', 'Voltage_Volt', 'Current_Amp'")

    # Concatenate the dataframes to create the total dataframe
    df_out = None
    for k in dict_ord_cycling_data.keys():
        if df_out is None:
            df_next = dict_ord_cycling_data[k]
            df_out = pd.DataFrame(data=None, columns=df_next.columns)
            df_out = pd.concat([df_out, df_next])
        else:
            df_next = dict_ord_cycling_data[k]
            df_next['Cycle'] = np.array(
                df_next['Cycle']) + max(np.array(df_out['Cycle']))
            df_next['Time_sec'] = np.array(
                df_next['Time_sec']) + max(np.array(df_out['Time_sec']))
            df_next['Charge_Ah'] = np.array(
                df_next['Charge_Ah']) + max(np.array(df_out['Charge_Ah']))
            df_next['Discharge_Ah'] = np.array(
                df_next['Discharge_Ah']) + max(
                np.array(df_out['Discharge_Ah']))
            df_out = pd.concat([df_out, df_next])

    return df_out
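
# For example (hypothetical values), if df_out so far ends at Cycle 50 and
# the next file's cycles start again at 1, the loop above shifts them to
# 51, 52, ..., giving the combined dataframe one ascending cycle sequence;
# Time_sec, Charge_Ah and Discharge_Ah are shifted the same way, since
# they are cumulative within each file.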


def get_cycle_capacities(df_out):
    """
    This function takes the dataframe, creates a new index and then
    calculates capacities per cycle from the cumulative charge and
    discharge capacities.

    Args:
        df_out (pandas.DataFrame):
            Concatenated dataframe

    Returns:
        The dataframe with capacities per cycle

    """

    # Raise an exception if the necessary columns are not found in the df
    if not {'Cycle', 'Charge_Ah', 'Discharge_Ah', 'Time_sec', 'Current_Amp',
            'Voltage_Volt'}.issubset(df_out.columns):
        raise Exception("the dataframe doesn't have the columns 'Cycle'"
                        ", 'Charge_Ah', 'Discharge_Ah', "
                        "'Time_sec', 'Voltage_Volt', 'Current_Amp'")

    # Reset the index and drop the old index
    df_out_indexed = df_out.reset_index(drop=True)

    # Proceed further with correcting the capacity
    df_grouped = df_out_indexed.groupby(['Cycle']).count()

    # Get the indices where a cycle starts
    cycle_start_indices = df_grouped['Time_sec'].cumsum()

    # Get the charge_Ah per cycle.
    # Create a numpy array to store the old charge_Ah column, and then
    # perform the transformation on it rather than on the pandas series;
    # this is a lot faster in this case
    charge_cycle_ah = np.array(df_out_indexed['Charge_Ah'])
    charge_ah = np.array(df_out_indexed['Charge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        charge_cycle_ah[begin_value:end_value] = (
            charge_ah[begin_value:end_value] - charge_ah[begin_value - 1])

    df_out_indexed['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_out_indexed['Discharge_Ah'])
    discharge_ah = np.array(df_out_indexed['Discharge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        discharge_cycle_ah[begin_value:end_value] = (
            discharge_ah[begin_value:end_value]
            - discharge_ah[begin_value - 1])

    df_out_indexed['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction.
    # It is not totally accurate, as it still has some points that go
    # negative, due to incorrect discharge_Ah values every few cycles.
    # But the machine learning algorithm should treat these as outliers and
    # hopefully get over them. We can come back and correct this.
    df_out_indexed['capacity_ah'] = charge_cycle_ah - discharge_cycle_ah
    df_out_indexed.rename(
        columns={'Current_Amp': 'Current(A)',
                 'Voltage_Volt': 'Voltage(V)'},
        inplace=True)
    return df_out_indexed
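
# A small worked example (hypothetical values): if the cumulative Charge_Ah
# column reads [0.1, 0.2, 0.3] for cycle 1 and [0.4, 0.5] for cycle 2, the
# loops above subtract the last value of cycle 1 (0.3) from every point of
# cycle 2, giving per-cycle values [0.1, 0.2, 0.3] and [0.1, 0.2].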

# @profile


def pl_samples_file_reader(data_dir, file_name_format, ignore_file_indices):
    """
    This function reads in the data for the PL Samples experiment and
    returns a nice dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        ignore_file_indices (list, int): This list of ints tells which
            files to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """

    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(ignore_file_indices, list):
        raise TypeError("ignore_file_indices should be a list")

    for ignore_file_index in ignore_file_indices:
        if not isinstance(ignore_file_index, int):
            raise TypeError("ignore_file_indices elements should be "
                            "of type integer")

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    dict_ord_cycling_data = get_dict_files(
        data_dir, file_name_format, ignore_file_indices)

    df_out = concat_dict_dataframes(dict_ord_cycling_data)

    ####
    # This has been commented out for performance, as we do not need
    # date_time
    ####
    # Convert the Date_Time from matlab datenum to human readable Date_Time
    # First convert the series into a list
    # date_time_matlab = df_out['Date_Time'].tolist()

    # # Apply the conversion to the list
    # df_out['Date_Time_new'] = date_time_converter(date_time_matlab)

    # Get the cycle capacities from the cumulative capacities
    df_out_indexed = get_cycle_capacities(df_out)

    return df_out_indexed
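
# Typical usage (hypothetical path and file name; the sibling files
# 'PL11(2).csv', 'PL11(3).csv', ... are deduced from the format):
#     df = pl_samples_file_reader('/path/to/data', 'PL11(1).csv', [3])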

# Wrapping function to train the LSTM model and calculate model_loss
# and the response to the testing data set.


def model_training(data_dir, file_name_format, sheet_name):
    """
    This function converts cumulative battery cycling data into individual
    cycle data and trains the LSTM model with the converted data set.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string or int): Sheet name or sheet number in the excel
            file containing the relevant data.

    Returns:
        model_loss (dictionary): Returns the history dictionary (more info
            to be added)
        y_hat (array): Predicted response for the testing dataset.
        # y_prediction(array): Predicted response for the completely new
        # dataset (The input has to be the time series cycling data
        # including values of Current, Voltage and Discharge Capacity)
    """
    # The function 'cx2_file_reader' is used to read all the excel files
    # in the given path and convert the given cumulative data into
    # individual cycle data.
    individual_cycle_data = cx2_file_reader(
        data_dir, file_name_format, sheet_name)

    # The function 'data_formatting' is used to drop the unnecessary columns
    # from the training data, i.e. only the features considered in the model
    # (Current, Voltage and Discharge capacity) are retained.
    formatted_data = data_formatting(individual_cycle_data)

    # The function 'series_to_supervised' is used to frame the time series
    # training data as a supervised learning dataset.
    learning_df = series_to_supervised(
        formatted_data, n_in=1, n_out=1, dropnan=True)

    # The function 'long_short_term_memory' is used to train the model
    # and predict the response for the new input dataset.
    model_loss, y_hat = long_short_term_memory(learning_df)

    return model_loss, y_hat
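
# Typical usage (hypothetical path, folder and sheet name):
#     model_loss, y_hat = model_training('/path/to/data', 'CX2_16', 'Sheet1')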


# Function to predict the discharge capacity using the trained LSTM model.
def model_prediction(input_data):
    """
    This function can be used to forecast the discharge capacity of a
    battery using the trained LSTM model.

    Args:
        input_data (dataframe): This is the dataframe containing the
            current, voltage and discharge capacity values at a prior time,
            which can be used to forecast the discharge capacity at a
            further time.

    Returns:
        y_predicted: The forecasted values of discharge capacity.
    """

    # The function 'series_to_supervised' is used to frame the time series
    # input data as a supervised learning dataset.
    learning_df = series_to_supervised(
        input_data, n_in=1, n_out=1, dropnan=True)
    learning_df = learning_df.iloc[:, 0:3].values
    # Reshape the input dataset to the 3D [samples, timesteps, features]
    # shape expected by the LSTM.
    learning_df = learning_df.reshape(
        (learning_df.shape[0], 1, learning_df.shape[1]))
    # Predict the discharge values using the saved LSTM model.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = join(module_dir, 'models')
    model = load_model(join(model_path, 'lstm_trained_model.h5'))
    y_predicted = model.predict(learning_df)
    return y_predicted
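
# A minimal usage sketch (assuming the input dataframe already has only the
# Current, Voltage and discharge-capacity columns, e.g. the output of
# 'data_formatting'):
#     y_forecast = model_prediction(formatted_data)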


# Wrapping function only to merge and convert cumulative data to
# individual cycle data.
def cx2_file_reader(data_dir, file_name_format, sheet_name):
    """
    This function reads in the data for the CX2 samples experiment and
    returns a well formatted dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string): Sheet name containing the data in the excel
            file.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """
    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(sheet_name, (str, int)):
        raise TypeError('sheet_name is not of type string or integer')

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # Get the list of files in the directory
    path = join(data_dir, file_name_format)
    files = listdir(path)

    # Extract the experiment name from the file_name_format
    # exp_name = file_name_format[0:6]

    # Filter out the excel files in the data directory
    file_names = list(filter(lambda x: x[-5:] == '.xlsx', files))

    # Sort the file names using the 'file_name_sorting' function.
    sorted_name_list = file_name_sorting(file_names)

    # Read the dataframes in the order of the date of experimentation
    # using the 'reading_dataframes' function.
    sorted_df = reading_dataframes(sorted_name_list, sheet_name, path)

    # Merge all the dataframes and adjust the cycle index
    # using the 'concat_df' function.
    cycle_data = concat_df(sorted_df)

    # Calculate the net capacity of the battery at every datapoint
    # using the function 'capacity'.
    capacity_data = capacity(cycle_data)

    # Return the dataframe with new cycle indices and capacity data.
    return capacity_data


def file_name_sorting(file_name_list):
    """
    This function sorts all the file names according to the date
    in the file name.

    Args:
        file_name_list (list): List containing all the file names to be read

    Returns:
        A list of file names sorted according to the date in the file name.

    """
    filename = pd.DataFrame(data=file_name_list, columns=['file_name'])
    # Split the file name into different columns
    split_name = filename['file_name'].str.split('_', n=4, expand=True)
    filename[['cell_type', 'cell_num', 'month', 'day', 'year']] = \
        split_name.values
    split_year = filename['year'].str.split('.', n=1, expand=True)
    filename[['year', 'ext']] = split_year.values
    filename['date'] = ''
    # Merge the year, month and day columns to create a string for the
    # DateTime object.
    filename['date'] = filename['year'].map(
        str) + filename['month'].map(str) + filename['day'].map(str)
    # Create a DateTime object.
    filename['date_time'] = ''
    filename['date_time'] = pd.to_datetime(filename['date'], format="%y%m%d")
    # Sort the file names according to the created DateTime object.
    filename.sort_values(['date_time'], inplace=True)
    # Create a list of the sorted file names
    sorted_file_names = filename['file_name'].values
    return sorted_file_names
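
# For example (hypothetical names following the
# celltype_cellnum_month_day_year pattern),
# ['CX2_16_08_10_10.xlsx', 'CX2_16_07_28_10.xlsx'] would be reordered to
# ['CX2_16_07_28_10.xlsx', 'CX2_16_08_10_10.xlsx'], since July 28, 2010
# precedes August 10, 2010.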


def reading_dataframes(file_names, sheet_name, path):
    """
    This function reads all the files in the sorted
    file names list as dataframes.

    Args:
        file_names (list): Sorted file names list
        sheet_name (string or int): Sheet name in the excel file containing
            the data.
        path (string): Path to the directory containing the files.

    Returns:
        Dictionary of dataframes in the order of the sorted file names.
    """
    # Empty dictionary to store all the dataframes according
    # to the order in the sorted file names list
    df_raw = {}
    # Read the dataframes
    for i, filename in enumerate(file_names):
        df_raw[i] = pd.read_excel(
            join(path, filename),
            sheet_name=sheet_name)
    return df_raw


def concat_df(df_dict):
    """
    This function concatenates all the dataframes and edits
    the cycle index of the concatenated dataframes.

    Args:
        df_dict (dictionary): Dictionary of dataframes to be concatenated.

    Returns:
        A concatenated dataframe with an edited cycle index

    """
    df_concat = None
    for data in df_dict:
        if df_concat is None:
            df_next = df_dict[data]
            df_concat = pd.DataFrame(data=None, columns=df_next.columns)
            # df_next['Cycle'] = df_next['Cycle'] + max(df_pl12['Cycle'])
            df_concat = pd.concat([df_concat, df_next])
        else:
            df_next = df_dict[data]
            df_next['Cycle_Index'] = np.array(
                df_next['Cycle_Index']) + max(
                np.array(df_concat['Cycle_Index']))
            df_next['Test_Time(s)'] = np.array(
                df_next['Test_Time(s)']) + max(
                np.array(df_concat['Test_Time(s)']))
            df_next['Charge_Capacity(Ah)'] = np.array(
                df_next['Charge_Capacity(Ah)']) + max(
                np.array(df_concat['Charge_Capacity(Ah)']))
            df_next['Discharge_Capacity(Ah)'] = np.array(
                df_next['Discharge_Capacity(Ah)']) + max(
                np.array(df_concat['Discharge_Capacity(Ah)']))
            df_concat = pd.concat([df_concat, df_next])
    # Reset the index and drop the old index
    df_reset = df_concat.reset_index(drop=True)
    return df_reset


def capacity(df_data):
    """
    This function calculates the net capacity of the battery
    from the charge capacity and discharge capacity values.

    Args:
        df_data (dataframe): Concatenated dataframe which has the values of
            charge capacity and discharge capacity for which the net
            capacity has to be calculated.

    Returns:
        Dataframe with the net capacity of the battery for every point of
        the charge and discharge cycle.
    """
    # Group the rows by the cycle index.
    group = df_data.groupby(['Cycle_Index']).count()

    # Get the indices where a cycle starts
    cycle_start_indices = group['Data_Point'].cumsum()

    # Get the charge_Ah per cycle.
    # Create a numpy array to store the old charge_Ah column, and then
    # perform the transformation on it rather than on the pandas series;
    # this is a lot faster in this case
    charge_cycle_ah = np.array(df_data['Charge_Capacity(Ah)'])
    charge_ah = np.array(df_data['Charge_Capacity(Ah)'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        charge_cycle_ah[begin_value:end_value] = (
            charge_ah[begin_value:end_value] - charge_ah[begin_value - 1])

    df_data['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_data['Discharge_Capacity(Ah)'])
    discharge_ah = np.array(df_data['Discharge_Capacity(Ah)'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        discharge_cycle_ah[begin_value:end_value] = (
            discharge_ah[begin_value:end_value]
            - discharge_ah[begin_value - 1])

    df_data['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction.
    # It is not totally accurate, as it still has some points that go
    # negative, due to incorrect discharge_Ah values every few cycles.
    # But the machine learning algorithm should treat these as outliers and
    # hopefully get over them. We can come back and correct this.
    df_data['capacity_ah'] = (df_data['charge_cycle_ah']
                              - df_data['discharge_cycle_ah'])

    return df_data


def data_formatting(merged_df):
    """
    This function formats the merged dataframe so that it can be used to
    frame the given time series data as a supervised learning dataset.

    Args:
        merged_df (dataframe): The merged dataframe, which can be obtained
            by using the function 'cx2_file_reader'

    Returns:
        A dataframe with only the values required to frame a time series
        as a supervised learning dataset.
    """
    # Get the columns containing the text 'Current', 'Voltage' and
    # 'discharge_cycle_ah'
    merged_df = merged_df.filter(regex='Current|Voltage|discharge_cycle_ah')
    formatted_df = merged_df.astype('float32')
    return formatted_df


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.

    Returns:
        Pandas DataFrame of series framed for supervised learning.

    """
    n_vars = 1 if isinstance(data, list) else data.shape[1]
    df_data = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df_data.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df_data.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    sl_df = pd.concat(cols, axis=1)
    sl_df.columns = names
    # drop rows with NaN values
    if dropnan:
        sl_df.dropna(inplace=True)
    # Drop the current and voltage columns at time t, so that only the
    # discharge capacity at time t is kept as the response variable
    # (this assumes three input variables).
    sl_df.drop(sl_df.columns[[3, 4]], axis=1, inplace=True)
    sl_df.rename(
        columns={'var1(t-1)': 'Current(t-1)',
                 'var2(t-1)': 'Voltage(t-1)',
                 'var3(t-1)': 'discharge_capacity(t-1)',
                 'var3(t)': 'discharge_capacity(t)'},
        inplace=True)
    return sl_df
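
# A small worked example (hypothetical values): for a three-column input
# with rows of [current, voltage, discharge_capacity], e.g.
#     data = np.array([[1.0, 4.0, 0.1], [1.1, 3.9, 0.2], [1.2, 3.8, 0.3]])
#     series_to_supervised(data, n_in=1, n_out=1, dropnan=True)
# the result has the columns ['Current(t-1)', 'Voltage(t-1)',
# 'discharge_capacity(t-1)', 'discharge_capacity(t)'] and two rows, since
# the first row is dropped for containing NaNs.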


def long_short_term_memory(model_data):
    """
    This function splits the input dataset into training
    and testing datasets. The keras LSTM model is then
    trained and tested using the respective datasets.

    Args:
        model_data (dataframe): Values of the input and output variables
            of the time series data framed as a supervised learning dataset.

    Returns:
        model_loss (dictionary): Returns the history dictionary (more info
            to be added)
        y_hat (array): Predicted response for the testing dataset.
    """
    # Split the input dataset into training and testing data
    train, test = train_test_split(model_data, test_size=0.2,
                                   random_state=944)
    # Split into inputs and outputs
    train_x = train[train.columns[0:3]].values
    train_y = train[train.columns[3]].values
    test_x = test[test.columns[0:3]].values
    test_y = test[test.columns[3]].values
    # Reshape the input to be 3D [samples, timesteps, features]
    train_x = train_x.reshape((train_x.shape[0], 1, train_x.shape[1]))
    test_x = test_x.reshape((test_x.shape[0], 1, test_x.shape[1]))
    # print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

    # Design the network
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_x.shape[1], train_x.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    # Fit the network on the training data, validating on the test data
    history = model.fit(
        train_x,
        train_y,
        epochs=50,
        batch_size=72,
        validation_data=(test_x, test_y),
        verbose=0,
        shuffle=False)
    model_loss = history.history
    # Prediction for the test dataset.
    yhat = model.predict(test_x)
    # model.save('lstm_trained_model.h5')
    return model_loss, yhat


def file_reader(data_dir, file_name_format, sheet_name, ignore_file_indices):
    """
    This function reads PL sample, CX2 and CS2 files and returns a nice
    dataframe with cyclic values of charge and discharge capacity, with
    cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        sheet_name (string): Sheet name containing the data in the excel
            file.
        ignore_file_indices (list, int): This list of ints tells which
            files to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for
        capacity in Ah.
    """

    # For excel files (CX2 and CS2 datafiles), the function
    # 'cx2_file_reader' is used.
    if file_name_format[:3] == 'CX2' or file_name_format[:3] == 'CS2':
        df_output = cx2_file_reader(data_dir, file_name_format, sheet_name)
    else:
        df_output = pl_samples_file_reader(data_dir, file_name_format,
                                           ignore_file_indices)

    # The function 'data_formatting' is used to drop the unnecessary columns
    # from the training data, i.e. only the features considered in the model
    # (Current, Voltage and Discharge capacity) are retained.
    formatted_data = data_formatting(df_output)

    # The function 'series_to_supervised' is used to frame the time series
    # training data as a supervised learning dataset.
    # df_out = series_to_supervised(
    #     formatted_data, n_in=1, n_out=1, dropnan=True)
    return formatted_data
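
# Typical top-level usage (hypothetical path and arguments; sheet_name is
# used only for the excel-based CX2/CS2 readers, and ignore_file_indices
# only for the PL reader):
#     features = file_reader('/path/to/data', 'CX2_16', 'Sheet1', [3])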