| Metric | Value |
| --- | --- |
| Total Complexity | 55 |
| Total Lines | 725 |
| Duplicated Lines | 2.07 % |
| Changes | 0 |
Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places. Common duplication problems have corresponding refactoring solutions, such as extracting the shared code into a method or class.
Complex classes like battdeg.battdeg often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster. A sketch of Extract Class follows.
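
As an illustration of Extract Class (the names here are hypothetical and do not come from battdeg itself), fields sharing a prefix such as `lstm_` hint at a cohesive component that can be pulled out:

```python
# Hypothetical before/after sketch of the Extract Class refactoring.

# Before: one class mixes file handling with model configuration.
class BattDeg:
    def __init__(self):
        self.data_dir = ''
        self.lstm_epochs = 50      # the shared 'lstm_' prefix suggests
        self.lstm_batch_size = 72  # a cohesive component

# After: the 'lstm_' fields move into their own class.
class LstmConfig:
    def __init__(self, epochs=50, batch_size=72):
        self.epochs = epochs
        self.batch_size = batch_size

class BattDeg:
    def __init__(self):
        self.data_dir = ''
        self.lstm = LstmConfig()
```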
| 1 | """ |
||
| 2 | This module can be used to read cycling data of the CX2, CS2 and PL type cells as |
||
| 3 | a dataframe. It converts cumulative values into individual values for |
||
| 4 | each cycle and determines net charge of the battery at every datapoint. |
||
| 5 | It can also be used to train and test a LSTM model and predict discharge capacity |
||
| 6 | using the LSTM model. |
||
| 7 | """ |
||
| 8 | |||
| 9 | import datetime |
||
| 10 | import os |
||
| 11 | from os import listdir |
||
| 12 | from os.path import isfile, join |
||
| 13 | import re |
||
| 14 | # import matplotlib.pyplot as plt |
||
| 15 | # import seaborn as sns |
||
| 16 | import pandas as pd |
||
| 17 | import numpy as np |
||
| 18 | |||
| 19 | from sklearn.model_selection import train_test_split |
||
| 20 | from keras.models import Sequential |
||
| 21 | from keras.layers import Dense |
||
| 22 | from keras.layers import LSTM |
||
| 23 | from keras.models import load_model |
||
| 24 | |||
# @profile
def date_time_converter(date_time_list):
    """
    This function takes a list of date_time values in MATLAB datenum format
    and returns a list of date_time values in human-readable format.
    """

    if not isinstance(date_time_list, list):
        raise TypeError("date_time_list should be a list")

    # Empty list to hold the results
    date_time_human = []

    for i in date_time_list:
        date_time_human.append(
            datetime.datetime.fromordinal(int(i)) +
            datetime.timedelta(days=i % 1) -
            datetime.timedelta(days=366))

    return date_time_human

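# A worked example of the conversion above (illustrative value): MATLAB's
# datenum counts days from 00-Jan-0000 while Python ordinals count from
# 01-Jan-0001, hence the fixed 366-day offset.
#   date_time_converter([737426.5])
#   -> [datetime.datetime(2019, 1, 1, 12, 0)]
# (737426 is the MATLAB datenum for 01-Jan-2019; the 0.5 fraction is noon.)
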
# @profile


def get_dict_files(data_dir, file_name_format, ignore_file_indices):
    """
    This function finds all the files at the location matching the file name
    format as specified, and then creates a dictionary of dataframes after
    ignoring the files at the specified indices.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce
            other files.
        ignore_file_indices (list, int): This list of ints tells which file
            indices to ignore.

    Returns:
        A dictionary with the dataframes for all files that were kept.
    """

    # Get the list of files in the directory
    onlyfiles = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]

    # Extract the experiment name from the file_name_format
    exp_name = file_name_format[0:4]

    # Empty dictionary to hold the dataframes for the various files
    dict_files = {}

    # Iterate over all the files of a certain type and get the file number
    # from them
    for filename in onlyfiles:
        if exp_name in filename:
            # Extract the file number from the name
            file_number = re.search(
                exp_name + r'\((.+?)\).csv',
                filename).group(1)
            # Give a value of dataframe to each key
            dict_files[int(file_number)] = pd.read_csv(
                join(data_dir, filename))

    # Empty dictionary to hold the ordered dictionaries
    dict_ordered = {}
    # Sort the dictionary based on keys
    for key in sorted(dict_files.keys()):
        dict_ordered[key] = dict_files[key]

    # Keys with files to keep, remove the ignore indices from all keys
    wanted_keys = np.array(
        list(set(dict_ordered.keys()) - set(ignore_file_indices)))

    # Remove the ignored dataframes for characterization
    dict_ord_cycling_data = {k: dict_ordered[k] for k in wanted_keys}

    return dict_ord_cycling_data

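# A hypothetical example of the naming convention this implies (inferred
# from the regex above, not stated in the original docs): with
# file_name_format = 'PL03(1).csv', exp_name is 'PL03', so files such as
# 'PL03(1).csv', 'PL03(2).csv', ... are picked up and keyed by the number
# in parentheses; ignore_file_indices = [1] would then drop 'PL03(1).csv'.
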
def concat_dict_dataframes(dict_ord_cycling_data):
    """
    This function takes in a dictionary with ordered keys
    and concatenates the dataframes in the values of the
    dictionary to create a large dataframe with all the records.

    Args:
        dict_ord_cycling_data (dict):
            The dictionary with ordered integer keys and dataframes as values

    Returns:
        The dataframe after concatenation
    """

    # Raise an exception if the type of the inputs is not correct
    if not isinstance(dict_ord_cycling_data, dict):
        raise TypeError('dict_ord_cycling_data is not of type dict')

    # print(dict_ord_cycling_data.keys())
    for i in dict_ord_cycling_data.keys():
        # Raise an exception if the type of the keys is not integer
        # print(type(i))
        if not isinstance(i, (int, np.int64)):
            raise TypeError('a key in the dictionary is not an integer')

    for i in dict_ord_cycling_data.values():
        # Raise an exception if the type of the values is not a dataframe
        if not isinstance(i, pd.DataFrame):
            raise TypeError('a value in the dictionary is not a pandas ' +
                            'dataframe')
        # print(i.columns)
        # Raise an exception if the necessary columns are not found in the df
        if not {
                'Cycle',
                'Charge_Ah',
                'Discharge_Ah',
                'Time_sec',
                'Current_Amp',
                'Voltage_Volt'}.issubset(i.columns):
            raise Exception("the dataframe doesn't have the columns 'Cycle'" +
                            ", 'Charge_Ah', 'Discharge_Ah', " +
                            "'Time_sec', 'Voltage_Volt', 'Current_Amp' ")

    # Concatenate the dataframes to create the total dataframe
    df_out = None
    for k in dict_ord_cycling_data.keys():
        if df_out is None:
            df_next = dict_ord_cycling_data[k]
            df_out = pd.DataFrame(data=None, columns=df_next.columns)
            df_out = pd.concat([df_out, df_next])
        else:
            df_next = dict_ord_cycling_data[k]
            df_next['Cycle'] = np.array(
                df_next['Cycle']) + max(np.array(df_out['Cycle']))
            df_next['Time_sec'] = np.array(
                df_next['Time_sec']) + max(np.array(df_out['Time_sec']))
            df_next['Charge_Ah'] = np.array(
                df_next['Charge_Ah']) + max(np.array(df_out['Charge_Ah']))
            df_next['Discharge_Ah'] = np.array(
                df_next['Discharge_Ah']) + max(np.array(df_out['Discharge_Ah']))
            df_out = pd.concat([df_out, df_next])

    return df_out

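# A small worked example of the offsetting above (illustrative values):
# if the first file ends with a Cycle maximum of 50 and a Charge_Ah maximum
# of 2.0, then a second file whose own cycles run 1..50 is shifted to
# 51..100 and its cumulative Charge_Ah is shifted up by 2.0, so the
# concatenated columns stay monotonically increasing across file boundaries.
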
def get_cycle_capacities(df_out):
    """
    This function takes the dataframe, creates a new index and then calculates
    capacities per cycle from the cumulative charge and discharge capacities.

    Args:
        df_out (pandas.DataFrame):
            Concatenated dataframe

    Returns:
        The dataframe with capacities per cycle
    """

    # Raise an exception if the necessary columns are not found in the df
    if not {'Cycle', 'Charge_Ah', 'Discharge_Ah', 'Time_sec', 'Current_Amp',
            'Voltage_Volt'}.issubset(df_out.columns):
        raise Exception("the dataframe doesn't have the columns 'Cycle'" +
                        ", 'Charge_Ah', 'Discharge_Ah', " +
                        "'Time_sec', 'Voltage_Volt', 'Current_Amp' ")

    # Reset the index and drop the old index
    df_out_indexed = df_out.reset_index(drop=True)

    # Proceed further with correcting the capacity
    df_grouped = df_out_indexed.groupby(['Cycle']).count()

    # Get the indices where a cycle starts
    cycle_start_indices = df_grouped['Time_sec'].cumsum()

    # Get the charge_Ah per cycle.
    # Create a numpy array to store the old charge_Ah column, and then
    # perform the transformation on it rather than on the pandas series;
    # this is a lot faster in this case.
    charge_cycle_ah = np.array(df_out_indexed['Charge_Ah'])
    charge_ah = np.array(df_out_indexed['Charge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        charge_cycle_ah[begin_value:end_value] = \
            charge_ah[begin_value:end_value] - charge_ah[begin_value - 1]

    df_out_indexed['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_out_indexed['Discharge_Ah'])
    discharge_ah = np.array(df_out_indexed['Discharge_Ah'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        discharge_cycle_ah[begin_value:end_value] = \
            discharge_ah[begin_value:end_value] - discharge_ah[begin_value - 1]

    df_out_indexed['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction. It is not totally
    # accurate, as it still has some points that go negative due to
    # incorrect discharge_Ah values every few cycles, but the machine
    # learning algorithm should treat these as outliers. We can come back
    # and correct this later.
    df_out_indexed['capacity_ah'] = charge_cycle_ah - discharge_cycle_ah
    df_out_indexed.rename(
        columns={'Current_Amp': 'Current(A)', 'Voltage_Volt': 'Voltage(V)'},
        inplace=True)
    return df_out_indexed

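# A worked example of the per-cycle correction above (illustrative numbers):
# with Cycle = [1, 1, 1, 2, 2, 2] and cumulative Charge_Ah =
# [0.2, 0.5, 1.0, 1.1, 1.6, 2.0], cycle_start_indices is [3, 6], and the
# loop subtracts the last value of cycle 1 (1.0) from the cycle-2 slice,
# giving charge_cycle_ah = [0.2, 0.5, 1.0, 0.1, 0.6, 1.0].
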
# @profile

def pl_samples_file_reader(data_dir, file_name_format, ignore_file_indices):
    """
    This function reads in the data for the PL Samples experiment and returns
    a nice dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce other files.
        ignore_file_indices (list, int): This list of ints tells which file indices to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for capacity in Ah.
    """

    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(ignore_file_indices, list):
        raise TypeError("ignore_file_indices should be a list")

    for ignore_file_index in ignore_file_indices:
        if not isinstance(ignore_file_index, int):
            raise TypeError("""ignore_file_indices elements should be
            of type integer""")

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    dict_ord_cycling_data = get_dict_files(
        data_dir, file_name_format, ignore_file_indices)

    df_out = concat_dict_dataframes(dict_ord_cycling_data)

    ####
    # This has been commented out for performance, as we do not need date_time
    ####
    # Convert the Date_Time from MATLAB datenum to human-readable Date_Time
    # First convert the series into a list
    # date_time_matlab = df_out['Date_Time'].tolist()

    # # Apply the conversion to the list
    # df_out['Date_Time_new'] = date_time_converter(date_time_matlab)

    # Get the cycle capacities from the cumulative capacities
    df_out_indexed = get_cycle_capacities(df_out)

    return df_out_indexed

# Wrapping function to train the LSTM model and calculate model_loss
# and the response for the testing data set.

def model_training(data_dir, file_name_format, sheet_name):
    """
    This function converts cumulative battery cycling data into individual cycle data
    and trains the LSTM model with the converted data set.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce other files.
        sheet_name (string or int): Sheet name or sheet number in the excel file
            containing the relevant data.

    Returns:
        model_loss (dictionary): Returns the history dictionary (more info to be added)
        y_hat (array): Predicted response for the testing dataset.
        # y_prediction (array): Predicted response for the completely new dataset
        # (The input has to be the time series cycling data including values of
        # Current, Voltage and Discharge Capacity)
    """
    # The function 'cx2_file_reader' is used to read all the excel files
    # in the given path and convert the given cumulative data into individual
    # cycle data.
    individual_cycle_data = cx2_file_reader(data_dir, file_name_format, sheet_name)

    # The function 'data_formatting' is used to drop the unnecessary columns
    # from the training data, i.e. only the features considered in the model
    # (Current, Voltage and Discharge capacity) are retained.
    formatted_data = data_formatting(individual_cycle_data)

    # The function 'series_to_supervised' is used to frame the time series
    # training data as a supervised learning dataset.
    learning_df = series_to_supervised(
        formatted_data, n_in=1, n_out=1, dropnan=True)

    # The function 'long_short_term_memory' is used to train the model
    # and predict the response for the new input dataset.
    model_loss, y_hat = long_short_term_memory(learning_df)

    return model_loss, y_hat

# Function to predict the discharge capacity using the trained LSTM model.
def model_prediction(input_data):
    """
    This function can be used to forecast the discharge capacity of a battery
    using the trained LSTM model.

    Args:
        input_data (dataframe): This is the dataframe containing the current,
            voltage and discharge capacity values at a prior time, which can
            be used to forecast the discharge capacity at a later time.

    Returns:
        y_predicted: The forecasted values of discharge capacity.
    """

    # The function 'series_to_supervised' is used to frame the time series
    # data as a supervised learning dataset.
    learning_df = series_to_supervised(
        input_data, n_in=1, n_out=1, dropnan=True)
    learning_df = learning_df.iloc[:, 0:3].values
    # Reshape the input dataset.
    learning_df = learning_df.reshape(
        (learning_df.shape[0], 1, learning_df.shape[1]))
    # Predict the discharge values using the saved LSTM model.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = join(module_dir, 'models')
    model = load_model(join(model_path, 'lstm_trained_model.h5'))
    y_predicted = model.predict(learning_df)
    return y_predicted

# Wrapping function only to merge and convert cumulative data to
# individual cycle data.
def cx2_file_reader(data_dir, file_name_format, sheet_name):
    """
    This function reads in the data for the CX2 samples experiment and returns
    a well formatted dataframe with cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce other files.
        sheet_name (string): Sheet name containing the data in the excel file.

    Returns:
        The complete test data in a dataframe with an extra column for capacity in Ah.
    """
    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(sheet_name, (str, int)):
        raise TypeError('sheet_name is not of type string or integer')

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # Get the list of files in the directory
    path = join(data_dir, file_name_format)
    files = listdir(path)

    # Extract the experiment name from the file_name_format
    # exp_name = file_name_format[0:6]

    # Filter out and read the excel files in the data directory
    file_names = list(filter(lambda x: x[-5:] == '.xlsx', files))

    # Sort the file names using the 'file_name_sorting' function.
    sorted_name_list = file_name_sorting(file_names)

    # Read the dataframes according to the date of experimentation
    # using the 'reading_dataframes' function.
    sorted_df = reading_dataframes(sorted_name_list, sheet_name, path)

    # Merge all the dataframes and adjust the cycle index
    # using the 'concat_df' function.
    cycle_data = concat_df(sorted_df)

    # Calculate the net capacity of the battery at every datapoint
    # using the 'capacity' function.
    capacity_data = capacity(cycle_data)

    # Return the dataframe with new cycle indices and capacity data.
    return capacity_data

def file_name_sorting(file_name_list):
    """
    This function sorts all the file names according to the date
    in the file name.

    Args:
        file_name_list (list): List containing all the file names to be read

    Returns:
        A list of file names sorted according to the date in the file name.
    """
    filename = pd.DataFrame(data=file_name_list, columns=['file_name'])
    # Split the file name into different columns
    filename['cell_type'], filename['cell_num'], filename['month'], filename[
        'day'], filename['year'] = filename['file_name'].str.split('_', 4).str
    filename['year'], filename['ext'] = filename['year'].str.split('.', 1).str
    filename['date'] = ''
    # Merge the year, month and day columns to create a string for the
    # DateTime object.
    filename['date'] = filename['year'].map(
        str) + filename['month'].map(str) + filename['day'].map(str)
    # Create a DateTime object.
    filename['date_time'] = ''
    filename['date_time'] = pd.to_datetime(filename['date'], format="%y%m%d")
    # Sort the file names according to the created DateTime object.
    filename.sort_values(['date_time'], inplace=True)
    # Create a list of the sorted file names
    sorted_file_names = filename['file_name'].values
    return sorted_file_names

def reading_dataframes(file_names, sheet_name, path):
    """
    This function reads all the files in the sorted
    file names list as dataframes.

    Args:
        file_names (list): Sorted file names list
        sheet_name (string or int): Sheet name in the excel file containing the data.
        path (string): Path to the directory containing the files.

    Returns:
        Dictionary of dataframes in the order of the sorted file names.
    """
    # Empty dictionary to store all the dataframes according
    # to the order in the sorted file names list
    df_raw = {}
    # Read the dataframes
    for i, filename in enumerate(file_names):
        df_raw[i] = pd.read_excel(
            join(path, filename),
            sheet_name=sheet_name)
    return df_raw

def concat_df(df_dict):
    """
    This function concatenates all the dataframes and edits
    the cycle index for the concatenated dataframes.

    Args:
        df_dict (dictionary): Dictionary of dataframes to be concatenated.

    Returns:
        A concatenated dataframe with an edited cycle index
    """
    df_concat = None
    for data in df_dict:
        if df_concat is None:
            df_next = df_dict[data]
            df_concat = pd.DataFrame(data=None, columns=df_next.columns)
            # df_next['Cycle'] = df_next['Cycle'] + max(df_pl12['Cycle'])
            df_concat = pd.concat([df_concat, df_next])
        else:
            df_next = df_dict[data]
            df_next['Cycle_Index'] = np.array(
                df_next['Cycle_Index']) + max(np.array(df_concat['Cycle_Index']))
            df_next['Test_Time(s)'] = np.array(
                df_next['Test_Time(s)']) + max(np.array(df_concat['Test_Time(s)']))
            df_next['Charge_Capacity(Ah)'] = np.array(
                df_next['Charge_Capacity(Ah)']) + max(
                np.array(df_concat['Charge_Capacity(Ah)']))
            df_next['Discharge_Capacity(Ah)'] = np.array(
                df_next['Discharge_Capacity(Ah)']) + max(
                np.array(df_concat['Discharge_Capacity(Ah)']))
            df_concat = pd.concat([df_concat, df_next])
    # Reset the index and drop the old index
    df_reset = df_concat.reset_index(drop=True)
    return df_reset

def capacity(df_data):
    """
    This function calculates the net capacity of the battery
    from the charge capacity and discharge capacity values.

    Args:
        df_data (dataframe): Concatenated dataframe which has the values of
            charge capacity and discharge capacity for which the net capacity
            has to be calculated.

    Returns:
        Dataframe with the net capacity of the battery at every point of the
        charge and discharge cycle.
    """
    # Group rows by the cycle index.
    group = df_data.groupby(['Cycle_Index']).count()

    # Get the indices where a cycle starts
    cycle_start_indices = group['Data_Point'].cumsum()

    # Get the charge_Ah per cycle.
    # Create a numpy array to store the old charge_Ah column, and then
    # perform the transformation on it rather than on the pandas series;
    # this is a lot faster in this case.
    charge_cycle_ah = np.array(df_data['Charge_Capacity(Ah)'])
    charge_ah = np.array(df_data['Charge_Capacity(Ah)'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        charge_cycle_ah[begin_value:end_value] = \
            charge_ah[begin_value:end_value] - charge_ah[begin_value - 1]

    df_data['charge_cycle_ah'] = charge_cycle_ah

    # Get the discharge_Ah per cycle
    discharge_cycle_ah = np.array(df_data['Discharge_Capacity(Ah)'])
    discharge_ah = np.array(df_data['Discharge_Capacity(Ah)'])

    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        discharge_cycle_ah[begin_value:end_value] = \
            discharge_ah[begin_value:end_value] - discharge_ah[begin_value - 1]

    df_data['discharge_cycle_ah'] = discharge_cycle_ah

    # This is the data column we can use for prediction. It is not totally
    # accurate, as it still has some points that go negative due to
    # incorrect discharge_Ah values every few cycles, but the machine
    # learning algorithm should treat these as outliers. We can come back
    # and correct this later.
    df_data['capacity_ah'] = df_data['charge_cycle_ah'] - df_data['discharge_cycle_ah']

    return df_data

def data_formatting(merged_df):
    """
    This function formats the merged dataframe so that it can be used to
    frame the given time series data as a supervised learning dataset.

    Args:
        merged_df (dataframe): The merged dataframe, which can be obtained
            by using the function 'cx2_file_reader'

    Returns:
        A dataframe with only the values required to frame a time series as
        a supervised learning dataset.
    """
    # Get the columns containing the text 'Current', 'Voltage' and
    # 'discharge_cycle_ah'
    merged_df = merged_df.filter(regex='Current|Voltage|discharge_cycle_ah')
    formatted_df = merged_df.astype('float32')
    return formatted_df

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.

    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if isinstance(data, list) else data.shape[1]
    df_data = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df_data.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df_data.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]
    # put it all together
    sl_df = pd.concat(cols, axis=1)
    sl_df.columns = names
    # drop rows with NaN values
    if dropnan:
        sl_df.dropna(inplace=True)
    # Drop the current and voltage columns at time t; only the discharge
    # capacity at time t is kept as the output variable.
    sl_df.drop(sl_df.columns[[3, 4]], axis=1, inplace=True)
    sl_df.rename(columns={'var1(t-1)': 'Current(t-1)',
                          'var2(t-1)': 'Voltage(t-1)',
                          'var3(t-1)': 'discharge_capacity(t-1)',
                          'var3(t)': 'discharge_capacity(t)'},
                 inplace=True)
    return sl_df

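# A worked example of the framing above (illustrative): with three input
# columns (current, voltage, discharge capacity) and n_in=n_out=1, the
# intermediate frame has columns var1(t-1)..var3(t-1), var1(t)..var3(t);
# after dropping var1(t) and var2(t) and renaming, each row maps
# [Current(t-1), Voltage(t-1), discharge_capacity(t-1)] to
# discharge_capacity(t).
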
def long_short_term_memory(model_data):
    """
    This function splits the input dataset into training
    and testing datasets. The keras LSTM model is then
    trained and tested using the respective datasets.

    Args:
        model_data (dataframe): Values of the input and output variables of
            the time series data framed as a supervised learning dataset.

    Returns:
        model_loss (dictionary): Returns the history dictionary (more info to be added)
        y_hat (array): Predicted response for the testing dataset.
    """
    # Split the input dataset into training and testing data
    train, test = train_test_split(model_data, test_size=0.2, random_state=944)
    # Split into inputs and outputs
    train_x, train_y = train[train.columns[0:3]].values, train[train.columns[3]].values
    test_x, test_y = test[test.columns[0:3]].values, test[test.columns[3]].values
    # Reshape the input to be 3D [samples, timesteps, features]
    train_x = train_x.reshape((train_x.shape[0], 1, train_x.shape[1]))
    test_x = test_x.reshape((test_x.shape[0], 1, test_x.shape[1]))
    # print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

    # Design the network
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_x.shape[1], train_x.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    # Fit the network with the training and testing data
    history = model.fit(
        train_x,
        train_y,
        epochs=50,
        batch_size=72,
        validation_data=(test_x, test_y),
        verbose=0,
        shuffle=False)
    model_loss = history.history
    # Prediction for the test dataset.
    yhat = model.predict(test_x)
    # model.save('lstm_trained_model.h5')
    return model_loss, yhat

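# Shape note for the supervised frame built above: train_x starts as
# (n_samples, 3), holding Current(t-1), Voltage(t-1) and
# discharge_capacity(t-1), and is reshaped to (n_samples, 1, 3), i.e. one
# timestep with three features per sample, which is the 3D input the keras
# LSTM layer expects.
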
def file_reader(data_dir, file_name_format, sheet_name, ignore_file_indices):
    """
    This function reads PL sample, CX2 and CS2 files and returns a nice
    dataframe with cyclic values of charge and discharge capacity with
    cycles in ascending order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce other files.
        sheet_name (string): Sheet name containing the data in the excel file.
        ignore_file_indices (list, int): This list of ints tells which file indices to ignore.

    Returns:
        The complete test data in a dataframe with an extra column for capacity in Ah.
    """

    # For excel files (CX2 and CS2 datafiles), the function 'cx2_file_reader'
    # is used.
    if file_name_format[:3] == 'CX2' or file_name_format[:3] == 'CS2':
        df_output = cx2_file_reader(data_dir, file_name_format, sheet_name)
    else:
        df_output = pl_samples_file_reader(data_dir, file_name_format, ignore_file_indices)

    # The function 'data_formatting' is used to drop the unnecessary columns
    # from the training data, i.e. only the features considered in the model
    # (Current, Voltage and Discharge capacity) are retained.
    formatted_data = data_formatting(df_output)

    # The function 'series_to_supervised' is used to frame the time series
    # training data as a supervised learning dataset.
    # df_out = series_to_supervised(
    #     formatted_data, n_in=1, n_out=1, dropnan=True)
    return formatted_data
```
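
A minimal usage sketch of the module's entry points. The directory, file name format and sheet name below are hypothetical placeholders, not values from the original project; actual values depend on where the cycling data files live and how they are named.

```python
# Hypothetical usage sketch of battdeg.battdeg; paths and names are
# placeholders and must be adapted to the local data layout.
from battdeg.battdeg import file_reader, model_training, model_prediction

# Merge the CX2 excel files and keep the model features
# (Current, Voltage, discharge_cycle_ah).
data = file_reader(
    data_dir='/abs/path/to/data',   # hypothetical directory
    file_name_format='CX2_16',      # hypothetical file name format
    sheet_name='Sheet1',            # hypothetical sheet name
    ignore_file_indices=[])

# Train the LSTM and inspect the final validation loss.
model_loss, y_hat = model_training('/abs/path/to/data', 'CX2_16', 'Sheet1')
print(model_loss['val_loss'][-1])

# Forecast discharge capacity with the saved model.
y_pred = model_prediction(data)
```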