GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.

Issues (22)

battdeg/battdeg.py (1 issue)

1
"""
2
This module can be used to read cycling data of the CX2, CS2 and PL type cells as
3
a dataframe. It converts cumulative values into individual values for
4
each cycle and determines net charge of the battery at every datapoint.
5
It can also be used to train and test a LSTM model and predict discharge capacity
6
using the LSTM model.
7
"""
8
9
import datetime
10
import os
11
from os import listdir
12
from os.path import isfile, join
13
import re
14
# import matplotlib.pyplot as plt
15
# import seaborn as sns
16
import pandas as pd
17
import numpy as np
18
19
from sklearn.model_selection import train_test_split
20
from keras.models import Sequential
21
from keras.layers import Dense
22
from keras.layers import LSTM
23
from keras.models import load_model
24
25
# @profile
26
def date_time_converter(date_time_list):
    """
    Convert a list of MATLAB datenum values into a list of Python
    ``datetime`` objects in human readable form.

    Args:
        date_time_list (list): MATLAB serial date numbers (floats).

    Returns:
        list: ``datetime.datetime`` objects, one per input value.

    Raises:
        TypeError: If ``date_time_list`` is not a list.
    """
    if not isinstance(date_time_list, list):
        raise TypeError("date_time_list should be a list")

    # MATLAB datenums count days from year 0 while Python ordinals count
    # from year 1; subtracting 366 days bridges the two epochs.  The
    # fractional part of the datenum carries the time of day.
    return [
        datetime.datetime.fromordinal(int(stamp))
        + datetime.timedelta(days=stamp % 1)
        - datetime.timedelta(days=366)
        for stamp in date_time_list
    ]
49
50
# @profile
51
52
53
def get_dict_files(data_dir, file_name_format, ignore_file_indices):
    """
    Find all files at the given location matching the experiment prefix of
    ``file_name_format`` and build a dictionary of dataframes keyed by the
    file number, skipping the ignored indices.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename; its first four
        characters are used as the experiment prefix.
        ignore_file_indices (list, int): This list of ints tells
        which to ignore.

    Returns:
        dict: File number (int) -> pandas.DataFrame, inserted in ascending
        key order.
    """
    # Keep only regular files in the directory.
    onlyfiles = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]

    # Extract the experiment name from the file_name_format.
    exp_name = file_name_format[0:4]

    # One dataframe per matching file, keyed by the number in the name.
    dict_files = {}
    for filename in onlyfiles:
        if exp_name in filename:
            # Pull the file number out of names like 'PL11(3).csv'.
            file_number = re.search(
                exp_name + r'\((.+?)\).csv',
                filename).group(1)
            dict_files[int(file_number)] = pd.read_csv(
                join(data_dir, filename))

    # BUG FIX: the original built the result dict from an *unsorted* set
    # difference, so its insertion order (relied upon by the downstream
    # concatenation) was nondeterministic.  Sorting the wanted keys makes
    # the file order deterministic and ascending.
    wanted_keys = sorted(set(dict_files.keys()) - set(ignore_file_indices))

    # Remove the ignored dataframes for characterization.
    dict_ord_cycling_data = {k: dict_files[k] for k in wanted_keys}

    return dict_ord_cycling_data
105
106
107
def concat_dict_dataframes(dict_ord_cycling_data):
    """
    Concatenate the dataframes stored as values of an ordered dictionary
    into one large dataframe, offsetting the cumulative columns of every
    subsequent dataframe so its records continue where the previous file
    left off.

    Args:
        dict_ord_cycling_data (dict):
            The dictionary with ordered integer keys and dataframes as values

    Returns:
        The dataframe after concatenation

    Raises:
        TypeError: If the argument, a key, or a value has the wrong type.
        Exception: If a dataframe lacks any required column.
    """
    if not isinstance(dict_ord_cycling_data, dict):
        raise TypeError('dict_ord_cycling_data is not of type dict')

    # Keys must be plain or numpy integers.
    for key in dict_ord_cycling_data.keys():
        if not isinstance(key, (int, np.int64)):
            raise TypeError('a key in the dictionary is not an integer')

    required = {'Cycle', 'Charge_Ah', 'Discharge_Ah', 'Time_sec',
                'Current_Amp', 'Voltage_Volt'}
    for value in dict_ord_cycling_data.values():
        if not isinstance(value, pd.DataFrame):
            raise TypeError('a value in the dictionary is not a pandas ' +
                            'dataframe')
        if not required.issubset(value.columns):
            raise Exception("the dataframe doesnt have the columns 'Cycle'" +
                            ", 'Charge_Ah', 'Discharge_Ah', " +
                            "'Time_sec', 'Voltage_Volt', 'Current_Amp' ")

    # Concatenate in key order.  Every file after the first has its
    # cumulative columns shifted by the running maximum so the totals keep
    # increasing across file boundaries.  The shift mutates the stored
    # dataframes in place, matching the historical behaviour.
    df_out = None
    for key in dict_ord_cycling_data.keys():
        df_next = dict_ord_cycling_data[key]
        if df_out is None:
            df_out = pd.DataFrame(data=None, columns=df_next.columns)
        else:
            for column in ('Cycle', 'Time_sec', 'Charge_Ah', 'Discharge_Ah'):
                df_next[column] = np.array(
                    df_next[column]) + max(np.array(df_out[column]))
        df_out = pd.concat([df_out, df_next])

    return df_out
171
172
173
def _per_cycle_values(cumulative, cycle_start_indices):
    """
    Convert a cumulative quantity into per-cycle values.

    Args:
        cumulative (numpy.ndarray): Cumulative values over all cycles.
        cycle_start_indices (pandas.Series): Cumulative row counts marking
            where each cycle ends (and the next one begins).

    Returns:
        numpy.ndarray: Values re-zeroed at the start of every cycle after
        the first.
    """
    per_cycle = cumulative.copy()
    for i in range(1, len(cycle_start_indices)):
        begin_value = cycle_start_indices.iloc[i - 1]
        end_value = cycle_start_indices.iloc[i]
        # Subtract the last cumulative value of the preceding cycle.
        per_cycle[begin_value:end_value] = (
            cumulative[begin_value:end_value] - cumulative[begin_value - 1])
    return per_cycle


def get_cycle_capacities(df_out):
    """
    Re-index the concatenated dataframe and convert the cumulative charge
    and discharge capacities into capacities per cycle.

    Args:
        df_out (pandas.DataFrame):
            Concatenated dataframe

    Returns:
        the dataframe with capacities per cycle and the current/voltage
        columns renamed to 'Current(A)' / 'Voltage(V)'

    Raises:
        Exception: If any required column is missing.
    """
    # Raise an exception if the necessary columns are not found in the df.
    if not {'Cycle', 'Charge_Ah', 'Discharge_Ah', 'Time_sec', 'Current_Amp',
            'Voltage_Volt'}.issubset(df_out.columns):
        raise Exception("the dataframe doesnt have the columns 'Cycle'" +
                        ", 'Charge_Ah', 'Discharge_Ah', " +
                        "'Time_sec', 'Voltage_Volt', 'Current_Amp' ")

    # Reset the index and drop the old index.
    df_out_indexed = df_out.reset_index(drop=True)

    # Row count per cycle; the cumulative sum gives the row index at which
    # each new cycle starts.
    df_grouped = df_out_indexed.groupby(['Cycle']).count()
    cycle_start_indices = df_grouped['Time_sec'].cumsum()

    # Work on numpy arrays rather than pandas series -- much faster here.
    # The duplicated charge/discharge loops of the original are folded
    # into the single _per_cycle_values helper.
    df_out_indexed['charge_cycle_ah'] = _per_cycle_values(
        np.array(df_out_indexed['Charge_Ah']), cycle_start_indices)
    df_out_indexed['discharge_cycle_ah'] = _per_cycle_values(
        np.array(df_out_indexed['Discharge_Ah']), cycle_start_indices)

    # Net capacity -- the prediction column.  Not totally accurate: a few
    # points can go negative due to incorrect Discharge_Ah values every
    # few cycles; they are treated as outliers downstream.
    df_out_indexed['capacity_ah'] = (
        df_out_indexed['charge_cycle_ah'] -
        df_out_indexed['discharge_cycle_ah'])
    df_out_indexed.rename(
        columns={'Current_Amp': 'Current(A)', 'Voltage_Volt': 'Voltage(V)'},
        inplace=True)
    return df_out_indexed
239
240
# @profile
241
242
243
def pl_samples_file_reader(data_dir, file_name_format, ignore_file_indices):
    """
    Read the data for the PL Samples experiment and return a dataframe
    with cycles in ascending order and per-cycle capacities.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce other files.
        ignore_file_indices (list, int): This list of ints tells which to ignore.

    Returns:
        The complete test data in a dataframe with extra column for capacity in Ah.

    Raises:
        TypeError: On wrongly typed arguments.
        FileNotFoundError: If the named file is absent from ``data_dir``.
    """
    # Validate argument types before touching the filesystem.
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')
    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')
    if not isinstance(ignore_file_indices, list):
        raise TypeError("ignore_file_indices should be a list")
    for ignore_file_indice in ignore_file_indices:
        if not isinstance(ignore_file_indice, int):
            raise TypeError("""ignore_file_indices elements should be
            of type integer""")

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # Load every matching file (minus the ignored ones) as dataframes.
    dict_ord_cycling_data = get_dict_files(
        data_dir, file_name_format, ignore_file_indices)

    # Stitch the individual files into one continuous dataframe.
    df_out = concat_dict_dataframes(dict_ord_cycling_data)

    # NOTE: converting 'Date_Time' from MATLAB datenum via
    # date_time_converter() was deliberately dropped here for performance;
    # downstream code does not need it.

    # Convert cumulative capacities into capacities per cycle.
    return get_cycle_capacities(df_out)
295
296
# Wrapping function to train the LSTM model and calculate model_loss,
297
# and response to the testing data set.
298
299
300
def model_training(data_dir, file_name_format, sheet_name):
    """
    Convert cumulative battery cycling data into individual cycle data
    and train the LSTM model with the converted data set.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Format of the filename, used to deduce other files.
        sheet_name(string or int): Sheet name or sheet number in the excel file
        containing the relevant data.

    Returns:
        model_loss(dictionary): The keras training history dictionary.
        y_hat(array): Predicted response for the testing dataset.
    """
    # Read all the excel files at the given path and convert the
    # cumulative data into individual cycle data.
    individual_cycle_data = cx2_file_reader(
        data_dir, file_name_format, sheet_name)

    # Keep only the model features (current, voltage, discharge capacity),
    # dropping the unnecessary columns.
    formatted_data = data_formatting(individual_cycle_data)

    # Frame the time series training data as a supervised learning
    # dataset.
    learning_df = series_to_supervised(
        formatted_data, n_in=1, n_out=1, dropnan=True)

    # Train the model and predict the response for the test split.
    model_loss, y_hat = long_short_term_memory(learning_df)

    return model_loss, y_hat
338
339
340
# Function to predict the discharge capacity using the trained LSTM model.
341
def model_prediction(input_data):
    """
    Forecast the discharge capacity of a battery using the saved,
    pre-trained LSTM model.

    Args:
    input_data(dataframe): This is the dataframe containing the current,
    voltage and discharge capacity values at a prior time which can be
    used to forecast discharge capacity at a further time.

    Returns:
    y_predicted: The forecasted values of discharge capacity.
    """
    # Frame the time series input as a supervised learning dataset and
    # keep only the three feature columns.
    learning_df = series_to_supervised(
        input_data, n_in=1, n_out=1, dropnan=True)
    learning_df = learning_df.iloc[:, 0:3].values

    # The LSTM expects 3D input: [samples, timesteps, features].
    learning_df = learning_df.reshape(
        (learning_df.shape[0], 1, learning_df.shape[1]))

    # Load the trained model shipped alongside this module and predict.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = join(module_dir, 'models')
    model = load_model(join(model_path, 'lstm_trained_model.h5'))
    y_predicted = model.predict(learning_df)
    return y_predicted
369
370
371
# Wrapping function only to merge and convert cumulative data to
372
# individual cycle data.
373
def cx2_file_reader(data_dir, file_name_format, sheet_name):
    """
    Read the data for CX2 samples experiments and return a well formatted
    dataframe with cycles in ascending order.

    Args:
    data_dir (string): This is the absolute path to the data directory.
    file_name_format (string): Format of the filename, used to deduce other files.
    sheet_name (string): Sheet name containing the data in the excel file.

    Returns:
    The complete test data in a dataframe with extra column for capacity in Ah.

    Raises:
        TypeError: On wrongly typed arguments.
        FileNotFoundError: If the named path is absent from ``data_dir``.
    """
    # Validate argument types before touching the filesystem.
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')
    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')
    if not isinstance(sheet_name, (str, int)):
        raise TypeError('Sheet_Name format is not of type string or integer')
    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # Collect the excel workbooks inside the experiment directory.
    path = join(data_dir, file_name_format)
    files = listdir(path)
    file_names = [name for name in files if name[-5:] == '.xlsx']

    # Order the workbooks chronologically by the date in the file name.
    sorted_name_list = file_name_sorting(file_names)

    # Read each workbook into a dataframe, preserving the sorted order.
    sorted_df = reading_dataframes(sorted_name_list, sheet_name, path)

    # Merge the dataframes, adjusting the cycle indices across files.
    cycle_data = concat_df(sorted_df)

    # Add the net capacity of the battery at every data point.
    return capacity(cycle_data)
428
429
430
def file_name_sorting(file_name_list):
    """
    Sort file names chronologically using the date embedded in the name.

    File names are expected to look like
    ``<cell_type>_<cell_num>_<month>_<day>_<year>.<ext>``.

    Args:
    file_name_list(list): List containing all the file names to be read

    Returns:
    A numpy array of file names sorted according to the date on the
    file name.
    """
    filename = pd.DataFrame(data=file_name_list, columns=['file_name'])
    # Split the name into its underscore-separated parts.  expand=True
    # returns a dataframe of columns; the original tuple-unpacking through
    # the `.str` accessor (and the positional `n` argument) was removed in
    # modern pandas, so this rewrite keeps the function working there.
    parts = filename['file_name'].str.split('_', n=4, expand=True)
    filename['cell_type'] = parts[0]
    filename['cell_num'] = parts[1]
    filename['month'] = parts[2]
    filename['day'] = parts[3]
    year_ext = parts[4].str.split('.', n=1, expand=True)
    filename['year'] = year_ext[0]
    filename['ext'] = year_ext[1]
    # Merge year, month and day into a %y%m%d string and parse it into a
    # real DateTime object for sorting.
    filename['date'] = (filename['year'].map(str) +
                        filename['month'].map(str) +
                        filename['day'].map(str))
    filename['date_time'] = pd.to_datetime(filename['date'], format="%y%m%d")
    # Sort the file names according to the created DateTime object.
    filename.sort_values(['date_time'], inplace=True)
    sorted_file_names = filename['file_name'].values
    return sorted_file_names
461
462
463
def reading_dataframes(file_names, sheet_name, path):
    """
    Read every file in the sorted file-name list into a dataframe.

    Args:
    file_names: Sorted file names list
    sheet_name: Sheet name in the excel file containing the data.
    path: Directory containing the files.

    Returns:
    Dictionary of dataframes in the order of the sorted file names.
    """
    # Keyed by position in the sorted list so that iterating the dict
    # preserves the chronological order of the files.
    return {
        index: pd.read_excel(join(path, name), sheet_name=sheet_name)
        for index, name in enumerate(file_names)
    }
486
487
488
def concat_df(df_dict):
    """
    Concatenate all the dataframes in ``df_dict`` (in key-iteration
    order) and edit the cycle index so it keeps increasing across the
    concatenated pieces.

    Args:
    df_dict(dictionary): Dictionary of dataframes to be concatenated.

    Returns:
    A concatenated dataframe with editted cycle index and a fresh
    0-based row index.
    """
    cumulative_columns = ('Cycle_Index', 'Test_Time(s)',
                          'Charge_Capacity(Ah)', 'Discharge_Capacity(Ah)')
    df_concat = None
    for key in df_dict:
        df_next = df_dict[key]
        if df_concat is None:
            df_concat = pd.DataFrame(data=None, columns=df_next.columns)
        else:
            # Shift each later dataframe's cumulative columns past the
            # running maximum so the totals continue across files.
            for column in cumulative_columns:
                df_next[column] = np.array(
                    df_next[column]) + max(np.array(df_concat[column]))
        df_concat = pd.concat([df_concat, df_next])
    # Reset the index and drop the old index.
    return df_concat.reset_index(drop=True)
522
523
524
def capacity(df_data):
    """
    Calculate the net capacity of the battery at every data point from
    the cumulative charge capacity and discharge capacity values.

    Args:
    df_data(dataframe): Concatenated dataframe which has the values of charge
    capacity and discharge capacity for which net capacity has to be
    calculated.

    Returns:
    Dataframe with net capacity of the battery for every point of the charge
    and discharge cycle ('charge_cycle_ah', 'discharge_cycle_ah' and
    'capacity_ah' columns added in place).
    """
    # Rows per cycle; the cumulative sum marks where each cycle starts.
    group = df_data.groupby(['Cycle_Index']).count()
    cycle_start_indices = group['Data_Point'].cumsum()

    def rebase(column_name):
        # Convert a cumulative column into per-cycle values by subtracting
        # the last value of the preceding cycle.  Numpy arrays are used
        # instead of pandas series for speed.  This helper replaces the
        # two copy-pasted loops of the original implementation.
        cumulative = np.array(df_data[column_name])
        per_cycle = cumulative.copy()
        for i in range(1, len(cycle_start_indices)):
            begin_value = cycle_start_indices.iloc[i - 1]
            end_value = cycle_start_indices.iloc[i]
            per_cycle[begin_value:end_value] = (
                cumulative[begin_value:end_value] -
                cumulative[begin_value - 1])
        return per_cycle

    df_data['charge_cycle_ah'] = rebase('Charge_Capacity(Ah)')
    df_data['discharge_cycle_ah'] = rebase('Discharge_Capacity(Ah)')

    # Net capacity -- the prediction column.  Not totally accurate: a few
    # points can go negative due to incorrect discharge values every few
    # cycles; they are treated as outliers downstream.
    df_data['capacity_ah'] = (df_data['charge_cycle_ah'] -
                              df_data['discharge_cycle_ah'])

    return df_data
579
580
581
def data_formatting(merged_df):
    """
    Reduce the merged dataframe to the model features so it can be framed
    as a supervised learning dataset.

    Args:
        merged_df(dataframe): The merged dataframe which can be obtained
        by using the function 'cx2_file_reader'

    Returns:
        A pandas dataframe of float32 values containing only the columns
        required to frame a time series as a supervised learning dataset.
    """
    # Keep only the columns matching 'Current', 'Voltage' or
    # 'discharge_cycle_ah' -- the features used by the LSTM model.
    feature_df = merged_df.filter(regex='Current|Voltage|discharge_cycle_ah')
    return feature_df.astype('float32')
599
600
601
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.

    Returns:
        Pandas DataFrame of series framed for supervised learning, with
        the battery-model column names applied (the column dropping and
        renaming assumes three variables with n_in=1, n_out=1).
    """
    n_vars = 1 if isinstance(data, list) else data.shape[1]
    df_data = pd.DataFrame(data)
    cols = []
    names = []
    # Input sequence (t-n, ... t-1): successively smaller positive shifts.
    for lag in range(n_in, 0, -1):
        cols.append(df_data.shift(lag))
        names.extend('var%d(t-%d)' % (var + 1, lag) for var in range(n_vars))
    # Forecast sequence (t, t+1, ... t+n): zero and negative shifts.
    for step in range(0, n_out):
        cols.append(df_data.shift(-step))
        if step == 0:
            names.extend('var%d(t)' % (var + 1) for var in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (var + 1, step)
                         for var in range(n_vars))
    # Put it all together.
    sl_df = pd.concat(cols, axis=1)
    sl_df.columns = names
    # Drop rows with NaN values (the first n_in / last n_out-1 rows).
    if dropnan:
        sl_df.dropna(inplace=True)
    # Current(t) and Voltage(t) are not prediction targets; drop them.
    sl_df.drop(sl_df.columns[[3, 4]], axis=1, inplace=True)
    sl_df.rename(columns={'var1(t-1)': 'Current(t-1)',
                          'var2(t-1)': 'Voltage(t-1)',
                          'var3(t-1)': 'discharge_capacity(t-1)',
                          'var3(t)': 'discharge_capacity(t)'},
                 inplace=True)
    return sl_df
640
641
642
def long_short_term_memory(model_data):
    """
    Split the supervised-learning dataframe into training and testing
    sets, train the keras LSTM model on the training set and predict the
    response for the testing set.

    Args:
        model_data(dataframe): Values of input and output variables
        of time series data framed as a supervised learning dataset.

    Returns:
        model_loss(dictionary): The keras training history dictionary.
        y_hat(array): Predicted response for the testing dataset.
    """
    # Hold out 20% of the data for testing; fixed seed for reproducibility.
    train, test = train_test_split(model_data, test_size=0.2, random_state=944)

    # First three columns are the inputs, the fourth is the output.
    train_x = train[train.columns[0:3]].values
    train_y = train[train.columns[3]].values
    test_x = test[test.columns[0:3]].values
    test_y = test[test.columns[3]].values

    # The LSTM expects 3D input: [samples, timesteps, features].
    train_x = train_x.reshape((train_x.shape[0], 1, train_x.shape[1]))
    test_x = test_x.reshape((test_x.shape[0], 1, test_x.shape[1]))

    # Network: a single 50-unit LSTM layer feeding one dense output,
    # trained with mean absolute error and the adam optimizer.
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_x.shape[1], train_x.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')

    # Fit on the training data, validating against the test split.
    history = model.fit(
        train_x,
        train_y,
        epochs=50,
        batch_size=72,
        validation_data=(test_x, test_y),
        verbose=0,
        shuffle=False)
    model_loss = history.history

    # Prediction for the test dataset.
    yhat = model.predict(test_x)
    return model_loss, yhat
691
692
def file_reader(data_dir, file_name_format, sheet_name, ignore_file_indices):
    """
    Read PL sample, CX2 and CS2 files and return a nice dataframe of the
    model features with cyclic values of charge and discharge capacity and
    cycles in ascending order.

    Args:
    data_dir (string): This is the absolute path to the data directory.
    file_name_format (string): Format of the filename, used to deduce other files.
    sheet_name (string): Sheet name containing the data in the excel file.
    ignore_file_indices (list, int): This list of ints tells which to ignore.

    Returns:
    The formatted feature data in a dataframe.
    """
    # CX2 and CS2 data live in excel workbooks; anything else is treated
    # as a PL samples csv experiment.
    prefix = file_name_format[:3]
    if prefix in ('CX2', 'CS2'):
        df_output = cx2_file_reader(data_dir, file_name_format, sheet_name)
    else:
        df_output = pl_samples_file_reader(
            data_dir, file_name_format, ignore_file_indices)

    # Keep only the model features (current, voltage, discharge capacity),
    # dropping the unnecessary columns.
    return data_formatting(df_output)
725