| Conditions | 14 |
| Total Lines | 131 |
| Code Lines | 57 |
| Lines | 11 |
| Ratio | 8.4 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters or temporary variables are present, consider Introduce Parameter Object (to bundle related parameters) or Replace Temp with Query (to eliminate temporaries) before extracting methods.
Complex classes like runner_battdeg.PL_samples_file_joiner() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | import pandas as pd |
||
try:
    profile
except NameError:
    # No-op fallback so the module still imports and runs when kernprof's
    # line profiler has not injected `profile` into builtins.
    def profile(func):
        return func


@profile
def PL_samples_file_joiner(data_dir, file_name_format, ignore_file_indices):
    """
    Read the data for a PL Samples experiment and return a single
    dataframe with cycles in ascending order.

    Args:
        data_dir (string): Absolute path to the data directory.
        file_name_format (string): Name of one data file of the experiment;
            its first four characters are the experiment prefix used to
            find the sibling files ``<prefix>(<n>).csv``.
        ignore_file_indices (list, int): File numbers to exclude.

    Returns:
        pandas.DataFrame: The complete test data with extra columns
        ``charge_cycle_ah``, ``discharge_cycle_ah`` and ``capacity_ah``.

    Raises:
        TypeError: If any argument has the wrong type.
        FileNotFoundError: If file_name_format is not present in data_dir.
    """
    # Validate inputs early with explicit errors.
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(ignore_file_indices, list):
        raise TypeError("ignore_file_indices should be a list")

    for index in ignore_file_indices:
        if not isinstance(index, int):
            raise TypeError("""ignore_file_indices elements should be
            of type integer""")

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # Plain files in the directory (subdirectories are skipped).
    onlyfiles = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]

    # The first four characters of the file name identify the experiment.
    exp_name = file_name_format[0:4]

    # Map file number -> dataframe for every file of this experiment.
    # Raw string so the escaped parentheses reach `re` literally.
    file_pattern = re.compile(exp_name + r'\((.+?)\).csv')
    dict_files = {}
    for filename in onlyfiles:
        if exp_name in filename:
            file_number = file_pattern.search(filename).group(1)
            dict_files[int(file_number)] = pd.read_csv(join(data_dir, filename))

    # Keep only the wanted file numbers, in ascending order.  sorted() is
    # required here: the set difference does not preserve any ordering, so
    # iterating the raw set could concatenate the files out of order.
    wanted_keys = sorted(set(dict_files) - set(ignore_file_indices))

    # Concatenate the files; each file continues where the previous one
    # stopped, so shift its cumulative columns by the running maxima.
    df_out = None
    for key in wanted_keys:
        df_next = dict_files[key]
        if df_out is None:
            df_out = df_next
        else:
            for col in ('Cycle', 'Time_sec', 'Charge_Ah', 'Discharge_Ah'):
                df_next[col] = np.array(df_next[col]) + max(np.array(df_out[col]))
            df_out = pd.concat([df_out, df_next])

    # NOTE: matlab-datenum -> Date_Time conversion deliberately omitted
    # for performance; downstream code does not need it.

    # Reset the index and drop the old per-file indices.
    df_out_indexed = df_out.reset_index(drop=True)

    # Cumulative row counts per cycle: entry i is the row index at which
    # cycle i ends (and the next cycle starts).
    df_grouped = df_out_indexed.groupby(['Cycle']).count()
    cycle_start_indices = df_grouped['Time_sec'].cumsum()

    # Work on numpy copies rather than pandas series: the slice
    # assignments below are considerably faster on plain arrays.
    charge_ah = np.array(df_out_indexed['Charge_Ah'])
    discharge_ah = np.array(df_out_indexed['Discharge_Ah'])

    df_out_indexed['charge_cycle_ah'] = _per_cycle_deltas(
        charge_ah, cycle_start_indices)
    df_out_indexed['discharge_cycle_ah'] = _per_cycle_deltas(
        discharge_ah, cycle_start_indices)

    # Usable prediction column.  Not perfectly accurate -- a few cycles
    # carry incorrect Discharge_Ah values that can push this negative --
    # but downstream models should treat those points as outliers.
    df_out_indexed['capacity_ah'] = charge_ah - discharge_ah

    return df_out_indexed


def _per_cycle_deltas(cumulative, cycle_start_indices):
    """Rebase a cumulative Ah column so each cycle restarts from its origin.

    Args:
        cumulative (numpy.ndarray): Cumulative values over the whole test.
        cycle_start_indices (pandas.Series): Cumulative row counts; entry
            i-1 is the first row index of cycle i (zero-based rows).

    Returns:
        numpy.ndarray: Per-cycle values; the first cycle is left unchanged,
        matching the original implementation.
    """
    per_cycle = cumulative.copy()
    for i in range(1, len(cycle_start_indices)):
        start = cycle_start_indices.iloc[i - 1]
        stop = cycle_start_indices.iloc[i]
        # Subtract the last cumulative value of the previous cycle.
        per_cycle[start:stop] = cumulative[start:stop] - cumulative[start - 1]
    return per_cycle
||
| 159 | |||
| 177 | out_df = PL_samples_file_joiner(data_dir, fnf, ignore_indices) |