Conditions | 14 |
Total Lines | 131 |
Code Lines | 57 |
Lines | 11 |
Ratio | 8.4 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include Extract Method. If many parameters or temporary variables are present, also consider Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex classes like runner_battdeg.PL_samples_file_joiner() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | import pandas as pd |
||
try:
    # `profile` is injected into builtins by kernprof (line_profiler).
    # Provide a no-op fallback so the module also runs without kernprof
    # (the bare decorator raised NameError in a normal run).
    profile
except NameError:
    def profile(func):
        # No-op stand-in for the kernprof @profile decorator.
        return func


def _per_cycle_values(cumulative, cycle_end_indices):
    """Re-base a cumulative Ah array so each cycle starts from zero.

    Args:
        cumulative (numpy.ndarray): Cumulative values (e.g. Charge_Ah).
        cycle_end_indices (pandas.Series): Cumulative row counts marking
            the (exclusive) end row of each cycle.

    Returns:
        numpy.ndarray: Per-cycle values (first cycle left unchanged).
    """
    per_cycle = cumulative.copy()
    for i in range(1, len(cycle_end_indices)):
        start = cycle_end_indices.iloc[i - 1]
        stop = cycle_end_indices.iloc[i]
        # Subtract the last cumulative value of the previous cycle.
        per_cycle[start:stop] = cumulative[start:stop] - cumulative[start - 1]
    return per_cycle


@profile
def PL_samples_file_joiner(data_dir, file_name_format, ignore_file_indices):
    """
    Read the data for a PL Samples experiment and return a single
    dataframe with cycles stitched together in ascending file order.

    Args:
        data_dir (string): This is the absolute path to the data directory.
        file_name_format (string): Name of one data file; its first four
            characters identify the experiment and are used to deduce the
            sibling files named ``<exp>(<n>).csv``.
        ignore_file_indices (list, int): File numbers to exclude from the
            joined output.

    Returns:
        The complete test data in a dataframe with extra columns
        ``charge_cycle_ah``, ``discharge_cycle_ah`` and ``capacity_ah``.

    Raises:
        TypeError: If any argument has the wrong type.
        FileNotFoundError: If ``file_name_format`` is not in ``data_dir``.
        ValueError: If ``ignore_file_indices`` excludes every data file.
    """
    # Raise an exception if the type of the inputs is not correct
    if not isinstance(data_dir, str):
        raise TypeError('data_dir is not of type string')

    if not isinstance(file_name_format, str):
        raise TypeError('file_name_format is not of type string')

    if not isinstance(ignore_file_indices, list):
        raise TypeError("ignore_file_indices should be a list")

    for index in ignore_file_indices:
        if not isinstance(index, int):
            raise TypeError("""ignore_file_indices elements should be
            of type integer""")

    if not os.path.exists(join(data_dir, file_name_format)):
        raise FileNotFoundError("File {} not found in the location {}"
                                .format(file_name_format, data_dir))

    # The first four characters of the file name identify the experiment.
    exp_name = file_name_format[0:4]

    # Raw string for the pattern: the original '\((.+?)\).csv' was a
    # non-raw string with invalid escape sequences and an unescaped dot.
    file_pattern = re.compile(re.escape(exp_name) + r'\((.+?)\)\.csv')

    # Map file number -> dataframe for every matching file in the directory.
    frames_by_number = {}
    for filename in listdir(data_dir):
        if not isfile(join(data_dir, filename)):
            continue
        if exp_name not in filename:
            continue
        match = file_pattern.search(filename)
        if match is None:
            # Skip files that mention the experiment but do not follow the
            # <exp>(<n>).csv scheme (the original crashed on .group(1) here).
            continue
        frames_by_number[int(match.group(1))] = pd.read_csv(join(data_dir, filename))

    # Keep the wanted files in ascending file-number order. Sorting (rather
    # than relying on set iteration order) makes the output deterministic.
    wanted_keys = sorted(set(frames_by_number) - set(ignore_file_indices))

    # Concatenate the files, offsetting cycle numbers, times and cumulative
    # Ah columns so each file continues where the previous one left off.
    df_out = None
    for key in wanted_keys:
        df_next = frames_by_number[key]
        if df_out is None:
            # Seed an empty frame with the right columns, as the original did.
            df_out = pd.DataFrame(data=None, columns=df_next.columns)
        else:
            for column in ('Cycle', 'Time_sec', 'Charge_Ah', 'Discharge_Ah'):
                df_next[column] = np.array(df_next[column]) + max(np.array(df_out[column]))
        df_out = pd.concat([df_out, df_next])

    if df_out is None:
        # Previously this fell through to an AttributeError on None.
        raise ValueError("No data files remain after applying ignore_file_indices")

    # Reset the index and drop the old index
    df_out_indexed = df_out.reset_index(drop=True)

    # Row counts per cycle; their cumulative sum gives the row index at
    # which each cycle ends (and the next one starts).
    cycle_start_indices = df_out_indexed.groupby(['Cycle']).count()['Time_sec'].cumsum()

    # Work on numpy arrays rather than the pandas series directly:
    # considerably faster for the slice arithmetic below.
    charge_ah = np.array(df_out_indexed['Charge_Ah'])
    discharge_ah = np.array(df_out_indexed['Discharge_Ah'])

    df_out_indexed['charge_cycle_ah'] = _per_cycle_values(charge_ah, cycle_start_indices)
    df_out_indexed['discharge_cycle_ah'] = _per_cycle_values(discharge_ah, cycle_start_indices)

    # This is the data column we can use for prediction.
    # This is not totally accurate, as this still has some points that go
    # negative, due to incorrect discharge_Ah values every few cycles.
    # But the machine learning algorithm should consider these as outliers
    # and hopefully get over it. We can come back and correct this.
    df_out_indexed['capacity_ah'] = charge_ah - discharge_ah

    return df_out_indexed
||
159 | |||
177 | out_df = PL_samples_file_joiner(data_dir, fnf, ignore_indices) |