GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( de7c60...c9fd59 )
by Keertana
02:19
created

functions_and_tests.core.data_analysis()   B

Complexity

Conditions 5

Size

Total Lines 42
Code Lines 32

Duplication

Lines 42
Ratio 100 %

Importance

Changes 0
Metric Value
cc 5
eloc 32
nop 1
dl 42
loc 42
rs 8.6453
c 0
b 0
f 0
1
"""This module consists of all the functions utilized."""
2
# This is a tool to automate cyclic voltammetry analysis.
3
# Current Version = 1
4
5
import copy
6
import pandas as pd
7
import numpy as np
8
import matplotlib.pyplot as plt
9
import peakutils
10
11
12 View Code Duplication
def read_cycle(data):
    """Build a Potential/Current dataframe from the lines of one cycle.

    The first three entries of ``data`` are skipped (presumably the
    'CURVE' marker and header rows -- confirm against the instrument file
    format). Every remaining line is split on tab characters; field 3 is
    read as the potential and field 4 as the current.

    Parameters
    ----------
    data : sequence of str
        Lines belonging to a single cycle of the data file.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with columns 'Potential' and 'Current'.

    Raises
    ------
    IndexError
        If a data line has fewer than five tab-separated fields.
    ValueError
        If field 3 or field 4 cannot be converted to float.
    """
    rows = []
    for line in data[3:]:
        # Split once per line instead of once per extracted field.
        fields = line.split("\t")
        rows.append((float(fields[3]), float(fields[4])))
    return pd.DataFrame(rows, columns=['Potential', 'Current'])
32
33
34 View Code Duplication
def read_file_dash(lines):
    """Split the lines of a raw data file into one dataframe per cycle.

    Performs the same parsing as ``read_file`` but works on lines that
    are already in memory (the Dash upload path) instead of opening a
    file.

    A new cycle starts at every line beginning with 'CURVE'. The lines
    collected for a cycle are converted with ``read_cycle`` only when the
    *next* 'CURVE' marker is reached, so the lines following the last
    marker are never converted.
    NOTE(review): this drops the final cycle of the file -- confirm that
    this is intended before changing it, since callers iterate over the
    returned count.

    Header lines such as 'SCANRATE' / 'STEPSIZE' are not interpreted; the
    values were previously parsed but never used.

    Parameters
    ----------
    lines : iterable of str
        Lines of the raw data file.

    Returns
    -------
    dict_of_df : dict
        Maps 'cycle_<k>' (k starting at 1) to the dataframe of that cycle.
    number : int
        Number of cycles stored in ``dict_of_df`` (0 when fewer than two
        'CURVE' markers were seen).
    """
    dict_of_df = {}
    n_cycle = 0
    number = 0
    cycle_lines = []
    for line in lines:
        if line.startswith('CURVE'):
            n_cycle += 1
            if n_cycle > 1:
                # A new marker closes the previous cycle: convert it now.
                number = n_cycle - 1
                data = read_cycle(cycle_lines)
                dict_of_df['cycle_' + str(number)] = copy.deepcopy(data)
            cycle_lines = []
        if n_cycle:
            # Lines before the first 'CURVE' marker are header and skipped.
            cycle_lines.append(line)
    return dict_of_df, number
76
77
78 View Code Duplication
def read_file(file):
    """Read a raw data file and return one dataframe per cycle.

    The file is read line by line. A new cycle starts at every line
    beginning with 'CURVE'. The lines collected for a cycle are converted
    with ``read_cycle`` only when the *next* 'CURVE' marker is reached, so
    the lines following the last marker are never converted.
    NOTE(review): this drops the final cycle of the file -- confirm that
    this is intended before changing it, since callers iterate over the
    returned count.

    Header lines such as 'SCANRATE' / 'STEPSIZE' are not interpreted; the
    values were previously parsed but never used.

    Parameters
    ----------
    file : str
        Path of the raw data file.

    Returns
    -------
    dict_of_df : dict
        Maps 'cycle_<k>' (k starting at 1) to the dataframe of that cycle.
    number : int
        Number of cycles stored in ``dict_of_df``. It is 0 when the file
        holds fewer than two 'CURVE' markers (previously this case raised
        UnboundLocalError because ``number`` was never assigned).
    """
    dict_of_df = {}
    n_cycle = 0
    number = 0
    cycle_lines = []
    with open(file, 'rt') as f_val:
        print(file + ' Opened')
        for line in f_val:
            if line.startswith('CURVE'):
                n_cycle += 1
                if n_cycle > 1:
                    # A new marker closes the previous cycle: convert it.
                    number = n_cycle - 1
                    data = read_cycle(cycle_lines)
                    dict_of_df['cycle_' + str(number)] = copy.deepcopy(data)
                cycle_lines = []
            if n_cycle:
                # Lines before the first 'CURVE' marker are header lines.
                cycle_lines.append(line)
    return dict_of_df, number
0 ignored issues
show
introduced by
The variable number does not seem to be defined for all execution paths.
Loading history...
124
125
#df = pd.DataFrame(list(dict1['df_1'].items()))
126
#list1, list2 = list(dict1['df_1'].items())
127
#list1, list2 = list(dict1.get('df_'+str(1)))
128
129
130 View Code Duplication
def data_frame(dict_cycle, number):
    """Return a fresh Potential/Current dataframe for one cycle.

    Parameters
    ----------
    dict_cycle : dict
        Dictionary of dataframes keyed by 'cycle_<k>'.
    number : int
        Cycle number to extract.

    Returns
    -------
    pandas.DataFrame
        New dataframe with columns 'Potential' and 'Current' (taken from
        the first and second column of the stored dataframe) and a fresh
        0-based index.

    Raises
    ------
    KeyError
        If ``dict_cycle`` has no entry for the requested cycle.
    ValueError
        If the stored dataframe does not have exactly two columns.
    """
    key = 'cycle_' + str(number)
    cycle = dict_cycle.get(key)
    if cycle is None:
        raise KeyError("no data stored for {!r}".format(key))
    # DataFrame.items() yields (column name, Series) pairs in column order.
    (_, potential), (_, current) = cycle.items()
    zipped_list = list(zip(potential, current))
    return pd.DataFrame(zipped_list, columns=['Potential', 'Current'])
146
147
148 View Code Duplication
def plot_fig(dict_cycle, number):
    """Plot current against potential for cycles 1..number.

    Each cycle is drawn as its own labelled curve on the current
    matplotlib figure, and the figure is saved to 'cycle.png' in the
    working directory. Progress information is printed to stdout.

    Parameters
    ----------
    dict_cycle : dict
        Dictionary of dataframes for all the cycles.
    number : int
        Number of cycles to plot.
    """
    data = None
    for cycle in range(1, number + 1):
        print(cycle)
        data = data_frame(dict_cycle, cycle)
        plt.plot(data.Potential, data.Current, label="Cycle{}".format(cycle))

    # With number == 0 the loop never runs and there is nothing to preview
    # (this used to raise NameError).
    if data is not None:
        print(data.head())
    plt.xlabel('Voltage')
    plt.ylabel('Current')
    plt.legend()
    plt.savefig('cycle.png')
    print('executed')
170
171
172
# Split forward and backward sweeping data to make processing easier.
173 View Code Duplication
def split(vector):
    """Split a pandas Series into its first and second half.

    Parameters
    ----------
    vector : pandas.Series
        Typically a dataframe column, e.g. ``df['Potential']``.

    Returns
    -------
    vector1, vector2 : numpy.ndarray
        The first ``len(vector) // 2`` values and the remaining values.
        For an odd length the second array is one element longer.

    Raises
    ------
    AssertionError
        If ``vector`` is not a pandas Series.
    """
    assert isinstance(vector, pd.Series), "Input should be pandas series"
    values = np.array(vector)
    split_top = len(values) // 2
    # The upper bound of the first half must be the midpoint index; the
    # previous code sliced with the function object ``split`` by mistake.
    vector1 = values[:split_top]
    vector2 = values[split_top:]
    return vector1, vector2
194
195
196 View Code Duplication
def critical_idx(arr_x, arr_y):
    """Find the index where the data set stops being linear.

    The point-to-point slope dy/dx is smoothed with two moving windows (a
    short 5-point one and a long 15-point one). The positions where the
    two smoothed curves cross are located, and the sixth crossing
    (``idx[5]``) is returned.

    Parameters
    ----------
    arr_x : numpy.ndarray
        x values, normally one half returned by ``split``.
    arr_y : numpy.ndarray
        y values with the same first dimension as ``arr_x``.

    Returns
    -------
    numpy integer
        Index of the sixth crossing of the two smoothed slope curves.

    Raises
    ------
    AssertionError
        If either input is not a numpy array.
    ValueError
        If the inputs differ in their first dimension.
    IndexError
        If fewer than six crossings are found.
    """
    assert isinstance(arr_x, np.ndarray), "Input should be numpy array"
    assert isinstance(arr_y, np.ndarray), "Input should be numpy array"
    if arr_x.shape[0] != arr_y.shape[0]:
        raise ValueError("x and y must have same first dimension, but "
                         "have shapes {} and {}".format(arr_x.shape, arr_y.shape))
    k_val = np.diff(arr_y) / np.diff(arr_x)  # point-to-point slopes

    # Window sizes and divisors are tuning knobs for noise sensitivity.
    # NOTE(review): the short window sums 5 points but divides by 10, so
    # it is half of a true 5-point mean. Kept as-is; confirm the intent.
    # Stopping 10 (resp. 15) short of the end keeps the windows in range.
    # Results are rounded to 5 decimals.
    ave10 = [round(sum(k_val[i:i + 5]) / 10, 5)
             for i in range(len(k_val) - 10)]
    ave15 = [round(sum(k_val[i:i + 15]) / 15, 5)
             for i in range(len(k_val) - 15)]
    ave10i = np.asarray(ave10)
    ave15i = np.asarray(ave15)

    # A crossing is a change of sign of the difference between the curves.
    # The comparison has to be applied to the diff of the signs; the old
    # code compared first (diff of ``sign != 0``), which only detected
    # transitions to or from an exact zero.
    difference = ave15i - ave10i[:len(ave15i)]
    idx = np.argwhere(np.diff(np.sign(difference)) != 0).reshape(-1)
    return idx[5]
245
# This is based on method 1, where the user cannot choose the baseline.
246
# To let the user choose the baseline, use method 2 instead.
247
248
249
def sum_mean(vector):
    """Return the sum and the mean of a numpy array.

    Parameters
    ----------
    vector : numpy.ndarray
        Values to accumulate, e.g. one half returned by ``split``.

    Returns
    -------
    list
        ``[total, total / len(vector)]``.

    Raises
    ------
    AssertionError
        If ``vector`` is not a numpy array.
    ZeroDivisionError
        If ``vector`` is empty.
    """
    # isinstance needs the object and the type as two separate arguments;
    # the previous ``isinstance(vector == np.ndarray)`` always raised
    # TypeError.
    assert isinstance(vector, np.ndarray), "Input should be numpy array"
    total = sum(vector)
    return [total, total / len(vector)]
264
265
266 View Code Duplication
def multiplica(vector_x, vector_y):
    """Return the sum of the element-wise products of two arrays.

    Parameters
    ----------
    vector_x, vector_y : numpy.ndarray
        Normally outputs of ``split``. They may be the same array. If the
        lengths differ, the extra elements of the longer one are ignored.

    Returns
    -------
    number
        Sum of ``x * y`` over the paired elements (0 for empty input).

    Raises
    ------
    AssertionError
        If either input is not a numpy array.
    """
    # isinstance needs the object and the type as two separate arguments;
    # the previous one-argument form always raised TypeError.
    assert isinstance(vector_x, np.ndarray), "Input should be numpy array"
    assert isinstance(vector_y, np.ndarray), "Input should be numpy array"
    return sum(vec_x * vec_y for vec_x, vec_y in zip(vector_x, vector_y))
285
286
287 View Code Duplication
def linear_coeff(vec_x, vec_y):
    """Return slope and intercept of the least-squares line through x, y.

    Uses the closed form
    ``m = (sum(x*y) - sum(x)*mean(y)) / (sum(x*x) - sum(x)*mean(x))`` and
    ``b = mean(y) - m*mean(x)``.

    Parameters
    ----------
    vec_x : numpy.ndarray
        x values, normally an output of ``split``.
    vec_y : numpy.ndarray
        y values, normally an output of ``split``.

    Returns
    -------
    m_val, b_val : float
        Slope and y-axis intercept of the fitted line.
    """
    # Compute each sum/mean once instead of re-running the helpers for
    # every term of the formula.
    sum_x, mean_x = sum_mean(vec_x)
    mean_y = sum_mean(vec_y)[1]
    m_val = ((multiplica(vec_x, vec_y) - sum_x * mean_y) /
             (multiplica(vec_x, vec_x) - sum_x * mean_x))
    b_val = mean_y - m_val * mean_x
    return m_val, b_val
304
305
306
def y_fitted_line(m_val, b_val, vec_x):
    """Evaluate the line ``y = m_val * x + b_val`` at every x value.

    Parameters
    ----------
    m_val : number
        Slope of the baseline.
    b_val : number
        y-axis intercept of the baseline.
    vec_x : iterable of numbers
        x values, normally an output of ``split``.

    Returns
    -------
    list
        Baseline y value for each element of ``vec_x``, in order.
    """
    return [m_val * x_val + b_val for x_val in vec_x]
325
326
327 View Code Duplication
def linear_background(vec_x, vec_y):
    """Fit a linear baseline to one half of the cyclic voltammetry data.

    A reference index is taken from ``critical_idx`` (plus an offset of
    5). A straight line is fitted to the window of points spanning half
    of that index on either side of it, and the line is then evaluated
    over all of ``vec_x``.

    Parameters
    ----------
    vec_x : numpy.ndarray
        x values (potential), normally an output of ``split``.
    vec_y : numpy.ndarray
        y values (current), normally an output of ``split``.

    Returns
    -------
    list
        Baseline y value for each element of ``vec_x``.

    Raises
    ------
    AssertionError
        If either input is not a numpy array.
    """
    assert isinstance(vec_x, np.ndarray), "Input of the function should be numpy array"
    assert isinstance(vec_y, np.ndarray), "Input of the function should be numpy array"
    # The +5 offset is an arbitrary tuning value.
    idx = critical_idx(vec_x, vec_y) + 5
    half_width = int(0.5 * idx)
    window = slice(idx - half_width, idx + half_width)
    m_val, b_val = linear_coeff(vec_x[window], vec_y[window])
    return y_fitted_line(m_val, b_val, vec_x)
348
349
350 View Code Duplication
def peak_detection_fxn(data_y):
    """Locate the peak index in each half of the current data.

    The column is cut into a first and a second half with ``split``.
    Each half is trimmed by 10 % of the first half's length at both ends
    (to avoid spurious peaks near the ends of the sweep, e.g. from
    electrolysis) and passed to ``peakutils.indexes`` with ``thres=0.99``
    and ``min_dist=20``. The first half is passed as absolute values,
    since its peak is expected to be negative. When several peaks are
    reported, the middle one is used.

    Parameters
    ----------
    data_y : pandas.Series
        Current column of the cycle dataframe.

    Returns
    -------
    list
        ``[top, bottom]``: index of the peak within the second half and
        within the first half. The trimmed offset has been added back, so
        the indices refer to the untrimmed halves.

    Raises
    ------
    IndexError
        If no peak is detected in one of the halves.
    """
    # First half -> "bottom" curve, second half -> "top" curve.
    col_y1, col_y2 = split(data_y)

    len_y = len(col_y1)
    ten_percent = int(np.around(0.1 * len_y))

    # Keep only the middle 80 % of each half.
    mod_col_y2 = col_y2[ten_percent:len_y - ten_percent]
    mod_col_y1 = col_y1[ten_percent:len_y - ten_percent]

    peak_top = peakutils.indexes(mod_col_y2, thres=0.99, min_dist=20)
    peak_bottom = peakutils.indexes(abs(mod_col_y1), thres=0.99, min_dist=20)

    if len(peak_top) == 0 or len(peak_bottom) == 0:
        raise IndexError("no peak detected in the top or bottom half of "
                         "the data")

    # Pick the middle detected peak and shift it back by the trimmed part.
    return [peak_top[len(peak_top) // 2] + ten_percent,
            peak_bottom[len(peak_bottom) // 2] + ten_percent]
405
406 View Code Duplication
def peak_values(dataframe_x, dataframe_y):
    """Return the potential and current at the detected peaks.

    Parameters
    ----------
    dataframe_x : pandas.Series
        Potential column, e.g. ``df['Potential']``.
    dataframe_y : pandas.Series
        Current column, e.g. ``df['Current']``.

    Returns
    -------
    numpy.ndarray
        Four values in this order: potential of the top peak, current of
        the top peak, potential of the bottom peak, current of the bottom
        peak.
    """
    top_idx, bottom_idx = peak_detection_fxn(dataframe_y)
    # The first half of the data is the bottom curve, the second half the
    # top curve.
    potential_bottom, potential_top = split(dataframe_x)
    current_bottom, current_top = split(dataframe_y)
    return np.array([potential_top[top_idx],
                     current_top[top_idx],
                     potential_bottom[bottom_idx],
                     current_bottom[bottom_idx]])
437
438
439
def del_potential(dataframe_x, dataframe_y):
    """Return the potential difference between the two detected peaks.

    Parameters
    ----------
    dataframe_x : pandas.Series
        Potential column, e.g. ``df['Potential']``.
    dataframe_y : pandas.Series
        Current column, e.g. ``df['Current']``.

    Returns
    -------
    float
        Potential of the top peak minus potential of the bottom peak.
    """
    # Run the peak search once and reuse the result for both potentials.
    values = peak_values(dataframe_x, dataframe_y)
    return values[0] - values[2]
459
460
461
def half_wave_potential(dataframe_x, dataframe_y):
    """Return half of the peak-to-peak potential separation.

    This is the offset from one peak to the midpoint between the peaks.
    It is not the midpoint potential itself; the caller has to add it to
    the lower peak potential to obtain that.

    Parameters
    ----------
    dataframe_x : pandas.Series
        Potential column, e.g. ``df['Potential']``.
    dataframe_y : pandas.Series
        Current column, e.g. ``df['Current']``.

    Returns
    -------
    float
        ``del_potential(dataframe_x, dataframe_y) / 2``.
    """
    return del_potential(dataframe_x, dataframe_y) / 2
480
481
482 View Code Duplication
def peak_heights(dataframe_x, dataframe_y):
    """Return the heights of both peaks above their linear baselines.

    Parameters
    ----------
    dataframe_x : pandas.Series
        Potential column, e.g. ``df['Potential']``.
    dataframe_y : pandas.Series
        Current column, e.g. ``df['Current']``.

    Returns
    -------
    list
        ``[height_of_max, height_of_min]``: the top peak current minus
        its baseline value, and the absolute difference between the
        bottom peak current and its baseline value.
    """
    # Detect the peaks once; the peak currents are read directly from the
    # split halves, which are the values ``peak_values`` would report.
    top_idx, bottom_idx = peak_detection_fxn(dataframe_y)
    col_x1, col_x2 = split(dataframe_x)
    col_y1, col_y2 = split(dataframe_y)
    current_max = col_y2[top_idx]
    current_min = col_y1[bottom_idx]
    line_at_min = linear_background(col_x1, col_y1)[bottom_idx]
    line_at_max = linear_background(col_x2, col_y2)[top_idx]
    height_of_max = current_max - line_at_max
    height_of_min = abs(current_min - line_at_min)
    return [height_of_max, height_of_min]
509
510
511
def peak_ratio(dataframe_x, dataframe_y):
    """Return the ratio of the top peak height to the bottom peak height.

    Parameters
    ----------
    dataframe_x : pandas.Series
        Potential column, e.g. ``df['Potential']``.
    dataframe_y : pandas.Series
        Current column, e.g. ``df['Current']``.

    Returns
    -------
    float
        ``height_of_max / height_of_min`` as reported by ``peak_heights``.
    """
    # Compute both heights in one pass instead of two full analyses.
    height_of_max, height_of_min = peak_heights(dataframe_x, dataframe_y)
    return height_of_max / height_of_min
530
531
532 View Code Duplication
def data_analysis(data):
    """Run the full analysis of one cycle and collect the results.

    The results are also shown in the user interface (Dash).

    Parameters
    ----------
    data : pandas.DataFrame
        Cycle data with columns 'Potential' and 'Current'.

    Returns
    -------
    tuple
        ``(results_dict, col_x1, col_x2, col_y1, col_y2, y_base1,
        y_base2, peak_index)``. ``results_dict`` holds the peak current
        ratio, both peak heights, both peak potentials, their difference,
        the redox potential and a 'Reversible' flag ('Yes' or 'No'). A
        'Type' entry ('Catholyte' or 'Anolyte') is present only when the
        cycle is flagged reversible. The remaining items are the split
        potential and current halves, the two fitted baselines and the
        peak indices.
    """
    results_dict = {}

    x_val = data['Potential']
    y_val = data['Current']
    # Peak indices as a list: [top, bottom].
    peak_index = peak_detection_fxn(y_val)
    # Split x and y into halves to get one baseline per sweep direction.
    col_x1, col_x2 = split(x_val)
    col_y1, col_y2 = split(y_val)
    y_base1 = linear_background(col_x1, col_y1)
    y_base2 = linear_background(col_x2, col_y2)
    # Calculations based on baseline and peak.
    values = peak_values(x_val, y_val)
    esub_t = values[0]
    esub_b = values[2]
    dof_e = del_potential(x_val, y_val)
    half_e = min(esub_t, esub_b) + half_wave_potential(x_val, y_val)
    # Compute the peak heights once; the ratio is their quotient, which
    # is exactly what peak_ratio() returns.
    ipa, ipc = peak_heights(x_val, y_val)
    ratio_i = ipa / ipc
    results_dict['Peak Current Ratio'] = ratio_i
    results_dict['Ipc (A)'] = ipc
    results_dict['Ipa (A)'] = ipa
    results_dict['Epc (V)'] = esub_b
    results_dict['Epa (V)'] = esub_t
    results_dict['∆E (V)'] = dof_e
    results_dict['Redox Potential (V)'] = half_e
    # A peak separation above 0.3 is treated as not reversible.
    if dof_e > 0.3:
        reversible = False
        results_dict['Reversible'] = 'No'
    else:
        reversible = True
        results_dict['Reversible'] = 'Yes'

    # The type is reported only for reversible cycles.
    if reversible:
        results_dict['Type'] = 'Catholyte' if half_e > 0 else 'Anolyte'
    return results_dict, col_x1, col_x2, col_y1, col_y2, y_base1, y_base2, peak_index
574
    #return results_dict
575