| Metric | Value |
| --- | --- |
| Total Complexity | 71 |
| Total Lines | 788 |
| Duplicated Lines | 93.91 % |
| Changes | 0 |
Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places.
Complex classes like diff_classifier.pca often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring, as sketched below. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
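A minimal sketch of the Extract Class move (all names here are hypothetical, chosen only to illustrate the prefix heuristic):

```python
# Before: one class holds analysis state plus plot_-prefixed settings.
class PcaReport:
    def __init__(self):
        self.components = None
        self.plot_figsize = (8, 8)
        self.plot_labelsize = 20


# After Extract Class: the plot_* fields move into a cohesive component.
class PlotSettings:
    def __init__(self, figsize=(8, 8), labelsize=20):
        self.figsize = figsize
        self.labelsize = labelsize


class PcaReportRefactored:
    def __init__(self):
        self.components = None
        self.plot = PlotSettings()
```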
1 | """Performs principle component analysis on input datasets. |
||
2 | |||
3 | This module performs principle component analysis on input datasets using |
||
4 | functions from scikit-learn. It is optimized to data formats used in |
||
5 | diff_classifier, but can potentially be extended to other applications. |
||
6 | |||
7 | """ |
||
8 | |||
9 | import random |
||
10 | import pandas as pd |
||
11 | import numpy as np |
||
12 | from scipy import stats, linalg |
||
13 | import seaborn as sns |
||
14 | from sklearn import neighbors |
||
15 | from sklearn.decomposition import PCA as pca |
||
16 | from sklearn.preprocessing import StandardScaler as stscale |
||
17 | from sklearn.preprocessing import Imputer |
||
18 | from sklearn.neural_network import MLPClassifier |
||
19 | from sklearn.ensemble import RandomForestClassifier |
||
20 | import matplotlib.pyplot as plt |
||
21 | from matplotlib.pyplot import cm |
||
22 | from mpl_toolkits.mplot3d import Axes3D |
||
23 | |||
24 | |||
25 | class Bunch: |
||
26 | def __init__(self, **kwds): |
||
27 | self.__dict__.update(kwds) |
||
28 | |||
29 | |||
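The usage snippets interleaved below are illustrative sketches rather than part of the module; they assume numpy and pandas are available as imported above and that the module's functions are in scope (e.g. via `from diff_classifier.pca import *`). For Bunch:

```python
results = Bunch(scaled=np.zeros((10, 3)), n_components=3)
print(results.n_components)     # 3
results.note = 'attributes can also be added after construction'
```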
```python
def partial_corr(mtrx):
    """Calculates linear partial correlation coefficients

    Returns the sample linear partial correlation coefficients between pairs
    of variables in mtrx, controlling for the remaining variables in mtrx.

    Parameters
    ----------
    mtrx : array-like, shape (n, p)
        Array with the different variables. Each column of mtrx is taken as
        a variable.

    Returns
    -------
    P : array-like, shape (p, p)
        P[i, j] contains the partial correlation of mtrx[:, i] and
        mtrx[:, j] controlling for the remaining variables in mtrx.

    Notes
    -----
    Partial correlation in Python (clone of Matlab's partialcorr).

    This uses the linear regression approach to compute the partial
    correlation (might be slow for a huge number of variables). The
    algorithm is detailed here:

    http://en.wikipedia.org/wiki/Partial_correlation#Using_linear_regression

    Taking X and Y as two variables of interest and Z the matrix with all
    the variables minus {X, Y}, the algorithm can be summarized as:

    1) perform a normal linear least-squares regression with X as the target
       and Z as the predictor
    2) calculate the residuals in Step #1
    3) perform a normal linear least-squares regression with Y as the target
       and Z as the predictor
    4) calculate the residuals in Step #3
    5) calculate the correlation coefficient between the residuals from
       Steps #2 and #4

    The result is the partial correlation between X and Y while controlling
    for the effect of Z.

    Adapted from code by Fabian Pedregosa-Izquierdo:
    Date: Nov 2014
    Author: Fabian Pedregosa-Izquierdo, [email protected]
    Testing: Valentina Borghesani, [email protected]

    """

    mtrx = np.asarray(mtrx)
    pfeat = mtrx.shape[1]
    pcorr = np.zeros((pfeat, pfeat), dtype=float)
    for i in range(pfeat):
        pcorr[i, i] = 1
        for j in range(i+1, pfeat):
            # Regress columns i and j on all remaining columns.
            idx = np.ones(pfeat, dtype=bool)
            idx[i] = False
            idx[j] = False
            beta_i = linalg.lstsq(mtrx[:, idx], mtrx[:, j])[0]
            beta_j = linalg.lstsq(mtrx[:, idx], mtrx[:, i])[0]

            res_j = mtrx[:, j] - mtrx[:, idx].dot(beta_i)
            res_i = mtrx[:, i] - mtrx[:, idx].dot(beta_j)

            # The partial correlation is the correlation of the residuals.
            corr = stats.pearsonr(res_i, res_j)[0]
            pcorr[i, j] = corr
            pcorr[j, i] = corr

    return pcorr
```
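As a quick sanity check on synthetic data (values are arbitrary), the returned matrix is symmetric with a unit diagonal:

```python
rng = np.random.RandomState(0)
data = rng.normal(size=(100, 4))        # 100 samples, 4 variables
pcorr = partial_corr(data)
assert pcorr.shape == (4, 4)
assert np.allclose(np.diag(pcorr), 1.0)
assert np.allclose(pcorr, pcorr.T)      # symmetric by construction
```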
```python
def kmo(dataset):
    """Calculates the Kaiser-Meyer-Olkin measure on an input dataset

    Parameters
    ----------
    dataset : array-like, shape (n, p)
        Array containing n samples and p features. Must have no NaNs.
        Ideally scaled before performing test.

    Returns
    -------
    kmostat : float
        KMO test value

    Notes
    -----
    Based on calculations shown here:

    http://www.statisticshowto.com/kaiser-meyer-olkin/

    -- 0.00-0.49 unacceptable
    -- 0.50-0.59 miserable
    -- 0.60-0.69 mediocre
    -- 0.70-0.79 middling
    -- 0.80-0.89 meritorious
    -- 0.90-1.00 marvelous

    """

    # Correlation matrix and the partial correlation matrix.
    corrmatrix = np.corrcoef(dataset.transpose())
    pcorr = partial_corr(dataset)

    # Calculation of the KMO statistic. Both the correlations and the
    # partial correlations are squared, per the reference above.
    matrix = np.multiply(corrmatrix, corrmatrix)
    pmatrix = np.multiply(pcorr, pcorr)
    rij = np.sum(matrix) - np.trace(matrix)
    uij = np.sum(pmatrix) - np.trace(pmatrix)
    kmostat = rij/(rij+uij)
    print(kmostat)
    return kmostat
```
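A usage sketch with synthetic correlated columns (arbitrary values; inputs should be scaled and NaN-free, as the docstring notes):

```python
rng = np.random.RandomState(1)
base = rng.normal(size=(200, 1))
# Four noisy copies of the same signal share a lot of common variance.
dataset = np.hstack([base + 0.1 * rng.normal(size=(200, 1))
                     for _ in range(4)])
kmostat = kmo(dataset)
assert 0.0 <= kmostat <= 1.0
```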
```python
def pca_analysis(dataset, dropcols=[], imputenans=True, scale=True,
                 rem_outliers=True, out_thresh=10, n_components=5,
                 existing_model=False, model_file='Optional'):
    """Performs a principal component analysis on an input dataset

    Parameters
    ----------
    dataset : pandas.core.frame.DataFrame, shape (n, p)
        Input dataset with n samples and p features
    dropcols : list
        Columns to exclude from pca analysis. At a minimum, user must
        exclude non-numeric columns.
    imputenans : bool
        If True, impute NaN values as column means.
    scale : bool
        If True, columns will be scaled to a mean of zero and a standard
        deviation of 1.
    rem_outliers : bool
        If True, values more than out_thresh standard deviations from a
        column's mean are replaced with NaNs before imputation.
    out_thresh : int or float
        Number of standard deviations from the column mean beyond which a
        value is treated as an outlier.
    n_components : int
        Desired number of components in principal component analysis.
    existing_model : bool
        If True, the scaler and PCA model stored in model_file are reused
        instead of fitting new ones.
    model_file : diff_classifier.pca.Bunch
        Output of a previous pca_analysis call; only used when
        existing_model is True.

    Returns
    -------
    pcadataset : diff_classifier.pca.Bunch
        Contains outputs of PCA analysis, including:
        scaled : numpy.ndarray, shape (n, p)
            Scaled dataset with n samples and p features
        pcavals : pandas.core.frame.DataFrame, shape (n, n_components)
            Output array of n_component features of each original sample
        final : pandas.core.frame.DataFrame, shape (n, p+n_components)
            Output array with principal components appended to the
            original array.
        prcomps : pandas.core.frame.DataFrame, shape (5, n_components)
            Output array displaying the top 5 features contributing to
            each principal component.
        prvals : dict of list of str
            Output dictionary of the pca scores for the top 5 features
            contributing to each principal component.
        components : pandas.core.frame.DataFrame, shape (p, n_components)
            Raw pca scores.

    """
    pd.options.mode.chained_assignment = None  # default='warn'
    dataset_num = dataset.drop(dropcols, axis=1)
    dataset_num = dataset_num.replace([np.inf, -np.inf], np.nan)

    if rem_outliers:
        # Iteratively replace outliers with NaNs (10 passes).
        for i in range(10):
            for col in dataset_num.columns:
                xmean = np.mean(dataset_num[col])
                xstd = np.std(dataset_num[col])

                counter = 0
                for x in dataset_num[col]:
                    if x > xmean + out_thresh*xstd:
                        dataset[col][counter] = np.nan
                        dataset_num[col][counter] = np.nan
                    if x < xmean - out_thresh*xstd:
                        dataset[col][counter] = np.nan
                        dataset_num[col][counter] = np.nan
                    counter = counter + 1

    dataset_raw = dataset_num.values

    # Fill in NaN values
    if imputenans:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(dataset_raw)
        dataset_clean = imp.transform(dataset_raw)
    else:
        dataset_clean = dataset_raw

    # Scale inputs
    scaler = None  # keeps the assignment below valid when scale is False
    if scale:
        if existing_model:
            scaler = model_file.scaler
            dataset_scaled = model_file.scaler.transform(dataset_clean)
        else:
            scaler = stscale()
            scaler.fit(dataset_clean)
            dataset_scaled = scaler.transform(dataset_clean)
    else:
        dataset_scaled = dataset_clean

    pcadataset = Bunch(scaled=dataset_scaled)

    if existing_model:
        pca1 = model_file.pcamodel
    else:
        pca1 = pca(n_components=n_components)
        pca1.fit(dataset_scaled)

    if not existing_model:
        # Cumulative explained variance ratio
        cum_var = 0
        explained_v = pca1.explained_variance_ratio_
        print('Cumulative explained variance:')
        for i in range(0, n_components):
            cum_var = cum_var + explained_v[i]
            print('{} component: {}'.format(i, cum_var))

    prim_comps = {}
    pcadataset.prvals = {}
    comps = pca1.components_
    pcadataset.components = pd.DataFrame(comps.transpose())
    for num in range(0, n_components):
        # Indices of the 5 largest-magnitude loadings for this component.
        highest = np.abs(pcadataset.components[
            num]).values.argsort()[-5:][::-1]
        pels = []
        pcadataset.prvals[num] = pcadataset.components[num].values[highest]
        for col in highest:
            pels.append(dataset_num.columns[col])
        prim_comps[num] = pels

    # Main contributors to each principal component
    pcadataset.prcomps = pd.DataFrame.from_dict(prim_comps)
    pcadataset.pcavals = pd.DataFrame(pca1.transform(dataset_scaled))
    pcadataset.final = pd.concat([dataset, pcadataset.pcavals], axis=1)
    pcadataset.pcamodel = pca1
    pcadataset.scaler = scaler

    return pcadataset
```
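A usage sketch on a small synthetic DataFrame (column names are illustrative):

```python
rng = np.random.RandomState(2)
df = pd.DataFrame(rng.normal(size=(50, 6)),
                  columns=['f{}'.format(i) for i in range(6)])
df['Track_ID'] = ['t{}'.format(i) for i in range(50)]   # non-numeric column
pcadataset = pca_analysis(df, dropcols=['Track_ID'], n_components=3)
print(pcadataset.pcavals.shape)     # (50, 3)
print(pcadataset.prcomps)           # top 5 contributing features per component
```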
```python
def recycle_pcamodel(pcamodel, df, imputenans=True, scale=True):
    """Re-fits an existing PCA model on a new dataset.

    The supplied model is re-fit to df (after optional imputation and
    scaling) and the resulting principal components are appended to df.
    """
    # Fill in NaN values
    if imputenans:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(df)
        df_clean = imp.transform(df)
    else:
        df_clean = df

    # Scale inputs
    if scale:
        scaler = stscale()
        scaler.fit(df_clean)
        df_scaled = scaler.transform(df_clean)
    else:
        df_scaled = df_clean

    pcamodel.fit(df_scaled)
    pcavals = pd.DataFrame(pcamodel.transform(df_scaled))
    pcafinal = pd.concat([df, pcavals], axis=1)

    return pcafinal
```
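Note that the function re-fits the supplied model rather than only transforming with it. A sketch continuing from the pca_analysis example above:

```python
df_new = pd.DataFrame(rng.normal(size=(30, 6)),
                      columns=['f{}'.format(i) for i in range(6)])
pcafinal = recycle_pcamodel(pcadataset.pcamodel, df_new)
print(pcafinal.shape)               # (30, 9): 6 features + 3 components
```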
```python
def plot_pca(datasets, figsize=(8, 8), lwidth=8.0,
             labels=['Sample1', 'Sample2'], savefig=True, filename='test.png',
             rticks=np.linspace(-2, 2, 5), dpi=300, labelsize=20):
    """Plots the average output features from a PCA analysis in polar
    coordinates

    Parameters
    ----------
    datasets : dict of numpy.ndarray
        Dictionary with n samples and p features to plot.
    figsize : list
        Dimensions of output figure e.g. (8, 8)
    lwidth : float
        Width of plotted lines in figure
    labels : list of str
        Labels to display in legend.
    savefig : bool
        If True, saves figure
    filename : str
        Desired output filename

    """

    fig = plt.figure(figsize=figsize)
    # Number of features per dataset (assumed equal across datasets).
    for key in datasets:
        N = datasets[key].shape[0]

    color = iter(cm.viridis(np.linspace(0, 0.9, len(datasets))))
    theta = np.linspace(0.0, 2 * np.pi, N+1, endpoint=True)
    radii = {}
    bars = {}

    ax = plt.subplot(111, polar=True)
    counter = 0
    for key in datasets:
        c = next(color)
        # Close the loop by repeating the first value at theta = 2*pi.
        radii[key] = np.append(datasets[key], datasets[key][0])
        bars[key] = ax.plot(theta, radii[key], linewidth=lwidth, color=c,
                            label=labels[counter])
        counter = counter + 1
    plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.,
               frameon=False, fontsize=labelsize+4)

    ax.set_xticks(np.pi/180. * np.linspace(0, 360, N, endpoint=False))
    ax.set_xticklabels(list(range(0, N)), fontsize=labelsize)
    ax.set_ylim([min(rticks), max(rticks)+1])
    ax.set_yticks(rticks)
    ax.yaxis.set_tick_params(labelsize=labelsize)

    if savefig:
        plt.savefig(filename, bbox_inches='tight', dpi=dpi)

    plt.show()
```
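A plotting sketch with two made-up five-feature profiles (keys and labels are arbitrary):

```python
profiles = {'sample1': np.array([1.0, 0.5, -0.3, 0.8, 0.2]),
            'sample2': np.array([-0.5, 1.2, 0.4, -0.1, 0.9])}
plot_pca(profiles, labels=['Sample1', 'Sample2'], savefig=False)
```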
```python
def build_model(rawdata, feature, featvals, equal_sampling=True,
                tsize=20, from_end=True, input_cols=6, model='KNN',
                **kwargs):
    """Builds a classification model (KNN, MLP, or random forest) from an
    input dataset.

    Parameters
    ----------
    rawdata : pandas.core.frames.DataFrame
        Raw dataset of n samples and p features.
    feature : string or int
        Feature in rawdata containing output values on which the model is
        to be based.
    featvals : list of string or int
        All values that feature can take.
    equal_sampling : bool
        If True, training dataset will contain an equal number of samples
        that take each value of featvals. If False, each sample in the
        training dataset will be taken randomly from rawdata.
    tsize : int
        Size of training dataset. If equal_sampling is False, the training
        dataset will be exactly this size. If True, the training dataset
        will contain N x tsize samples where N is the number of unique
        values in featvals.
    from_end : bool
        If True, input_cols selects the training features from the end of
        rawdata e.g. rawdata[:, -6:]. If False, input_cols is read as a
        tuple e.g. rawdata[:, 10:15].
    input_cols : int or tuple
        Defined in from_end above.
    model : string
        Model type: 'KNN', 'MLP', or anything else for a random forest.
    **kwargs : variable
        n_neighbors (KNN); NNsolver, NNalpha, NNhidden_layer,
        NNrandom_state (MLP); n_estimators (random forest).

    Returns
    -------
    clf : sklearn classifier
        Fitted model
    X : numpy.ndarray
        training input dataset used to create clf
    y : numpy.ndarray
        training output dataset used to create clf

    """

    defaults = {'n_neighbors': 5, 'NNsolver': 'lbfgs', 'NNalpha': 1e-5,
                'NNhidden_layer': (5, 2), 'NNrandom_state': 1,
                'n_estimators': 10}

    for defkey in defaults:
        kwargs.setdefault(defkey, defaults[defkey])

    if equal_sampling:
        # Draw tsize random samples from each feature value.
        for featval in featvals:
            if from_end:
                test = rawdata[rawdata[feature] == featval
                               ].values[:, -input_cols:]
            else:
                test = rawdata[rawdata[feature] == featval
                               ].values[:, input_cols[0]:input_cols[1]]
            to_plot = np.array(random.sample(range(0, test.shape[0]), tsize))
            if featval == featvals[0]:
                X = test[to_plot, :]
                y = rawdata[rawdata[feature] == featval
                            ][feature].values[to_plot]
            else:
                X = np.append(X, test[to_plot, :], axis=0)
                y = np.append(y, rawdata[rawdata[feature] == featval
                                         ][feature].values[to_plot], axis=0)

    else:
        if from_end:
            test = rawdata.values[:, -input_cols:]
        else:
            test = rawdata.values[:, input_cols[0]:input_cols[1]]
        to_plot = np.array(random.sample(range(0, test.shape[0]), tsize))
        X = test[to_plot, :]
        y = rawdata[feature].values[to_plot]

    if model == 'KNN':
        clf = neighbors.KNeighborsClassifier(kwargs['n_neighbors'])
    elif model == 'MLP':
        clf = MLPClassifier(solver=kwargs['NNsolver'], alpha=kwargs['NNalpha'],
                            hidden_layer_sizes=kwargs['NNhidden_layer'],
                            random_state=kwargs['NNrandom_state'])
    else:
        clf = RandomForestClassifier(n_estimators=kwargs['n_estimators'])

    clf.fit(X, y)

    return clf, X, y
```
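A usage sketch with a synthetic two-group dataset (the group column comes first so that, with from_end=True, the last input_cols columns are the numeric features):

```python
rng = np.random.RandomState(3)
rawdata = pd.DataFrame({'group': ['PEG'] * 60 + ['PS'] * 60})
feats = pd.DataFrame(rng.normal(size=(120, 6)))
rawdata = pd.concat([rawdata, feats], axis=1)

clf, X, y = build_model(rawdata, 'group', ['PEG', 'PS'],
                        tsize=20, input_cols=6, model='KNN')
print(X.shape, y.shape)             # (40, 6) (40,): 20 samples per group
```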
```python
def predict_model(model, X, y):
    """Calculates the fraction of correct predictions from an input model

    Parameters
    ----------
    model : sklearn classifier
        Fitted model, e.g. output of build_model
    X : numpy.ndarray
        test input dataset
    y : numpy.ndarray
        test output dataset

    Returns
    -------
    pcorrect : float
        Fraction of correctly predicted outputs using the input model and
        the input test dataset X and y

    """
    yp = model.predict(X)
    correct = np.zeros(y.shape[0])
    for i in range(0, y.shape[0]):
        if y[i] == yp[i]:
            correct[i] = 1

    pcorrect = np.average(correct)
    return pcorrect
```
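Continuing the sketch above, the training accuracy of the fitted classifier:

```python
pcorrect = predict_model(clf, X, y)
print('Fraction correct: {}'.format(pcorrect))
```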
```python
def feature_violin(df, label='label', lvals=['yes', 'no'], fsubset=3,
                   **kwargs):
    """Creates violinplot of input feature dataset

    Designed to plot PCA components from pca_analysis.

    Parameters
    ----------
    df : pandas.core.frames.DataFrame
        Must contain a group name column, and numerical feature columns.
    label : string or int
        Name of group column.
    lvals : list of string or int
        All values that group column can take
    fsubset : int or list of int
        Features to be plotted. If integer, will plot range(fsubset).
        If list, will only plot features contained in fsubset.
    **kwargs : variable
        figsize : tuple of int or float
            Dimensions of output figure
        yrange : list of int or float
            Range of y axis
        xlabel : string
            Label of x axis
        labelsize : int or float
            Font size of x label
        ticksize : int or float
            Font size of y tick labels
        fname : None or string
            Name of output file
        legendfontsize : int or float
            Font size of legend
        legendloc : int
            Location of legend in plot e.g. 1, 2, 3, 4
        dpi : int
            Resolution of saved figure

    """

    defaults = {'figsize': (12, 5), 'yrange': [-20, 20], 'xlabel': 'Feature',
                'labelsize': 20, 'ticksize': 16, 'fname': None,
                'legendfontsize': 12, 'legendloc': 1, 'dpi': 300}

    for defkey in defaults:
        kwargs.setdefault(defkey, defaults[defkey])

    # Restacking input data into long form for seaborn
    groupsize = []
    featcol = []
    valcol = []

    if isinstance(fsubset, int):
        frange = range(fsubset)
    else:
        frange = fsubset

    for feat in frange:
        groupsize.extend(df[label].values)
        featcol.extend([feat]*df[label].values.shape[0])
        valcol.extend(df[feat].values)

    to_violind = {'label': groupsize, 'Feature': featcol,
                  'Feature Value': valcol}
    to_violin = pd.DataFrame(data=to_violind)

    # Plotting function
    fig, ax = plt.subplots(figsize=kwargs['figsize'])
    sns.violinplot(x="Feature", y="Feature Value", hue="label",
                   data=to_violin, palette="Pastel1", hue_order=lvals)

    # kwargs
    ax.tick_params(axis='both', which='major', labelsize=kwargs['ticksize'])
    plt.xlabel(kwargs['xlabel'], fontsize=kwargs['labelsize'])
    plt.ylabel('', fontsize=kwargs['labelsize'])
    plt.ylim(kwargs['yrange'])
    plt.legend(loc=kwargs['legendloc'],
               prop={'size': kwargs['legendfontsize']})
    if kwargs['fname'] is None:
        plt.show()
    else:
        plt.savefig(kwargs['fname'], dpi=kwargs['dpi'])

    return to_violin
```
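A usage sketch; with an integer fsubset the feature columns must be named 0..fsubset-1, which is how pca_analysis names its component columns:

```python
rng = np.random.RandomState(4)
df = pd.DataFrame(rng.normal(size=(100, 3)))        # columns 0, 1, 2
df['label'] = ['yes'] * 50 + ['no'] * 50
to_violin = feature_violin(df, label='label', lvals=['yes', 'no'],
                           fsubset=3, yrange=[-4, 4])
```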
```python
def feature_plot_2D(dataset, label, features=[0, 1], lvals=['PEG', 'PS'],
                    randsel=True, randcount=200, **kwargs):
    """Plots two features against each other from feature dataset.

    Parameters
    ----------
    dataset : pandas.core.frames.DataFrame
        Must contain a group column and numerical feature columns
    label : string or int
        Group column name
    features : list of int
        Names of columns to be plotted
    lvals : list of string or int
        All values that the group column can take
    randsel : bool
        If True, downsamples from original dataset
    randcount : int
        Size of downsampled dataset
    **kwargs : variable
        figsize : tuple of int or float
            Size of output figure
        dotsize : float or int
            Size of plotting markers
        alpha : float or int
            Transparency factor
        xlim : list of float or int
            X range of output plot
        ylim : list of float or int
            Y range of output plot
        legendfontsize : float or int
            Font size of legend
        labelfontsize : float or int
            Font size of labels
        fname : string
            Filename of output figure

    Returns
    -------
    xy : list of lists
        Coordinates of data on plot

    """
    defaults = {'figsize': (8, 8), 'dotsize': 70, 'alpha': 0.7, 'xlim': None,
                'ylim': None, 'legendfontsize': 12, 'labelfontsize': 20,
                'fname': None, 'legendloc': 2}

    for defkey in defaults:
        kwargs.setdefault(defkey, defaults[defkey])

    # Split the dataset into one group per value in lvals.
    tgroups = {}
    counter = 0
    for lval in lvals:
        tgroups[counter] = dataset[dataset[label] == lval]
        counter = counter + 1

    N = len(tgroups)
    color = iter(cm.viridis(np.linspace(0, 0.9, N)))

    fig = plt.figure(figsize=kwargs['figsize'])
    ax1 = fig.add_subplot(111)
    counter = 0
    for key in tgroups:
        c = next(color)
        xy = []
        if randsel:
            to_plot = random.sample(range(0, len(tgroups[key][0].tolist())),
                                    randcount)
            for key2 in features:
                xy.append(list(tgroups[key][key2].tolist()[i]
                               for i in to_plot))
        else:
            for key2 in features:
                xy.append(tgroups[key][key2])
        # Legend entries follow the order of lvals.
        ax1 = plt.scatter(xy[0], xy[1], c=c, s=kwargs['dotsize'],
                          alpha=kwargs['alpha'], label=lvals[counter])
        counter = counter + 1

    if kwargs['xlim'] is not None:
        plt.xlim(kwargs['xlim'])
    if kwargs['ylim'] is not None:
        plt.ylim(kwargs['ylim'])

    plt.legend(fontsize=kwargs['legendfontsize'], frameon=False,
               borderaxespad=0., bbox_to_anchor=(1.05, 1))
    plt.xlabel('Prin. Component {}'.format(features[0]),
               fontsize=kwargs['labelfontsize'])
    plt.ylabel('Prin. Component {}'.format(features[1]),
               fontsize=kwargs['labelfontsize'])

    if kwargs['fname'] is None:
        plt.show()
    else:
        plt.savefig(kwargs['fname'])

    return xy
```
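A sketch using the same style of frame; note that the downsampling path indexes column 0 to get the group size, so a column named 0 must exist:

```python
rng = np.random.RandomState(5)
df = pd.DataFrame(rng.normal(size=(100, 2)))        # columns 0 and 1
df['label'] = ['PEG'] * 50 + ['PS'] * 50
xy = feature_plot_2D(df, label='label', features=[0, 1],
                     lvals=['PEG', 'PS'], randcount=40)
```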
```python
def feature_plot_3D(dataset, label, features=[0, 1, 2], lvals=['PEG', 'PS'],
                    randsel=True, randcount=200, **kwargs):
    """Plots three features against each other from feature dataset.

    Parameters
    ----------
    dataset : pandas.core.frames.DataFrame
        Must contain a group column and numerical feature columns
    label : string or int
        Group column name
    features : list of int
        Names of columns to be plotted
    lvals : list of string or int
        All values that the group column can take
    randsel : bool
        If True, downsamples from original dataset
    randcount : int
        Size of downsampled dataset
    **kwargs : variable
        figsize : tuple of int or float
            Size of output figure
        dotsize : float or int
            Size of plotting markers
        alpha : float or int
            Transparency factor
        xlim : list of float or int
            X range of output plot
        ylim : list of float or int
            Y range of output plot
        zlim : list of float or int
            Z range of output plot
        legendfontsize : float or int
            Font size of legend
        labelfontsize : float or int
            Font size of labels
        fname : string
            Filename of output figure

    Returns
    -------
    xy : list of lists
        Coordinates of data on plot

    """

    defaults = {'figsize': (8, 8), 'dotsize': 70, 'alpha': 0.7, 'xlim': None,
                'ylim': None, 'zlim': None, 'legendfontsize': 12,
                'labelfontsize': 10, 'fname': None, 'dpi': 300,
                'noticks': True, 'ticksize': 10}

    for defkey in defaults:
        kwargs.setdefault(defkey, defaults[defkey])

    # Four 3D subplots showing the same data from different viewing angles.
    axes = {}
    fig = plt.figure(figsize=(14, 14))
    axes[1] = fig.add_subplot(221, projection='3d')
    axes[2] = fig.add_subplot(222, projection='3d')
    axes[3] = fig.add_subplot(223, projection='3d')
    axes[4] = fig.add_subplot(224, projection='3d')
    angle1 = [60, 0, 0, 0]
    angle2 = [240, 240, 10, 190]

    tgroups = {}
    counter = 0
    for lval in lvals:
        tgroups[counter] = dataset[dataset[label] == lval]
        counter = counter + 1

    N = len(tgroups)
    color = iter(cm.viridis(np.linspace(0, 0.9, N)))

    counter = 0
    for key in tgroups:
        c = next(color)
        xy = []
        if randsel:
            to_plot = random.sample(range(0, len(tgroups[key][0].tolist())),
                                    randcount)
            for key2 in features:
                xy.append(list(tgroups[key][key2].tolist()[i]
                               for i in to_plot))
        else:
            for key2 in features:
                xy.append(tgroups[key][key2])

        acount = 0
        for ax in axes:
            axes[ax].scatter(xy[0], xy[1], xy[2], c=c, s=kwargs['dotsize'],
                             alpha=kwargs['alpha'])
            if kwargs['xlim'] is not None:
                axes[ax].set_xlim3d(kwargs['xlim'][0], kwargs['xlim'][1])
            if kwargs['ylim'] is not None:
                axes[ax].set_ylim3d(kwargs['ylim'][0], kwargs['ylim'][1])
            if kwargs['zlim'] is not None:
                axes[ax].set_zlim3d(kwargs['zlim'][0], kwargs['zlim'][1])
            axes[ax].view_init(angle1[acount], angle2[acount])
            axes[ax].set_xlabel('{}'.format(features[0]),
                                fontsize=kwargs['labelfontsize'])
            axes[ax].set_ylabel('{}'.format(features[1]),
                                fontsize=kwargs['labelfontsize'])
            axes[ax].set_zlabel('{}'.format(features[2]),
                                fontsize=kwargs['labelfontsize'])
            if kwargs['noticks']:
                axes[ax].set_xticklabels('')
                axes[ax].set_yticklabels('')
                axes[ax].set_zticklabels('')
            else:
                axes[ax].xaxis.set_tick_params(labelsize=kwargs['ticksize'])
                axes[ax].yaxis.set_tick_params(labelsize=kwargs['ticksize'])
                axes[ax].zaxis.set_tick_params(labelsize=kwargs['ticksize'])
            acount = acount + 1
        counter = counter + 1

    axes[3].set_xticks([])
    axes[4].set_xticks([])

    if kwargs['fname'] is None:
        plt.show()
    else:
        plt.savefig(kwargs['fname'], dpi=kwargs['dpi'])
```
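And a matching sketch for the 3D variant, which renders the same scatter from four viewing angles:

```python
rng = np.random.RandomState(6)
df = pd.DataFrame(rng.normal(size=(100, 3)))        # columns 0, 1, 2
df['label'] = ['PEG'] * 50 + ['PS'] * 50
feature_plot_3D(df, label='label', features=[0, 1, 2],
                lvals=['PEG', 'PS'], randcount=40)
```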