Total Complexity | 63
Total Lines      | 741
Duplicated Lines | 93.79 %
Changes          | 0
Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions, are:
Complex classes like diff_classifier.pca often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
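To make this concrete, here is a minimal sketch of the Extract Class refactoring in Python; the class and field names are hypothetical and are not taken from diff_classifier:

# Before: one class mixes plotting settings with model state, and the
# "plot_" prefix marks a cohesive component.
#
# class Analysis:
#     def __init__(self):
#         self.plot_figsize = (8, 8)
#         self.plot_lwidth = 8.0
#         self.model = None

# After: the prefixed fields move into their own class.
class PlotConfig:
    def __init__(self, figsize=(8, 8), lwidth=8.0):
        self.figsize = figsize
        self.lwidth = lwidth


class Analysis:
    def __init__(self):
        self.plot = PlotConfig()  # extracted, cohesive component
        self.model = None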
1 | """Performs principle component analysis on input datasets. |
||
2 | |||
3 | This module performs principle component analysis on input datasets using |
||
4 | functions from scikit-learn. It is optimized to data formats used in |
||
5 | diff_classifier, but can potentially be extended to other applications. |
||
6 | |||
7 | """ |
||
8 | |||
9 | import random |
||
10 | import pandas as pd |
||
11 | import numpy as np |
||
12 | from scipy import stats, linalg |
||
13 | import seaborn as sns |
||
14 | from sklearn import neighbors |
||
15 | from sklearn.decomposition import PCA as pca |
||
16 | from sklearn.preprocessing import StandardScaler as stscale |
||
17 | from sklearn.preprocessing import Imputer |
||
18 | import matplotlib.pyplot as plt |
||
19 | from matplotlib.pyplot import cm |
||
20 | from mpl_toolkits.mplot3d import Axes3D |
||
21 | |||
22 | |||
class Bunch:
    """Simple attribute-style container for grouped return values."""

    def __init__(self, **kwds):
        self.__dict__.update(kwds)

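As a quick illustration (not part of the module), keyword arguments passed to Bunch become attributes:

b = Bunch(scaled=np.zeros((3, 2)), pcavals=None)
print(b.scaled.shape)  # prints (3, 2)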
def partial_corr(mtrx):
    """Calculates linear partial correlation coefficients

    Returns the sample linear partial correlation coefficients between pairs
    of variables in mtrx, controlling for the remaining variables in mtrx.

    Parameters
    ----------
    mtrx : array-like, shape (n, p)
        Array with the different variables. Each column of mtrx is taken as a
        variable.

    Returns
    -------
    P : array-like, shape (p, p)
        P[i, j] contains the partial correlation of mtrx[:, i] and mtrx[:, j]
        controlling for the remaining variables in mtrx.

    Notes
    -----
    Partial Correlation in Python (clone of Matlab's partialcorr)

    This uses the linear regression approach to compute the partial
    correlation (might be slow for a huge number of variables). The
    algorithm is detailed here:

    http://en.wikipedia.org/wiki/Partial_correlation#Using_linear_regression

    Taking X and Y as the two variables of interest and Z the matrix with all
    the variables minus {X, Y}, the algorithm can be summarized as:

    1) perform a normal linear least-squares regression with X as the target
       and Z as the predictor
    2) calculate the residuals in Step #1
    3) perform a normal linear least-squares regression with Y as the target
       and Z as the predictor
    4) calculate the residuals in Step #3
    5) calculate the correlation coefficient between the residuals from
       Steps #2 and #4

    The result is the partial correlation between X and Y while controlling
    for the effect of Z.

    Adapted from code by Fabian Pedregosa-Izquierdo:
    Date: Nov 2014
    Author: Fabian Pedregosa-Izquierdo, [email protected]
    Testing: Valentina Borghesani, [email protected]

    """

    mtrx = np.asarray(mtrx)
    pfeat = mtrx.shape[1]
    pcorr = np.zeros((pfeat, pfeat), dtype=float)
    for i in range(pfeat):
        pcorr[i, i] = 1
        for j in range(i+1, pfeat):
            # Regress columns i and j on all remaining columns.
            idx = np.ones(pfeat, dtype=bool)
            idx[i] = False
            idx[j] = False
            beta_j = linalg.lstsq(mtrx[:, idx], mtrx[:, j])[0]
            beta_i = linalg.lstsq(mtrx[:, idx], mtrx[:, i])[0]

            res_j = mtrx[:, j] - mtrx[:, idx].dot(beta_j)
            res_i = mtrx[:, i] - mtrx[:, idx].dot(beta_i)

            # The partial correlation is the correlation of the residuals.
            corr = stats.pearsonr(res_i, res_j)[0]
            pcorr[i, j] = corr
            pcorr[j, i] = corr

    return pcorr

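As a sanity check (illustrative only, not part of the module), two variables that are correlated only through a shared confounder should show a near-zero partial correlation once the confounder is controlled for:

rng = np.random.RandomState(0)
z = rng.randn(500)
x = z + 0.1 * rng.randn(500)  # x and y co-vary only through z
y = z + 0.1 * rng.randn(500)
data = np.column_stack([x, y, z])
print(np.corrcoef(x, y)[0, 1])   # raw correlation: close to 1
print(partial_corr(data)[0, 1])  # controlling for z: close to 0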
def kmo(dataset):
    """Calculates the Kaiser-Meyer-Olkin measure on an input dataset

    Parameters
    ----------
    dataset : array-like, shape (n, p)
        Array containing n samples and p features. Must have no NaNs.
        Ideally scaled before performing test.

    Returns
    -------
    kmostat : float
        KMO test value

    Notes
    -----
    Based on calculations shown here:

    http://www.statisticshowto.com/kaiser-meyer-olkin/

    -- 0.00-0.49 unacceptable
    -- 0.50-0.59 miserable
    -- 0.60-0.69 mediocre
    -- 0.70-0.79 middling
    -- 0.80-0.89 meritorious
    -- 0.90-1.00 marvelous

    """

    # Correlation matrix and the partial correlation matrix.
    corrmatrix = np.corrcoef(dataset.transpose())
    pcorr = partial_corr(dataset)

    # Calculation of the KMO statistic. Both the correlations and the
    # partial correlations are squared before summing the off-diagonals.
    matrix = np.multiply(corrmatrix, corrmatrix)
    pmatrix = np.multiply(pcorr, pcorr)
    rij = np.sum(matrix) - np.trace(matrix)
    uij = np.sum(pmatrix) - np.trace(pmatrix)
    kmostat = rij/(rij+uij)
    print(kmostat)
    return kmostat

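A hypothetical check (not part of the module): features built as noisy mixtures of a few latent factors share a lot of common variance, so their KMO statistic should be well above that of independent noise:

rng = np.random.RandomState(0)
base = rng.randn(300, 2)
# Six features, each a noisy copy of one of two latent factors.
feats = np.hstack([base + 0.3 * rng.randn(300, 2) for _ in range(3)])
feats = (feats - feats.mean(axis=0)) / feats.std(axis=0)
kmostat = kmo(feats)  # prints and returns the KMO statistic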
def pca_analysis(dataset, dropcols=None, imputenans=True, scale=True,
                 rem_outliers=True, out_thresh=10, n_components=5):
    """Performs a principal component analysis on an input dataset

    Parameters
    ----------
    dataset : pandas.core.frame.DataFrame, shape (n, p)
        Input dataset with n samples and p features
    dropcols : list, optional
        Columns to exclude from pca analysis. At a minimum, user must exclude
        non-numeric columns.
    imputenans : bool
        If True, impute NaN values as column means.
    scale : bool
        If True, columns will be scaled to a mean of zero and a standard
        deviation of 1.
    rem_outliers : bool
        If True, iteratively replaces values more than out_thresh standard
        deviations from the column mean with NaNs.
    out_thresh : int or float
        Outlier threshold, in standard deviations from the column mean.
    n_components : int
        Desired number of components in principal component analysis.

    Returns
    -------
    pcadataset : diff_classifier.pca.Bunch
        Contains outputs of PCA analysis, including:
        scaled : numpy.ndarray, shape (n, p)
            Scaled dataset with n samples and p features
        pcavals : pandas.core.frame.DataFrame, shape (n, n_components)
            Output array of n_component features of each original sample
        final : pandas.core.frame.DataFrame, shape (n, p+n_components)
            Output array with principal components appended to the original
            array.
        prcomps : pandas.core.frame.DataFrame, shape (5, n_components)
            Output array displaying the top 5 features contributing to each
            principal component.
        prvals : dict of list of str
            Output dictionary of the pca scores for the top 5 features
            contributing to each principal component.
        components : pandas.core.frame.DataFrame, shape (p, n_components)
            Raw pca scores.

    """
    pd.options.mode.chained_assignment = None  # default='warn'
    if dropcols is None:
        dropcols = []
    dataset_num = dataset.drop(dropcols, axis=1)

    # Replace extreme outliers with NaNs, repeating as the column statistics
    # shift after each pass.
    if rem_outliers:
        for i in range(10):
            for col in dataset_num.columns:
                xmean = np.mean(dataset_num[col])
                xstd = np.std(dataset_num[col])

                counter = 0
                for x in dataset_num[col]:
                    if x > xmean + out_thresh*xstd:
                        dataset[col][counter] = np.nan
                        dataset_num[col][counter] = np.nan
                    if x < xmean - out_thresh*xstd:
                        dataset[col][counter] = np.nan
                        dataset_num[col][counter] = np.nan
                    counter = counter + 1

    dataset_raw = dataset_num.values

    # Fill in NaN values
    if imputenans:
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp.fit(dataset_raw)
        dataset_clean = imp.transform(dataset_raw)
    else:
        dataset_clean = dataset_raw

    # Scale inputs
    if scale:
        scaler = stscale()
        scaler.fit(dataset_clean)
        dataset_scaled = scaler.transform(dataset_clean)
    else:
        dataset_scaled = dataset_clean

    pcadataset = Bunch(scaled=dataset_scaled)
    pca1 = pca(n_components=n_components)
    pca1.fit(dataset_scaled)

    # Cumulative explained variance ratio
    cum_var = 0
    explained_v = pca1.explained_variance_ratio_
    print('Cumulative explained variance:')
    for i in range(0, n_components):
        cum_var = cum_var + explained_v[i]
        print('{} component: {}'.format(i, cum_var))

    # Top 5 features contributing to each principal component.
    prim_comps = {}
    pcadataset.prvals = {}
    comps = pca1.components_
    pcadataset.components = pd.DataFrame(comps.transpose())
    for num in range(0, n_components):
        highest = np.abs(pcadataset.components[
            num]).values.argsort()[-5:][::-1]
        pels = []
        pcadataset.prvals[num] = pcadataset.components[num].values[highest]
        for col in highest:
            pels.append(dataset_num.columns[col])
        prim_comps[num] = pels

    # Main contributors to each principal component
    pcadataset.prcomps = pd.DataFrame.from_dict(prim_comps)
    pcadataset.pcavals = pd.DataFrame(pca1.transform(dataset_scaled))
    pcadataset.final = pd.concat([dataset, pcadataset.pcavals], axis=1)
    pcadataset.pcamodel = pca1

    return pcadataset

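A hypothetical usage sketch; the column and group names below are invented for illustration:

df = pd.DataFrame(np.random.randn(100, 6),
                  columns=['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])
df['group'] = ['a', 'b'] * 50
pcadataset = pca_analysis(df, dropcols=['group'], n_components=3)
print(pcadataset.pcavals.shape)  # (100, 3)
print(pcadataset.prcomps)        # top-5 contributing features per component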
def recycle_pcamodel(pcamodel, df, imputenans=True, scale=True):
    """Applies a PCA model from a previous analysis to a new dataset.

    Note: the model is re-fit on the new data before transforming it, so
    the principal components are re-estimated rather than frozen.
    """
    # Fill in NaN values
    if imputenans:
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp.fit(df)
        df_clean = imp.transform(df)
    else:
        df_clean = df

    # Scale inputs
    if scale:
        scaler = stscale()
        scaler.fit(df_clean)
        df_scaled = scaler.transform(df_clean)
    else:
        df_scaled = df_clean

    pcamodel.fit(df_scaled)
    pcavals = pd.DataFrame(pcamodel.transform(df_scaled))
    pcafinal = pd.concat([df, pcavals], axis=1)

    return pcafinal

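Continuing the hypothetical sketch above, the fitted model stored on the Bunch can be handed to new data:

newdf = pd.DataFrame(np.random.randn(50, 6),
                     columns=['f1', 'f2', 'f3', 'f4', 'f5', 'f6'])
newfinal = recycle_pcamodel(pcadataset.pcamodel, newdf)

Because the function calls fit before transform, the components are re-estimated on newdf; to apply the original components unchanged, one would call transform without re-fitting.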
def plot_pca(datasets, figsize=(8, 8), lwidth=8.0,
             labels=['Sample1', 'Sample2'], savefig=True, filename='test.png',
             rticks=np.linspace(-2, 2, 5)):
    """Plots the average output features from a PCA analysis in polar
    coordinates

    Parameters
    ----------
    datasets : dict of numpy.ndarray
        Dictionary with n samples and p features to plot.
    figsize : tuple of int or float
        Dimensions of output figure e.g. (8, 8)
    lwidth : float
        Width of plotted lines in figure
    labels : list of str
        Labels to display in legend.
    savefig : bool
        If True, saves figure
    filename : str
        Desired output filename
    rticks : numpy.ndarray
        Radial tick locations; also sets the radial limits.

    """

    fig = plt.figure(figsize=figsize)
    # All entries in datasets are assumed to have the same number of features.
    N = next(iter(datasets.values())).shape[0]
    color = iter(cm.viridis(np.linspace(0, 0.9, len(datasets))))

    theta = np.linspace(0.0, 2 * np.pi, N+1, endpoint=True)
    radii = {}
    bars = {}

    ax = plt.subplot(111, polar=True)
    counter = 0
    for key in datasets:
        c = next(color)
        # Close the loop by repeating the first value at theta = 2*pi.
        radii[key] = np.append(datasets[key], datasets[key][0])
        bars[key] = ax.plot(theta, radii[key], linewidth=lwidth, color=c,
                            label=labels[counter])
        counter = counter + 1
    plt.legend(bbox_to_anchor=(0.90, 1), loc=2, borderaxespad=0.,
               frameon=False, fontsize=20)

    ax.set_xticks(np.pi/180. * np.linspace(0, 360, N, endpoint=False))
    ax.set_xticklabels(list(range(0, N)))
    ax.set_ylim([min(rticks), max(rticks)])
    ax.set_yticks(rticks)

    if savefig:
        plt.savefig(filename, bbox_inches='tight')

    plt.show()

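A hypothetical call, using small hand-made per-group feature averages:

avgs = {'a': np.array([0.5, -0.2, 0.1]),
        'b': np.array([-0.4, 0.3, 0.0])}
plot_pca(avgs, labels=['a', 'b'], savefig=False)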
def build_KNN_model(rawdata, feature, featvals, equal_sampling=True,
                    tsize=20, n_neighbors=5, from_end=True, input_cols=6):
    """Builds a K-nearest neighbor model using an input dataset.

    Parameters
    ----------
    rawdata : pandas.core.frames.DataFrame
        Raw dataset of n samples and p features.
    feature : string or int
        Feature in rawdata containing output values on which KNN
        model is to be based.
    featvals : list of string or int
        All values that feature can take.
    equal_sampling : bool
        If True, training dataset will contain an equal number
        of samples that take each value of featvals. If False,
        each sample in training dataset will be taken randomly
        from rawdata.
    tsize : int
        Size of training dataset. If equal_sampling is False,
        training dataset will be exactly this size. If True,
        training dataset will contain N x tsize samples, where N
        is the number of unique values in featvals.
    n_neighbors : int
        Number of nearest neighbors to be used in KNN
        algorithm.
    from_end : bool
        If True, input_cols selects features to be used as
        training data from the end of rawdata e.g.
        rawdata[:, -6:]. If False, input_cols will be read
        as a tuple e.g. rawdata[:, 10:15].
    input_cols : int or tuple
        Defined in from_end above.

    Returns
    -------
    clf : sklearn.neighbors.KNeighborsClassifier
        KNN model
    X : numpy.ndarray
        training input dataset used to create clf
    y : numpy.ndarray
        training output dataset used to create clf

    """

    if equal_sampling:
        # Draw tsize random samples for each feature value in turn.
        for featval in featvals:
            subset = rawdata[rawdata[feature] == featval]
            if from_end:
                test = subset.values[:, -input_cols:]
            else:
                test = subset.values[:, input_cols[0]:input_cols[1]]
            to_plot = np.array(random.sample(range(0, test.shape[0]), tsize))
            if featval == featvals[0]:
                X = test[to_plot, :]
                y = subset[feature].values[to_plot]
            else:
                X = np.append(X, test[to_plot, :], axis=0)
                y = np.append(y, subset[feature].values[to_plot], axis=0)

    else:
        if from_end:
            test = rawdata.values[:, -input_cols:]
        else:
            test = rawdata.values[:, input_cols[0]:input_cols[1]]
        to_plot = np.array(random.sample(range(0, test.shape[0]), tsize))
        X = test[to_plot, :]
        y = rawdata[feature].values[to_plot]

    clf = neighbors.KNeighborsClassifier(n_neighbors)
    clf.fit(X, y)

    return clf, X, y

def predict_KNN(model, X, y):
    """Calculates fraction correctly predicted using input KNN model

    Parameters
    ----------
    model : sklearn.neighbors.KNeighborsClassifier
        KNN model
    X : numpy.ndarray
        input features to predict on
    y : numpy.ndarray
        true output values corresponding to X

    Returns
    -------
    pcorrect : float
        Fraction of correctly predicted outputs using the
        input KNN model and the input test dataset X and y

    """
    yp = model.predict(X)
    # Fraction of predictions that match the true labels.
    pcorrect = np.average(yp == y)
    return pcorrect

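Continuing the hypothetical pca_analysis sketch, the PCA features appended as the last three columns of pcadataset.final can feed both KNN helpers:

clf, X, y = build_KNN_model(pcadataset.final, 'group', ['a', 'b'],
                            tsize=20, input_cols=3)
print(predict_KNN(clf, X, y))  # fraction correct on the training sample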
def feature_violin(df, label='label', lvals=['yes', 'no'], fsubset=3,
                   **kwargs):
    """Creates violinplot of input feature dataset

    Designed to plot PCA components from pca_analysis.

    Parameters
    ----------
    df : pandas.core.frames.DataFrame
        Must contain a group name column, and numerical feature columns.
    label : string or int
        Name of group column.
    lvals : list of string or int
        All values that group column can take
    fsubset : int or list of int
        Features to be plotted. If integer, will plot range(fsubset).
        If list, will only plot features contained in fsubset.
    **kwargs : variable
        figsize : tuple of int or float
            Dimensions of output figure
        yrange : list of int or float
            Range of y axis
        xlabel : string
            Label of x axis
        labelsize : int or float
            Font size of x label
        ticksize : int or float
            Font size of y tick labels
        fname : None or string
            Name of output file
        legendfontsize : int or float
            Font size of legend
        legendloc : int
            Location of legend in plot e.g. 1, 2, 3, 4

    """

    defaults = {'figsize': (12, 5), 'yrange': [-20, 20], 'xlabel': 'Feature',
                'labelsize': 20, 'ticksize': 16, 'fname': None,
                'legendfontsize': 12, 'legendloc': 1}

    for defkey in defaults.keys():
        if defkey not in kwargs.keys():
            kwargs[defkey] = defaults[defkey]

    # Restacking input data into long form: one row per (group, feature) pair.
    groupsize = []
    featcol = []
    valcol = []

    if isinstance(fsubset, int):
        frange = range(fsubset)
    else:
        frange = fsubset

    for feat in frange:
        groupsize.extend(df[label].values)
        featcol.extend([feat]*df[label].values.shape[0])
        valcol.extend(df[feat].values)

    to_violind = {'label': groupsize, 'Feature': featcol,
                  'Feature Value': valcol}
    to_violin = pd.DataFrame(data=to_violind)

    # Plotting function; the figure size is set via plt.subplots.
    fig, ax = plt.subplots(figsize=kwargs['figsize'])
    sns.violinplot(x="Feature", y="Feature Value", hue="label",
                   data=to_violin, palette="Pastel1", hue_order=lvals)

    # kwargs
    ax.tick_params(axis='both', which='major', labelsize=kwargs['ticksize'])
    plt.xlabel(kwargs['xlabel'], fontsize=kwargs['labelsize'])
    plt.ylabel('', fontsize=kwargs['labelsize'])
    plt.ylim(kwargs['yrange'])
    plt.legend(loc=kwargs['legendloc'],
               prop={'size': kwargs['legendfontsize']})
    if kwargs['fname'] is None:
        plt.show()
    else:
        plt.savefig(kwargs['fname'])

    return to_violin

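Continuing the same hypothetical sketch, the first three PCA components can be compared across groups:

to_violin = feature_violin(pcadataset.final, label='group',
                           lvals=['a', 'b'], fsubset=3, yrange=[-5, 5])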
def feature_plot_2D(dataset, label, features=[0, 1], randsel=True,
                    randcount=200, **kwargs):
    """Plots two features against each other from feature dataset.

    Parameters
    ----------
    dataset : pandas.core.frames.DataFrame
        Must contain a group column and numerical features columns
    label : string or int
        Group column name
    features : list of int
        Names of columns to be plotted
    randsel : bool
        If True, downsamples from original dataset
    randcount : int
        Size of downsampled dataset
    **kwargs : variable
        figsize : tuple of int or float
            Size of output figure
        dotsize : float or int
            Size of plotting markers
        alpha : float or int
            Transparency factor
        xlim : list of float or int
            X range of output plot
        ylim : list of float or int
            Y range of output plot
        legendfontsize : float or int
            Font size of legend
        labelfontsize : float or int
            Font size of labels
        fname : string
            Filename of output figure

    Returns
    -------
    xy : list of lists
        Coordinates of the last group plotted

    """
    defaults = {'figsize': (8, 8), 'dotsize': 70, 'alpha': 0.7, 'xlim': None,
                'ylim': None, 'legendfontsize': 12, 'labelfontsize': 20,
                'fname': None}

    for defkey in defaults.keys():
        if defkey not in kwargs.keys():
            kwargs[defkey] = defaults[defkey]

    # Split the dataset into one group per unique label value.
    tgroups = {}
    counter = 0
    labels = dataset[label].unique()
    for lval in labels:
        tgroups[counter] = dataset[dataset[label] == lval]
        counter = counter + 1

    N = len(tgroups)
    color = iter(cm.viridis(np.linspace(0, 0.9, N)))

    fig = plt.figure(figsize=kwargs['figsize'])
    ax1 = fig.add_subplot(111)
    counter = 0
    for key in tgroups:
        c = next(color)
        xy = []
        if randsel:
            # Downsample each group to randcount points.
            to_plot = random.sample(range(0, len(tgroups[key])), randcount)
            for key2 in features:
                xy.append(list(tgroups[key][key2].tolist()[i]
                               for i in to_plot))
        else:
            for key2 in features:
                xy.append(tgroups[key][key2])
        ax1 = plt.scatter(xy[0], xy[1], c=c, s=kwargs['dotsize'],
                          alpha=kwargs['alpha'], label=labels[counter])
        counter = counter + 1

    if kwargs['xlim'] is not None:
        plt.xlim(kwargs['xlim'])
    if kwargs['ylim'] is not None:
        plt.ylim(kwargs['ylim'])

    plt.legend(fontsize=kwargs['legendfontsize'], frameon=False)
    plt.xlabel('Prin. Component {}'.format(features[0]),
               fontsize=kwargs['labelfontsize'])
    plt.ylabel('Prin. Component {}'.format(features[1]),
               fontsize=kwargs['labelfontsize'])

    if kwargs['fname'] is None:
        plt.show()
    else:
        plt.savefig(kwargs['fname'])

    return xy

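Continuing the hypothetical sketch, a 2D scatter of the first two components, downsampled to 40 points per group:

xy = feature_plot_2D(pcadataset.final, label='group', features=[0, 1],
                     randsel=True, randcount=40)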
def feature_plot_3D(dataset, label, features=[0, 1, 2], randsel=True,
                    randcount=200, **kwargs):
    """Plots three features against each other from feature dataset.

    Parameters
    ----------
    dataset : pandas.core.frames.DataFrame
        Must contain a group column and numerical features columns
    label : string or int
        Group column name
    features : list of int
        Names of columns to be plotted
    randsel : bool
        If True, downsamples from original dataset
    randcount : int
        Size of downsampled dataset
    **kwargs : variable
        figsize : tuple of int or float
            Size of output figure
        dotsize : float or int
            Size of plotting markers
        alpha : float or int
            Transparency factor
        xlim : list of float or int
            X range of output plot
        ylim : list of float or int
            Y range of output plot
        zlim : list of float or int
            Z range of output plot
        legendfontsize : float or int
            Font size of legend
        labelfontsize : float or int
            Font size of labels
        fname : string
            Filename of output figure

    Returns
    -------
    xy : list of lists
        Coordinates of the last group plotted

    """
    defaults = {'figsize': (14, 14), 'dotsize': 70, 'alpha': 0.7,
                'xlim': None, 'ylim': None, 'zlim': None,
                'legendfontsize': 12, 'labelfontsize': 10, 'fname': None}

    for defkey in defaults.keys():
        if defkey not in kwargs.keys():
            kwargs[defkey] = defaults[defkey]

    # Four 3D panels, each showing the same data from a different angle.
    axes = {}
    fig = plt.figure(figsize=kwargs['figsize'])
    axes[1] = fig.add_subplot(221, projection='3d')
    axes[2] = fig.add_subplot(222, projection='3d')
    axes[3] = fig.add_subplot(223, projection='3d')
    axes[4] = fig.add_subplot(224, projection='3d')
    angle1 = [60, 0, 0, 0]
    angle2 = [240, 240, 10, 190]

    # Split the dataset into one group per unique label value.
    tgroups = {}
    counter = 0
    labels = dataset[label].unique()
    for lval in labels:
        tgroups[counter] = dataset[dataset[label] == lval]
        counter = counter + 1

    N = len(tgroups)
    color = iter(cm.viridis(np.linspace(0, 0.9, N)))

    counter = 0
    for key in tgroups:
        c = next(color)
        xy = []
        if randsel:
            # Downsample each group to randcount points.
            to_plot = random.sample(range(0, len(tgroups[key])), randcount)
            for key2 in features:
                xy.append(list(tgroups[key][key2].tolist()[i]
                               for i in to_plot))
        else:
            for key2 in features:
                xy.append(tgroups[key][key2])

        acount = 0
        for ax in axes:
            axes[ax].scatter(xy[0], xy[1], xy[2], c=c, s=kwargs['dotsize'],
                             alpha=kwargs['alpha'], label=labels[counter])
            if kwargs['xlim'] is not None:
                axes[ax].set_xlim3d(kwargs['xlim'])
            if kwargs['ylim'] is not None:
                axes[ax].set_ylim3d(kwargs['ylim'])
            if kwargs['zlim'] is not None:
                axes[ax].set_zlim3d(kwargs['zlim'])
            axes[ax].view_init(angle1[acount], angle2[acount])
            axes[ax].set_xlabel('Prin. Component {}'.format(features[0]),
                                fontsize=kwargs['labelfontsize'])
            axes[ax].set_ylabel('Prin. Component {}'.format(features[1]),
                                fontsize=kwargs['labelfontsize'])
            axes[ax].set_zlabel('Prin. Component {}'.format(features[2]),
                                fontsize=kwargs['labelfontsize'])
            acount = acount + 1
        counter = counter + 1

    # plt.legend(fontsize=kwargs['legendfontsize'], frameon=False)
    axes[3].set_xticks([])
    axes[4].set_xticks([])

    if kwargs['fname'] is None:
        plt.show()
    else:
        plt.savefig(kwargs['fname'])

    return xy
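And finally, the same hypothetical data viewed in 3D from four angles:

xyz = feature_plot_3D(pcadataset.final, label='group', features=[0, 1, 2],
                      randsel=True, randcount=40)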