1 | import random |
||
2 | import Orange |
||
3 | import numpy as np |
||
0 ignored issues
–
show
|
|||
4 | |||
5 | from itertools import takewhile |
||
6 | from operator import itemgetter |
||
7 | |||
8 | from Orange.preprocess.preprocess import Preprocess |
||
9 | from Orange.preprocess.score import ANOVA, GainRatio, UnivariateLinearRegression |
||
10 | from Orange.data import Domain |
||
0 ignored issues
–
show
|
|||
11 | |||
12 | __all__ = ["SelectBestFeatures", "RemoveNaNColumns", "SelectRandomFeatures"] |
||
13 | |||
14 | |||
class SelectBestFeatures:
    """
    A feature selector that builds a new data set consisting of either the top
    `k` features or all those that exceed a given `threshold`. Features are
    scored using the provided feature scoring `method`. By default it is
    assumed that feature importance diminishes with decreasing scores.

    If both `k` and `threshold` are set, only features satisfying both
    conditions will be selected.

    If `method` is not set, it is automatically selected when presented with
    the data set. Data sets with both continuous and discrete features are
    scored using a method suitable for the majority of features.

    Parameters
    ----------
    method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
        Univariate feature scoring method.

    k : int
        The number of top features to select. `None` (or 0) disables the
        limit.

    threshold : float
        A threshold that a feature should meet according to the provided
        method. `None` disables the check; 0 is a valid threshold.

    decreasing : boolean
        The order of feature importance when sorted from the most to the least
        important feature.
    """

    def __init__(self, method=None, k=None, threshold=None, decreasing=True):
        self.method = method
        self.k = k
        self.threshold = threshold
        self.decreasing = decreasing

    def __call__(self, data):
        """
        Score the attributes of `data` and return a new table restricted to
        the selected ones. Class variables and metas are kept as they are.

        Raises
        ------
        ValueError
            If the class variable of `data` is not an instance of the
            scoring method's `class_type`.
        """
        method = self.method
        features = data.domain.attributes

        # select default method according to the provided data
        if method is None:
            n_discrete = sum(a.is_discrete for a in features)
            # An attribute-less domain must not fail with ZeroDivisionError.
            discr_ratio = n_discrete / len(features) if features else 0.0
            if data.domain.has_discrete_class:
                method = GainRatio() if discr_ratio >= 0.5 else ANOVA()
            else:
                method = UnivariateLinearRegression()

        if not isinstance(data.domain.class_var, method.class_type):
            # `method` may be a scorer class or a scorer instance.
            method_cls = method if isinstance(method, type) else type(method)
            raise ValueError(
                "Scoring method {} requires a class variable of type {}."
                .format(method_cls.__name__, method.class_type.__name__))

        try:
            scores = method(data)
        except ValueError:
            # Scoring the whole table failed; fall back to scoring only the
            # attributes whose type the method declares in `feature_type`.
            scores = self.score_only_nice_features(data, method)

        best = sorted(zip(scores, features), key=itemgetter(0),
                      reverse=self.decreasing)
        if self.k:
            best = best[:self.k]
        # Compare with None so that a threshold of 0 is honoured.
        if self.threshold is not None:
            threshold = self.threshold
            if self.decreasing:
                best = takewhile(lambda x: x[0] >= threshold, best)
            else:
                best = takewhile(lambda x: x[0] <= threshold, best)

        domain = Orange.data.Domain([f for _, f in best],
                                    data.domain.class_vars, data.domain.metas)
        return data.from_table(domain, data)

    def score_only_nice_features(self, data, method):
        """
        Score one by one only those attributes that are instances of
        `method.feature_type`.

        Returns a numpy array with one entry per attribute of `data`.
        Attributes of any other type get -inf when `decreasing` is set and
        +inf otherwise, which places them last in the ordering used by
        `__call__`.
        """
        attributes = data.domain.attributes
        # Explicit bool dtype keeps the mask a valid index when there are
        # no attributes at all.
        mask = np.array([isinstance(a, method.feature_type)
                         for a in attributes], dtype=bool)
        bad = float('-inf') if self.decreasing else float('inf')
        all_scores = np.full(len(attributes), bad)
        all_scores[mask] = [method(data, f)
                            for f, nice in zip(attributes, mask) if nice]
        return all_scores
||
101 | |||
102 | |||
class SelectRandomFeatures:
    """
    A feature selector that selects random `k` features from an input
    data set and returns a data set with selected features. Parameter
    `k` is either an integer (number of features) or float (from 0.0 to
    1.0, proportion of retained features).

    Parameters
    ----------

    k : int or float (default = 0.1)
        The number or proportion of features to retain.
    """

    def __init__(self, k=0.1):
        self.k = k

    def __call__(self, data):
        """
        Return a table with `k` randomly sampled attributes of `data`.
        Class variables and metas are kept as they are.
        """
        attributes = data.domain.attributes
        # Resolve a proportional `k` into a local count. Writing it back to
        # `self.k` would freeze the count derived from the first data set
        # and reuse it for every later call.
        k = self.k
        if isinstance(k, float):
            k = int(len(attributes) * k)
        domain = Orange.data.Domain(
            random.sample(attributes, k),
            data.domain.class_vars, data.domain.metas)
        return data.from_table(domain, data)
||
127 | |||
128 | |||
class RemoveNaNColumns(Preprocess):
    """
    Drop every attribute whose column in `data.X` consists solely of
    unknown (NaN) values and return the resulting data set. Class
    variables and metas are passed through without being inspected.

    data : data table
        an input data table
    """
    def __call__(self, data):
        # One flag per attribute column: True when every row is NaN.
        all_unknown = np.all(np.isnan(data.X), axis=0)
        source = data.domain
        kept = [
            var
            for var, unknown in zip(source.attributes, all_unknown)
            if not unknown
        ]
        reduced = Orange.data.Domain(kept, source.class_vars, source.metas)
        return Orange.data.Table(reduced, data)
||
143 |
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue with Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing __init__.py files in your module folders. Make sure that you place one such file in each sub-folder.