1
|
|
|
import random |
2
|
|
|
import Orange |
3
|
|
|
import numpy as np |
|
|
|
|
4
|
|
|
|
5
|
|
|
from itertools import takewhile |
6
|
|
|
from operator import itemgetter |
7
|
|
|
|
8
|
|
|
from Orange.preprocess.preprocess import Preprocess |
9
|
|
|
from Orange.preprocess.score import ANOVA, GainRatio, UnivariateLinearRegression |
10
|
|
|
from Orange.data import Domain |
|
|
|
|
11
|
|
|
|
12
|
|
|
__all__ = ["SelectBestFeatures", "RemoveNaNColumns", "SelectRandomFeatures"] |
13
|
|
|
|
14
|
|
|
|
15
|
|
|
class SelectBestFeatures:
    """
    A feature selector that builds a new data set consisting of either the top
    `k` features or all those that exceed a given `threshold`. Features are
    scored using the provided feature scoring `method`. By default it is
    assumed that feature importance diminishes with decreasing scores.

    If both `k` and `threshold` are set, only features satisfying both
    conditions will be selected.

    If `method` is not set, it is automatically selected when presented with
    the data set. Data sets with both continuous and discrete features are
    scored using a method suitable for the majority of features.

    Parameters
    ----------
    method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
        Univariate feature scoring method.

    k : int
        The number of top features to select.

    threshold : float
        A threshold that a feature should meet according to the provided
        method. A threshold of ``0`` is honoured; only ``None`` disables
        threshold filtering.

    decreasing : boolean
        The order of feature importance when sorted from the most to the least
        important feature.
    """

    def __init__(self, method=None, k=None, threshold=None, decreasing=True):
        self.method = method
        self.k = k
        self.threshold = threshold
        self.decreasing = decreasing

    def __call__(self, data):
        """
        Score the attributes of `data` and return a new table restricted to
        the selected ones. Class variables and metas are carried over.

        Raises
        ------
        ValueError
            If the class variable of `data` is not an instance of the scoring
            method's `class_type`.
        """
        features = data.domain.attributes
        method = self.method
        # select default method according to the provided data
        if method is None:
            n_discrete = sum(a.is_discrete for a in features)
            # Guard the ratio: a domain without attributes would otherwise
            # raise ZeroDivisionError before any scoring takes place.
            discr_ratio = n_discrete / len(features) if features else 0.0
            if not data.domain.has_discrete_class:
                method = UnivariateLinearRegression()
            elif discr_ratio >= 0.5:
                method = GainRatio()
            else:
                method = ANOVA()

        if not isinstance(data.domain.class_var, method.class_type):
            # `method` may be passed either as a scorer class or an instance.
            scorer_cls = method if isinstance(method, type) else type(method)
            raise ValueError(("Scoring method {} requires a class variable " +
                              "of type {}.").format(
                                  scorer_cls.__name__,
                                  method.class_type.__name__))

        try:
            scores = method(data)
        except ValueError:
            # Scoring the whole table failed; fall back to scoring only the
            # attributes whose type the method declares as supported.
            scores = self.score_only_nice_features(data, method)

        best = sorted(zip(scores, features), key=itemgetter(0),
                      reverse=self.decreasing)
        if self.k:
            best = best[:self.k]
        # Compare against None so that a threshold of 0 is not ignored.
        if self.threshold is not None:
            threshold = self.threshold
            if self.decreasing:
                def meets_threshold(scored):
                    return scored[0] >= threshold
            else:
                def meets_threshold(scored):
                    return scored[0] <= threshold
            # `best` is ordered from most to least important, so everything
            # after the first failing feature fails as well.
            best = takewhile(meets_threshold, best)

        domain = Orange.data.Domain([f for _, f in best],
                                    data.domain.class_vars, data.domain.metas)
        return data.from_table(domain, data)

    def score_only_nice_features(self, data, method):
        """
        Score only the attributes that are instances of `method.feature_type`.

        Returns an array with one entry per attribute of `data`, in domain
        order. Attributes of an unsupported type receive the worst possible
        score for the configured ordering (``-inf`` when `decreasing`,
        ``inf`` otherwise), so they sort after every scored attribute.
        """
        attributes = data.domain.attributes
        mask = np.array([isinstance(a, method.feature_type)
                         for a in attributes])
        # Reuse the mask instead of repeating the isinstance test.
        scores = [method(data, a) for a, nice in zip(attributes, mask) if nice]
        bad = float('-inf') if self.decreasing else float('inf')
        all_scores = np.array([bad] * len(attributes))
        all_scores[mask] = scores
        return all_scores
101
|
|
|
|
102
|
|
|
|
103
|
|
|
class SelectRandomFeatures:
    """
    A feature selector that selects random `k` features from an input
    data set and returns a data set with selected features. Parameter
    `k` is either an integer (number of features) or float (from 0.0 to
    1.0, proportion of retained features).

    Parameters
    ----------

    k : int or float (default = 0.1)
        The number or proportion of features to retain.
    """

    def __init__(self, k=0.1):
        self.k = k

    def __call__(self, data):
        """
        Return a new table holding a random subset of the attributes of
        `data`. Class variables and metas are carried over.
        """
        n_attributes = len(data.domain.attributes)
        # Resolve a proportion into a count locally. Writing the result back
        # to `self.k` would discard the proportion, and later calls on data
        # with a different number of attributes would reuse a stale count.
        if isinstance(self.k, float):
            n_selected = int(n_attributes * self.k)
        else:
            n_selected = self.k
        # random.sample raises ValueError when asked for more items than
        # the population holds, so cap the request at what is available.
        n_selected = min(n_selected, n_attributes)

        domain = Orange.data.Domain(
            random.sample(data.domain.attributes, n_selected),
            data.domain.class_vars, data.domain.metas)
        return data.from_table(domain, data)
127
|
|
|
|
128
|
|
|
|
129
|
|
|
class RemoveNaNColumns(Preprocess):
    """
    Drop every attribute column whose values are all unknown (NaN) and
    return the resulting data set. Only the attribute matrix `data.X` is
    inspected; class variables and metas are carried over unchanged.

    data : data table
        an input data table
    """
    def __call__(self, data):
        # One boolean per attribute column: True when the whole column is NaN.
        column_is_empty = np.all(np.isnan(data.X), axis=0)
        kept = []
        for attribute, empty in zip(data.domain.attributes, column_is_empty):
            if not empty:
                kept.append(attribute)
        reduced_domain = Orange.data.Domain(
            kept, data.domain.class_vars, data.domain.metas)
        return Orange.data.Table(reduced_domain, data)
143
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one such file in each sub-folder.