1 | """ |
||
2 | Preprocess |
||
3 | ---------- |
||
4 | |||
5 | """ |
||
6 | import numpy as np |
||
0 ignored issues
–
show
|
|||
7 | import sklearn.preprocessing as skl_preprocessing |
||
0 ignored issues
–
show
The import sklearn.preprocessing could not be resolved.
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use the command for the correct version.
2. Missing __init__.py files
This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder. |
|||
8 | import bottlechest |
||
0 ignored issues
–
show
The import bottlechest could not be resolved.
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use the command for the correct version.
2. Missing __init__.py files
This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder. |
|||
9 | |||
10 | import Orange.data |
||
11 | from Orange.data import Table |
||
12 | from . import impute, discretize |
||
13 | from Orange.statistics import distribution |
||
0 ignored issues
–
show
|
|||
14 | from ..misc.enum import Enum |
||
15 | |||
16 | __all__ = ["Continuize", "Discretize", "Impute", "SklImpute", "Normalize", "Randomize"] |
||
17 | |||
18 | |||
class Preprocess:
    """
    Base class that every preprocessor inherits from.

    Instantiating a preprocessor without a data set gives back the
    preprocessor object itself; instantiating it with a data set as the
    first argument gives back the preprocessed data instead.

    Parameters
    ----------
    data : a data table (default=None)
        An optional data set to be preprocessed.
    """

    def __new__(cls, data=None, *args, **kwargs):
        instance = super().__new__(cls)
        if not isinstance(data, Orange.data.Storage):
            # First argument is not a data storage: return the new object.
            return instance
        # A data storage was given: initialise with the remaining arguments
        # and return the result of calling the preprocessor on the data.
        instance.__init__(*args, **kwargs)
        return instance(data)

    def __call__(self, data):
        raise NotImplementedError("Subclasses need to implement __call__")
||
42 | |||
43 | |||
class Continuize(Preprocess):
    """
    Construct a preprocessor that builds a new domain with
    ``continuize.DomainContinuizer`` and converts the data to that domain.

    Parameters
    ----------
    zero_based : bool (default=True)
        Forwarded unchanged to ``continuize.DomainContinuizer``.

    multinomial_treatment : MultinomialTreatment (default: Continuize.Indicators)
        Forwarded unchanged to ``continuize.DomainContinuizer``.
    """

    MultinomialTreatment = Enum(
        "Indicators", "FirstAsBase", "FrequentAsBase",
        "Remove", "RemoveMultinomial", "ReportError", "AsOrdinal",
        "AsNormalizedOrdinal", "Leave"
    )

    # Expose every treatment as a class attribute (e.g. Continuize.Remove).
    (Indicators, FirstAsBase, FrequentAsBase, Remove, RemoveMultinomial,
     ReportError, AsOrdinal, AsNormalizedOrdinal, Leave) = MultinomialTreatment

    def __init__(self, zero_based=True, multinomial_treatment=Indicators):
        self.zero_based = zero_based
        self.multinomial_treatment = multinomial_treatment

    def __call__(self, data):
        """
        Compute the continuized domain for the given data and return the
        data converted to that domain.

        Parameters
        ----------
        data : an input data table
        """
        from . import continuize

        make_domain = continuize.DomainContinuizer(
            zero_based=self.zero_based,
            multinomial_treatment=self.multinomial_treatment)
        new_domain = make_domain(data)
        return data.from_table(new_domain, data)
||
66 | |||
67 | |||
class Discretize(Preprocess):
    """
    Construct a discretizer, a preprocessor for discretization of
    continuous features.

    Parameters
    ----------
    method : discretization method (default=None)
        Callable invoked as ``method(data, var)`` for each continuous
        attribute. When it is None (or otherwise falsy),
        ``discretize.EqualFreq()`` is used.

    remove_const : bool (default=True)
        Determines whether the features with constant values are removed
        during discretization.
    """

    def __init__(self, method=None, remove_const=True):
        self.method = method
        self.remove_const = remove_const

    def __call__(self, data):
        """
        Compute and apply discretization of the given data. Returns a new
        data table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be discretized.
        """
        discretizer = self.method or discretize.EqualFreq()

        kept = []
        for var in data.domain.attributes:
            if not var.is_continuous:
                # Non-continuous attributes are passed through untouched.
                kept.append(var)
                continue
            candidate = discretizer(data, var)
            if candidate is None:
                continue
            # Discretized variables with fewer than two values are dropped
            # unless remove_const is switched off.
            if len(candidate.values) >= 2 or not self.remove_const:
                kept.append(candidate)

        new_domain = Orange.data.Domain(
            kept, data.domain.class_vars, data.domain.metas)
        return data.from_table(new_domain, data)
||
114 | |||
115 | |||
class Impute(Preprocess):
    """
    Construct an imputer, a preprocessor for imputation of missing values in
    the data table.

    Parameters
    ----------
    method : imputation method (default=None)
        Callable invoked as ``method(data, var)`` for every attribute of
        the data domain. When it is None (or otherwise falsy),
        ``impute.Average()`` is created at call time.
    """

    def __init__(self, method=None):
        # The default is resolved in __call__ rather than in the signature:
        # a signature default would be evaluated once, while the class body
        # is executed, and the same object would be shared by all instances.
        self.method = method

    def __call__(self, data):
        """
        Apply an imputation method to the given data set. Returns a new
        data table with missing values replaced by their imputations.

        Parameters
        ----------
        data : Orange.data.Table
            An input data table.
        """
        method = self.method or impute.Average()
        newattrs = [method(data, var) for var in data.domain.attributes]
        domain = Orange.data.Domain(
            newattrs, data.domain.class_vars, data.domain.metas)
        return data.from_table(domain, data)
||
145 | |||
146 | |||
class SklImpute(Preprocess):
    """
    Construct a preprocessor that fills in the data matrix ``X`` using
    ``sklearn.preprocessing.Imputer``.

    Parameters
    ----------
    strategy : str (default='mean')
        Passed to ``skl_preprocessing.Imputer`` as its ``strategy``.

    force : bool (default=True)
        If False, data whose ``X`` contains no NaN is returned unchanged.
        If True, the imputer is always run.
    """

    __wraps__ = skl_preprocessing.Imputer

    def __init__(self, strategy='mean', force=True):
        self.strategy = strategy
        self.force = force

    def __call__(self, data):
        """
        Impute ``data.X`` and return a new table built from the imputed
        matrix together with the original ``Y`` and ``metas``.

        Parameters
        ----------
        data : an input data table
        """
        if not (self.force or np.isnan(data.X).any()):
            # Nothing to impute and imputation is not forced.
            return data

        # The fitted imputer is kept on the instance.
        self.imputer = skl_preprocessing.Imputer(strategy=self.strategy)
        imputed_X = self.imputer.fit_transform(data.X)

        # Pair every attribute with the matching entry of the imputer's
        # statistics_ and let impute.Average build the new variable from it.
        fill_values = self.imputer.statistics_
        features = [
            impute.Average()(data, var, value)
            for var, value in zip(data.domain.attributes, fill_values)
        ]
        new_domain = Orange.data.Domain(
            features, data.domain.class_vars, data.domain.metas)
        return Orange.data.Table(new_domain, imputed_X, data.Y, data.metas)
||
164 | |||
165 | |||
class RemoveConstant(Preprocess):
    """
    Construct a preprocessor that removes features with constant values
    from the data set.
    """

    def __call__(self, data):
        """
        Drop every attribute whose column in ``data.X`` has equal NaN-aware
        minimum and maximum, and return the resulting data table.

        Parameters
        ----------
        data : an input data set
        """
        column_min = bottlechest.nanmin(data.X, axis=0)
        column_max = bottlechest.nanmax(data.X, axis=0)
        varies = column_min != column_max

        # Indices of the columns whose minimum and maximum differ.
        keep = [index for index, flag in enumerate(varies) if flag]
        atts = [data.domain.attributes[index] for index in keep]

        new_domain = Orange.data.Domain(
            atts, data.domain.class_vars, data.domain.metas)
        return Orange.data.Table(new_domain, data)
||
188 | |||
189 | |||
class Normalize(Preprocess):
    """
    Construct a preprocessor for normalization of features.
    Given a data table, preprocessor returns a new table in
    which the continuous attributes are normalized.

    Parameters
    ----------
    zero_based : bool (default=True)
        Determines the value used as the "low" value of the variable.
        It determines the interval for normalized continuous variables
        (either [-1, 1] or [0, 1]).

    norm_type : NormTypes (default: Normalize.NormalizeBySD)
        Normalization type. If Normalize.NormalizeBySD, the values are
        replaced with standardized values by subtracting the average
        value and dividing by the standard deviation.
        Attribute zero_based has no effect on this standardization.

        If Normalize.NormalizeBySpan, the values are replaced with
        normalized values by subtracting min value of the data and
        dividing by span (max - min).

    transform_class : bool (default=False)
        If True the class is normalized as well.

    Notes
    -----
    Pass ``norm_type`` by keyword. The first positional argument is taken
    as the data set by ``Preprocess.__new__``; when it is not a data table
    it is handed on to ``__init__`` as ``zero_based``, not as ``norm_type``.

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Normalize
    >>> data = Table("iris")
    >>> normalizer = Normalize(norm_type=Normalize.NormalizeBySpan)
    >>> normalized_data = normalizer(data)
    """

    NormTypes = Enum("NormalizeBySpan", "NormalizeBySD")
    # Expose both normalization types as class attributes.
    (NormalizeBySpan, NormalizeBySD) = NormTypes

    def __init__(self,
                 zero_based=True,
                 norm_type=NormalizeBySD,
                 transform_class=False):
        self.zero_based = zero_based
        self.norm_type = norm_type
        self.transform_class = transform_class

    def __call__(self, data):
        """
        Compute and apply normalization of the given data. Returns a new
        data table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be normalized.

        Returns
        -------
        data : Orange.data.Table
            Normalized data table.
        """
        from . import normalize

        normalizer = normalize.Normalizer(
            zero_based=self.zero_based,
            norm_type=self.norm_type,
            transform_class=self.transform_class)
        return normalizer(data)
||
258 | |||
259 | |||
class Randomize(Preprocess):
    """
    Construct a preprocessor for randomization of classes,
    attributes or metas.
    Given a data table, preprocessor returns a new table in
    which the data is shuffled.

    Parameters
    ----------
    rand_type : RandTypes (default: Randomize.RandomizeClasses)
        Randomization type. If Randomize.RandomizeClasses, classes
        are shuffled.
        If Randomize.RandomizeAttributes, attributes are shuffled.
        If Randomize.RandomizeMetas, metas are shuffled.

    Examples
    --------
    >>> from Orange.data import Table
    >>> from Orange.preprocess import Randomize
    >>> data = Table("iris")
    >>> randomizer = Randomize(Randomize.RandomizeClasses)
    >>> randomized_data = randomizer(data)
    """

    RandTypes = Enum("RandomizeClasses", "RandomizeAttributes",
                     "RandomizeMetas")
    # Expose the randomization types as class attributes.
    (RandomizeClasses, RandomizeAttributes, RandomizeMetas) = RandTypes

    def __init__(self, rand_type=RandomizeClasses):
        self.rand_type = rand_type

    def __call__(self, data):
        """
        Apply randomization of the given data. Returns a new
        data table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be randomized.

        Returns
        -------
        data : Orange.data.Table
            Randomized data table.

        Raises
        ------
        TypeError
            If ``rand_type`` is none of the three supported types.
        """
        shuffled = Table(data)
        shuffled.ensure_copy()

        # Select the array to shuffle, then shuffle it in place.
        kind = self.rand_type
        if kind == Randomize.RandomizeClasses:
            target = shuffled.Y
        elif kind == Randomize.RandomizeAttributes:
            target = shuffled.X
        elif kind == Randomize.RandomizeMetas:
            target = shuffled.metas
        else:
            raise TypeError('Unsupported type')

        self.randomize(target)
        return shuffled

    # NOTE(review): static analysis reports that this method does not access
    # any attribute of the class and could be written as a function or a
    # static/class method. It is kept as an instance method here.
    def randomize(self, table):
        """
        Shuffle the given array in place with ``np.random.shuffle``.

        An array with more than one dimension is shuffled one column at a
        time, each column independently of the others.
        """
        if len(table.shape) <= 1:
            np.random.shuffle(table)
            return
        for column in range(table.shape[1]):
            np.random.shuffle(table[:, column])
||
327 | |||
328 | |||
class PreprocessorList:
    """
    Hold a sequence of preprocessors and, when called, run the data set
    through each of them in order.

    Parameters
    ----------
    preprocessors : list
        A list of preprocessors.
    """

    def __init__(self, preprocessors):
        # Copy into a fresh list so any iterable is accepted.
        self.preprocessors = [*preprocessors]

    def __call__(self, data):
        """
        Feed the data set through the stored preprocessors, passing the
        output of each one to the next, and return the final result.

        Parameters
        ----------
        data : an input data table
        """
        result = data
        for step in self.preprocessors:
            result = step(result)
        return result
||
354 | |||
355 |
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing
__init__.py
files in your module folders. Make sure that you place one file in each sub-folder.