Issues in fss.py (master) - Issues in master - hugobuddel/orange3 - Measure and Improve Code Quality continuously with Scrutinizer

Issues (4082)

Orange/preprocess/fss.py (3 issues)

Labels

Severity

import random
import Orange
import numpy as np
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3

from itertools import takewhile
from operator import itemgetter

from Orange.preprocess.preprocess import Preprocess
from Orange.preprocess.score import ANOVA, GainRatio, UnivariateLinearRegression
from Orange.data import Domain


__all__ = ["SelectBestFeatures", "RemoveNaNColumns", "SelectRandomFeatures"]


class SelectBestFeatures:
    """
    A feature selector that builds a new data set consisting of either the top
    `k` features or all those that exceed a given `threshold`. Features are
    scored using the provided feature scoring `method`. By default it is
    assumed that feature importance diminishes with decreasing scores.

    If both `k` and `threshold` are set, only features satisfying both
    conditions will be selected.

    If `method` is not set, it is automatically selected when presented with
    the data set. Data sets with both continuous and discrete features are
    scored using a method suitable for the majority of features.

    Parameters
    ----------
    method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
        Univariate feature scoring method.

    k : int
        The number of top features to select. A falsy value (``None`` or
        ``0``) disables the limit.

    threshold : float
        A threshold that a feature should meet according to the provided
        method. ``None`` disables the threshold; ``0.0`` is a valid threshold.

    decreasing : boolean
        The order of feature importance when sorted from the most to the least
        important feature. When true, higher scores rank first and the
        threshold is a lower bound; otherwise lower scores rank first and the
        threshold is an upper bound.
    """

    def __init__(self, method=None, k=None, threshold=None, decreasing=True):
        self.method = method
        self.k = k
        self.threshold = threshold
        self.decreasing = decreasing

    def __call__(self, data):
        """
        Score the attributes of `data` and return a table restricted to the
        selected ones. Class variables and metas are carried over unchanged.

        Raises
        ------
        ValueError
            If the class variable of `data` is not an instance of the scoring
            method's `class_type`.
        """
        method = self.method
        # select default method according to the provided data
        if method is None:
            method = self._default_method(data)

        if not isinstance(data.domain.class_var, method.class_type):
            # `method` may be a scorer class or a scorer instance; report the
            # class name in both cases.
            scorer = method if isinstance(method, type) else type(method)
            raise ValueError(("Scoring method {} requires a class variable " +
                              "of type {}.").format(
                scorer.__name__,
                method.class_type.__name__)
            )

        features = data.domain.attributes
        try:
            scores = method(data)
        except ValueError:
            # Fall back to scoring feature by feature, skipping those whose
            # type the method does not declare in `feature_type`.
            scores = self.score_only_nice_features(data, method)

        selected = self._select_features(scores, features)
        domain = Orange.data.Domain(selected,
                                    data.domain.class_vars, data.domain.metas)
        return data.from_table(domain, data)

    @staticmethod
    def _default_method(data):
        """Pick a scorer from the class type and the share of discrete features."""
        if not data.domain.has_discrete_class:
            return UnivariateLinearRegression()

        attributes = data.domain.attributes
        n_attributes = len(attributes)
        # Guard against a domain without attributes, which would otherwise
        # raise ZeroDivisionError before any scoring is attempted.
        if n_attributes:
            discr_ratio = sum(a.is_discrete for a in attributes) / n_attributes
        else:
            discr_ratio = 0.0
        return GainRatio() if discr_ratio >= 0.5 else ANOVA()

    def _select_features(self, scores, features):
        """Rank `features` by `scores` and apply the `k` and `threshold` limits."""
        ranked = sorted(zip(scores, features), key=itemgetter(0),
                        reverse=self.decreasing)
        if self.k:
            ranked = ranked[:self.k]
        # Compare against None so that a threshold of 0.0 is honoured.
        if self.threshold is not None:
            if self.decreasing:
                meets = lambda pair: pair[0] >= self.threshold
            else:
                meets = lambda pair: pair[0] <= self.threshold
            # The list is sorted, so everything after the first failure
            # fails as well.
            ranked = takewhile(meets, ranked)
        return [feature for _, feature in ranked]

    def score_only_nice_features(self, data, method):
        """
        Score only the attributes that are instances of `method.feature_type`.

        Returns an array with one entry per attribute of `data`; attributes
        of an unsupported type receive the worst possible score (``-inf``
        when `decreasing` is true, ``inf`` otherwise) so that they sort last.
        """
        attributes = data.domain.attributes
        # Explicit bool dtype keeps the mask usable as an index even when the
        # attribute list is empty.
        mask = np.array([isinstance(a, method.feature_type)
                         for a in attributes], dtype=bool)
        scores = [method(data, a)
                  for a, supported in zip(attributes, mask) if supported]
        bad = float('-inf') if self.decreasing else float('inf')
        all_scores = np.full(len(attributes), bad)
        all_scores[mask] = scores
        return all_scores


class SelectRandomFeatures:
    """
    A feature selector that selects random `k` features from an input
    data set and returns a data set with selected features. Parameter
    `k` is either an integer (number of feature) or float (from 0.0 to
    1.0, proportion of retained features).

    Parameters
    ----------

    k : int or float (default = 0.1)
        The number or proportion of features to retain.
    """

    def __init__(self, k=0.1):
        self.k = k

    def _feature_count(self, n_attributes):
        """Return how many of `n_attributes` features to keep for the current `k`."""
        if isinstance(self.k, float):
            # A float is a proportion; the result is truncated towards zero.
            return int(n_attributes * self.k)
        return self.k

    def __call__(self, data):
        """
        Return `data` restricted to a random sample of its attributes.

        The count is computed per call and `self.k` is left untouched, so a
        proportional selector stays proportional when reused on data sets
        with a different number of attributes.
        """
        attributes = data.domain.attributes
        chosen = random.sample(attributes, self._feature_count(len(attributes)))
        domain = Orange.data.Domain(
            chosen, data.domain.class_vars, data.domain.metas)
        return data.from_table(domain, data)


class RemoveNaNColumns(Preprocess):
    """
    Drop every attribute column whose values are all unknown (NaN) and
    return the resulting data set. Class variables and metas are passed
    through without being inspected.

    data : data table
        an input data table
    """
    def __call__(self, data):
        # One boolean per attribute column: True when the whole column is NaN.
        all_unknown = np.isnan(data.X).all(axis=0)
        kept = []
        for attribute, is_empty in zip(data.domain.attributes, all_unknown):
            if not is_empty:
                kept.append(attribute)
        reduced = Orange.data.Domain(
            kept, data.domain.class_vars, data.domain.metas)
        return Orange.data.Table(reduced, data)


1			import random
2			import Orange
3			import numpy as np
			0 ignored issues – show Configuration introduced 2015-12-02 09:15 UTC by Report Bug Copy Issue Report Show Similar Issues like this The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
4
5			from itertools import takewhile
6			from operator import itemgetter
7
8			from Orange.preprocess.preprocess import Preprocess
9			from Orange.preprocess.score import ANOVA, GainRatio, UnivariateLinearRegression
10			from Orange.data import Domain
			0 ignored issues – show Unused Code introduced 2015-12-02 09:15 UTC by Report Bug Copy Issue Report Show Similar Issues like this Unused Domain imported from Orange.data Loading history...
11
12			__all__ = ["SelectBestFeatures", "RemoveNaNColumns", "SelectRandomFeatures"]
13
14
15			class SelectBestFeatures:
16			"""
17			A feature selector that builds a new data set consisting of either the top
18			`k` features or all those that exceed a given `threshold`. Features are
19			scored using the provided feature scoring `method`. By default it is
20			assumed that feature importance diminishes with decreasing scores.
21
22			If both `k` and `threshold` are set, only features satisfying both
23			conditions will be selected.
24
25			If `method` is not set, it is automatically selected when presented with
26			the data set. Data sets with both continuous and discrete features are
27			scored using a method suitable for the majority of features.
28
29			Parameters
30			----------
31			method : Orange.preprocess.score.ClassificationScorer, Orange.preprocess.score.SklScorer
32			Univariate feature scoring method.
33
34			k : int
35			The number of top features to select.
36
37			threshold : float
38			A threshold that a feature should meet according to the provided method.
39
40			decreasing : boolean
41			The order of feature importance when sorted from the most to the least
42			important feature.
43			"""
44
45			def __init__(self, method=None, k=None, threshold=None, decreasing=True):
46			self.method = method
47			self.k = k
48			self.threshold = threshold
49			self.decreasing = decreasing
50
51			def __call__(self, data):
52			method = self.method
53			# select default method according to the provided data
54			if method is None:
55			autoMethod = True
			0 ignored issues – show Unused Code introduced 2015-12-02 09:15 UTC by Report Bug Copy Issue Report Show Similar Issues like this The variable `autoMethod` seems to be unused. Loading history...
56			discr_ratio = (sum(a.is_discrete
57			for a in data.domain.attributes)
58			/ len(data.domain.attributes))
59			if data.domain.has_discrete_class:
60			if discr_ratio >= 0.5:
61			method = GainRatio()
62			else:
63			method = ANOVA()
64			else:
65			method = UnivariateLinearRegression()
66
67			if not isinstance(data.domain.class_var, method.class_type):
68			raise ValueError(("Scoring method {} requires a class variable " +
69			"of type {}.").format(
70			(method if type(method) == type else type(method)).__name__,
71			method.class_type.__name__)
72			)
73			features = data.domain.attributes
74			try:
75			scores = method(data)
76			except ValueError:
77			scores = self.score_only_nice_features(data, method)
78			best = sorted(zip(scores, features), key=itemgetter(0),
79			reverse=self.decreasing)
80			if self.k:
81			best = best[:self.k]
82			if self.threshold:
83			pred = ((lambda x: x[0] >= self.threshold) if self.decreasing else
84			(lambda x: x[0] <= self.threshold))
85			best = takewhile(pred, best)
86
87			domain = Orange.data.Domain([f for s, f in best],
88			data.domain.class_vars, data.domain.metas)
89			return data.from_table(domain, data)
90
91			def score_only_nice_features(self, data, method):
92			mask = np.array([isinstance(a, method.feature_type)
93			for a in data.domain.attributes])
94			features = [f for f in data.domain.attributes
95			if isinstance(f, method.feature_type)]
96			scores = [method(data, f) for f in features]
97			bad = float('-inf') if self.decreasing else float('inf')
98			all_scores = np.array([bad] * len(data.domain.attributes))
99			all_scores[mask] = scores
100			return all_scores
101
102
103			class SelectRandomFeatures:
104			"""
105			A feature selector that selects random `k` features from an input
106			data set and returns a data set with selected features. Parameter
107			`k` is either an integer (number of feature) or float (from 0.0 to
108			1.0, proportion of retained features).
109
110			Parameters
111			----------
112
113			k : int or float (default = 0.1)
114			The number or proportion of features to retain.
115			"""
116
117			def __init__(self, k=0.1):
118			self.k = k
119
120			def __call__(self, data):
121			if type(self.k) == float:
122			self.k = int(len(data.domain.attributes) * self.k)
123			domain = Orange.data.Domain(
124			random.sample(data.domain.attributes, self.k),
125			data.domain.class_vars, data.domain.metas)
126			return data.from_table(domain, data)
127
128
129			class RemoveNaNColumns(Preprocess):
130			"""
131			Removes data columns that contain only unknown values. Returns the
132			resulting data set. Does not check optional class attribute(s).
133
134			data : data table
135			an input data table
136			"""
137			def __call__(self, data):
138			nan_col = np.all(np.isnan(data.X), axis=0)
139			att = [a for a, nan in zip(data.domain.attributes, nan_col) if not nan]
140			domain = Orange.data.Domain(att, data.domain.class_vars,
141			data.domain.metas)
142			return Orange.data.Table(domain, data)
143

GitHub Access Token became invalid

Issues (4082)

Orange/preprocess/fss.py (3 issues)

Labels

Severity

Introduced By

1. Missing Dependencies

2. Missing `__init__.py` files

hugobuddel / orange3

GitHub Access Token became invalid

Issues (4082)

Orange/preprocess/fss.py (3 issues)

Labels

Severity

Introduced By

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing `__init__.py` files