Passed — Pull Request on master (#110), created by unknown, 01:45

SklearnCvExperiment.__init__() — B

Complexity:  Conditions 6
Size:        Total Lines 33, Code Lines 22
Duplication: Lines 0, Ratio 0 %
Importance:  Changes 0

Metric   Value
eloc     22
dl       0
loc      33
rs       8.4186
c        0
b        0
f        0
cc       6
nop      6
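As a rough cross-check of the figures above, similar raw-size and cyclomatic-complexity numbers can be computed locally. The sketch below uses the radon package as a stand-in, since the report does not name its analyzer, and the file path is only illustrative.

from radon.complexity import cc_visit
from radon.raw import analyze

# Illustrative path; adjust to wherever the module lives in the checkout.
with open("hyperactive/experiment/integrations/sklearn_cv.py") as f:
    source = f.read()

raw = analyze(source)  # raw line counts: loc, lloc, sloc, comments, blank
print("loc:", raw.loc, "lloc:", raw.lloc, "sloc:", raw.sloc)

for block in cc_visit(source):  # cyclomatic complexity per function/method
    print(f"{block.name}: cc={block.complexity}")
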
"""Experiment adapter for sklearn cross-validation experiments."""

from sklearn import clone
from sklearn.metrics import check_scoring
from sklearn.model_selection import cross_validate
from sklearn.utils.validation import _num_samples

from hyperactive.base import BaseExperiment


class SklearnCvExperiment(BaseExperiment):
    """Experiment adapter for sklearn cross-validation experiments.

    This class performs cross-validation experiments with a given sklearn
    estimator, and is used for hyperparameter tuning and evaluation of the
    model's performance.

    The score returned is the mean of the cross-validation scores obtained by
    applying cross-validation to ``estimator`` with the parameters passed to
    ``score`` via ``params``.

    The cross-validation scheme is specified by the ``cv`` parameter,
    and the scoring metric by the ``scoring`` parameter.
    The ``X`` and ``y`` parameters are the input data and target values,
    which are used in fit/predict cross-validation.

    Parameters
    ----------
    estimator : sklearn estimator
        The estimator to be used for the experiment.
    X : array-like, shape (n_samples, n_features)
        The input data for the model.
    y : array-like, shape (n_samples,) or (n_samples, n_outputs)
        The target values for the model.
    cv : int or cross-validation generator, default = KFold(n_splits=3, shuffle=True)
        The number of folds or the cross-validation strategy to be used.
        If an int is passed, the cross-validation used is
        KFold(n_splits=cv, shuffle=True).
    scoring : callable or str, optional
        sklearn scoring function, scorer, or metric name used to evaluate the
        model's performance. Plain metric functions (taking ``y_true, y_pred``)
        are wrapped into scorers automatically. If not provided, the default is
        determined by the estimator's own ``score`` method, as per sklearn
        convention: accuracy for classifiers and the coefficient of
        determination (R^2) for regressors.

    Examples
    --------
    >>> from hyperactive.experiment.integrations import SklearnCvExperiment
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.svm import SVC
    >>> from sklearn.metrics import accuracy_score
    >>> from sklearn.model_selection import KFold
    >>>
    >>> X, y = load_iris(return_X_y=True)
    >>>
    >>> sklearn_exp = SklearnCvExperiment(
    ...     estimator=SVC(),
    ...     scoring=accuracy_score,
    ...     cv=KFold(n_splits=3, shuffle=True),
    ...     X=X,
    ...     y=y,
    ... )
    >>> params = {"C": 1.0, "kernel": "linear"}
    >>> score, add_info = sklearn_exp.score(params)

    For default choices of ``scoring`` and ``cv``:

    >>> sklearn_exp = SklearnCvExperiment(
    ...     estimator=SVC(),
    ...     X=X,
    ...     y=y,
    ... )
    >>> params = {"C": 1.0, "kernel": "linear"}
    >>> score, add_info = sklearn_exp.score(params)

    Quick call without metadata return or dictionary:

    >>> score = sklearn_exp(C=1.0, kernel="linear")
    """

    def __init__(self, estimator, X, y, scoring=None, cv=None):
        self.estimator = estimator
        self.X = X
        self.y = y
        self.scoring = scoring
        self.cv = cv

        super().__init__()

        if cv is None:
            from sklearn.model_selection import KFold

            self._cv = KFold(n_splits=3, shuffle=True)
        elif isinstance(cv, int):
            from sklearn.model_selection import KFold

            self._cv = KFold(n_splits=cv, shuffle=True)
        else:
            self._cv = cv

        # resolve scoring: a scorer is recognized by an "estimator" argument
        # in its signature; plain metric functions are wrapped via make_scorer
        if scoring is None:
            self._scoring = check_scoring(self.estimator)
        elif callable(scoring):
            from inspect import signature

            if "estimator" in signature(scoring).parameters:
                self._scoring = scoring
            else:
                from sklearn.metrics import make_scorer

                self._scoring = make_scorer(scoring)
        else:
            # scoring passed as a string metric name, e.g. "accuracy";
            # check_scoring resolves it against the estimator
            self._scoring = check_scoring(self.estimator, scoring=scoring)

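    # Note on scoring resolution: a plain metric such as
    # ``accuracy_score(y_true, y_pred)`` has no ``estimator`` parameter and is
    # therefore wrapped with ``make_scorer``, while a scorer (e.g. the result
    # of ``check_scoring`` or ``make_scorer``) is called as
    # ``scorer(estimator, X, y)`` and can be passed to ``cross_validate`` as-is.
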
    def _paramnames(self):
        """Return the parameter names of the search.

        Returns
        -------
        list of str
            The parameter names of the search parameters.
        """
        return list(self.estimator.get_params().keys())

    def _score(self, params):
        """Score the parameters.

        Parameters
        ----------
        params : dict with string keys
            Parameters to score.

        Returns
        -------
        float
            The score of the parameters.
        dict
            Additional metadata about the search.
        """
        estimator = clone(self.estimator)
        estimator.set_params(**params)

        cv_results = cross_validate(
            estimator,
            self.X,
            self.y,
            scoring=self._scoring,
            cv=self._cv,
        )

        add_info_d = {
            "score_time": cv_results["score_time"],
            "fit_time": cv_results["fit_time"],
            "n_test_samples": _num_samples(self.X),
        }

        return cv_results["test_score"].mean(), add_info_d

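    # The ``cross_validate`` results above are per-fold arrays, so the
    # ``score_time`` and ``fit_time`` entries in the returned metadata have one
    # value per CV split; ``n_test_samples`` is the total number of samples in
    # ``self.X`` as counted by ``_num_samples``.
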
    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the skbase object.

        ``get_test_params`` is a unified interface point to store
        parameter settings for testing purposes. This function is also
        used in ``create_test_instance`` and ``create_test_instances_and_names``
        to construct test instances.

        ``get_test_params`` should return a single ``dict``, or a ``list`` of ``dict``.

        Each ``dict`` is a parameter configuration for testing,
        and can be used to construct an "interesting" test instance.
        A call to ``cls(**params)`` should
        be valid for all dictionaries ``params`` in the return of ``get_test_params``.

        ``get_test_params`` need not return a fixed list of dictionaries;
        it can also return dynamic or stochastic parameter settings.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, the `"default"` set is
            returned.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class.
            Each dict contains parameters to construct an "interesting" test
            instance, i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a
            valid test instance. `create_test_instance` uses the first (or only)
            dictionary in `params`.
        """
        from sklearn.datasets import load_diabetes, load_iris
        from sklearn.metrics import accuracy_score, mean_absolute_error
        from sklearn.model_selection import KFold
        from sklearn.svm import SVC, SVR

        X, y = load_iris(return_X_y=True)
        params_classif = {
            "estimator": SVC(),
            "scoring": accuracy_score,
            "cv": KFold(n_splits=3, shuffle=True),
            "X": X,
            "y": y,
        }

        X, y = load_diabetes(return_X_y=True)
        params_regress = {
            "estimator": SVR(),
            "scoring": mean_absolute_error,
            "cv": 2,
            "X": X,
            "y": y,
        }

        X, y = load_diabetes(return_X_y=True)
        params_all_default = {
            "estimator": SVR(),
            "X": X,
            "y": y,
        }

        return [params_classif, params_regress, params_all_default]

    @classmethod
    def _get_score_params(cls):
        """Return settings for the score function.

        Returns a list whose i-th element corresponds to
        ``cls.get_test_params()[i]``; each element is a valid ``params``
        argument for ``score``.

        Returns
        -------
        list of dict
            The parameters to be used for scoring.
        """
        score_params_classif = {"C": 1.0, "kernel": "linear"}
        score_params_regress = {"C": 1.0, "kernel": "linear"}
        score_params_defaults = {"C": 1.0, "kernel": "linear"}
        return [score_params_classif, score_params_regress, score_params_defaults]
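
For reference, a minimal usage sketch of the regression configuration exercised by ``get_test_params`` above; it assumes only the constructor and the ``score``/``__call__`` interface documented in the class docstring.

from hyperactive.experiment.integrations import SklearnCvExperiment
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR

X, y = load_diabetes(return_X_y=True)

# 2-fold cross-validation on an SVR, scored with mean absolute error
exp = SklearnCvExperiment(
    estimator=SVR(),
    scoring=mean_absolute_error,
    cv=2,
    X=X,
    y=y,
)

score, add_info = exp.score({"C": 1.0, "kernel": "linear"})
print(score, add_info["fit_time"])

# quick call without metadata return or dictionary
score = exp(C=1.0, kernel="linear")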