SklearnCvExperiment.__init__()   B

Complexity
    Conditions: 8

Size
    Total lines: 41
    Code lines: 28

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric  Value
eloc    28
dl      0
loc     41
rs      7.3333
c       0
b       0
f       0
cc      8
nop     6
"""Experiment adapter for sklearn cross-validation experiments."""
# copyright: hyperactive developers, MIT License (see LICENSE file)

from sklearn import clone
from sklearn.metrics import check_scoring
from sklearn.model_selection import cross_validate
from sklearn.utils.validation import _num_samples

from hyperactive.base import BaseExperiment


class SklearnCvExperiment(BaseExperiment):
    """Experiment adapter for sklearn cross-validation experiments.

    This class is used to perform cross-validation experiments with a given
    sklearn estimator. It allows for hyperparameter tuning and evaluation of
    the model's performance using cross-validation.

    The score returned is the mean of the cross-validation scores obtained by
    applying cross-validation to ``estimator`` with the parameters passed to
    ``score`` via ``params``.

    The cross-validation scheme is specified by the ``cv`` parameter,
    and the scoring metric is specified by the ``scoring`` parameter.
    The ``X`` and ``y`` parameters are the input data and target values,
    which are used in fit/predict cross-validation.

    Parameters
    ----------
    estimator : sklearn estimator
        The estimator to be used for the experiment.
    X : array-like, shape (n_samples, n_features)
        The input data for the model.
    y : array-like, shape (n_samples,) or (n_samples, n_outputs)
        The target values for the model.
    cv : int or cross-validation generator, default = KFold(n_splits=3, shuffle=True)
        The number of folds or the cross-validation strategy to be used.
        If int, the cross-validation used is KFold(n_splits=cv, shuffle=True).
    scoring : callable or str, default = accuracy_score or r2_score
        sklearn scoring function or metric to evaluate the model's performance.
        The default is determined by the type of estimator:
        ``accuracy_score`` for classifiers, and
        ``r2_score`` for regressors, as per sklearn convention
        through the default ``score`` method of the estimator.

    Examples
    --------
    >>> from hyperactive.experiment.integrations import SklearnCvExperiment
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.svm import SVC
    >>> from sklearn.metrics import accuracy_score
    >>> from sklearn.model_selection import KFold
    >>>
    >>> X, y = load_iris(return_X_y=True)
    >>>
    >>> sklearn_exp = SklearnCvExperiment(
    ...     estimator=SVC(),
    ...     scoring=accuracy_score,
    ...     cv=KFold(n_splits=3, shuffle=True),
    ...     X=X,
    ...     y=y,
    ... )
    >>> params = {"C": 1.0, "kernel": "linear"}
    >>> score, add_info = sklearn_exp.score(params)

    For default choices of ``scoring`` and ``cv``:

    >>> sklearn_exp = SklearnCvExperiment(
    ...     estimator=SVC(),
    ...     X=X,
    ...     y=y,
    ... )
    >>> params = {"C": 1.0, "kernel": "linear"}
    >>> score, add_info = sklearn_exp.score(params)

    Quick call without metadata return or dictionary:

    >>> score = sklearn_exp(C=1.0, kernel="linear")
    """

    def __init__(self, estimator, X, y, scoring=None, cv=None):
        self.estimator = estimator
        self.X = X
        self.y = y
        self.scoring = scoring
        self.cv = cv

        super().__init__()

        # resolve cv: default to 3-fold shuffled KFold, or wrap an int fold count
        if cv is None:
            from sklearn.model_selection import KFold

            self._cv = KFold(n_splits=3, shuffle=True)
        elif isinstance(cv, int):
            from sklearn.model_selection import KFold

            self._cv = KFold(n_splits=cv, shuffle=True)
        else:
            self._cv = cv

        # resolve scoring: None falls back to the estimator's default score method
        if scoring is None:
            self._scoring = check_scoring(self.estimator)
        # callables with "estimator" in their signature are treated as scorers,
        # plain metric functions are wrapped into a scorer via make_scorer
        elif callable(scoring):
            from inspect import signature

            if "estimator" in signature(scoring).parameters:
                self._scoring = scoring
            else:
                from sklearn.metrics import make_scorer

                self._scoring = make_scorer(scoring)
        # strings (scorer names) are resolved by sklearn's check_scoring
        else:
            self._scoring = check_scoring(self.estimator, scoring=scoring)
        self.scorer_ = self._scoring

        # Set the sign of the scoring function
        if hasattr(self._scoring, "_score_func"):
            score_func = self._scoring._score_func
            _sign = _guess_sign_of_sklmetric(score_func)
            _sign_str = "higher" if _sign == 1 else "lower"
            self.set_tags(**{"property:higher_or_lower_is_better": _sign_str})

    def _paramnames(self):
        """Return the parameter names of the search.

        Returns
        -------
        list of str
            The parameter names of the search parameters.
        """
        return list(self.estimator.get_params().keys())

    def _evaluate(self, params):
        """Evaluate the parameters.

        Parameters
        ----------
        params : dict with string keys
            Parameters to evaluate.

        Returns
        -------
        float
            The value of the parameters as per evaluation.
        dict
            Additional metadata about the search.
        """
        estimator = clone(self.estimator)
        estimator.set_params(**params)

        cv_results = cross_validate(
            estimator,
            self.X,
            self.y,
            scoring=self._scoring,
            cv=self._cv,
        )

        add_info_d = {
            "score_time": cv_results["score_time"],
            "fit_time": cv_results["fit_time"],
            "n_test_samples": _num_samples(self.X),
        }

        return cv_results["test_score"].mean(), add_info_d

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the skbase object.

        ``get_test_params`` is a unified interface point to store
        parameter settings for testing purposes. This function is also
        used in ``create_test_instance`` and ``create_test_instances_and_names``
        to construct test instances.

        ``get_test_params`` should return a single ``dict``, or a ``list`` of ``dict``.

        Each ``dict`` is a parameter configuration for testing,
        and can be used to construct an "interesting" test instance.
        A call to ``cls(**params)`` should
        be valid for all dictionaries ``params`` in the return of ``get_test_params``.

        ``get_test_params`` need not return fixed lists of dictionaries;
        it can also return dynamic or stochastic parameter settings.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return the `"default"` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class.
            Each dict contains parameters to construct an "interesting" test instance,
            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test
            instance. `create_test_instance` uses the first (or only) dictionary in
            `params`.
        """
        from sklearn.datasets import load_diabetes, load_iris
        from sklearn.metrics import accuracy_score, mean_absolute_error
        from sklearn.model_selection import KFold
        from sklearn.svm import SVC, SVR

        X, y = load_iris(return_X_y=True)
        params_classif = {
            "estimator": SVC(),
            "scoring": accuracy_score,
            "cv": KFold(n_splits=3, shuffle=True),
            "X": X,
            "y": y,
        }

        X, y = load_diabetes(return_X_y=True)
        params_regress = {
            "estimator": SVR(),
            "scoring": mean_absolute_error,
            "cv": 2,
            "X": X,
            "y": y,
        }

        X, y = load_diabetes(return_X_y=True)
        params_all_default = {
            "estimator": SVR(),
            "X": X,
            "y": y,
        }

        return [params_classif, params_regress, params_all_default]

    @classmethod
    def _get_score_params(self):
        """Return settings for testing score/evaluate functions. Used in tests only.

        Returns a list, the i-th element of which should be valid arguments for
        self.evaluate and self.score, of an instance constructed with
        self.get_test_params()[i].

        Returns
        -------
        list of dict
            The parameters to be used for scoring.
        """
        score_params_classif = {"C": 1.0, "kernel": "linear"}
        score_params_regress = {"C": 1.0, "kernel": "linear"}
        score_params_defaults = {"C": 1.0, "kernel": "linear"}
        return [score_params_classif, score_params_regress, score_params_defaults]


def _guess_sign_of_sklmetric(scorer):
    """Guess the sign of a sklearn metric scorer.

    Parameters
    ----------
    scorer : callable
        The sklearn metric scorer to guess the sign for.

    Returns
    -------
    int
        1 if higher scores are better, -1 if lower scores are better.
    """
    HIGHER_IS_BETTER = {
        # Classification
        "accuracy_score": True,
        "auc": True,
        "average_precision_score": True,
        "balanced_accuracy_score": True,
        "brier_score_loss": False,
        "class_likelihood_ratios": False,
        "cohen_kappa_score": True,
        "d2_log_loss_score": True,
        "dcg_score": True,
        "f1_score": True,
        "fbeta_score": True,
        "hamming_loss": False,
        "hinge_loss": False,
        "jaccard_score": True,
        "log_loss": False,
        "matthews_corrcoef": True,
        "ndcg_score": True,
        "precision_score": True,
        "recall_score": True,
        "roc_auc_score": True,
        "top_k_accuracy_score": True,
        "zero_one_loss": False,
        # Regression
        "d2_absolute_error_score": True,
        "d2_pinball_score": True,
        "d2_tweedie_score": True,
        "explained_variance_score": True,
        "max_error": False,
        "mean_absolute_error": False,
        "mean_absolute_percentage_error": False,
        "mean_gamma_deviance": False,
        "mean_pinball_loss": False,
        "mean_poisson_deviance": False,
        "mean_squared_error": False,
        "mean_squared_log_error": False,
        "mean_tweedie_deviance": False,
        "median_absolute_error": False,
        "r2_score": True,
        "root_mean_squared_error": False,
        "root_mean_squared_log_error": False,
    }

    # default to an empty string so the suffix checks below never fail on None
    scorer_name = getattr(scorer, "__name__", "")

    if hasattr(scorer, "greater_is_better"):
        return 1 if scorer.greater_is_better else -1
    elif scorer_name in HIGHER_IS_BETTER:
        return 1 if HIGHER_IS_BETTER[scorer_name] else -1
    elif scorer_name.endswith("_score"):
        # names ending in "_score" are assumed to be higher-is-better
        return 1
    elif scorer_name.endswith("_loss") or scorer_name.endswith("_deviance"):
        # names ending in "_loss" or "_deviance" are assumed to be lower-is-better
        return -1
    elif scorer_name.endswith("_error"):
        return -1
    else:
        # If we cannot determine the sign, we assume lower is better
        return -1
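
The sign tag set in ``__init__`` is presumably what downstream optimizers consult to decide the search direction. As a minimal sketch of how it can be inspected: the import path is taken from the class docstring, and ``get_tag`` is assumed to be the skbase tag accessor that ``BaseExperiment`` inherits (only ``set_tags`` appears in the code above).

from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR

from hyperactive.experiment.integrations import SklearnCvExperiment

X, y = load_diabetes(return_X_y=True)
exp = SklearnCvExperiment(estimator=SVR(), scoring=mean_absolute_error, X=X, y=y)

# mean_absolute_error is listed as lower-is-better in HIGHER_IS_BETTER,
# so __init__ should have recorded the tag value "lower".
# get_tag is assumed here to be the skbase tag accessor inherited via BaseExperiment.
print(exp.get_tag("property:higher_or_lower_is_better"))  # expected: "lower"

Passing ``accuracy_score`` with a classifier instead would, by the same table lookup, record "higher".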