from functools import partial
import numpy as np
from sklearn.model_selection import check_cv
from sklearn.utils.metaestimators import available_if
from sklearn.utils import check_array, check_random_state
from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone
from sklearn.metrics import check_scoring
from eli5.permutation_importance import get_score_importances
from eli5.sklearn.utils import pandas_available, is_classifier
if pandas_available:
import pandas as pd
def _wrapped_estimator_has(attr):
def check(self):
return hasattr(self.wrapped_estimator_, attr)
return check
def _estimator_has(attr):
def check(self):
return hasattr(self.estimator, attr)
return check
CAVEATS_CV_NONE = """
Feature importances are computed on the same data as used for training,
i.e. feature importances don't reflect importance of features for
generalization.
"""
CAVEATS_CV = """
Feature importances are not computed for the final estimator;
they are computed for a sequence of estimators trained and evaluated
on train/test splits. So they tell you about importances of features
for generalization, but not feature importances of a particular trained model.
"""
CAVEATS_PREFIT = """
If feature importances are computed on the same data as used for training,
they don't reflect importance of features for generalization. Use a held-out
dataset if you want generalization feature importances.
"""
class PermutationImportance(BaseEstimator, MetaEstimatorMixin):
"""Meta-estimator which computes ``feature_importances_`` attribute
based on permutation importance (also known as mean score decrease).
:class:`~PermutationImportance` instance can be used instead of
its wrapped estimator, as it exposes all estimator's common methods like
``predict``.
There are 3 main modes of operation:
1. cv="prefit" (pre-fit estimator is passed). You can call
PermutationImportance.fit either with training data, or
with a held-out dataset (in the latter case ``feature_importances_``
       would be importances of features for generalization). After fitting,
       the ``feature_importances_`` attribute becomes available, but the estimator
itself is not fit again. When cv="prefit",
:meth:`~PermutationImportance.fit` must be called
directly, and :class:`~PermutationImportance` cannot be used with
``cross_val_score``, ``GridSearchCV`` and similar utilities that clone
the estimator.
2. cv=None. In this case :meth:`~PermutationImportance.fit` method fits
the estimator and computes feature importances on the same data, i.e.
feature importances don't reflect importance of features for
generalization.
3. all other ``cv`` values. :meth:`~PermutationImportance.fit` method
fits the estimator, but instead of computing feature importances for
the concrete estimator which is fit, importances are computed for
a sequence of estimators trained and evaluated on train/test splits
according to ``cv``, and then averaged. This is more resource-intensive
(estimators are fit multiple times), and importances are not computed
for the final estimator, but ``feature_importances_`` show importances
of features for generalization.
    Mode (1) is most useful for inspecting an existing estimator; modes
    (2) and (3) can also be used for feature selection, e.g. together with
    sklearn's SelectFromModel or RFE; see the usage sketch below.
Currently :class:`~PermutationImportance` works with dense data.
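    A minimal usage sketch (the dataset, estimator and parameter values
    below are illustrative, not recommendations)::

        from sklearn.datasets import make_classification
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.feature_selection import SelectFromModel
        from eli5.sklearn import PermutationImportance

        X, y = make_classification(n_samples=200, n_features=8,
                                   random_state=0)

        # mode (1): inspect an already-fitted estimator
        est = RandomForestClassifier(random_state=0).fit(X, y)
        perm = PermutationImportance(est, cv='prefit',
                                     random_state=0).fit(X, y)
        print(perm.feature_importances_)
        perm.predict(X)  # common methods are proxied to the estimator

        # mode (3): cross-validated importances, e.g. for feature selection
        sel = SelectFromModel(
            PermutationImportance(RandomForestClassifier(random_state=0),
                                  cv=5),
            threshold=0.01,
        ).fit(X, y)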
Parameters
----------
    estimator : object
        The base estimator. This can be either a fitted estimator
        (if ``cv`` is set to "prefit") or a non-fitted estimator.
scoring : string, callable or None, default=None
Scoring function to use for computing feature importances.
A string with scoring name (see scikit-learn `docs`_) or
a scorer callable object / function with signature
``scorer(estimator, X, y)``.
If ``None``, the ``score`` method of the estimator is used.
.. _docs: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
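        For example, a hypothetical custom scorer (any callable with this
        signature can be used)::

            from sklearn.metrics import mean_absolute_error

            def neg_mae_scorer(estimator, X, y):
                # negated so that larger is better, as scorers require
                return -mean_absolute_error(y, estimator.predict(X))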
    n_iter : int, default=5
Number of random shuffle iterations. Decrease to improve speed,
increase to get more precise estimates.
    random_state : integer or numpy.random.RandomState, optional
        Pseudo-random number generator state used for shuffling features.
cv : int, cross-validation generator, iterable or "prefit"
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to disable cross-validation and compute feature importances
on the same data as used for training.
- integer, to specify the number of folds.
- An object to be used as a cross-validation generator.
- An iterable yielding train/test splits.
- "prefit" string constant (default).
If "prefit" is passed, it is assumed that ``estimator`` has been
fitted already and all data is used for computing feature importances.
refit : bool
Whether to fit the estimator on the whole data if cross-validation
is used (default is True).
Attributes
----------
feature_importances_ : array
Feature importances, computed as mean decrease of the score when
a feature is permuted (i.e. becomes noise).
feature_importances_std_ : array
Standard deviations of feature importances.
results_ : list of arrays
A list of score decreases for all experiments.
    scores_ : array of float
        Base scores for all experiments (with no features permuted).
estimator_ : an estimator
The base estimator from which the :class:`~PermutationImportance`
instance is built. This is stored only when a non-fitted estimator
        is passed to the :class:`~PermutationImportance`, i.e. when ``cv`` is
not "prefit".
    rng_ : numpy.random.RandomState
        Random state used for feature shuffling, derived from ``random_state``.
"""
def __init__(self, estimator, scoring=None, n_iter=5, random_state=None,
cv='prefit', refit=True):
if isinstance(cv, str) and cv != "prefit":
raise ValueError("Invalid cv value: {!r}".format(cv))
self.refit = refit
self.estimator = estimator
self.scoring = scoring
self.n_iter = n_iter
self.random_state = random_state
self.cv = cv
self.rng_ = check_random_state(random_state)
    def _wrap_scorer(self, base_scorer, pd_columns):
        # Rebuild a DataFrame around the permuted numpy array so that
        # column-aware scorers and estimators see the original columns.
        def pd_scorer(model, X, y):
            X = pd.DataFrame(X, columns=pd_columns)
            return base_scorer(model, X, y)
        return pd_scorer
def fit(self, X, y, groups=None, **fit_params) -> 'PermutationImportance':
"""Compute ``feature_importances_`` attribute and optionally
fit the base estimator.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The training input samples.
y : array-like, shape (n_samples,)
The target values (integers that correspond to classes in
classification, real numbers in regression).
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set.
        **fit_params : dict
            Other estimator-specific parameters passed to ``estimator.fit``.
Returns
-------
self : object
Returns self.
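        Examples
        --------
        A grouped cross-validation sketch (the estimator, data and
        ``groups`` values below are illustrative)::

            from sklearn.model_selection import GroupKFold

            perm = PermutationImportance(estimator,
                                         cv=GroupKFold(n_splits=2))
            perm.fit(X, y, groups=groups)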
"""
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
        if pandas_available and isinstance(X, pd.DataFrame):
            # keep DataFrame column names available to the scorer after
            # X is converted to a plain array below
            self.scorer_ = self._wrap_scorer(self.scorer_, X.columns)
        if self.cv != "prefit" and self.refit:
            self.estimator_ = clone(self.estimator)
            self.estimator_.fit(X, y, **fit_params)
        X = check_array(X, ensure_all_finite='allow-nan')
if self.cv not in (None, "prefit"):
si = self._cv_scores_importances(X, y, groups=groups, **fit_params)
else:
si = self._non_cv_scores_importances(X, y)
scores, results = si
self.scores_ = np.array(scores)
self.results_ = results
self.feature_importances_ = np.mean(results, axis=0)
self.feature_importances_std_ = np.std(results, axis=0)
return self
def _cv_scores_importances(self, X, y, groups=None, **fit_params):
assert self.cv is not None
cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        feature_importances: list[np.ndarray] = []
        base_scores: list[float] = []
        weights = fit_params.pop('sample_weight', None)
        fold_fit_params = fit_params.copy()
        for train, test in cv.split(X, y, groups):
            if weights is not None:
                # pass only the training fold's sample weights to the fit
                fold_fit_params['sample_weight'] = weights[train]
est = clone(self.estimator).fit(X[train], y[train], **fold_fit_params)
score_func = partial(self.scorer_, est)
_base_score, _importances = self._get_score_importances(
score_func, X[test], y[test])
base_scores.extend([_base_score] * len(_importances))
feature_importances.extend(_importances)
return base_scores, feature_importances
def _non_cv_scores_importances(self, X, y):
score_func = partial(self.scorer_, self.wrapped_estimator_)
base_score, importances = self._get_score_importances(score_func, X, y)
return [base_score] * len(importances), importances
def _get_score_importances(self, score_func, X, y):
return get_score_importances(score_func, X, y, n_iter=self.n_iter,
random_state=self.rng_)
@property
def caveats_(self) -> str:
if self.cv == 'prefit':
return CAVEATS_PREFIT
elif self.cv is None:
return CAVEATS_CV_NONE
return CAVEATS_CV
# ============= Exposed methods of a wrapped estimator:
@available_if(_wrapped_estimator_has('score'))
def score(self, X, y=None, *args, **kwargs):
return self.wrapped_estimator_.score(X, y, *args, **kwargs)
@available_if(_wrapped_estimator_has('predict'))
def predict(self, X):
return self.wrapped_estimator_.predict(X)
@available_if(_wrapped_estimator_has('predict_proba'))
def predict_proba(self, X):
return self.wrapped_estimator_.predict_proba(X)
@available_if(_wrapped_estimator_has('predict_log_proba'))
def predict_log_proba(self, X):
return self.wrapped_estimator_.predict_log_proba(X)
@available_if(_wrapped_estimator_has('decision_function'))
def decision_function(self, X):
return self.wrapped_estimator_.decision_function(X)
@available_if(_estimator_has('__sklearn_tags__'))
def __sklearn_tags__(self):
return self.estimator.__sklearn_tags__()
@property
def wrapped_estimator_(self):
if self.cv == "prefit" or not self.refit:
return self.estimator
return self.estimator_
@property
def _estimator_type(self):
return self.estimator._estimator_type
@property
def classes_(self):
return self.wrapped_estimator_.classes_