import re
import warnings
from functools import partial
from typing import Any, Optional, Pattern, Union
import numpy as np
import scipy.sparse as sp
import xgboost
from xgboost import (
XGBClassifier,
XGBRegressor,
Booster,
DMatrix
)
from eli5.explain import explain_weights, explain_prediction
from eli5.sklearn.utils import (
add_intercept,
get_X,
get_X0,
handle_vec,
predict_proba
)
from eli5.utils import is_sparse_vector
from eli5._decision_path import get_decision_path_explanation
from eli5._feature_importances import get_feature_importance_explanation
DESCRIPTION_XGBOOST = """
XGBoost feature importances; values are numbers 0 <= x <= 1;
all values sum to 1.
"""
@explain_weights.register(XGBClassifier)
@explain_weights.register(XGBRegressor)
@explain_weights.register(Booster)
def explain_weights_xgboost(xgb,
vec=None,
top=20,
target_names=None, # ignored
targets=None, # ignored
feature_names=None,
feature_re: Optional[Pattern[str]] = None,
feature_filter=None,
importance_type='gain',
):
"""
Return an explanation of an XGBoost estimator (via scikit-learn wrapper
XGBClassifier or XGBRegressor, or via xgboost.Booster)
as feature importances.
See :func:`eli5.explain_weights` for description of
``top``, ``feature_names``,
``feature_re`` and ``feature_filter`` parameters.
``target_names`` and ``targets`` parameters are ignored.
Parameters
----------
importance_type : str, optional
A way to get feature importance. Possible values are:
- 'gain' - the average gain of the feature when it is used in trees
(default)
- 'weight' - the number of times a feature is used to split the data
across all trees
- 'cover' - the average coverage of the feature when it is used in trees
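
For example, an illustrative sketch (``clf`` is assumed to be an already
fitted ``XGBClassifier``; it is not defined in this module)::

    import eli5
    explanation = eli5.explain_weights(clf, importance_type='cover', top=10)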
"""
booster, is_regression = _check_booster_args(xgb)
xgb_feature_names = _get_booster_feature_names(booster)
coef = _xgb_feature_importances(
booster,
importance_type=importance_type,
feature_names=xgb_feature_names
)
return get_feature_importance_explanation(
xgb, vec, coef,
feature_names=feature_names,
estimator_feature_names=xgb_feature_names,
feature_filter=feature_filter,
feature_re=feature_re,
top=top,
description=DESCRIPTION_XGBOOST,
is_regression=is_regression,
num_features=coef.shape[-1],
)
@explain_prediction.register(XGBClassifier)
@explain_prediction.register(XGBRegressor)
@explain_prediction.register(Booster)
def explain_prediction_xgboost(
xgb, doc,
vec=None,
top=None,
top_targets=None,
target_names=None,
targets=None,
feature_names=None,
feature_re: Optional[Pattern[str]] = None,
feature_filter=None,
vectorized: bool = False,
is_regression: Optional[bool] = None,
missing: Optional[Any] = None,
):
""" Return an explanation of XGBoost prediction (via scikit-learn wrapper
XGBClassifier or XGBRegressor, or via xgboost.Booster) as feature weights.
See :func:`eli5.explain_prediction` for description of
``top``, ``top_targets``, ``target_names``, ``targets``,
``feature_names``, ``feature_re`` and ``feature_filter`` parameters.
Parameters
----------
vec : vectorizer, optional
A vectorizer instance used to transform
raw features to the input of the estimator ``xgb``
(e.g. a fitted CountVectorizer instance); you can pass it
instead of ``feature_names``.
vectorized : bool, optional
A flag which tells eli5 if ``doc`` should be
passed through ``vec`` or not. By default it is False, meaning that
if ``vec`` is not None, ``vec.transform([doc])`` is passed to the
estimator. Set it to True if you're passing ``vec``,
but ``doc`` is already vectorized.
is_regression : bool, optional
Pass if an ``xgboost.Booster`` is passed as the first argument.
True if solving a regression problem ("objective" starts with "reg")
and False for a classification problem.
If not set, regression is assumed for a single target estimator
and proba will not be shown.
missing : optional
Pass if an ``xgboost.Booster`` is passed as the first argument.
Set it to the same value as the ``missing`` argument to
``xgboost.DMatrix``.
Matters only if sparse values are used. Default is ``np.nan``.
Method for determining feature importances follows an idea from
http://blog.datadive.net/interpreting-random-forests/.
Feature weights are calculated by following decision paths in trees
of an ensemble.
Each leaf has an output score, and expected scores can also be assigned
to parent nodes.
Contribution of one feature on the decision path is how much expected score
changes from parent to child.
Weights of all features sum to the output score of the estimator.
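
For example, an illustrative sketch (``clf`` is assumed to be a fitted
``XGBClassifier`` and ``X`` a numpy feature matrix; neither is defined here)::

    import eli5
    explanation = eli5.explain_prediction(clf, X[0], top=10)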
"""
if not xgboost.__version__.startswith(('0.', '1.')):
warnings.warn(
'This explanation might be incorrect: '
'only xgboost < 2.0.0 is known to work correctly')
booster, is_regression = _check_booster_args(xgb, is_regression)
xgb_feature_names = _get_booster_feature_names(booster)
vec, feature_names = handle_vec(
xgb, doc, vec, vectorized, feature_names,
num_features=len(xgb_feature_names))
if feature_names.bias_name is None:
# Some XGBoost estimators do not have an intercept, but here we interpret
# them as having one, reported as '<BIAS>'
feature_names.bias_name = '<BIAS>'
X = get_X(doc, vec, vectorized=vectorized)
if sp.issparse(X):
# Work around XGBoost issue:
# https://github.com/dmlc/xgboost/issues/1238#issuecomment-243872543
X = X.tocsc()
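# Use the same notion of "missing" as the original estimator:
# scikit-learn wrappers expose it as xgb.missing,
# while a bare Booster defaults to np.nan.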
if missing is None:
missing = np.nan if isinstance(xgb, Booster) else xgb.missing
dmatrix = DMatrix(X, missing=missing)
if isinstance(xgb, Booster):
prediction = xgb.predict(dmatrix)
n_targets: int = prediction.shape[-1]
if is_regression is None:
# When n_targets is 1, this can be classification too,
# but it's safer to assume regression.
# If n_targets > 1, it must be classification.
is_regression = n_targets == 1
if is_regression:
proba = None
else:
if n_targets == 1:
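# A single output is treated as the probability of the positive class;
# reconstruct probabilities for both classes from it.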
p, = prediction
proba = np.array([1 - p, p])
else:
proba, = prediction
else:
proba = predict_proba(xgb, X)
n_targets = _xgb_n_targets(xgb)
names: Union[list[str], np.ndarray]
if is_regression:
names = ['y']
elif isinstance(xgb, Booster):
names = np.arange(max(2, n_targets))
else:
names = xgb.classes_
scores_weights = _prediction_feature_weights(
booster, dmatrix, n_targets, feature_names, xgb_feature_names)
x = get_X0(add_intercept(X))
x = _missing_values_set_to_nan(x, missing, sparse_missing=True)
return get_decision_path_explanation(
xgb, doc, vec,
x=x,
feature_names=feature_names,
feature_filter=feature_filter,
feature_re=feature_re,
top=top,
vectorized=vectorized,
original_display_names=names,
target_names=target_names,
targets=targets,
top_targets=top_targets,
is_regression=is_regression,
is_multiclass=n_targets > 1,
proba=proba,
get_score_weights=lambda label_id: scores_weights[label_id],
)
def _check_booster_args(xgb, is_regression: Optional[bool] = None) -> tuple[Booster, Optional[bool]]:
if isinstance(xgb, Booster):
booster = xgb
else:
if hasattr(xgb, 'get_booster'):
booster = xgb.get_booster()
else: # xgb < 0.7
booster = xgb.booster()
_is_regression = isinstance(xgb, XGBRegressor)
if is_regression is not None and is_regression != _is_regression:
raise ValueError(
'Inconsistent is_regression={} passed. '
'You don\'t have to pass it when using scikit-learn API'
.format(is_regression))
is_regression = _is_regression
return booster, is_regression
def _prediction_feature_weights(booster, dmatrix, n_targets,
feature_names, xgb_feature_names):
""" For each target, return score and numpy array with feature weights
on this prediction, following an idea from
http://blog.datadive.net/interpreting-random-forests/
"""
# XGBClassifier does not have pred_leaf argument, so use booster
leaf_ids, = booster.predict(dmatrix, pred_leaf=True)
xgb_feature_names = {f: i for i, f in enumerate(xgb_feature_names)}
tree_dumps = booster.get_dump(with_stats=True)
assert len(tree_dumps) == len(leaf_ids)
target_feature_weights = partial(
_target_feature_weights,
feature_names=feature_names, xgb_feature_names=xgb_feature_names)
if n_targets > 1:
# For multiclass, XGBoost stores dumps and leaf_ids in a 1d array,
# so we need to split them.
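# E.g. with n_targets == 3, the trees for target 0 sit at indices
# 0, 3, 6, ..., hence the [target_idx::n_targets] stride below.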
scores_weights = [
target_feature_weights(
leaf_ids[target_idx::n_targets],
tree_dumps[target_idx::n_targets],
) for target_idx in range(n_targets)]
else:
scores_weights = [target_feature_weights(leaf_ids, tree_dumps)]
return scores_weights
def _target_feature_weights(leaf_ids, tree_dumps, feature_names,
xgb_feature_names):
feature_weights = np.zeros(len(feature_names))
# All trees in XGBoost contribute to the prediction on equal terms:
# it is equal to the sum of "leaf" values in the reached leaves,
# before applying the loss-specific function
# (e.g. logistic for "binary:logistic" loss).
score = 0
for text_dump, leaf_id in zip(tree_dumps, leaf_ids):
leaf = _indexed_leafs(_parse_tree_dump(text_dump))[leaf_id]
score += leaf['leaf']
path = [leaf]
while 'parent' in path[-1]:
path.append(path[-1]['parent'])
path.reverse()
# Check how each split changes "leaf" value
for node, child in zip(path, path[1:]):
idx = xgb_feature_names[node['split']]
feature_weights[idx] += child['leaf'] - node['leaf']
# Root "leaf" value is interpreted as bias
feature_weights[feature_names.bias_idx] += path[0]['leaf']
return score, feature_weights
def _indexed_leafs(parent):
""" Return a leaf nodeid -> node dictionary with
"parent" and "leaf" (average child "leaf" value) added to all nodes.
"""
if not parent.get('children'):
return {parent['nodeid']: parent}
indexed = {}
for child in parent['children']:
child['parent'] = parent
if 'leaf' in child:
indexed[child['nodeid']] = child
else:
indexed.update(_indexed_leafs(child))
parent['leaf'] = _parent_value(parent['children'])
return indexed
def _parent_value(children) -> float:
""" Value of the parent node: a sum of child "leaf" values weighted by cover.
"""
covers = np.array([child['cover'] for child in children])
covers /= np.sum(covers)
leafs = np.array([child['leaf'] for child in children])
return np.sum(leafs * covers)
def _xgb_n_targets(xgb) -> int:
if isinstance(xgb, XGBClassifier):
return 1 if xgb.n_classes_ == 2 else xgb.n_classes_
elif isinstance(xgb, XGBRegressor):
return 1
else:
raise TypeError
def _get_booster_feature_names(booster):
# xgboost >= 1.4.0 returns None when feature names are missing,
# while previous versions return a list f0, f1, f2, ...
if booster.feature_names is not None:
return booster.feature_names
return ["f{}".format(i) for i in range(booster.num_features())]
def _xgb_feature_importances(booster, importance_type, feature_names):
fs = booster.get_score(importance_type=importance_type)
all_features = np.array(
[fs.get(f, 0.) for f in feature_names], dtype=np.float32)
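# Features never used in a split get zero importance;
# normalize so that the values sum to 1 (see DESCRIPTION_XGBOOST).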
return all_features / all_features.sum()
def _parse_tree_dump(text_dump: str) -> Optional[dict[str, Any]]:
""" Parse text tree dump (one item of a list returned by Booster.get_dump())
into the json format that will be used by the next XGBoost release.
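An illustrative fragment of the input (values are made up; in a real dump
each child line is prefixed by one tab character per level of depth,
omitted here for readability)::

    0:[f0<0.5] yes=1,no=2,missing=1,gain=12.5,cover=30
    1:leaf=-0.4,cover=12
    2:leaf=0.6,cover=18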
"""
result = None
stack: list[dict] = []
for line in text_dump.split('\n'):
if line:
depth, node = _parse_dump_line(line)
if depth == 0:
assert not stack
result = node
stack.append(node)
elif depth > len(stack):
raise ValueError('Unexpected dump structure')
else:
if depth < len(stack):
stack = stack[:depth]
stack[-1].setdefault('children', []).append(node)
stack.append(node)
return result
def _parse_dump_line(line: str) -> tuple[int, dict[str, Any]]:
branch_match = re.match(
r'^(\t*)(\d+):\[([^<]+)<([^\]]+)\] '
r'yes=(\d+),no=(\d+),missing=(\d+),'
r'gain=([^,]+),cover=(.+)$', line)
if branch_match:
tabs, node_id, feature, condition, yes, no, missing, gain, cover = \
branch_match.groups()
depth = len(tabs)
return depth, {
'depth': depth,
'nodeid': int(node_id),
'split': feature,
'split_condition': float(condition),
'yes': int(yes),
'no': int(no),
'missing': int(missing),
'gain': float(gain),
'cover': float(cover),
}
leaf_match = re.match(r'^(\t*)(\d+):leaf=([^,]+),cover=(.+)$', line)
if leaf_match:
tabs, node_id, value, cover = leaf_match.groups()
depth = len(tabs)
return depth, {
'nodeid': int(node_id),
'leaf': float(value),
'cover': float(cover),
}
raise ValueError('Line in unexpected format: {}'.format(line))
def _missing_values_set_to_nan(values, missing_value, sparse_missing):
""" Return a copy of values where missing values (equal to missing_value)
are replaced with nan. If sparse_missing is True,
entries missing in a sparse matrix will also be set to nan.
Sparse matrices will be converted to dense format.
"""
if sp.issparse(values):
assert values.shape[0] == 1
if sparse_missing and sp.issparse(values) and missing_value != 0:
# Nothing special needs to be done for missing_value == 0 because
# missing values are assumed to be zero in sparse matrices.
values_coo = values.tocoo()
values = values.toarray()[0]
missing_mask = values == 0
# entries explicitly stored in the sparse matrix are not missing,
# even if their value is zero
missing_mask[values_coo.col] = False
values[missing_mask] = np.nan
elif is_sparse_vector(values):
values = values.toarray()[0]
else:
values = values.copy()
if not np.isnan(missing_value):
values[values == missing_value] = np.nan
return values