Source code for eli5.xgboost

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from functools import partial
import re
from typing import Any, Dict, List, Tuple, Optional, Pattern

import numpy as np
import scipy.sparse as sp
from xgboost import (
    XGBClassifier,
    XGBRegressor,
    Booster,
    DMatrix
)

from eli5.explain import explain_weights, explain_prediction
from eli5.sklearn.utils import (
    add_intercept,
    get_X,
    get_X0,
    handle_vec,
    predict_proba
)
from eli5.utils import is_sparse_vector
from eli5._decision_path import get_decision_path_explanation
from eli5._feature_importances import get_feature_importance_explanation


DESCRIPTION_XGBOOST = """
XGBoost feature importances; values are numbers 0 <= x <= 1;
all values sum to 1.
"""


@explain_weights.register(XGBClassifier)
@explain_weights.register(XGBRegressor)
@explain_weights.register(Booster)
def explain_weights_xgboost(xgb,
                            vec=None,
                            top=20,
                            target_names=None,  # ignored
                            targets=None,  # ignored
                            feature_names=None,
                            feature_re=None,  # type: Pattern[str]
                            feature_filter=None,
                            importance_type='gain',
                            ):
    """
    Return an explanation of an XGBoost estimator (via scikit-learn wrapper
    XGBClassifier or XGBRegressor, or via xgboost.Booster)
    as feature importances.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``feature_names``, ``feature_re`` and ``feature_filter``
    parameters.

    ``target_names`` and ``targets`` parameters are ignored.

    Parameters
    ----------
    importance_type : str, optional
        A way to get feature importance. Possible values are:

        - 'gain' - the average gain of the feature when it is used in trees
          (default)
        - 'weight' - the number of times a feature is used to split the data
          across all trees
        - 'cover' - the average coverage of the feature when it is used in trees
    """
    booster, is_regression = _check_booster_args(xgb)
    xgb_feature_names = booster.feature_names
    coef = _xgb_feature_importances(booster, importance_type=importance_type)
    return get_feature_importance_explanation(
        xgb, vec, coef,
        feature_names=feature_names,
        estimator_feature_names=xgb_feature_names,
        feature_filter=feature_filter,
        feature_re=feature_re,
        top=top,
        description=DESCRIPTION_XGBOOST,
        is_regression=is_regression,
        num_features=coef.shape[-1],
    )
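

# Usage sketch (illustrative, not part of the library; ``X_train`` and
# ``y_train`` are assumed to exist). explain_weights_xgboost is normally
# reached through the eli5.explain_weights dispatcher:
#
#     import eli5
#     from xgboost import XGBClassifier
#     clf = XGBClassifier().fit(X_train, y_train)
#     expl = eli5.explain_weights(clf, top=10, importance_type='weight')
#     print(eli5.format_as_text(expl))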


@explain_prediction.register(XGBClassifier)
@explain_prediction.register(XGBRegressor)
@explain_prediction.register(Booster)
def explain_prediction_xgboost(
        xgb, doc,
        vec=None,
        top=None,
        top_targets=None,
        target_names=None,
        targets=None,
        feature_names=None,
        feature_re=None,  # type: Pattern[str]
        feature_filter=None,
        vectorized=False,  # type: bool
        is_regression=None,  # type: bool
        missing=None,  # type: Optional[float]
        ):
    """ Return an explanation of XGBoost prediction (via scikit-learn wrapper
    XGBClassifier or XGBRegressor, or via xgboost.Booster) as feature weights.

    See :func:`eli5.explain_prediction` for description of
    ``top``, ``top_targets``, ``target_names``, ``targets``,
    ``feature_names``, ``feature_re`` and ``feature_filter`` parameters.

    Parameters
    ----------
    vec : vectorizer, optional
        A vectorizer instance used to transform
        raw features to the input of the estimator ``xgb``
        (e.g. a fitted CountVectorizer instance); you can pass it
        instead of ``feature_names``.

    vectorized : bool, optional
        A flag which tells eli5 if ``doc`` should be
        passed through ``vec`` or not. By default it is False, meaning that
        if ``vec`` is not None, ``vec.transform([doc])`` is passed to the
        estimator. Set it to True if you're passing ``vec``,
        but ``doc`` is already vectorized.

    is_regression : bool, optional
        Pass if an ``xgboost.Booster`` is passed as the first argument.
        True if solving a regression problem ("objective" starts with "reg")
        and False for a classification problem.
        If not set, regression is assumed for a single target estimator
        and proba will not be shown.

    missing : optional
        Pass if an ``xgboost.Booster`` is passed as the first argument.
        Set it to the same value as the ``missing`` argument to
        ``xgboost.DMatrix``.
        Matters only if sparse values are used. Default is ``np.nan``.

    Method for determining feature importances follows an idea from
    http://blog.datadive.net/interpreting-random-forests/.
    Feature weights are calculated by following decision paths in trees
    of an ensemble.
    Each leaf has an output score, and expected scores can also be assigned
    to parent nodes.
    Contribution of one feature on the decision path is how much expected score
    changes from parent to child.
    Weights of all features sum to the output score of the estimator.
    """
    booster, is_regression = _check_booster_args(xgb, is_regression)
    xgb_feature_names = booster.feature_names
    vec, feature_names = handle_vec(
        xgb, doc, vec, vectorized, feature_names,
        num_features=len(xgb_feature_names))
    if feature_names.bias_name is None:
        # XGBoost estimators do not have an intercept, but here we interpret
        # them as having an intercept
        feature_names.bias_name = '<BIAS>'

    X = get_X(doc, vec, vectorized=vectorized)
    if sp.issparse(X):
        # Work around XGBoost issue:
        # https://github.com/dmlc/xgboost/issues/1238#issuecomment-243872543
        X = X.tocsc()

    if missing is None:
        missing = np.nan if isinstance(xgb, Booster) else xgb.missing
    dmatrix = DMatrix(X, missing=missing)

    if isinstance(xgb, Booster):
        prediction = xgb.predict(dmatrix)
        n_targets = prediction.shape[-1]  # type: int
        if is_regression is None:
            # When n_targets is 1, this can be classification too,
            # but it's safer to assume regression.
            # If n_targets > 1, it must be classification.
            is_regression = n_targets == 1
        if is_regression:
            proba = None
        else:
            if n_targets == 1:
                p, = prediction
                proba = np.array([1 - p, p])
            else:
                proba, = prediction
    else:
        proba = predict_proba(xgb, X)
        n_targets = _xgb_n_targets(xgb)

    if is_regression:
        names = ['y']
    elif isinstance(xgb, Booster):
        names = np.arange(max(2, n_targets))
    else:
        names = xgb.classes_

    scores_weights = _prediction_feature_weights(
        booster, dmatrix, n_targets, feature_names, xgb_feature_names)

    x = get_X0(add_intercept(X))
    x = _missing_values_set_to_nan(x, missing, sparse_missing=True)

    return get_decision_path_explanation(
        xgb, doc, vec,
        x=x,
        feature_names=feature_names,
        feature_filter=feature_filter,
        feature_re=feature_re,
        top=top,
        vectorized=vectorized,
        original_display_names=names,
        target_names=target_names,
        targets=targets,
        top_targets=top_targets,
        is_regression=is_regression,
        is_multiclass=n_targets > 1,
        proba=proba,
        get_score_weights=lambda label_id: scores_weights[label_id],
    )
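

# Usage sketch (illustrative, not part of the library; ``bst`` is assumed to
# be a trained xgboost.Booster and ``x_row`` a single-row numpy array or
# sparse matrix with the same columns as the training data). With a raw
# Booster, ``is_regression`` and ``missing`` may need to be passed explicitly:
#
#     import numpy as np
#     import eli5
#     expl = eli5.explain_prediction(bst, x_row,
#                                    is_regression=True, missing=np.nan)
#     print(eli5.format_as_text(expl))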


def _check_booster_args(xgb, is_regression=None):
    # type: (Any, Optional[bool]) -> Tuple[Booster, Optional[bool]]
    if isinstance(xgb, Booster):
        booster = xgb
    else:
        if hasattr(xgb, 'get_booster'):
            booster = xgb.get_booster()
        else:  # xgb < 0.7
            booster = xgb.booster()
        _is_regression = isinstance(xgb, XGBRegressor)
        if is_regression is not None and is_regression != _is_regression:
            raise ValueError(
                'Inconsistent is_regression={} passed. '
                'You don\'t have to pass it when using scikit-learn API'
                .format(is_regression))
        is_regression = _is_regression
    return booster, is_regression


def _prediction_feature_weights(booster, dmatrix, n_targets,
                                feature_names, xgb_feature_names):
    """ For each target, return score and numpy array with feature weights
    on this prediction, following an idea from
    http://blog.datadive.net/interpreting-random-forests/
    """
    # XGBClassifier does not have pred_leaf argument, so use booster
    leaf_ids, = booster.predict(dmatrix, pred_leaf=True)
    xgb_feature_names = {f: i for i, f in enumerate(xgb_feature_names)}
    tree_dumps = booster.get_dump(with_stats=True)
    assert len(tree_dumps) == len(leaf_ids)

    target_feature_weights = partial(
        _target_feature_weights,
        feature_names=feature_names, xgb_feature_names=xgb_feature_names)
    if n_targets > 1:
        # For multiclass, XGBoost stores dumps and leaf_ids in a 1d array,
        # so we need to split them.
        scores_weights = [
            target_feature_weights(
                leaf_ids[target_idx::n_targets],
                tree_dumps[target_idx::n_targets],
            ) for target_idx in range(n_targets)]
    else:
        scores_weights = [target_feature_weights(leaf_ids, tree_dumps)]
    return scores_weights


def _target_feature_weights(leaf_ids, tree_dumps, feature_names,
                            xgb_feature_names):
    feature_weights = np.zeros(len(feature_names))
    # All trees in XGBoost give equal contribution to the prediction:
    # it is equal to the sum of "leaf" values in the leaves
    # before applying loss-specific function
    # (e.g. logistic for "binary:logistic" loss).
    score = 0
    for text_dump, leaf_id in zip(tree_dumps, leaf_ids):
        leaf = _indexed_leafs(_parse_tree_dump(text_dump))[leaf_id]
        score += leaf['leaf']
        path = [leaf]
        while 'parent' in path[-1]:
            path.append(path[-1]['parent'])
        path.reverse()
        # Check how each split changes "leaf" value
        for node, child in zip(path, path[1:]):
            idx = xgb_feature_names[node['split']]
            feature_weights[idx] += child['leaf'] - node['leaf']
        # Root "leaf" value is interpreted as bias
        feature_weights[feature_names.bias_idx] += path[0]['leaf']
    return score, feature_weights


def _indexed_leafs(parent):
    """ Return a leaf nodeid -> node dictionary with
    "parent" and "leaf" (average child "leaf" value) added to all nodes.
    """
    if not parent.get('children'):
        return {parent['nodeid']: parent}
    indexed = {}
    for child in parent['children']:
        child['parent'] = parent
        if 'leaf' in child:
            indexed[child['nodeid']] = child
        else:
            indexed.update(_indexed_leafs(child))
    parent['leaf'] = _parent_value(parent['children'])
    return indexed


def _parent_value(children):
    # type: (...) -> float
    """ Value of the parent node: a weighted sum of child values.
    """
    covers = np.array([child['cover'] for child in children])
    covers /= np.sum(covers)
    leafs = np.array([child['leaf'] for child in children])
    return np.sum(leafs * covers)
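

# Worked illustration (hypothetical numbers, not taken from a real model):
# for two sibling leaves {'leaf': 2.0, 'cover': 30} and {'leaf': -1.0,
# 'cover': 10}, _parent_value returns 2.0 * 0.75 + (-1.0) * 0.25 = 1.25.
# In _target_feature_weights the feature used at that split is then credited
# with child_leaf - 1.25 for whichever of the two leaves the sample reaches.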


def _xgb_n_targets(xgb):
    # type: (...) -> int
    if isinstance(xgb, XGBClassifier):
        return 1 if xgb.n_classes_ == 2 else xgb.n_classes_
    elif isinstance(xgb, XGBRegressor):
        return 1
    else:
        raise TypeError


def _xgb_feature_importances(booster, importance_type):
    fs = booster.get_score(importance_type=importance_type)
    all_features = np.array(
        [fs.get(f, 0.) for f in booster.feature_names], dtype=np.float32)
    return all_features / all_features.sum()


def _parse_tree_dump(text_dump):
    # type: (str) -> Optional[Dict[str, Any]]
    """ Parse text tree dump (one item of a list returned by
    Booster.get_dump()) into the json format that will be used by the next
    XGBoost release.
    """
    result = None
    stack = []  # type: List[Dict]
    for line in text_dump.split('\n'):
        if line:
            depth, node = _parse_dump_line(line)
            if depth == 0:
                assert not stack
                result = node
                stack.append(node)
            elif depth > len(stack):
                raise ValueError('Unexpected dump structure')
            else:
                if depth < len(stack):
                    stack = stack[:depth]
                stack[-1].setdefault('children', []).append(node)
                stack.append(node)
    return result


def _parse_dump_line(line):
    # type: (str) -> Tuple[int, Dict[str, Any]]
    branch_match = re.match(
        r'^(\t*)(\d+):\[([^<]+)<([^\]]+)\] '
        r'yes=(\d+),no=(\d+),missing=(\d+),'
        r'gain=([^,]+),cover=(.+)$', line)
    if branch_match:
        tabs, node_id, feature, condition, yes, no, missing, gain, cover = \
            branch_match.groups()
        depth = len(tabs)
        return depth, {
            'depth': depth,
            'nodeid': int(node_id),
            'split': feature,
            'split_condition': float(condition),
            'yes': int(yes),
            'no': int(no),
            'missing': int(missing),
            'gain': float(gain),
            'cover': float(cover),
        }
    leaf_match = re.match(r'^(\t*)(\d+):leaf=([^,]+),cover=(.+)$', line)
    if leaf_match:
        tabs, node_id, value, cover = leaf_match.groups()
        depth = len(tabs)
        return depth, {
            'nodeid': int(node_id),
            'leaf': float(value),
            'cover': float(cover),
        }
    raise ValueError('Line in unexpected format: {}'.format(line))


def _missing_values_set_to_nan(values, missing_value, sparse_missing):
    """ Return a copy of values where missing values (equal to missing_value)
    are replaced with nan. If sparse_missing is True, entries missing in a
    sparse matrix will also be set to nan.
    Sparse matrices will be converted to dense format.
    """
    if sp.issparse(values):
        assert values.shape[0] == 1
    if sparse_missing and sp.issparse(values) and missing_value != 0:
        # Nothing special needs to be done for missing_value == 0 because
        # missing values are assumed to be zero in sparse matrices.
        values_coo = values.tocoo()
        values = values.toarray()[0]
        missing_mask = values == 0
        # fix for possible zero values
        missing_mask[values_coo.col] = False
        values[missing_mask] = np.nan
    elif is_sparse_vector(values):
        values = values.toarray()[0]
    else:
        values = values.copy()
    if not np.isnan(missing_value):
        values[values == missing_value] = np.nan
    return values
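

# Example of the text format handled by _parse_tree_dump / _parse_dump_line
# (the shape follows Booster.get_dump(with_stats=True); the numbers are made
# up, and "\t" stands for a literal tab character prefixing child lines):
#
#     0:[f0<0.5] yes=1,no=2,missing=1,gain=10.5,cover=100
#     \t1:leaf=-0.1,cover=60
#     \t2:leaf=0.2,cover=40
#
# This is parsed into a nested dict keyed by 'children'; _indexed_leafs then
# adds 'parent' links and an expected 'leaf' value for each non-leaf node.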