I am applying xgboost to the below data set and getting prediction, I am also able to get the most important feature for the over all model, However I would also like to know for every prediction which are the most important features, I am able to use dalex package to find for every prediction the important variables but getting error
Please find below the code
rm(list=ls(all=T))
library("iBreakDown")
library("breakDown")
library("xgboost")
library("DALEX")
library("ingredients")
data(HR_data)
head(HR_data)
table(HR_data$left)
str(HR_data)
label<-HR_data$left
HR_data<-HR_data%>%select(-c(sales,salary,left))
#trian and tes split
n=nrow(HR_data)
train.index = sample(n,floor(0.75*n))
train.data = as.matrix(HR_data[train.index,])
train.label = label[train.index]
test.data = as.matrix(HR_data[-train.index,])
test.label = label[-train.index]
## set the seed to make your partition reproducible
xgb.train = xgb.DMatrix(data=train.data,label=train.label)
xgb.test = xgb.DMatrix(data=test.data,label=test.label)
params = list(
booster="gbtree",
eta=0.001,
max_depth=5,
gamma=3,
subsample=0.75,
colsample_bytree=1,
objective="binary:logistic",
eval_metric="auc"
)
xgb.fit=xgb.train(
params=params,
data=xgb.train,
nrounds=10000,
nthreads=1,
early_stopping_rounds=10,
watchlist=list(val1=xgb.train,val2=xgb.test),
verbose=0
)
xgb.fit
xgb.pred = predict(xgb.fit,test.data,reshape=T)
xgb.pred = as.data.frame(xgb.pred)
### important Variables
xi <- xgb.importance(colnames(xgb.train), model = xgb.fit)
### using train data to find the best attributes of it's prediciton
train_d<-as.data.frame(train.data)
train_l<-as.data.frame(train.label)
colnames(train_l)<-"left"
train_df<-cbind(train_d,train_l)
### exgboost explainer
library("DALEX")
model_martix_train <- model.matrix(train_df$left ~.-1,train_df)
data_train <- xgb.DMatrix(model_martix_train, label = train_df$left)
xgb_model <- xgb.train(param=params, data_train, nrounds = 50)
xgb_model
predict_logit <- function(model, x) {
raw_x <- predict(model, x)
exp(raw_x)/(1 + exp(raw_x))
}
logit <- function(x) exp(x)/(1+exp(x))
explainer_xgb <- explain(xgb_model,
data = model_martix_train,
y = train_df$left,
predict_function = predict_logit,
link = logit,
label = "xgboost")
nobs <- model_martix_train[1:50, , drop = FALSE]
sp_xgb <- break_down(explainer_xgb, observation = nobs)
I am getting error when using break_down error is
Error in break_down(explainer_xgb, observation = nobs) : unused argument (observation = nobs)
when I used the below code it is not giving error but when I try to use the same logic for my data set I am getting error.
The below code runs without error
library("iBreakDown")
library("breakDown")
library("xgboost")
library("DALEX")
library("ingredients")
data(HR_data)
model_martix_train <- model.matrix(left ~ . - 1, HR_data)
data_train <- xgb.DMatrix(model_martix_train, label = HR_data$left)
param <- list(max_depth = 2, eta = 1, silent = 1, nthread = 2,
objective = "binary:logistic", eval_metric = "auc")
HR_xgb_model <- xgb.train(param, data_train, nrounds = 50)
predict_logit <- function(model, x) {
raw_x <- predict(model, x)
exp(raw_x)/(1 + exp(raw_x))
}
logit <- function(x) exp(x)/(1+exp(x))
### Explainer from dalex
explainer_xgb <- explain(HR_xgb_model,
data = model_martix_train,
y = HR_data$left,
predict_function = predict_logit,
link = logit,
label = "xgboost")
### predicitons Plot
nobs <- model_martix_train[1, , drop = FALSE]
sp_xgb <-break_down(explainer_xgb, nobs)
plot(sp_xgb)
I would appreciate if someone could help me on this also if there is any other way to find best attributes for every prediction, the reason I am looking for some other alternate solution is because I have data frame with more 3 million rows and using dalex would be very time consuming.