I am attempting to use package 'iml' in R to create plots of SHAP values from a GBM model created in H2O.
When I try to create the R6 Predictor object using the Predictor.new()
function I get an error that states Error : all(feature.class %in% names(feature.types)) is not TRUE
.
From this I am guessing that there is something about one of the feature classes that is incorrect, but this is just an educated guess based upon what the error message is literally saying.
Here is a sample of anonymized data (I can't share the real data because it is confidential):
structure(list(dlr_id_cur = c(1, 2), date_eff = structure(c(16014,
15416), class = "Date"), new_vec_ind = structure(c(1L, 1L), .Label = c("NNA",
"UNA"), class = "factor"), cntrct_term = c(9587879614862828,
19), amt_financed = c(9455359, 65561175), reg_payment = c(885288,
389371), acct_stat_cd = structure(c(3L, 3L), .Label = c("11",
"22", "33"), class = "factor"), base_rental = c(1, 626266), down_pymt = c(2,
6654661), car_count = c(5, 1), dur_lease = c(3974, 6466), returned = structure(1:2, .Label = c("00",
"11"), class = "factor"), state = structure(c(10L, 1L), .Label = c("ANA",
"BNA", "CNA", "DNA", "FNA", "GNA", "HNA", "INA", "KNA", "LNA",
"MNA", "NNA", "ONA", "PNA", "QNA", "RNA", "SNA", "TNA", "UNA",
"VNA", "WNA"), class = "factor"), zip = c(34633, 45222), zip_two_digits = structure(c(71L,
36L), .Label = c("00", "01", "02", "03", "04", "05", "06", "07",
"08", "09", "110", "111", "112", "113", "114", "115", "116",
"117", "118", "119", "220", "221", "222", "223", "224", "225",
"226", "227", "228", "229", "330", "331", "332", "333", "334",
"335", "336", "337", "338", "339", "440", "441", "442", "443",
"444", "445", "446", "447", "448", "449", "550", "551", "552",
"553", "554", "555", "556", "557", "558", "559", "660", "661",
"662", "663", "664", "665", "666", "667", "668", "669", "770",
"771", "772", "773", "774", "775", "776", "777", "778", "779",
"880", "881", "882", "883", "884", "885", "886", "887", "888",
"889", "990", "991", "992", "993", "994", "995", "996", "997",
"998", "999", "ANA", "BNA", "CNA", "ENA", "GNA", "HNA", "JNA",
"KNA", "LNA", "MNA", "NNA", "PNA", "RNA", "SNA", "TNA", "VNA"
), class = "factor")
, mod_year_date = c(8156, 6278), vehic_mod_fam_code = structure(c(2L,
2L), .Label = c("BNA", "CNA", "ENA", "MNA", "SNA", "TNA", "VNA",
"XNA"), class = "factor"), mod_class_code = structure(c(4L, 2L
), .Label = c("BNA", "CNA", "ENA", "GNA", "MNA", "RNA", "SNA"
), class = "factor"), count_dl_DL_CDE_CSPS_A_NP = c(945, 337),
DL_CDE_CSPS_A_NP_avg_dl = c(3355188283749626, 8835582388327814
), count_sv_DL_CDE_CSPS_A_NP = c(6532, 8475), DL_CDE_CSPS_A_NP_avg_sv = c(4471193398278526,
6934672627789796), count_dl_NUM_CSPS_INIT_SCR = c(774, 773
), NUM_CSPS_INIT_SCR_avg_dl = c(9468453388562312, 5847816458727333
), count_sv_NUM_CSPS_INIT_SCR = c(2467, 3882), NUM_CSPS_INIT_SCR_avg_sv = c(5857936629789154,
8963457353776469), count_FFV = c(8563, 2566), average_FFV = c(25697792913881564,
13693335921646120), csps_NUM_SV = c(8, 6), avg_SV_rating = c(9817541424596360,
6218928542331853), csps_FFV_ratio = c(23125612473476952,
2), avg_DL_rating = c(2182256921592387, 7668957586431513),
has_DL_rating = c(1, 8), has_bad_DL_rating = c(2, 4), serv_has_MNT = c(7,
3), serv_has_SCP = c(5, 4), serv_has_ELW = c(9, 4), serv_has_LCP = c(7,
1), ro_count = c(6, 1), ro_tot_cust_pay = c(2, 188759), ro_tot_pay = c(3,
764372), date_eff_weekday = structure(c(4L, 3L), .Label = c("FNA",
"MNA", "SNA", "TNA", "WNA"), class = "factor"), date_eff_month_int = c(83,
7), date_eff_day = c(2, 24)), .Names = c("dlr_id_cur", "date_eff",
"new_vec_ind", "cntrct_term", "amt_financed", "reg_payment",
"acct_stat_cd", "base_rental", "down_pymt", "car_count", "dur_lease",
"returned", "state", "zip", "zip_two_digits", "mod_year_date",
"vehic_mod_fam_code", "mod_class_code", "count_dl_DL_CDE_CSPS_A_NP",
"DL_CDE_CSPS_A_NP_avg_dl", "count_sv_DL_CDE_CSPS_A_NP", "DL_CDE_CSPS_A_NP_avg_sv",
"count_dl_NUM_CSPS_INIT_SCR", "NUM_CSPS_INIT_SCR_avg_dl", "count_sv_NUM_CSPS_INIT_SCR",
"NUM_CSPS_INIT_SCR_avg_sv", "count_FFV", "average_FFV", "csps_NUM_SV",
"avg_SV_rating", "csps_FFV_ratio", "avg_DL_rating", "has_DL_rating",
"has_bad_DL_rating", "serv_has_MNT", "serv_has_SCP", "serv_has_ELW",
"serv_has_LCP", "ro_count", "ro_tot_cust_pay", "ro_tot_pay",
"date_eff_weekday", "date_eff_month_int", "date_eff_day"), row.names = 1:2, class = "data.frame")
# 1. create a data frame with just the features
features_iml <- as.data.frame(df_testR) %>% dplyr::select(-returned)
# 2. Create a vector with the actual responses
response_iml <- as.numeric(as.vector(df_testR$returned))
# 3. Create custom predict function that returns the predicted values as a
# vector (probability of customer churn in my example)
pred <- function(model, newdata) {
results <- as.data.frame(h2o.predict(model, as.h2o(newdata)))
return(results[[3L]])
}
# 4. example of prediction output
pred(GBM5, features_iml) %>% head()
# 5. create Predictor object
predictor = Predictor$new(model = GBM5, data = features_iml, y =
response_iml, predict.fun = pred, class = "classification")
Error : all(feature.class %in% names(feature.types)) is not TRUE
Here are also so basic descriptions of the dataset and model object I'm using in the code above:
class(GBM5)
[1] "H2OBinomialModel"
attr(,"package")
[1] "h2o"
class(df_testR)
[1] "tbl_df" "tbl" "data.frame"
dim(df_testR)
[1] 47006 44
If there is anything else I can provide or if I have been unclear please let me know.