4

Hi, I am trying to use the `explain` function from the R lime package on a model I trained. Everything works fine when I run this portion:

# Library
library(tm)
library(SnowballC)
library(caTools)
library(RWeka)
library(caret)
library(text2vec)
library(lime)

# Read the tab-separated reviews and turn the label column into a factor
dataset_original <- read.delim(
  'Restaurant_Reviews.tsv',
  quote = '',
  stringsAsFactors = FALSE
)
dataset_original$Liked <- as.factor(dataset_original$Liked)

# Reproducible 80/20 split into training and test sets
set.seed(123)
split <- sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set <- dataset_original[split, ]
test_set <- dataset_original[!split, ]

# Build a tm corpus from a character vector and normalise every document.
# text: character vector, one document per element.
# Returns the cleaned VCorpus.
clean_text <- function(text) {
  docs <- VCorpus(VectorSource(text))

  # Lower-case, then strip digits and punctuation
  early_steps <- list(
    content_transformer(tolower),
    removeNumbers,
    removePunctuation
  )
  for (step in early_steps) {
    docs <- tm_map(docs, step)
  }

  # Remove stop words (tm's default stop word list)
  docs <- tm_map(docs, removeWords, stopwords())

  # Stem the remaining words and collapse repeated whitespace
  for (step in list(stemDocument, stripWhitespace)) {
    docs <- tm_map(docs, step)
  }

  docs
}

# Tokenizer that emits n-grams of length 1 and 2 (min = 1, max = 2)
BigramTokenizer <- function(x) {
  ngram_range <- Weka_control(min = 1, max = 2)
  NGramTokenizer(x, ngram_range)
}

# Build a TF-IDF weighted document-term matrix and return it as a data frame.
# text: character vector, one document per element.
# Returns a data frame with one row per document and one column per term
# produced by BigramTokenizer, with columns sorted by name.
dtm <- function(text) {
  corpus <- VCorpus(VectorSource(text))
  term_matrix <- DocumentTermMatrix(
    corpus,
    control = list(weighting = weightTfIdf, tokenize = BigramTokenizer)
  )
  dataset <- as.data.frame(as.matrix(term_matrix))
  # drop = FALSE keeps a data frame even when there is only a single term;
  # without it the result collapses to a bare vector and loses its column name
  dataset[, order(names(dataset)), drop = FALSE]
}

# Clean the train and test text.
# Each corpus is cleaned exactly once and then flattened to a character
# vector. Calling clean_text() on the full column inside a per-row loop
# re-cleans every document for every row, which is quadratic in the number
# of reviews.
training_set$clean_text <- vapply(
  clean_text(training_set$Review),
  as.character,
  character(1),
  USE.NAMES = FALSE
)

test_set$clean_text <- vapply(
  clean_text(test_set$Review),
  as.character,
  character(1),
  USE.NAMES = FALSE
)

# Create document-term matrices from the cleaned text
dataset_train <- dtm(training_set$clean_text)
dataset_test <- dtm(test_set$clean_text)

# Give the test set exactly the training columns, in the training order:
# terms that only occur in the test set are dropped, and terms that are
# missing from the test set are added as all-zero columns.
train_terms <- colnames(dataset_train)
shared_terms <- intersect(colnames(dataset_test), train_terms)
missing_terms <- setdiff(train_terms, shared_terms)
# drop = FALSE keeps a data frame even when only one term is shared
dataset_test <- dataset_test[, shared_terms, drop = FALSE]
dataset_test[missing_terms] <- 0
# Index by the training column names so the order matches by construction
dataset_test <- dataset_test[, train_terms, drop = FALSE]

dataset_train <- as.matrix(dataset_train)
dataset_test <- as.matrix(dataset_test)

# Fit an "xgbTree" model with train() and predict on the aligned test matrix
set.seed(123)
model <- train(
  x = dataset_train,
  y = training_set$Liked,
  method = "xgbTree"
)
predict(model, newdata = dataset_test)

However when I run this part:

######
#LIME#
######
# Build a LIME explainer from the raw training reviews, passing `dtm` as the
# preprocess function that turns text into features for `model`.
# NOTE(review): `dtm` is given raw text here (clean_text() is not applied) and
# builds its columns from whatever text it receives, so its columns presumably
# differ from colnames(dataset_train), which `model` was trained on -- verify.
explainer <- lime(training_set$Review, model, preprocess = dtm)
# Explain the first training review: one label, five features
explanation <- explain(training_set$Review[1], explainer, n_labels = 1, n_features = 5)
plot_features(explanation)

It says:

 Error in predict.xgb.Booster(modelFit, newdata) : 
Feature names stored in `object` and `newdata` are different!

I ensured that my train and test data had the same column names and the same number of columns before running this. I have also looked around and found that my problem looks similar to the one in this post, but I still do not understand how it applies to my case: "R: LIME returns error on different feature numbers when it's not the case".

I spent weeks working on this and searching online but to no avail so any help or guidance as to what I should do is greatly appreciated!


My data:

Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing

Lacri Mosa
  • 159
  • 1
  • 10

5 Answers5

8

I had the same problem when I updated xgboost package from v0.6.xxx to v0.7.xxx.

I solved it by ensuring not only that the column names in the train and test sets were the same, but also that the columns were in the same order.

Hope this works for you.

Pablo Cánovas
  • 151
  • 1
  • 6
2

I had exactly the same issue. The solution for me was to make sure the data passed to lime::lime includes only the predictor columns and NO RESPONSE COLUMN, and the same for the lime::explain function.

Louis Deng
  • 21
  • 1
0

Here is the code that works for me for the same problem. There is a little issue in your clean_text and dtm functions. You need to pass corpus to dtm not raw text; I merged them together.

# Encode the label as a factor
dataset_original$Liked <- as.factor(dataset_original$Liked)

# Reproducible 80/20 split into training and test sets
set.seed(123)
split <- sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set <- dataset_original[split, ]
test_set <- dataset_original[!split, ]

# Tokenizer that emits n-grams of length 1 and 2 (min = 1, max = 2)
BigramTokenizer <- function(x) {
  ngram_range <- Weka_control(min = 1, max = 2)
  NGramTokenizer(x, ngram_range)
}

# Clean raw text and convert it to a TF-IDF weighted document-term matrix.
# text: character vector, one document per element.
# Returns a plain matrix with one row per document and one column per term.
dtm <- function(text) {
  docs <- VCorpus(VectorSource(text))

  # Lower-case, then strip digits and punctuation
  early_steps <- list(
    content_transformer(tolower),
    removeNumbers,
    removePunctuation
  )
  for (step in early_steps) {
    docs <- tm_map(docs, step)
  }

  # Remove stop words, stem, and collapse repeated whitespace
  docs <- tm_map(docs, removeWords, stopwords())
  for (step in list(stemDocument, stripWhitespace)) {
    docs <- tm_map(docs, step)
  }

  # The cleaned corpus (not the raw text) goes into the term matrix
  weighted <- DocumentTermMatrix(
    docs,
    control = list(weighting = weightTfIdf, tokenize = BigramTokenizer)
  )

  as.matrix(weighted)
}

# Document-term matrices for the training and test reviews
dataset_train <- dtm(text = training_set$Review)
dataset_test <- dtm(text = test_set$Review)

# Re-express `b` in the column layout of `a`.
# a: matrix (or data frame) whose column names and order define the layout.
# b: matrix (or data frame) holding the values to carry over.
# Returns a numeric matrix with nrow(b) rows, b's row names, and exactly the
# columns of `a` in `a`'s order. Columns present in both are copied from `b`;
# columns only in `a` are filled with 0; columns only in `b` are dropped.
matrix_columns_same <- function(a, b) {
  # Coerce so that a data frame `b` cannot turn the result into a list when
  # several columns are copied at once
  b <- as.matrix(b)
  shared_cols <- intersect(colnames(a), colnames(b))
  result_matrix <- matrix(
    0,
    nrow = nrow(b),
    ncol = ncol(a),
    dimnames = list(rownames(b), colnames(a))
  )
  # Nothing to copy when the two inputs share no column names
  if (length(shared_cols) > 0) {
    result_matrix[, shared_cols] <- b[, shared_cols, drop = FALSE]
  }
  result_matrix
}
# Re-express the test matrix in the training column layout
dataset_test <- matrix_columns_same(dataset_train, dataset_test)

# Training parameters for the xgboost package
param <- list(
  max_depth = 3,
  eta = 0.1,
  objective = "binary:logistic",
  eval_metric = "error",
  nthread = 1
)

# xgb.DMatrix is namespace-qualified like xgb.train, so this also runs when
# xgboost has not been attached with library().
# The factor codes start at 1; subtracting 1 gives zero-based labels.
train_matrix <- xgboost::xgb.DMatrix(
  dataset_train,
  label = as.numeric(training_set$Liked) - 1
)
model <- xgboost::xgb.train(param, train_matrix, nrounds = 50)

predictions <- predict(model, dataset_test)

# text to explain
text_to_explain <- test_set$Review[1:4]

# dtm() builds its columns from whatever text it is given, so on new text
# its columns do not match the ones the model was trained on. Mapping the
# result onto the training columns (same names, same order) lets
# predict() accept every matrix produced during explanation.
lime_preprocess <- function(text) {
  matrix_columns_same(dataset_train, dtm(text))
}

explainer <- lime(text_to_explain, model, preprocess = lime_preprocess)
# Namespace-qualified so another attached package's explain() cannot mask it
explanation <- lime::explain(
  text_to_explain,
  explainer,
  n_labels = 1,
  n_features = 3
)
plot_features(explanation)

Please also see similar discussion in R Lime package for text data .

# #

Here is the code using your data. It works for me, please let me know if you get an error again.

# 
library(tm)
library(lime)
library(xgboost)

# Read the tab-separated reviews and turn the label column into a factor
dataset_original <- read.delim(
  './data/Restaurant_Reviews.tsv',
  quote = '',
  stringsAsFactors = FALSE
)
dataset_original$Liked <- as.factor(dataset_original$Liked)

# Keep only reviews with more than `nwords` non-empty, space-separated tokens
nwords <- 5
docs_split <- lapply(
  strsplit(dataset_original$Review, " "),
  function(tokens) tokens[tokens != ""]
)
ind_len <- which(lengths(docs_split) > nwords)
dataset_original <- dataset_original[ind_len, ]

# Label levels, used later to build the logical test labels
groups <- levels(dataset_original$Liked)

# Splitting the dataset into the Training set and Test set.
# sample.split is namespace-qualified because this script attaches only
# tm, lime and xgboost, not caTools.
set.seed(123)
split <- caTools::sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set <- subset(dataset_original, split == TRUE)
test_set <- subset(dataset_original, split == FALSE)

########################################
# Tokenizer that emits n-grams of length 1 and 2 (min = 1, max = 2).
# The RWeka functions are namespace-qualified because this script does not
# attach RWeka with library().
BigramTokenizer <- function(x) {
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 1, max = 2))
}

# Clean raw text and convert it to a term-frequency document-term table.
# text: character vector, one document per element.
# Returns a data frame with one row per document and one column per term
# kept after removeSparseTerms().
dtm <- function(text) {
  docs <- VCorpus(VectorSource(text))

  # Lower-case, then strip digits and punctuation
  early_steps <- list(
    content_transformer(tolower),
    removeNumbers,
    removePunctuation
  )
  for (step in early_steps) {
    docs <- tm_map(docs, step)
  }

  # Remove stop words, stem, and collapse repeated whitespace
  docs <- tm_map(docs, removeWords, stopwords())
  for (step in list(stemDocument, stripWhitespace)) {
    docs <- tm_map(docs, step)
  }

  counts <- DocumentTermMatrix(
    docs,
    control = list(weighting = weightTf, tokenize = BigramTokenizer)
  )
  # Prune terms using a 0.99 sparsity threshold
  counts <- removeSparseTerms(counts, 0.99)

  as.data.frame(as.matrix(counts))
}

# Document-term tables for the training and test reviews
dataset_train <- dtm(text = training_set$Review)
dataset_test <- dtm(text = test_set$Review)

# Replace the spaces in multi-word column names with underscores
colnames(dataset_train) <- gsub(" ", "_", colnames(dataset_train))
colnames(dataset_test) <- gsub(" ", "_", colnames(dataset_test))

########################################
# Re-express `b` in the column layout of `a`.
# a: matrix (or data frame) whose column names and order define the layout.
# b: matrix (or data frame) holding the values to carry over.
# Returns a numeric matrix with nrow(b) rows, b's row names, and exactly the
# columns of `a` in `a`'s order. Columns present in both are copied from `b`;
# columns only in `a` are filled with 0; columns only in `b` are dropped.
matrix_columns_same <- function(a, b) {
  # Coerce so that a data frame `b` cannot turn the result into a list when
  # several columns are copied at once
  b <- as.matrix(b)
  shared_cols <- intersect(colnames(a), colnames(b))
  result_matrix <- matrix(
    0,
    nrow = nrow(b),
    ncol = ncol(a),
    dimnames = list(rownames(b), colnames(a))
  )
  # Nothing to copy when the two inputs share no column names
  if (length(shared_cols) > 0) {
    result_matrix[, shared_cols] <- b[, shared_cols, drop = FALSE]
  }
  result_matrix
}

dataset_train <- as.matrix(dataset_train)
dataset_test <- as.matrix(dataset_test)

# Re-express the test matrix in the training column layout
dataset_test <- matrix_columns_same(dataset_train, dataset_test)

# Keep only documents whose feature row sums to more than `nword`, filtering
# the feature matrix and the source rows with the same index so they stay
# aligned. drop = FALSE keeps a matrix even if only one document survives.
nword <- 0
ind <- which(rowSums(dataset_train) > nword)
dataset_train <- dataset_train[ind, , drop = FALSE]
training_set <- training_set[ind, ]

ind <- which(rowSums(dataset_test) > nword)
dataset_test <- dataset_test[ind, , drop = FALSE]
test_set <- test_set[ind, ]

########################################
# Train with the xgboost package
param <- list(
  max_depth = 3,
  eta = 0.1,
  objective = "binary:logistic",
  eval_metric = "error",
  nthread = 1
)

# The factor codes start at 1; subtracting 1 gives zero-based labels
model <- xgboost::xgb.train(
  params = param,
  data = xgb.DMatrix(
    as.matrix(dataset_train),
    label = as.numeric(training_set$Liked) - 1
  ),
  nrounds = 50
)

# Threshold the predicted values at 0.5 and compare with the second level
predictions <- predict(model, as.matrix(dataset_test)) > 0.5
test_labels <- test_set$Liked == groups[2]

# Accuracy
caret::confusionMatrix(table(predictions, test_labels))

########################################
# lime
# Pick four test reviews at random to explain
ind_tr <- sample(seq_len(nrow(test_set)), 4, replace = FALSE)
text_to_explain <- test_set$Review[ind_tr]

# dtm() builds its columns from whatever text it is given (and prunes sparse
# terms), so on new text its columns do not match the ones the model was
# trained on. Apply the same column renaming as for the training data and
# map the result onto the training columns (same names, same order).
lime_preprocess <- function(text) {
  features <- as.matrix(dtm(text))
  colnames(features) <- gsub(" ", "_", colnames(features))
  matrix_columns_same(dataset_train, features)
}

explainer <- lime(
  text_to_explain,
  model,
  preprocess = lime_preprocess,
  bin_continuous = TRUE,
  n_bins = 4,
  n_permutations = 5000
)
explanation <- lime::explain(
  text_to_explain,
  explainer,
  n_labels = 1,
  n_features = 3
)
plot_features(explanation, ncol = 2)
Sam S.
  • 627
  • 1
  • 7
  • 23
  • Thanks once again. I still ran into the same error using your codes..... But i reuploaded the data. – Lacri Mosa Nov 07 '18 at 08:52
  • 1
    Thanks for providing the data. It is very short and NAs are created (says Predictions contains some NAs). I will work on my solution and let you know any updates. Meanwhile your functions text_clean and dtm should be merged into one function, otherwise explainer <- lime(training_set$Review, model, preprocess = dtm) will not do the right thing. – Sam S. Nov 08 '18 at 12:33
  • I added a solution using your data. Have you tried that? – Sam S. Nov 13 '18 at 03:41
  • Apologies, not as of yet. I will get back to you on this shortly. Thanks – Lacri Mosa Nov 13 '18 at 07:02
  • I still received error, Error in predict.xgb.Booster(x, newdata = newdata, reshape = TRUE, ...) : Feature names stored in `object` and `newdata` are different! – Lacri Mosa Nov 14 '18 at 01:45
  • My tentative workaround is to switch to using python LIME package, where they have a more solid example for using text especially SciKitLearn models. Personally, i feel R for LIME (which has been ported over from python) isn't as usable as Python. – Lacri Mosa Nov 14 '18 at 01:48
  • I ran the code again. No error on my system. The error may come from outdated packages or an outdated R version. Are they updated to the latest versions? explain can also be a function from another package, in which case you may get an error; that is why I added lime:: in front of it. There is another package called iml. Have you tried lime using iml? – Sam S. Nov 15 '18 at 02:02
0

I had the same problem when predicting from xgboost model.
In my case, I did a sparse.model.matrix transformation before training.

# Predictor names, and the formula rV ~ l8+l21+v8+v21+fa+fb built from them
varX=c('l8','l21','v8','v21','fa','fb')
f1=as.formula(paste0('rV','~',paste(varX,collapse='+')))
# The training features come from sparse.model.matrix applied to the formula
sparse_matrix=sparse.model.matrix(f1, data = rstac)
# `...` stands for further arguments not shown in this snippet
mod=xgboost(data=sparse_matrix,label=rV,...)

I got the error in

y=predict(mod,newdata=as.matrix(rstac[1:10,varX]))
Error in predict.xgb.Booster(mod, newdata = as.matrix(rstac[1:10, varX])) : 
Feature names stored in `object` and `newdata` are different!

I could see the features used in model in mod[[8]]:

mod[[8]]
[1] "(Intercept)"              "l8"                       "l21"                      "v8"                      
[5] "v21"                      "fa"                       "fb"

The (Intercept) column is missing from the matrix I passed to predict. Applying sparse.model.matrix to the new data before predicting worked:

y=predict(mod,newdata=sparse.model.matrix(~.,rstac[1:10,varX]))
y
[1] 0.3290127 0.3290127 0.6757481 0.6667279 0.6668081 0.6668081 0.3290127 0.2944945 0.2944945 0.2944945
xm1
  • 1,663
  • 1
  • 17
  • 28
0

Two things I experienced with latest xgboost Version 1.1.1.1

    1. The order of the columns must be the same in the xgboost training and test data.
    2. Models saved with older xgboost versions using 'save'/'saveRDS' are not compatible with newer versions; use xgb.save and xgb.load for saving and loading, respectively.

Also please go through the github source link for more information https://github.com/dmlc/xgboost/pull/5940

Aditya
  • 1