I am trying to find out a solution to Kaggle TFI Dataset.
But while building the model in R using Random Forest I found the following Error:
Error in
[.data.frame
(data, , all.vars(Terms), drop = FALSE) : undefined columns selected
Code:
library(Boruta)
library(caret)
train<-read.csv("train.csv")
names(train)
View(train)
str(train)
test<-read.csv("test.csv")
n.train<-nrow(train)
test$revenue<-1
myData<-rbind(train,test)
rm(train,test)
myData$Open.Date<-as.POSIXlt("01/01/2015", format="%m/%d/%Y") -
as.POSIXlt(myData$Open.Date, format="%m/%d/%Y")
myData$Open.Date<-as.numeric(myData$Open.Date/1000)
myData$City<-as.character(myData$City)
myData$City[myData$City.Group=="Other"]<-"Other"
myData$City[myData$City==unique(myData$City)[4]]<-unique(myData$City)[2]
myData$City<-as.factor(myData$City)
myData$City.Group<-NULL
myData$Type<-as.character(myData$Type)
myData$Type[myData$Type=="DT"]<-"IL"
myData$Type[myData$Type=="MB"]<-"FC"
myData$Type<-as.factor(myData$Type)
hist(train$revenue)
hist(log(train$revenue))
myData[,paste("P",1:37,sep="")]<-log(1+myData[,paste("P",1:37,sep="")])
myData$revenue<-log(myData$revenue)
head(myData,2)
important<-Boruta(revenue~.,data=myData[1:n.train,])
important$finalDecision
library(randomForest)
model<-train(myData$revenue[1:n.train]~.,method="rf",
data=myData[1:n.train,c(important$finalDecision!="Rejected",TRUE)])