I have this dataframe : https://www.kaggle.com/harlfoxem/housesalesprediction/version/1#kc_house_data.csv
and I need to create a linear regression model.
when I try to "factor" some features I get this Error : Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) : factor grade has new levels 1
And I don't know what to do, I think that I need to "factor" almost every feature that I use but I always get this error
My Code :
house.data.raw <- read.csv('housedata.csv')
library(ggplot2)
house.data.prepared <- house.data.raw
#convert to date type and structure
dates <- house.data.prepared$date
dates <- as.Date(dates, "%Y%m%dT000000")
dates <- format(dates, format="%d-%m-%Y")
house.data.prepared$date <- dates
house.data.prepared$date <- as.Date(house.data.prepared$date, "%d-%m-%Y")
#Remove all columns with one or more rows that contains "NA"
numberOfNA = length(which(is.na(house.data.prepared) == T))
if(numberOfNA > 0)
{
cat('Number of missing values: ', numberOfNA)
cat('\nRemoving missing values...')
house.data.prepared = house.data.prepared[complete.cases(house.data.prepared), ]
}
house.data.final$bedrooms <- factor(house.data.final$bedrooms)
house.data.final$floors <- factor(house.data.final$floors)
house.data.final$waterfront <- factor(house.data.final$waterfront)
house.data.final$view <- factor(house.data.final$view)
house.data.final$condition <- factor(house.data.final$condition)
house.data.final$grade <- factor(house.data.final$grade)
library(caTools)
filter <- sample.split(house.data.final$bedrooms, SplitRatio = 0.7)
#Training set
house.train <- subset(house.data.final, filter == T)
#test set
house.test <- subset(house.data.final, filter == F)
dim(house.data.final)
dim(house.train)
dim(house.test)
model <- lm(price ~ . ,house.train)
summary(model)
predict.train <- predict(model, house.train)
predict.test <- predict(model, house.test)