1
#Data Import
data<-read.csv("D:\\general\\insurance.csv")
#Data Exploration(EDA)
str(data)
summary(data)

#Data Preparation
#Outlier Detection & Treatment
#Box-Plot
outlier_values <- boxplot.stats(data$charges)$out  # outlier values
par(mar=c(2,2,2,2))
boxplot(data$charges, main="charges", boxwex=0.1)
mtext(paste("Outliers: ", paste(outlier_values, collapse=",")), cex=0.6)

user defined function for creating descriptive statistics

stats <- function(x) {
  iqr=IQR(x,na.rm=T)
  q1<-quantile(x,0.25,na.rm=T)
  q2<-quantile(x,0.5,na.rm=T)
  q3<-quantile(x,0.75,na.rm=T)
  UC<-q3+1.5*iqr
  LC<-q1-1.5*iqr
  min<-min(x,na.rm=T)
  max<-max(x,na.rm=T)
  return(c(q1=q1, q2=q2, q3=q3,UC=UC, LC=LC, min=min, max=max))
}

vars <- c( "age","sex", "bmi", "children","smoker","region","charges")
data_stats<-t(data.frame(apply(data[vars],2,stats)))
View(data_stats)

## OUTLIER Treatment
data$bmi[data$bmi>47.29]<-47.29
data$charges[data$charges>34489.35]<-34489.35

## Missing Value Detection & Imputation
#is.na(data)
sapply(data, function(x)sum(is.na(x)))
#library(Hmisc)
#data<-data.frame(apply(data[vars],2, function(x) impute(x, mean))) #Imputing missing values with mean

#Assumption
hist(data$charges) #Dependent variable should have normal distribution
#hist(log(data$charges))

#Density Plot
library(e1071)
par(mfrow=c(1, 2))  # divide graph area in 2 columns
plot(density(data$charges), main="Density Plot: Speed", ylab="Frequency", 
     sub=paste("Skewness:", round(e1071::skewness(data$charges), 2)))  # density plot for 'speed'
polygon(density(data$charges), col="red")

#Correlation
corrm<- cor(data)  
#write.csv(corrm, file = "Correlation Matrix.csv")
#Correlation visualisation
#options(repos = c(CRAN = "http://cran.rstudio.com"))
install.packages("corrplot")
library(corrplot)
corrplot(corrm)

I cant find what I did wrong there is an error which shows that x has to be numeric Build a linear regression model to predict the medical insurance cost. The dataset consists of customers 1338 observations & 7 variables.

The dataset consists of following parameters:

  • Age: insurance contractor age, years
  • Sex: insurance contractor gender, [female, male]
  • BMI: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height, objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9
  • Children: number of children covered by health insurance / Number of dependents
  • Smoker: smoking, [yes, no]
  • Region: the beneficiary’s residential area in the US, [northeast, southeast, southwest, northwest]
  • Charges: Individual medical costs billed by health insurance, $
Peter
  • 11,500
  • 5
  • 21
  • 31
Vikansh
  • 11
  • 1
  • Please include a [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) of your data. Thanks,. – user438383 Apr 06 '21 at 20:44
  • 1
    as the error says, at least one of the columns of `data` is not numeric – rawr Apr 06 '21 at 20:48

0 Answers0