# ---- Data import ----
# Load the insurance data set from a fixed local path.
data <- read.csv("D:\\general\\insurance.csv")

# ---- Data exploration (EDA) ----
str(data)      # structure: column types and a preview of each column
summary(data)  # per-column summary statistics

# ---- Data preparation: outlier detection & treatment ----
# Box plot of charges. boxplot.stats() returns, in `$out`, the points that lie
# beyond the whiskers; they are listed in the margin text above the plot.
outlier_values <- boxplot.stats(data[["charges"]])[["out"]]
par(mar = c(2, 2, 2, 2))
boxplot(data[["charges"]], main = "charges", boxwex = 0.1)
mtext(
  paste("Outliers: ", paste(outlier_values, collapse = ",")),
  cex = 0.6
)
# User-defined function for creating descriptive statistics.
#
# Computes the quartiles, the 1.5 * IQR outlier cutoffs and the range of a
# numeric vector, ignoring missing values.
#
# Args:
#   x: A numeric vector.
#
# Returns:
#   A named numeric vector of length 7: q1, q2, q3 (the 25%, 50% and 75%
#   quantiles), UC (q3 + 1.5 * IQR), LC (q1 - 1.5 * IQR), min and max.
#   quantile() labels its results, so the first five names carry a percentage
#   suffix ("q1.25%", "q2.50%", "q3.75%", "UC.75%", "LC.25%").
#
# Raises:
#   An error when x is not numeric.
stats <- function(x) {
  # Fail early with a readable message instead of deep inside quantile().
  if (!is.numeric(x)) {
    stop("`x` must be numeric, not ", class(x)[1], call. = FALSE)
  }

  iqr <- IQR(x, na.rm = TRUE)
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q2 <- quantile(x, 0.5, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)

  # Tukey fences: values outside [LC, UC] are treated as outliers.
  upper_cutoff <- q3 + 1.5 * iqr
  lower_cutoff <- q1 - 1.5 * iqr

  # Locals are deliberately not called min/max so the base functions are not
  # masked inside this function.
  x_min <- min(x, na.rm = TRUE)
  x_max <- max(x, na.rm = TRUE)

  c(
    q1 = q1, q2 = q2, q3 = q3,
    UC = upper_cutoff, LC = lower_cutoff,
    min = x_min, max = x_max
  )
}
vars <- c("age", "sex", "bmi", "children", "smoker", "region", "charges")

# Descriptive statistics only make sense for numeric columns. apply() would
# first convert data[vars] to a matrix, and because sex, smoker and region are
# categorical that matrix would be character, so stats() could not compute
# quantiles on any column. Iterate over the numeric columns only.
numeric_vars <- vars[vapply(data[vars], is.numeric, logical(1))]

# One row per variable, one column per statistic returned by stats().
data_stats <- t(vapply(data[numeric_vars], stats, numeric(7)))

# View() needs an interactive session; fall back to printing otherwise.
if (interactive()) {
  View(data_stats)
} else {
  print(data_stats)
}

## OUTLIER Treatment
# Cap bmi and charges at fixed upper limits.
# NOTE(review): 47.29 and 34489.35 look like the UC values reported in
# data_stats for bmi and charges -- confirm against the table above.
data$bmi <- pmin(data$bmi, 47.29)
data$charges <- pmin(data$charges, 34489.35)
## Missing Value Detection & Imputation
#is.na(data)
# Number of missing values in each column.
vapply(data, function(x) sum(is.na(x)), integer(1))
#library(Hmisc)
#data<-data.frame(apply(data[vars],2, function(x) impute(x, mean))) #Imputing missing values with mean

#Assumption
# First look at the shape of the dependent variable. NOTE: strictly, linear
# regression assumes normally distributed residuals; this histogram is only a
# quick check of how skewed charges is.
hist(data$charges)
#hist(log(data$charges))

#Density Plot
library(e1071)
# Split the device into 2 columns, remembering the previous layout so it can be
# restored afterwards and does not leak into later plots.
old_par <- par(mfrow = c(1, 2))

# Estimate the density once and reuse it for the outline and the fill.
charges_density <- density(data$charges)
plot(
  charges_density,
  main = "Density Plot: Charges",
  ylab = "Frequency",
  sub = paste("Skewness:", round(e1071::skewness(data$charges), 2))
) # density plot for 'charges'
polygon(charges_density, col = "red")

par(old_par)
#Correlation
# cor() requires numeric input and stops with "'x' must be numeric" when the
# data frame contains character columns (sex, smoker, region here), so only
# the numeric columns are correlated. To include the categorical variables
# they would first have to be encoded as numbers (e.g. dummy variables).
numeric_cols <- vapply(data, is.numeric, logical(1))
corrm <- cor(data[, numeric_cols, drop = FALSE])
#write.csv(corrm, file = "Correlation Matrix.csv")

#Correlation visualisation
#options(repos = c(CRAN = "http://cran.rstudio.com"))
# Install corrplot only when it is missing instead of on every run.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
corrplot(corrm)
I can't find what I did wrong: there is an error which says that 'x' must be numeric. The task is to build a linear regression model to predict the medical insurance cost. The dataset consists of 1338 observations (customers) and 7 variables.
The dataset consists of the following parameters:
- Age: insurance contractor age, years
- Sex: insurance contractor gender, [female, male]
- BMI: Body mass index, an objective index of body weight (kg / m^2) computed from height and weight, indicating whether a person's weight is relatively high or low for their height; ideally 18.5 to 24.9
- Children: number of children covered by health insurance / Number of dependents
- Smoker: smoking, [yes, no]
- Region: the beneficiary’s residential area in the US, [northeast, southeast, southwest, northwest]
- Charges: Individual medical costs billed by health insurance, $