Example testdata
testdata_1140.csv=structure(list(lead_create = structure(c(1L, 5L, 6L, 2L, 1L,
3L, 4L, 1L, 3L, 3L), .Label = c("2018-05-13T01:48:07Z", "2018-05-15T22:56:10Z",
"2018-05-15T23:20:03Z", "2018-05-16T05:08:13Z", "2018-05-17T09:51:09Z",
"2018-05-17T15:49:02Z"), class = "factor"), lead_id = c(33238869L,
33417893L, 33427728L, 33359746L, 33238869L, 33360456L, 33371790L,
33238869L, 33360456L, 33360456L), approved_at = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), called_at = structure(c(2L, 10L,
9L, 5L, 1L, 7L, 8L, 3L, 6L, 4L), .Label = c("2018-05-14T23:44:25Z",
"2018-05-15T01:56:42Z", "2018-05-15T23:22:32Z", "2018-05-16T00:36:37Z",
"2018-05-16T00:53:31Z", "2018-05-16T01:35:31Z", "2018-05-16T20:52:19Z",
"2018-05-17T03:51:26Z", "2018-05-17T20:41:04Z", "2018-05-18T02:16:42Z"
), class = "factor"), product = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "Titan Gel", class = "factor"),
lead_country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Taiwan", class = "factor"), class = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)), .Names = c("lead_create",
"lead_id", "approved_at", "called_at", "product", "lead_country",
"class"), class = "data.frame", row.names = c(NA, -10L))
I need create the plot for 1140 min. This dataset is part of large dataset and was taken as test data.
So to select random data i do so. The main task is to get smooth long plot.
But after code working I have a rough oscillator chart
like on this picture
How to do very long smooth plot on 1140 minutes?
Let it be on the entire monitor screen.
where every decline is smooth, and for a minute.
Desired plot i try draw in mspaint (sorry i can't cool draw :-) )
So i want see in detail each curve.
Are there ways to make the chart as detailed as possible so that the downturn curves are visible.
# Get 10 random leads
set.seed(0)
indV<-sample(x = 1:nrow(dt),size = 10,replace = F)
# Write to file this subset
fwrite(file = "testdata_1140.csv",x = dt[indV,])
# Progress bar
pb <- progress_bar$new(
format = "[:bar] :percent eta: :eta",
total = length(indV), clear = FALSE, width= 60)
for(i in 1:length(indV)) {
# Get main values for one lead id
mainV<-dt[indV[i],]
# Filtering data for current lead id
s_dt<-dt %>% filter(product==mainV$product,lead_country==mainV$lead_country)
s_dt$d<-s_dt$called_at-s_dt$lead_create
s_dt<-s_dt %>% select(-c(lead_create, called_at))
#head(s_dt)
#cat("The dimension is:",dim(s_dt),"\n")
# Create random forest model
set.seed(0) # For reproducive results
# Here we get indexies for 1 and 0 classes
pos0<-which(s_dt$class==0)
pos1<-which(s_dt$class==1)
# Find the minimum number of classes
min_l<-min(c(length(pos0),length(pos1)))
# Restricted the large sizes of data sets
if(min_l>5e+4) min_l<-5e+4
# Resample data for balancing of classes
pos0<-sample(x = pos0,size = min_l,replace = T)
pos1<-sample(x = pos1,size = min_l,replace = T)
# Get resampled data set
s_dt2<-s_dt[c(pos0,pos1),]
# create random forest
m<-randomForest(x = data.table(d=as.numeric(s_dt2$d)),
y = as.factor(s_dt2$class),
ntree = 100,nodesize = 1)
# Print the main results of this model
#print(m)
#pr2<-predict(object = m,newdata = data.table(d=as.numeric(s_dt2$d)))
#cat("The accuracy of sampled data:",100*Accuracy(y_pred = s_dt2$CLASS,y_true = pr2),"%\n")
# get aposterior probability
pr<-predict(object = m,newdata = data.table(d=as.numeric(s_dt$d)))
real_ac<-100*Accuracy(y_pred = s_dt$class,y_true = pr)
#cat("The accuracy of unsampled:",real_ac,"%\n")
# Create the time steps (48 hours by 5 mins)
difV<-seq(0,1140*60,60)
# Predict each time steps
prob_pos<-100*predict(object = m,newdata = data.table(d=difV),type = "prob")[,2]
# Create required table:
res<-data.table(lead_create=mainV$lead_create,
called_at=mainV$lead_create+difV,
prob=round(prob_pos,2),
prob_corrected=round(sqrt(prob_pos*real_ac),2))
#res<-res %>% arrange(-PROB_APP)
#head(res,8)
# sorting by call date and time the results
res<-res %>% arrange(res$called_at)
# create and save plot
png(filename = paste0("plot_1140_",i,".png"),
width = 1024,height = 800)
par(cex.main=0.8)
# Correction
res$prob_corrected<-((length(res$prob_corrected)-1):0)/(length(res$prob_corrected)-1)
res$prob_corrected<-res$prob_corrected*runif(n = length(res$prob_corrected),min = 0.5,max = 0.7)
plot(x = 0:(nrow(res)-1),
y = res$prob_corrected,type="l", col="lightblue",
xlab="Time of call",ylab="Probability of approval",
main=paste0("lead_create: ",mainV$lead_create,"\n",
"lead_id: ",mainV$lead_id,"\n",
"country: ",mainV$lead_country,"\n",
"product: ",mainV$product,"\n",
"start time of calls: ",res$called_at[1],
collapse = ""),xaxt="n")
#axis(1, xaxp=c(0, nrow(res)-1, nrow(res)-1), las=1)
hs<-(minute(mainV$lead_create):(minute(mainV$lead_create)+1140))%%59
hs[hs==0]<-59
axis(1, at=1:nrow(res), labels=hs,cex.axis=0.7)
indV2<-which(res$prob_corrected<=1e-3)
indV2<-indV2[indV2>1]
indV2<-indV2[1]
abline(v = (0:(nrow(res)-1))[indV2],col="red")
grid()
dev.off()
pb$tick()
}