0

Example testdata

  testdata_1140.csv=structure(list(lead_create = structure(c(1L, 5L, 6L, 2L, 1L, 
    3L, 4L, 1L, 3L, 3L), .Label = c("2018-05-13T01:48:07Z", "2018-05-15T22:56:10Z", 
    "2018-05-15T23:20:03Z", "2018-05-16T05:08:13Z", "2018-05-17T09:51:09Z", 
    "2018-05-17T15:49:02Z"), class = "factor"), lead_id = c(33238869L, 
    33417893L, 33427728L, 33359746L, 33238869L, 33360456L, 33371790L, 
    33238869L, 33360456L, 33360456L), approved_at = c(NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), called_at = structure(c(2L, 10L, 
    9L, 5L, 1L, 7L, 8L, 3L, 6L, 4L), .Label = c("2018-05-14T23:44:25Z", 
    "2018-05-15T01:56:42Z", "2018-05-15T23:22:32Z", "2018-05-16T00:36:37Z", 
    "2018-05-16T00:53:31Z", "2018-05-16T01:35:31Z", "2018-05-16T20:52:19Z", 
    "2018-05-17T03:51:26Z", "2018-05-17T20:41:04Z", "2018-05-18T02:16:42Z"
    ), class = "factor"), product = structure(c(1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "Titan Gel", class = "factor"), 
        lead_country = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L), .Label = "Taiwan", class = "factor"), class = c(0L, 
        0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L)), .Names = c("lead_create", 
    "lead_id", "approved_at", "called_at", "product", "lead_country", 
    "class"), class = "data.frame", row.names = c(NA, -10L))

I need create the plot for 1140 min. This dataset is part of large dataset and was taken as test data. So to select random data i do so. The main task is to get smooth long plot. But after code working I have a rough oscillator chart like on this picture enter image description here

How to do very long smooth plot on 1140 minutes? Let it be on the entire monitor screen. where every decline is smooth, and for a minute. Desired plot i try draw in mspaint (sorry i can't cool draw :-) ) enter image description here So i want see in detail each curve. Are there ways to make the chart as detailed as possible so that the downturn curves are visible.

# Get 10 random leads
set.seed(0)
indV<-sample(x = 1:nrow(dt),size = 10,replace = F)

# Write to file this subset
fwrite(file = "testdata_1140.csv",x = dt[indV,])

# Progress bar
pb <- progress_bar$new(
  format = "[:bar] :percent eta: :eta",
  total = length(indV), clear = FALSE, width= 60)

for(i in 1:length(indV)) {
  # Get main values for one lead id
  mainV<-dt[indV[i],]
  # Filtering data for current lead id
  s_dt<-dt %>% filter(product==mainV$product,lead_country==mainV$lead_country)
  s_dt$d<-s_dt$called_at-s_dt$lead_create
  s_dt<-s_dt %>% select(-c(lead_create, called_at))


  #head(s_dt)
  #cat("The dimension is:",dim(s_dt),"\n")

  # Create random forest model
  set.seed(0) # For reproducive results

  # Here we get indexies for 1 and 0 classes
  pos0<-which(s_dt$class==0)
  pos1<-which(s_dt$class==1)
  # Find the minimum number of classes
  min_l<-min(c(length(pos0),length(pos1)))
  # Restricted the large sizes of data sets
  if(min_l>5e+4) min_l<-5e+4

  # Resample data for balancing of classes
  pos0<-sample(x = pos0,size = min_l,replace = T)
  pos1<-sample(x = pos1,size = min_l,replace = T)
  # Get resampled data set
  s_dt2<-s_dt[c(pos0,pos1),]


  # create random forest
  m<-randomForest(x = data.table(d=as.numeric(s_dt2$d)),
                  y = as.factor(s_dt2$class),
                  ntree = 100,nodesize = 1)
  # Print the main results of this model
  #print(m)
  #pr2<-predict(object = m,newdata = data.table(d=as.numeric(s_dt2$d)))
  #cat("The accuracy of sampled data:",100*Accuracy(y_pred = s_dt2$CLASS,y_true = pr2),"%\n")
  # get aposterior probability 
  pr<-predict(object = m,newdata = data.table(d=as.numeric(s_dt$d)))
  real_ac<-100*Accuracy(y_pred = s_dt$class,y_true = pr)
  #cat("The accuracy of unsampled:",real_ac,"%\n")

  # Create the time steps (48 hours by 5 mins)
  difV<-seq(0,1140*60,60)
  # Predict each time steps
  prob_pos<-100*predict(object = m,newdata = data.table(d=difV),type = "prob")[,2]
  # Create required table:
  res<-data.table(lead_create=mainV$lead_create,
                  called_at=mainV$lead_create+difV,
                  prob=round(prob_pos,2),
                  prob_corrected=round(sqrt(prob_pos*real_ac),2))
  #res<-res %>% arrange(-PROB_APP)
  #head(res,8)

  # sorting by call date and time the results
  res<-res %>% arrange(res$called_at)

  # create and save plot
  png(filename = paste0("plot_1140_",i,".png"),
      width = 1024,height = 800)
  par(cex.main=0.8)
  # Correction
  res$prob_corrected<-((length(res$prob_corrected)-1):0)/(length(res$prob_corrected)-1)
  res$prob_corrected<-res$prob_corrected*runif(n = length(res$prob_corrected),min = 0.5,max = 0.7)
  plot(x = 0:(nrow(res)-1),
       y = res$prob_corrected,type="l", col="lightblue",
       xlab="Time of call",ylab="Probability of approval",
       main=paste0("lead_create: ",mainV$lead_create,"\n",
                   "lead_id: ",mainV$lead_id,"\n",
                   "country: ",mainV$lead_country,"\n",
                   "product: ",mainV$product,"\n",
                   "start time of calls: ",res$called_at[1],
                   collapse = ""),xaxt="n")
  #axis(1, xaxp=c(0, nrow(res)-1, nrow(res)-1), las=1)
  hs<-(minute(mainV$lead_create):(minute(mainV$lead_create)+1140))%%59
  hs[hs==0]<-59

  axis(1, at=1:nrow(res), labels=hs,cex.axis=0.7)

  indV2<-which(res$prob_corrected<=1e-3)
  indV2<-indV2[indV2>1]
  indV2<-indV2[1]
  abline(v = (0:(nrow(res)-1))[indV2],col="red")

  grid()

  dev.off()
  pb$tick()
}
Julia
  • 141
  • 11
  • 1
    I do not fully understand what you want. But if you want to be able to "zoom" and see all the details export your plot using `png` or `pdf` and put a very high resolution – Chelmy88 Aug 29 '19 at 12:42
  • You could use `cut` to separate the data and use `ggplot2::facet_wrap` on this new variable to get more plots. – TobiO Aug 29 '19 at 12:54
  • Please read and edit your question according to [How to make a great R reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) so that other users can help you – pogibas Aug 29 '19 at 13:58

0 Answers0