Plotting Kaplan-Meier survival data when the dataset already contains time, survival probability, and upper/lower 95% confidence intervals

Question

I have a Kaplan-Meier survival dataset that already contains time, survival probability values, and the lower and upper 95% CI bounds of the survival probability. I have posted a clip of my dataset below. I was hoping someone could show me how to merge my two plots, normalize them, and make the lines continuous despite the missing values. I would like my final graph to look like this [2].

# Print a reproducible definition of the first 20 rows of kmcurvetest_2.
kmcurvetest_2[1:20, ] %>% dput()

# dput() output: a 20-row tibble with columns Time, Cohort1, C1Lower95,
# C1Upper95, Cohort2, C2Lower95 and C2Upper95.
# NA entries are the missing values the question refers to; note that Time
# jumps from 13 to 15 (there is no row for 14).
structure(list(Time = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 15, 16, 17, 18, 19, 20), Cohort1 = c(0.904255319148936, 
0.898936170212766, 0.887769261266023, 0.887769261266023, 0.887769261266023, 
0.87631417402388, 0.87631417402388, NA, NA, 0.87631417402388, 
0.864551567661143, 0.858629981581273, 0.852708395501402, NA, 
0.852708395501402, 0.846745399728665, 0.846745399728665, 0.840740113205766, 
NA, 0.840740113205766), C1Lower95 = c(0.852338104650895, 0.846140749965675, 
0.833054851312184, 0.833054851312184, 0.833054851312184, 0.819696863257612, 
0.819696863257612, NA, NA, 0.819696863257612, 0.806043967960357, 
0.799218079053227, 0.792429563598159, NA, 0.792429563598159, 
0.785616930383783, 0.785616930383783, 0.778778500012501, NA, 
0.778778500012501), C1Upper95 = c(0.938570469008423, 0.934312293965728, 
0.92534844712446, 0.92534844712446, 0.92534844712446, 0.916056348120451, 
0.916056348120451, NA, NA, 0.916056348120451, 0.906427391600421, 
0.901537491012523, 0.8966168920045, NA, 0.8966168920045, 0.891638921203334, 
0.891638921203334, 0.886603579837755, NA, 0.886603579837755), 
    Cohort2 = c(0.707462686567164, 0.692537313432835, 0.683384837924912, 
    0.674232362416989, 0.674232362416989, 0.668074989244231, 
    NA, 0.664996302657852, 0.664996302657852, 0.658781383941424, 
    0.652507275522934, 0.649370221313689, 0.646217938685953, 
    0.643065656058216, 0.630394411603867, 0.62722660049028, 0.624058789376693, 
    0.620890978263105, 0.617723167149518, 0.614539027112665), 
    C2Lower95 = c(0.655564487332025, 0.640091667602195, 0.630607727619003, 
    0.62114710952213, 0.62114710952213, 0.614788099004335, NA, 
    0.611612499799214, 0.611612499799214, 0.605202384226936, 
    0.598734349944198, 0.595504428845739, 0.592259587632446, 
    0.589017489398546, 0.576004700295779, 0.572758317180272, 
    0.569514623188025, 0.566273601091399, 0.56303523423295, 0.5597807789553
    ), C2Upper95 = c(0.753046097156017, 0.738936670959587, 0.730275198102735, 
    0.721591223004285, 0.721591223004285, 0.715742377703966, 
    NA, 0.712814219355565, 0.712814219355565, 0.706901638437748, 
    0.700928732359048, 0.697938428282602, 0.694932646561064, 
    0.691924293962202, 0.679812432812405, 0.67677809121533, 0.673741229385084, 
    0.670701861632804, 0.667660001811057, 0.664601682804447)), row.names = c(NA, 
-20L), class = c("tbl_df", "tbl", "data.frame"))

My dataset contains missing values, and I attempted to make my geom_line continuous despite them by dropping the NA rows before plotting, e.g. ggplot(data = kmcurvetest_2[!is.na(kmcurvetest_2$Cohort2),] , mapping = aes(x = Time, y = Cohort2))

My code for the two plots is:

# plot cohort 1

# Survival curve for Cohort1 with its 95% confidence band.
# Rows where Cohort1 is NA are dropped first so that geom_line() connects the
# remaining observations into one unbroken line.
# Colour and fill are mapped inside aes() instead of being set as constants:
# a scale (and therefore the "Cohort" legend) is only drawn for mapped
# aesthetics. The manual scale keeps blue for both the line and the ribbon.
ggplot(data = kmcurvetest_2[!is.na(kmcurvetest_2$Cohort1), ],
       mapping = aes(x = Time, y = Cohort1)) +
  geom_point(size = 1) +
  geom_line(aes(color = "Cohort1")) +
  # x is inherited from the plot-level mapping; only the band limits are new.
  geom_ribbon(aes(ymin = C1Lower95, ymax = C1Upper95, fill = "Cohort1"),
              alpha = 0.2) +
  labs(title = "Inpatient Hospitalization", x = "Time [Days]", y = "Survival [%]") +
  # Fixed 0-100% axis so this plot is directly comparable with the other cohort.
  scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
  # One scale for both aesthetics so line and ribbon share a single legend key.
  scale_color_manual(name = "Cohort", values = c(Cohort1 = "blue"),
                     aesthetics = c("colour", "fill"))

# plot cohort 2

# Survival curve for Cohort2 with its 95% confidence band.
# Rows where Cohort2 is NA are dropped first so that geom_line() connects the
# remaining observations into one unbroken line.
# Colour and fill are mapped inside aes() instead of being set as constants:
# a scale (and therefore the "Cohort" legend) is only drawn for mapped
# aesthetics. The legend entry is "Cohort2" (the previous scale label said
# "Cohort1", a copy-paste slip). The manual scale keeps red for line and ribbon.
ggplot(data = kmcurvetest_2[!is.na(kmcurvetest_2$Cohort2), ],
       mapping = aes(x = Time, y = Cohort2)) +
  geom_point(size = 1) +
  geom_line(aes(color = "Cohort2")) +
  # x is inherited from the plot-level mapping; only the band limits are new.
  geom_ribbon(aes(ymin = C2Lower95, ymax = C2Upper95, fill = "Cohort2"),
              alpha = 0.2) +
  labs(title = "Inpatient Hospitalization", x = "Time [Days]", y = "Survival [%]") +
  # Fixed 0-100% axis so this plot is directly comparable with the other cohort.
  scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
  # One scale for both aesthetics so line and ribbon share a single legend key.
  scale_color_manual(name = "Cohort", values = c(Cohort2 = "red"),
                     aesthetics = c("colour", "fill"))

Thank you, I really appreciate it! I have attached the images in question above for reference.

Please do not share your code as a screenshot - https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example — Andrea M, Jun 22 '22 at 20:30

TarJae · Answer 1 · 2022-06-22T17:37:11.770

Something like this?

library(tidyverse)

# Prepend a baseline row so every curve starts at 100% survival: the first row
# is copied, every column except `time` is set to 1, and the original rows are
# shifted one time unit later to make room for it.
df1 <- df %>%
  slice(1) %>%
  mutate(across(-time, ~ 1)) %>%
  bind_rows(df %>% mutate(time = time + 1))

# Both cohorts on one panel. Every layer reads directly from df1, so no
# per-layer column subsetting is needed.
# The colour aesthetic is mapped to the cohort *name*: a string inside aes()
# is a category label, not a colour, and the legend is built from those
# labels. Mapping "red"/"blue" and relabelling afterwards attached the labels
# in alphabetical order of the strings, which swapped the two cohorts.
ggplot(df1, aes(x = factor(time), group = 1)) +
  # Cohort 1: line, observation markers, 95% confidence band
  geom_line(aes(y = C1survival, color = "Cohort1"), size = 1) +
  geom_point(aes(y = C1survival), shape = 3, color = "black") +
  geom_ribbon(aes(ymin = C1lower95.CI, ymax = C1upper95.CI), alpha = 0.2) +
  # Cohort 2: line, observation markers, 95% confidence band
  geom_line(aes(y = C2survival, color = "Cohort2"), size = 1) +
  geom_point(aes(y = C2survival), shape = 3, color = "black") +
  geom_ribbon(aes(ymin = C2lower95.CI, ymax = C2upper95.CI), alpha = 0.2) +
  labs(title = "Survival cohort1", x = "Time [days]", y = "Survival [%]") +
  scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
  # Legend entries come straight from the mapped cohort names.
  scale_color_discrete(name = "Cohort") +
  theme_classic() +
  theme(
    axis.line = element_line(colour = "black", size = 0.24),
    aspect.ratio = 4 / 5,
    legend.position = "bottom",
    legend.box = "horizontal")

data:

# dput() output of the example data frame: 9 rows (time 0 to 8) holding the
# survival estimate and lower/upper 95% CI bounds for two cohorts.
# The column names match those the code above reads from `df`.
# The cohort 2 columns are NA in the last three rows.
structure(list(time = 0:8, C1survival = c(0.904255319, 0.89893617, 
0.887769261, 0.887769261, 0.887769261, 0.876314174, 0.876314174, 
0.664996303, 0.664996303), C1lower95.CI = c(0.852338105, 0.84614075, 
0.833054851, 0.833054851, 0.833054851, 0.819696863, 0.819696863, 
0.6116125, 0.6116125), C1upper95.CI = c(0.938570469, 0.934312294, 
0.925348447, 0.925348447, 0.925348447, 0.916056348, 0.916056348, 
0.712814219, 0.712814219), C2survival = c(0.707462687, 0.692537313, 
0.683384838, 0.674232362, 0.674232362, 0.668074989, NA, NA, NA
), C2lower95.CI = c(0.655564487, 0.640091668, 0.630607728, 0.62114711, 
0.62114711, 0.614788099, NA, NA, NA), C2upper95.CI = c(0.753046097, 
0.738936671, 0.730275198, 0.721591223, 0.721591223, 0.715742378, 
NA, NA, NA)), class = "data.frame", row.names = c(NA, -9L))

Hey TarJae, thanks for helping me out! I am sorry for not using dput earlier - still learning the ropes. I updated my question to make it clearer. I was attempting to follow your code and got kind of lost - is there any way to ensure that both plots are continuous / have extrapolated values despite the missing values? — DH617, Jun 23 '22 at 04:12
We could replace the NA with the last value we have; in your case, that is the last value of C2, at day six. Generally it would be better to organize the data in long format. To see how to reproduce a Kaplan-Meier curve from the survival package, just create a fit with `survfit` and then look at summary(fit). There you will see how it is organized in the original package. Then you could imitate this with your data. — TarJae, Jun 23 '22 at 04:16

Plotting Kaplan-Meier survival data when the dataset already contains time, survival probability, and upper/lower 95% confidence intervals

1 Answers1