-1

I am working with the R programming language.

I have the following data that contains 10 measurements for a set of people (and includes NA's):

my_data <- structure(list(id = 1:20, weight_time_1 = c(NA, NA, NA, NA, 99.4800556826432, 
NA, NA, NA, NA, 92.7723003148797, NA, 102.130637355002, NA, NA, 
96.4306038435274, 117.519167258681, NA, NA, NA, NA), weight_time_2 = c(NA, 
NA, NA, 100.096037354425, 98.5573457978251, NA, 99.2565971422039, 
NA, NA, 78.2178327860056, NA, 93.1290042175411, NA, 105.999332486733, 
102.324404273109, 106.249390147503, NA, NA, NA, NA), weight_time_3 = c(NA, 
NA, NA, 109.653641754063, 108.67612106402, 89.245436013972, 76.0388764710753, 
NA, 121.434141230992, 93.5040344542738, NA, 106.261290772666, 
NA, 107.27650959864, 99.9614325607138, 106.822602397336, NA, 
NA, NA, NA), weight_time_4 = c(NA, NA, NA, 83.4057073444694, 
100.0475658129, 101.181524203485, 109.854456857605, NA, 109.39925298469, 
100.127289780991, NA, 92.3537705948637, NA, 97.484431731186, 
93.1880798156964, 98.2949614096827, NA, NA, NA, NA), weight_time_5 = c(85.9705471396862, 
NA, 101.810197281424, 125.878759238011, 90.5377892614597, 100.977860860978, 
105.206211167738, 105.925495763829, 95.0038093722839, 91.7697262180746, 
112.751436397665, 89.3570085447357, NA, 105.334871042565, 107.101908594036, 
121.466895783898, NA, NA, NA, NA), weight_time_6 = c(91.3939219450539, 
NA, 102.295063295212, 112.648885364836, 92.858993235862, 84.9768973349691, 
106.268407819189, 91.2142736262532, 94.5206092516322, 106.102317632812, 
106.800383289515, 96.8243417950671, 112.526148273022, 96.0060934996047, 
108.127666530717, 100.80395850135, NA, NA, NA, 97.1665601525516
), weight_time_7 = c(78.1538622765699, NA, 98.3267913598314, 
97.694334342899, 88.2573884491152, 94.0391463446378, 79.107127345042, 
98.6717305266368, 87.4584802875, 91.0212929680695, 115.449312672637, 
108.505222479846, 87.7272780928247, 98.2950591116351, 108.64305435295, 
100.971252881422, NA, NA, NA, 89.7627845887151), weight_time_8 = c(88.9847618154833, 
NA, 75.9578295182105, 123.066624773516, 103.899907028919, 86.3922722708996, 
101.056470605625, 93.9274704914096, 116.225266396545, 119.261812971557, 
120.470004522712, 95.1540411812936, 103.625912955529, 119.112226243372, 
97.2548085647629, 93.4809837458108, NA, 107.551887082473, 103.626395948971, 
92.497583506856), weight_time_9 = c(106.965867937613, NA, 111.885847224286, 
95.4347167550049, 89.629232996398, 99.279432759281, 111.111236025807, 
106.187409603617, 95.0731389891664, 102.40946902701, 98.7215766413794, 
108.440350789909, 111.841323303161, 98.6631240530225, 108.178201457868, 
102.289607726024, 108.679229829576, 93.9424920702776, 102.660681952024, 
90.7932196785015), weight_time_10 = c(98.5452360068031, 100.417384196154, 
94.4492002344181, 100.711643341273, 119.565187908911, 103.54455492062, 
74.0330331656656, 103.431332886172, 112.355083085616, 100.345180859457, 
97.3988962137931, 96.9401740645521, 116.008033135044, 106.302406861972, 
96.7028852299552, 111.699115637383, 95.3519501717543, 89.9061904342833, 
107.36861168758, 102.797106848808)), row.names = c(NA, 20L), class = "data.frame")

I would like to make a "longitudinal" graph for this data. I tried to do this two different ways:

Option 1: https://cran.r-project.org/web/packages/lcsm/vignettes/v0-longitudinal-plots.html

library(lcsm)
library(ggplot2)
library(tidyr)
library(dplyr)
library(stringr)

# Names of the ten repeated-measure columns, weight_time_1 ... weight_time_10.
x_var_list <- paste0("weight_time_", 1:10)

# Plot the trajectories of those columns, keyed by the id column.
plot_trajectories(
  data = my_data,
  id_var = "id",
  var_list = x_var_list,
  xlab = "Time",
  ylab = "Value",
  connect_missing = FALSE,
  random_sample_frac = 1,
  title_n = TRUE
)

enter image description here

This seems to have worked, but it produces a warning message stating that the NA's were not plotted:

Warning messages:
1: Removed 64 row(s) containing missing values (geom_path). 
2: Removed 64 rows containing missing values (geom_point). 

Option 2: https://www.r-bloggers.com/2015/08/managing-longitudinal-data-conversion-between-the-wide-and-the-long/#google_vignette

# Attempt to convert the wide data to long format with base reshape().
# NOTE(review): when `varying` is a plain character vector, reshape() guesses
# the variable/time split using `sep` (default "."). These column names are
# "_"-separated, which is consistent with the "failed to guess" error quoted
# below -- confirm by passing an explicit `sep` or `v.names`/`timevar`.
dat <- reshape(my_data, varying= c("weight_time_1", "weight_time_2", "weight_time_3", "weight_time_4", "weight_time_5", "weight_time_6", "weight_time_7", "weight_time_8", "weight_time_9", "weight_time_10"), idvar="id", direction="long")


library(ggplot2)
# NOTE(review): `measure` and `tx` are not columns of my_data, and the ","
# before geom_line() was presumably meant to be "+" (layers are added with
# "+"); as written this line is a syntax error.
ggplot(dat, aes(x=time, y=measure, colour=tx, group=id)), geom_line(alpha=.5)

But this returns the following error: Error in guess(varying) :

failed to guess time-varying variables from their names

Can someone please show me how to fix this and plot this data? I would like the NA's to appear on the graph.

Thanks!

stats_noob
  • 5,401
  • 4
  • 27
  • 83

4 Answers

3

NAs cannot be represented as data points. However, they can be made indirectly visible by plotting each subject separately, so that the presence of NAs is obvious.

library(tidyverse)

# Reshape to long format: one row per (id, time point) pair. The time point
# is the number parsed out of the column name; both it and id become factors.
my_data <- my_data %>%
  as_tibble() %>%
  pivot_longer(cols = -id, names_to = "tp", values_to = "measure") %>%
  mutate(
    tp = factor(parse_number(tp)),
    id = factor(id)
  )

# One panel per id, with the observed points and a loess smoother (no
# confidence band); the legend is hidden.
ggplot(my_data, aes(x = tp, y = measure, colour = id, group = id)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(~id) +
  theme(legend.position = "none")

enter image description here NAs are, by definition, missing data and therefore cannot be represented graphically. Strictly speaking, even the lines between the points are wrong, because the values between the observed points are unknown. The technically correct approach would therefore be to show the existing data only as points. Within the known data range, one can try to connect the points as well as possible with a smoothing line. More complex modeling is needed for the areas outside the known data range. In short, it is wrong for NAs to appear in the graph.

maRvin
  • 249
  • 1
  • 11
  • @ maRvin: Thank you for your answer! I really like what you have done! Just two questions: 1) I am guessing that it is possible to connect the dots with a straight line in each graph? 2) Would you have any idea how to show all of these graphs on the same plot? Thank you so much! – stats_noob Jun 08 '22 at 01:31
  • 1) try another method in geom_smooth, e.g., "lm". Alternatively, you could use geom_line() instead of geom_smooth() to connect all individual points. 2) Just omit facet_wrap() – maRvin Jun 08 '22 at 13:08
2

Your option 1 is probably fine. It's just warning you that it is impossible to plot an NA. Here is the ggplot2 version; you first need to reshape the wide data to long format.


# Reshape the wide data to long format (one row per id/time pair) and drop
# the rows whose measurement is NA, so nothing is left for ggplot2 to remove.
my_data_long <- my_data %>%
  tidyr::pivot_longer(-id, names_to = "time", values_to = "Value") %>%
  drop_na() %>%
  mutate(id = factor(id))

# One coloured line (plus points) per id, with rotated x-axis labels.
# NOTE: `time` is a character column here, so the x-axis is ordered
# alphabetically ("weight_time_10" sorts right after "weight_time_1").
ggplot(my_data_long, aes(x = time, y = Value, color = factor(id))) +
  geom_point() +
  geom_line(aes(group = id)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = -90))
JeffV
  • 21
  • 2
  • @ JeffV : Thank you so much for your answer! Is it possible to arrange the order of the x-axis in the right order? https://imgur.com/a/sYflOJI Thank you so much! – stats_noob May 28 '22 at 18:27
  • Change the `mutate` to: `mutate(id = factor(id), time = fct_relevel(time, "weight_time_10", after = Inf))`. That will move time 10 to the end. – Carl Jun 04 '22 at 17:13
2

You could also use dygraphs, which is quite straightforward for this use case and handles NAs nicely:

library(dygraphs)

# Build the dygraph from the data frame and keep its legend always shown.
dyLegend(dygraph(my_data), show = "always")

enter image description here

Further formatting options can be found here

Waldi
  • 39,242
  • 6
  • 30
  • 78
2

update - apparently the OP wanted to explicitly plot missing values. see further below for one approach

There are plenty of options to deal with NAs when plotting with ggplot2.

  1. Just leave them, and accept the warning (really nothing wrong with that)

  2. drop NA before plotting - see JeffV's answer using tidyr::drop_na; there are many other ways, see this ultra-popular thread

  3. in your case, you can drop NA when pivoting - use tidyr::pivot_longer(..., values_drop_na = TRUE)

  4. add na.rm = TRUE to the geom of interest:

library(ggplot2)
library(dplyr)
library(tidyr)

# Reshape to long format and draw one line per id, silently dropping NAs.
my_data %>%
  pivot_longer(cols = starts_with("weight")) %>%
  # x is essentially continuous, so make it truly numeric; id is categorical,
  # so make it a character.
  # The time index is whatever follows the last underscore in the column
  # name. A pattern such as ".*([0-9]+)" would not work here: the greedy ".*"
  # swallows the leading digit, so "weight_time_10" would become 0.
  mutate(
    time = as.integer(sub(".*_", "", name)),
    id = as.character(id)
  ) %>%
  ggplot(aes(x = time, y = value, colour = id, group = id)) +
  # na.rm = TRUE drops the missing values without emitting a warning.
  geom_line(alpha = 0.5, na.rm = TRUE)

Visualising NAs in a line plot

It is a whole new problem "how to visualise NAs". There is the {naniar} package which helps visualising NAs, but to my knowledge not "within" a line plot. One way to do that would be to first interpolate or impute NAs based on the present data. This should not be the place to discuss the best way to do this, but here is a quick way using the zoo package.

 my_data_long <- my_data %>%
   pivot_longer(cols = starts_with("weight")) %>%
   ## The time index is whatever follows the last underscore in the column
   ## name. (A greedy pattern such as ".*([0-9]+)" would turn
   ## "weight_time_10" into 0 and hand na.approx() a wrong, unsorted x.)
   mutate(
     time = as.integer(sub(".*_", "", name)),
     id = factor(id, levels = seq_len(max(id)))
   ) %>%
   group_by(id) %>%
   ## interpolate NA's with the zoo package, separately for every id;
   ## na.rm = FALSE keeps the result the same length as the input, leaving
   ## NAs that cannot be interpolated (e.g. leading ones) as NA
   mutate(na_ip = zoo::na.approx(value, time, na.rm = FALSE)) %>%
   ungroup()

 ## store your NA's in a different frame
 my_nas <- my_data_long %>% filter(is.na(value))

 ggplot(my_data_long, aes(x = time, y = value, colour = id, group = id)) +
   ## e.g., use the interpolated values for dashed lines
   geom_line(data = my_nas, aes(y = na_ip), lty = 2) +
   ## observed values as solid lines; NAs are dropped without a warning
   geom_line(alpha = 0.5, na.rm = TRUE) +
   ## because this is otherwise a complete visual disaster, I'm untangling
   ## with facet
   facet_wrap(~id) +
   theme(legend.position = "none")
  #> Warning: Removed 9 row(s) containing missing values (geom_path).
  #> geom_path: Each group consists of only one observation. Do you need to adjust
  #> the group aesthetic?
  #> geom_path: Each group consists of only one observation. Do you need to adjust
  #> the group aesthetic?
  #> geom_path: Each group consists of only one observation. Do you need to adjust
  #> the group aesthetic?
  #> geom_path: Each group consists of only one observation. Do you need to adjust
  #> the group aesthetic?

tjebo
  • 21,977
  • 7
  • 58
  • 94