Combining information from two dataframes tidyverse

Question

I have two data sheets, one that is a list of sampling timepoints at different locations and one which is a list of farms in the areas with the dates when each farm opened, closed, and which sampling areas are nearby the farm.

I want to create a new column for the sampling timepoints dataframe that returns the number of farms that were operating nearby that location when the sample was taken. I can do this individually pretty simply by filtering the farms dataframe for farms operational while the sample was taken and located in the area of the sample, but I want a way to automate this. I think what I need is to build a function that will run each row of the sampling timepoints dataframe through the current code but it would require inputs from both dataframes which I'm not sure how to do. Any help is appreciated!

Currently how I can do this one by one manually inserting the "sample dates" and "sample locations":

farms %>%
filter(as.Date(Open.Date) < as.Date("Sample.Date")) %>%
filter(as.Date(Closure.Date) > as.Date("Sample.Date")) %>%
filter(grepl("Sample.Location", Nearby.Farm.Locations)) %>%
count()

-----ADDITION----

An example of the sampling dataframe (just a small subset)

structure(list(Sample.Date = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L), .Label = c("29/06/2004", "29/06/2015"), class = "factor"), 
    Location = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 
    3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("Orchard Bay ", 
    "Port Ligar ", "Rams Head ", "Skiddaw ", "Te Puraka Point ", 
    "Waitata Reach "), class = "factor")), class = "data.frame", row.names = c(NA, 
-22L))

And an example of the farm dataframe

structure(list(Farm.Number = c(8103L, 8107L, 8108L, 8109L, 8110L, 
8111L, 8112L, 8113L, 8114L, 8115L, 8116L, 8117L, 8118L, 8119L, 
8120L, 8121L, 8122L, 8123L, 8124L, 8125L, 8126L, 8127L, 8128L, 
8129L, 8130L, 8131L, 8132L, 8133L), Start.Date = structure(c(23L, 
16L, 4L, 7L, 25L, 14L, 10L, 20L, 2L, 12L, 3L, 8L, 14L, 11L, 9L, 
1L, 19L, 22L, 5L, 18L, 17L, 13L, 24L, 15L, 26L, 15L, 21L, 6L), .Label = c("1/07/1982", 
"10/06/1994", "11/04/1990", "11/04/1997", "13/05/1982", "13/08/1980", 
"14/05/1996", "14/09/1983", "15/09/1982", "16/03/1981", "17/03/1997", 
"21/01/1986", "23/05/1989", "23/07/2004", "23/12/1982", "25/08/2000", 
"27/06/1983", "27/10/1981", "28/09/1982", "28/10/1983", "29/01/1981", 
"29/01/1982", "29/09/1997", "30/01/2001", "30/06/1982", "4/08/1980"
), class = "factor"), End.Date = structure(c(6L, 8L, 13L, 10L, 
10L, 3L, 10L, 9L, 7L, 10L, 1L, 10L, 4L, 12L, 10L, 10L, 10L, 5L, 
10L, 10L, 10L, 10L, 11L, 2L, 10L, 10L, 10L, 10L), .Label = c("1/02/2039", 
"1/06/2039", "1/06/2041", "1/07/2041", "1/10/2040", "14/09/2007", 
"20/04/2028", "24/08/2020", "30/06/2034", "31/12/2024", "7/03/2039", 
"7/11/2030", "7/11/2031"), class = "factor"), Nearby.Locations = structure(c(6L, 
5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 4L, 4L), .Label = c("", "Anakoha Bay", 
"Forsyth Bay; Orchard Bay", "Forsyth Bay; Orchard Bay; Anakoha Bay", 
"Port Ligar; Forsyth Bay; Orchard Bay ", "Waitata Reach "), class = "factor")), class = "data.frame", row.names = c(NA, 
-28L))

And then an example of what I'd like the outcome to look like based on the two dataframes above (adding a column to the sampling dataframe based on how many farms were active in the region at the time the sample was taken:

structure(list(Sample.Date = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L), .Label = c("29/06/2004", "29/06/2015"), class = "factor"), 
    Location = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 
    3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("Orchard Bay ", 
    "Port Ligar ", "Rams Head ", "Skiddaw ", "Te Puraka Point ", 
    "Waitata Reach "), class = "factor"), Nearby.Farms = c(16L, 
    16L, 16L, 3L, 3L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, 
-22L))

Please give more information about the data. Run this command `dput(yourdf)` and paste the output here — Daman deep, Sep 09 '21 at 04:44
It's easier to help you if you include a simple [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) with sample input and desired output that can be used to test and verify possible solutions. — MrFlick, Sep 09 '21 at 04:52
Please provide enough code so others can better understand or reproduce the problem. — Community, Sep 09 '21 at 05:36

danlooo · Accepted Answer · 2021-09-10T07:30:28.593

library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

samplings <- structure(list(
  Sample.Date = structure(c(
    2L, 2L, 2L, 2L, 2L,
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
    1L
  ), .Label = c("29/06/2004", "29/06/2015"), class = "factor"),
  Location = structure(c(
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
    3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L
  ), .Label = c(
    "Orchard Bay ",
    "Port Ligar ", "Rams Head ", "Skiddaw ", "Te Puraka Point ",
    "Waitata Reach "
  ), class = "factor")
), class = "data.frame", row.names = c(
  NA,
  -22L
))

farms <- structure(list(Farm.Number = c(
  8103L, 8107L, 8108L, 8109L, 8110L,
  8111L, 8112L, 8113L, 8114L, 8115L, 8116L, 8117L, 8118L, 8119L,
  8120L, 8121L, 8122L, 8123L, 8124L, 8125L, 8126L, 8127L, 8128L,
  8129L, 8130L, 8131L, 8132L, 8133L
), Start.Date = structure(c(
  23L,
  16L, 4L, 7L, 25L, 14L, 10L, 20L, 2L, 12L, 3L, 8L, 14L, 11L, 9L,
  1L, 19L, 22L, 5L, 18L, 17L, 13L, 24L, 15L, 26L, 15L, 21L, 6L
), .Label = c(
  "1/07/1982",
  "10/06/1994", "11/04/1990", "11/04/1997", "13/05/1982", "13/08/1980",
  "14/05/1996", "14/09/1983", "15/09/1982", "16/03/1981", "17/03/1997",
  "21/01/1986", "23/05/1989", "23/07/2004", "23/12/1982", "25/08/2000",
  "27/06/1983", "27/10/1981", "28/09/1982", "28/10/1983", "29/01/1981",
  "29/01/1982", "29/09/1997", "30/01/2001", "30/06/1982", "4/08/1980"
), class = "factor"), End.Date = structure(c(
  6L, 8L, 13L, 10L,
  10L, 3L, 10L, 9L, 7L, 10L, 1L, 10L, 4L, 12L, 10L, 10L, 10L, 5L,
  10L, 10L, 10L, 10L, 11L, 2L, 10L, 10L, 10L, 10L
), .Label = c(
  "1/02/2039",
  "1/06/2039", "1/06/2041", "1/07/2041", "1/10/2040", "14/09/2007",
  "20/04/2028", "24/08/2020", "30/06/2034", "31/12/2024", "7/03/2039",
  "7/11/2030", "7/11/2031"
), class = "factor"), Nearby.Locations = structure(c(
  6L,
  5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
  1L, 2L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 4L, 4L
), .Label = c(
  "", "Anakoha Bay",
  "Forsyth Bay; Orchard Bay", "Forsyth Bay; Orchard Bay; Anakoha Bay",
  "Port Ligar; Forsyth Bay; Orchard Bay ", "Waitata Reach "
), class = "factor")), class = "data.frame", row.names = c(
  NA,
  -28L
))

samplings <-
  samplings %>%
  # clen up trailing whitespaces
  mutate(Location = Location %>% str_trim()) %>%
  as_tibble() %>%
  # parse date
  mutate(Sample.Date = Sample.Date %>% as.character() %>% parse_date("%d/%m/%Y"))

samplings
#> # A tibble: 22 x 2
#>    Sample.Date Location   
#>    <date>      <chr>      
#>  1 2015-06-29  Orchard Bay
#>  2 2015-06-29  Orchard Bay
#>  3 2015-06-29  Orchard Bay
#>  4 2015-06-29  Port Ligar 
#>  5 2015-06-29  Port Ligar 
#>  6 2015-06-29  Port Ligar 
#>  7 2015-06-29  Port Ligar 
#>  8 2015-06-29  Rams Head  
#>  9 2015-06-29  Rams Head  
#> 10 2015-06-29  Rams Head  
#> # … with 12 more rows

farms <-
  farms %>%
  # normalize farms to have one location per row
  mutate(
    Nearby.Locations = Nearby.Locations %>% map(~ .x %>%
      str_split("; ") %>%
      simplify() %>%
      map_chr(str_trim))
  ) %>%
  unnest(Nearby.Locations) %>%
  rename(Location = Nearby.Locations) %>%
  as_tibble() %>%
  # Parse date columns
  mutate_at(c("End.Date", "Start.Date"), ~ .x %>%
    as.character() %>%
    parse_date("%d/%m/%Y"))

farms
#> # A tibble: 53 x 4
#>    Farm.Number Start.Date End.Date   Location     
#>          <int> <date>     <date>     <chr>        
#>  1        8103 1997-09-29 2007-09-14 Waitata Reach
#>  2        8107 2000-08-25 2020-08-24 Port Ligar   
#>  3        8107 2000-08-25 2020-08-24 Forsyth Bay  
#>  4        8107 2000-08-25 2020-08-24 Orchard Bay  
#>  5        8108 1997-04-11 2031-11-07 Port Ligar   
#>  6        8108 1997-04-11 2031-11-07 Forsyth Bay  
#>  7        8108 1997-04-11 2031-11-07 Orchard Bay  
#>  8        8109 1996-05-14 2024-12-31 Port Ligar   
#>  9        8109 1996-05-14 2024-12-31 Forsyth Bay  
#> 10        8109 1996-05-14 2024-12-31 Orchard Bay  
#> # … with 43 more rows

samplings %>%
  left_join(farms) %>%
  filter(Sample.Date %within% interval(Start.Date, End.Date)) %>%
  # count the number of farms regardless of how many samples there are
  distinct(Sample.Date, Location, Farm.Number) %>%
  count(Sample.Date, Location)
#> Joining, by = "Location"
#> # A tibble: 3 x 3
#>   Sample.Date Location          n
#>   <date>      <chr>         <int>
#> 1 2004-06-29  Waitata Reach     1
#> 2 2015-06-29  Orchard Bay      16
#> 3 2015-06-29  Port Ligar        3

^{Created on 2021-09-10 by the reprex package (v2.0.1)}

Thanks for the response! It's not really about keeping certain data points, what I really want is to just be able to say how many farms were open near each data point at the time they were sampled. The examples I added to to the original post hopefully clarify that. — Trevyn Toone, Sep 10 '21 at 01:12
I updated the post accordingly. Thanks for the reproducible example! — danlooo, Sep 10 '21 at 07:26

Combining information from two dataframes tidyverse

1 Answers1