Happy Tuesday.
I am currently collecting survey data. The surveys sometimes ask the same questions and other times do not. Why? Because there is 700+ questions and asking a participant to answer all of these (without payment) is not very realistic. So we are giving surveys of subsets of the items. Herein lies the problem. Some participants will take the survey more than one time (which is totally okay); however, I do not want to document their response when they answers the same question for the second (or third, fourth etc) time. However, when they respond to new questions I want to store that data. My thoughts on how to do this are to create a growing master data set with all the participants information, then when new data comes in query as to if the participant has responded to the surveys before and then only add their new information to the data file to be analyzed. Then, repeat this process when the next batch of survey results come in. In my head, after the dataframe to be analyzed has been updated, it could then be used to query any new data coming in.
So let me try and demonstrate the workflow to help guide a discussion, or even help someone identify a solution.
*note2: Dplyr may also be a relevant packing. Again, tagging, but if it is not relevant I can remove.
library(dplyr)
survey1 = structure(list(ip = c(111, 222, 333, 444, 555, 666, 777, 888,
999, 1110), gender = c("Female", "Female", "Male", "Female",
"Female", "Female", "Female", "Female", "Male", "Male"), age = c(23,
26, 23, 60, 30, 35, 27, 61, 49, 33), education = c(16, 18, 4,
18, 16, 19, 19, 14, 22, 16), race = c("White", "White", "Asian",
"White", "White", "White", "White", "White", "White", "White"
), Q4 = c("Dresser", "dresser", "drawers", "Dresser", "Dresser",
"Dresser", "Dresser", "dresser", "dresser", "dresser"), Q4a = c("Dresser",
"dresser", "drawers", "Dresser", "Dresser", "Dresser", "Dresser",
"dresser", "dresser", "dresser"), Q417 = c("Crib", "crib", "crib",
"Baby crib", "Crib", "Crib", "Crib", "crib", "crib", "crib"),
Q417a = c("Crib", "crib", "crib", "Baby crib", "Crib", "Crib",
"Crib", "crib", "crib", "crib"), Q536 = c("Couch", "couch",
"couch", "Couch or sofa", "Couch", "Couch", "Leather couch",
"sofa", "couch", "sofa"), Q536a = c("Sofa", "couch", "couch",
"Couch or sofa", "Couch", "Couch", "Couch", "sofa", "couch",
"sofe"), Q491 = c("Roof", "roof", "house", "Roof", "Roof",
"Roof", "Roof", "roof", "roof", "roof"), Q491a = c("Roof tile",
"roof", "roof", "Roof", "Roof", "Roof", "Roof", "rooof",
"roof", "roof"), Q452 = c("Rug", "rug", "rug", "Oriental carpet",
"Rug", "Rug", "Rug", "rug", "rug", "rug"), Q452a = c("Rug",
"rug", "rug", "Carpet", "Rug", "Rug", "Rug", "carpet", "rug",
"rug")), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))#ready in survey 2
survey2= structure(list(ip = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), gender = c("Female",
"Female", "Male", "Female", "Female", "Female", "Female", "Female",
"Male", "Male"), age = c(23, 26, 23, 60, 30, 35, 27, 61, 49,
33), education = c(16, 18, 4, 18, 16, 19, 19, 14, 22, 16), race = c("White",
"White", "Asian", "White", "White", "White", "White", "White",
"White", "White"), Q4 = c("dog", "dog", "dog", "dog", "dog",
"dog", "dog", "dog", "dog", "dog"), Q4a = c("cat", "cat", "cat",
"cat", "cat", "cat", "cat", "cat", "cat", "cat"), Q417 = c("van",
"van", "van", "van", "van", "van", "van", "van", "van", "van"
), Q417a = c("chocolate", "chocolate", "chocolate", "chocolate",
"chocolate", "chocolate", "chocolate", "chocolate", "chocolate",
"chocolate"), Q536 = c("candy", "candy", "candy", "candy", "candy",
"candy", "candy", "candy", "candy", "candy"), Q536a = c("pizza",
"pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza",
"pizza", "pizza"), Q491 = c("ocotpus", "ocotpus", "ocotpus",
"ocotpus", "ocotpus", "ocotpus", "ocotpus", "ocotpus", "ocotpus",
"ocotpus"), Q491a = c("panther", "panther", "panther", "panther",
"panther", "panther", "panther", "panther", "panther", "panther"
), Q452 = c("checkers", "checkers", "checkers", "checkers", "checkers",
"checkers", "checkers", "checkers", "checkers", "checkers"),
Q452a = c("computer", "computer", "computer", "computer",
"computer", "computer", "computer", "computer", "computer",
"computer")), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), spec = structure(list(cols = list(
ip = structure(list(), class = c("collector_double", "collector"
)), gender = structure(list(), class = c("collector_character",
"collector")), age = structure(list(), class = c("collector_double",
"collector")), education = structure(list(), class = c("collector_double",
"collector")), race = structure(list(), class = c("collector_character",
"collector")), Q4 = structure(list(), class = c("collector_character",
"collector")), Q4a = structure(list(), class = c("collector_character",
"collector")), Q417 = structure(list(), class = c("collector_character",
"collector")), Q417a = structure(list(), class = c("collector_character",
"collector")), Q536 = structure(list(), class = c("collector_character",
"collector")), Q536a = structure(list(), class = c("collector_character",
"collector")), Q491 = structure(list(), class = c("collector_character",
"collector")), Q491a = structure(list(), class = c("collector_character",
"collector")), Q452 = structure(list(), class = c("collector_character",
"collector")), Q452a = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
As can be seen in the data thus far, no participants that took survey1 also took survey2. We know this from the IP (address) difference. So adding them together is no sweat.
masterData = rbind(survey1, survey2)
str(masterdata) #reveals tibble [20 x 15]
Now lets say we get a new survey in, here survey 3 has all the same participants as survey 1. However, 4 of the questions overlap, but we get new data from these participants for 5 new questions. I would like to create a new dataframe and add only the new questions for these participants. Example:
survey3 =structure(list(X1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), ip = c(111,
222, 333, 444, 555, 666, 777, 888, 999, 1110), gender = c("Female",
"Female", "Male", "Female", "Female", "Female", "Female", "Female",
"Male", "Male"), age = c(23, 26, 23, 60, 30, 35, 27, 61, 49,
33), education = c(16, 18, 4, 18, 16, 19, 19, 14, 22, 16), race = c("White",
"White", "Asian", "White", "White", "White", "White", "White",
"White", "White"), Q4 = c("Dresser", "dresser", "drawers", "Dresser",
"Dresser", "Dresser", "Dresser", "dresser", "dresser", "dresser"
), Q4a = c("Dresser", "dresser", "drawers", "Dresser", "Dresser",
"Dresser", "Dresser", "dresser", "dresser", "dresser"), Q417 = c("Crib",
"crib", "crib", "Baby crib", "Crib", "Crib", "Crib", "crib",
"crib", "crib"), Q417a = c("Crib", "crib", "crib", "Baby crib",
"Crib", "Crib", "Crib", "crib", "crib", "crib"), Q15 = c("waffle",
"waffle", "waffle", "waffle", "waffle", "waffle", "waffle", "waffle",
"waffle", "waffle"), Q16 = c("egg", "egg", "egg", "egg", "egg",
"egg", "egg", "egg", "egg", "egg"), Q17 = c("bacon", "bacon",
"bacon", "bacon", "bacon", "bacon", "bacon", "bacon", "bacon",
"bacon"), Q18 = c("pancake", "pancake", "pancake", "pancake",
"pancake", "pancake", "pancake", "pancake", "pancake", "pancake"
), Q19 = c("smoothie", "smoothie", "smoothie", "smoothie", "smoothie",
"smoothie", "smoothie", "smoothie", "smoothie", "smoothie")), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L), spec = structure(list(
cols = list(X1 = structure(list(), class = c("collector_double",
"collector")), ip = structure(list(), class = c("collector_double",
"collector")), gender = structure(list(), class = c("collector_character",
"collector")), age = structure(list(), class = c("collector_double",
"collector")), education = structure(list(), class = c("collector_double",
"collector")), race = structure(list(), class = c("collector_character",
"collector")), Q4 = structure(list(), class = c("collector_character",
"collector")), Q4a = structure(list(), class = c("collector_character",
"collector")), Q417 = structure(list(), class = c("collector_character",
"collector")), Q417a = structure(list(), class = c("collector_character",
"collector")), Q15 = structure(list(), class = c("collector_character",
"collector")), Q16 = structure(list(), class = c("collector_character",
"collector")), Q17 = structure(list(), class = c("collector_character",
"collector")), Q18 = structure(list(), class = c("collector_character",
"collector")), Q19 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
The desired output of this merge would look something like this:
desiredoutput = structure(list(ip = c(111, 222, 333, 444, 555, 666, 777, 888,
999, 1110, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), gender = c("Female",
"Female", "Male", "Female", "Female", "Female", "Female", "Female",
"Male", "Male", "Female", "Female", "Male", "Female", "Female",
"Female", "Female", "Female", "Male", "Male"), age = c(23, 26,
23, 60, 30, 35, 27, 61, 49, 33, 23, 26, 23, 60, 30, 35, 27, 61,
49, 33), education = c(16, 18, 4, 18, 16, 19, 19, 14, 22, 16,
16, 18, 4, 18, 16, 19, 19, 14, 22, 16), race = c("White", "White",
"Asian", "White", "White", "White", "White", "White", "White",
"White", "White", "White", "Asian", "White", "White", "White",
"White", "White", "White", "White"), Q4 = c("Dresser", "dresser",
"drawers", "Dresser", "Dresser", "Dresser", "Dresser", "dresser",
"dresser", "dresser", "dog", "dog", "dog", "dog", "dog", "dog",
"dog", "dog", "dog", "dog"), Q4a = c("Dresser", "dresser", "drawers",
"Dresser", "Dresser", "Dresser", "Dresser", "dresser", "dresser",
"dresser", "cat", "cat", "cat", "cat", "cat", "cat", "cat", "cat",
"cat", "cat"), Q417 = c("Crib", "crib", "crib", "Baby crib",
"Crib", "Crib", "Crib", "crib", "crib", "crib", "van", "van",
"van", "van", "van", "van", "van", "van", "van", "van"), Q417a = c("Crib",
"crib", "crib", "Baby crib", "Crib", "Crib", "Crib", "crib",
"crib", "crib", "chocolate", "chocolate", "chocolate", "chocolate",
"chocolate", "chocolate", "chocolate", "chocolate", "chocolate",
"chocolate"), Q536 = c("Couch", "couch", "couch", "Couch or sofa",
"Couch", "Couch", "Leather couch", "sofa", "couch", "sofa", "candy",
"candy", "candy", "candy", "candy", "candy", "candy", "candy",
"candy", "candy"), Q536a = c("Sofa", "couch", "couch", "Couch or sofa",
"Couch", "Couch", "Couch", "sofa", "couch", "sofe", "pizza",
"pizza", "pizza", "pizza", "pizza", "pizza", "pizza", "pizza",
"pizza", "pizza"), Q491 = c("Roof", "roof", "house", "Roof",
"Roof", "Roof", "Roof", "roof", "roof", "roof", "ocotpus", "ocotpus",
"ocotpus", "ocotpus", "ocotpus", "ocotpus", "ocotpus", "ocotpus",
"ocotpus", "ocotpus"), Q491a = c("Roof tile", "roof", "roof",
"Roof", "Roof", "Roof", "Roof", "rooof", "roof", "roof", "panther",
"panther", "panther", "panther", "panther", "panther", "panther",
"panther", "panther", "panther"), Q452 = c("Rug", "rug", "rug",
"Oriental carpet", "Rug", "Rug", "Rug", "rug", "rug", "rug",
"checkers", "checkers", "checkers", "checkers", "checkers", "checkers",
"checkers", "checkers", "checkers", "checkers"), Q452a = c("Rug",
"rug", "rug", "Carpet", "Rug", "Rug", "Rug", "carpet", "rug",
"rug", "computer", "computer", "computer", "computer", "computer",
"computer", "computer", "computer", "computer", "computer"),
Q15 = c("waffle", "waffle", "waffle", "waffle", "waffle",
"waffle", "waffle", "waffle", "waffle", "waffle", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Q16 = c("egg", "egg", "egg",
"egg", "egg", "egg", "egg", "egg", "egg", "egg", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Q17 = c("bacon", "bacon",
"bacon", "bacon", "bacon", "bacon", "bacon", "bacon", "bacon",
"bacon", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Q18 = c("pancake",
"pancake", "pancake", "pancake", "pancake", "pancake", "pancake",
"pancake", "pancake", "pancake", NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Q19 = c("smoothie", "smoothie", "smoothie",
"smoothie", "smoothie", "smoothie", "smoothie", "smoothie",
"smoothie", "smoothie", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L), spec = structure(list(cols = list(
ip = structure(list(), class = c("collector_double", "collector"
)), gender = structure(list(), class = c("collector_character",
"collector")), age = structure(list(), class = c("collector_double",
"collector")), education = structure(list(), class = c("collector_double",
"collector")), race = structure(list(), class = c("collector_character",
"collector")), Q4 = structure(list(), class = c("collector_character",
"collector")), Q4a = structure(list(), class = c("collector_character",
"collector")), Q417 = structure(list(), class = c("collector_character",
"collector")), Q417a = structure(list(), class = c("collector_character",
"collector")), Q536 = structure(list(), class = c("collector_character",
"collector")), Q536a = structure(list(), class = c("collector_character",
"collector")), Q491 = structure(list(), class = c("collector_character",
"collector")), Q491a = structure(list(), class = c("collector_character",
"collector")), Q452 = structure(list(), class = c("collector_character",
"collector")), Q452a = structure(list(), class = c("collector_character",
"collector")), Q15 = structure(list(), class = c("collector_character",
"collector")), Q16 = structure(list(), class = c("collector_character",
"collector")), Q17 = structure(list(), class = c("collector_character",
"collector")), Q18 = structure(list(), class = c("collector_character",
"collector")), Q19 = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
One thing I am trying to be conscious of is to hopefully make this an iterative process as new surveys come in (eg. surver4 - survery1000)
Any help or ideas would be appreciated here as I am not clear as to how to attack this problem.