I have a large dataset of household data where each year, each section of the survey is in a separate file, with keys allowing the files to be merged. I created a couple of functions with for loops to load and iteratively join all of these files. I am importing this data from SPSS. The variables all have value labels, which I include in my import using read.spss(), however the value labels are only kept for a left join of files 1 on file 2. For all other files (27 in total), the value labels disappear once the file is merged into the main merged file. This is a problem I need to be able to see these labels for additional work that I want to do.
This is the code am running. Any help would be greatly appreciated.
#______________________________________________________________________________________________
###Loading Files Function
#_______________________________________________________________________________________________
# input_directory <- paste0(input,"/JSLC",year) ##input_directory for loading main JSLC Datasets
file_load_loop <- function(year){
list_slc_files2 = list()
input_directory <- paste0(input,"/JSLC",2000)
for(i in 1:length(list.files(input_directory))){
# list_slc_files[[paste0("df_", tools::file_path_sans_ext(list.files(input_directory)[i]))]] = read.spss(file.path(input_directory,list.files(input_directory)[i]), use.value.labels = TRUE,to.data.frame=TRUE)
list_slc_files2[[paste0("df_", tools::file_path_sans_ext(list.files(input_directory)[i]))]] = as.data.frame(haven::read_sav(file.path(input_directory,list.files(input_directory)[i])))
print(file.path(input_directory,list.files(input_directory)[i]))
print(i)
}
return(list_slc_files2)
}
#______________________________________________________________________________________________
###Creating cCombined Files Function
#_______________________________________________________________________________________________
creating_combined_files <- function(year){
# return(list_slc_files1)
list_slc_files = file_load_loop(year)
print(paste0("Length of File List: ", length(list_slc_files)))
print('initial loop done!')
df_merge_final = list_slc_files2[[1]]
# view(df_merge_final)
names(df_merge_final) = toupper(names(df_merge_final)) ##Change everything to uppercase
# df_merge_final$IND <- as.numeric(df_merge_final$IND)
df_merge_final$SERIAL <- as.numeric(df_merge_final$SERIAL)
# df_merge_final$RECORD <- as.numeric(df_merge_final$RECORD)
for(j in 2:(length(list_slc_files))){
# df_x = list_exit_surveys[[j]]
df_y = list_slc_files2[[j]]
###Change all column names to uppercase
names(df_y) = toupper(names(df_y))
# view(df_y)
##Make sure the relevant columns are all
df_y$SERIAL <- as.numeric(df_y$SERIAL)
if("IND" %in% colnames(df_y)){
df_y$IND <- as.numeric(df_y$IND)
}
if("RECORD" %in% colnames(df_y)){
df_y$RECORD <- as.numeric(df_y$RECORD)
}
if(!"ITEM_CD" %in% colnames(df_y)&&!"ITEM_COD" %in% colnames(df_y) && !"ITEM_CODEK" %in% colnames(df_y)){
if(nrow(list_slc_files[[j]]) < 30000){ #&& j != 25 && j !=20 && j!=14 && j!= 17){
##Now merge based on the values of these columns
print(j)
print(paste0("year: ", year))
print("The Dataframe Upon Which We're Merging is: ")
print(names(list_slc_files)[[j]])
if("IND" %in% colnames(df_y) && names(list_slc_files)[[j]] != paste0("df_povline0",year-2000))
{df_merge = left_join(df_merge_final,df_y,by=c("SERIAL","IND"))
}
else if("IND" %in% colnames(df_y) && names(list_slc_files)[[j]] == paste0("df_povline0",year-2000))
{df_merge = left_join(df_merge_final,df_y,by=c("SERIAL"))
print('merged pov!')
}
else{
df_merge = left_join(df_merge_final,df_y,by=c("SERIAL"))
}
}
else{
next
}
df_merge_final = df_merge
print(nrow(df_merge_final))
print(j)
}
else{
next
# if(nrow(list_slc_files[[j]]) < 7000){
#
# ##Now merge based on the values of these columns
# if("IND" %in% colnames(df_y) & j != 2){
# df_merge = left_join(df_merge_final,df_y,by=c("SERIAL","IND"))
#
#
#
# }
# else if("IND" %in% colnames(df_y) & j == 2){
# df_merge = merge(df_merge_final,df_y,by=c("SERIAL"),all.y=TRUE)
# print('merged pov!')
#
#
# }
#
# else{
#
# df_merge = left_join(df_merge_final,df_y,by=c("SERIAL"))
#
# }
# }
# else{
#
# next
#
# }
# df_merge_final = df_merge
# print(nrow(df_merge_final))
# print(j)
}
}
return(df_merge_final)
}
I have tried switching to using haven::read_sav, and read_spss() but this has not helped. I also tried using the attr function to re_add the value labels on the merged document. R says the value labels have been added, but I cannot see them on the dataframe, which is what I need.
df_merge = left_join(df_merge_final,df_y,by=c("SERIAL","IND")) %>% `attr<-`(
+ "variable.labels",
+ c(attr(df_merge_final, "variable.labels"),
+ attr(df_y, "variable.labels")))
[