0

I have a large dataset of household data where each year, each section of the survey is in a separate file, with keys allowing the files to be merged. I created a couple of functions with for loops to load and iteratively join all of these files. I am importing this data from SPSS. The variables all have value labels, which I include in my import using read.spss(), however the value labels are only kept for a left join of files 1 on file 2. For all other files (27 in total), the value labels disappear once the file is merged into the main merged file. This is a problem I need to be able to see these labels for additional work that I want to do.

This is the code am running. Any help would be greatly appreciated.


#______________________________________________________________________________________________
###Loading Files Function
#_______________________________________________________________________________________________

# input_directory <- paste0(input,"/JSLC",year) ##input_directory for loading main JSLC Datasets



file_load_loop <- function(year){
  list_slc_files2 = list()
  input_directory <- paste0(input,"/JSLC",2000)
  for(i in 1:length(list.files(input_directory))){
    # list_slc_files[[paste0("df_", tools::file_path_sans_ext(list.files(input_directory)[i]))]] =  read.spss(file.path(input_directory,list.files(input_directory)[i]), use.value.labels = TRUE,to.data.frame=TRUE)
    list_slc_files2[[paste0("df_", tools::file_path_sans_ext(list.files(input_directory)[i]))]] =  as.data.frame(haven::read_sav(file.path(input_directory,list.files(input_directory)[i])))
    
    print(file.path(input_directory,list.files(input_directory)[i]))
    print(i)
  }
  return(list_slc_files2)
}
#______________________________________________________________________________________________
###Creating cCombined Files Function

#_______________________________________________________________________________________________


creating_combined_files <- function(year){
  # return(list_slc_files1)
  list_slc_files = file_load_loop(year)
  print(paste0("Length of File List: ", length(list_slc_files)))
  print('initial loop done!')
  df_merge_final = list_slc_files2[[1]]
  # view(df_merge_final)
  names(df_merge_final) = toupper(names(df_merge_final))  ##Change everything to uppercase
  
  # df_merge_final$IND <- as.numeric(df_merge_final$IND)
  df_merge_final$SERIAL <- as.numeric(df_merge_final$SERIAL) 
  # df_merge_final$RECORD <-   as.numeric(df_merge_final$RECORD)
  for(j in 2:(length(list_slc_files))){
    # df_x = list_exit_surveys[[j]]
    df_y = list_slc_files2[[j]]
    ###Change all column names to uppercase
    names(df_y) = toupper(names(df_y))
    # view(df_y)
    ##Make sure the relevant columns are all 
    df_y$SERIAL <- as.numeric(df_y$SERIAL) 
    
    if("IND" %in%  colnames(df_y)){
      
      df_y$IND <- as.numeric(df_y$IND)
    }
    
    
    if("RECORD" %in% colnames(df_y)){
      
      df_y$RECORD <-   as.numeric(df_y$RECORD)
    }
    
    
    if(!"ITEM_CD" %in% colnames(df_y)&&!"ITEM_COD" %in% colnames(df_y)  && !"ITEM_CODEK" %in% colnames(df_y)){
      
      if(nrow(list_slc_files[[j]]) < 30000){  #&& j != 25 && j !=20 && j!=14 && j!= 17){
        
        ##Now merge based on the values of these columns
        print(j)
        print(paste0("year: ", year))
        print("The Dataframe Upon Which We're Merging is: ")
        print(names(list_slc_files)[[j]])
        if("IND" %in%  colnames(df_y) && names(list_slc_files)[[j]] != paste0("df_povline0",year-2000))
        {df_merge = left_join(df_merge_final,df_y,by=c("SERIAL","IND"))
        
        
        
        }
        else if("IND" %in%  colnames(df_y) && names(list_slc_files)[[j]] == paste0("df_povline0",year-2000))
        {df_merge = left_join(df_merge_final,df_y,by=c("SERIAL"))
        print('merged pov!')
        
        
        }
        
        else{
          
          df_merge = left_join(df_merge_final,df_y,by=c("SERIAL"))
          
        }
      }
      else{
        
        next
        
      }
      
      df_merge_final = df_merge
      print(nrow(df_merge_final))
      print(j)
      
      
    }
    else{ 
      next
      
      # if(nrow(list_slc_files[[j]]) < 7000){
      # 
      # ##Now merge based on the values of these columns
      #     if("IND" %in%  colnames(df_y) & j != 2){
      #       df_merge = left_join(df_merge_final,df_y,by=c("SERIAL","IND"))
      #       
      #       
      #       
      #     }
      #     else if("IND" %in%  colnames(df_y) & j == 2){
      #       df_merge = merge(df_merge_final,df_y,by=c("SERIAL"),all.y=TRUE)
      #       print('merged pov!')
      #       
      #       
      #     }
      #     
      #     else{
      #       
      #       df_merge = left_join(df_merge_final,df_y,by=c("SERIAL"))
      #       
      #     }
      #   }
      # else{
      #   
      #   next
      #   
      # }
      
      # df_merge_final = df_merge
      # print(nrow(df_merge_final))
      # print(j) 
    }
    
  }
  return(df_merge_final)
}



I have tried switching to using haven::read_sav, and read_spss() but this has not helped. I also tried using the attr function to re_add the value labels on the merged document. R says the value labels have been added, but I cannot see them on the dataframe, which is what I need.

 df_merge = left_join(df_merge_final,df_y,by=c("SERIAL","IND")) %>% `attr<-`(
+     "variable.labels",
+     c(attr(df_merge_final, "variable.labels"),
+       attr(df_y, "variable.labels")))

[R Showing the variable labels have been added.1

But they are not visible on the dataframe, which is what I need.

stefan
  • 90,330
  • 6
  • 25
  • 51
Matt
  • 97
  • 10
  • Hi Matt! Have you tried using a very very simple dataset? As much as I like your enthusiasm in putting in 138 lines of code, I don't think an example that large is necessary for demonstrating left joins – Mark Jul 28 '23 at 10:04
  • in general though, if you want to keep the keys you join by, you can use the `keep = TRUE` argument with left_join – Mark Jul 28 '23 at 10:04

0 Answers0