1

I have a dataset with several rows and columns. Below is the snapshot of what some of the rows and columns looks like.

  ID  Date        Gender Age  Col1    Col2    Col3    Col4    Col5    Col6    Col7    Col8 
  10  2015-10-14  F      68   345.50  884.2   008.69  202.18  189.8   435.2   084.7   757
  93  2002-07-22  F      87   242.80  710.9   345.50  884.2   008.69  202.18  189.8   435.2  
  14  2004-07-28  M      92   084.7   757     242.80  710.9   427.2   530.10  567.89  227.9
  41  2011-02-24  M      39   714.0   084.7   757     242.80  710.9   427.2   530.10  567.89
  64  2002-03-14  F      39   227.9   714.0   V58.49  906.7   800.35  V88.0   349.31  289.84 
  22  2015-11-21  F      68   324.0   V65.44  411.8   200.41  187.7   E869.3  041.04  170.4
  36  2003-09-17  F      75   389.1   176.3   788.37  E936.3  277.82  812.12  E816.7  663.90
  11  2000-10-07  M      74   716.90  396.3   482.1   E816.7  663.90  716.90  396.3   482.1 
  45  2001-07-14  F      31   614.2   945.44  799.4   864.05  371.31  268     626.2   780.72
  60  1999-02-23  M      45   674     645.2   006.5   V68.2   V67.00  665.24  434.00  914.3

I have another dataset that is a lookup table which contains short description for the codes in Col1, Col2, Col3, Col4, Col5, Col6, Col7 and Col8 which is as shown below

 Code       Short_Description
 345.50     interStellar
 884.2      indispensable
 008.69     hallucination
 202.18     flow
 189.8      categorizing
 435.2      choppiness
 084.7      chieftain
 757        substantiating
 V58.49     unbridled
 V88.0      polish
 324.0      stumble
 V65.44     hoopster
 411.8      overtrimmed
 E869.3     overbrutalizing
 041.04     choric
 E936.3     busera
 277.82     subdelegating
 E816.7     baton   
 663.90     Space

My question is how do I, match the codes in the first dataset with codes in the second lookup dataset and replace the matching codes with their corresponding short description ?

The expected output below shows the codes 345.50 matched and replaced with interStellar , V58.49 matched and replaced with unbridled I am hoping for an output where all the codes are matched and replaced with their corresponding descriptions. I know how to do this using if-then-else but it will be very inefficient and I am assuming there should be some easy way to do this. Any help is much appreciated. Thanks in advance.

  ID  Date        Gender Age  Col1    Col2    Col3    Col4    Col5    Col6    Col7    Col8 
  10  2015-10-14  F      68   interStellar  884.2   008.69  202.18  189.8   435.2   084.7   757
  93  2002-07-22  F      87   242.80  710.9   interStellar  884.2   008.69  202.18  189.8   435.2  
  14  2004-07-28  M      92   084.7   757     242.80  710.9   427.2   530.10  567.89  227.9
  41  2011-02-24  M      39   714.0   084.7   757     242.80  710.9   427.2   530.10  567.89
  64  2002-03-14  F      39   227.9   714.0   unbridled  906.7   800.35  V88.0   349.31  289.84 
  22  2015-11-21  F      68   324.0  hoopster  411.8   200.41  187.7   E869.3  041.04  170.4
  36  2003-09-17  F      75   389.1   176.3   788.37  E936.3  277.82  812.12  baton  663.90
  11  2000-10-07  M      74   716.90  396.3   482.1   baton  663.90  716.90  396.3   482.1 
  45  2001-07-14  F      31   614.2   945.44  799.4   864.05  371.31  268     626.2   780.72
  60  1999-02-23  M      45   674     645.2   006.5   V68.2   V67.00  665.24  434.00  914.3

==================== Reproducible dataset used in this example========================

df1 = structure(list(ID = c(10L, 93L, 14L, 41L, 64L, 22L, 36L, 11L, 
45L, 60L), Date = c("10/14/2015", "7/22/2002", "7/28/2004", "2/24/2011", 
"3/14/2002", "11/21/2015", "9/17/2003", "10/7/2000", "7/14/2001", 
"2/23/1999"), Gender = c("F", "F", "M", "M", "F", "F", "F", "M", 
"F", "M"), Age = c(68L, 87L, 92L, 39L, 39L, 68L, 75L, 74L, 31L, 
45L), Col1 = c(345.5, 242.8, 84.7, 714, 227.9, 324, 389.1, 716.9, 
614.2, 674), Col2 = c("884.2", "710.9", "757", "84.7", "714", 
"V65.44", "176.3", "396.3", "945.44", "645.2"), Col3 = c("8.69", 
"345.5", "242.8", "757", "V58.49", "411.8", "788.37", "482.1", 
"799.4", "6.5"), Col4 = c("202.18", "884.2", "710.9", "242.8", 
"906.7", "200.41", "E936.3", "E816.7", "864.05", "V68.2"), Col5 = c("189.8", 
"8.69", "427.2", "710.9", "800.35", "187.7", "277.82", "663.9", 
"371.31", "V67.00"), Col6 = c("435.2", "202.18", "530.1", "427.2", 
"V88.0", "E869.3", "812.12", "716.9", "268", "665.24"), Col7 = c("84.7", 
"189.8", "567.89", "530.1", "349.31", "41.04", "E816.7", "396.3", 
"626.2", "434"), Col8 = c(757, 435.2, 227.9, 567.89, 289.84, 
170.4, 663.9, 482.1, 780.72, 914.3)), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -10L), .Names = c("ID", "Date", 
"Gender", "Age", "Col1", "Col2", "Col3", "Col4", "Col5", "Col6", 
"Col7", "Col8"), spec = structure(list(cols = structure(list(
    ID = structure(list(), class = c("collector_integer", "collector"
    )), Date = structure(list(), class = c("collector_character", 
    "collector")), Gender = structure(list(), class = c("collector_character", 
    "collector")), Age = structure(list(), class = c("collector_integer", 
    "collector")), Col1 = structure(list(), class = c("collector_double", 
    "collector")), Col2 = structure(list(), class = c("collector_character", 
    "collector")), Col3 = structure(list(), class = c("collector_character", 
    "collector")), Col4 = structure(list(), class = c("collector_character", 
    "collector")), Col5 = structure(list(), class = c("collector_character", 
    "collector")), Col6 = structure(list(), class = c("collector_character", 
    "collector")), Col7 = structure(list(), class = c("collector_character", 
    "collector")), Col8 = structure(list(), class = c("collector_double", 
    "collector"))), .Names = c("ID", "Date", "Gender", "Age", 
"Col1", "Col2", "Col3", "Col4", "Col5", "Col6", "Col7", "Col8"
)), default = structure(list(), class = c("collector_guess", 
"collector"))), .Names = c("cols", "default"), class = "col_spec"))


lookup_table = structure(list(Code = c("345.5", "884.2", "8.69", "202.18", "189.8", 
"435.2", "84.7", "757", "V58.49", "V88.0", "324", "V65.44", "411.8", 
"E869.3", "41.04", "E936.3", "277.82", "E816.7", "63.9"), Short_Description = c("interStellar", 
"indispensable", "hallucination", "flow", "\tcategorizing", "choppiness", 
"chieftain", "\tsubstantiating", "unbridled", "polish", "stumble", 
"hoopster", "overtrimmed", "overbrutalizing", "choric", "busera", 
"subdelegating", "baton\t", "Space")), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -19L), .Names = c("Code", "Short_Description"
), spec = structure(list(cols = structure(list(Code = structure(list(), class = c("collector_character", 
"collector")), Short_Description = structure(list(), class = c("collector_character", 
"collector"))), .Names = c("Code", "Short_Description")), default = structure(list(), class = c("collector_guess", 
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
Kim Jenkins
  • 438
  • 3
  • 17
  • 2
    `library(dplyr) ; df1 %>% mutate_at(vars(starts_with('Col')), funs(ifelse(. %in% lookup_table$Code, lookup_table$Short_Description[match(., lookup_table$Code)], .)))` – alistaire Nov 19 '16 at 02:44
  • @alistaire, this worked, how do I change the `vars(starts_with('Col'))` part of the code such that I can select variables that starts with `Col` or `Secondary` – Kim Jenkins Nov 19 '16 at 10:56
  • 1
    `vars(matches('Col|Secondary'))` maybe. Documentation: `?dplyr::select_helpers` – alistaire Nov 19 '16 at 12:13
  • this no longer works, I get the following error: Warning message: `funs()` was deprecated in dplyr 0.8.0. Please use a list of either functions or lambdas: – lu-202 Feb 28 '22 at 11:17

2 Answers2

2

We can do this with gather/spread by reshaping the 'wide' dataset to 'long' format initially with gather, do a left_join with the 'lookup_table', mutate the 'Code' by replacing the elements in 'Code' with 'Short_Description' (where it is not a missing value) and spread to 'wide' format after selecting the required columns (removing 'Short_Description').

library(dplyr)
library(tidyr)
gather(df1, Var, Code, matches("Col")) %>% 
      left_join(., lookup_table) %>% 
      mutate(Code = if_else(!is.na(Short_Description), Short_Description, Code)) %>% 
      select(-Short_Description) %>%
      spread(Var, Code)

For large datasets, another option is set from data.table. Create a numeric index of column names that have 'Col' as substring ('nm1'). Convert the 'data.frame' to 'data.table' (setDT(df1)), loop through the 'nm1' columns after specifying the .SDcols and convert it to character (as the expected output will be having character strings from the 'Short_Description' column. Use a for loop and use set to change the 'values' (using match) of the columns and rows specified in the 'i'.

library(data.table)
nm1 <- grep("Col", names(df1))
setDT(df1)[, (nm1) := lapply(.SD, as.character), .SDcols= nm1]
for(j in nm1){
  set(df1, i = which(df1[[j]] %chin% lookup_table$Code), j = j,
    value = lookup_table$Short_Description[match(df1[[j]], lookup_table$Code, nomatch=0)])
 }

df1
akrun
  • 874,273
  • 37
  • 540
  • 662
  • great solution but I got an error saying, Out of memory , this is mainly because I am dealing with a large dataset with many columns and atleast million rows. However ,`alistaire`, solution worked – Kim Jenkins Nov 19 '16 at 10:53
  • @KimJenkins Thanks for the comments. I updated with a data.table option. Could you check if that works – akrun Nov 19 '16 at 11:10
  • 1
    ignore my previous comment. The latest version worked like a knife on butter. It was smooth * 100. – Kim Jenkins Nov 19 '16 at 11:22
1

You can do a loop for that, using lookup from qdap package

library(qdap)

df1[,13:20] <- NA

for(i in 1:dim(df1)[1]){
  for(j in 1:8){
    df1[i,j+12] <- lookup(df1[i,j+4], lookup_table)
  }
}

head(df1)

  ID       Date Gender Age  Col1   Col2   Col3   Col4   Col5   Col6   Col7   Col8          V13              V14              V15           V16            V17             V18            V19              V20
1 10 10/14/2015      F  68 345.5  884.2   8.69 202.18  189.8  435.2   84.7    757 interStellar    indispensable    hallucination          flow \tcategorizing      choppiness      chieftain \tsubstantiating
2 93  7/22/2002      F  87 242.8  710.9  345.5  884.2   8.69 202.18  189.8  435.2         <NA>             <NA>     interStellar indispensable  hallucination            flow \tcategorizing       choppiness
3 14  7/28/2004      M  92  84.7    757  242.8  710.9  427.2  530.1 567.89  227.9    chieftain \tsubstantiating             <NA>          <NA>           <NA>            <NA>           <NA>             <NA>
4 41  2/24/2011      M  39   714   84.7    757  242.8  710.9  427.2  530.1 567.89         <NA>        chieftain \tsubstantiating          <NA>           <NA>            <NA>           <NA>             <NA>
5 64  3/14/2002      F  39 227.9    714 V58.49  906.7 800.35  V88.0 349.31 289.84         <NA>             <NA>        unbridled          <NA>           <NA>          polish           <NA>             <NA>
6 22 11/21/2015      F  68   324 V65.44  411.8 200.41  187.7 E869.3  41.04  170.4      stumble         hoopster      overtrimmed          <NA>           <NA> overbrutalizing         choric             <NA>
aldo_tapia
  • 1,153
  • 16
  • 27
  • appreciated the solution aldo but I am not even going to try this option because like I mentioned earlier I have several hundred columns and million rows in my dataset and if I try your approach it looks like I will be adding twice the number of columns and this will cause memory issues. – Kim Jenkins Nov 19 '16 at 11:27