7

I have a dataframe where I want to change the column names by matching to another dataframe.

Example dataframe with data and column names:

df <- data.frame("Gene_Symbol" = c("Gene1","Gene2","Gene3","Gene4","Gene5","Gene6","Gene7"), 
             "Sample1" = c(85657.97656,54417.78906,110949.3281,53197.45313,87156.80469,NA,23880.2832), 
             "Sample2" = c(10423.40918,41660.73047,40094.54688,49519.78125,129387.1094,NA,23903.25977), 
             "Sample3" = c(18778.68359,43655.79688,NA,57447.08984,113266.1484,44810.26172,26316.6543), 
             "Sample4" = c(23919.53125,47829.02344,NA,51478.58203,116275.3359,43110.94922,25417.45508), 
             "Sample5" = c(NA,46677.20313,63389.45313,48722.15234,NA,77135.52344,40265.6875), 
             "Sample6" = c(NA,68596.22656,56802.60938,44712.64063,NA,47744.17969,33689.62891), 
             "Sample7" = c(NA,80506.14844,48722.99219,38629.00781,NA,37885,36638.02344))

The datframe I want to use to exchange from Sample to the Tumor number in df above.

df2 <- data.frame("Sample_name" = c("Sample1","Sample2","Sample3","Sample4","Sample5","Sample6", "Sample7"), "Tumor_name" = c("Tumor56", "Tumor17", "Tumor99", "Tumor2", "Tumor34", "Tumor84", "Tumor51"))

I found a way in dplyr, see below, but it feels very elaborate. Is there an easier way?

library(tidyverse)
df %>% 
  column_to_rownames("Gene_Symbol")%>%                    # Bring Gene_Symbol to row name before transpose
  t()%>%                                                  # Transpose to be able to use join
  data.frame()%>%                                         # Transpose makes a matrix - make dataframe again
  rownames_to_column("Sample_name")%>%                    # Bring sample names to column to use join
  left_join(., df2, by = "Sample_name", copy = TRUE) %>%  # Join by Sample_name column in both data sets
  column_to_rownames("Tumor_name")%>%                     # Bring Tumor names to row name before transpose
  select(-Sample_name)%>%                                 # Drop Sample name column 
  t()%>%                                                  # Transpose 
  data.frame()%>%                                         # Transpose makes a matrix - make dataframe again 
  rownames_to_column("Gene_Symbol")                       # Transfer rownames to column again

It would be nice with matching to exchange name, since I can foresee that I will need to do this for subsets of the column names. Looked at rename but could not get it to work. Also, when I transpose, I get a matrix, why is that?

starball
  • 20,030
  • 7
  • 43
  • 238
Henrik
  • 105
  • 3
  • Realized I can use rename in dplyr and named characters. df3 <- c(Tumor56 = "Sample1", Tumor17 ="Sample2", Tumor99 = "Sample3", Tumor2 = "Sample4", Tumor34 = "Sample5", Tumor84 = "Sample6", Tumor51 = "Sample7") and then df %>% rename(., !!df3) but how can I import a named character list? – Henrik Feb 08 '20 at 20:43

5 Answers5

6

Here is a tidyverse friendly solution using the !!! splice operator.

library(tidyverse)

# original data set up from stack overflow -------------------------------------
df <- data.frame("Gene_Symbol" = c("Gene1","Gene2","Gene3","Gene4","Gene5","Gene6","Gene7"), 
                 "Sample1" = c(85657.97656,54417.78906,110949.3281,53197.45313,87156.80469,NA,23880.2832), 
                 "Sample2" = c(10423.40918,41660.73047,40094.54688,49519.78125,129387.1094,NA,23903.25977), 
                 "Sample3" = c(18778.68359,43655.79688,NA,57447.08984,113266.1484,44810.26172,26316.6543), 
                 "Sample4" = c(23919.53125,47829.02344,NA,51478.58203,116275.3359,43110.94922,25417.45508), 
                 "Sample5" = c(NA,46677.20313,63389.45313,48722.15234,NA,77135.52344,40265.6875), 
                 "Sample6" = c(NA,68596.22656,56802.60938,44712.64063,NA,47744.17969,33689.62891), 
                 "Sample7" = c(NA,80506.14844,48722.99219,38629.00781,NA,37885,36638.02344))

df2 <- data.frame(
  "Sample_name" = c("Sample1","Sample2","Sample3","Sample4","Sample5","Sample6", "Sample7"), 
  "Tumor_name" = c("Tumor56", "Tumor17", "Tumor99", "Tumor2", "Tumor34", "Tumor84", "Tumor51")
  )


# create named vector of variable names ----------------------------------------
# values are current variable names, vector names are the new variable names
var_names <- df2 %>% 
  # new variable names, old variable names
  select(Tumor_name, Sample_name) %>% 
  deframe()

var_names
#>   Tumor56   Tumor17   Tumor99    Tumor2   Tumor34   Tumor84   Tumor51 
#> "Sample1" "Sample2" "Sample3" "Sample4" "Sample5" "Sample6" "Sample7"

# rename vaiables---------------------------------------------------------------
df_updated <-  df %>% 
  rename(!!!var_names)

df
#>   Gene_Symbol   Sample1   Sample2   Sample3   Sample4  Sample5  Sample6
#> 1       Gene1  85657.98  10423.41  18778.68  23919.53       NA       NA
#> 2       Gene2  54417.79  41660.73  43655.80  47829.02 46677.20 68596.23
#> 3       Gene3 110949.33  40094.55        NA        NA 63389.45 56802.61
#> 4       Gene4  53197.45  49519.78  57447.09  51478.58 48722.15 44712.64
#> 5       Gene5  87156.80 129387.11 113266.15 116275.34       NA       NA
#> 6       Gene6        NA        NA  44810.26  43110.95 77135.52 47744.18
#> 7       Gene7  23880.28  23903.26  26316.65  25417.46 40265.69 33689.63
#>    Sample7
#> 1       NA
#> 2 80506.15
#> 3 48722.99
#> 4 38629.01
#> 5       NA
#> 6 37885.00
#> 7 36638.02
df_updated
#>   Gene_Symbol   Tumor56   Tumor17   Tumor99    Tumor2  Tumor34  Tumor84
#> 1       Gene1  85657.98  10423.41  18778.68  23919.53       NA       NA
#> 2       Gene2  54417.79  41660.73  43655.80  47829.02 46677.20 68596.23
#> 3       Gene3 110949.33  40094.55        NA        NA 63389.45 56802.61
#> 4       Gene4  53197.45  49519.78  57447.09  51478.58 48722.15 44712.64
#> 5       Gene5  87156.80 129387.11 113266.15 116275.34       NA       NA
#> 6       Gene6        NA        NA  44810.26  43110.95 77135.52 47744.18
#> 7       Gene7  23880.28  23903.26  26316.65  25417.46 40265.69 33689.63
#>    Tumor51
#> 1       NA
#> 2 80506.15
#> 3 48722.99
#> 4 38629.01
#> 5       NA
#> 6 37885.00
#> 7 36638.02

Created on 2022-02-24 by the reprex package (v2.0.1)

Shannon Pileggi
  • 101
  • 1
  • 3
3

We could use match

names(df)[-1] <- as.character(df2$Tumor_name[match(names(df)[-1], df2$Sample_name)])
df

#  Gene_Symbol   Tumor56   Tumor17   Tumor99    Tumor2  Tumor34  Tumor84  Tumor51
#1       Gene1  85657.98  10423.41  18778.68  23919.53       NA       NA       NA
#2       Gene2  54417.79  41660.73  43655.80  47829.02 46677.20 68596.23 80506.15
#3       Gene3 110949.33  40094.55        NA        NA 63389.45 56802.61 48722.99
#4       Gene4  53197.45  49519.78  57447.09  51478.58 48722.15 44712.64 38629.01
#5       Gene5  87156.80 129387.11 113266.15 116275.34       NA       NA       NA
#6       Gene6        NA        NA  44810.26  43110.95 77135.52 47744.18 37885.00
#7       Gene7  23880.28  23903.26  26316.65  25417.46 40265.69 33689.63 36638.02
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
1

One approach utilizing dplyr and stringr could be:

df %>%
 rename_at(vars(starts_with("Sample")), 
           ~ str_replace_all(., setNames(df2$Tumor_name, df2$Sample_name)))

  Gene_Symbol   Tumor56   Tumor17   Tumor99    Tumor2  Tumor34  Tumor84  Tumor51
1       Gene1  85657.98  10423.41  18778.68  23919.53       NA       NA       NA
2       Gene2  54417.79  41660.73  43655.80  47829.02 46677.20 68596.23 80506.15
3       Gene3 110949.33  40094.55        NA        NA 63389.45 56802.61 48722.99
4       Gene4  53197.45  49519.78  57447.09  51478.58 48722.15 44712.64 38629.01
5       Gene5  87156.80 129387.11 113266.15 116275.34       NA       NA       NA
6       Gene6        NA        NA  44810.26  43110.95 77135.52 47744.18 37885.00
7       Gene7  23880.28  23903.26  26316.65  25417.46 40265.69 33689.63 36638.02
tmfmnk
  • 38,881
  • 4
  • 47
  • 67
1

We could reshape wide-to-long, merge, then reshape again to long-to-wide:

library(dplyr)
library(tidyr)

pivot_longer(df, cols = starts_with("S"), names_to = "Sample_name") %>% 
  left_join(df2, by = "Sample_name") %>% 
  pivot_wider(id_cols = Gene_Symbol, names_from = Tumor_name, values_from = value)

## A tibble: 7 x 8
#  Gene_Symbol Tumor56 Tumor17 Tumor99  Tumor2 Tumor34 Tumor84 Tumor51
#  <chr>         <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
#1 Gene1        85658.  10423.  18779.  23920.     NA      NA      NA 
#2 Gene2        54418.  41661.  43656.  47829.  46677.  68596.  80506.
#3 Gene3       110949.  40095.     NA      NA   63389.  56803.  48723.
#4 Gene4        53197.  49520.  57447.  51479.  48722.  44713.  38629.
#5 Gene5        87157. 129387. 113266. 116275.     NA      NA      NA 
#6 Gene6           NA      NA   44810.  43111.  77136.  47744.  37885 
#7 Gene7        23880.  23903.  26317.  25417.  40266.  33690.  36638.
zx8754
  • 52,746
  • 12
  • 114
  • 209
0

As another tidyverse solution, you can also use select and all_of instead of rename and the splice operator !!!. Otherwise, this solution is identical to Shannon Pileggi’s excellent solution.

library(tidyverse)

# original data set up from stack overflow -------------------------------------
df <- data.frame("Gene_Symbol" = c("Gene1","Gene2","Gene3","Gene4","Gene5","Gene6","Gene7"),
                 "Sample1" = c(85657.97656,54417.78906,110949.3281,53197.45313,87156.80469,NA,23880.2832),
                 "Sample2" = c(10423.40918,41660.73047,40094.54688,49519.78125,129387.1094,NA,23903.25977),
                 "Sample3" = c(18778.68359,43655.79688,NA,57447.08984,113266.1484,44810.26172,26316.6543),
                 "Sample4" = c(23919.53125,47829.02344,NA,51478.58203,116275.3359,43110.94922,25417.45508),
                 "Sample5" = c(NA,46677.20313,63389.45313,48722.15234,NA,77135.52344,40265.6875),
                 "Sample6" = c(NA,68596.22656,56802.60938,44712.64063,NA,47744.17969,33689.62891),
                 "Sample7" = c(NA,80506.14844,48722.99219,38629.00781,NA,37885,36638.02344))

df2 <- data.frame(
  "Sample_name" = c("Sample1","Sample2","Sample3","Sample4","Sample5","Sample6", "Sample7"),
  "Tumor_name" = c("Tumor56", "Tumor17", "Tumor99", "Tumor2", "Tumor34", "Tumor84", "Tumor51")
)


# create named vector of variable names ----------------------------------------
# values are current variable names, vector names are the new variable names
var_names <- df2 %>%
  # new variable names, old variable names
  select(Tumor_name, Sample_name) %>%
  deframe()

var_names
#>   Tumor56   Tumor17   Tumor99    Tumor2   Tumor34   Tumor84   Tumor51 
#> "Sample1" "Sample2" "Sample3" "Sample4" "Sample5" "Sample6" "Sample7"

# rename variables by using `select` and `all_of` ------------------------------
# note that other variables like `Gene_Symbol` are selected independently
df_updated <-  df %>%
  select(Gene_Symbol, all_of(var_names))

df
#>   Gene_Symbol   Sample1   Sample2   Sample3   Sample4  Sample5  Sample6
#> 1       Gene1  85657.98  10423.41  18778.68  23919.53       NA       NA
#> 2       Gene2  54417.79  41660.73  43655.80  47829.02 46677.20 68596.23
#> 3       Gene3 110949.33  40094.55        NA        NA 63389.45 56802.61
#> 4       Gene4  53197.45  49519.78  57447.09  51478.58 48722.15 44712.64
#> 5       Gene5  87156.80 129387.11 113266.15 116275.34       NA       NA
#> 6       Gene6        NA        NA  44810.26  43110.95 77135.52 47744.18
#> 7       Gene7  23880.28  23903.26  26316.65  25417.46 40265.69 33689.63
#>    Sample7
#> 1       NA
#> 2 80506.15
#> 3 48722.99
#> 4 38629.01
#> 5       NA
#> 6 37885.00
#> 7 36638.02
df_updated
#>   Gene_Symbol   Tumor56   Tumor17   Tumor99    Tumor2  Tumor34  Tumor84
#> 1       Gene1  85657.98  10423.41  18778.68  23919.53       NA       NA
#> 2       Gene2  54417.79  41660.73  43655.80  47829.02 46677.20 68596.23
#> 3       Gene3 110949.33  40094.55        NA        NA 63389.45 56802.61
#> 4       Gene4  53197.45  49519.78  57447.09  51478.58 48722.15 44712.64
#> 5       Gene5  87156.80 129387.11 113266.15 116275.34       NA       NA
#> 6       Gene6        NA        NA  44810.26  43110.95 77135.52 47744.18
#> 7       Gene7  23880.28  23903.26  26316.65  25417.46 40265.69 33689.63
#>    Tumor51
#> 1       NA
#> 2 80506.15
#> 3 48722.99
#> 4 38629.01
#> 5       NA
#> 6 37885.00
#> 7 36638.02

Created on 2022-08-31 by the reprex package (v2.0.1)

mach
  • 175
  • 6