Remove duplicate records with 4 rows each

Question

I'm trying to remove duplicate elements from my dataframe.

# A tibble: 12 x 3
       g h         i
   <dbl> <chr> <int>
 1     1 a         1
 2     1 b         2
 3     1 c         3
 4     1 d         4
 5     2 a         5
 6     2 b         6
 7     2 c         7
 8     2 d         8
 9     1 a         9
10     1 b        10
11     1 c        11
12     1 d        12

Each element consists of 4 rows, and I want every element that is kept to retain all 4 of its rows. The expected output is:

# A tibble: 8 x 3
      g h         i
  <dbl> <chr> <int>
1     1 a         1
2     1 b         2
3     1 c         3
4     1 d         4
5     2 a         5
6     2 b         6
7     2 c         7
8     2 d         8

I've tried distinct() and unique(), but neither of them worked.

score 2 · Accepted Answer · answered Feb 17 '21 at 19:59

We can use distinct on the selected columns

library(dplyr)

# Keep the first row for every (g, h) combination; .keep_all = TRUE
# retains the remaining columns (here `i`) instead of dropping them.
df1 %>%
  distinct(g, h, .keep_all = TRUE)

-output

#  g h i
#1 1 a 1
#2 1 b 2
#3 1 c 3
#4 1 d 4
#5 2 a 5
#6 2 b 6
#7 2 c 7
#8 2 d 8

Or with duplicated

# Base R: drop every row whose (g, h) pair has already appeared earlier.
df1[!duplicated(df1[, c("g", "h")]), ]

data

# Example data: three runs of four rows (g = 1, 2, 1), each run cycling
# through h = "a".."d"; the third run repeats the (g, h) pairs of the first.
df1 <- structure(
  list(
    g = rep(c(1L, 2L, 1L), each = 4L),
    h = rep(c("a", "b", "c", "d"), times = 3L),
    i = 1:12
  ),
  class = "data.frame",
  row.names = as.character(1:12)
)

score 1 · Answer 2 · answered Feb 17 '21 at 20:31

Another option is to use the unique S3 method on a data.table object:

library(data.table)

# Convert to a data.table, then keep the first row of each (g, h) pair.
unique(data.table(dat), by = c("g", "h"))

#    g h i
# 1: 1 a 1
# 2: 1 b 2
# 3: 1 c 3
# 4: 1 d 4
# 5: 2 a 5
# 6: 2 b 6
# 7: 2 c 7
# 8: 2 d 8

Data

# Example data as a tibble-classed object: three runs of four rows
# (g = 1, 2, 1), each run cycling through h = "a".."d".
dat <- structure(
  list(
    g = rep(c(1L, 2L, 1L), each = 4L),
    h = rep(c("a", "b", "c", "d"), times = 3L),
    i = seq_len(12L)
  ),
  row.names = c(NA, -12L),
  class = c("tbl_df", "tbl", "data.frame")
)

Remove duplicate records with 4 rows each

2 Answers

data

Data