2

I have a data.table dat with 4 columns, say (col1, col2, col3, col4).

Input data:

structure(list(col1 = c(5.1, 5.1, 4.7, 4.6, 5, 5.1, 5.1, 4.7, 
4.6, 5), col2 = c(3.5, 3.5, 3.2, 3.1, 3.6, 3.5, 3.5, 3.2, 3.1, 
3.6), col3 = c(1.4, 1.4, 1.3, 1.5, 1.4, 3.4, 3.4, 1.3, 1.5, 1.4
), col4 = structure(c(1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L), .Label = c("setosa", 
"versicolor", "virginica", "eer"), class = "factor")), .Names = c("col1", 
"col2", "col3", "col4"), row.names = c(NA, -10L), class = c("data.table", 
"data.frame"))

r
    col1 col2 col3   col4
 1:  5.1  3.5  1.4 setosa
 2:  5.1  3.5  1.4 setosa
 3:  4.7  3.2  1.3 setosa
 4:  4.6  3.1  1.5 setosa
 5:  5.0  3.6  1.4 setosa
 6:  5.1  3.5  3.4    eer
 7:  5.1  3.5  3.4    eer
 8:  4.7  3.2  1.3    eer
 9:  4.6  3.1  1.5    eer
10:  5.0  3.6  1.4    eer

I am performing a following operation on col3 for each unique value of col4

dat[ , r_new:= sum(col3, na.rm = T), .(col4)]    #syntax 1

So, above sytnax is creating a new column r_new with values got by adding those values of col3 where col4 is same. So, each unique value of col4 will have a unuique value in column r_new.

What I want to do now, is do the same as above but not include those rows where col1 and col2 are taking same values (something like below)

dat[col1 is different OR col2 is different , r_new:= sum(col3, na.rm = T), .(col4)]

What this will do, while performing sum function over rows, it will not include those rows where both col1 and col2 are taking same values.

How can I include this condition in the same syntax as 1?

Expected Output:

    col1 col2 col3   col4 r_new
 1:  5.1  3.5  1.4 setosa   5.6
 2:  5.1  3.5  1.4 setosa   5.6
 3:  4.7  3.2  1.3 setosa   5.6
 4:  4.6  3.1  1.5 setosa   5.6
 5:  5.0  3.6  1.4 setosa   5.6
 6:  5.1  3.5  3.4    eer   7.6
 7:  5.1  3.5  3.4    eer   7.6
 8:  4.7  3.2  1.3    eer   7.6
 9:  4.6  3.1  1.5    eer   7.6
10:  5.0  3.6  1.4    eer   7.6

As you can see in the expected output, for setosa row 1 and 2 took same value for col1 and col2 and for err rows 6 and 7 took same values for col1 and col2, so we did not add those rows (we just considered them once). Dont worry about col3 (it will take same value if col1 and col2 are taking same values.

EDIT: Second dput:

structure(list(col1 = c(5.1, 5.1, 4.7, 4.6, 5, 5.1, 5.1, 4.7, 
4.6, 5.1), col2 = c(3.5, 3.5, 3.2, 3.1, 3.6, 3.5, 3.5, 3.2, 3.1, 
3.4), col3 = c(1.4, 1.4, 1.3, 1.5, 1.4, 3.4, 3.4, 1.3, 1.5, 3.4
), col4 = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"), 
    count = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), r_new = c(5.6, 5.6, 
    5.6, 5.6, 5.6, 9.6, 9.6, 9.6, 9.6, 9.6)), .Names = c("col1", 
"col2", "col3", "col4", "count", "r_new"), row.names = c(NA, 
-10L), class = c("data.table", "data.frame"))

    col1 col2 col3 col4 count r_new
 1:  5.1  3.5  1.4    A     1   5.6
 2:  5.1  3.5  1.4    A     1   5.6
 3:  4.7  3.2  1.3    A     1   5.6
 4:  4.6  3.1  1.5    A     1   5.6
 5:  5.0  3.6  1.4    A     1   5.6
 6:  5.1  3.5  3.4    B     1   9.6
 7:  5.1  3.5  3.4    B     1   9.6
 8:  4.7  3.2  1.3    B     1   9.6
 9:  4.6  3.1  1.5    B     1   9.6
10:  5.1  3.4  3.4    B     1   9.6

EDIT 2: Third dput

   col1 col2 col3 col4 count r_new
 1:  5.1  3.5  1.4    A     1   5.6
 2:  5.1  3.5  1.4    A     1   5.6
 3:  4.7  3.2  1.3    A     1   5.6
 4:  4.6  3.1  1.5    A     1   5.6
 5:  5.0  3.6  1.4    A     1   5.6
 6:  5.1  3.5  3.4    B     1   6.2
 7:  5.1  3.5  3.4    B     1   6.2
 8:  4.7  3.2  1.3    B     1   6.2
 9:  4.6  3.1  1.5    B     1   6.2
10:  5.1  3.5  3.4    B     1   6.2


structure(list(col1 = c(5.1, 5.1, 4.7, 4.6, 5, 5.1, 5.1, 4.7, 
4.6, 5.1), col2 = c(3.5, 3.5, 3.2, 3.1, 3.6, 3.5, 3.5, 3.2, 3.1, 
3.5), col3 = c(1.4, 1.4, 1.3, 1.5, 1.4, 3.4, 3.4, 1.3, 1.5, 3.4
), col4 = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"), 
    count = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), r_new = c(5.6, 5.6, 
    5.6, 5.6, 5.6, 6.2, 6.2, 6.2, 6.2, 6.2)), .Names = c("col1", 
"col2", "col3", "col4", "count", "r_new"), row.names = c(NA, 
-10L), class = c("data.table", "data.frame"))
user3664020
  • 2,980
  • 6
  • 24
  • 45

2 Answers2

4

We can subset col3 inside j using ?data.table::duplicated.

dat[, r_new := sum(col3[!duplicated(.SD, by = c("col1","col2"))], na.rm = T), by = col4]  

> dat
#      col1 col2 col3 col4 count r_new
# 1:  5.1  3.5  1.4    A     1   5.6
# 2:  5.1  3.5  1.4    A     1   5.6
# 3:  4.7  3.2  1.3    A     1   5.6
# 4:  4.6  3.1  1.5    A     1   5.6
# 5:  5.0  3.6  1.4    A     1   5.6
# 6:  5.1  3.5  3.4    B     1   6.2
# 7:  5.1  3.5  3.4    B     1   6.2
# 8:  4.7  3.2  1.3    B     1   6.2
# 9:  4.6  3.1  1.5    B     1   6.2
#10:  5.1  3.5  3.4    B     1   6.2
mtoto
  • 23,919
  • 4
  • 58
  • 71
  • Thanks. I had another doubt but I have posted it as another question. http://stackoverflow.com/questions/36233318/count-unique-by-2-columns-in-a-data-table Can you please answer that also? – user3664020 Mar 26 '16 at 08:43
  • Yes. I was experimenting and I found an issue. Please look at the second dput and your solution is not working. Please let me know, why? – user3664020 Mar 26 '16 at 09:37
  • In the second example, for value `B` the answer should be 9.6 and not 6.2. Why is it happening? – user3664020 Mar 26 '16 at 09:39
  • Well, not the reverse is happening. Please apply your updated solution on the new dput (Added in the question). This time answer should be 6.2 – user3664020 Mar 26 '16 at 09:53
  • Will look at it later, on my phone now. – mtoto Mar 26 '16 at 10:20
  • The `duplicated()` just needed to be on `.SD` rather than the whole `df`, iiuc. I tested with the minor fix so made the edit - hope ok. – Matt Dowle Mar 26 '16 at 11:27
  • @user3664020 Don't forget to accept this one please. I deleted my other answer from before I realized there was a simple fix for this one. – Matt Dowle Mar 28 '16 at 18:00
  • 1
    @MattDowle curious what your solution was. Big fan of `data.table`, you guys are doing an awesome job! – mtoto Mar 28 '16 at 20:56
  • @mtoto Sure - undeleted. Much thanks for encouragement. – Matt Dowle Mar 28 '16 at 21:41
2

Accept mtoto's answer as that's easier to read, but here's an alternative.

DT[, r_new:=unique(.SD,by=c("col1","col2"))[,sum(col3, na.rm=TRUE)], by=col4]
Matt Dowle
  • 58,872
  • 22
  • 166
  • 224