0

In my data frame there are rows with same ID but different values for test year and age. I'd like to collapse the duplicate rows and create new columns for the different values.

I'm new in R and have been struggling with it for a while.

This is the data frame:

>df
    id     project     testyr1      testyr2    age1     age2
1   16S       AS       2008         NA         29       NA 
2   32S       AS       2004         NA         30       NA 
3   37S       AS       NA           2011       NA       36 
4   50S       AS       2004         NA         23       NA 
5   50S       AS       1998         NA         16       NA 
6   55S       AS       2007         NA         28       NA 

testyr1 should have the earliest year and testyr2 the latest year. age1 should be the younger age and age2 the older age.

The output should be:

      id   project    testyr1   testyr2   age1   age2   
1     16S       AS    2008      NA        29     NA  
2     32S       AS    2004      NA        30     NA  
3     37S       AS    NA        2011      NA     36  
4     50S       AS    1998      2004      16     23  
6     55S       AS    2007      NA        28     NA  

I tried to write a loop but don't know how to end it:

df.undup <- c()
df.undup <- c()    
for (i in 1:nrow(df)){   
  if i == i+1    
    df$testyr1 != NA {   

    testyr2 = max(testyr1)   
    testyr1 = min(testyr1)   
    nage2 = max(nage1)   
    nage1 = min(nage1)   
  }   
 else{   
    testyr2 = max(testyr2)   
    testyr1 = min(testyr2)   
    nage2 = max(nage2)   
    nage1 = min(nage2)   
  }   
}   

Any help would be greatly appreciated.

Kev
  • 118,037
  • 53
  • 300
  • 385

2 Answers2

3
library(plyr)

data <- read.csv(textConnection("id,project,testyr1,testyr2,age1,age2
16S,AS,2008,NA,29,NA
32S,AS,2004,NA,30,NA
37S,AS,NA,2011,NA,36
50S,AS,2004,NA,23,NA
50S,AS,1998,NA,16,NA
55S,AS,2007,NA,28,NA"))


new_data <- ddply(data, .(id), function(x) {
  return(data.frame(id = unique(x$id), project = unique(x$project), 
    testyr1 = min(x$testyr1), 
    testyr2 = max(x$testyr2), age1= min(x$age1), age2 = max(x$age2)))
    })

> new_data

    id project testyr1 testyr2 age1 age2
1 16S      AS    2008      NA   29   NA
2 32S      AS    2004      NA   30   NA
3 37S      AS      NA    2011   NA   36
4 50S      AS    2004      NA   23   NA
5 50S      AS    1998      NA   16   NA
6 55S      AS    2007      NA   28   NA

# But your result example suggests you want the lowest 
# of testyr to be in testyr1 and the highest of the combined
# testyrs to be in testyr2. Same logic for ages.
# If so, the one below should work:

new_data <- ddply(data, .(id), function(x) {
    if(dim(x)[1]>1) {
    years <- c(x$testyr1, x$testyr2)
    ages <-  c(x$age1, x$age2)
    return(data.frame(id = unique(x$id), project = unique(x$project), 
        testyr1 = min(years, na.rm=T), testyr2 = max(years , na.rm=T), 
        age1= min(ages, na.rm=T), age2 = max(ages, na.rm=T)))   
    } else {
    return(data.frame(id = unique(x$id), project = unique(x$project), 
        testyr1 = x$testyr1, testyr2 = x$testyr2, 
        age1= x$age1, age2 = x$age2)) 
    }       
    })

> new_data
   id project testyr1 testyr2 age1 age2
1 16S      AS    2008      NA   29   NA
2 32S      AS    2004      NA   30   NA
3 37S      AS      NA    2011   NA   36
4 50S      AS    1998    2004   16   23
5 55S      AS    2007      NA   28   NA
Maiasaura
  • 32,226
  • 27
  • 104
  • 108
  • Hi Maiassaura, thanks for the code. However I got an error message: Error in data.frame(id = unique(CADD.age.bd.u$id), project = unique(CADD.age.bd.u$project), : arguments imply differing number of rows: 1, 0 – user1566478 Jul 31 '12 at 22:27
  • This means that there is more than one project for each ID. If so then change the `ddply` call to `ddply(data, .(id, project)` so it's splitting by that combination. – Maiasaura Jul 31 '12 at 23:29
0

I really doubt this is the most effective way to do this, but my brain is not functioning at the moment.

temp = names(which(table(df$id) > 1))
temp1 = vector("list")
for (i in 1:length(temp)) {
  temp1[[i]] = df[df$id == temp[i], ]
  temp1[[i]] = data.frame(temp1[[i]][1, 1:2], 
                     testyr1 = min(temp1[[i]]$testyr1), 
                     testyr2 = max(temp1[[i]]$testyr1), 
                     age1 = min(temp1[[i]]$age1), 
                     age2 = max(temp1[[i]]$age1))
}

rbind(df[-c(which(df$id %in% temp)), ], do.call(rbind, temp1))
#    id project testyr1 testyr2 age1 age2
# 1 16S      AS    2008      NA   29   NA
# 2 32S      AS    2004      NA   30   NA
# 3 37S      AS      NA    2011   NA   36
# 6 55S      AS    2007      NA   28   NA
# 4 50S      AS    1998    2004   16   23

### rm(i, temp, temp1) ### Cleanup the workspace
A5C1D2H2I1M1N2O1R2T1
  • 190,393
  • 28
  • 405
  • 485
  • Thanks for the script mrdwab! However I got an error message: Error in rbind(deparse.level, ...) : numbers of columns of arguments do not match – user1566478 Jul 31 '12 at 22:58
  • I removed some extra columns and it worked. Thanks a lot for your help! – user1566478 Jul 31 '12 at 23:50
  • @user1566478, if this or the other suggestion worked for you, do be sure to up-vote them or mark as accepted to help keep the "unanswered" queue in the R tag tidy. Thanks! – A5C1D2H2I1M1N2O1R2T1 Dec 06 '12 at 10:15