
This is what my data looks like:

> dput(data)
structure(list(Name = c("Mark", "Tere", "Marcus", "Heidi", "Georg", 
"Tieme", "Joan", "Tytus", "Mark", "Tere", "Marcus", "Heidi", 
"Georg", "Tieme", "Joan", "Tytus", "Mark", "Tere", "Marcus", 
"Heidi", "Georg", "Tieme", "Joan", "Tytus", "Mark", "Tere", "Marcus", 
"Heidi", "Georg", "Tieme", "Joan", "Tytus", "Mark", "Tere", "Marcus", 
"Heidi", "Georg", "Tieme", "Joan", "Tytus", "Mark", "Tere", "Marcus", 
"Heidi", "Georg", "Tieme", "Joan", "Tytus"), position = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("n", 
"w"), class = "factor"), time = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2), type = c(5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3), value = structure(c(0.314543267094949, 3.19693140536703, 
0.930679293700748, 0.344465436879545, 1.12925174828143, 0.295226529240608, 
0.534869402647018, 0.699742971466516, 0.899450918860775, 1.69467910557034, 
0.105376912280917, 2.49469271475885, 0.0343013256788254, 0.469440043556817, 
0.917351596015406, 1.46852763733807, 0.967872510622156, 0.354032471310347, 
1.52519534333589, 1.04999953913746, 1.90298544048312, 0.265492846723646, 
1.19947244036467, 2.54636084834549, 0.401142360642552, 0.190538682818794, 
1.55882554492893, 1.27338900133492, 0.298795571085066, 1.73941845252159, 
0.133721855003387, 0.115250693634152, 1.12742130434271, 1.50438847943189, 
0.0286978152580559, 0.740798383999788, 0.838944700504553, 0.726393882538543, 
1.79097043714466, 0.214124974329025, 1.46675111903789, 0.0172332774632657, 
2.89433199895506, 3.07798819227104, 0.826817313167167, 0.72394085004025, 
0.083957624156028, 1.07571752737732), .Dim = c(48L, 1L))), .Names = c("Name", 
"position", "time", "type", "value"), row.names = c(NA, 48L), class = "data.frame")

I would like to perform a t-test on `value` for specific groups of rows. The rows to compare are determined as follows:

1. Name has to be the same.

2. Type has to be the same.

3. Time = 0 is always the reference, so the t-test should be calculated between time = 1 and time = 0, time = 2 and time = 0, and so on. This is example data, so I can't say how many time points the real data has, but 0 is always the reference.

4. The output (the t-test results) can be stored in an additional column, in the row with the corresponding time.

5. I don't know the number of Names and Times in advance. The data contains thousands of rows.

6. I believe a loop is the only solution, and in any case it is my desired approach. Of course, an additional solution without a loop is more than welcome.
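To make the pairing concrete, here is a minimal sketch of which rows would enter one comparison (note: in this example data each Name/type/time combination has only a single observation, so `t.test()` itself would error here; the real data contains replicates):

```r
# Reference group: time == 0; comparison group: time == 1 (then 2, ...),
# within the same Name and type. Illustration of the subsetting only.
ref <- data[data$Name == "Mark" & data$type == 5 & data$time == 0, "value"]
cmp <- data[data$Name == "Mark" & data$type == 5 & data$time == 1, "value"]
# t.test(ref, cmp)  # needs >= 2 observations per group to run
```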

It's a bit of programming, so I am going to start a bounty to reward whoever puts some effort into this.

Shaxi Liver
  • This may be helpful: http://stackoverflow.com/questions/37474672/looping-through-t-tests-for-data-frame-subsets-in-r/37479506#37479506 – coffeinjunky Jun 08 '16 at 13:01

2 Answers


We can try with data.table. The idea is to join the non-reference rows (`time != 0`) to the reference rows (`time == 0`) by `Name` and `type`, compute the t-test p-value per group, then reshape and merge the result back onto the original data:

library(data.table)
setDT(data, keep.rownames= TRUE)
dt1 <- data[time==0]
dt2 <- data[time!=0]
res <- dt2[dt1, on = c("Name", "type")][, .(r1= rn, r2 = i.rn, time0 = i.time, 
       Othertime = time, pval = t.test(value, i.value)$p.value), .(Name, type)]
res1 <-  melt(res, measure = patterns("^r\\d+", "time"),
      value.name= c("rn", "time"))[, variable := NULL][]
res2 <- unique(res1, by = c("Name", "type", "rn"))
res3 <-  data[res2, on = "rn"][order(as.numeric(rn))][,
                    c('i.Name', 'i.type', 'i.time') := NULL][]
head(res3,3)
#  rn   Name position time type     value       pval
#1:  1   Mark        n    0    5 0.3145433 0.03514213
#2:  2   Tere        n    0    5 3.1969314 0.19052234
#3:  3 Marcus        n    0    5 0.9306793 0.89741695

Update

Using the OP's real data

file <- "C:/Users/akrun/Misc/Data_to_analyze.csv"
data1 <- fread(file)
dt1 <- data1[tp==0]
dt2 <- data1[tp!=0]

res <- dt2[dt1, on = c("protein", "fract"), allow.cartesian = TRUE
  ][, if(.N >1) {
      .(r1= V1, r2 = i.V1, time0 = i.tp,Othertime = tp, 
         pval = t.test(abundance, i.abundance)$p.value)
    } # else .(r1=V1, r2 = i.V1, time0 = i.tp, Othertime = tp, pval = NA_real_)
         , by = .(protein, fract)]


res1 <-  melt(res, measure = patterns("^r\\d+", "time"),
          value.name= c("V1", "time"))[, variable := NULL][]
res2 <- unique(res1, by = c("protein", "fract", "V1"))

res3 <-  data1[res2, on = "V1"][order(as.numeric(V1))][,
                   c('i.protein', 'i.fract') := NULL][]
 head(res3)
 #   V1   protein line tp fract   abundance           pval time
 #1:  1 PCP000002    n  0     1  46697305.3 0.515626208527    1
 #2:  2 PCP000007    n  0     1   8384565.8 0.000643157099    1
 #3:  3 PCP000008    n  0     1    570026.3 0.000006871077    1
 #4:  4 PCP000012    n  0     1   1018257.3 0.085610886030    1
 #5:  5 PCP000017    n  0     1 157877521.4 0.016528856828    1
 #6:  6 PCP000018    n  0     1 426215586.5 0.046130079694    1
akrun
  • Looks good for the example data when I apply it for my real data I got this error - `Error in vecseq(f__, len__, if (allow.cartesian || notjoin || !anyDuplicated(f__, : Join results in 32276 rows; more than 21579 = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and datatable-help for advice` – Shaxi Liver May 31 '16 at 14:03
  • @ShaxiLiver, read it. It explains the issue and provides a solution. – Arun May 31 '16 at 16:51
  • @ShaxiLiver As Arun mentioned, in the original dataset you may have duplicate keys. In that case, you may need a cartesian join – akrun May 31 '16 at 19:10
  • You are right, but I tried with the setting `allow.cartesian=TRUE` and it didn't help. I ended up with another error: `Error in `[.data.table`(dt2_test[dt_test, on = c("name", "type"), : Column 3 of j's result for the first group is NULL. We rely on the column types of the first result to decide the type expected for the remaining groups (and require consistency). NULL columns are acceptable for later groups (and those are replaced with NA of appropriate type and recycled) but not for the first. Please use a typed empty vector instead, such as integer() or numeric().` – Shaxi Liver Jun 01 '16 at 10:24
  • @ShaxiLiver Could you update the post with another example with duplicates and the expected output for testing. – akrun Jun 01 '16 at 10:28
  • I will do that. It is clearly the fault of my data. I could not create example data similar to my real data. – Shaxi Liver Jun 01 '16 at 10:29
  • I added a link to my real data. The desired output is exactly like you have done with the example I provided. – Shaxi Liver Jun 01 '16 at 10:38
  • @ShaxiLiver I am at workplace. I can't access the site. – akrun Jun 01 '16 at 10:48
  • Is there any other way I can share that data with you? Even if I `dput` hundreds of rows, it might not be enough to give a whole view of my data. Is there any way to dput the first 50 rows and some from the end? – Shaxi Liver Jun 01 '16 at 11:15
  • `dput(rbind(head(data,50), tail(data,50)))` – dww Jun 02 '16 at 14:56
  • @ShaxiLiver Updated using the real dataset. – akrun Jun 02 '16 at 18:12
  • It works right now! Thanks. Would it be possible to get a loop solution as well ? – Shaxi Liver Jun 06 '16 at 09:07
  • @ShaxiLiver I guess this would be more efficient than a loop. Any particular reason you prefer loops? – akrun Jun 06 '16 at 11:22
  • People "above" me strongly suggested that it should be performed in a loop... Cannot do anything about that. There isn't any logical reason behind it. – Shaxi Liver Jun 06 '16 at 11:27

You were asking explicitly for a solution involving a loop. While there are other, more appropriate ways of achieving this, let me show an example with a loop here. A more elegant solution can be found in the other answer.

First, note that for some combinations the number of observations may be too small to carry out a t-test. Let us therefore define a t-test function that will not break the loop if this happens:

library(dplyr)
failproof.t <- failwith("t-test failed", t.test, quiet = TRUE)
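(`failwith` has since been deprecated in dplyr; if it is unavailable in your version, an equivalent wrapper can be sketched with base R's `tryCatch`, needing no extra packages:)

```r
# Equivalent to failwith("t-test failed", t.test, quiet = TRUE):
# return the default string instead of raising if t.test() errors.
failproof.t <- function(...) {
  tryCatch(t.test(...), error = function(e) "t-test failed")
}
```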

Next, pre-allocate the result columns in the dataset.

df <- read.csv("Data_to_analyze.csv")
df$t_stat <- df$p_val <- NA

Now we can just loop through all combinations as follows:

for(protein in unique(df$protein)){
  for(fract in unique(df$fract)){
    for(tp in unique(df$tp)[-1]){ # [-1] drops the first unique value, assumed to be the reference tp == 0
      out <- failproof.t(
        df[df$protein == protein & df$fract == fract & df$tp == 0, "abundance"],
        df[df$protein == protein & df$fract == fract & df$tp == tp, "abundance"]
      )
      if(is.list(out)){ # a successful t.test returns a list (an htest object)
        df[df$protein == protein & df$fract == fract & df$tp == tp, "t_stat"] <- out$statistic
        df[df$protein == protein & df$fract == fract & df$tp == tp, "p_val"]  <- out$p.value
      }
    }
  }
}

This gives you the t-statistic and the p-value as new columns in the original dataset df.

However, note a few things:

  • The code is opaque and difficult to read.
  • The code is slow and inefficient.

There is, in fact, no reason to prefer this code over the data.table solution above. Even though your superiors "strongly suggested" using a loop, you could point out to them that there are better ways of achieving this.

coffeinjunky
  • I agree with you. The solution proposed by akrun is better. I managed to calculate the t-test on my own as well: I rearranged the table using the `subset` function and then performed `row.t.test`. I came back with that solution, but they were not happy and suggested creating a loop. I learned R using Stack Overflow, and as you know, most of the time people try to avoid loops, which is also why I am really bad at writing them. Thanks! I will test it as soon as possible. – Shaxi Liver Jun 09 '16 at 07:07