-1

I have a data like this

df <- structure(list(string = structure(c(6L, 12L, 8L, 7L, 2L, 1L, 
6L, 12L, 9L, 5L, 11L, 6L, 10L, 3L, 4L, 4L), .Label = c("CGSKDNIKHVPGGGSVQIVYKPVDLSK", 
"ESPLQTPTEDGSEEPGSETSDAK", "HVPGGGSVQIVYKPVDLSKVTSK", "KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK", 
"QEFEVMEDHAGTYGLGDR", "SKDGTGSDDKK", "SPSSAKSRLQTAPVPMPDLKNVK", 
"SRLQTAPVPMPDLK", "SRLQTAPVPMPDLKNVKSK", "SRLQTAPVPMPDLKNVKSKIGSTENLK", 
"STPTAEDVTAPLVDEGAPGK", "VQIINKKLDLSNVQSK"), class = "factor"), 
    key = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
    2L, 3L, 3L, 3L, 3L, 3L), .Label = c("Mys: G52: ru1", "Mys: G52: ru2", 
    "Mys: G52: ru3"), class = "factor"), val = structure(c(3L, 
    15L, 2L, 11L, 9L, 5L, 13L, 6L, 1L, 7L, 8L, 16L, 12L, 4L, 
    10L, 14L), .Label = c("1442983324", "1451319531", "1512864.443", 
    "1612410048", "16349475.63", "1784901841", "30553282.01", 
    "317403612.9", "3612004.547", "3686081.063", "39135868.44", 
    "43701608", "64223793.8", "64959501.42", "775987137.8", "9767666215"
    ), class = "factor")), .Names = c("string", "key", "val"), class = "data.frame", row.names = c(NA, 
-16L))

I am trying to keep only those that are repeated 2 or more based on the second column.

For example in the above data we only can keep the following

SKDGTGSDDKK is in 3 of them (ru1, ru2 and ru3)
VQIINKKLDLSNVQSK    is in 2 of them (ru1, ru2) 

the rest of them happened to be only once based on the key

so the output will be

string                key               val
SKDGTGSDDKK         Mys: G52: ru1   1512864.443
SKDGTGSDDKK         Mys: G52: ru2   64223793.8
SKDGTGSDDKK         Mys: G52: ru3   9767666215
VQIINKKLDLSNVQSK    Mys: G52: ru1   775987137.8
VQIINKKLDLSNVQSK    Mys: G52: ru2   1784901841
nik
  • 2,500
  • 5
  • 21
  • 48
  • When i run your code, i also get two copies of KDQGGYTMHQDQEGDTDAGLKESPLQTPTEDGSEEPGSETSDAK – G5W Nov 28 '17 at 22:11
  • @G5W they are both for Mys: G52: ru3 so it does not count. it should be at least for two `ru` – nik Nov 28 '17 at 22:15

4 Answers4

1

You can do this from a table based on string and key

TAB = table(df$string, df$key) > 0
Repeated = rownames(TAB)[rowSums(TAB) > 1]
df[df$string %in% Repeated, ]
             string           key         val
1       SKDGTGSDDKK Mys: G52: ru1 1512864.443
2  VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8
7       SKDGTGSDDKK Mys: G52: ru2  64223793.8
8  VQIINKKLDLSNVQSK Mys: G52: ru2  1784901841
12      SKDGTGSDDKK Mys: G52: ru3  9767666215
G5W
  • 36,531
  • 10
  • 47
  • 80
1

I'll start with a base R solution, using a length+unique combination to ensure >= 2 keys per string:

# Split by string
lst <- split(df, df$string);

# Select list entries with >= 2 unique keys
sel <- sapply(lst, function(x) length(unique(x$key))) >= 2;

# Filter entries based on sel and convert to dataframe
df.sel <- do.call(rbind.data.frame, lst[sel]);
df.sel;
#                             string           key         val
#SKDGTGSDDKK.1           SKDGTGSDDKK Mys: G52: ru1 1512864.443
#SKDGTGSDDKK.7           SKDGTGSDDKK Mys: G52: ru2  64223793.8
#SKDGTGSDDKK.12          SKDGTGSDDKK Mys: G52: ru3  9767666215
#VQIINKKLDLSNVQSK.2 VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8
#VQIINKKLDLSNVQSK.8 VQIINKKLDLSNVQSK Mys: G52: ru2  1784901841

# Order by string then val
# Note: val is a factor so convert to numeric with as.numeric(as.character(...)
df.sel[order(df.sel$string, as.numeric(as.character(df.sel$val))), ];
#    string           key         val
#SKDGTGSDDKK.1           SKDGTGSDDKK Mys: G52: ru1 1512864.443
#SKDGTGSDDKK.7           SKDGTGSDDKK Mys: G52: ru2  64223793.8
#SKDGTGSDDKK.12          SKDGTGSDDKK Mys: G52: ru3  9767666215
#VQIINKKLDLSNVQSK.2 VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8
#VQIINKKLDLSNVQSK.8 VQIINKKLDLSNVQSK Mys: G52: ru2  1784901841

Update for df2

# Split by string
lst <- split(df2, df2$string);

# Select list entries with >= 2 unique keys
sel <- sapply(lst, function(x) length(unique(x$key))) >= 2;

# Filter entries based on sel, convert to dataframe, 
# and order by string then numeric val
df2.sel <- do.call(rbind.data.frame, lst[sel]);
options(digits = 9);
df2.sel$val <- as.numeric(as.character(df2.sel$val));
df2.sel <- df2.sel[order(df2.sel$string, df2.sel$val), ];
df2.sel;
#SKDGTGSDDKK.1                                  SKDGTGSDDKK Mys: G52: ru1
#SKDGTGSDDKK.2                                  SKDGTGSDDKK Mys: G52: ru2
#SKDGTGSDDKK.3                                  SKDGTGSDDKK Mys: G52: ru3
#SRLQTAPVPMPDLKNVKSK.12                 SRLQTAPVPMPDLKNVKSK Mys: G52: ru3
#SRLQTAPVPMPDLKNVKSK.13                 SRLQTAPVPMPDLKNVKSK Mys: G52: ru1
#SRLQTAPVPMPDLKNVKSK.11                 SRLQTAPVPMPDLKNVKSK Mys: G52: ru2
#SRLQTAPVPMPDLKNVKSKIGSTENLK.14 SRLQTAPVPMPDLKNVKSKIGSTENLK Mys: G52: ru3
#SRLQTAPVPMPDLKNVKSKIGSTENLK.15 SRLQTAPVPMPDLKNVKSKIGSTENLK Mys: G52: ru2
#VQIINKKLDLSNVQSK.4                        VQIINKKLDLSNVQSK Mys: G52: ru1
#VQIINKKLDLSNVQSK.5                        VQIINKKLDLSNVQSK Mys: G52: ru2
#                                         val
#SKDGTGSDDKK.1                     1512864.44
#SKDGTGSDDKK.2                    64223793.80
#SKDGTGSDDKK.3                  9767666215.00
#SRLQTAPVPMPDLKNVKSK.12           30553282.01
#SRLQTAPVPMPDLKNVKSK.13          317403612.90
#SRLQTAPVPMPDLKNVKSK.11         1442983324.00
#SRLQTAPVPMPDLKNVKSKIGSTENLK.14   43701608.00
#SRLQTAPVPMPDLKNVKSKIGSTENLK.15 1612410048.00
#VQIINKKLDLSNVQSK.4              775987137.80
#VQIINKKLDLSNVQSK.5             1784901841.00

They key is that as.numeric(as.character(...) will only work properly if we use options(digits=9) (see e.g. here).

Maurits Evers
  • 49,617
  • 4
  • 47
  • 68
  • I like your command, can you please somehow sort them by size (for example when we have it is great to have the first three together (if they are smaller in number of letters, then the others that are repeated but a bit bigger , etc etc ) . do you know what I say? I know if I use `order` it will just sort them but I want to keep them together (those that are similar) – nik Nov 28 '17 at 22:26
  • @nik I've edited my solution. You can use `order` with multiple arguments. In this case I order first by `string`, then by `val`. – Maurits Evers Nov 28 '17 at 22:31
  • I have tried the `order` unfortunately it does not. – nik Nov 28 '17 at 22:34
  • @nik Ah yes, I see, it's because `val` is a factor. I have edited my solution. Should work now. – Maurits Evers Nov 28 '17 at 22:36
  • @nik Argh, you're right. It's the factor to numeric conversion. Please see my update for `df2`. I've made the steps very explicit, you can roll them into a more succinct version. – Maurits Evers Nov 28 '17 at 22:55
  • I used the exact code you gave with `option=9` but it does not give me. Please look at above (I showed an output) maybe we are looking at two different things ? – nik Nov 28 '17 at 23:01
  • @nik I don't understand you're sorting. For example, on the `string` level you have `SK...` < `VQ...` > `SRL...`. So that's no lexicographical ordering. Then on top of that for `SRLQTAPVPMPDLKNVKSK` you have on the `val` level `1442983324` > `30553282.01` < `317403612.9`, which is not numerically sorted. I'm very confused how you want to order entries. In my `df2` example I first lexicographically order entries by `string`, then numerically by `val`. Is this not what you're after? – Maurits Evers Nov 28 '17 at 23:32
  • value is not important to be higher or lower. two things are important (1 the similar strings stays together + they are sorted from small to big) – nik Nov 29 '17 at 14:35
  • I accepted your answer and liked it. I asked that as another question . Thanks again – nik Nov 29 '17 at 14:44
  • @nik But they **are** sorted from small to large (numbers)! Do you mean the **length of the string** (i.e. the number of characters)? In which case you really should've made that clear in your post, because nowhere did you state that this is the quantity that you want to sort by. Either way, in that case all you need to do is `order(df2.sel$string, nchar(df2.sel$string))`. Anyway, glad it (partially) worked. – Maurits Evers Nov 29 '17 at 20:22
1

If you assume that there are not duplicate key values by string (not true in your case) the following will work:

pre_repeated <-
 with(df,aggregate(x=list(key=key),by=list(string=string),FUN=length))
repeated <- pre_repeated[which(pre_repeated$key>1),]
df[which(df$string %in% repeated$string),]

I can give you a more useful answer if you can tell me what you would like to do in cases where there are duplicate keys for a string. If you want to disregard the duplicates you could replace the FUN argument with function(x) length(unique(x)).

Thanks!

Nate
  • 364
  • 1
  • 5
1

Filter a data frame based on the number of unique values in column b for each value in column a:

df %>% group_by(string) %>% filter(length(unique(key)) > 1) %>% arrange(string, val)

Output:

# A tibble: 5 x 3
# Groups:   string [2]
            string           key         val
            <fctr>        <fctr>      <fctr>
1      SKDGTGSDDKK Mys: G52: ru1 1512864.443
2      SKDGTGSDDKK Mys: G52: ru2  64223793.8
3      SKDGTGSDDKK Mys: G52: ru3  9767666215
4 VQIINKKLDLSNVQSK Mys: G52: ru2  1784901841
5 VQIINKKLDLSNVQSK Mys: G52: ru1 775987137.8

Get just the unique string values from the above:

df %>% group_by(string) %>%
  filter(length(unique(key)) > 1) %>%
  select(string) %>%
  distinct()

Output:

# A tibble: 2 x 1
# Groups:   string [2]
            string
            <fctr>
1      SKDGTGSDDKK
2 VQIINKKLDLSNVQSK

Summarize to get the reason for keeping each:

df %>% group_by(string) %>%
  filter(length(unique(key)) > 1) %>%
  summarize(reason = paste0(
    "is in ", n(), " of them (", 
    paste(sub("Mys: G52: ", "", key), collapse = ", "), ")"
  ))

Output:

# A tibble: 2 x 2
            string                          reason
            <fctr>                           <chr>
1      SKDGTGSDDKK is in 3 of them (ru1, ru2, ru3)
2 VQIINKKLDLSNVQSK      is in 2 of them (ru1, ru2)
W. Murphy
  • 1,121
  • 8
  • 15
  • although it is not even close, I liked your answer for your time. thank you – nik Nov 28 '17 at 23:05
  • @nik will you look at the current version? I misunderstood your requested output at first, but immediately updated it and it now produces the exact output that you request, with sorting – W. Murphy Nov 28 '17 at 23:28