0

I would like to separate my data based on one column (Column B in below data). In other words, I like to separate rows with 'C' letter that they has same number before letter 'c' with others rows (letters,a and b imply to parents ,and c implies to children, I like to remove children from same family).

Input:

df <- structure(list(
 A = c("L", "L","L","L","L","L","L", "L", "L"), 
 B = c("l-7b", "l-7a", "l-7c", NA, "l-15a", "l-15c", "l-16c", "l-20b", "l-20c"), 
 C = c(6002, 6003, 6006,6007,6010,6011,6015, 6087, 6098)), 
 .Names = c("A", "B", "C"), 
 row.names = c(NA, 9L), 
 class = "data.frame")}

Output:

    A      B         C

1   L    l-7b      6002                
2   L    l-7a      6003                            
4   L       NA     6007                 
5   L    l-15a     6010
7   L    l-16c     6015
8   L    l-20b     6087      
PereG
  • 1,796
  • 2
  • 22
  • 23
star
  • 743
  • 1
  • 7
  • 19

4 Answers4

1

An idea using dplyr,

library(dplyr)

#convert to character
df[1:2] <- lapply(df[1:2], as.character)

df %>% 
 group_by(new = substr(B, 1, nchar(B)-1)) %>% 
 mutate(freq = n()) %>% 
 subset(!grepl('c', B)|freq == 1) %>% 
 ungroup() %>% 
 select(A, B, C)

# A tibble: 6 × 3
#      A     B     C
#  <chr> <chr> <int>
#1     L  l-7b  6002
#2     L  l-7a  6003
#3     L  <NA>  6007
#4     L l-15a  6010
#5     L l-16c  6015
#6     L l-20b  6087
Sotos
  • 51,121
  • 6
  • 32
  • 66
1

Alternate dplyr + tidyr approach:

read.table(text="A      B         C
L    l-7b      6002
L    l-7a      6003
L    l-7c      6006
L       NA     6007
L    l-15a     6010
L    l-15c     6011
L    l-16c     6015
L    l-20b     6087
L    l-20c     6098", header=TRUE, stringsAsFactors=FALSE) -> df

library(dplyr)
library(tidyr)

filter_kids <- function(x) {
  if (nrow(x) > 1) x <- filter(x, code2 != "c")
}

extract(df, B, c("code1", "code2"), "([[:alpha:]]-[[:digit:]]+)([[:alpha:]])", remove=FALSE) %>%
  group_by(code1) %>%
  do(filter_kids(.)) %>%
  select(A, B, C)
  data.frame()
##   code1 A     B    C
## 1  l-15 L l-15a 6010
## 2  l-16 L l-16c 6015
## 3  l-20 L l-20b 6087
## 4   l-7 L  l-7b 6002
## 5   l-7 L  l-7a 6003
## 6  <NA> L  <NA> 6007

added the data.frame() at the end to dumb down the data structure as requested.

hrbrmstr
  • 77,368
  • 11
  • 139
  • 205
1

Here's a possible solution using base R :

### recreate your example data frame
DF <- read.csv(text=
"A,B,C
1,L,l-7b,6002                
2,L,l-7a,6003                
3,L,l-7c,6006                
4,L,NA,6007                 
5,L,l-15a,6010
6,L,l-15c,6011
7,L,l-16c,6015")

### create a new DF copy with extra columns useful for our subset
DFExt <- DF
# add a column containing the code before the leading chars "a","b","c" extracted from B 
DFExt$BCode <- gsub("^(.+)[abc]$","\\1",DF$B) 
# add a logical column indicating if B ends with "c"
DFExt$IsChild <- grepl("c$",DF$B) 
# add a column indicating for each BCode how many parents we have
DFExt$NumParents <- 
ave(1:nrow(DFExt),
    DFExt$BCode,
    FUN=function(idxs){
      sum(!DFExt$IsChild[idxs])
    }) 

### let's subset removing the rows where IsChild=TRUE and NumParents > 0
DFSubset <- DF[!(DFExt$IsChild & DFExt$NumParents > 0),]

> DFSubset
  A     B    C
1 L  l-7b 6002
2 L  l-7a 6003
4 L  <NA> 6007
5 L l-15a 6010
7 L l-16c 6015
digEmAll
  • 56,430
  • 9
  • 115
  • 140
1

Here is another way using basic R:

A = 1:7
B = c("1-1a","1-1c","1-2c","1-3b","1-2b",NA,"1-4c")
df = data.frame(A,B)

> df
  A    B
1 1 1-1a
2 2 1-1c
3 3 1-2c
4 4 1-3b
5 5 1-2b
6 6 <NA>
7 7 1-4c

Do:

rowToRemove = NULL

# Find rows that have a "c"
rw = which(grepl("c",df$B))

for(k in rw){
    # Get all characters that are not "c"
    m = regexpr("[^c]*",df$B[k])

    # Find how many of these are in the B column
    z = regmatches(df$B[k],m)
    t = sum(grepl(z,df$B))

    # If more than 1, tag for removal
    if(t>1) rowToRemove = c(rowToRemove,k)
}

df = df[-rowToRemove,]

Result:

> df
  A    B
1 1 1-1a
4 4 1-3b
5 5 1-2b
6 6 <NA>
7 7 1-4c

I'm sure this for-loop could be vectorized using an *apply function, but I'll leave that exercise to you,

R. Schifini
  • 9,085
  • 2
  • 26
  • 32