-1

I am trying to summarize a data set in R. I am a beginner in R. The code below works, but has many steps. Is there an easier way to accomplish this? I'd like to accomplish the following:

1) group by Client_ID
2) count all ClaimNumbers (whether associated with DS or not)
3) count only claim numbers with DS 4) sum retail and WS only for DS
5) Also, I would like to just count a claim once. In the data, a single claim number will be repeated for each service year and service.

 # example

 ds <- read.table(text = "
 Client_ID ClaimNumber ServiceYr Service Retail WS
A00002         WC1      2012      DS    100 25
A00002         WC1      2013      DS    100 25
A00002         WC1      2014      BR     50 10
A00002         WC2      2014      BR     50 10
A00002         WC3      2014      BR     50 10
A00003         WC4      2014      BR     50 10
A00003         WC4      2015      BR     50 10
A00003         WC5      2015      BR     50 10
A00003         WC5      2016      BR     50 10
A00003         WC6      2016      DS    100 25",
               sep="",header=TRUE)

 # group by client ID and claim number to get one row per claim number
 total_claims <- sqldf("select Client_ID,ClaimNumber from ds group 
                  by Client_ID,ClaimNumber")

 # For DS claims only - group by client ID and claim number 
 # to get one row per claim number
 ds_claims <- sqldf("select Client_ID,ClaimNumber, sum(Retail) as Retail, 
   sum(WS) as WS from ds where Service='DS' group by Client_ID,ClaimNumber")

 # count the total number of claims by client
 total_counts <-      aggregate(total_claims[,2],b=list(total_claims$Client_ID),FUN=length)

 # fix column headers
 colnames(total_counts)[1:2] <- c("Client_ID","ClaimCount")

 # count the number of DS claims by client
 ds_claim_counts <- aggregate(ds_claims[,2],b=list(ds_claims$Client_ID),FUN=length)

 # fix column headers
 colnames(ds_claim_counts)[1:2] <- c("Client_ID","ClaimCount")

 # merge to get both total counts and ds counts on the same table
 total <- merge(total_counts,ds_claim_counts, by="Client_ID",all.x=TRUE)

 # merge to add ds retail and ws amounts to total table
 total <- merge(total,ds_claims[,c(1,3,4)], by="Client_ID",all.x=TRUE)

 # fix column headers
 colnames(total)[2:3] <- c("Total_CC","DS_CC")
user3670204
  • 33
  • 2
  • 7
  • Please take a look at these tips on how to produce a [minimum, complete, and verifiable example](http://stackoverflow.com/help/mcve), as well as this post on [creating a great example in R](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example). – lmo Mar 31 '17 at 17:30

1 Answers1

3

Here are some alternatives which give the same answer as the code in the question:

1) sqldf

library(sqldf)

sqldf("select Client_ID,
              count(distinct ClaimNumber) Total_CC,
              count(distinct case when 
                               Service = 'DS' then ClaimNumber 
                               else NULL 
                             end) DS_CC,
              sum(Retail * (Service = 'DS')) Retail,
              sum(WS * (Service = 'DS')) WS
       from ds
       group by Client_ID")

giving:

  Client_ID Total_CC DS_CC Retail WS
1    A00002        3     1    200 50
2    A00003        3     1    100 25

2) data.table

library(data.table)

DT <- as.data.table(ds)
DT[, list(Total_CC = length(unique(ClaimNumber)),
          DS_CC = length(unique(ClaimNumber[Service == "DS"])),
          Retail = sum(Retail * (Service == "DS")),
          WS = sum(WS * (Service == "DS"))), by = Client_ID]

giving:

   Client_ID Total_CC DS_CC Retail WS
1:    A00002        3     1    200 50
2:    A00003        3     1    100 25

3) dplyr

library(dplyr)

ds %>% 
   group_by(Client_ID) %>%
   summarize(Total_CC = length(unique(ClaimNumber)),
          DS_CC = length(unique(ClaimNumber[Service == "DS"])),
          Retail = sum(Retail * (Service == "DS")),
          WS = sum(WS * (Service == "DS"))) %>%
   ungroup

giving:

# A tibble: 2 × 5
  Client_ID Total_CC DS_CC Retail    WS
     <fctr>    <int> <int>  <int> <int>
1    A00002        3     1    200    50
2    A00003        3     1    100    25
G. Grothendieck
  • 254,981
  • 17
  • 203
  • 341