0

I have a data.frame that has 110M rows, a lot of them are identical. I need to aggregate the table, merge same rows, and add a column to resulting frame with frequency.

library(plyr)
library(data.table)

DataFrame1 = structure(list(company_nm = c("Acme Markets (NAI)", "Acme Markets (NAI)", 
                              "Acme Markets (NAI)", "Acme Markets (NAI)", "Acme Markets (NAI)", 
                              "Acme Markets (NAI)", "Acme Markets (NAI)", "Acme Markets (NAI)", 
                              "Acme Markets (NAI)", "Acme Markets (NAI)"), `tier 4` = c("Vitamins", 
                                                                                        "Internal Analgesics", "Nutrition Bars", "Carbonated Soft Drinks", 
                                                                                        "Bottled Water", "Bottled Water", "Bottled Water", "Bottled Water", 
                                                                                        "Bottled Water", "Popcorn"), `tier 3` = c("Vitamin Supplements", 
                                                                                                                                  "Analgesics", "Nutrition", "Beverage", "Beverage", "Beverage", 
                                                                                                                                  "Beverage", "Beverage", "Beverage", "Snacks"), `tier 2` = c("Health Care", 
                                                                                                                                                                                              "Health Care", "Health Care", "Dry Grocery", "Dry Grocery", "Dry Grocery", 
                                                                                                                                                                                              "Dry Grocery", "Dry Grocery", "Dry Grocery", "Dry Grocery"), 
               `tier 1` = c("Health & Personal Care", "Health & Personal Care", 
                            "Health & Personal Care", "Grocery", "Grocery", "Grocery", 
                            "Grocery", "Grocery", "Grocery", "Grocery"), Market = c("Randolph, NJ", 
                                                                                    "Yonkers, NY", "Newark, NJ", "Newark, NJ", "Lancaster, PA", 
                                                                                    "Wilmington, DE", "Philadelphia, PA", "Lancaster, PA", "Wilmington, DE", 
                                                                                    "Randolph, NJ"), RetMktAC = c("Acme Markets (NAI), Randolph, NJ", 
                                                                                                                  "Acme Markets (NAI), Yonkers, NY", "Acme Markets (NAI), Newark, NJ", 
                                                                                                                  "Acme Markets (NAI), Newark, NJ", "Acme Markets (NAI), Lancaster, PA", 
                                                                                                                  "Acme Markets (NAI), Wilmington, DE", "Acme Markets (NAI), Philadelphia, PA", 
                                                                                                                  "Acme Markets (NAI), Lancaster, PA", "Acme Markets (NAI), Wilmington, DE", 
                                                                                                                  "Acme Markets (NAI), Randolph, NJ")), .Names = c("company_nm", 
                                                                                                                                                                   "tier 4", "tier 3", "tier 2", "tier 1", "Market", "RetMktAC"), row.names = c(NA, 
                                                                                                                                                                                                                                                -10L), class = c("data.table", "data.frame"))


df = DataFrame1[rep(seq_len(nrow(DataFrame1)), each=10000),]

I looked into ddply and data.table but a) first is so slow compared to second, b) could not figure out how to efficiently (i.e. not listing all columns by which to aggregate).

Resulting DF would be smth like: "all columns from original" "freq" Thanks!

Alexey Ferapontov
  • 5,029
  • 4
  • 22
  • 39

0 Answers0