-2

I have a data frame in R like this:

Company | Date | Revenues
 c1     | d1   | r1
 c1     | d2   | r2
 c1     | d3   | r3
 c2     | d1   | r4
 c2     | d2   | r5
 c2     | d3   | r6
 c3     | d2   | r7
 c3     | d3   | r8

And I want to change the format to this:

 Company|Date|Revs|YOY|Company|Date|Revs|YOY|Company|Date|Revenues|YOY
   c1   |d1  |r1  |y1 |  c2   |d1  |r4  |y4 |  c3   |NA  |NA      |NA
   c1   |d2  |r2  |y2 |  c2   |d2  |r5  |y5 |  c3   |d2  |r7      |y7
   c1   |d3  |r3  |y3 |  c2   |d3  |r6  |y6 |  c3   |d3  |r8      |y8

That is, I want to change the data frame so that each set of company data is grouped and set next to each other, and the year-over-year differences in revenues are calculated and inserted into a new column after the Revenues column.

The observations are each quarter/month, but some data is missing, so I'd like to insert NA into those fields that are missing, and line up the dates as indicated in the second table.

I'd include some code, but after much searching, I still don't know how to even approach this. I've looked at reshape, but that doesn't seem to do what I need to. I was thinking perhaps a loop is the right way to approach this?

Here is a set of sample input data from dput, that includes some missing values:

structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 7L, 54L, 55L, 
56L, 57L, 58L, 59L, 60L, 61L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 
89L, 596L, 597L, 598L, 599L, 600L, 601L), Company = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Company 1", 
"Company 2", "Company 3", "Company 4"), class = "factor"), Date = structure(c(8L, 
7L, 6L, 5L, 4L, 3L, 2L, 1L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 8L, 
7L, 6L, 5L, 4L, 3L, 2L, 1L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", 
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", 
"2015-Q4"), class = "factor"), Revenue = structure(c(16L, 11L, 
12L, 7L, 10L, 14L, 9L, 8L, 15L, 2L, 4L, 1L, 30L, 6L, 3L, 5L, 
13L, 26L, 27L, 21L, 23L, 24L, 25L, 22L, 29L, 19L, 20L, 17L, 28L, 
18L), .Label = c("$1,009,306", "$1,129,899", "$1,173,698", "$1,290,262", 
"$1,329,210", "$1,338,107", "$1,342,401", "$1,455,988", "$1,632,968", 
"$1,697,235", "$1,716,398", "$1,756,648", "$17,215,900", "$2,405,874", 
"$216,536", "$422,063", "$47,665,398", "$52,772,667", "$53,941,124", 
"$54,059,612", "$54,548,057", "$54,946,768", "$55,735,568", "$58,099,615", 
"$59,753,619", "$59,955,413", "$60,655,988", "$66,236,339", "$79,135,033", 
"$962,366"), class = "factor")), .Names = c("X", "Company", "Date", 
"Revenue"), class = "data.frame", row.names = c(NA, -30L))

and here is what I'd like the data to look like after transforming:

structure(list(Company = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = "Company 1", class = "factor"), Date = structure(c(8L, 
7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", "2014-Q2", 
"2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", "2015-Q4"
), class = "factor"), Revenue = structure(c(8L, 5L, 6L, 1L, 4L, 
7L, 3L, 2L), .Label = c("$134", "$146", "$163", "$170", "$172", 
"$176", "$241", "$42"), class = "factor"), YOY = structure(c(2L, 
1L, 4L, 3L, NA, NA, NA, NA), .Label = c("-29%", "-75%", "-8%", 
"8%"), class = "factor"), Company.1 = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), .Label = "Company 2", class = "factor"), 
Date.1 = structure(c(8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", 
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", 
"2015-Q4"), class = "factor"), Revenue.1 = structure(c(7L, 
2L, 4L, 1L, 8L, 6L, 3L, 5L), .Label = c("$101", "$113", "$117", 
"$129", "$133", "$134", "$22", "$96"), class = "factor"), 
YOY.1 = structure(c(3L, 1L, 4L, 2L, NA, NA, NA, NA), .Label = c("-16%", 
"-24%", "-77%", "10%"), class = "factor"), Company.2 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Company 3", class = "factor"), 
Date.2 = structure(c(8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", 
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", 
"2015-Q4"), class = "factor"), Revenue.2 = structure(c(1L, 
7L, 8L, 2L, 4L, 5L, 6L, 3L), .Label = c("$1,722", "$5,455", 
"$5,495", "$5,574", "$5,810", "$5,975", "$5,996", "$6,066"
), class = "factor"), YOY.2 = structure(c(2L, 4L, 3L, 1L, 
NA, NA, NA, NA), .Label = c("-1%", "-69%", "2%", "3%"), class = "factor"), 
Company.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Company 4", class = "factor"), 
Date.3 = structure(c(NA, NA, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", 
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2"), class = "factor"), 
Revenue.3 = structure(c(NA, NA, 6L, 3L, 4L, 1L, 5L, 2L), .Label = c("$4,767", 
"$5,277", "$5,394", "$5,406", "$6,624", "$7,914"), class = "factor"), 
YOY.3 = structure(c(NA, NA, 1L, 2L, NA, NA, NA, NA), .Label = c("19%", 
"2%"), class = "factor"), X = c(NA, NA, NA, NA, NA, NA, NA, 
NA)), .Names = c("Company", "Date", "Revenue", "YOY", "Company.1", 
"Date.1", "Revenue.1", "YOY.1", "Company.2", "Date.2", "Revenue.2", 
"YOY.2", "Company.3", "Date.3", "Revenue.3", "YOY.3", "X"), class =     "data.frame", row.names = c(NA, 
-8L))

Thanks!

Jaap
  • 81,064
  • 34
  • 182
  • 193
  • You need to convert the 'Revenue' column from `character` to `numeric` for calculating the `YOY` and then use `dcast` – akrun Dec 12 '15 at 14:44
  • I think you can achieve what you want with [Had's reshape](http://had.co.nz/reshape/). `melt` it and then `cast`. Also, I would do that only for visualization/output purposes, it doesn't look like a nice structure to do calculations. – iled Dec 12 '15 at 14:49
  • @akrun, thanks-the revenue column is in a numeric format, that is just an artifact of my sanitizing it in Excel for the purposes of a sample data set. – M McClaren Dec 12 '15 at 15:29
  • @iled, thanks for your suggestion. I have looked at reshape a few times, and I still don't get how to use it to reformat the list I have. Could you perhaps suggest an approach/psuedo code? – M McClaren Dec 12 '15 at 15:32
  • What does the -1 signify on this post, and how do you increase likelihood of an answer to a question posted on SO? Thanks. – M McClaren Dec 12 '15 at 15:47
  • Possible duplicate of [Reshape data from long to wide format R](http://stackoverflow.com/questions/5890584/reshape-data-from-long-to-wide-format-r) – jogo Jan 09 '16 at 10:12

1 Answers1

0

Here is a dirty solution:

# How to use:
# output <- proccessData(calculateYoY(realData))

# Function for calculating YoY
calculateYoY <- function(df){
      df$Revenue <- as.double(gsub(",", "", 
                               substring(
                                 as.character(
                                   df$Revenue), 2)))

  df$YoY <- rep(NA, times=length(df[,1]))
  companies <- unique(df$Company)
  for (comp in companies){
    auxdf <- df[which(df$Company==comp),]
    periods <- length(auxdf[,1]) - 4
    for (per in (1:periods)){
      auxdf[per,5] <- (auxdf[per,4] - auxdf[per + 4,4])/auxdf[per + 4,4]
    }
    df[which(df$Company==comp),] <- auxdf
  }
  df
}

# Function for generating the new data.frame
proccessData <- function(df){
  companies <- unique(df$Company)
  dates <- unique(df$Date)

  resultdf <- data.frame(row.names=dates)
  dummyVector = rep(NA, times=length(dates))
  comp <- "Company 4"
  for (comp in companies){
    auxdf <- df[which(df$Company==comp),2:5]
    dummydf <- data.frame(Company=rep(as.character(comp), 
                                      times=length(dates)), 
                          Date=dates, Revenue=dummyVector, YoY=dummyVector)
    for (dat in as.character(auxdf$Date)){
      dummydf[which(dummydf$Date==dat),] <- auxdf[which(auxdf$Date==dat),]  
    } 
    resultdf <- cbind(resultdf, dummydf)
  }
  resultdf
}

It's in "imperative" format, but works and it's a good start point in order to make something more "functional". Perhaps using reshape + plyr the script will be shorter.

(The script assumes that in the subset of each company, the data is sorted by date. And the revenue is a text field.)

4444
  • 3,541
  • 10
  • 32
  • 43
egbaquela
  • 16
  • 1