I am trying to subset my data based on the year and country and calculate the regression coefficient for each one.
a subset of my data structure is :
pdata <- structure(list(movie_odid = c(10100L, 10100L, 520100L, 520100L,
650100L, 650100L, 10100L, 10100L, 520100L, 780100L, 780100L,
950100L, 950100L, 540100L, 540100L, 780100L, 780100L, 880100L,
880100L, 450100L, 450100L, 540100L, 540100L, 640100L, 640100L,
800100L, 800100L, 420100L, 420100L, 450100L, 450100L, 490100L,
490100L, 640100L, 640100L, 430100L, 430100L, 490100L, 490100L,
590100L, 590100L, 1620100L, 1620100L, 390100L, 390100L, 8810100L,
8810100L, 9480100L, 9480100L, 570100L, 570100L, 590100L, 590100L
), chart_date = structure(c(5L, 6L, 3L, 4L, 1L, 2L, 7L, 8L, 7L,
11L, 12L, 9L, 10L, 17L, 18L, 13L, 14L, 15L, 16L, 23L, 24L, 19L,
20L, 25L, 26L, 21L, 22L, 29L, 30L, 27L, 28L, 31L, 32L, 27L, 28L,
37L, 38L, 33L, 34L, 35L, 36L, 39L, 40L, 41L, 42L, 47L, 48L, 45L,
46L, 43L, 44L, 39L, 40L), .Label = c("1997-05-23", "1997-05-30",
"1997-07-04", "1997-07-11", "1997-12-19", "1997-12-26", "1998-01-02",
"1998-01-09", "1998-06-26", "1998-07-03", "1998-07-24", "1998-07-31",
"1999-02-05", "1999-02-12", "1999-06-04", "1999-06-11", "1999-11-19",
"1999-11-26", "2000-01-07", "2000-01-14", "2000-05-26", "2000-06-02",
"2000-11-17", "2000-11-24", "2000-12-22", "2000-12-29", "2001-01-05",
"2001-01-12", "2001-05-18", "2001-05-25", "2001-11-02", "2001-11-09",
"2002-01-04", "2002-01-11", "2002-04-19", "2002-04-26", "2002-11-15",
"2002-11-22", "2003-01-03", "2003-01-10", "2003-05-09", "2003-05-16",
"2003-05-23", "2003-05-30", "2003-09-12", "2003-09-19", "2003-11-07",
"2003-11-14"), class = "factor"), chart_year = c(1997L, 1997L,
1997L, 1997L, 1997L, 1997L, 1998L, 1998L, 1998L, 1998L, 1998L,
1998L, 1998L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2001L, 2001L,
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2002L, 2002L, 2002L,
2002L, 2002L, 2002L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L,
2003L, 2003L, 2003L, 2003L, 2003L, 2003L), revenue = c(52969336L,
71183357L, 76457208L, 43593212L, 81172327L, 45111185L, 45012810L,
37568867L, 48261L, 49760360L, 36612617L, 1962627L, 1441774L,
23093123L, 65927993L, 5107876L, 4771193L, 3000000L, 82216507L,
84977355L, 59922105L, 8431650L, 7296370L, 78711571L, 40769776L,
83347490L, 37133438L, 56498192L, 63650772L, 2968580L, 788895L,
76599345L, 57025088L, 28499878L, 22837762L, 106131568L, 61909948L,
4502006L, 3272808L, 822068L, 1078673L, 3843873L, 2101748L, 42508303L,
121361422L, 10163670L, 11628760L, 29944555L, 14018616L, 100066590L,
49010220L, 3536766L, 3321470L), theaters = c(2674L, 2711L, 3020L,
3020L, 3281L, 3282L, 2727L, 2746L, 58L, 2453L, 2540L, 214L, 214L,
3236L, 3236L, 1027L, 1140L, 0L, 3312L, 3127L, 3134L, 2752L, 2326L,
2774L, 2929L, 3653L, 3653L, 3587L, 3623L, 2594L, 912L, 3237L,
3269L, 2948L, 3048L, 3682L, 3682L, 1425L, 1313L, 108L, 141L,
1808L, 1180L, 3603L, 3603L, 576L, 1177L, 3282L, 3289L, 3483L,
3492L, 1194L, 1212L), running_time = c(194L, 194L, 98L, 98L,
134L, 134L, 194L, 194L, 98L, 169L, 169L, 220L, 220L, 92L, 92L,
169L, 169L, 95L, 95L, 105L, 105L, 92L, 92L, 143L, 143L, 126L,
126L, 90L, 90L, 105L, 105L, 92L, 92L, 143L, 143L, 161L, 161L,
92L, 92L, 95L, 95L, 133L, 133L, 138L, 138L, 135L, 135L, 0L, 0L,
102L, 102L, 95L, 95L), ifUS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-53L))
and my code is :
coefLM <- function(x) {
coef(lm(log(revenue) ~ running_time, data = x))[2]
}
spl <- with(pdata, split(pdata, list(chart_year = chart_year, ifUS = ifUS)))
out <- unique(pdata[, c("chart_year", "ifUS")])
out <- transform(out, slope = sapply(spl, coefLM))
out
However, there is a time mismatch in the result. for example for "1999.0.running_time" the chart_year is "2012". would you please guide me on what could be the possible cause?