-10

I am using a 'shares model' to estimate values for missing observations. With the example data set my.data I am filling missing observations for each of three years proportionally to how observations were distributed in 1970 (although I could do so using 2010 or both 1970 and 2010).

Below I present example data, the desired result and code to obtain desired estimates in two ways. Code for the first approach is very specific to this example. I wish to create a more-general function than that used in the second approach. Creating a more-general function seems to me to require calling a function on a list of lists. I am hoping someone might offer advice on how to apply a function to a list of lists.

Here is the example data set and the highly specific solution:

my.data <- read.table(text = '
 county  y1970  y1980  y1990  y2000  y2010
   aa      50     NA     70     NA     500
   cc      10     20     NA     NA     100
   ee     800     NA     NA    400    8000
   gg    1000   1900     NA     NA   10000
   ii     200    400    300    100    2000
   kk      20     40     30     NA     200
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

my.total <- read.table(text = '
   county  y1970  y1980  y1990  y2000  y2010
   total    2080   4000   3000   1000  20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

desired.result <- read.table(text = '
 county  y1970  y1980         y1990       y2000       y2010
   aa      50  96.47059         70      23.148148       500
   cc      10     20         14.36464    4.629630       100
   ee     800   1543.529   1149.17127      400         8000
   gg    1000   1900       1436.46409   462.962963    10000
   ii     200    400           300         100         2000
   kk      20     40            30       9.259259       200
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

x70 <- c(50, 800)
estimates.for.80 <- (x70 / sum(x70)) * (my.total$y1980 - sum(my.data$y1980, na.rm = TRUE))

x80 <- c(10, 800, 1000)
estimates.for.90 <- (x80 / sum(x80)) * (my.total$y1990 - sum(my.data$y1990, na.rm = TRUE))

x90 <- c(50, 10, 1000, 20)
estimates.for.00 <- (x90 / sum(x90)) * (my.total$y2000 - sum(my.data$y2000, na.rm = TRUE))

Here is the function. I think this can be generalized if I knew how to include d.counties as an input list to the function. In other words, how can I include d.counties in my.input and still have the function work? My confusion I think stems from the length of d.counties differing among years.

state <- 'my.state'

my.df <- read.table(text = '
  county   y1970  y1980  y1990  y2000   y2010
      aa      50     NA     70     NA     500
      cc      10     20     NA     NA     100
      ee     800     NA     NA    400    8000
      gg    1000   1900     NA     NA   10000
      ii     200    400    300    100    2000
      kk      20     40     30     NA     200
   total    2080   4000   3000   1000   20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

pre.divide.up <- tail(my.df[,2:ncol(my.df)], 1) - colSums(head(my.df[,2:ncol(my.df)], -1), na.rm = TRUE)

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year  post.year  use.years
      y1980     y1970     y2010        pre
      y1990     y1970     y2010        pre
      y2000     y1970     y2010        pre
', header = TRUE, na.strings = "NA")

d.counties.80 <- c( 'aa' ,
                    'ee' )

d.counties.90 <- c( 'cc' ,
                    'ee' , 
                    'gg' )

d.counties.00 <- c( 'aa' ,
                    'cc' ,
                    'gg' ,
                    'kk' )

d.counties <- list(d.counties.80, d.counties.90, d.counties.00)

my.input <- data.frame(shares)

my.function <- function(y) {

# extract years of interest from my.df and store in data.frame called year.data
if(y[[4]] != 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]]        )]

# subset counties in year.data to only include counties with NA's in current year
if(as.numeric(substr(y[1], 2, 5)) == 1980) year.data = year.data[year.data$county %in% d.counties.80,]
if(as.numeric(substr(y[1], 2, 5)) == 1990) year.data = year.data[year.data$county %in% d.counties.90,]
if(as.numeric(substr(y[1], 2, 5)) == 2000) year.data = year.data[year.data$county %in% d.counties.00,]

# reorder columns in year.data
if(y[[4]] != 'last') year.data = year.data[, c('county', y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = year.data[, c('county', y[[2]], y[[1]]        )]

# values to be divided, or distributed, among counties with NA's in the current year
divide.up <- pre.divide.up[, y[[1]]] 

# sum values from designated pre and/or post years and bind those totals to bottom of year.data
if(y[[4]] != 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:4)], na.rm=TRUE)))))
if(y[[4]] == 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:3)], na.rm=TRUE)))))
names(colsums.year) <- names(year.data)
year.data.b <- rbind(year.data, colsums.year)

# obtain percentages in designated pre and/or post years for counties with NA's in current year
year.data.c <- year.data.b
year.data.c[, -1] <- lapply( year.data.c[  , -1], function(x){ x/x[nrow(year.data.b)] } )

# estimate county values for current year by distributing total missing values in current year
# according to how values were distributed in those same counties in other years
if(y[[4]] == 'both') year.data.b[, y[[1]]] = rowMeans(data.frame(year.data.c[, y[[2]]], year.data.c[, y[[3]]])) * as.numeric(divide.up)
if(y[[4]] ==  'pre') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
if(y[[4]] == 'post') year.data.b[, y[[1]]] = year.data.c[, y[[3]]] * as.numeric(divide.up)
if(y[[4]] == 'last') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)

# extract estimates for current year along with the county column, then remove the last row
year.data.last <- year.data.b[names(year.data.b) %in% c("county", y[[1]])]
year.data.last <- year.data.last[-nrow(year.data.last),]
colnames(year.data.last) <- c('county', 'acreage')

# create a data set for export
this.year <- rep(as.numeric(substr(y[[1]], 2, 5)), nrow(year.data.last))
revised.data <- data.frame(state, this.year, year.data.last)
return(revised.data) 
}

my.list  <- apply(shares, 1, function(y) my.function(y))
my.list2 <- do.call("rbind", my.list)
my.list2

      state this.year county     acreage
1  my.state      1980     aa   96.470588
3  my.state      1980     ee 1543.529412
2  my.state      1990     cc   14.364641
31 my.state      1990     ee 1149.171271
4  my.state      1990     gg 1436.464088
11 my.state      2000     aa   23.148148
21 my.state      2000     cc    4.629630
41 my.state      2000     gg  462.962963
6  my.state      2000     kk    9.259259

Although this function is not as general as the one in my answer below, the function above does allow explicit designation of which counties have relevant missing values. In the actual data there are two types of missing values and the function in my answer below cannot tell the two types apart. The function above can tell them apart because I tell it exactly which counties to consider each year.

Thank you again for any advice and for advice already offered.

Mark Miller
  • 12,483
  • 23
  • 78
  • 132
  • 5
    Dude! Ever heard of a [small reproducible example](http://stackoverflow.com/q/5963269/1478381)?! – Simon O'Hanlon Aug 02 '13 at 23:51
  • @SimonO101 I think the example is a small one. The post is long because I have presented multiple functional solutions working toward a desired highly-general solution that seems to involve lists of lists. – Mark Miller Aug 02 '13 at 23:58
  • 3
    Please remove anything that is not relevant to the question. And it's not just the code... – flodel Aug 02 '13 at 23:58
  • 2
    @MarkMiller it's impossible to decipher the example in a time frame any sane person would consider reasonable! I'm just offering this as advice. The length of your post is possibly attracting downvotes. – Simon O'Hanlon Aug 03 '13 at 00:00
  • 2
    Looking at your other posts [here](http://stackoverflow.com/questions/17903205/sum-multiple-columns-by-group-with-tapply), [here](http://stackoverflow.com/questions/17798363/expand-grid-when-one-variable-is-really-two-columns), [here](http://stackoverflow.com/questions/17464237/extracting-table-from-text-file), it's clear that your questions are *long*. In trying to be complete, it comes out excessive. Please be aware of it for the future. – Arun Aug 03 '13 at 06:28
  • 1
    I think 8 downvotes is a bit harsh. Some advice: If your question gets long and you think everything included is relevant, put an abstract (no more than three sentences) with the main question on top. But in general, brevity is a virtue. – Roland Aug 03 '13 at 07:42
  • @PatrickT As far as I can tell, my answer is the valid answer. It does not seem right to me to accept my own answer. – Mark Miller Nov 26 '13 at 04:24
  • As far as I know, it's okay to accept your own answer. It makes the site easier to navigate if one answer has been selected. – PatrickT Nov 27 '13 at 06:50

3 Answers3

4

I think your whole problem can be summarised in a few lines. This is far too long. If your problem really is, as the title states, applying a function to a list of lists then you need to recursively apply a function. There is a construct for doing that and it is rapply:

w <- 1:5
x <- 1:5
y <- 6:10
z <- 6:10

ll <- list( list( w , x) , list( y , z) )
str(ll)
List of 2
 $ :List of 2
  ..$ : int [1:5] 1 2 3 4 5
  ..$ : int [1:5] 1 2 3 4 5
 $ :List of 2
  ..$ : int [1:5] 6 7 8 9 10
  ..$ : int [1:5] 6 7 8 9 10

rapply( ll , mean )
[1] 3 3 8 8

As a suggestion, basically you could have boiled your problem down to...

I've got this list of lists, but when I try to use lapply it doesn't work...

lapply( ll , mean )
[[1]]
[1] NA

[[2]]
[1] NA

Warning messages:
1: In mean.default(X[[1L]], ...) :
  argument is not numeric or logical: returning NA
2: In mean.default(X[[2L]], ...) :
  argument is not numeric or logical: returning NA
Simon O'Hanlon
  • 58,647
  • 14
  • 142
  • 184
  • 1
    +1 - I'd love it if your answer included what the question should have looked like. – flodel Aug 02 '13 at 23:59
  • The question should have looked like: `Hey guys, how do you apply a function to a list of lists?` – Señor O Aug 03 '13 at 00:01
  • @SimonO101 Thanks for the reply. If I get `rapply` to work, I will edit the question and possibly post a solution... ...assuming the question has not been deleted. – Mark Miller Aug 03 '13 at 00:53
  • @MarkMiller Given the lack of any close votes, but numerous down votes, I'd say the consensus is that this is a perfectly valid question that won't be closed/deleted, but we'd just like it if you kept it a bit shorter next time! :) – joran Aug 03 '13 at 01:02
2

This question is messed up. I'll try to explain your question first and then answer so that it's easier to figure out if I understood your question or not.

Gist of your problem:

Okay, after sifting through your question for what's required, I see that you've a data.frame my.data:

  county y1970 y1980 y1990 y2000 y2010
1     aa    50    NA    70    NA   500
2     cc    10    20    NA    NA   100
3     ee   800    NA    NA   400  8000
4     gg  1000  1900    NA    NA 10000
5     ii   200   400   300   100  2000
6     kk    20    40    30    NA   200

And another my.total:

  county y1970 y1980 y1990 y2000 y2010
1  total  2080  4000  3000  1000 20800

And what you'd like is (desired.result):

  county y1970      y1980      y1990      y2000 y2010
1     aa    50   96.47059   70.00000  23.148148   500
2     cc    10   20.00000   14.36464   4.629630   100
3     ee   800 1543.52900 1149.17127 400.000000  8000
4     gg  1000 1900.00000 1436.46409 462.962963 10000
5     ii   200  400.00000  300.00000 100.000000  2000
6     kk    20   40.00000   30.00000   9.259259   200

Criteria:

The way I understand your criteria is, for each numeric/integer column in my.data that has NA in it, take the entries of column 1970 corresponding to those NA entries and replace those NA entries with the formula:

vals <- corresponding entries in column 1970
this column's NA's <- vals/sum(vals) * (my.total of this column - 
                    sum(this.column, na.rm=TRUE))

Possible solution:

I don't think there's a need for "list of lists" here. Here's one way using simple for-loops (as it's conceptually easier for this problem). This is because you want to modify some items within some columns of a data.frame. df will be the desired.result.

df <- my.data
for (i in which(colSums(is.na(df), na.rm=TRUE) > 0)) {
    idx <- which(is.na(df[[i]]))
    xx <- df[["y1970"]][idx]
    df[[i]][idx] <- (xx/sum(xx)) * (my.total[[i]] - sum(df[[i]], na.rm=TRUE))
}

  county y1970      y1980      y1990      y2000 y2010
1     aa    50   96.47059   70.00000  23.148148   500
2     cc    10   20.00000   14.36464   4.629630   100
3     ee   800 1543.52941 1149.17127 400.000000  8000
4     gg  1000 1900.00000 1436.46409 462.962963 10000
5     ii   200  400.00000  300.00000 100.000000  2000
6     kk    20   40.00000   30.00000   9.259259   200
Arun
  • 116,683
  • 26
  • 284
  • 387
  • Thanks. I have tried so hard to abandon `for-loops` that I did not even think to consider them here. I was thinking of `d.counties` as a list within a possible list named `my.inputs`. – Mark Miller Aug 03 '13 at 06:26
0

In my original post I asked how I could include a list d.counties as input into a function without having to use a series of specific if statements. Here is the solution I came up with.

Step 1. Create the list d.counties so that names are retained:

d.counties.1980 <- c( 'aa' ,
                      'ee' )

d.counties.1990 <- c( 'cc' ,
                      'ee' , 
                      'gg' )

d.counties.2000 <- c( 'aa' ,
                      'cc' ,
                      'gg' ,
                      'kk' )

list.function <-  function() { 

     sapply(c("d.counties.1980",
              "d.counties.1990",
              "d.counties.2000"), get, environment(), simplify = FALSE) 
} 

d.counties <- list.function()

Step 2. Inside the function replace the series of if statements that explicitly specify missing observations for individual years with the following general line that accesses the list d.counties using names therein without explicitly specifying an individual name for an individual year:

year.data = year.data[year.data$county %in% d.counties[substr(names(d.counties), 12, 15) == substr(y[1], 2, 5)][[1]],]

Here is the entire code for this solution:

state <- 'my.state'

my.df <- read.table(text = '
  county   y1970  y1980  y1990  y2000   y2010
      aa      50     NA     70     NA     500
      cc      10     20     NA     NA     100
      ee     800     NA     NA    400    8000
      gg    1000   1900     NA     NA   10000
      ii     200    400    300    100    2000
      kk      20     40     30     NA     200
   total    2080   4000   3000   1000   20800
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

pre.divide.up <- tail(my.df[,2:ncol(my.df)], 1) - colSums(head(my.df[,2:ncol(my.df)], -1), na.rm = TRUE)

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year  post.year  use.years
      y1980     y1970     y2010        pre
      y1990     y1970     y2010        pre
      y2000     y1970     y2010        pre
', header = TRUE, na.strings = "NA")

d.counties.1980 <- c( 'aa' ,
                      'ee' )

d.counties.1990 <- c( 'cc' ,
                      'ee' , 
                      'gg' )

d.counties.2000 <- c( 'aa' ,
                      'cc' ,
                      'gg' ,
                      'kk' )

list.function <-  function() { 

     sapply(c("d.counties.1980",
              "d.counties.1990",
              "d.counties.2000"), get, environment(), simplify = FALSE) 
} 

d.counties <- list.function()
d.counties

my.input <- data.frame(shares)

my.function <- function(y) {

# extract years of interest from my.df and store in data.frame called year.data
if(y[[4]] != 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = my.df[names(my.df) %in% c("county", y[[2]], y[[1]]        )]

# subset counties in year.data to only include counties with NA's in current year
year.data = year.data[year.data$county %in% d.counties[substr(names(d.counties), 12, 15) == substr(y[1], 2, 5)][[1]],]

# reorder columns in year.data
if(y[[4]] != 'last') year.data = year.data[, c('county', y[[2]], y[[1]], y[[3]])]
if(y[[4]] == 'last') year.data = year.data[, c('county', y[[2]], y[[1]]        )]

# values to be divided, or distributed, among counties with NA's in the current year
divide.up <- pre.divide.up[, y[[1]]] 

# sum values from designated pre and/or post years and bind those totals to bottom of year.data
if(y[[4]] != 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:4)], na.rm=TRUE)))))
if(y[[4]] == 'last') colsums.year = data.frame('total', as.data.frame(t(as.numeric(colSums(year.data[,c(2:3)], na.rm=TRUE)))))
names(colsums.year) <- names(year.data)
year.data.b <- rbind(year.data, colsums.year)

# obtain percentages in designated pre and/or post years for counties with NA's in current year
year.data.c <- year.data.b
year.data.c[, -1] <- lapply( year.data.c[  , -1], function(x){ x/x[nrow(year.data.b)] } )

# estimate county values for current year by distributing total missing values in current year
# according to how values were distributed in those same counties in other years
if(y[[4]] == 'both') year.data.b[, y[[1]]] = rowMeans(data.frame(year.data.c[, y[[2]]], year.data.c[, y[[3]]])) * as.numeric(divide.up)
if(y[[4]] ==  'pre') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)
if(y[[4]] == 'post') year.data.b[, y[[1]]] = year.data.c[, y[[3]]] * as.numeric(divide.up)
if(y[[4]] == 'last') year.data.b[, y[[1]]] = year.data.c[, y[[2]]] * as.numeric(divide.up)

# extract estimates for current year along with the county column, then remove the last row
year.data.last <- year.data.b[names(year.data.b) %in% c("county", y[[1]])]
year.data.last <- year.data.last[-nrow(year.data.last),]
colnames(year.data.last) <- c('county', 'estimates')

# create a data set for export
this.year <- rep(as.numeric(substr(y[[1]], 2, 5)), nrow(year.data.last))
revised.data <- data.frame(state, this.year, year.data.last)
return(revised.data) 
}

my.list  <- apply(shares, 1, function(y) my.function(y))
my.list2 <- do.call("rbind", my.list)
my.list2

      state this.year county   estimates
1  my.state      1980     aa   96.470588
3  my.state      1980     ee 1543.529412
2  my.state      1990     cc   14.364641
31 my.state      1990     ee 1149.171271
4  my.state      1990     gg 1436.464088
11 my.state      2000     aa   23.148148
21 my.state      2000     cc    4.629630
41 my.state      2000     gg  462.962963
6  my.state      2000     kk    9.259259

Here is an alternative function that developed out of Arun's answer. With this function I accessed the data.frame shares inside the function by using sapply to allow me to treat column names as variables. However, this function is not adequate for the task because there are two types of missing observations with my actual data and this function cannot distinguish between the two. The function above can distinguish between the two because I explicitly specify the relevant missing observations in the list d.counties. In my example data set I assume all missing observations are the same kind, so both functions return the same estimates.

# data set
my.data <- read.table(text = '
 county  y1970  y1980  y1990  y2000  y2010
   aa      50     NA     70     NA     550
   cc      10     20     NA     NA     100
   ee     800     NA     NA    400    9000
   gg    1000   1900     NA     NA   12000
   ii     200    400    300    100    1500
   kk      20     40     30     NA     100
total    2080   4000   3000   1000   23250
', header = TRUE, na.string='NA', stringsAsFactors=FALSE)

# extract columns with NA's
my.data2 <- my.data[(which(colSums(is.na(my.data), na.rm=TRUE) > 0))]

# For each column containing NA's define the years to use as shares
# If use.years = 'pre'  then use the year in pre.year
# If use.years = 'post' then use the year in post.year
# If use.years = 'both' then use both the year in pre.year and the year in post.year
#
# Here I define pre.year = y1970 and post.year = 2010 for every year
# However, 'pre.year' and 'post.year' are variables.  They can differ among rows below.
shares <- read.table(text = '
      cyear   pre.year   post.year   use.years
      y1980     y1970      y2010        pre
      y1990     y1970      y2010        post
      y2000     y1970      y2010        both
', header = TRUE, na.strings = "NA")

# extract last row of my.data2
my.total   <- my.data2[nrow(my.data),]

# For each column sum all but the last row of my.data2
my.colsums <- colSums(my.data2[1:(nrow(my.data2)-1),], na.rm = TRUE)

# For each column in my.data2 calculate the number to be divided among rows with NA's
divide.up  <- my.total - my.colsums

my.function <- function(x) { 
    idx <- which(is.na((my.data2)[x]))
    names.x <- as.character(colnames(my.data2)[x])

    my.pre.col  <- as.character(shares$pre.year[shares$cyear==names.x])
    my.post.col <- as.character(shares$post.year[shares$cyear==names.x])
    my.use.year <- as.character(shares$use.years[shares$cyear==names.x])

    xx.pre  <- my.data[[my.pre.col]][idx]
    xx.post <- my.data[[my.post.col]][idx]

    if(my.use.year=='pre' ) my.data2[[x]][idx] =   (xx.pre /sum(xx.pre ))                                * divide.up[[x]]
    if(my.use.year=='post') my.data2[[x]][idx] =   (xx.post/sum(xx.post))                                * divide.up[[x]]
    if(my.use.year=='both') my.data2[[x]][idx] = (((xx.pre /sum(xx.pre )) + (xx.post/sum(xx.post))) / 2) * divide.up[[x]]

    return(my.data2[x])

}

na.estimates <- sapply(1:ncol(my.data2), function(x) {my.function(x)})
revised.data  <- t(do.call("rbind", na.estimates))
revised.data

          y1980      y1990       y2000
[1,]   96.47059   70.00000   22.358388
[2,]   20.00000   12.32227    4.275599
[3,] 1543.52941 1109.00474  400.000000
[4,] 1900.00000 1478.67299  466.775599
[5,]  400.00000  300.00000  100.000000
[6,]   40.00000   30.00000    6.590414
[7,] 4000.00000 3000.00000 1000.000000
Mark Miller
  • 12,483
  • 23
  • 78
  • 132