2

I'm working on a panel dataset with thousands of observations. Let me simplify things as much as I can. Suppose I have the following dataset

# Toy panel: two countries (BEL, USA) observed yearly from 1990 to 2005.
# The seed is fixed so the simulated USA series is reproducible.
set.seed(123)

# USA gdp values are drawn uniformly on [8, 9]; BEL values are typed in.
gdp_usa <- runif(16, 8, 9)
gdp_bel <- c(
  9.22707, 9.245133, 9.272205, 9.31063, 9.339993, 9.364777, 9.376749,
  9.364378, 9.393332, 9.447258, 9.491499, 9.537432, 9.572997, 9.631823,
  9.657445, 9.680416
)

# "potential" flags the candidate starting years: position 8 is 1997 and
# position 9 is 1998 in the 1990:2005 sequence.
pot_usa <- replace(numeric(16), 8, 1)
pot_bel <- replace(numeric(16), 8:9, 1)

# Long-format panel, BEL rows first, then USA rows.
df <- data.frame(
  country = rep(c("BEL", "USA"), each = 16),
  year = rep(1990:2005, times = 2),
  gdp = c(gdp_bel, gdp_usa),
  potential = c(pot_bel, pot_usa)
)

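What I have to do is run the following regression for each country, possibly in a loop (the formula was originally posted as an image; it is written out here in the notation used by the code further down):

$$gdp_t = \beta_0 + \beta_1\, t + \beta_2 \max(t - year_i,\ 0) + \varepsilon_t, \qquad t \in [\,year_i - n,\ year_i + n\,],$$

where $t$ is the calendar year, $year_i$ is a candidate starting year (a year in which "potential" equals 1) and $n = 7$.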

To make things clearer: for BEL, year_1 = {1997, 1998}, i.e., the years for which the "potential" variable equals 1. This means that for Belgium I should run 2 regressions:

R1: regress Y over X1,X2, with Y=gdp_bel from 1990 to 2004 and X1={1990,1991,...,2004} and X2={0,0,0,0,0,0,0,0,1,2,3,4,5,6,7}

R2: regress Y over X1,X2 with Y_bis=gdp_bel from 1991 to 2005 and X1_bis={1991,...,2005} and X2_bis={0,0,0,0,0,0,0,0,1,2,3,4,5,6,7}

Then, I compute the F-stat for each regression as follows:

# F-test for the contribution of X2: compare R1 with the same model refitted
# without the X2 term (assumes R1 is the fitted model for regression R1 above).
test_result <- anova(R1, update(R1, . ~ . - X2))

And:

# Same F-test for the second candidate year: compare R2 with the model refitted
# without X2_bis (assumes R2 is the fitted model for regression R2 above).
test_result2 <- anova(R2, update(R2, . ~ . - X2_bis))

Then, I'll pick the starting year with the highest F-stat.

How to write this procedure in an efficient way by considering:

-I have almost 200 different countries

-For $i > n$ I can have a non-competing starting year. For BEL, for instance, if the data spanned up to 2022 and potential = 1 in 2011 and 2013, then I would have 2 starting years: one would be the winner between 1997 and 1998 (the one with the highest F-stat), and the other would be the winner between 2011 and 2013.

UPDATE

Based on your suggestion, I got what I was looking for:

#' F-statistic for a trend break starting after `year_i`
#'
#' Fits a linear trend (`gdp ~ year`) and the same trend augmented with a
#' break term that is zero up to `year_i` and counts the years elapsed
#' afterwards, then compares the two nested fits with `anova()`.
#'
#' @param data Data frame with columns `gdp` and `year`.
#' @param year_i Candidate starting year of the break.
#' @param n Window half-width; accepted for the caller's convenience but not
#'   used here (the caller is expected to have filtered `data` already).
#' @return The F-statistic of the break term (second row of the anova table).
perform_regression <- function(data, year_i, n) {
  # Break regressor: 0 for years <= year_i, then 1, 2, 3, ...
  data[["year_diff"]] <- pmax(data[["year"]] - year_i, 0)

  restricted <- lm(gdp ~ year, data = data)
  unrestricted <- lm(gdp ~ year + year_diff, data = data)

  # Row 1 of the table is the restricted model (no F), row 2 holds the test.
  comparison <- anova(restricted, unrestricted)
  comparison[["F"]][2]
}

# For every country, evaluate each candidate starting year (a year with
# potential == 1) on a symmetric window around it and report the year whose
# break term yields the largest F-statistic.

# Half-width of the estimation window: years kept on each side of year_i.
window_half_width <- 7

# Get unique country names
unique_countries <- unique(df$country)

# Loop through each country
for (country in unique_countries) {
  country_data <- df[df$country == country, ]

  # Get potential starting years; which() drops rows whose flag is NA so that
  # no NA year is ever tried as a candidate.
  potential_starting_years <- unique(
    country_data$year[which(country_data$potential == 1)]
  )

  best_f_statistic <- -Inf
  best_starting_year <- NA

  cat("Country:", country, "\n")

  # Loop through potential starting years
  for (year_i in potential_starting_years) {
    in_window <- abs(country_data$year - year_i) <= window_half_width
    filtered_data <- country_data[in_window, ]
    f_statistic <- perform_regression(
      filtered_data, year_i,
      n = window_half_width
    )

    cat("Year_i:", year_i, "F-statistic:", f_statistic, "\n")

    # The F-statistic is NA/NaN when the break term is all zero inside the
    # window (candidate at the end of the panel) or when no residual degrees
    # of freedom are left; comparing it directly would abort the loop with
    # "missing value where TRUE/FALSE needed", so such candidates are skipped.
    if (!is.na(f_statistic) && f_statistic > best_f_statistic) {
      best_f_statistic <- f_statistic
      best_starting_year <- year_i
    }
  }

  # No usable candidate: report NA rather than the -Inf sentinel.
  if (is.na(best_starting_year)) {
    best_f_statistic <- NA
  }

  cat("Best Starting Year:", best_starting_year, "\n")
  cat("Best F-statistic:", best_f_statistic, "\n\n")
}
df

The last point should be to get an outcome like this :

country year      gdp potential  fstat    max
1      BEL 1990 9.227070         0     NA     NA
2      BEL 1991 9.245133         0     NA     NA
3      BEL 1992 9.272205         0     NA     NA
4      BEL 1993 9.310630         0     NA     NA
5      BEL 1994 9.339993         0     NA     NA
6      BEL 1995 9.364777         0     NA     NA
7      BEL 1996 9.376749         0     NA     NA
8      BEL 1997 9.364378         1 25.330 34.380
9      BEL 1998 9.393332         1 34.380 34.380
10     BEL 1999 9.447258         0     NA     NA
11     BEL 2000 9.491499         0     NA     NA
12     BEL 2001 9.537432         0     NA     NA
13     BEL 2002 9.572997         0     NA     NA
14     BEL 2003 9.631823         0     NA     NA
15     BEL 2004 9.657445         0     NA     NA
16     BEL 2005 9.680416         0     NA     NA
17     USA 1990 8.287578         0     NA     NA
18     USA 1991 8.788305         0     NA     NA
19     USA 1992 8.408977         0     NA     NA
20     USA 1993 8.883017         0     NA     NA
21     USA 1994 8.940467         0     NA     NA
22     USA 1995 8.045556         0     NA     NA
23     USA 1996 8.528105         0     NA     NA
24     USA 1997 8.892419         1  0.945  0.945
25     USA 1998 8.551435         0     NA     NA
26     USA 1999 8.456615         0     NA     NA
27     USA 2000 8.956833         0     NA     NA
28     USA 2001 8.453334         0     NA     NA
29     USA 2002 8.677571         0     NA     NA
30     USA 2003 8.572633         0     NA     NA
31     USA 2004 8.102925         0     NA     NA
32     USA 2005 8.899825         0     NA     NA

Any suggestion?

Maximilian
  • 235
  • 1
  • 7

1 Answers1

2

I don't know if it's an issue with the data or with the way you were pulling the first value of the F-statistic out of the anova result, so I left it open-ended: I only print the values computed in the loop and don't pick, return, or compile the best one. But I think I improved the data part, so that you get seven years before and after each potential date.

# Toy panel for the answer: BEL values are typed in, USA values are simulated.
# The seed is fixed so the simulated values are reproducible.
set.seed(123)

gdp_bel <- c(
  9.22707, 9.245133, 9.272205, 9.31063, 9.339993, 9.364777, 9.376749,
  9.364378, 9.393332, 9.447258, 9.491499, 9.537432, 9.572997, 9.631823,
  9.657445, 9.680416
)

# Candidate starting years: position 8 is 1997 and position 9 is 1998 in the
# 1990:2005 sequence.
pot_bel <- replace(numeric(16), 8:9, 1)
pot_usa <- replace(numeric(16), 8, 1)

# Long-format panel, BEL rows first; the USA gdp values are drawn uniformly on
# [7, 9] at construction time.
df <- data.frame(
  country = rep(c("BEL", "USA"), each = 16),
  year = rep(1990:2005, times = 2),
  gdp = c(gdp_bel, runif(16, min = 7, max = 9)),
  potential = c(pot_bel, pot_usa)
)



#' Print regression diagnostics and return the break-term F-statistic
#'
#' Fits a linear trend (`gdp ~ year`) and the same trend augmented with a
#' break term that is zero up to `year_i` and counts the years elapsed
#' afterwards. Prints each model's overall F-statistic together with the
#' `anova()` comparison of the two nested fits.
#'
#' @param data Data frame with columns `gdp` and `year`.
#' @param year_i Candidate starting year of the break.
#' @param n Window half-width; only echoed in the printed output.
#' @return Invisibly, the F-statistic of the break term (second row of the
#'   anova table), so callers can compare candidate years.
perform_regression <- function(data, year_i, n) {
  # Break regressor: 0 for years <= year_i, then 1, 2, 3, ...
  data$year_diff <- pmax(data$year - year_i, 0)
  simple_reg <- lm(gdp ~ year, data = data)
  complx_reg <- lm(gdp ~ year + year_diff, data = data)

  # Spell out `fstatistic` rather than relying on `$` partial matching.
  simple_fstat <- summary(simple_reg)$fstatistic["value"]
  complex_fstat <- summary(complx_reg)$fstatistic["value"]

  # Row 1 of the anova table is the baseline model (its F is NA); row 2 holds
  # the F-test of the added break term.
  test_result <- anova(simple_reg, complx_reg)
  cat("\n year_i n ", year_i, " ", n,
      "\nsimple F : ", simple_fstat,
      "\ncomplex F : ", complex_fstat,
      "\ntest res 1 F : ", test_result$F[1],
      "\ntest res 2 F : ", test_result$F[2],
      "\n")

  # Previously the function returned cat()'s NULL, leaving the caller with
  # nothing to compare; hand back the statistic without auto-printing it.
  invisible(test_result$F[2])
}

# Countries in their order of first appearance in the panel.
unique_countries <- unique(df$country)

# For each country, print the regression diagnostics of every candidate
# starting year. Picking the best year is intentionally left disabled.
for (country in unique_countries) {
  country_data <- df[df$country == country, ]

  # Candidate starting years are those flagged with potential == 1.
  potential_starting_years <- unique(
    country_data[country_data$potential == 1, "year"]
  )

  # Placeholders for the (currently disabled) best-year selection.
  best_f_statistic <- -Inf
  best_starting_year <- NA

  print("--------")
  print(country)

  for (year_i in potential_starting_years) {
    # Keep the seven years on either side of the candidate year.
    filtered_data <- country_data[
      country_data$year >= year_i - 7 & country_data$year <= year_i + 7,
    ]
    f_statistic <- perform_regression(filtered_data, year_i, n = 7)
    # Disabled selection step:
    # if (f_statistic > best_f_statistic) {
    #   best_f_statistic <- f_statistic
    #   best_starting_year <- year_i
    # }
  }

  # Disabled summary output:
  # cat("Country:", country, "\n")
  # cat("Best Starting Year:", best_starting_year, "\n")
  # cat("Best F-statistic:", best_f_statistic, "\n\n")
}
Nir Graham
  • 2,567
  • 2
  • 6
  • 10