0

I have a dataframe as shown. How can I create a loop that replace 'NA' with values from the previous row condition on outyear is smaller or equal to gdpyear (and for age, replace 'NA' with previous row's values+1 if the condition is TRUE?

idacr gdpyear leaderid    leader                       outyear age careerpolitician
"AFG" 1929 "LEAD.v1-7717" "Nadir Shah"                 "1933" "53" "0" 
"AFG" 1933 "LEAD.v1-7720" "Hashim Khan"                "1946" "62" "1" 
"AFG" 1946 "LEAD.v1-7723" "Mahmud Khan Ghazi"          "1953" "65" "1" 
"AFG" 1950 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1951 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1952 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1953 "LEAD.v1-7726" "Sardar Mohammad Daud Khan"  "1963" "54" "1" 
"AFG" 1954 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1955 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1956 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1957 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1958 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1959 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1960 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1961 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1962 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1963 "LEAD.v1-7729" "Mohammad Yusuf"             "1965" "48" "1" 
"AFG" 1964 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1965 "LEAD.v1-7732" "Mohammad Hashim Maiwandwal" "1967" "48" "1" 
"AFG" 1966 "NA"           "NA"                         "NA"   "NA" "NA"
"AFG" 1967 "LEAD.v1-7735" "Abdallah Yakta"             "1967" "NA" "0" 
"AFG" 1967 "LEAD.v1-7738" "Nur Ahmad Etemadi"          "1971" "50" "1" 

So the new dataframe looks something like:

idacr gdpyear leaderid     leader                     outyear age careerpolitician
"AFG" 1953 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "54" "1"
"AFG" 1954 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "55" "1"
"AFG" 1955 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "56" "1"
"AFG" 1956 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "57" "1"
"AFG" 1957 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "58" "1"
"AFG" 1958 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "59" "1"
"AFG" 1959 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "60" "1"
"AFG" 1960 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "61" "1"
"AFG" 1961 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "62" "1"
"AFG" 1962 "LEAD.v1-7726" "Sardar Mohammad Daud Khan" "1963" "63" "1"
"AFG" 1963 "LEAD.v1-7729" "Mohammad Yusuf" "1965" "48" "1"

Thanks.

  • 3
    Hi there, what have you tried so far? – Michelle Jul 11 '20 at 11:36
  • 1
    Your "missing values" are `"NA"` strings, not `NA`. – jay.sf Jul 11 '20 at 11:41
  • @Michelle I have tried fill() but I am new to R and I don't know how to use it in a if statement. – Ng David Jul 11 '20 at 11:48
  • Have you tried using `fill` (e.g. https://stackoverflow.com/questions/40040834/replace-na-with-previous-or-next-value-by-group-using-dplyr) then filtering rows by your conditions? If possible, please update your question with your own attempts to solve this problem (i.e. include your code) – jared_mamrot Jul 11 '20 at 11:52
  • Could you use ```dput(df)```? As jay.sf points out, part of why fill does not work here is that these are strings and not actually ```NA```. – Cole Jul 11 '20 at 11:58

1 Answers1

0

Here is a solution that uses both and to

  1. Import your text as a data.frame
  2. Convert "NA" to NA
  3. Use tidyr::fill to fill all NA values.
  4. Update the age
library(data.table)
library(ti)
dt = fread('idacr gdpyear leaderid    leader                       outyear age careerpolitician
      "AFG" 1929 "LEAD.v1-7717" "Nadir Shah"                 "1933" "53" "0" 
      "AFG" 1933 "LEAD.v1-7720" "Hashim Khan"                "1946" "62" "1" 
      "AFG" 1946 "LEAD.v1-7723" "Mahmud Khan Ghazi"          "1953" "65" "1" 
      "AFG" 1950 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1951 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1952 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1953 "LEAD.v1-7726" "Sardar Mohammad Daud Khan"  "1963" "54" "1" 
      "AFG" 1954 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1955 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1956 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1957 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1958 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1959 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1960 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1961 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1962 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1963 "LEAD.v1-7729" "Mohammad Yusuf"             "1965" "48" "1" 
      "AFG" 1964 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1965 "LEAD.v1-7732" "Mohammad Hashim Maiwandwal" "1967" "48" "1" 
      "AFG" 1966 "NA"           "NA"                         "NA"   "NA" "NA"
      "AFG" 1967 "LEAD.v1-7735" "Abdallah Yakta"             "1967" "NA" "0" 
      "AFG" 1967 "LEAD.v1-7738" "Nur Ahmad Etemadi"          "1971" "50" "1" ',
       data.table = FALSE    )


dt[dt == "NA"] <- NA_character_
dt[, c("gdpyear", "outyear", "age", "careerpolitician")] <- lapply(dt[, c("gdpyear", "outyear", "age", "careerpolitician")], as.integer)

dt = tidyr::fill(dt, leaderid, leader, outyear, age, careerpolitician)

setDT(dt)
dt[, age := age + .I - 1L, by = leaderid]
dt
Cole
  • 11,130
  • 1
  • 9
  • 24