1

I have the following dataset that has a lot of NA values in all the columns (except for Date.Local, all values available.) I want to replace all the GAs by interpolation/extrapolation.

enter image description here

So far I tried the following code:

library("zoo")
nev <- na.approx(név)

Error in approx(x[!na], y[!na], xout, ...) : 
  need at least two non-NA values to interpolate
In addition: Warning messages:
1: In na.approx.default(object, x = x, xout = xout, na.rm = FALSE,  :
  NAs introduced by coercion
2: In na.approx.default(object, x = x, xout = xout, na.rm = FALSE,  :
  NAs introduced by coercion
3: In xy.coords(x, y, setLab = FALSE) : NAs introduced by coercion
}

This isn't doing anything to remove the NAs. Also for the last 4 cols the data is NA for the last ~100 rows. enter image description here

enter image description here Output of nev as a .csv file.

"","Date.Local","cumulative_cases","cumulative_deaths","new_cases","new_deaths","pm2.5","temp","wind","press"
"456373",2020-01-22,0,0,NA,NA,10.8892157058824,50.3291667,102.76500005,957.083333
"456374",2020-01-23,0,0,0,0,12.1433946153846,53.4416666,106.43416665,957.375
"456375",2020-01-24,0,0,0,0,9.71062371428572,53.4444445555556,104.74458335,953.541667
"456376",2020-01-25,0,0,0,0,13.090885375,54.0509258888889,106.2454166,955.083333
"456377",2020-01-26,0,0,0,0,10.4494047857143,54.1666665,107.4506249,954.125
"456378",2020-01-27,0,0,0,0,4.04497283333333,57.2541667,141.6875,958
"456379",2020-01-28,0,0,0,0,6.8008605625,52.9083334,101.40020845,956.958333
"456380",2020-01-29,0,0,0,0,4.70659716666667,53.2583333,108.96375,954.416667
"456381",2020-01-30,0,0,0,0,3.48005957142857,52.2416666,98.6919021,957.958333
"456382",2020-01-31,0,0,0,0,8.5216145625,52.3666666,102.20354165,963.208333
"456383",2020-02-01,0,0,0,0,13.1172619285714,54.911458375,108.7499999,961.75
"456384",2020-02-02,0,0,0,0,12.60625,57.2791668,105.81625,946.352941
"456385",2020-02-03,0,0,0,0,4.34877458823529,41.475,158.19916665,948.666667
"456386",2020-02-04,0,0,0,0,2.57147446153846,39.7583334,124.341666631579,956.958333
"456387",2020-02-05,0,0,0,0,4.95521478571429,40.7041667,97.09187505,956.75
"456388",2020-02-06,0,0,0,0,9.55396126666667,49.3425926666667,91.9258334,953.5
"456389",2020-02-07,0,0,0,0,11.0976967142857,54.3541667,104.406798210526,953.541667
"456390",2020-02-08,0,0,0,0,12.3601190714286,54.3009258888889,99.99000005,948.083333
"456391",2020-02-09,0,0,0,0,4.69017857142857,49.7685183333333,130.606875,948.083333
"456392",2020-02-10,0,0,0,0,2.33012815384615,52.7124999,109.421929842105,951.458333
"456393",2020-02-11,0,0,0,0,2.97980769230769,49.3958334,102.6325001,955.083333
"456394",2020-02-12,0,0,0,0,7.11474353846154,49.7349637,102.7526359,953.125
"456395",2020-02-13,0,0,0,0,8.01398807142857,50.4958333,97.35916665,954.625
"456396",2020-02-14,0,0,0,0,10.4294871538462,51.7625,100.0085417,954.041667
"456397",2020-02-15,0,0,0,0,11.1705556,53.7416667,103.3418751,955.416667
"456398",2020-02-16,0,0,0,0,13.39629625,56.3148147777778,98.75041665,952.791667
"456399",2020-02-17,0,0,0,0,9.0129166,58.9958334,104.1306249,950.833333
"456400",2020-02-18,0,0,0,0,6.68954376923077,52.3875,85.109375,954.333333
"456401",2020-02-19,0,0,0,0,8.20976971428571,52.8208334,82.0013157368421,955.166667
"456402",2020-02-20,0,0,0,0,5.87976185714286,52.6125001,99.1868421578947,959.708333
"456403",2020-02-21,0,0,0,0,9.2473958125,56.2083334,97.72770845,956.166667
"456404",2020-02-22,0,0,0,0,8.31025638461539,53.0046297777778,84.2597917,950.458333
"456405",2020-02-23,0,0,0,0,5.02708333333333,55.6342593333333,110.4885418,955.333333
"456406",2020-02-24,0,0,0,0,6.88227053333333,57.9750001,95.65458335,957.458333
"456407",2020-02-25,0,0,0,0,4.43333342857143,52.4958333,67.62458335,964.083333
"456408",2020-02-26,0,0,0,0,6.26319441666667,50.8518516666667,90.12708335,965.5
"456409",2020-02-27,0,0,0,0,9.36715682352941,55.7083333,97.30375,961.916667
"456410",2020-02-28,0,0,0,0,7.56869828571429,58.1708335,98.71,958.416667
"456411",2020-02-29,0,0,0,0,10.5327381428571,57.1666667,92.84645835,949.916667
"456412",2020-03-01,0,0,0,0,5.1469444,56.4875001,115.70333325,944.375
"456413",2020-03-02,0,0,0,0,2.32756407692308,52.3958334,126.09875015,950.041667
"456414",2020-03-03,0,0,0,0,2.07640746153846,60.0374999,109.85041665,951.791667
"456415",2020-03-04,0,0,0,0,4.61678741666667,60.7749998,97.27749995,955.166667
"456416",2020-03-05,1,0,1,0,7.44285714285714,61.6666666666667,102.046875,957.291667
"456417",2020-03-06,1,0,0,0,7.32589285714286,64.3666666,87.91520845,951.916667
"456418",2020-03-07,1,0,0,0,6.71785707142857,63.0416666666667,103.56520825,945.875
"456419",2020-03-08,2,0,1,0,3.08910253846154,59.5333333,101.919375,948.25
"456420",2020-03-09,2,0,0,0,3.83044876923077,59.4907406666667,102.21375005,953.916667
"456421",2020-03-10,2,0,0,0,4.011197875,55.5833332,94.80375005,956.708333
"456422",2020-03-11,5,0,3,0,3.90032053846154,59.2685184444444,81.66229165,953.041667
"456423",2020-03-12,8,0,3,0,4.21160714285714,58.9833333,86.2745833,946.625
"456424",2020-03-13,16,0,8,0,1.71215275,55.9583333,102.529824526316,945
"456425",2020-03-14,16,0,0,0,2.611218,58.9666666,95.77375005,950.791667
"456426",2020-03-15,16,0,0,0,3.92202385714286,58.3703703333333,93.4629167,951.583333
"456427",2020-03-16,35,1,19,1,4.23482135714286,57.5375,87.1118751,948.875
"456428",2020-03-17,42,1,7,0,2.43214285714286,53.9083333,104.4222916,946.916667
"456429",2020-03-18,42,1,0,0,2.90993584615385,50.8249999,76.85854175,944.041667
"456430",2020-03-19,69,1,27,0,3.41448138461538,49.9916666,93.75895835,950.791667
"456431",2020-03-20,126,2,57,1,2.95059521428571,50.5208334,58.7370833,957.791667
"456432",2020-03-21,126,2,0,0,3.87692315384615,56.4791665,102.38687495,955.041667
"456433",2020-03-22,126,2,0,0,4.86472226666667,57.7222221111111,85.915625,953.958333
"456434",2020-03-23,126,4,0,2,2.06212127272727,58.4083333,113.3679166,951.083333
"456435",2020-03-24,212,4,86,0,2.57980769230769,59.4458334,97.6064583,951.208333
"456436",2020-03-25,249,6,37,2,3.79613092857143,58.3101853333333,108.9895834,946.5
"456437",2020-03-26,350,6,101,0,2.65693976923077,51.5916668,124.288125,946.125
"456438",2020-03-27,443,10,93,4,2.3113095,51.45,138.6695833,950
"456439",2020-03-28,443,10,0,0,4.54093135294118,55.4041665,87.8764583,954.083333
"456440",2020-03-29,528,14,85,4,4.34166669230769,57.2824073333333,112.5289582,951.083333
"456441",2020-03-30,753,14,225,0,4.18589746153846,59.3083333,102.16439395,956.416667
"456442",2020-03-31,869,23,116,9,5.35386907142857,63.7625,94.4827084,951.708333
"456443",2020-04-01,961,28,92,5,NA,NA,NA,NA
"456444",2020-04-02,1125,34,164,6,NA,NA,NA,NA
"456445",2020-04-03,1279,39,154,5,NA,NA,NA,NA
"456446",2020-04-04,1418,41,139,2,NA,NA,NA,NA
"456447",2020-04-05,1519,41,101,0,NA,NA,NA,NA
"456448",2020-04-06,1608,41,89,0,NA,NA,NA,NA
"456449",2020-04-07,1734,54,126,13,NA,NA,NA,NA

trap28
  • 97
  • 7
  • Can you fine what you're looking for here: https://stackoverflow.com/questions/33186316/linear-interpolate-missing-values-in-time-series – DS_UNI Nov 06 '20 at 09:47
  • I did the following : nev<-nev[c(1,7)] then data.frame(date = seq(nev$Date.Local[1], nev$Date.Local[nrow(nev)], by = 1)) %>% full_join(nev, by = "Date.Local") %>% mutate(approx = na.approx(pm2.5)) but this gave me an error: Error: Join columns must be present in data. x Problem with `Date.Local`. Run `rlang::last_error()` to see where the error occurred. – trap28 Nov 06 '20 at 09:55
  • Can you post a reproducible example, so we can try it out? maybe post the output of `dput(head(nev), 30)` – DS_UNI Nov 06 '20 at 10:01
  • I added the data file. – trap28 Nov 06 '20 at 10:28

2 Answers2

2

With the sample data you provided, zoo:na.approx as well as imputeTS::na_interpolation worked fine, however you need to be careful, because interpolation works when you need to fill missing values between at least two existing values, and the fact that your sample data has missing values all the way to the end of the data, will cause some functions (here imputeTS::na_interpolation) to fill the missing values with the last seen value, while others remove those NAs altogether (like zoo:na.approx). Here's an example.

read the data

options(stringsAsFactors = FALSE)
nev = read.csv(text='"","Date.Local","cumulative_cases","cumulative_deaths","new_cases","new_deaths","pm2.5","temp","wind","press"
"456373",2020-01-22,0,0,NA,NA,10.8892157058824,50.3291667,102.76500005,957.083333
"456374",2020-01-23,0,0,0,0,12.1433946153846,53.4416666,106.43416665,957.375
"456375",2020-01-24,0,0,0,0,9.71062371428572,53.4444445555556,104.74458335,953.541667
"456376",2020-01-25,0,0,0,0,13.090885375,54.0509258888889,106.2454166,955.083333
"456377",2020-01-26,0,0,0,0,10.4494047857143,54.1666665,107.4506249,954.125
"456378",2020-01-27,0,0,0,0,4.04497283333333,57.2541667,141.6875,958
"456379",2020-01-28,0,0,0,0,6.8008605625,52.9083334,101.40020845,956.958333
"456380",2020-01-29,0,0,0,0,4.70659716666667,53.2583333,108.96375,954.416667
"456381",2020-01-30,0,0,0,0,3.48005957142857,52.2416666,98.6919021,957.958333
"456382",2020-01-31,0,0,0,0,8.5216145625,52.3666666,102.20354165,963.208333
"456383",2020-02-01,0,0,0,0,13.1172619285714,54.911458375,108.7499999,961.75
"456384",2020-02-02,0,0,0,0,12.60625,57.2791668,105.81625,946.352941
"456385",2020-02-03,0,0,0,0,4.34877458823529,41.475,158.19916665,948.666667
"456386",2020-02-04,0,0,0,0,2.57147446153846,39.7583334,124.341666631579,956.958333
"456387",2020-02-05,0,0,0,0,4.95521478571429,40.7041667,97.09187505,956.75
"456388",2020-02-06,0,0,0,0,9.55396126666667,49.3425926666667,91.9258334,953.5
"456389",2020-02-07,0,0,0,0,11.0976967142857,54.3541667,104.406798210526,953.541667
"456390",2020-02-08,0,0,0,0,12.3601190714286,54.3009258888889,99.99000005,948.083333
"456391",2020-02-09,0,0,0,0,4.69017857142857,49.7685183333333,130.606875,948.083333
"456392",2020-02-10,0,0,0,0,2.33012815384615,52.7124999,109.421929842105,951.458333
"456393",2020-02-11,0,0,0,0,2.97980769230769,49.3958334,102.6325001,955.083333
"456394",2020-02-12,0,0,0,0,7.11474353846154,49.7349637,102.7526359,953.125
"456395",2020-02-13,0,0,0,0,8.01398807142857,50.4958333,97.35916665,954.625
"456396",2020-02-14,0,0,0,0,10.4294871538462,51.7625,100.0085417,954.041667
"456397",2020-02-15,0,0,0,0,11.1705556,53.7416667,103.3418751,955.416667
"456398",2020-02-16,0,0,0,0,13.39629625,56.3148147777778,98.75041665,952.791667
"456399",2020-02-17,0,0,0,0,9.0129166,58.9958334,104.1306249,950.833333
"456400",2020-02-18,0,0,0,0,6.68954376923077,52.3875,85.109375,954.333333
"456401",2020-02-19,0,0,0,0,8.20976971428571,52.8208334,82.0013157368421,955.166667
"456402",2020-02-20,0,0,0,0,5.87976185714286,52.6125001,99.1868421578947,959.708333
"456403",2020-02-21,0,0,0,0,9.2473958125,56.2083334,97.72770845,956.166667
"456404",2020-02-22,0,0,0,0,8.31025638461539,53.0046297777778,84.2597917,950.458333
"456405",2020-02-23,0,0,0,0,5.02708333333333,55.6342593333333,110.4885418,955.333333
"456406",2020-02-24,0,0,0,0,6.88227053333333,57.9750001,95.65458335,957.458333
"456407",2020-02-25,0,0,0,0,4.43333342857143,52.4958333,67.62458335,964.083333
"456408",2020-02-26,0,0,0,0,6.26319441666667,50.8518516666667,90.12708335,965.5
"456409",2020-02-27,0,0,0,0,9.36715682352941,55.7083333,97.30375,961.916667
"456410",2020-02-28,0,0,0,0,7.56869828571429,58.1708335,98.71,958.416667
"456411",2020-02-29,0,0,0,0,10.5327381428571,57.1666667,92.84645835,949.916667
"456412",2020-03-01,0,0,0,0,5.1469444,56.4875001,115.70333325,944.375
"456413",2020-03-02,0,0,0,0,2.32756407692308,52.3958334,126.09875015,950.041667
"456414",2020-03-03,0,0,0,0,2.07640746153846,60.0374999,109.85041665,951.791667
"456415",2020-03-04,0,0,0,0,4.61678741666667,60.7749998,97.27749995,955.166667
"456416",2020-03-05,1,0,1,0,7.44285714285714,61.6666666666667,102.046875,957.291667
"456417",2020-03-06,1,0,0,0,7.32589285714286,64.3666666,87.91520845,951.916667
"456418",2020-03-07,1,0,0,0,6.71785707142857,63.0416666666667,103.56520825,945.875
"456419",2020-03-08,2,0,1,0,3.08910253846154,59.5333333,101.919375,948.25
"456420",2020-03-09,2,0,0,0,3.83044876923077,59.4907406666667,102.21375005,953.916667
"456421",2020-03-10,2,0,0,0,4.011197875,55.5833332,94.80375005,956.708333
"456422",2020-03-11,5,0,3,0,3.90032053846154,59.2685184444444,81.66229165,953.041667
"456423",2020-03-12,8,0,3,0,4.21160714285714,58.9833333,86.2745833,946.625
"456424",2020-03-13,16,0,8,0,1.71215275,55.9583333,102.529824526316,945
"456425",2020-03-14,16,0,0,0,2.611218,58.9666666,95.77375005,950.791667
"456426",2020-03-15,16,0,0,0,3.92202385714286,58.3703703333333,93.4629167,951.583333
"456427",2020-03-16,35,1,19,1,4.23482135714286,57.5375,87.1118751,948.875
"456428",2020-03-17,42,1,7,0,2.43214285714286,53.9083333,104.4222916,946.916667
"456429",2020-03-18,42,1,0,0,2.90993584615385,50.8249999,76.85854175,944.041667
"456430",2020-03-19,69,1,27,0,3.41448138461538,49.9916666,93.75895835,950.791667
"456431",2020-03-20,126,2,57,1,2.95059521428571,50.5208334,58.7370833,957.791667
"456432",2020-03-21,126,2,0,0,3.87692315384615,56.4791665,102.38687495,955.041667
"456433",2020-03-22,126,2,0,0,4.86472226666667,57.7222221111111,85.915625,953.958333
"456434",2020-03-23,126,4,0,2,2.06212127272727,58.4083333,113.3679166,951.083333
"456435",2020-03-24,212,4,86,0,2.57980769230769,59.4458334,97.6064583,951.208333
"456436",2020-03-25,249,6,37,2,3.79613092857143,58.3101853333333,108.9895834,946.5
"456437",2020-03-26,350,6,101,0,2.65693976923077,51.5916668,124.288125,946.125
"456438",2020-03-27,443,10,93,4,2.3113095,51.45,138.6695833,950
"456439",2020-03-28,443,10,0,0,4.54093135294118,55.4041665,87.8764583,954.083333
"456440",2020-03-29,528,14,85,4,4.34166669230769,57.2824073333333,112.5289582,951.083333
"456441",2020-03-30,753,14,225,0,4.18589746153846,59.3083333,102.16439395,956.416667
"456442",2020-03-31,869,23,116,9,5.35386907142857,63.7625,94.4827084,951.708333
"456443",2020-04-01,961,28,92,5,NA,NA,NA,NA
"456444",2020-04-02,1125,34,164,6,NA,NA,NA,NA
"456445",2020-04-03,1279,39,154,5,NA,NA,NA,NA
"456446",2020-04-04,1418,41,139,2,NA,NA,NA,NA
"456447",2020-04-05,1519,41,101,0,NA,NA,NA,NA
"456448",2020-04-06,1608,41,89,0,NA,NA,NA,NA
"456449",2020-04-07,1734,54,126,13,NA,NA,NA,NA')

Try:

imputeTS::na_interpolation(nev$temp)
# or
# zoo::na.approx(nev$temp)

now add a value to the end of the values and try again to see what happens:

imputeTS::na_interpolation(c(nev$temp, 7.293))
# or
# zoo::na.approx(c(nev$temp, 7.293))

and you can apply the above on all columns with mutate_all

library(dplyr)

nev_clean <- nev %>% mutate_all(imputeTS::na_interpolation) 

nev %>% is.na() %>% colSums()
# X        Date.Local  cumulative_cases cumulative_deaths         new_cases        new_deaths 
# 0                 0                 0                 0                 1                 1 
# pm2.5              temp              wind             press 
# 7                 7                 7                 7 

nev_clean %>% is.na() %>% colSums()
# X        Date.Local  cumulative_cases cumulative_deaths         new_cases        new_deaths 
# 0                 0                 0                 0                 0                 0 
# pm2.5              temp              wind             press 
# 0                 0                 0                 0 
DS_UNI
  • 2,600
  • 2
  • 11
  • 22
  • I tried your code. I fixed the NAs at the beginning of the column, and the last element. However, the rest NA values remained. – trap28 Nov 06 '20 at 10:47
  • what exactly did you use? did you run `dplyr::mutate_all(nev, imputeTS::na_interpolation)`? and are you using the same dataset you posted in the question? – DS_UNI Nov 06 '20 at 10:50
  • I followed your code exactly. All the steps work with no errors. However, at the end, my dataframe ended up with only 77 rows instead of 252, with the last 6 being NA for the last 4 columns. – trap28 Nov 06 '20 at 10:56
  • the sample you posted (which I work with), has only 77 rows (rows from 456373 to 456449) – DS_UNI Nov 06 '20 at 11:04
  • I checked again and it's working fine with me, I edit my answer with the number of NAs in each column before and after the interpolation – DS_UNI Nov 06 '20 at 11:09
  • It works! However, is it normal that the NAs in each of the imputed columns got replaced by one value exactly? For ex: all pm2.5 NAs were replaced with 5.353. Also, how can I change it to spline interp? – trap28 Nov 06 '20 at 11:18
  • yes it is, that's because you don't have an end value, I explained that in my answer with an example, take a look above – DS_UNI Nov 06 '20 at 11:20
1

There are several problems:

  • na.approx does interpolation and interpolation means that for a vector of numbers it uses the values on either side of any NA (or stretch of NAs) to fill in the NAs; however, here the NAs are on the ends, not surrounded by non-NAs.

  • interpolation only applies to numeric columns but the columns are not all numeric. For a similar more familiar example, log(nev) also gives an error because log expects numeric input but not all input is numeric.

Any of the following can be used to fill in the values on the ends (or omit them in the last one) assuming that nev is a data frame with the Date column being the first column:

# gives no errors but there is nothing to interpolate so nev unchanged
# This excludes the non-numeric column from the interpolation.
nev[-1] <- na.approx(nev[-1])

# interpolate intermediate values (but thre are none) and extend values at the ends
nev[-1] <- na.approx(nev[-1], rule = 2)

# similar but does not overwrite
replace(nev, -1, na.approx(nev[-1], rule = 2))

# last value carried forward and perform the same in reverse order
na.locf(na.locf(nev), fromLast = TRUE)

# convert to zoo object first
nevz <- read.zoo(nev)
na.approx(nevz, rule = 2)

# use na.fill extending values at the ends
nev[-1] <- na.fill(nev[-1], fill = "extend")

# similar but does not overwrite
replace(nev, -1, na.fill(nev[-1], fill = "extend"))

# use na.fill but fill with zeros
nev[-1] <- na.fill(nev[-1], fill = 0)

# similar but does not overwrite
replace(nev, -1, na.fill(nev[-1], fill = 0))

# same
replace(nev, is.na(nev), 0)

# drop rows with NAs
na.omit(nev)

See ?na.approx, ?na.fill, ?na.locf, ?approx for more information. There are also other na.* routines in zoo: grep("^na\\.", ls("package:zoo"), value = TRUE)

G. Grothendieck
  • 254,981
  • 17
  • 203
  • 341
  • Thank you for the explanation! I believe that I was supposed to extrapolate the NAs at the end. I checked with my colleague who created the dataset. The last few values for the last 4 columns were never collected. This is why they are NAs. – trap28 Nov 06 '20 at 11:42