Revised and Updated Posting
I have two unequal time series that I would like to merge. x contains 20,000+ rows while y contains less than 5,000 rows. x is regularly spaced at 15 minute intervals while y is an irregular time series.
I want to combine x and y based on whether the dates-times in y are within 10 minutes of the dates-times in x. For those dates-times in x and y that are the same, then I want the rows to be added using a function that is shown below.
I had written x incorrectly previously so I am correcting it below:
library(data.table)
dput(x)
x <- structure(list(Date = structure(c(1078077600, 1080028800, 1080029700,
1080030600, 1080031500, 1091220300, 1091221200, 1091222100, 1091223000,
1091224800, 1091225700, 1091226600, 1091227500, 1091228400), class =
c("POSIXct", "POSIXt"), tzone = "Etc/GMT-6"), V1 = c(1.6, 1.9, 1.9, 2, 2,
1.4, 1.4, 1.5, 1.5, 1.6, 2.6, 2.8, 3.4, 3.8), V2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), V3 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), V4 = c(1.5, 2.3, 2.3, 2.4, 2.4, 7.8, 3.2,
4.9, 4.7, 3.4, 17.1, 25.4, 16.9, 30.6)), .Names = c("Date", "V1",
"V2", "V3", "V4"), row.names = c(NA, -14L), class = "data.frame")
# Date V1 V2 V3 V4
# 1 2004-03-01 00:00:00 1.6 0 0 1.5
# 2 2004-03-23 14:00:00 1.9 0 0 2.3
# 3 2004-03-23 14:15:00 1.9 0 0 2.3
# 4 2004-03-23 14:30:00 2.0 0 0 2.4
# 5 2004-03-23 14:45:00 2.0 0 0 2.4
# 6 2004-07-31 02:45:00 1.4 0 0 7.8
# 7 2004-07-31 03:00:00 1.4 0 0 3.2
# 8 2004-07-31 03:15:00 1.5 0 0 4.9
# 9 2004-07-31 03:30:00 1.5 0 0 4.7
# 10 2004-07-31 04:00:00 1.6 0 0 3.4
# 11 2004-07-31 04:15:00 2.6 0 0 17.1
# 12 2004-07-31 04:30:00 2.8 0 0 25.4
# 13 2004-07-31 04:45:00 3.4 0 0 16.9
# 14 2004-07-31 05:00:00 3.8 0 0 30.6
dput(y)
y <- structure(list(Date = structure(c(1076902200, 1080029700, 1091221800,
1091224800, 1091226600), class = c("POSIXct", "POSIXt"),
tzone = "Etc/GMT-6"), V1 = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), V2 = c(40, 42, 0, 0, 0), V3 = c(0, 0, 0, 0, 0), V4 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("Date",
"V1", "V2", "V3", "V4"), row.names = c(NA, -5L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000000110788>,
sorted = "Date")
# Date V1 V2 V3 V4
# 1: 2004-02-16 09:30:00 NA 40 0 NA
# 2: 2004-03-23 14:15:00 NA 42 0 NA
# 3: 2004-07-31 03:10:00 NA 0 0 NA
# 4: 2004-07-31 04:00:00 NA 0 0 NA
# 5: 2004-07-31 04:30:00 NA 0 0 NA
Based off of the answer by BondedDust in Using `:=` in data.table to sum the values of two columns in R, ignoring NAs, I wrote a dyadic operator for "+" when the dates-times are the same in x and y.
`%+na%` <- function(x, y) {ifelse(x == 0 & is.na(y) == TRUE, NA,
ifelse(x != 0 & is.na(y) == TRUE, x, x+y))}
Based on Arun's answer, I have the following code:
setkey(setDT(x), Date)
setkey(setDT(y), Date)
xidx = x[y, roll = "nearest", which = TRUE]
yidx = which(abs(x$Date[xidx] - y$Date) <= 600)
xy <- rbind(x, y[yidx, ])
This is what xy looks like at this point:
dput(xy)
xy <- structure(list(Date = structure(c(1078077600, 1080028800, 1080029700,
1080030600, 1080031500, 1091220300, 1091221200, 1091222100, 1091223000,
1091224800, 1091225700, 1091226600, 1091227500, 1091228400, 1080029700,
1091221800, 1091224800, 1091226600), class = c("POSIXct", "POSIXt"
), tzone = "Etc/GMT-6"), V1 = c(1.6, 1.9, 1.9, 2, 2, 1.4, 1.4,
1.5, 1.5, 1.6, 2.6, 2.8, 3.4, 3.8, NA, NA, NA, NA), V2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0), V3 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), V4 = c(1.5,
2.3, 2.3, 2.4, 2.4, 7.8, 3.2, 4.9, 4.7, 3.4, 17.1, 25.4, 16.9,
30.6, NA, NA, NA, NA)), .Names = c("Date", "V1", "V2", "V3",
"V4"), row.names = c(NA, -18L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000000000200788>)
# Date V1 V2 V3 V4
# 1: 2004-03-01 00:00:00 1.6 0 0 1.5
# 2: 2004-03-23 14:00:00 1.9 0 0 2.3
# 3: 2004-03-23 14:15:00 1.9 0 0 2.3
# 4: 2004-03-23 14:30:00 2.0 0 0 2.4
# 5: 2004-03-23 14:45:00 2.0 0 0 2.4
# 6: 2004-07-31 02:45:00 1.4 0 0 7.8
# 7: 2004-07-31 03:00:00 1.4 0 0 3.2
# 8: 2004-07-31 03:15:00 1.5 0 0 4.9
# 9: 2004-07-31 03:30:00 1.5 0 0 4.7
# 10: 2004-07-31 04:00:00 1.6 0 0 3.4
# 11: 2004-07-31 04:15:00 2.6 0 0 17.1
# 12: 2004-07-31 04:30:00 2.8 0 0 25.4
# 13: 2004-07-31 04:45:00 3.4 0 0 16.9
# 14: 2004-07-31 05:00:00 3.8 0 0 30.6
# 15: 2004-03-23 14:15:00 NA 42 0 NA
# 16: 2004-07-31 03:10:00 NA 0 0 NA
# 17: 2004-07-31 04:00:00 NA 0 0 NA
# 18: 2004-07-31 04:30:00 NA 0 0 NA
The next line is modified from the solution provided by akrun here: Identifying duplicated rows.
xy[, lapply(.SD, xy[which(duplicated(xy))] %+na% xy[which(duplicated(xy,
fromLast = TRUE))]), keyby = Date]
Can someone suggest a modification to this last line so that I don't have this error message?:
# Note the new error message that I am receiving:
# Error in matrix(unlist(value, recursive = FALSE, use.names = FALSE),
# nrow = nr, : length of 'dimnames' [2] not equal to array extent
Thank you in advance.
This is the final, expected result below (it has been changed, see row 8 with the NAs):
dput(xy)
xy <- structure(list(Date = structure(c(1078077600, 1080028800, 1080029700,
1080030600, 1080031500, 1091220300, 1091221200, 1091221800, 1091222100,
1091223000, 1091224800, 1091225700, 1091226600, 1091227500, 1091228400
), class = c("POSIXct", "POSIXt"), tzone = "Etc/GMT-6"), V1 = c(1.6,
1.9, 1.9, 2, 2, 1.4, 1.4, NA, 1.5, 1.5, 1.6, 2.6, 2.8, 3.4, 3.8
), V2 = c(0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), V3 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), V4 = c(1.5, 2.3, 2.3,
2.4, 2.4, 7.8, 3.2, NA, 4.9, 4.7, 3.4, 17.1, 25.4, 16.9, 30.6
)), .Names = c("Date", "V1", "V2", "V3", "V4"), row.names = c(NA,
-15L), class = "data.frame")
# Date V1 V2 V3 V4
# 1 2004-03-01 00:00:00 1.6 0 0 1.5
# 2 2004-03-23 14:00:00 1.9 0 0 2.3
# 3 2004-03-23 14:15:00 1.9 42 0 2.3
# 4 2004-03-23 14:30:00 2.0 0 0 2.4
# 5 2004-03-23 14:45:00 2.0 0 0 2.4
# 6 2004-07-31 02:45:00 1.4 0 0 7.8
# 7 2004-07-31 03:00:00 1.4 0 0 3.2
# 8 2004-07-31 03:10:00 NA 0 0 NA <-- Notice: the change here
# 9 2004-07-31 03:15:00 1.5 0 0 4.9
# 10 2004-07-31 03:30:00 1.5 0 0 4.7
# 11 2004-07-31 04:00:00 1.6 0 0 3.4
# 12 2004-07-31 04:15:00 2.6 0 0 17.1
# 13 2004-07-31 04:30:00 2.8 0 0 25.4
# 14 2004-07-31 04:45:00 3.4 0 0 16.9
# 15 2004-07-31 05:00:00 3.8 0 0 30.6
The NA columns in xy will be interpolated using na.approx thus the change from the original question posted here.
UPDATED POSSIBLE SOLUTION
The following section of code is taken from Arun's answer:
setkey(setDT(x), Date)
setkey(setDT(y), Date)
xidx = x[y, roll = "nearest", which = TRUE]
yidx = which(abs(x$Date[xidx] - y$Date) <= 600)
xy <- rbind(x, y[yidx, ])
setkey(xy, Date)
Both a and b are derived from Identifying duplicated rows
a <- which(duplicated(xy, fromLast = TRUE))
b <- which(duplicated(xy))
Can someone offer a better way to proceed with the following steps?
xyadd <- vector("list", length(a)) # pre-allocate the list since it is
# being used in a for loop / Sources:
Add a Column to a Dataframe From a List of Values and Why does is.vector() return TRUE for list?
for(u in seq(a))
{
xyadd[[u]] <- xy[a[u], .SD, .SDcols = 2:5] %+na% xy[b[u], .SD, .SDcols = 2:5]
}
xyadd2 <- data.frame(unlist(xyadd))
xyadd2 <- ifelse(nrow(xyadd2) > prod(length(a)*4), xyadd2 <-
data.frame(xyadd2[-nrow(xyadd2), ]), xyadd2)
# 4 comes from the 4 columns that I have
xyadd2 <- xyadd2[1][[1]]
xyadd2 <- matrix(data = xyadd2, nrow = length(a), ncol = 4, byrow = TRUE)
xyadd2 <- as.data.frame(xyadd2)
xyadd2 <- setDT(xyadd2)
xy[a, `:=` (V1 = xyadd2[, V1], V2 = xyadd2[, V2], V3 = xyadd2[, V3],
V4 = xyadd2[, V4])]
xy <- xy[-b, ]
I get the same xy as shown above.
I have not posted the potential solution above as an answer because I would like to receive feedback on making the code more efficient. Data set x consists of 20,000+ rows while data set y consists of less than 5,000 rows. The solution will need to be applied to about 20 file sets.
Any assistance will be greatly appreciated.
Thank you in advance.