I am a student trying to analyse factors that increase the probability of livestock killing by predators in Northern Indian landscape. In order to do so I need a list of covariate that I will use for my final logistic regression model. The question I am posing is related to one of my covariates - the number of recurrent livestock killings. A recurrent event is defined as a kill that takes place less than 500meters and within 7 days of the preceding kill. (Note - emphasis on preceding kill. This is because a group of recurrent events can extend longer than 7 days or beyond the 500m threshold so long as the space between each kill is at most 7 days or 500m) I have a dataset that includes the following - GPS coordinates of kill location, date of kill, predator and prey species. For this question, only the date and location of kill is relevant. This is an example of my raw data
rawq <- structure(list(long = c(79.31957, 79.86758, 79.32283, 79.32253, 79.30502, 79.30047, 79.19935, 79.1984, 79.1984, 79.19318, 79.19247, 79.19243, 79.18512, 79.18333, 79.1541, 79.14438, 79.13998, 79.13538, 79.12522, 79.10168, 79.04163, 79.0405, 79.0364, 79.03505, 79.03473, 79.03428, 79.02517, 79.02458, 79.02428, 79.02265, 79.0197, 79.0197, 79.01967, 79.01965, 79.0193, 79.019, 79.01873, 79.0187, 79.0181, 79.01775, 79.00998, 79.00907, 79.00692, 79.00655, 79.0057, 79.00565, 79.00463, 79.00462, 79.00453, 79.00427, 79.0041, 79.00222, 79.00117, 79.00088, 79.00073, 78.9928, 78.9888, 78.98878, 78.98877, 78.9887, 78.98542, 78.98523, 78.9852, 78.98445, 78.97907, 78.9775, 78.9761, 78.97607, 78.97537, 78.97432, 78.97393, 78.97343, 78.97083, 78.95655, 78.9394, 78.92815, 78.92353, 78.92353, 78.92353, 78.92088, 78.92045, 78.91933, 78.91738, 78.90997, 78.90223, 78.90013, 78.90013, 78.88645, 78.8856, 78.8856, 78.8856, 78.88557, 78.86903, 78.86765, 78.85663, 78.8562, 78.85588, 78.8548, 78.83507, 78.80902),
lat = c(29.3684, 29.35495, 29.39068, 29.3907, 29.34828, 29.30717, 29.26702, 29.25967, 29.25967, 29.2672, 29.25588, 29.25588, 29.46787, 29.34175, 29.42, 29.42098, 29.41918, 29.34048, 29.49228, 29.53947, 29.37597, 29.37652, 29.3591, 29.35055, 29.31765, 29.37003, 29.3125, 29.35305, 29.40163, 29.39007, 29.37155, 29.35098, 29.35515, 29.35517, 29.37277, 29.37068, 29.37345, 29.36962, 29.32252, 29.37657, 29.35432, 29.37653, 29.38792, 29.36958, 29.36803, 29.36808, 29.33892, 29.37277, 29.36908, 29.36773, 29.34068, 29.40667, 29.3076, 29.40875, 29.32183, 29.4093, 29.4075, 29.40742, 29.40738, 29.35965, 29.36952, 29.35965, 29.35967, 29.39118, 29.38528, 29.36535, 29.3598, 29.3598, 29.40777, 29.36418, 29.30988, 29.37605, 29.36813, 29.30137, 29.40247, 29.40767, 29.40455, 29.40455, 29.40455, 29.4092, 29.40532, 29.35217, 29.40328, 29.4023, 29.38242, 29.37243, 29.37243, 29.37205, 29.36975, 29.36975, 29.36975, 29.36972, 29.3721, 29.37023, 29.36867, 29.38953, 29.36808, 29.36865, 29.3841, 29.35162),
Cattle.sp. = structure(c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("Boffalo(Calf)", "Buffalo", "Buffalo(Calf)", "BufFalo(Calf)", "Buffalo(S/A/)", "Bullack(S/A/)", "Bullock", "Bullock(S,A/)", "Bullock(S/A)", "Bullock(S/A/)", "Cow", "Cow (Calf)", "Cow(calf)", "Cow(Calf)", "Cow(S/A)", "Cow(S/A/)", "Horse"), class "factor"),
Predator = structure(c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("Leopard", "Tiger"), class = "factor"),
Date = structure(c(16891, 17036, 17141, 17141, 16833, 16898, 16845, 16845, 16845, 17125, 17125, 17125, 17005, 17100, 17015, 16813, 16886, 17064, 17096, 16937, 17070, 17101, 17020, 17079, 17164, 16993, 16958, 17010, 17132, 17075, 17023, 17010, 16975, 16975, 16987, 17121, 17054, 17090, 16938, 16984, 16928, 16967, 16927, 16967, 17084, 16975, 16975, 16927, 16975, 16972, 16977, 16928, 17020, 17030, 16979, 16822, 17087, 17156, 17156, 17114, 16959, 17400, 17035, 17037, 17056, 16845, 16984, 16984, 16961, 16861, 17100, 17010, 17034, 16823, 17039, 16819, 16968, 16968, 16968, 16802, 16942, 17098, 16975, 16975, 16836, 17138, 17138, 16808, 16936, 16941, 16949, 16949, 16986, 16986, 16986, 16914, 16925, 16884, 17130, 17041), class = "Date")),
.Names = c("long", "lat", "Cattle.sp.", "Predator", "Date"), row.names = c(NA, 100L), class = "data.frame")
I used the following code, to first generate pairwise distance matrices for both distance and time respectively. I produced a table coloc.distance which consisted all the pairs of points less than 500m away from each other. I then selected the rows in which time was less than 7 days. To produce the final table consisting of all the pairs of points for which distance < 500m AND time < 7days.
raw = read.csv("newraw.csv", header = T)
library(measurements)
library(fossil)
library(data.table)
raw$Date <- as.Date(raw$Date) #formatting date
a = as.matrix(dist(raw$Date)) #generate pairwise distance matrix
m <- as.matrix(earth.dist(raw)) #pairwise distances
sep.km <- 0.5 #threshold distance
coloc.distance <- data.table(which(m<sep.km, arr.ind=T)) #choose
all pairs for which distance falls under threshold
setnames(coloc.distance,c("row","col"),c("SD.1","SD.2")) #adjust names
coloc.distance <- coloc.distance[SD.1<SD.2,] #lower triangular matrix
coloc.distance[,dist:=m[SD.1,SD.2], by="SD.1,SD.2"]
coloc.distance[,time:=a[SD.1,SD.2], by="SD.1,SD.2"]
killsite <- data.table(id=as.integer(rownames(raw)),raw)
setkey(coloc.distance,SD.1)
coloc.distance[killsite,c("long.1","lat.1"):=list(long,lat)]
setkey(coloc.distance,SD.2)
coloc.distance[killsite,c("long.2","lat.2"):=list(long,lat)]
setkey(coloc.distance, SD.1)
coloc.distance[killsite, "date.1":=list(Date)]
setkey(coloc.distance, SD.2)
coloc.distance[killsite,"date.2" :=list(Date)]
finalrows <- data.table(which(coloc.distance$time<7, arr.ind = T))
final <- coloc.distance[finalrows$V1,]
print(final)
My problem is that I don't just want pairs of recurrent events. In some instances there are more than one or two recurrent events. I need a way to sort pairs with shared points into groups. For example, if my code returns the following pairs -
[1,2][1,3][4,5][6,7][3,8]
the groups I would assign are [1,2,3,8] and [4,5]. Is there a function that would make these groups automatically. It is clear that 8 is also related to 1 and 2 because it is related to 3.
For the dataframe "final" from the code above, these are the output data that I would hope to recieve
This is the final dataframe generated by the code for the first 50 entries
final <- structure(list(SD.1 = c(3L, 8L, 11L, 33L, 35L, 46L, 44L, 46L, 49L, 47L, 58L),
SD.2 = c(4L, 9L, 12L, 34L, 40L, 49L, 50L, 50L, 50L, 51L, 59L),
dist = c(0.0291582094908988, 0, 0.00388155718182341, 0.00295090125908757, 0.448566716059333, 0.155426365139648, 0.301966362430361, 0.139316330866369, 0.152255819956674, 0.202390901062769, 0.00455334236486617),
time = c(0, 0, 0, 0, 3, 0, 5, 3, 3, 2, 0),
long.1 = c(79.32283, 79.1984, 79.19247, 79.01967, 79.0193, 79.00565, 79.00655, 79.00565, 79.00453, 79.00463, 78.98878),
lat.1 = c(29.39068, 29.25967, 29.25588, 29.35515, 29.37277, 29.36808, 29.36958, 29.36808, 29.36908, 29.33892, 29.40742),
long.2 = c(79.32253, 79.1984, 79.19243, 79.01965, 79.01775, 79.00453, 79.00427, 79.00427, 79.00427, 79.0041, 78.98877),
lat.2 = c(29.3907, 29.25967, 29.25588, 29.35517, 29.37657, 29.36908, 29.36773, 29.36773, 29.36773, 29.34068, 29.40738),
date.1 = structure(c(17141, 16845, 17125, 16975, 16987, 16975,16967, 16975, 16975, 16975, 17156), class = "Date"),
date.2 = structure(c(17141, 16845, 17125, 16975, 16984, 16975, 16972, 16972, 16972, 16977, 17156), class = "Date")),
.Names = c("SD.1", "SD.2", "dist", "time", "long.1", "lat.1", "long.2", "lat.2", "date.1", "date.2"),
sorted = "SD.2", class = c("data.table", "data.frame"), row.names = c(NA, -11L))
As you can see there are 11 pairs in this table -
[3,4] [8,9] [11,12] [33,34] [35,40] [46,49] [44,50] [46,50] [49,50] [47,51] [58,59]
A sufficient output is as follows -
[3,4] [8,9] [11,12] [33,34] [35,40] [44,46,49,50] [58,59]
Within this limited dataset, there is only one group with more than 2 events. Of course, in my larger set, there are many more. This was just the most readily available set.