Try this:
Euclidean distance:
> # Largest pairwise Euclidean distance within each group, computed on columns 1:2 of `data`.
> system.time(out1 <- tapply(seq_len(nrow(data)), data$group, function(x) max(dist(data[x, 1:2]))))
user system elapsed
0.14 0.00 0.14
> out1
1 2 3 4 5 6 7 8 9 10
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123
WGS84:
> library(sp)  # load once, and fail loudly if sp is not installed
> # Maximum pairwise distance among the rows of the global `data` indexed by `x`.
> # `x` is an integer vector of row numbers (as supplied by tapply()).
> # Columns 1:2 must be named "longitude" and "latitude". Because the points are
> # tagged as unprojected long/lat on the WGS84 ellipsoid, spDists() returns
> # great-circle distances (documented by sp as kilometres).
> auxF <- function(x) {
+   tempsf <- data[x, 1:2]
+   coordinates(tempsf) <- c("longitude", "latitude")
+   proj4string(tempsf) <- "+proj=longlat +ellps=WGS84 +no_defs"
+   max(spDists(tempsf))
+ }
>
> # Apply auxF to the row indices of each group.
> system.time(out2 <- tapply(seq_len(nrow(data)), data$group, auxF))
user system elapsed
4.71 0.00 4.76
> out2
1 2 3 4 5 6 7 8 9 10
19646.04 19217.48 19223.27 19543.99 19318.55 18856.65 19334.11 19679.45 18840.90 19460.14
Haversine method:
> # `fun = distHaversine` must be an argument of distm(), not of as.matrix();
> # otherwise distm() silently falls back to its default distance function.
> # distHaversine() reports metres with its default radius.
> system.time(out3 <- tapply(seq_len(nrow(data)), data$group, function(x) max(distm(as.matrix(data[x, .(longitude, latitude)]), fun = distHaversine))))
user system elapsed
13.24 0.01 13.30
> out3
1 2 3 4 5 6 7 8 9 10
19644749 19216989 19223012 19542956 19317958 18856273 19333424 19677917 18840641 19459353
For 7 million records you can assume Euclidean distance, or project your points onto a plane so that you can work with Euclidean distance. The maximum distance within each group is always attained between two points on that group's convex hull, so it is enough to compute the pairwise distances among the hull points only. This greatly reduces the number of operations and does not require a lot of RAM:
> # Restrict dist() to the convex-hull vertices of each group; the group's
> # coordinates are subset once and reused for both chull() and dist().
> system.time(out4 <- tapply(seq_len(nrow(data)), data$group, function(x) {
+   pts <- data[x, 1:2]
+   max(dist(pts[chull(pts), ]))
+ }))
user system elapsed
0.03 0.00 0.03
> out4
1 2 3 4 5 6 7 8 9 10
199.2716 197.1172 194.7018 197.2652 196.3747 197.6728 194.7344 197.8781 195.3837 195.0123
With big data:
> # Simulate 7 million random points and assign each one to a random group.
> data <- data.table(latitude = sample(seq(0, 90, by = 0.001), 7000000, replace = TRUE),
+                    longitude = sample(seq(0, 180, by = 0.001), 7000000, replace = TRUE))
> groupn <- nrow(data) / 700000
> data$group <- sample(seq(1, groupn, by = 1), 7000000, replace = TRUE)
>
> # dist() stores all n * (n - 1) / 2 pairwise distances of a group at once,
> # which is why the full-distance approach cannot allocate memory here.
> system.time(out1 <- tapply(seq_len(nrow(data)), data$group, function(x) max(dist(data[x, 1:2]))))
Error: cannot allocate vector of size 1824.9 Gb
Called from: dist(data[x, 1:2])
Browse[1]>
Timing stopped at: 7.81 0.06 7.91
> # Same computation restricted to each group's convex-hull vertices; the
> # group's coordinates are subset once and reused for chull() and dist().
> system.time(out4 <- tapply(seq_len(nrow(data)), data$group, function(x) {
+   pts <- data[x, 1:2]
+   max(dist(pts[chull(pts), ]))
+ }))
user system elapsed
8.41 0.22 8.64