0

I have ship track data that has position errors that need removing. The data are time-stamped and the time stamps are all correct, so it's just the positions that are wrong. The track data below is thinned down from many thousands of points, and ultimately I will be processing many thousands of tracks, where each track has a unique identifier. So I guess I'm looking for something that I can either run in a loop over the identified tracks or use with some kind of apply function.

library(raster)
lon<-c(-51.78193,-51.81608,-51.84973,-51.87794,-51.90512,-51.93439,-51.97748,-51.99078,-52.01859,-52.05304,-52.09032,-52.11666,-52.14285,-52.148,-52.17649,-52.20045,-52.23174,-52.26725,-52.54347,-52.58133,-52.60582,-52.63488,-52.68516,-52.71636,-52.73323,-52.75245,-52.7841,-52.81515,-52.85468,-52.87921,-52.90336,-52.93189,-52.9668,-53.00284,-53.03107,-53.05711,-53.08144,-53.11514,-53.14655,-53.17014,-53.20378,-53.23456,-53.2592,-53.28848,-53.32375,-53.34618,-53.37059,-53.40495,-53.44164,-53.46389,-53.49467,-53.52195,-53.54907,-53.58261,-53.61221,-53.63979,-53.6711,-53.70189,-53.73083,-53.75592,-53.79139,-53.81248,-53.84491,-53.87275,-53.90827,-53.94056,-53.20628,-53.29157,-53.37664,-53.43295,-53.50926,-53.54763,-53.5308,-53.45378,-53.36347,-53.28677,-53.21579,-53.26883,-53.32472,-53.42001,-53.53779,-53.54862,-53.54463,-53.54347,-53.54112,-53.53776,-53.53544,-53.53371,-53.53253,-53.54352,-53.67201,-53.84997,-54.02544,-54.06773,-54.08187,-54.01061,-53.88699,-53.83143,-53.81131,-53.81129,-53.66085,-53.50664,-53.36597,-53.35885,-53.35733,-53.35396,-53.35365,-53.29068,-53.25288,-53.82416,-53.79995,-53.80232,-53.79829,-53.79229,-53.7916,-53.78755,-53.78941,-53.7011,-53.69994,-53.6904,-53.67371,-53.65907,-53.64894,-53.63509,-53.63701,-53.63439,-53.62771,-53.61821,-53.60222,-53.60806,-53.70518,-53.87085,-53.96563,-53.96513,-53.90161,-53.82244,-53.93002,-53.99709,-53.99348,-53.97484,-53.9677,-53.96035,-53.94005,-53.9383,-53.9297,-53.91872,-53.91387,-53.87099,-53.86221,-53.84477,-53.82911,-53.82378,-53.94748,-54.03159,-53.98624,-53.88608,-53.86299,-54.03941,-54.05013,-54.03648,-54.00837,-53.99941,-53.99059,-53.99127,-53.98312,-53.97016,-53.95708,-53.94976,-53.93531,-53.90229,-53.89552,-53.88579,-53.87435,-53.86112,-53.84461,-53.83519,-53.82621,-53.81792,-53.77803,-53.77256,-53.7618,-53.74539,-53.73061,-53.71999,-53.70535,-53.68502,-53.67025,-53.63705,-53.61305,-53.60845,-53.6012,-53.60221,-53.60939,-53.77555,-53.91931,-54.04305,-54.14216,-54.24885,-54.35944,-54.45926,-54.5711
4,-54.65697,-54.75472,-54.81421,-54.77102,-54.79547,-54.73668,-54.78792,-54.82384,-54.81429,-54.80961,-54.7985,-54.78972,-54.77996,-54.77269,-54.79461,-54.79427,-53.48531,-53.47651,-53.4668,-53.45837,-53.45424,-53.44696,-53.44312,-53.43799,-53.43141,-53.41028,-53.44231,-53.47333,-53.46801,-53.44096,-53.44419,-53.444,-53.44258,-53.44043,-53.43795,-53.44064,-53.48221,-53.47701,-53.46984,-53.46089,-53.45405,-53.45133,-53.46035,-53.46227,-53.46478,-53.46478,-53.4648,-53.46677,-53.46682,-53.47491,-53.47903,-53.45114,-53.48326,-53.49792,-53.45361,-53.42362,-53.39288,-53.40779,-53.47211,-53.47626,-53.47801,-53.48045,-53.48308,-53.48495,-53.45969,-53.42952,-53.42525,-53.43083,-53.4369,-53.44251,-53.44925,-53.45253,-53.47048,-53.49092,-53.49267,-51.69181,-51.69187,-51.69182,-51.69182,-51.6918,-51.69186,-51.69181,-51.69186,-51.68801,-51.59391,-51.44388,-51.36637,-51.31173,-51.16821,-51.04421,-50.9008,-50.74962,-50.59657,-50.41781,-50.29026,-50.12057,-49.96384,-49.79701,-49.63739,-49.46486,-49.30147,-49.13393,-48.96035,-48.80381,-48.61952,-48.45892)
lat<-c(-56.55267,-56.3172,-56.02908,-55.79571,-55.51683,-55.21784,-54.88933,-54.72725,-54.49151,-54.24643,-53.99626,-53.73947,-53.46504,-39.4404,-53.17445,-52.94424,-52.67243,-52.40723,-49.99949,-49.69163,-49.42935,-49.08267,-48.72683,-48.48531,-48.29534,-48.04958,-47.77321,-47.50704,-47.18987,-46.98237,-46.7177,-46.44685,-46.1688,-45.89386,-45.61032,-45.2959,-45.06144,-44.79999,-44.50892,-44.27473,-44.01957,-43.76157,-43.50397,-43.24589,-42.98375,-42.72919,-42.45255,-42.15245,-41.83462,-41.65027,-41.38326,-41.12483,-40.86827,-40.60549,-40.34339,-40.1036,-39.82512,-39.57131,-39.30234,-38.99869,-38.68091,-38.46827,-38.19384,-37.90784,-37.61987,-37.34363,-42.0334,-41.80125,-41.57331,-41.41558,-41.1913,-41.1152,-41.28024,-41.48734,-41.69731,-41.92312,-42.07949,-41.81352,-41.56565,-41.32117,-41.14789,-41.13923,-41.16341,-41.1869,-41.21315,-41.23677,-41.26419,-55.26491,-41.30147,-41.32603,-41.2977,-41.22637,-41.16645,-41.27923,-41.56475,-41.76029,-41.81287,-41.88363,-41.59704,-41.59703,-41.45421,-41.50432,-41.68923,-41.70459,-41.72872,-41.7677,-41.78814,-41.92939,-41.99335,-36.00016,-36.12187,-36.1464,-36.17408,-36.20501,-36.22095,-36.24589,-36.25432,-36.0807,-36.07502,-36.08755,-36.09947,-36.10061,-36.09171,-35.84048,-35.82509,-35.84504,-35.86965,-35.89392,-35.92707,-35.91924,-35.89015,-35.83128,-35.61168,-35.47165,-35.60996,-35.48226,-35.37217,-35.44069,-35.44331,-35.4609,-35.47691,-35.49499,-35.52224,-35.54073,-35.56731,-35.59203,-35.60203,-35.56525,-35.57104,-35.58187,-35.58772,-35.5935,-35.50794,-35.45205,-35.44089,-35.55421,-35.54669,-35.47803,-35.436,-35.45037,-35.4621,-35.46598,-35.48235,-35.44461,-35.44723,-35.46581,-35.48765,-35.50768,-35.52987,-35.55048,-35.56136,-35.56208,-35.56595,-35.57435,-35.58368,-35.58682,-35.59239,-35.62389,-35.70309,76.1311,-35.74169,-35.77592,-35.80891,-35.83576,-35.86417,-35.89993,-35.93433,-36.01445,-36.08262,-36.13651,-36.18798,-36.25093,-36.30776,-36.30312,-36.30463,-36.19625,-35.96459,-35.74429,-35.48469,-35.252,-35.02239,-34.82
073,-34.5941,-34.528,-34.63688,-34.46486,-34.64301,-34.57707,-34.4996,-34.54093,-34.55711,-34.58755,-34.60825,-34.63708,-34.66088,-34.4544,-34.47375,-36.48943,-36.51373,-36.53941,-36.5648,-36.58192,-36.61139,-36.62759,-36.64671,-36.66894,-36.69743,-36.61913,-36.46643,-36.29231,-36.32571,-36.33936,-36.36867,-36.39741,-36.42591,-36.44596,-36.4531,-36.50245,-36.51723,-36.54053,-36.56305,-36.58464,-36.45531,-36.27352,-36.29299,-36.32136,-36.32136,-36.34504,-36.37435,-36.40137,-36.4557,-36.42461,-36.53902,-36.3658,-36.2912,-36.32765,-36.40103,-36.47648,-36.44464,-36.28135,75.56117,-36.29889,-36.3248,-36.34978,-36.36482,-36.53892,-36.68122,-36.6772,-36.65253,-36.63074,-36.60533,-36.58368,-36.5814,-36.43587,-36.25542,-36.26169,-57.82277,-57.82283,-57.82282,-57.82279,-57.8228,-57.82276,-57.82277,-57.82277,-57.82783,-57.68789,-57.62636,-1.67,-57.55762,-57.45478,-57.36555,-57.26608,-57.16677,-57.07131,-56.97163,-56.89879,-56.80383,-56.71541,-56.62544,-56.54953,-56.477,-56.40733,-56.33073,-56.25659,-56.20153,-56.15121,-56.09129)
tm<-c("12/05/2018 01:08","12/05/2018 02:01","12/05/2018 03:05","12/05/2018 04:00","12/05/2018 05:03","12/05/2018 06:10","12/05/2018 07:27","12/05/2018 08:05","12/05/2018 09:02",
      "12/05/2018 10:02","12/05/2018 11:00","12/05/2018 12:00","12/05/2018 13:02","12/05/2018 13:12","12/05/2018 14:07","12/05/2018 15:00","12/05/2018 16:01","12/05/2018 17:00",
      "13/05/2018 02:01","13/05/2018 03:07","13/05/2018 04:01","13/05/2018 05:14","13/05/2018 06:32","13/05/2018 07:25","13/05/2018 08:06","13/05/2018 09:00","13/05/2018 10:01",
      "13/05/2018 11:00","13/05/2018 12:12","13/05/2018 13:00","13/05/2018 14:00","13/05/2018 15:00","13/05/2018 16:01","13/05/2018 17:00","13/05/2018 18:00","13/05/2018 19:08",
      "13/05/2018 20:01","13/05/2018 21:00","13/05/2018 22:06","13/05/2018 23:00","14/05/2018 00:00","14/05/2018 01:00","14/05/2018 02:00","14/05/2018 03:00","14/05/2018 04:01",
      "14/05/2018 05:00","14/05/2018 06:03","14/05/2018 07:10","14/05/2018 08:19","14/05/2018 09:00","14/05/2018 10:00","14/05/2018 11:00","14/05/2018 12:00","14/05/2018 13:04",
      "14/05/2018 14:04","14/05/2018 15:00","14/05/2018 16:03","14/05/2018 17:00","14/05/2018 18:00","14/05/2018 19:05","14/05/2018 20:14","14/05/2018 21:00","14/05/2018 22:00",
      "14/05/2018 23:00","15/05/2018 00:00","15/05/2018 01:00","27/06/2018 15:00","27/06/2018 16:09","27/06/2018 17:16","27/06/2018 18:00","27/06/2018 19:03","27/06/2018 20:09",
      "27/06/2018 21:00","27/06/2018 22:00","27/06/2018 23:00","28/06/2018 00:05","28/06/2018 01:00","28/06/2018 02:03","28/06/2018 03:00","28/06/2018 04:04","28/06/2018 05:00",
      "28/06/2018 06:05","28/06/2018 07:08","28/06/2018 08:04","28/06/2018 09:02","28/06/2018 10:00","28/06/2018 11:00","28/06/2018 11:36","28/06/2018 12:05","28/06/2018 13:07",
      "28/06/2018 14:00","28/06/2018 15:10","28/06/2018 16:18","28/06/2018 17:00","28/06/2018 18:07","28/06/2018 19:18","28/06/2018 20:06","28/06/2018 21:00","28/06/2018 22:00",
      "28/06/2018 22:00","28/06/2018 23:02","29/06/2018 00:00","29/06/2018 01:03","29/06/2018 02:00","29/06/2018 03:00","29/06/2018 04:16","29/06/2018 05:04","29/06/2018 06:00",
      "29/06/2018 07:07","20/07/2018 02:00","20/07/2018 03:02","20/07/2018 04:08","20/07/2018 05:08","20/07/2018 06:22","20/07/2018 07:00","20/07/2018 08:05","20/07/2018 09:00",
      "20/07/2018 10:01","20/07/2018 11:04","20/07/2018 12:00","20/07/2018 13:00","20/07/2018 14:00","20/07/2018 15:00","20/07/2018 16:00","20/07/2018 17:03","20/07/2018 18:07",
      "20/07/2018 19:00","20/07/2018 20:01","20/07/2018 21:00","20/07/2018 22:04","20/07/2018 23:02","21/07/2018 00:01","21/07/2018 01:00","21/07/2018 02:04","21/07/2018 03:07",
      "21/07/2018 04:24","21/07/2018 05:23","21/07/2018 06:12","21/07/2018 07:00","21/07/2018 08:19","21/07/2018 09:00","21/07/2018 10:01","21/07/2018 11:00","21/07/2018 12:00",
      "21/07/2018 13:00","21/07/2018 14:00","21/07/2018 15:00","21/07/2018 16:02","21/07/2018 17:00","21/07/2018 18:14","21/07/2018 19:07","21/07/2018 20:00","21/07/2018 21:05",
      "21/07/2018 22:07","21/07/2018 23:00","22/07/2018 00:00","22/07/2018 01:00","22/07/2018 02:08","22/07/2018 03:00","22/07/2018 04:01","22/07/2018 05:39","22/07/2018 06:14",
      "22/07/2018 07:21","22/07/2018 08:01","22/07/2018 09:00","22/07/2018 10:00","22/07/2018 11:00","22/07/2018 12:00","22/07/2018 13:00","22/07/2018 14:00","22/07/2018 15:04",
      "22/07/2018 16:00","22/07/2018 17:00","22/07/2018 18:00","22/07/2018 19:12","22/07/2018 20:08","22/07/2018 21:02","22/07/2018 22:01","22/07/2018 23:19","22/07/2018 23:36",
      "23/07/2018 00:08","23/07/2018 01:00","23/07/2018 02:04","23/07/2018 03:00","23/07/2018 04:10","23/07/2018 05:16","23/07/2018 06:00","23/07/2018 07:02","23/07/2018 08:08",
      "23/07/2018 09:07","23/07/2018 10:01","23/07/2018 11:00","23/07/2018 12:00","23/07/2018 13:04","23/07/2018 14:00","23/07/2018 15:00","23/07/2018 16:00","23/07/2018 17:00",
      "23/07/2018 18:05","23/07/2018 19:05","23/07/2018 20:07","23/07/2018 21:00","23/07/2018 22:00","23/07/2018 23:00","24/07/2018 00:00","24/07/2018 01:00","24/07/2018 02:02",
      "24/07/2018 03:01","24/07/2018 04:00","24/07/2018 05:28","24/07/2018 06:02","24/07/2018 07:11","24/07/2018 08:00","24/07/2018 09:01","24/07/2018 10:00","24/07/2018 11:00",
      "24/07/2018 12:00","04/08/2018 11:01","04/08/2018 12:00","04/08/2018 13:11","04/08/2018 14:15","04/08/2018 15:02","04/08/2018 16:16","04/08/2018 17:01","04/08/2018 18:00",
      "04/08/2018 19:13","04/08/2018 20:28","04/08/2018 21:05","04/08/2018 22:04","04/08/2018 23:00","05/08/2018 00:08","05/08/2018 01:01","05/08/2018 02:00","05/08/2018 03:03",
      "05/08/2018 04:10","05/08/2018 05:04","05/08/2018 06:00","05/08/2018 07:18","05/08/2018 08:00","05/08/2018 09:00","05/08/2018 10:00","05/08/2018 11:00","05/08/2018 12:00",
      "05/08/2018 13:00","05/08/2018 14:03","05/08/2018 15:07","05/08/2018 15:07","05/08/2018 16:00","05/08/2018 17:07","05/08/2018 18:15","05/08/2018 19:21","05/08/2018 20:01",
      "05/08/2018 21:09","05/08/2018 22:04","05/08/2018 23:02","06/08/2018 00:02","06/08/2018 01:00","06/08/2018 02:06","06/08/2018 03:01","06/08/2018 04:04","06/08/2018 04:27",
      "06/08/2018 05:00","06/08/2018 06:01","06/08/2018 07:02","06/08/2018 08:00","06/08/2018 09:11","06/08/2018 10:00","06/08/2018 11:00","06/08/2018 12:03","06/08/2018 13:00",
      "06/08/2018 14:06","06/08/2018 15:00","06/08/2018 16:01","06/08/2018 17:14","06/08/2018 18:30","06/08/2018 19:01","28/08/2018 04:00","28/08/2018 05:01","28/08/2018 06:18",
      "28/08/2018 07:02","28/08/2018 08:02","28/08/2018 09:10","28/08/2018 10:03","28/08/2018 11:00","28/08/2018 12:00","28/08/2018 13:00","28/08/2018 14:01","28/08/2018 14:35",
      "28/08/2018 15:00","28/08/2018 16:04","28/08/2018 17:00","28/08/2018 18:02","28/08/2018 19:05","28/08/2018 20:05","28/08/2018 21:12","28/08/2018 22:00","28/08/2018 23:02",
      "29/08/2018 00:00","29/08/2018 01:01","29/08/2018 02:00","29/08/2018 03:01","29/08/2018 04:00","29/08/2018 05:00","29/08/2018 06:03","29/08/2018 07:01","29/08/2018 08:06",
      "29/08/2018 09:06")

        # One row per position report: latitude, longitude and time stamp.
        trk <- data.frame(lat, lon, tm)
        # Promote the data frame to spatial points. The formula is read as
        # ~ x + y, so for a longlat CRS longitude has to come first; listing
        # lat first transposes the axes and distorts every geodesic distance.
        coordinates(trk) <- ~ lon + lat
        crs(trk) <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
        # Quick visual check: the bad positions show up as isolated points.
        plot(trk, pch = 20, cex = 0.8, col = "red")

You'll see the 5 outliers in the plot. I've tried approaches such as Identify points within specified distance in R and Removing Spatial Outliers (lat and long coordinates) in R. I've also tried using quantile, nearest-neighbor, and kernel density methods. The problem is that errors can occur over small and large distances, as you'll see in the plot, and the methods I've tried require intervention: examining each track and manually setting a threshold. For example, the two outliers close to the track are not identified as outliers because, compared to those very far away, they are similar to the actual track. If I increase sensitivity, then portions of the actual track are identified as outliers, for example the northern tip of the north-south track. Ideally, the output would be a column in the data frame that identifies the outliers, which I can then filter out. Any ideas very welcome. Thanks.

EDIT -have now added time stamp.

user2175481
  • 147
  • 1
  • 8
  • Your track isn't in chronological order, which makes this far harder. Do you not have timestamps with which to order your data? If you can do that, then outliers will be far easier to spot because you can set a far lower threshold. – Allan Cameron Oct 05 '20 at 15:10
  • Hi - I agree that using time would be better. However, the time stamps are all actually in order and correct even for the outliers. It's just the occasional position that's wrong. To prepare data and identify unique tracks, I use the time stamp. But when I plot the points, some positions are wrong. I had also considered that those outliers are actually duplicates in time, but position is wrong. But even if that were the case, I would need to identify the wrong position. – user2175481 Oct 05 '20 at 16:24
  • I don't think the points are properly ordered by timestamp in your example. Even in the "definitely correct" portions of the path you can see in the result, there are several instances when what appears to be a correct point on the path is not in the correct order. There are too many of these for them to be coincidental. Yes, some points are obviously wrong, but if you have strictly time-ordered points you can calculate a rolling mean bearing and distance which will make outliers much, much easier to threshold out. – Allan Cameron Oct 05 '20 at 16:40
  • Ok, I see what you're saying. That might be a reflection on how I prepared the sample data. I'll have another attempt at this and update the original post, and will include the time stamp. Is there a way I can share a csv file? – user2175481 Oct 05 '20 at 16:56
  • If the csv isn't massive you can copy and paste it straight into a code block in your question – Allan Cameron Oct 05 '20 at 17:37
  • I've added the time stamp. Thanks. – user2175481 Oct 05 '20 at 21:22

1 Answer

0

The best I can come up with is to compare each point to some of its nearest neighbors and add up the distances. You'll have to come up with the thresholds that work for you.

# Data from question already loaded

library(tidyverse)
library(sf)
library(nngeo)

# Convert trk to an sf object so the sf / nngeo tools can work on it
trk <- st_as_sf(trk)

# For every point, look up its k = 5 nearest neighbours within trk itself.
#  With returnDist = TRUE the result is a list: distances$nn holds the
#  neighbour indices and distances$dist the matching distances.
#  Each point is its own nearest neighbour, so k must be > 1; another
#  value of k may separate the outliers better.
distances <- nngeo::st_nn(trk, trk, k = 5, returnDist = TRUE)

# Score each point with the summed distance to its k nearest neighbours
#  (the point itself is one of them). vapply() guarantees exactly one
#  numeric value per point, which lapply() + unlist() does not.
trk$dst <- vapply(distances$dist, sum, numeric(1))

# Largest scores first: the big position errors stick out at the top
trk %>% arrange(desc(dst))

Simple feature collection with 307 features and 2 fields
geometry type:  POINT
dimension:      XY
bbox:           xmin: -57.82783 ymin: -54.82384 xmax: 76.1311 ymax: -48.45892
CRS:            +proj=longlat +ellps=WGS84 +towgs84=0,0,0,0,0,0,0 +no_defs
First 10 features:
                 tm        dst                    geometry
1  22/07/2018 23:36 17859348.0   POINT (76.1311 -53.77256)
2  06/08/2018 04:27 17845535.0  POINT (75.56117 -53.47626)
3  28/08/2018 14:35  8839610.8     POINT (-1.67 -51.36637)
4  28/06/2018 11:36   705017.3 POINT (-55.26491 -53.53371)
5  12/05/2018 13:12   690426.5    POINT (-39.4404 -52.148)
6  13/05/2018 02:01   211161.7 POINT (-49.99949 -52.54347)
7  29/08/2018 09:06   191774.2 POINT (-56.09129 -48.45892)
8  12/05/2018 17:00   182993.3 POINT (-52.40723 -52.26725)
9  15/05/2018 01:00   181530.1 POINT (-37.34363 -53.94056)
10 12/05/2018 01:08   179524.1 POINT (-56.55267 -51.78193)

# How to pick the cut-off is up to you. Here the 2% of points with the
#  largest dst score are pulled out first; one of them is not an outlier.
flagged <- top_frac(trk, n = .02, dst)

# All points in blue, with the flagged ones drawn on top in red
ggplot() +
  geom_sf(data = trk, color = "blue") +
  geom_sf(data = flagged, color = "red") +
  theme_void() +
  theme(legend.position = "none")
  


enter image description here

mrhellmann
  • 5,069
  • 11
  • 38
  • Hi - thanks very much for this. I'm not familiar with those libraries but I think I can follow it all. However the one thing I need to avoid is having to intervene for each track and assess thresholds as I have thousands of tracks to process. You mentioned earlier looking at moving averages, but this may not work if there are large changes in course? – user2175481 Oct 06 '20 at 22:04
  • It would be difficult to come up with a good solution without seeing a larger number of cases, especially if you're not allowing for errors. If the timestamps are correct, you could use speed as well. That is not easy with the provided data, which seems to be several tracks over many years. – mrhellmann Oct 06 '20 at 23:32