0

I have two different data frames. D1 contains all sampling sites with their physical parameters, while D2 contains all animals caught with their respective site ID (a fraction of the sampling sites in D1). I want to filter D1 to only show the sites on D2. In the attached pictures, the column I am interested in is titled "stationkey" on both data frames (numeric values).

I've found ways to filter a column based on <,>,=, etc. However, I have not found any resources on how to filter a data frame by using a set of specific values within a column from a different data frame. Any help would be greatly appreciated. Samples of D1 and D2 are pasted below.

Below is a sample of D1:

structure(list(year = c(1995, 1995, 1995, 1995, 1995, 1995, 1995, 

1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995), stationkey = c("00000000004021800001", "00000000004021800002", "00000000004021800003", "00000000004021800004", "00000000004021800005", "00000000004021800006", "00000000004021800007", "00000000004021800008", "00000000004021800009", "00000000004021800010", "00000000004021800011", "00000000004021800012", "00000000004021800013", "00000000004021800014", "00000000004021800015", "00000000004021800016", "00000000004021800017", "00000000004021800018", "00000000004021800019", "00000000004021800020" ), vessel = c("04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04"), cruise = c("0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218"), pasc = c("00001", "00002", "00003", "00004", "00005", "00006", "00007", "00008", "00009", "00010", "00011", "00012", "00013", "00014", "00015", "00016", "00017", "00018", "00019", "00020"), TIMEZONE = c("CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT", "CDT"), HOOKSIZE = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), NUMBERHOOKS = c(75, 91, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100), EFFORT minutes = c(60, 64, 58, 67, 58, 58, 64, 64, 60, 60, 58, 61, 63, 61, 56, 56, 82, 58, 64, 60), LLINEHOOK = c("J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook", "J Hook"), FLYLAST_OUTTIME = structure(c(806506080, 806521800, 806703240, 806722380, 806749860, 806772540, 806795160, 806816280, 806834220, 806845320, 806861220, 806875860, 806889960, 806911560, 806952420, 806963400, 806977800, 806995740, 807012540, 807039300), tzone = "UTC", class = c("POSIXct", "POSIXt")), FLYLAST_OUTLAT = c(30.0213, 29.7778, 26.2565, 26.55, 26.65, 27.434, 27.8417, 28.351, 27.9302, 27.9915, 28.2875, 28.4273, 28.6388, 28.0492, 28.3902, 28.2958, 28.1893, 28.339, 28.6497, 28.9148), FLYLAST_OUTLON = c(-88.4745, -88.4593, -96.5263, -97.05, -96.8667, -97.0572, -96.4223, -95.9455, -95.6842, -95.4677, -95.11, -94.838, -94.6195, -94.2815, -93.4113, -93.252, -92.9125, -92.9358, -93.3028, -92.745), FLYLAST_OUTDEPTH = c(23.77, 32.92, 58.52, 31.09, 47.55, 26.88, 44.07, 23.23, 55.96, 57.06, 42.06, 38.04, 29.81, 68.58, 53.04, 55.23, 69.68, 53.95, 35.3, 26.52), TEMP = c(24.46, 21.45, 19.53, 23.19, 18.3, 22.12, 21, 24.35, 22.76, 22.78, 21.82, 23.02, 24.97, 18.82, 20.2, 21.07, 19.63, 21.11, 24.32, 26.51), FLUORO = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), XMISS = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), OXY_MG = c(NA, 4.9, 4.5, 6.9, 4.4, 6.7, 6, 6.1, 7, 6.8, 5.6, NA, 4.8, 3.8, 3.5, 4.1, 4.5, 4.9, 3.3, 3.2), SALINITY = c(35.984, 36.164, 36.361, 36.346, 36.189, 36.233, 36.208, 35.889, 36.184, 36.175, 35.871, 35.607, 35.422, 36.332, 36.183, 36.063, 36.199, 36.077, 35.794, 35.514)), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))

Below is sample of D2:

structure(list(year = c(1995, 1995, 1995, 1995, 1995, 1995, 1995, 

1995, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1997, 1997, 1997, 1997), stationkey = c("00000000004021800040", "00000000004021800040", "00000000004021800040", "00000000004021800050", "00000000004021800050", "00000000004021800051", "00000000004021800051", "00000000004021800051", "00000000004022200058", "00000000004022200068", "00000000004022200074", "00000000004022200077", "00000000004022200077", "00000000004022200079", "00000000004022200108", "00000000004022200118", "00000000004022700153", "00000000004022700162", "00000000004022700166", "00000000004022700171" ), vessel = c("04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04", "04"), cruise = c("0218", "0218", "0218", "0218", "0218", "0218", "0218", "0218", "0222", "0222", "0222", "0222", "0222", "0222", "0222", "0222", "0227", "0227", "0227", "0227"), pasc = c("00040", "00040", "00040", "00050", "00050", "00051", "00051", "00051", "00058", "00068", "00074", "00077", "00077", "00079", "00108", "00118", "00153", "00162", "00166", "00171"), TAXON = c("EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "MYCTEROPERCA MICROLEPIS", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO", "EPINEPHELUS MORIO"), LENGTHESTIMATED = c("N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N"), FLYLAST_OUTTIME = structure(c(807876180, 807876180, 807876180, 808057740, 808057740, 808072920, 808072920, 808072920, 840205320, 840366720, 840482520, 840639240, 840639240, 840671400, 841422420, 841739580, 872910000, 873039060, 873105840, 873169800 ), tzone = "UTC", class = c("POSIXct", "POSIXt")), FLYLAST_OUTLAT = c(28.7893, 28.7893, 28.7893, 27.2608, 27.2608, 26.9552, 26.9552, 26.9552, 29.414, 28.2517, 26.4852, 26.785, 26.785, 26.304, 30.5743, 33.659, 28.4952, 28.6967, 28.2928, 27.5662), FLYLAST_OUTLON = c(-85.0493, -85.0493, -85.0493, -83.0795, -83.0795, -82.6993, -82.6993, -82.6993, -85.7562, -83.791, -83.1118, -82.7487, -82.7487, -83.0042, -80.161, -77.159, -84.479, -83.757, -83.6788, -83.2002), FLYLAST_OUTDEPTH = c(62.18, 62.18, 62.18, 29.26, 29.26, 21.21, 21.21, 21.21, 40.23, 34.75, 42.98, 25.6, 25.6, 41.15, 50.11, 34.75, 44.62, 22.13, 30.36, 28.16), STANDARD = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 480, 500, 450, NA, 800, NA, NA, NA, NA, NA), FORK = c(690, 640, 610, NA, NA, 370, 590, 520, NA, 711, 540, 575, 538, NA, 920, 725, 750, 675, 670, 610), TOTAL = c(NA, NA, NA, 690, 710, NA, NA, NA, 550, 752, 555, 610, 568, 632, 950, 740, 770, 695, NA, NA)), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame" ))

Manuel CR
  • 1
  • 1
  • Welcome to Stack Overflow. We cannot read data into R from images. Please [make this question reproducible](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) by including a small representative dataset in a plain text format - for example the output from `dput(D1)`, if that is not too large. – neilfws Aug 30 '22 at 22:25
  • Look at the `%in%` function (`?"%in%"`, or `?match`). If you want more details, you should provide a reproducible sample of the data frames using `dput()`. Pictures of the data are not useful since they do not include information about the types of the data being stored, e.g. character/factor/numeric. – dcarlson Aug 30 '22 at 22:26
  • Greetings! Typically it is recommended to provide a minimally reproducible dataset with your question. One way of achieving this is by using the `dput` command. You can check out how to do this at this video: https://youtu.be/3EID3P1oisg – Shawn Hemelstrand Aug 30 '22 at 22:50

1 Answers1

0

You can try

D1[which(D1[, 'stationkey'] %in% D2[, 'stationkey']), ]

This should get you all of the D1 rows that are also present in D2 based on the stationkey column.

Justin Cocco
  • 392
  • 1
  • 6