the following code for this sample dataset worked around until October 2020. Please note that this is a sample and I have vast data set. However, when I run it now it does not work.
library(dplyr)
library(purrr)
library(readr)
library(tidyr)
"ROW,ISIN,YEAR,DIRECTOR_NAME,DIRECTOR_ID
1,US9898171015,2006,Thomas (Tom) E Davin,2247441792
2,US9898171015,2006,Matthew (Matt) L Hyde,4842568996
3,US9898171015,2007,James (Jim) M Weber,3581636766
4,US9898171015,2007,Matthew (Matt) L Hyde,4842568996
5,US9898171015,2007,David (Dave) M DeMattei,759047198
6,US9898171015,2008,James (Jim) M Weber,3581636766
7,US9898171015,2008,Matthew (Matt) L Hyde,4842568996
8,US9898171015,2008,David (Dave) M DeMattei,759047198
9,US9898171015,2009,William (Bill) Milroy Barnum Jr,20462211719
10,US9898171015,2009,James (Jim) M Weber,3581636766
11,US9898171015,2009,Matthew (Matt) L Hyde,4842568996
12,US9898171015,2009,David (Dave) M DeMattei,759047198
13,US9898171015,2010,William (Bill) Milroy Barnum Jr,20462211719
14,US9898171015,2010,James (Jim) M Weber,3581636766
15,US9898171015,2010,Matthew (Matt) L Hyde,4842568996
16,US9898171015,2011,Sarah (Sally) Gaines McCoy,11434863691
17,US9898171015,2011,William (Bill) Milroy Barnum Jr,20462211719
18,US9898171015,2011,James (Jim) M Weber,3581636766
19,US9898171015,2011,Matthew (Matt) L Hyde,4842568996
20,US9898171015,2012,Sarah (Sally) Gaines McCoy,11434863691
21,US9898171015,2012,Ernest R Johnson,40425210975
22,US9898171015,2013,Sarah (Sally) Gaines McCoy,11434863691
23,US9898171015,2013,Ernest R Johnson,40425210975
24,US9898171015,2013,Travis D Smith,53006212569
25,US9898171015,2014,Sarah (Sally) Gaines McCoy,11434863691
26,US9898171015,2014,Ernest R Johnson,40425210975
27,US9898171015,2014,Travis D Smith,53006212569
28,US9898171015,2015,Kalen F Holmes,11051172801
29,US9898171015,2015,Sarah (Sally) Gaines McCoy,11434863691
30,US9898171015,2015,Ernest R Johnson,40425210975
31,US9898171015,2015,Travis D Smith,53006212569
32,US9898171015,2016,Sarah (Sally) Gaines McCoy,11434863691
33,US9898171015,2016,Ernest R Johnson,40425210975
34,US9898171015,2016,Travis D Smith,53006212569
35,US9898171015,2017,Sarah (Sally) Gaines McCoy,11434863691
36,US9898171015,2017,Scott Andrew Bailey,174000000000
37,US9898171015,2017,Ernest R Johnson,40425210975
38,US9898171015,2017,Travis D Smith,53006212569
" %>%
read_csv() %>%
group_by(ISIN, YEAR) %>%
nest(.key = "OTHER_DATA") %>%
group_by(ISIN) %>%
mutate(OTHER_DATA_LAG = lag(OTHER_DATA, 1),
OTHER_DATA_LEAD = lead(OTHER_DATA, 1),
KEEP = pmap(list(OTHER_DATA_LAG, OTHER_DATA, OTHER_DATA_LEAD), function(x, y, z) {
isTRUE(all_equal(x["DIRECTOR_ID"], y["DIRECTOR_ID"])) ||
isTRUE(all_equal(y["DIRECTOR_ID"], z["DIRECTOR_ID"]))
})) %>%
filter(unlist(KEEP)) %>%
select(-OTHER_DATA_LAG, -OTHER_DATA_LEAD, -KEEP) %>%
unnest() %>%
ungroup()
Then I put a post here and someone helped me with the following code -
library(tidyverse)
ceo1 %>%
group_by(ISIN, YEAR) %>%
nest(OTHER_DATA = c(ROW, DIRECTOR_NAME, DIRECTOR_ID)) %>%
group_by(ISIN) %>%
mutate(OTHER_DATA_LAG = lag(OTHER_DATA, 1),
OTHER_DATA_LEAD = lead(OTHER_DATA, 1),
KEEP = pmap_lgl(list(OTHER_DATA_LAG, OTHER_DATA, OTHER_DATA_LEAD), function(x, y, z) {
if(length(x) > 0 && length(y) > 0 && length(z) > 0)
isTRUE(all_equal(x["DIRECTOR_ID"], y["DIRECTOR_ID"])) ||
isTRUE(all_equal(y["DIRECTOR_ID"], z["DIRECTOR_ID"]))
else FALSE
})) %>%
filter(KEEP) %>%
select(-OTHER_DATA_LAG, -OTHER_DATA_LEAD, -KEEP) %>%
unnest(cols = c(OTHER_DATA)) %>%
ungroup()
This worked perfect for this SAMPLE dataset, but when I apply it in my LARGE DATASET, it worked but the problem is - I do not get the same observations of what I get when I ran the old code (this first code here). It seems the problem is with pmap
function because it generates NULL
and which has issues.
Here are links of my two previous codes -
First one - Filtering data using dplyr function in R Second one - Issues with tidyverse
Anybody can shed some light so that I can run my old code (First one)?